diff --git a/.dockerignore b/.dockerignore index ae0ad8fd77..8b378b5dab 100644 --- a/.dockerignore +++ b/.dockerignore @@ -1,27 +1,27 @@ * -!rust-toolchain.toml -!Cargo.toml +# Files !Cargo.lock +!Cargo.toml !Makefile +!rust-toolchain.toml +!scripts/combine_control_files.py +!scripts/ninstall.sh +!vm-cgconfig.conf +# Directories !.cargo/ !.config/ -!control_plane/ !compute_tools/ +!control_plane/ !libs/ +!neon_local/ !pageserver/ !pgxn/ !proxy/ -!safekeeper/ !s3_scrubber/ +!safekeeper/ !storage_broker/ !trace/ -!vendor/postgres-v14/ -!vendor/postgres-v15/ -!vendor/postgres-v16/ +!vendor/postgres-*/ !workspace_hack/ -!neon_local/ -!scripts/ninstall.sh -!scripts/combine_control_files.py -!vm-cgconfig.conf diff --git a/.github/ISSUE_TEMPLATE/epic-template.md b/.github/ISSUE_TEMPLATE/epic-template.md index 019e6e7345..c442f50fde 100644 --- a/.github/ISSUE_TEMPLATE/epic-template.md +++ b/.github/ISSUE_TEMPLATE/epic-template.md @@ -16,9 +16,9 @@ assignees: '' ## Implementation ideas - +## Tasks ```[tasklist] -### Tasks +- [ ] Example Task ``` diff --git a/.github/actionlint.yml b/.github/actionlint.yml index 362480f256..cb36e2eee6 100644 --- a/.github/actionlint.yml +++ b/.github/actionlint.yml @@ -4,6 +4,8 @@ self-hosted-runner: - dev - gen3 - large + # Remove `macos-14` from the list after https://github.com/rhysd/actionlint/pull/392 is merged. 
+ - macos-14 - small - us-east-2 config-variables: diff --git a/.github/actions/allure-report-generate/action.yml b/.github/actions/allure-report-generate/action.yml index abdbba802e..1ecb5ecc7e 100644 --- a/.github/actions/allure-report-generate/action.yml +++ b/.github/actions/allure-report-generate/action.yml @@ -39,7 +39,7 @@ runs: PR_NUMBER=$(jq --raw-output .pull_request.number "$GITHUB_EVENT_PATH" || true) if [ "${PR_NUMBER}" != "null" ]; then BRANCH_OR_PR=pr-${PR_NUMBER} - elif [ "${GITHUB_REF_NAME}" = "main" ] || [ "${GITHUB_REF_NAME}" = "release" ]; then + elif [ "${GITHUB_REF_NAME}" = "main" ] || [ "${GITHUB_REF_NAME}" = "release" ] || [ "${GITHUB_REF_NAME}" = "release-proxy" ]; then # Shortcut for special branches BRANCH_OR_PR=${GITHUB_REF_NAME} else @@ -59,7 +59,7 @@ runs: BUCKET: neon-github-public-dev # TODO: We can replace with a special docker image with Java and Allure pre-installed - - uses: actions/setup-java@v3 + - uses: actions/setup-java@v4 with: distribution: 'temurin' java-version: '17' @@ -76,8 +76,8 @@ runs: rm -f ${ALLURE_ZIP} fi env: - ALLURE_VERSION: 2.24.0 - ALLURE_ZIP_SHA256: 60b1d6ce65d9ef24b23cf9c2c19fd736a123487c38e54759f1ed1a7a77353c90 + ALLURE_VERSION: 2.27.0 + ALLURE_ZIP_SHA256: b071858fb2fa542c65d8f152c5c40d26267b2dfb74df1f1608a589ecca38e777 # Potentially we could have several running build for the same key (for example, for the main branch), so we use improvised lock for this - name: Acquire lock @@ -179,22 +179,11 @@ runs: aws s3 rm "s3://${BUCKET}/${LOCK_FILE}" fi - - name: Store Allure test stat in the DB - if: ${{ !cancelled() && inputs.store-test-results-into-db == 'true' }} - shell: bash -euxo pipefail {0} - env: - COMMIT_SHA: ${{ github.event.pull_request.head.sha || github.sha }} - REPORT_JSON_URL: ${{ steps.generate-report.outputs.report-json-url }} - run: | - export DATABASE_URL=${REGRESS_TEST_RESULT_CONNSTR} - - ./scripts/pysync - - poetry run python3 scripts/ingest_regress_test_result.py \ - --revision 
${COMMIT_SHA} \ - --reference ${GITHUB_REF} \ - --build-type unified \ - --ingest ${WORKDIR}/report/data/suites.json + - name: Cache poetry deps + uses: actions/cache@v4 + with: + path: ~/.cache/pypoetry/virtualenvs + key: v2-${{ runner.os }}-python-deps-${{ hashFiles('poetry.lock') }} - name: Store Allure test stat in the DB (new) if: ${{ !cancelled() && inputs.store-test-results-into-db == 'true' }} @@ -226,7 +215,7 @@ runs: rm -rf ${WORKDIR} fi - - uses: actions/github-script@v6 + - uses: actions/github-script@v7 if: always() env: REPORT_URL: ${{ steps.generate-report.outputs.report-url }} diff --git a/.github/actions/allure-report-store/action.yml b/.github/actions/allure-report-store/action.yml index 7ae9937d42..df4a6712ac 100644 --- a/.github/actions/allure-report-store/action.yml +++ b/.github/actions/allure-report-store/action.yml @@ -19,7 +19,7 @@ runs: PR_NUMBER=$(jq --raw-output .pull_request.number "$GITHUB_EVENT_PATH" || true) if [ "${PR_NUMBER}" != "null" ]; then BRANCH_OR_PR=pr-${PR_NUMBER} - elif [ "${GITHUB_REF_NAME}" = "main" ] || [ "${GITHUB_REF_NAME}" = "release" ]; then + elif [ "${GITHUB_REF_NAME}" = "main" ] || [ "${GITHUB_REF_NAME}" = "release" ] || [ "${GITHUB_REF_NAME}" = "release-proxy" ]; then # Shortcut for special branches BRANCH_OR_PR=${GITHUB_REF_NAME} else diff --git a/.github/actions/run-python-test-set/action.yml b/.github/actions/run-python-test-set/action.yml index 8dfa6c465f..d9e543d4bb 100644 --- a/.github/actions/run-python-test-set/action.yml +++ b/.github/actions/run-python-test-set/action.yml @@ -44,6 +44,10 @@ inputs: description: 'Postgres version to use for tests' required: false default: 'v14' + benchmark_durations: + description: 'benchmark durations JSON' + required: false + default: '{}' runs: using: "composite" @@ -76,17 +80,16 @@ runs: - name: Checkout if: inputs.needs_postgres_source == 'true' - uses: actions/checkout@v3 + uses: actions/checkout@v4 with: submodules: true fetch-depth: 1 - name: Cache poetry deps - 
id: cache_poetry - uses: actions/cache@v3 + uses: actions/cache@v4 with: path: ~/.cache/pypoetry/virtualenvs - key: v1-${{ runner.os }}-python-deps-${{ hashFiles('poetry.lock') }} + key: v2-${{ runner.os }}-python-deps-${{ hashFiles('poetry.lock') }} - name: Install Python deps shell: bash -euxo pipefail {0} @@ -160,7 +163,7 @@ runs: # We use pytest-split plugin to run benchmarks in parallel on different CI runners if [ "${TEST_SELECTION}" = "test_runner/performance" ] && [ "${{ inputs.build_type }}" != "remote" ]; then mkdir -p $TEST_OUTPUT - poetry run ./scripts/benchmark_durations.py "${TEST_RESULT_CONNSTR}" --days 10 --output "$TEST_OUTPUT/benchmark_durations.json" + echo '${{ inputs.benchmark_durations || '{}' }}' > $TEST_OUTPUT/benchmark_durations.json EXTRA_PARAMS="--durations-path $TEST_OUTPUT/benchmark_durations.json $EXTRA_PARAMS" fi diff --git a/.github/workflows/actionlint.yml b/.github/workflows/actionlint.yml index 584828c1d0..f2736614bf 100644 --- a/.github/workflows/actionlint.yml +++ b/.github/workflows/actionlint.yml @@ -16,7 +16,14 @@ concurrency: cancel-in-progress: ${{ github.event_name == 'pull_request' }} jobs: + check-permissions: + if: ${{ !contains(github.event.pull_request.labels.*.name, 'run-no-ci') }} + uses: ./.github/workflows/check-permissions.yml + with: + github-event-name: ${{ github.event_name}} + actionlint: + needs: [ check-permissions ] runs-on: ubuntu-latest steps: - uses: actions/checkout@v4 diff --git a/.github/workflows/approved-for-ci-run.yml b/.github/workflows/approved-for-ci-run.yml index 5b21011b83..69c48d86b9 100644 --- a/.github/workflows/approved-for-ci-run.yml +++ b/.github/workflows/approved-for-ci-run.yml @@ -64,7 +64,7 @@ jobs: steps: - run: gh pr --repo "${GITHUB_REPOSITORY}" edit "${PR_NUMBER}" --remove-label "approved-for-ci-run" - - uses: actions/checkout@v3 + - uses: actions/checkout@v4 with: ref: main token: ${{ secrets.CI_ACCESS_TOKEN }} @@ -93,6 +93,7 @@ jobs: --body-file "body.md" \ --head "${BRANCH}" 
\ --base "main" \ + --label "run-e2e-tests-in-draft" \ --draft fi diff --git a/.github/workflows/benchmarking.yml b/.github/workflows/benchmarking.yml index 8bf12c31b1..2e56bf909f 100644 --- a/.github/workflows/benchmarking.yml +++ b/.github/workflows/benchmarking.yml @@ -62,11 +62,11 @@ jobs: runs-on: [ self-hosted, us-east-2, x64 ] container: - image: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/rust:pinned + image: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/build-tools:pinned options: --init steps: - - uses: actions/checkout@v3 + - uses: actions/checkout@v4 - name: Download Neon artifact uses: ./.github/actions/download @@ -214,14 +214,14 @@ jobs: runs-on: [ self-hosted, us-east-2, x64 ] container: - image: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/rust:pinned + image: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/build-tools:pinned options: --init # Increase timeout to 8h, default timeout is 6h timeout-minutes: 480 steps: - - uses: actions/checkout@v3 + - uses: actions/checkout@v4 - name: Download Neon artifact uses: ./.github/actions/download @@ -362,11 +362,11 @@ jobs: runs-on: [ self-hosted, us-east-2, x64 ] container: - image: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/rust:pinned + image: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/build-tools:pinned options: --init steps: - - uses: actions/checkout@v3 + - uses: actions/checkout@v4 - name: Download Neon artifact uses: ./.github/actions/download @@ -461,11 +461,11 @@ jobs: runs-on: [ self-hosted, us-east-2, x64 ] container: - image: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/rust:pinned + image: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/build-tools:pinned options: --init steps: - - uses: actions/checkout@v3 + - uses: actions/checkout@v4 - name: Download Neon artifact uses: ./.github/actions/download @@ -558,11 +558,11 @@ jobs: runs-on: [ self-hosted, us-east-2, x64 ] container: - image: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/rust:pinned + image: 
369495373322.dkr.ecr.eu-central-1.amazonaws.com/build-tools:pinned options: --init steps: - - uses: actions/checkout@v3 + - uses: actions/checkout@v4 - name: Download Neon artifact uses: ./.github/actions/download diff --git a/.github/workflows/build-build-tools-image.yml b/.github/workflows/build-build-tools-image.yml new file mode 100644 index 0000000000..251423e701 --- /dev/null +++ b/.github/workflows/build-build-tools-image.yml @@ -0,0 +1,105 @@ +name: Build build-tools image + +on: + workflow_call: + inputs: + image-tag: + description: "build-tools image tag" + required: true + type: string + outputs: + image-tag: + description: "build-tools tag" + value: ${{ inputs.image-tag }} + image: + description: "build-tools image" + value: neondatabase/build-tools:${{ inputs.image-tag }} + +defaults: + run: + shell: bash -euo pipefail {0} + +concurrency: + group: build-build-tools-image-${{ inputs.image-tag }} + +# No permission for GITHUB_TOKEN by default; the **minimal required** set of permissions should be granted in each job. 
+permissions: {} + +jobs: + check-image: + uses: ./.github/workflows/check-build-tools-image.yml + + # This job uses older version of GitHub Actions because it's run on gen2 runners, which don't support node 20 (for newer versions) + build-image: + needs: [ check-image ] + if: needs.check-image.outputs.found == 'false' + + strategy: + matrix: + arch: [ x64, arm64 ] + + runs-on: ${{ fromJson(format('["self-hosted", "dev", "{0}"]', matrix.arch)) }} + + env: + IMAGE_TAG: ${{ inputs.image-tag }} + + steps: + - name: Check `inputs.image-tag` is correct + env: + INPUTS_IMAGE_TAG: ${{ inputs.image-tag }} + CHECK_IMAGE_TAG: ${{ needs.check-image.outputs.image-tag }} + run: | + if [ "${INPUTS_IMAGE_TAG}" != "${CHECK_IMAGE_TAG}" ]; then + echo "'inputs.image-tag' (${INPUTS_IMAGE_TAG}) does not match the tag of the latest build-tools image 'needs.check-image.outputs.image-tag' (${CHECK_IMAGE_TAG})" + exit 1 + fi + + - uses: actions/checkout@v3 + + # Use custom DOCKER_CONFIG directory to avoid conflicts with default settings + # The default value is ~/.docker + - name: Set custom docker config directory + run: | + mkdir -p /tmp/.docker-custom + echo DOCKER_CONFIG=/tmp/.docker-custom >> $GITHUB_ENV + + - uses: docker/setup-buildx-action@v2 + + - uses: docker/login-action@v2 + with: + username: ${{ secrets.NEON_DOCKERHUB_USERNAME }} + password: ${{ secrets.NEON_DOCKERHUB_PASSWORD }} + + - uses: docker/build-push-action@v4 + with: + context: . 
+ provenance: false + push: true + pull: true + file: Dockerfile.build-tools + cache-from: type=registry,ref=neondatabase/build-tools:cache-${{ matrix.arch }} + cache-to: type=registry,ref=neondatabase/build-tools:cache-${{ matrix.arch }},mode=max + tags: neondatabase/build-tools:${{ inputs.image-tag }}-${{ matrix.arch }} + + - name: Remove custom docker config directory + run: | + rm -rf /tmp/.docker-custom + + merge-images: + needs: [ build-image ] + runs-on: ubuntu-latest + + env: + IMAGE_TAG: ${{ inputs.image-tag }} + + steps: + - uses: docker/login-action@v3 + with: + username: ${{ secrets.NEON_DOCKERHUB_USERNAME }} + password: ${{ secrets.NEON_DOCKERHUB_PASSWORD }} + + - name: Create multi-arch image + run: | + docker buildx imagetools create -t neondatabase/build-tools:${IMAGE_TAG} \ + neondatabase/build-tools:${IMAGE_TAG}-x64 \ + neondatabase/build-tools:${IMAGE_TAG}-arm64 diff --git a/.github/workflows/build_and_push_docker_image.yml b/.github/workflows/build_and_push_docker_image.yml deleted file mode 100644 index e401b2f418..0000000000 --- a/.github/workflows/build_and_push_docker_image.yml +++ /dev/null @@ -1,105 +0,0 @@ -name: Build and Push Docker Image - -on: - workflow_call: - inputs: - dockerfile-path: - required: true - type: string - image-name: - required: true - type: string - outputs: - build-tools-tag: - description: "tag generated for build tools" - value: ${{ jobs.tag.outputs.build-tools-tag }} - -jobs: - check-if-build-tools-dockerfile-changed: - runs-on: ubuntu-latest - outputs: - docker_file_changed: ${{ steps.dockerfile.outputs.docker_file_changed }} - steps: - - name: Check if Dockerfile.buildtools has changed - id: dockerfile - run: | - if [[ "$GITHUB_EVENT_NAME" != "pull_request" ]]; then - echo "docker_file_changed=false" >> $GITHUB_OUTPUT - exit - fi - updated_files=$(gh pr --repo neondatabase/neon diff ${{ github.event.pull_request.number }} --name-only) - if [[ $updated_files == *"Dockerfile.buildtools"* ]]; then - echo 
"docker_file_changed=true" >> $GITHUB_OUTPUT - fi - env: - GH_TOKEN: ${{ secrets.GITHUB_TOKEN }} - - tag: - runs-on: ubuntu-latest - needs: [ check-if-build-tools-dockerfile-changed ] - outputs: - build-tools-tag: ${{steps.buildtools-tag.outputs.image_tag}} - - steps: - - name: Get buildtools tag - env: - DOCKERFILE_CHANGED: ${{ needs.check-if-build-tools-dockerfile-changed.outputs.docker_file_changed }} - run: | - if [[ "$GITHUB_EVENT_NAME" == "pull_request" ]] && [[ "${DOCKERFILE_CHANGED}" == "true" ]]; then - IMAGE_TAG=$GITHUB_RUN_ID - else - IMAGE_TAG=pinned - fi - - echo "image_tag=${IMAGE_TAG}" >> $GITHUB_OUTPUT - shell: bash - id: buildtools-tag - - kaniko: - if: needs.check-if-build-tools-dockerfile-changed.outputs.docker_file_changed == 'true' - needs: [ tag, check-if-build-tools-dockerfile-changed ] - runs-on: [ self-hosted, dev, x64 ] - container: gcr.io/kaniko-project/executor:v1.7.0-debug - - steps: - - name: Checkout - uses: actions/checkout@v1 - - - name: Configure ECR login - run: echo "{\"credsStore\":\"ecr-login\"}" > /kaniko/.docker/config.json - - - name: Kaniko build - run: /kaniko/executor --reproducible --snapshotMode=redo --skip-unused-stages --dockerfile ${{ inputs.dockerfile-path }} --cache=true --cache-repo 369495373322.dkr.ecr.eu-central-1.amazonaws.com/cache --destination 369495373322.dkr.ecr.eu-central-1.amazonaws.com/${{ inputs.image-name }}:${{ needs.tag.outputs.build-tools-tag }}-amd64 - - kaniko-arm: - if: needs.check-if-build-tools-dockerfile-changed.outputs.docker_file_changed == 'true' - needs: [ tag, check-if-build-tools-dockerfile-changed ] - runs-on: [ self-hosted, dev, arm64 ] - container: gcr.io/kaniko-project/executor:v1.7.0-debug - - steps: - - name: Checkout - uses: actions/checkout@v1 - - - name: Configure ECR login - run: echo "{\"credsStore\":\"ecr-login\"}" > /kaniko/.docker/config.json - - - name: Kaniko build - run: /kaniko/executor --reproducible --snapshotMode=redo --skip-unused-stages --dockerfile ${{ 
inputs.dockerfile-path }} --cache=true --cache-repo 369495373322.dkr.ecr.eu-central-1.amazonaws.com/cache --destination 369495373322.dkr.ecr.eu-central-1.amazonaws.com/${{ inputs.image-name }}:${{ needs.tag.outputs.build-tools-tag }}-arm64 - - manifest: - if: needs.check-if-build-tools-dockerfile-changed.outputs.docker_file_changed == 'true' - name: 'manifest' - runs-on: [ self-hosted, dev, x64 ] - needs: - - tag - - kaniko - - kaniko-arm - - check-if-build-tools-dockerfile-changed - - steps: - - name: Create manifest - run: docker manifest create 369495373322.dkr.ecr.eu-central-1.amazonaws.com/${{ inputs.image-name }}:${{ needs.tag.outputs.build-tools-tag }} --amend 369495373322.dkr.ecr.eu-central-1.amazonaws.com/${{ inputs.image-name }}:${{ needs.tag.outputs.build-tools-tag }}-amd64 --amend 369495373322.dkr.ecr.eu-central-1.amazonaws.com/${{ inputs.image-name }}:${{ needs.tag.outputs.build-tools-tag }}-arm64 - - - name: Push manifest - run: docker manifest push 369495373322.dkr.ecr.eu-central-1.amazonaws.com/${{ inputs.image-name }}:${{ needs.tag.outputs.build-tools-tag }} diff --git a/.github/workflows/build_and_test.yml b/.github/workflows/build_and_test.yml index 7445501f00..810c61de2d 100644 --- a/.github/workflows/build_and_test.yml +++ b/.github/workflows/build_and_test.yml @@ -5,6 +5,7 @@ on: branches: - main - release + - release-proxy pull_request: defaults: @@ -21,31 +22,15 @@ env: COPT: '-Werror' AWS_ACCESS_KEY_ID: ${{ secrets.AWS_ACCESS_KEY_DEV }} AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_SECRET_KEY_DEV }} - NEXTEST_RETRIES: 3 # A concurrency group that we use for e2e-tests runs, matches `concurrency.group` above with `github.repository` as a prefix - E2E_CONCURRENCY_GROUP: ${{ github.repository }}-${{ github.workflow }}-${{ github.ref_name }}-${{ github.ref_name == 'main' && github.sha || 'anysha' }} + E2E_CONCURRENCY_GROUP: ${{ github.repository }}-e2e-tests-${{ github.ref_name }}-${{ github.ref_name == 'main' && github.sha || 'anysha' }} jobs: 
check-permissions: - runs-on: ubuntu-latest - - steps: - - name: Disallow PRs from forks - if: | - github.event_name == 'pull_request' && - github.event.pull_request.head.repo.full_name != github.repository - - run: | - if [ "${{ contains(fromJSON('["OWNER", "MEMBER", "COLLABORATOR"]'), github.event.pull_request.author_association) }}" = "true" ]; then - MESSAGE="Please create a PR from a branch of ${GITHUB_REPOSITORY} instead of a fork" - else - MESSAGE="The PR should be reviewed and labelled with 'approved-for-ci-run' to trigger a CI run" - fi - - echo >&2 "We don't run CI for PRs from forks" - echo >&2 "${MESSAGE}" - - exit 1 + if: ${{ !contains(github.event.pull_request.labels.*.name, 'run-no-ci') }} + uses: ./.github/workflows/check-permissions.yml + with: + github-event-name: ${{ github.event_name}} cancel-previous-e2e-tests: needs: [ check-permissions ] @@ -70,7 +55,7 @@ jobs: steps: - name: Checkout - uses: actions/checkout@v3 + uses: actions/checkout@v4 with: fetch-depth: 0 @@ -83,6 +68,8 @@ jobs: echo "tag=$(git rev-list --count HEAD)" >> $GITHUB_OUTPUT elif [[ "$GITHUB_REF_NAME" == "release" ]]; then echo "tag=release-$(git rev-list --count HEAD)" >> $GITHUB_OUTPUT + elif [[ "$GITHUB_REF_NAME" == "release-proxy" ]]; then + echo "tag=release-proxy-$(git rev-list --count HEAD)" >> $GITHUB_OUTPUT else echo "GITHUB_REF_NAME (value '$GITHUB_REF_NAME') is not set to either 'main' or 'release'" echo "tag=$GITHUB_RUN_ID" >> $GITHUB_OUTPUT @@ -90,34 +77,39 @@ jobs: shell: bash id: build-tag - build-buildtools-image: + check-build-tools-image: needs: [ check-permissions ] - uses: ./.github/workflows/build_and_push_docker_image.yml + uses: ./.github/workflows/check-build-tools-image.yml + + build-build-tools-image: + needs: [ check-build-tools-image ] + uses: ./.github/workflows/build-build-tools-image.yml with: - dockerfile-path: Dockerfile.buildtools - image-name: build-tools + image-tag: ${{ needs.check-build-tools-image.outputs.image-tag }} secrets: inherit 
check-codestyle-python: - needs: [ check-permissions, build-buildtools-image ] + needs: [ check-permissions, build-build-tools-image ] runs-on: [ self-hosted, gen3, small ] container: - image: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/build-tools:${{ needs.build-buildtools-image.outputs.build-tools-tag }} + image: ${{ needs.build-build-tools-image.outputs.image }} + credentials: + username: ${{ secrets.NEON_DOCKERHUB_USERNAME }} + password: ${{ secrets.NEON_DOCKERHUB_PASSWORD }} options: --init steps: - name: Checkout - uses: actions/checkout@v3 + uses: actions/checkout@v4 with: submodules: false fetch-depth: 1 - name: Cache poetry deps - id: cache_poetry - uses: actions/cache@v3 + uses: actions/cache@v4 with: path: ~/.cache/pypoetry/virtualenvs - key: v1-codestyle-python-deps-${{ hashFiles('poetry.lock') }} + key: v2-${{ runner.os }}-python-deps-${{ hashFiles('poetry.lock') }} - name: Install Python deps run: ./scripts/pysync @@ -132,15 +124,18 @@ jobs: run: poetry run mypy . check-codestyle-rust: - needs: [ check-permissions, build-buildtools-image ] - runs-on: [ self-hosted, gen3, large ] + needs: [ check-permissions, build-build-tools-image ] + runs-on: [ self-hosted, gen3, small ] container: - image: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/build-tools:${{ needs.build-buildtools-image.outputs.build-tools-tag }} + image: ${{ needs.build-build-tools-image.outputs.image }} + credentials: + username: ${{ secrets.NEON_DOCKERHUB_USERNAME }} + password: ${{ secrets.NEON_DOCKERHUB_PASSWORD }} options: --init steps: - name: Checkout - uses: actions/checkout@v3 + uses: actions/checkout@v4 with: submodules: true fetch-depth: 1 @@ -148,7 +143,7 @@ jobs: # Disabled for now # - name: Restore cargo deps cache # id: cache_cargo -# uses: actions/cache@v3 +# uses: actions/cache@v4 # with: # path: | # !~/.cargo/registry/src @@ -199,10 +194,13 @@ jobs: run: cargo deny check --hide-inclusion-graph build-neon: - needs: [ check-permissions, tag, build-buildtools-image ] 
+ needs: [ check-permissions, tag, build-build-tools-image ] runs-on: [ self-hosted, gen3, large ] container: - image: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/build-tools:${{ needs.build-buildtools-image.outputs.build-tools-tag }} + image: ${{ needs.build-build-tools-image.outputs.image }} + credentials: + username: ${{ secrets.NEON_DOCKERHUB_USERNAME }} + password: ${{ secrets.NEON_DOCKERHUB_PASSWORD }} # Raise locked memory limit for tokio-epoll-uring. # On 5.10 LTS kernels < 5.10.162 (and generally mainline kernels < 5.12), # io_uring will account the memory of the CQ and SQ as locked. @@ -233,7 +231,7 @@ jobs: done - name: Checkout - uses: actions/checkout@v3 + uses: actions/checkout@v4 with: submodules: true fetch-depth: 1 @@ -255,7 +253,7 @@ jobs: done if [ "${FAILED}" = "true" ]; then - echo >&2 "Please update vendors/revisions.json if these changes are intentional" + echo >&2 "Please update vendor/revisions.json if these changes are intentional" exit 1 fi @@ -305,7 +303,7 @@ jobs: # compressed crates. 
# - name: Cache cargo deps # id: cache_cargo -# uses: actions/cache@v3 +# uses: actions/cache@v4 # with: # path: | # ~/.cargo/registry/ @@ -319,21 +317,21 @@ jobs: - name: Cache postgres v14 build id: cache_pg_14 - uses: actions/cache@v3 + uses: actions/cache@v4 with: path: pg_install/v14 key: v1-${{ runner.os }}-${{ matrix.build_type }}-pg-${{ steps.pg_v14_rev.outputs.pg_rev }}-${{ hashFiles('Makefile') }} - name: Cache postgres v15 build id: cache_pg_15 - uses: actions/cache@v3 + uses: actions/cache@v4 with: path: pg_install/v15 key: v1-${{ runner.os }}-${{ matrix.build_type }}-pg-${{ steps.pg_v15_rev.outputs.pg_rev }}-${{ hashFiles('Makefile') }} - name: Cache postgres v16 build id: cache_pg_16 - uses: actions/cache@v3 + uses: actions/cache@v4 with: path: pg_install/v16 key: v1-${{ runner.os }}-${{ matrix.build_type }}-pg-${{ steps.pg_v16_rev.outputs.pg_rev }}-${{ hashFiles('Makefile') }} @@ -361,6 +359,8 @@ jobs: ${cov_prefix} mold -run cargo build $CARGO_FLAGS $CARGO_FEATURES --bins --tests - name: Run rust tests + env: + NEXTEST_RETRIES: 3 run: | for io_engine in std-fs tokio-epoll-uring ; do NEON_PAGESERVER_UNIT_TEST_VIRTUAL_FILE_IOENGINE=$io_engine ${cov_prefix} cargo nextest run $CARGO_FLAGS $CARGO_FEATURES @@ -438,10 +438,13 @@ jobs: uses: ./.github/actions/save-coverage-data regress-tests: - needs: [ check-permissions, build-neon, build-buildtools-image, tag ] + needs: [ check-permissions, build-neon, build-build-tools-image, tag ] runs-on: [ self-hosted, gen3, large ] container: - image: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/build-tools:${{ needs.build-buildtools-image.outputs.build-tools-tag }} + image: ${{ needs.build-build-tools-image.outputs.image }} + credentials: + username: ${{ secrets.NEON_DOCKERHUB_USERNAME }} + password: ${{ secrets.NEON_DOCKERHUB_PASSWORD }} # for changed limits, see comments on `options:` earlier in this file options: --init --shm-size=512mb --ulimit memlock=67108864:67108864 strategy: @@ -451,7 +454,7 @@ jobs: 
pg_version: [ v14, v15, v16 ] steps: - name: Checkout - uses: actions/checkout@v3 + uses: actions/checkout@v4 with: submodules: true fetch-depth: 1 @@ -472,16 +475,59 @@ jobs: CHECK_ONDISK_DATA_COMPATIBILITY: nonempty BUILD_TAG: ${{ needs.tag.outputs.build-tag }} PAGESERVER_VIRTUAL_FILE_IO_ENGINE: tokio-epoll-uring + PAGESERVER_GET_VECTORED_IMPL: vectored + # Temporarily disable this step until we figure out why it's so flaky + # Ref https://github.com/neondatabase/neon/issues/4540 - name: Merge and upload coverage data - if: matrix.build_type == 'debug' && matrix.pg_version == 'v14' + if: | + false && + matrix.build_type == 'debug' && matrix.pg_version == 'v14' uses: ./.github/actions/save-coverage-data - benchmarks: - needs: [ check-permissions, build-neon, build-buildtools-image ] + get-benchmarks-durations: + outputs: + json: ${{ steps.get-benchmark-durations.outputs.json }} + needs: [ check-permissions, build-build-tools-image ] runs-on: [ self-hosted, gen3, small ] container: - image: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/build-tools:${{ needs.build-buildtools-image.outputs.build-tools-tag }} + image: ${{ needs.build-build-tools-image.outputs.image }} + credentials: + username: ${{ secrets.NEON_DOCKERHUB_USERNAME }} + password: ${{ secrets.NEON_DOCKERHUB_PASSWORD }} + options: --init + if: github.ref_name == 'main' || contains(github.event.pull_request.labels.*.name, 'run-benchmarks') + steps: + - name: Checkout + uses: actions/checkout@v4 + + - name: Cache poetry deps + uses: actions/cache@v4 + with: + path: ~/.cache/pypoetry/virtualenvs + key: v2-${{ runner.os }}-python-deps-${{ hashFiles('poetry.lock') }} + + - name: Install Python deps + run: ./scripts/pysync + + - name: get benchmark durations + id: get-benchmark-durations + env: + TEST_RESULT_CONNSTR: ${{ secrets.REGRESS_TEST_RESULT_CONNSTR_NEW }} + run: | + poetry run ./scripts/benchmark_durations.py "${TEST_RESULT_CONNSTR}" \ + --days 10 \ + --output /tmp/benchmark_durations.json + echo 
"json=$(jq --compact-output '.' /tmp/benchmark_durations.json)" >> $GITHUB_OUTPUT + + benchmarks: + needs: [ check-permissions, build-neon, build-build-tools-image, get-benchmarks-durations ] + runs-on: [ self-hosted, gen3, small ] + container: + image: ${{ needs.build-build-tools-image.outputs.image }} + credentials: + username: ${{ secrets.NEON_DOCKERHUB_USERNAME }} + password: ${{ secrets.NEON_DOCKERHUB_PASSWORD }} # for changed limits, see comments on `options:` earlier in this file options: --init --shm-size=512mb --ulimit memlock=67108864:67108864 if: github.ref_name == 'main' || contains(github.event.pull_request.labels.*.name, 'run-benchmarks') @@ -489,11 +535,11 @@ jobs: fail-fast: false matrix: # the amount of groups (N) should be reflected in `extra_params: --splits N ...` - pytest_split_group: [ 1, 2, 3, 4 ] + pytest_split_group: [ 1, 2, 3, 4, 5 ] build_type: [ release ] steps: - name: Checkout - uses: actions/checkout@v3 + uses: actions/checkout@v4 - name: Pytest benchmarks uses: ./.github/actions/run-python-test-set @@ -502,7 +548,8 @@ jobs: test_selection: performance run_in_parallel: false save_perf_report: ${{ github.ref_name == 'main' }} - extra_params: --splits 4 --group ${{ matrix.pytest_split_group }} + extra_params: --splits 5 --group ${{ matrix.pytest_split_group }} + benchmark_durations: ${{ needs.get-benchmarks-durations.outputs.json }} env: VIP_VAP_ACCESS_TOKEN: "${{ secrets.VIP_VAP_ACCESS_TOKEN }}" PERF_TEST_RESULT_CONNSTR: "${{ secrets.PERF_TEST_RESULT_CONNSTR }}" @@ -512,16 +559,19 @@ jobs: # while coverage is currently collected for the debug ones create-test-report: - needs: [ check-permissions, regress-tests, coverage-report, benchmarks, build-buildtools-image ] + needs: [ check-permissions, regress-tests, coverage-report, benchmarks, build-build-tools-image ] if: ${{ !cancelled() && contains(fromJSON('["skipped", "success"]'), needs.check-permissions.result) }} runs-on: [ self-hosted, gen3, small ] container: - image: 
369495373322.dkr.ecr.eu-central-1.amazonaws.com/build-tools:${{ needs.build-buildtools-image.outputs.build-tools-tag }} + image: ${{ needs.build-build-tools-image.outputs.image }} + credentials: + username: ${{ secrets.NEON_DOCKERHUB_USERNAME }} + password: ${{ secrets.NEON_DOCKERHUB_PASSWORD }} options: --init steps: - - uses: actions/checkout@v3 + - uses: actions/checkout@v4 - name: Create Allure report if: ${{ !cancelled() }} @@ -530,10 +580,9 @@ jobs: with: store-test-results-into-db: true env: - REGRESS_TEST_RESULT_CONNSTR: ${{ secrets.REGRESS_TEST_RESULT_CONNSTR }} REGRESS_TEST_RESULT_CONNSTR_NEW: ${{ secrets.REGRESS_TEST_RESULT_CONNSTR_NEW }} - - uses: actions/github-script@v6 + - uses: actions/github-script@v7 if: ${{ !cancelled() }} with: # Retry script for 5XX server errors: https://github.com/actions/github-script#retries @@ -559,10 +608,13 @@ jobs: }) coverage-report: - needs: [ check-permissions, regress-tests, build-buildtools-image ] + needs: [ check-permissions, regress-tests, build-build-tools-image ] runs-on: [ self-hosted, gen3, small ] container: - image: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/build-tools:${{ needs.build-buildtools-image.outputs.build-tools-tag }} + image: ${{ needs.build-build-tools-image.outputs.image }} + credentials: + username: ${{ secrets.NEON_DOCKERHUB_USERNAME }} + password: ${{ secrets.NEON_DOCKERHUB_PASSWORD }} options: --init strategy: fail-fast: false @@ -573,7 +625,7 @@ jobs: coverage-json: ${{ steps.upload-coverage-report-new.outputs.summary-json }} steps: - name: Checkout - uses: actions/checkout@v3 + uses: actions/checkout@v4 with: submodules: true fetch-depth: 0 @@ -608,17 +660,6 @@ jobs: --input-objects=/tmp/coverage/binaries.list \ --format=lcov - - name: Upload coverage report - id: upload-coverage-report - env: - BUCKET: neon-github-public-dev - COMMIT_SHA: ${{ github.event.pull_request.head.sha || github.sha }} - run: | - aws s3 cp --only-show-errors --recursive /tmp/coverage/report 
s3://${BUCKET}/code-coverage/${COMMIT_SHA} - - REPORT_URL=https://${BUCKET}.s3.amazonaws.com/code-coverage/${COMMIT_SHA}/index.html - echo "report-url=${REPORT_URL}" >> $GITHUB_OUTPUT - - name: Build coverage report NEW id: upload-coverage-report-new env: @@ -653,23 +694,13 @@ jobs: REPORT_URL=https://${BUCKET}.s3.amazonaws.com/code-coverage/${COMMIT_SHA}/lcov/summary.json echo "summary-json=${REPORT_URL}" >> $GITHUB_OUTPUT - - uses: actions/github-script@v6 + - uses: actions/github-script@v7 env: - REPORT_URL: ${{ steps.upload-coverage-report.outputs.report-url }} REPORT_URL_NEW: ${{ steps.upload-coverage-report-new.outputs.report-url }} COMMIT_SHA: ${{ github.event.pull_request.head.sha || github.sha }} with: script: | - const { REPORT_URL, REPORT_URL_NEW, COMMIT_SHA } = process.env - - await github.rest.repos.createCommitStatus({ - owner: context.repo.owner, - repo: context.repo.repo, - sha: `${COMMIT_SHA}`, - state: 'success', - target_url: `${REPORT_URL}`, - context: 'Code coverage report', - }) + const { REPORT_URL_NEW, COMMIT_SHA } = process.env await github.rest.repos.createCommitStatus({ owner: context.repo.owner, @@ -681,206 +712,146 @@ jobs: }) trigger-e2e-tests: + if: ${{ !github.event.pull_request.draft || contains( github.event.pull_request.labels.*.name, 'run-e2e-tests-in-draft') || github.ref_name == 'main' || github.ref_name == 'release' || github.ref_name == 'release-proxy' }} needs: [ check-permissions, promote-images, tag ] - runs-on: [ self-hosted, gen3, small ] - container: - image: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/base:pinned - options: --init - steps: - - name: Set PR's status to pending and request a remote CI test - run: | - # For pull requests, GH Actions set "github.sha" variable to point at a fake merge commit - # but we need to use a real sha of a latest commit in the PR's branch for the e2e job, - # to place a job run status update later. 
- COMMIT_SHA=${{ github.event.pull_request.head.sha }} - # For non-PR kinds of runs, the above will produce an empty variable, pick the original sha value for those - COMMIT_SHA=${COMMIT_SHA:-${{ github.sha }}} - - REMOTE_REPO="${{ github.repository_owner }}/cloud" - - curl -f -X POST \ - https://api.github.com/repos/${{ github.repository }}/statuses/$COMMIT_SHA \ - -H "Accept: application/vnd.github.v3+json" \ - --user "${{ secrets.CI_ACCESS_TOKEN }}" \ - --data \ - "{ - \"state\": \"pending\", - \"context\": \"neon-cloud-e2e\", - \"description\": \"[$REMOTE_REPO] Remote CI job is about to start\" - }" - - curl -f -X POST \ - https://api.github.com/repos/$REMOTE_REPO/actions/workflows/testing.yml/dispatches \ - -H "Accept: application/vnd.github.v3+json" \ - --user "${{ secrets.CI_ACCESS_TOKEN }}" \ - --data \ - "{ - \"ref\": \"main\", - \"inputs\": { - \"ci_job_name\": \"neon-cloud-e2e\", - \"commit_hash\": \"$COMMIT_SHA\", - \"remote_repo\": \"${{ github.repository }}\", - \"storage_image_tag\": \"${{ needs.tag.outputs.build-tag }}\", - \"compute_image_tag\": \"${{ needs.tag.outputs.build-tag }}\", - \"concurrency_group\": \"${{ env.E2E_CONCURRENCY_GROUP }}\" - } - }" + uses: ./.github/workflows/trigger-e2e-tests.yml + secrets: inherit neon-image: - needs: [ check-permissions, build-buildtools-image, tag ] + needs: [ check-permissions, build-build-tools-image, tag ] runs-on: [ self-hosted, gen3, large ] - container: gcr.io/kaniko-project/executor:v1.9.2-debug - defaults: - run: - shell: sh -eu {0} steps: - name: Checkout - uses: actions/checkout@v1 # v3 won't work with kaniko + uses: actions/checkout@v4 with: submodules: true fetch-depth: 0 - - name: Configure ECR and Docker Hub login + # Use custom DOCKER_CONFIG directory to avoid conflicts with default settings + # The default value is ~/.docker + - name: Set custom docker config directory run: | - DOCKERHUB_AUTH=$(echo -n "${{ secrets.NEON_DOCKERHUB_USERNAME }}:${{ secrets.NEON_DOCKERHUB_PASSWORD }}" | 
base64) - echo "::add-mask::${DOCKERHUB_AUTH}" + mkdir -p .docker-custom + echo DOCKER_CONFIG=$(pwd)/.docker-custom >> $GITHUB_ENV + - uses: docker/setup-buildx-action@v3 - cat <<-EOF > /kaniko/.docker/config.json - { - "auths": { - "https://index.docker.io/v1/": { - "auth": "${DOCKERHUB_AUTH}" - } - }, - "credHelpers": { - "369495373322.dkr.ecr.eu-central-1.amazonaws.com": "ecr-login" - } - } - EOF + - uses: docker/login-action@v3 + with: + username: ${{ secrets.NEON_DOCKERHUB_USERNAME }} + password: ${{ secrets.NEON_DOCKERHUB_PASSWORD }} - - name: Kaniko build neon - run: - /kaniko/executor --reproducible --snapshot-mode=redo --skip-unused-stages --cache=true - --cache-repo 369495373322.dkr.ecr.eu-central-1.amazonaws.com/cache - --context . - --build-arg GIT_VERSION=${{ github.event.pull_request.head.sha || github.sha }} - --build-arg BUILD_TAG=${{ needs.tag.outputs.build-tag }} - --build-arg TAG=${{ needs.build-buildtools-image.outputs.build-tools-tag }} - --build-arg REPOSITORY=369495373322.dkr.ecr.eu-central-1.amazonaws.com - --destination 369495373322.dkr.ecr.eu-central-1.amazonaws.com/neon:${{needs.tag.outputs.build-tag}} - --destination neondatabase/neon:${{needs.tag.outputs.build-tag}} + - uses: docker/login-action@v3 + with: + registry: 369495373322.dkr.ecr.eu-central-1.amazonaws.com + username: ${{ secrets.AWS_ACCESS_KEY_DEV }} + password: ${{ secrets.AWS_SECRET_KEY_DEV }} - # Cleanup script fails otherwise - rm: cannot remove '/nvme/actions-runner/_work/_temp/_github_home/.ecr': Permission denied - - name: Cleanup ECR folder - run: rm -rf ~/.ecr + - uses: docker/build-push-action@v5 + with: + context: . 
+ build-args: | + GIT_VERSION=${{ github.event.pull_request.head.sha || github.sha }} + BUILD_TAG=${{ needs.tag.outputs.build-tag }} + TAG=${{ needs.build-build-tools-image.outputs.image-tag }} + provenance: false + push: true + pull: true + file: Dockerfile + cache-from: type=registry,ref=neondatabase/neon:cache + cache-to: type=registry,ref=neondatabase/neon:cache,mode=max + tags: | + 369495373322.dkr.ecr.eu-central-1.amazonaws.com/neon:${{needs.tag.outputs.build-tag}} + neondatabase/neon:${{needs.tag.outputs.build-tag}} - compute-tools-image: - runs-on: [ self-hosted, gen3, large ] - needs: [ check-permissions, build-buildtools-image, tag ] - container: gcr.io/kaniko-project/executor:v1.9.2-debug - defaults: - run: - shell: sh -eu {0} - - steps: - - name: Checkout - uses: actions/checkout@v1 # v3 won't work with kaniko - - - name: Configure ECR and Docker Hub login + - name: Remove custom docker config directory + if: always() run: | - DOCKERHUB_AUTH=$(echo -n "${{ secrets.NEON_DOCKERHUB_USERNAME }}:${{ secrets.NEON_DOCKERHUB_PASSWORD }}" | base64) - echo "::add-mask::${DOCKERHUB_AUTH}" - - cat <<-EOF > /kaniko/.docker/config.json - { - "auths": { - "https://index.docker.io/v1/": { - "auth": "${DOCKERHUB_AUTH}" - } - }, - "credHelpers": { - "369495373322.dkr.ecr.eu-central-1.amazonaws.com": "ecr-login" - } - } - EOF - - - name: Kaniko build compute tools - run: - /kaniko/executor --reproducible --snapshot-mode=redo --skip-unused-stages --cache=true - --cache-repo 369495373322.dkr.ecr.eu-central-1.amazonaws.com/cache - --context . 
- --build-arg GIT_VERSION=${{ github.event.pull_request.head.sha || github.sha }} - --build-arg BUILD_TAG=${{needs.tag.outputs.build-tag}} - --build-arg TAG=${{needs.build-buildtools-image.outputs.build-tools-tag}} - --build-arg REPOSITORY=369495373322.dkr.ecr.eu-central-1.amazonaws.com - --dockerfile Dockerfile.compute-tools - --destination 369495373322.dkr.ecr.eu-central-1.amazonaws.com/compute-tools:${{needs.tag.outputs.build-tag}} - --destination neondatabase/compute-tools:${{needs.tag.outputs.build-tag}} - - # Cleanup script fails otherwise - rm: cannot remove '/nvme/actions-runner/_work/_temp/_github_home/.ecr': Permission denied - - name: Cleanup ECR folder - run: rm -rf ~/.ecr + rm -rf .docker-custom compute-node-image: - needs: [ check-permissions, build-buildtools-image, tag ] + needs: [ check-permissions, build-build-tools-image, tag ] runs-on: [ self-hosted, gen3, large ] - container: - image: gcr.io/kaniko-project/executor:v1.9.2-debug - # Workaround for "Resolving download.osgeo.org (download.osgeo.org)... 
failed: Temporary failure in name resolution."" - # Should be prevented by https://github.com/neondatabase/neon/issues/4281 - options: --add-host=download.osgeo.org:140.211.15.30 + strategy: fail-fast: false matrix: version: [ v14, v15, v16 ] - defaults: - run: - shell: sh -eu {0} steps: - name: Checkout - uses: actions/checkout@v1 # v3 won't work with kaniko + uses: actions/checkout@v4 with: submodules: true fetch-depth: 0 - - name: Configure ECR and Docker Hub login + # Use custom DOCKER_CONFIG directory to avoid conflicts with default settings + # The default value is ~/.docker + - name: Set custom docker config directory run: | - DOCKERHUB_AUTH=$(echo -n "${{ secrets.NEON_DOCKERHUB_USERNAME }}:${{ secrets.NEON_DOCKERHUB_PASSWORD }}" | base64) - echo "::add-mask::${DOCKERHUB_AUTH}" + mkdir -p .docker-custom + echo DOCKER_CONFIG=$(pwd)/.docker-custom >> $GITHUB_ENV + - uses: docker/setup-buildx-action@v3 + with: + # Disable parallelism for docker buildkit. + # As we already build everything with `make -j$(nproc)`, running it in additional level of parallelisam blows up the Runner. + config-inline: | + [worker.oci] + max-parallelism = 1 - cat <<-EOF > /kaniko/.docker/config.json - { - "auths": { - "https://index.docker.io/v1/": { - "auth": "${DOCKERHUB_AUTH}" - } - }, - "credHelpers": { - "369495373322.dkr.ecr.eu-central-1.amazonaws.com": "ecr-login" - } - } - EOF + - uses: docker/login-action@v3 + with: + username: ${{ secrets.NEON_DOCKERHUB_USERNAME }} + password: ${{ secrets.NEON_DOCKERHUB_PASSWORD }} - - name: Kaniko build compute node with extensions - run: - /kaniko/executor --reproducible --snapshot-mode=redo --skip-unused-stages --cache=true - --cache-repo 369495373322.dkr.ecr.eu-central-1.amazonaws.com/cache - --context . 
- --build-arg GIT_VERSION=${{ github.event.pull_request.head.sha || github.sha }} - --build-arg PG_VERSION=${{ matrix.version }} - --build-arg BUILD_TAG=${{needs.tag.outputs.build-tag}} - --build-arg TAG=${{needs.build-buildtools-image.outputs.build-tools-tag}} - --build-arg REPOSITORY=369495373322.dkr.ecr.eu-central-1.amazonaws.com - --dockerfile Dockerfile.compute-node - --destination 369495373322.dkr.ecr.eu-central-1.amazonaws.com/compute-node-${{ matrix.version }}:${{needs.tag.outputs.build-tag}} - --destination neondatabase/compute-node-${{ matrix.version }}:${{needs.tag.outputs.build-tag}} - --cleanup + - uses: docker/login-action@v3 + with: + registry: 369495373322.dkr.ecr.eu-central-1.amazonaws.com + username: ${{ secrets.AWS_ACCESS_KEY_DEV }} + password: ${{ secrets.AWS_SECRET_KEY_DEV }} - # Cleanup script fails otherwise - rm: cannot remove '/nvme/actions-runner/_work/_temp/_github_home/.ecr': Permission denied - - name: Cleanup ECR folder - run: rm -rf ~/.ecr + - name: Build compute-node image + uses: docker/build-push-action@v5 + with: + context: . 
+ build-args: | + GIT_VERSION=${{ github.event.pull_request.head.sha || github.sha }} + PG_VERSION=${{ matrix.version }} + BUILD_TAG=${{ needs.tag.outputs.build-tag }} + TAG=${{ needs.build-build-tools-image.outputs.image-tag }} + provenance: false + push: true + pull: true + file: Dockerfile.compute-node + cache-from: type=registry,ref=neondatabase/compute-node-${{ matrix.version }}:cache + cache-to: type=registry,ref=neondatabase/compute-node-${{ matrix.version }}:cache,mode=max + tags: | + 369495373322.dkr.ecr.eu-central-1.amazonaws.com/compute-node-${{ matrix.version }}:${{needs.tag.outputs.build-tag}} + neondatabase/compute-node-${{ matrix.version }}:${{needs.tag.outputs.build-tag}} + + - name: Build compute-tools image + # compute-tools are Postgres independent, so build it only once + if: ${{ matrix.version == 'v16' }} + uses: docker/build-push-action@v5 + with: + target: compute-tools-image + context: . + build-args: | + GIT_VERSION=${{ github.event.pull_request.head.sha || github.sha }} + BUILD_TAG=${{ needs.tag.outputs.build-tag }} + TAG=${{ needs.build-build-tools-image.outputs.image-tag }} + provenance: false + push: true + pull: true + file: Dockerfile.compute-node + tags: | + 369495373322.dkr.ecr.eu-central-1.amazonaws.com/compute-tools:${{ needs.tag.outputs.build-tag }} + neondatabase/compute-tools:${{ needs.tag.outputs.build-tag }} + + - name: Remove custom docker config directory + if: always() + run: | + rm -rf .docker-custom vm-compute-node-image: needs: [ check-permissions, tag, compute-node-image ] @@ -893,7 +864,7 @@ jobs: run: shell: sh -eu {0} env: - VM_BUILDER_VERSION: v0.21.0 + VM_BUILDER_VERSION: v0.23.2 steps: - name: Checkout @@ -924,12 +895,12 @@ jobs: docker push 369495373322.dkr.ecr.eu-central-1.amazonaws.com/vm-compute-node-${{ matrix.version }}:${{needs.tag.outputs.build-tag}} test-images: - needs: [ check-permissions, tag, neon-image, compute-node-image, compute-tools-image ] + needs: [ check-permissions, tag, neon-image, 
compute-node-image ] runs-on: [ self-hosted, gen3, small ] steps: - name: Checkout - uses: actions/checkout@v3 + uses: actions/checkout@v4 with: fetch-depth: 0 @@ -958,7 +929,8 @@ jobs: fi - name: Verify docker-compose example - run: env REPOSITORY=369495373322.dkr.ecr.eu-central-1.amazonaws.com TAG=${{needs.tag.outputs.build-tag}} ./docker-compose/docker_compose_test.sh + timeout-minutes: 20 + run: env TAG=${{needs.tag.outputs.build-tag}} ./docker-compose/docker_compose_test.sh - name: Print logs and clean up if: always() @@ -991,9 +963,7 @@ jobs: crane pull 369495373322.dkr.ecr.eu-central-1.amazonaws.com/vm-compute-node-v16:${{needs.tag.outputs.build-tag}} vm-compute-node-v16 - name: Add latest tag to images - if: | - (github.ref_name == 'main' || github.ref_name == 'release') && - github.event_name != 'workflow_dispatch' + if: github.ref_name == 'main' || github.ref_name == 'release' || github.ref_name == 'release-proxy' run: | crane tag 369495373322.dkr.ecr.eu-central-1.amazonaws.com/neon:${{needs.tag.outputs.build-tag}} latest crane tag 369495373322.dkr.ecr.eu-central-1.amazonaws.com/compute-tools:${{needs.tag.outputs.build-tag}} latest @@ -1005,9 +975,7 @@ jobs: crane tag 369495373322.dkr.ecr.eu-central-1.amazonaws.com/vm-compute-node-v16:${{needs.tag.outputs.build-tag}} latest - name: Push images to production ECR - if: | - (github.ref_name == 'main' || github.ref_name == 'release') && - github.event_name != 'workflow_dispatch' + if: github.ref_name == 'main' || github.ref_name == 'release'|| github.ref_name == 'release-proxy' run: | crane copy 369495373322.dkr.ecr.eu-central-1.amazonaws.com/neon:${{needs.tag.outputs.build-tag}} 093970136003.dkr.ecr.eu-central-1.amazonaws.com/neon:latest crane copy 369495373322.dkr.ecr.eu-central-1.amazonaws.com/compute-tools:${{needs.tag.outputs.build-tag}} 093970136003.dkr.ecr.eu-central-1.amazonaws.com/compute-tools:latest @@ -1031,9 +999,7 @@ jobs: crane push vm-compute-node-v16 
neondatabase/vm-compute-node-v16:${{needs.tag.outputs.build-tag}} - name: Push latest tags to Docker Hub - if: | - (github.ref_name == 'main' || github.ref_name == 'release') && - github.event_name != 'workflow_dispatch' + if: github.ref_name == 'main' || github.ref_name == 'release'|| github.ref_name == 'release-proxy' run: | crane tag neondatabase/neon:${{needs.tag.outputs.build-tag}} latest crane tag neondatabase/compute-tools:${{needs.tag.outputs.build-tag}} latest @@ -1123,7 +1089,7 @@ jobs: deploy: needs: [ check-permissions, promote-images, tag, regress-tests, trigger-custom-extensions-build-and-wait ] - if: ( github.ref_name == 'main' || github.ref_name == 'release' ) && github.event_name != 'workflow_dispatch' + if: github.ref_name == 'main' || github.ref_name == 'release'|| github.ref_name == 'release-proxy' runs-on: [ self-hosted, gen3, small ] container: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/ansible:latest @@ -1143,7 +1109,7 @@ jobs: done - name: Checkout - uses: actions/checkout@v3 + uses: actions/checkout@v4 with: submodules: false fetch-depth: 0 @@ -1158,15 +1124,27 @@ jobs: # TODO: move deployPreprodRegion to release (`"$GITHUB_REF_NAME" == "release"` block), once Staging support different compute tag prefixes for different regions gh workflow --repo neondatabase/aws run deploy-dev.yml --ref main -f branch=main -f dockerTag=${{needs.tag.outputs.build-tag}} -f deployPreprodRegion=true elif [[ "$GITHUB_REF_NAME" == "release" ]]; then - gh workflow --repo neondatabase/aws run deploy-prod.yml --ref main -f branch=main -f dockerTag=${{needs.tag.outputs.build-tag}} + gh workflow --repo neondatabase/aws run deploy-prod.yml --ref main \ + -f deployPgSniRouter=false \ + -f deployProxy=false \ + -f deployStorage=true \ + -f deployStorageBroker=true \ + -f branch=main \ + -f dockerTag=${{needs.tag.outputs.build-tag}} + elif [[ "$GITHUB_REF_NAME" == "release-proxy" ]]; then + gh workflow --repo neondatabase/aws run deploy-proxy-prod.yml --ref main \ + 
-f deployPgSniRouter=true \ + -f deployProxy=true \ + -f branch=main \ + -f dockerTag=${{needs.tag.outputs.build-tag}} else echo "GITHUB_REF_NAME (value '$GITHUB_REF_NAME') is not set to either 'main' or 'release'" exit 1 fi - name: Create git tag - if: github.ref_name == 'release' - uses: actions/github-script@v6 + if: github.ref_name == 'release' || github.ref_name == 'release-proxy' + uses: actions/github-script@v7 with: # Retry script for 5XX server errors: https://github.com/actions/github-script#retries retries: 5 @@ -1178,9 +1156,10 @@ jobs: sha: context.sha, }) + # TODO: check how GitHub releases looks for proxy releases and enable it if it's ok - name: Create GitHub release if: github.ref_name == 'release' - uses: actions/github-script@v6 + uses: actions/github-script@v7 with: # Retry script for 5XX server errors: https://github.com/actions/github-script#retries retries: 5 @@ -1229,3 +1208,11 @@ jobs: time aws s3 cp --only-show-errors s3://${BUCKET}/${S3_KEY} s3://${BUCKET}/${PREFIX}/${FILENAME} done + + pin-build-tools-image: + needs: [ build-build-tools-image, promote-images, regress-tests ] + if: github.ref_name == 'main' + uses: ./.github/workflows/pin-build-tools-image.yml + with: + from-tag: ${{ needs.build-build-tools-image.outputs.image-tag }} + secrets: inherit diff --git a/.github/workflows/check-build-tools-image.yml b/.github/workflows/check-build-tools-image.yml new file mode 100644 index 0000000000..28646dfc19 --- /dev/null +++ b/.github/workflows/check-build-tools-image.yml @@ -0,0 +1,58 @@ +name: Check build-tools image + +on: + workflow_call: + outputs: + image-tag: + description: "build-tools image tag" + value: ${{ jobs.check-image.outputs.tag }} + found: + description: "Whether the image is found in the registry" + value: ${{ jobs.check-image.outputs.found }} + +defaults: + run: + shell: bash -euo pipefail {0} + +# No permission for GITHUB_TOKEN by default; the **minimal required** set of permissions should be granted in each job. 
+permissions: {} + +jobs: + check-image: + runs-on: ubuntu-latest + outputs: + tag: ${{ steps.get-build-tools-tag.outputs.image-tag }} + found: ${{ steps.check-image.outputs.found }} + + steps: + - name: Get build-tools image tag for the current commit + id: get-build-tools-tag + env: + COMMIT_SHA: ${{ github.event.pull_request.head.sha || github.sha }} + GH_TOKEN: ${{ secrets.GITHUB_TOKEN }} + run: | + LAST_BUILD_TOOLS_SHA=$( + gh api \ + -H "Accept: application/vnd.github+json" \ + -H "X-GitHub-Api-Version: 2022-11-28" \ + --method GET \ + --field path=Dockerfile.build-tools \ + --field sha=${COMMIT_SHA} \ + --field per_page=1 \ + --jq ".[0].sha" \ + "/repos/${GITHUB_REPOSITORY}/commits" + ) + echo "image-tag=${LAST_BUILD_TOOLS_SHA}" | tee -a $GITHUB_OUTPUT + + - name: Check if such tag found in the registry + id: check-image + env: + IMAGE_TAG: ${{ steps.get-build-tools-tag.outputs.image-tag }} + run: | + if docker manifest inspect neondatabase/build-tools:${IMAGE_TAG}; then + found=true + else + found=false + fi + + echo "found=${found}" | tee -a $GITHUB_OUTPUT diff --git a/.github/workflows/check-permissions.yml b/.github/workflows/check-permissions.yml new file mode 100644 index 0000000000..c3357c6cf8 --- /dev/null +++ b/.github/workflows/check-permissions.yml @@ -0,0 +1,36 @@ +name: Check Permissions + +on: + workflow_call: + inputs: + github-event-name: + required: true + type: string + +defaults: + run: + shell: bash -euo pipefail {0} + +# No permission for GITHUB_TOKEN by default; the **minimal required** set of permissions should be granted in each job. 
+permissions: {} + +jobs: + check-permissions: + runs-on: ubuntu-latest + steps: + - name: Disallow CI runs on PRs from forks + if: | + inputs.github-event-name == 'pull_request' && + github.event.pull_request.head.repo.full_name != github.repository + run: | + if [ "${{ contains(fromJSON('["OWNER", "MEMBER", "COLLABORATOR"]'), github.event.pull_request.author_association) }}" = "true" ]; then + MESSAGE="Please create a PR from a branch of ${GITHUB_REPOSITORY} instead of a fork" + else + MESSAGE="The PR should be reviewed and labelled with 'approved-for-ci-run' to trigger a CI run" + fi + + # TODO: use actions/github-script to post this message as a PR comment + echo >&2 "We don't run CI for PRs from forks" + echo >&2 "${MESSAGE}" + + exit 1 diff --git a/.github/workflows/cleanup-caches-by-a-branch.yml b/.github/workflows/cleanup-caches-by-a-branch.yml new file mode 100644 index 0000000000..d8c225dedb --- /dev/null +++ b/.github/workflows/cleanup-caches-by-a-branch.yml @@ -0,0 +1,32 @@ +# A workflow from +# https://docs.github.com/en/actions/using-workflows/caching-dependencies-to-speed-up-workflows#force-deleting-cache-entries + +name: cleanup caches by a branch +on: + pull_request: + types: + - closed + +jobs: + cleanup: + runs-on: ubuntu-latest + steps: + - name: Cleanup + run: | + gh extension install actions/gh-actions-cache + + echo "Fetching list of cache key" + cacheKeysForPR=$(gh actions-cache list -R $REPO -B $BRANCH -L 100 | cut -f 1 ) + + ## Setting this to not fail the workflow while deleting cache keys. + set +e + echo "Deleting caches..." 
+ for cacheKey in $cacheKeysForPR + do + gh actions-cache delete $cacheKey -R $REPO -B $BRANCH --confirm + done + echo "Done" + env: + GH_TOKEN: ${{ secrets.GITHUB_TOKEN }} + REPO: ${{ github.repository }} + BRANCH: refs/pull/${{ github.event.pull_request.number }}/merge diff --git a/.github/workflows/neon_extra_builds.yml b/.github/workflows/neon_extra_builds.yml index c6c2b7386a..5a2f9d6645 100644 --- a/.github/workflows/neon_extra_builds.yml +++ b/.github/workflows/neon_extra_builds.yml @@ -20,13 +20,31 @@ env: COPT: '-Werror' jobs: + check-permissions: + if: ${{ !contains(github.event.pull_request.labels.*.name, 'run-no-ci') }} + uses: ./.github/workflows/check-permissions.yml + with: + github-event-name: ${{ github.event_name}} + + check-build-tools-image: + needs: [ check-permissions ] + uses: ./.github/workflows/check-build-tools-image.yml + + build-build-tools-image: + needs: [ check-build-tools-image ] + uses: ./.github/workflows/build-build-tools-image.yml + with: + image-tag: ${{ needs.check-build-tools-image.outputs.image-tag }} + secrets: inherit + check-macos-build: + needs: [ check-permissions ] if: | contains(github.event.pull_request.labels.*.name, 'run-extra-build-macos') || contains(github.event.pull_request.labels.*.name, 'run-extra-build-*') || github.ref_name == 'main' timeout-minutes: 90 - runs-on: macos-latest + runs-on: macos-14 env: # Use release build only, to have less debug info around @@ -57,24 +75,24 @@ jobs: - name: Cache postgres v14 build id: cache_pg_14 - uses: actions/cache@v3 + uses: actions/cache@v4 with: path: pg_install/v14 - key: v1-${{ runner.os }}-${{ env.BUILD_TYPE }}-pg-${{ steps.pg_v14_rev.outputs.pg_rev }}-${{ hashFiles('Makefile') }} + key: v1-${{ runner.os }}-${{ runner.arch }}-${{ env.BUILD_TYPE }}-pg-${{ steps.pg_v14_rev.outputs.pg_rev }}-${{ hashFiles('Makefile') }} - name: Cache postgres v15 build id: cache_pg_15 - uses: actions/cache@v3 + uses: actions/cache@v4 with: path: pg_install/v15 - key: v1-${{ runner.os 
}}-${{ env.BUILD_TYPE }}-pg-${{ steps.pg_v15_rev.outputs.pg_rev }}-${{ hashFiles('Makefile') }} + key: v1-${{ runner.os }}-${{ runner.arch }}-${{ env.BUILD_TYPE }}-pg-${{ steps.pg_v15_rev.outputs.pg_rev }}-${{ hashFiles('Makefile') }} - name: Cache postgres v16 build id: cache_pg_16 - uses: actions/cache@v3 + uses: actions/cache@v4 with: path: pg_install/v16 - key: v1-${{ runner.os }}-${{ env.BUILD_TYPE }}-pg-${{ steps.pg_v16_rev.outputs.pg_rev }}-${{ hashFiles('Makefile') }} + key: v1-${{ runner.os }}-${{ runner.arch }}-${{ env.BUILD_TYPE }}-pg-${{ steps.pg_v16_rev.outputs.pg_rev }}-${{ hashFiles('Makefile') }} - name: Set extra env for macOS run: | @@ -82,14 +100,14 @@ jobs: echo 'CPPFLAGS=-I/usr/local/opt/openssl@3/include' >> $GITHUB_ENV - name: Cache cargo deps - uses: actions/cache@v3 + uses: actions/cache@v4 with: path: | ~/.cargo/registry !~/.cargo/registry/src ~/.cargo/git target - key: v1-${{ runner.os }}-cargo-${{ hashFiles('./Cargo.lock') }}-${{ hashFiles('./rust-toolchain.toml') }}-rust + key: v1-${{ runner.os }}-${{ runner.arch }}-cargo-${{ hashFiles('./Cargo.lock') }}-${{ hashFiles('./rust-toolchain.toml') }}-rust - name: Build postgres v14 if: steps.cache_pg_14.outputs.cache-hit != 'true' @@ -110,12 +128,13 @@ jobs: run: make walproposer-lib -j$(sysctl -n hw.ncpu) - name: Run cargo build - run: cargo build --all --release + run: PQ_LIB_DIR=$(pwd)/pg_install/v16/lib cargo build --all --release - name: Check that no warnings are produced run: ./run_clippy.sh check-linux-arm-build: + needs: [ check-permissions, build-build-tools-image ] timeout-minutes: 90 runs-on: [ self-hosted, dev, arm64 ] @@ -124,12 +143,15 @@ jobs: # Hence keeping target/ (and general cache size) smaller BUILD_TYPE: release CARGO_FEATURES: --features testing - CARGO_FLAGS: --locked --release + CARGO_FLAGS: --release AWS_ACCESS_KEY_ID: ${{ secrets.AWS_ACCESS_KEY_DEV }} AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_SECRET_KEY_DEV }} container: - image: 
369495373322.dkr.ecr.eu-central-1.amazonaws.com/rust:pinned + image: ${{ needs.build-build-tools-image.outputs.image }} + credentials: + username: ${{ secrets.NEON_DOCKERHUB_USERNAME }} + password: ${{ secrets.NEON_DOCKERHUB_PASSWORD }} options: --init steps: @@ -171,21 +193,21 @@ jobs: - name: Cache postgres v14 build id: cache_pg_14 - uses: actions/cache@v3 + uses: actions/cache@v4 with: path: pg_install/v14 key: v1-${{ runner.os }}-${{ runner.arch }}-${{ env.BUILD_TYPE }}-pg-${{ steps.pg_v14_rev.outputs.pg_rev }}-${{ hashFiles('Makefile') }} - name: Cache postgres v15 build id: cache_pg_15 - uses: actions/cache@v3 + uses: actions/cache@v4 with: path: pg_install/v15 key: v1-${{ runner.os }}-${{ runner.arch }}-${{ env.BUILD_TYPE }}-pg-${{ steps.pg_v15_rev.outputs.pg_rev }}-${{ hashFiles('Makefile') }} - name: Cache postgres v16 build id: cache_pg_16 - uses: actions/cache@v3 + uses: actions/cache@v4 with: path: pg_install/v16 key: v1-${{ runner.os }}-${{ runner.arch }}-${{ env.BUILD_TYPE }}-pg-${{ steps.pg_v16_rev.outputs.pg_rev }}-${{ hashFiles('Makefile') }} @@ -210,18 +232,20 @@ jobs: - name: Run cargo build run: | - mold -run cargo build $CARGO_FLAGS $CARGO_FEATURES --bins --tests + mold -run cargo build --locked $CARGO_FLAGS $CARGO_FEATURES --bins --tests - name: Run cargo test + env: + NEXTEST_RETRIES: 3 run: | - cargo test $CARGO_FLAGS $CARGO_FEATURES + cargo nextest run $CARGO_FEATURES # Run separate tests for real S3 export ENABLE_REAL_S3_REMOTE_STORAGE=nonempty export REMOTE_STORAGE_S3_BUCKET=neon-github-ci-tests export REMOTE_STORAGE_S3_REGION=eu-central-1 # Avoid `$CARGO_FEATURES` since there's no `testing` feature in the e2e tests now - cargo test $CARGO_FLAGS --package remote_storage --test test_real_s3 + cargo nextest run --package remote_storage --test test_real_s3 # Run separate tests for real Azure Blob Storage # XXX: replace region with `eu-central-1`-like region @@ -231,14 +255,18 @@ jobs: export REMOTE_STORAGE_AZURE_CONTAINER="${{ 
vars.REMOTE_STORAGE_AZURE_CONTAINER }}" export REMOTE_STORAGE_AZURE_REGION="${{ vars.REMOTE_STORAGE_AZURE_REGION }}" # Avoid `$CARGO_FEATURES` since there's no `testing` feature in the e2e tests now - cargo test $CARGO_FLAGS --package remote_storage --test test_real_azure + cargo nextest run --package remote_storage --test test_real_azure check-codestyle-rust-arm: + needs: [ check-permissions, build-build-tools-image ] timeout-minutes: 90 runs-on: [ self-hosted, dev, arm64 ] container: - image: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/rust:pinned + image: ${{ needs.build-build-tools-image.outputs.image }} + credentials: + username: ${{ secrets.NEON_DOCKERHUB_USERNAME }} + password: ${{ secrets.NEON_DOCKERHUB_PASSWORD }} options: --init steps: @@ -305,13 +333,17 @@ jobs: run: cargo deny check gather-rust-build-stats: + needs: [ check-permissions, build-build-tools-image ] if: | contains(github.event.pull_request.labels.*.name, 'run-extra-build-stats') || contains(github.event.pull_request.labels.*.name, 'run-extra-build-*') || github.ref_name == 'main' runs-on: [ self-hosted, gen3, large ] container: - image: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/rust:pinned + image: ${{ needs.build-build-tools-image.outputs.image }} + credentials: + username: ${{ secrets.NEON_DOCKERHUB_USERNAME }} + password: ${{ secrets.NEON_DOCKERHUB_PASSWORD }} options: --init env: @@ -352,7 +384,7 @@ jobs: echo "report-url=${REPORT_URL}" >> $GITHUB_OUTPUT - name: Publish build stats report - uses: actions/github-script@v6 + uses: actions/github-script@v7 env: REPORT_URL: ${{ steps.upload-stats.outputs.report-url }} SHA: ${{ github.event.pull_request.head.sha || github.sha }} diff --git a/.github/workflows/pg_clients.yml b/.github/workflows/pg_clients.yml index 224b7b4a6d..50e3227a74 100644 --- a/.github/workflows/pg_clients.yml +++ b/.github/workflows/pg_clients.yml @@ -28,7 +28,7 @@ jobs: steps: - name: Checkout - uses: actions/checkout@v3 + uses: actions/checkout@v4 - uses: 
actions/setup-python@v4 with: @@ -38,11 +38,10 @@ jobs: uses: snok/install-poetry@v1 - name: Cache poetry deps - id: cache_poetry - uses: actions/cache@v3 + uses: actions/cache@v4 with: path: ~/.cache/pypoetry/virtualenvs - key: v1-${{ runner.os }}-python-deps-${{ hashFiles('poetry.lock') }} + key: v2-${{ runner.os }}-python-deps-ubunutu-latest-${{ hashFiles('poetry.lock') }} - name: Install Python deps shell: bash -euxo pipefail {0} @@ -83,7 +82,7 @@ jobs: # It will be fixed after switching to gen2 runner - name: Upload python test logs if: always() - uses: actions/upload-artifact@v3 + uses: actions/upload-artifact@v4 with: retention-days: 7 name: python-test-pg_clients-${{ runner.os }}-stage-logs diff --git a/.github/workflows/pin-build-tools-image.yml b/.github/workflows/pin-build-tools-image.yml new file mode 100644 index 0000000000..c941692066 --- /dev/null +++ b/.github/workflows/pin-build-tools-image.yml @@ -0,0 +1,72 @@ +name: 'Pin build-tools image' + +on: + workflow_dispatch: + inputs: + from-tag: + description: 'Source tag' + required: true + type: string + workflow_call: + inputs: + from-tag: + description: 'Source tag' + required: true + type: string + +defaults: + run: + shell: bash -euo pipefail {0} + +concurrency: + group: pin-build-tools-image-${{ inputs.from-tag }} + +permissions: {} + +jobs: + tag-image: + runs-on: ubuntu-latest + + env: + FROM_TAG: ${{ inputs.from-tag }} + TO_TAG: pinned + + steps: + - name: Check if we really need to pin the image + id: check-manifests + run: | + docker manifest inspect neondatabase/build-tools:${FROM_TAG} > ${FROM_TAG}.json + docker manifest inspect neondatabase/build-tools:${TO_TAG} > ${TO_TAG}.json + + if diff ${FROM_TAG}.json ${TO_TAG}.json; then + skip=true + else + skip=false + fi + + echo "skip=${skip}" | tee -a $GITHUB_OUTPUT + + - uses: docker/login-action@v3 + if: steps.check-manifests.outputs.skip == 'false' + with: + username: ${{ secrets.NEON_DOCKERHUB_USERNAME }} + password: ${{ 
secrets.NEON_DOCKERHUB_PASSWORD }} + + - name: Tag build-tools with `${{ env.TO_TAG }}` in Docker Hub + if: steps.check-manifests.outputs.skip == 'false' + run: | + docker buildx imagetools create -t neondatabase/build-tools:${TO_TAG} \ + neondatabase/build-tools:${FROM_TAG} + + - uses: docker/login-action@v3 + if: steps.check-manifests.outputs.skip == 'false' + with: + registry: 369495373322.dkr.ecr.eu-central-1.amazonaws.com + username: ${{ secrets.AWS_ACCESS_KEY_DEV }} + password: ${{ secrets.AWS_SECRET_KEY_DEV }} + + - name: Tag build-tools with `${{ env.TO_TAG }}` in ECR + if: steps.check-manifests.outputs.skip == 'false' + run: | + docker buildx imagetools create -t 369495373322.dkr.ecr.eu-central-1.amazonaws.com/build-tools:${TO_TAG} \ + neondatabase/build-tools:${FROM_TAG} diff --git a/.github/workflows/release.yml b/.github/workflows/release.yml index ba37c5827a..b2c9a19588 100644 --- a/.github/workflows/release.yml +++ b/.github/workflows/release.yml @@ -2,12 +2,31 @@ name: Create Release Branch on: schedule: - - cron: '0 6 * * 1' + # It should be kept in sync with if-condition in jobs + - cron: '0 6 * * MON' # Storage release + - cron: '0 6 * * THU' # Proxy release workflow_dispatch: + inputs: + create-storage-release-branch: + type: boolean + description: 'Create Storage release PR' + required: false + create-proxy-release-branch: + type: boolean + description: 'Create Proxy release PR' + required: false + +# No permission for GITHUB_TOKEN by default; the **minimal required** set of permissions should be granted in each job. 
+permissions: {} + +defaults: + run: + shell: bash -euo pipefail {0} jobs: - create_release_branch: - runs-on: [ ubuntu-latest ] + create-storage-release-branch: + if: ${{ github.event.schedule == '0 6 * * MON' || format('{0}', inputs.create-storage-release-branch) == 'true' }} + runs-on: ubuntu-latest permissions: contents: write # for `git push` @@ -18,27 +37,67 @@ jobs: with: ref: main - - name: Get current date - id: date - run: echo "date=$(date +'%Y-%m-%d')" >> $GITHUB_OUTPUT + - name: Set environment variables + run: | + echo "RELEASE_DATE=$(date +'%Y-%m-%d')" | tee -a $GITHUB_ENV + echo "RELEASE_BRANCH=rc/$(date +'%Y-%m-%d')" | tee -a $GITHUB_ENV - name: Create release branch - run: git checkout -b releases/${{ steps.date.outputs.date }} + run: git checkout -b $RELEASE_BRANCH - name: Push new branch - run: git push origin releases/${{ steps.date.outputs.date }} + run: git push origin $RELEASE_BRANCH - name: Create pull request into release env: GH_TOKEN: ${{ secrets.CI_ACCESS_TOKEN }} run: | cat << EOF > body.md - ## Release ${{ steps.date.outputs.date }} + ## Release ${RELEASE_DATE} - **Please merge this PR using 'Create a merge commit'!** + **Please merge this Pull Request using 'Create a merge commit' button** EOF - gh pr create --title "Release ${{ steps.date.outputs.date }}" \ + gh pr create --title "Release ${RELEASE_DATE}" \ --body-file "body.md" \ - --head "releases/${{ steps.date.outputs.date }}" \ + --head "${RELEASE_BRANCH}" \ --base "release" + + create-proxy-release-branch: + if: ${{ github.event.schedule == '0 6 * * THU' || format('{0}', inputs.create-proxy-release-branch) == 'true' }} + runs-on: ubuntu-latest + + permissions: + contents: write # for `git push` + + steps: + - name: Check out code + uses: actions/checkout@v4 + with: + ref: main + + - name: Set environment variables + run: | + echo "RELEASE_DATE=$(date +'%Y-%m-%d')" | tee -a $GITHUB_ENV + echo "RELEASE_BRANCH=rc/proxy/$(date +'%Y-%m-%d')" | tee -a $GITHUB_ENV + + - name: Create 
release branch + run: git checkout -b $RELEASE_BRANCH + + - name: Push new branch + run: git push origin $RELEASE_BRANCH + + - name: Create pull request into release + env: + GH_TOKEN: ${{ secrets.CI_ACCESS_TOKEN }} + run: | + cat << EOF > body.md + ## Proxy release ${RELEASE_DATE} + + **Please merge this Pull Request using 'Create a merge commit' button** + EOF + + gh pr create --title "Proxy release ${RELEASE_DATE}" \ + --body-file "body.md" \ + --head "${RELEASE_BRANCH}" \ + --base "release-proxy" diff --git a/.github/workflows/trigger-e2e-tests.yml b/.github/workflows/trigger-e2e-tests.yml new file mode 100644 index 0000000000..ae34cbffe0 --- /dev/null +++ b/.github/workflows/trigger-e2e-tests.yml @@ -0,0 +1,119 @@ +name: Trigger E2E Tests + +on: + pull_request: + types: + - ready_for_review + workflow_call: + +defaults: + run: + shell: bash -euxo pipefail {0} + +env: + # A concurrency group that we use for e2e-tests runs, matches `concurrency.group` above with `github.repository` as a prefix + E2E_CONCURRENCY_GROUP: ${{ github.repository }}-e2e-tests-${{ github.ref_name }}-${{ github.ref_name == 'main' && github.sha || 'anysha' }} + AWS_ACCESS_KEY_ID: ${{ secrets.AWS_ACCESS_KEY_DEV }} + AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_SECRET_KEY_DEV }} + +jobs: + cancel-previous-e2e-tests: + if: github.event_name == 'pull_request' + runs-on: ubuntu-latest + + steps: + - name: Cancel previous e2e-tests runs for this PR + env: + GH_TOKEN: ${{ secrets.CI_ACCESS_TOKEN }} + run: | + gh workflow --repo neondatabase/cloud \ + run cancel-previous-in-concurrency-group.yml \ + --field concurrency_group="${{ env.E2E_CONCURRENCY_GROUP }}" + + tag: + runs-on: [ ubuntu-latest ] + outputs: + build-tag: ${{ steps.build-tag.outputs.tag }} + + steps: + - name: Checkout + uses: actions/checkout@v4 + with: + fetch-depth: 0 + + - name: Get build tag + env: + GH_TOKEN: ${{ secrets.CI_ACCESS_TOKEN }} + CURRENT_BRANCH: ${{ github.head_ref || github.ref_name }} + CURRENT_SHA: ${{ 
github.event.pull_request.head.sha || github.sha }} + run: | + if [[ "$GITHUB_REF_NAME" == "main" ]]; then + echo "tag=$(git rev-list --count HEAD)" | tee -a $GITHUB_OUTPUT + elif [[ "$GITHUB_REF_NAME" == "release" ]]; then + echo "tag=release-$(git rev-list --count HEAD)" | tee -a $GITHUB_OUTPUT + elif [[ "$GITHUB_REF_NAME" == "release-proxy" ]]; then + echo "tag=release-proxy-$(git rev-list --count HEAD)" >> $GITHUB_OUTPUT + else + echo "GITHUB_REF_NAME (value '$GITHUB_REF_NAME') is not set to either 'main' or 'release'" + BUILD_AND_TEST_RUN_ID=$(gh run list -b $CURRENT_BRANCH -c $CURRENT_SHA -w 'Build and Test' -L 1 --json databaseId --jq '.[].databaseId') + echo "tag=$BUILD_AND_TEST_RUN_ID" | tee -a $GITHUB_OUTPUT + fi + id: build-tag + + trigger-e2e-tests: + needs: [ tag ] + runs-on: [ self-hosted, gen3, small ] + env: + TAG: ${{ needs.tag.outputs.build-tag }} + container: + image: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/base:pinned + options: --init + steps: + - name: check if ecr image are present + run: | + for REPO in neon compute-tools compute-node-v14 vm-compute-node-v14 compute-node-v15 vm-compute-node-v15 compute-node-v16 vm-compute-node-v16; do + OUTPUT=$(aws ecr describe-images --repository-name ${REPO} --region eu-central-1 --query "imageDetails[?imageTags[?contains(@, '${TAG}')]]" --output text) + if [ "$OUTPUT" == "" ]; then + echo "$REPO with image tag $TAG not found" >> $GITHUB_OUTPUT + exit 1 + fi + done + + - name: Set PR's status to pending and request a remote CI test + run: | + # For pull requests, GH Actions set "github.sha" variable to point at a fake merge commit + # but we need to use a real sha of a latest commit in the PR's branch for the e2e job, + # to place a job run status update later. 
+ COMMIT_SHA=${{ github.event.pull_request.head.sha }} + # For non-PR kinds of runs, the above will produce an empty variable, pick the original sha value for those + COMMIT_SHA=${COMMIT_SHA:-${{ github.sha }}} + + REMOTE_REPO="${{ github.repository_owner }}/cloud" + + curl -f -X POST \ + https://api.github.com/repos/${{ github.repository }}/statuses/$COMMIT_SHA \ + -H "Accept: application/vnd.github.v3+json" \ + --user "${{ secrets.CI_ACCESS_TOKEN }}" \ + --data \ + "{ + \"state\": \"pending\", + \"context\": \"neon-cloud-e2e\", + \"description\": \"[$REMOTE_REPO] Remote CI job is about to start\" + }" + + curl -f -X POST \ + https://api.github.com/repos/$REMOTE_REPO/actions/workflows/testing.yml/dispatches \ + -H "Accept: application/vnd.github.v3+json" \ + --user "${{ secrets.CI_ACCESS_TOKEN }}" \ + --data \ + "{ + \"ref\": \"main\", + \"inputs\": { + \"ci_job_name\": \"neon-cloud-e2e\", + \"commit_hash\": \"$COMMIT_SHA\", + \"remote_repo\": \"${{ github.repository }}\", + \"storage_image_tag\": \"${TAG}\", + \"compute_image_tag\": \"${TAG}\", + \"concurrency_group\": \"${{ env.E2E_CONCURRENCY_GROUP }}\" + } + }" diff --git a/.github/workflows/update_build_tools_image.yml b/.github/workflows/update_build_tools_image.yml deleted file mode 100644 index 88bab797b7..0000000000 --- a/.github/workflows/update_build_tools_image.yml +++ /dev/null @@ -1,130 +0,0 @@ -name: 'Update build tools image tag' - -# This workflow it used to update tag of build tools in ECR. -# The most common use case is adding/moving `pinned` tag to `${GITHUB_RUN_IT}` image. 
- -on: - workflow_dispatch: - inputs: - from-tag: - description: 'Source tag' - required: true - type: string - to-tag: - description: 'Destination tag' - required: true - type: string - default: 'pinned' - -defaults: - run: - shell: bash -euo pipefail {0} - -env: - AWS_ACCESS_KEY_ID: ${{ secrets.AWS_ACCESS_KEY_DEV }} - AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_SECRET_KEY_DEV }} - -permissions: {} - -jobs: - tag-image: - runs-on: [ self-hosted, gen3, small ] - container: golang:1.19-bullseye - - env: - IMAGE: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/build-tools - FROM_TAG: ${{ inputs.from-tag }} - TO_TAG: ${{ inputs.to-tag }} - outputs: - next-digest-buildtools: ${{ steps.next-digest.outputs.next-digest-buildtools }} - prev-digest-buildtools: ${{ steps.prev-digest.outputs.prev-digest-buildtools }} - - steps: - - name: Install Crane & ECR helper - run: | - go install github.com/google/go-containerregistry/cmd/crane@a54d64203cffcbf94146e04069aae4a97f228ee2 # v0.16.1 - go install github.com/awslabs/amazon-ecr-credential-helper/ecr-login/cli/docker-credential-ecr-login@adf1bafd791ae7d4ff098108b1e91f36a4da5404 # v0.7.1 - - - name: Configure ECR login - run: | - mkdir /github/home/.docker/ - echo "{\"credsStore\":\"ecr-login\"}" > /github/home/.docker/config.json - - - name: Get source image digest - id: next-digest - run: | - NEXT_DIGEST=$(crane digest ${IMAGE}:${FROM_TAG} || true) - if [ -z "${NEXT_DIGEST}" ]; then - echo >&2 "Image ${IMAGE}:${FROM_TAG} does not exist" - exit 1 - fi - - echo "Current ${IMAGE}@${FROM_TAG} image is ${IMAGE}@${NEXT_DIGEST}" - echo "next-digest-buildtools=$NEXT_DIGEST" >> $GITHUB_OUTPUT - - - name: Get destination image digest (if already exists) - id: prev-digest - run: | - PREV_DIGEST=$(crane digest ${IMAGE}:${TO_TAG} || true) - if [ -z "${PREV_DIGEST}" ]; then - echo >&2 "Image ${IMAGE}:${TO_TAG} does not exist (it's ok)" - else - echo >&2 "Current ${IMAGE}@${TO_TAG} image is ${IMAGE}@${PREV_DIGEST}" - - echo 
"prev-digest-buildtools=$PREV_DIGEST" >> $GITHUB_OUTPUT - fi - - - name: Tag image - run: | - crane tag "${IMAGE}:${FROM_TAG}" "${TO_TAG}" - - rollback-tag-image: - needs: tag-image - if: ${{ !success() }} - - runs-on: [ self-hosted, gen3, small ] - container: golang:1.19-bullseye - - env: - IMAGE: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/build-tools - FROM_TAG: ${{ inputs.from-tag }} - TO_TAG: ${{ inputs.to-tag }} - - steps: - - name: Install Crane & ECR helper - run: | - go install github.com/google/go-containerregistry/cmd/crane@a54d64203cffcbf94146e04069aae4a97f228ee2 # v0.16.1 - go install github.com/awslabs/amazon-ecr-credential-helper/ecr-login/cli/docker-credential-ecr-login@adf1bafd791ae7d4ff098108b1e91f36a4da5404 # v0.7.1 - - - name: Configure ECR login - run: | - mkdir /github/home/.docker/ - echo "{\"credsStore\":\"ecr-login\"}" > /github/home/.docker/config.json - - - name: Restore previous tag if needed - run: | - NEXT_DIGEST="${{ needs.tag-image.outputs.next-digest-buildtools }}" - PREV_DIGEST="${{ needs.tag-image.outputs.prev-digest-buildtools }}" - - if [ -z "${NEXT_DIGEST}" ]; then - echo >&2 "Image ${IMAGE}:${FROM_TAG} does not exist, nothing to rollback" - exit 0 - fi - - if [ -z "${PREV_DIGEST}" ]; then - # I guess we should delete the tag here/untag the image, but crane does not support it - # - https://github.com/google/go-containerregistry/issues/999 - - echo >&2 "Image ${IMAGE}:${TO_TAG} did not exist, but it was created by the job, no need to rollback" - - exit 0 - fi - - CURRENT_DIGEST=$(crane digest "${IMAGE}:${TO_TAG}") - if [ "${CURRENT_DIGEST}" == "${NEXT_DIGEST}" ]; then - crane tag "${IMAGE}@${PREV_DIGEST}" "${TO_TAG}" - - echo >&2 "Successfully restored ${TO_TAG} tag from ${IMAGE}@${CURRENT_DIGEST} to ${IMAGE}@${PREV_DIGEST}" - else - echo >&2 "Image ${IMAGE}:${TO_TAG}@${CURRENT_DIGEST} is not required to be restored" - fi diff --git a/.gitignore b/.gitignore index 3f4495c9e7..2c38cdcc59 100644 --- a/.gitignore +++ 
b/.gitignore @@ -9,6 +9,7 @@ test_output/ neon.iml /.neon /integration_tests/.neon +compaction-suite-results.* # Coverage *.profraw diff --git a/CODEOWNERS b/CODEOWNERS index e384dc39f1..5b601f0566 100644 --- a/CODEOWNERS +++ b/CODEOWNERS @@ -1,10 +1,10 @@ /compute_tools/ @neondatabase/control-plane @neondatabase/compute -/control_plane/ @neondatabase/compute @neondatabase/storage -/libs/pageserver_api/ @neondatabase/compute @neondatabase/storage +/control_plane/attachment_service @neondatabase/storage +/libs/pageserver_api/ @neondatabase/storage /libs/postgres_ffi/ @neondatabase/compute /libs/remote_storage/ @neondatabase/storage /libs/safekeeper_api/ @neondatabase/safekeepers -/libs/vm_monitor/ @neondatabase/autoscaling @neondatabase/compute +/libs/vm_monitor/ @neondatabase/autoscaling /pageserver/ @neondatabase/storage /pgxn/ @neondatabase/compute /proxy/ @neondatabase/proxy diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md index b318c295a3..164eb77f58 100644 --- a/CONTRIBUTING.md +++ b/CONTRIBUTING.md @@ -20,7 +20,7 @@ ln -s ../../pre-commit.py .git/hooks/pre-commit This will run following checks on staged files before each commit: - `rustfmt` -- checks for python files, see [obligatory checks](/docs/sourcetree.md#obligatory-checks). +- checks for Python files, see [obligatory checks](/docs/sourcetree.md#obligatory-checks). There is also a separate script `./run_clippy.sh` that runs `cargo clippy` on the whole project and `./scripts/reformat` that runs all formatting tools to ensure the project is up to date. @@ -54,6 +54,9 @@ _An instruction for maintainers_ - If and only if it looks **safe** (i.e. it doesn't contain any malicious code which could expose secrets or harm the CI), then: - Press the "Approve and run" button in GitHub UI - Add the `approved-for-ci-run` label to the PR + - Currently draft PR will skip e2e test (only for internal contributors). 
After turning the PR 'Ready to Review' CI will trigger e2e test + - Add `run-e2e-tests-in-draft` label to run e2e test in draft PR (override above behaviour) + - The `approved-for-ci-run` workflow will add `run-e2e-tests-in-draft` automatically to run e2e test for external contributors Repeat all steps after any change to the PR. - When the changes are ready to get merged — merge the original PR (not the internal one) @@ -71,16 +74,11 @@ We're using the following approach to make it work: For details see [`approved-for-ci-run.yml`](.github/workflows/approved-for-ci-run.yml) -## How do I add the "pinned" tag to an buildtools image? -We use the `pinned` tag for `Dockerfile.buildtools` build images in our CI/CD setup, currently adding the `pinned` tag is a manual operation. +## How do I make build-tools image "pinned" -You can call it from GitHub UI: https://github.com/neondatabase/neon/actions/workflows/update_build_tools_image.yml, -or using GitHub CLI: +It's possible to update the `pinned` tag of the `build-tools` image using the `pin-build-tools-image.yml` workflow. ```bash -gh workflow -R neondatabase/neon run update_build_tools_image.yml \ - -f from-tag=6254913013 \ - -f to-tag=pinned \ - -# Default `-f to-tag` is `pinned`, so the parameter can be omitted. 
-``` \ No newline at end of file +gh workflow -R neondatabase/neon run pin-build-tools-image.yml \ + -f from-tag=cc98d9b00d670f182c507ae3783342bd7e64c31e +``` diff --git a/Cargo.lock b/Cargo.lock index a9b9f9c7bf..e35fa564b9 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -25,9 +25,9 @@ checksum = "f26201604c87b1e01bd3d98f8d5d9a8fcbb815e8cedb41ffccbeb4bf593a35fe" [[package]] name = "ahash" -version = "0.8.5" +version = "0.8.9" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "cd7d5a2cecb58716e47d67d5703a249964b14c7be1ec3cad3affc295b2d1c35d" +checksum = "d713b3834d76b85304d4d525563c1276e2e30dc97cc67bfb4585a4a29fc2c89f" dependencies = [ "cfg-if", "const-random", @@ -241,7 +241,7 @@ checksum = "16e62a023e7c117e27523144c5d2459f4397fcc3cab0085af8e2224f643a0193" dependencies = [ "proc-macro2", "quote", - "syn 2.0.32", + "syn 2.0.52", ] [[package]] @@ -252,7 +252,7 @@ checksum = "b9ccdd8f2a161be9bd5c023df56f1b2a0bd1d83872ae53b71a84a12c9bf6e842" dependencies = [ "proc-macro2", "quote", - "syn 2.0.32", + "syn 2.0.52", ] [[package]] @@ -275,18 +275,24 @@ name = "attachment_service" version = "0.1.0" dependencies = [ "anyhow", + "aws-config", + "aws-sdk-secretsmanager", "camino", "clap", "control_plane", + "diesel", + "diesel_migrations", "futures", "git-version", + "humantime", "hyper", "metrics", + "once_cell", "pageserver_api", "pageserver_client", - "postgres_backend", "postgres_connection", - "scopeguard", + "r2d2", + "reqwest", "serde", "serde_json", "thiserror", @@ -305,12 +311,11 @@ checksum = "d468802bab17cbc0cc575e9b053f41e72aa36bfa6b7f55e3529ffa43161b97fa" [[package]] name = "aws-config" -version = "1.0.1" +version = "1.1.4" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "80c950a809d39bc9480207cb1cfc879ace88ea7e3a4392a8e9999e45d6e5692e" +checksum = "8b30c39ebe61f75d1b3785362b1586b41991873c9ab3e317a9181c246fb71d82" dependencies = [ "aws-credential-types", - "aws-http", "aws-runtime", "aws-sdk-sso", 
"aws-sdk-ssooidc", @@ -325,7 +330,7 @@ dependencies = [ "bytes", "fastrand 2.0.0", "hex", - "http", + "http 0.2.9", "hyper", "ring 0.17.6", "time", @@ -336,9 +341,9 @@ dependencies = [ [[package]] name = "aws-credential-types" -version = "1.0.1" +version = "1.1.4" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "8c1317e1a3514b103cf7d5828bbab3b4d30f56bd22d684f8568bc51b6cfbbb1c" +checksum = "33cc49dcdd31c8b6e79850a179af4c367669150c7ac0135f176c61bec81a70f7" dependencies = [ "aws-smithy-async", "aws-smithy-runtime-api", @@ -346,30 +351,13 @@ dependencies = [ "zeroize", ] -[[package]] -name = "aws-http" -version = "0.60.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "361c4310fdce94328cc2d1ca0c8a48c13f43009c61d3367585685a50ca8c66b6" -dependencies = [ - "aws-smithy-runtime-api", - "aws-smithy-types", - "aws-types", - "bytes", - "http", - "http-body", - "pin-project-lite", - "tracing", -] - [[package]] name = "aws-runtime" -version = "1.0.1" +version = "1.1.4" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "1ed7ef604a15fd0d4d9e43701295161ea6b504b63c44990ead352afea2bc15e9" +checksum = "eb031bff99877c26c28895766f7bb8484a05e24547e370768d6cc9db514662aa" dependencies = [ "aws-credential-types", - "aws-http", "aws-sigv4", "aws-smithy-async", "aws-smithy-eventstream", @@ -377,21 +365,23 @@ dependencies = [ "aws-smithy-runtime-api", "aws-smithy-types", "aws-types", + "bytes", "fastrand 2.0.0", - "http", + "http 0.2.9", + "http-body", "percent-encoding", + "pin-project-lite", "tracing", "uuid", ] [[package]] name = "aws-sdk-s3" -version = "1.4.0" +version = "1.14.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "9dcafc2fe52cc30b2d56685e2fa6a879ba50d79704594852112337a472ddbd24" +checksum = "951f7730f51a2155c711c85c79f337fbc02a577fa99d2a0a8059acfce5392113" dependencies = [ "aws-credential-types", - "aws-http", "aws-runtime", "aws-sigv4", "aws-smithy-async", @@ 
-405,23 +395,22 @@ dependencies = [ "aws-smithy-xml", "aws-types", "bytes", - "http", + "http 0.2.9", "http-body", "once_cell", "percent-encoding", - "regex", + "regex-lite", "tracing", "url", ] [[package]] -name = "aws-sdk-sso" -version = "1.3.0" +name = "aws-sdk-secretsmanager" +version = "1.14.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "0619ab97a5ca8982e7de073cdc66f93e5f6a1b05afc09e696bec1cb3607cd4df" +checksum = "0a0b64e61e7d632d9df90a2e0f32630c68c24960cab1d27d848718180af883d3" dependencies = [ "aws-credential-types", - "aws-http", "aws-runtime", "aws-smithy-async", "aws-smithy-http", @@ -431,19 +420,42 @@ dependencies = [ "aws-smithy-types", "aws-types", "bytes", - "http", - "regex", + "fastrand 2.0.0", + "http 0.2.9", + "once_cell", + "regex-lite", + "tracing", +] + +[[package]] +name = "aws-sdk-sso" +version = "1.12.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f486420a66caad72635bc2ce0ff6581646e0d32df02aa39dc983bfe794955a5b" +dependencies = [ + "aws-credential-types", + "aws-runtime", + "aws-smithy-async", + "aws-smithy-http", + "aws-smithy-json", + "aws-smithy-runtime", + "aws-smithy-runtime-api", + "aws-smithy-types", + "aws-types", + "bytes", + "http 0.2.9", + "once_cell", + "regex-lite", "tracing", ] [[package]] name = "aws-sdk-ssooidc" -version = "1.3.0" +version = "1.12.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "f04b9f5474cc0f35d829510b2ec8c21e352309b46bf9633c5a81fb9321e9b1c7" +checksum = "39ddccf01d82fce9b4a15c8ae8608211ee7db8ed13a70b514bbfe41df3d24841" dependencies = [ "aws-credential-types", - "aws-http", "aws-runtime", "aws-smithy-async", "aws-smithy-http", @@ -453,19 +465,19 @@ dependencies = [ "aws-smithy-types", "aws-types", "bytes", - "http", - "regex", + "http 0.2.9", + "once_cell", + "regex-lite", "tracing", ] [[package]] name = "aws-sdk-sts" -version = "1.3.0" +version = "1.12.0" source = 
"registry+https://github.com/rust-lang/crates.io-index" -checksum = "5700da387716ccfc30b27f44b008f457e1baca5b0f05b6b95455778005e3432a" +checksum = "1a591f8c7e6a621a501b2b5d2e88e1697fcb6274264523a6ad4d5959889a41ce" dependencies = [ "aws-credential-types", - "aws-http", "aws-runtime", "aws-smithy-async", "aws-smithy-http", @@ -476,16 +488,17 @@ dependencies = [ "aws-smithy-types", "aws-smithy-xml", "aws-types", - "http", - "regex", + "http 0.2.9", + "once_cell", + "regex-lite", "tracing", ] [[package]] name = "aws-sigv4" -version = "1.0.1" +version = "1.1.4" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "380adcc8134ad8bbdfeb2ace7626a869914ee266322965276cbc54066186d236" +checksum = "c371c6b0ac54d4605eb6f016624fb5c7c2925d315fdf600ac1bf21b19d5f1742" dependencies = [ "aws-credential-types", "aws-smithy-eventstream", @@ -497,11 +510,11 @@ dependencies = [ "form_urlencoded", "hex", "hmac", - "http", + "http 0.2.9", + "http 1.0.0", "once_cell", "p256", "percent-encoding", - "regex", "ring 0.17.6", "sha2", "subtle", @@ -512,9 +525,9 @@ dependencies = [ [[package]] name = "aws-smithy-async" -version = "1.0.2" +version = "1.1.4" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "3e37ca17d25fe1e210b6d4bdf59b81caebfe99f986201a1228cb5061233b4b13" +checksum = "72ee2d09cce0ef3ae526679b522835d63e75fb427aca5413cd371e490d52dcc6" dependencies = [ "futures-util", "pin-project-lite", @@ -523,9 +536,9 @@ dependencies = [ [[package]] name = "aws-smithy-checksums" -version = "0.60.0" +version = "0.60.4" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "c5a373ec01aede3dd066ec018c1bc4e8f5dd11b2c11c59c8eef1a5c68101f397" +checksum = "be2acd1b9c6ae5859999250ed5a62423aedc5cf69045b844432de15fa2f31f2b" dependencies = [ "aws-smithy-http", "aws-smithy-types", @@ -533,7 +546,7 @@ dependencies = [ "crc32c", "crc32fast", "hex", - "http", + "http 0.2.9", "http-body", "md-5", "pin-project-lite", @@ -544,9 +557,9 @@ 
dependencies = [ [[package]] name = "aws-smithy-eventstream" -version = "0.60.0" +version = "0.60.4" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "1c669e1e5fc0d79561bf7a122b118bd50c898758354fe2c53eb8f2d31507cbc3" +checksum = "e6363078f927f612b970edf9d1903ef5cef9a64d1e8423525ebb1f0a1633c858" dependencies = [ "aws-smithy-types", "bytes", @@ -555,9 +568,9 @@ dependencies = [ [[package]] name = "aws-smithy-http" -version = "0.60.0" +version = "0.60.4" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "5b1de8aee22f67de467b2e3d0dd0fb30859dc53f579a63bd5381766b987db644" +checksum = "dab56aea3cd9e1101a0a999447fb346afb680ab1406cebc44b32346e25b4117d" dependencies = [ "aws-smithy-eventstream", "aws-smithy-runtime-api", @@ -565,7 +578,7 @@ dependencies = [ "bytes", "bytes-utils", "futures-core", - "http", + "http 0.2.9", "http-body", "once_cell", "percent-encoding", @@ -576,18 +589,18 @@ dependencies = [ [[package]] name = "aws-smithy-json" -version = "0.60.0" +version = "0.60.4" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "6a46dd338dc9576d6a6a5b5a19bd678dcad018ececee11cf28ecd7588bd1a55c" +checksum = "fd3898ca6518f9215f62678870064398f00031912390efd03f1f6ef56d83aa8e" dependencies = [ "aws-smithy-types", ] [[package]] name = "aws-smithy-query" -version = "0.60.0" +version = "0.60.4" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "feb5b8c7a86d4b6399169670723b7e6f21a39fc833a30f5c5a2f997608178129" +checksum = "bda4b1dfc9810e35fba8a620e900522cd1bd4f9578c446e82f49d1ce41d2e9f9" dependencies = [ "aws-smithy-types", "urlencoding", @@ -595,9 +608,9 @@ dependencies = [ [[package]] name = "aws-smithy-runtime" -version = "1.0.2" +version = "1.1.4" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "273479291efc55e7b0bce985b139d86b6031adb8e50f65c1f712f20ba38f6388" +checksum = "fafdab38f40ad7816e7da5dec279400dd505160780083759f01441af1bbb10ea" 
dependencies = [ "aws-smithy-async", "aws-smithy-http", @@ -606,28 +619,28 @@ dependencies = [ "bytes", "fastrand 2.0.0", "h2", - "http", + "http 0.2.9", "http-body", "hyper", "hyper-rustls", "once_cell", "pin-project-lite", "pin-utils", - "rustls", + "rustls 0.21.9", "tokio", "tracing", ] [[package]] name = "aws-smithy-runtime-api" -version = "1.0.2" +version = "1.1.4" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "c6cebff0d977b6b6feed2fd07db52aac58ba3ccaf26cdd49f1af4add5061bef9" +checksum = "c18276dd28852f34b3bf501f4f3719781f4999a51c7bff1a5c6dc8c4529adc29" dependencies = [ "aws-smithy-async", "aws-smithy-types", "bytes", - "http", + "http 0.2.9", "pin-project-lite", "tokio", "tracing", @@ -636,15 +649,15 @@ dependencies = [ [[package]] name = "aws-smithy-types" -version = "1.0.2" +version = "1.1.4" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "d7f48b3f27ddb40ab19892a5abda331f403e3cb877965e4e51171447807104af" +checksum = "bb3e134004170d3303718baa2a4eb4ca64ee0a1c0a7041dca31b38be0fb414f3" dependencies = [ "base64-simd", "bytes", "bytes-utils", "futures-core", - "http", + "http 0.2.9", "http-body", "itoa", "num-integer", @@ -659,24 +672,24 @@ dependencies = [ [[package]] name = "aws-smithy-xml" -version = "0.60.0" +version = "0.60.4" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "0ec40d74a67fd395bc3f6b4ccbdf1543672622d905ef3f979689aea5b730cb95" +checksum = "8604a11b25e9ecaf32f9aa56b9fe253c5e2f606a3477f0071e96d3155a5ed218" dependencies = [ "xmlparser", ] [[package]] name = "aws-types" -version = "1.0.1" +version = "1.1.4" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "8403fc56b1f3761e8efe45771ddc1165e47ec3417c68e68a4519b5cb030159ca" +checksum = "789bbe008e65636fe1b6dbbb374c40c8960d1232b96af5ff4aec349f9c4accf4" dependencies = [ "aws-credential-types", "aws-smithy-async", "aws-smithy-runtime-api", "aws-smithy-types", - "http", + "http 0.2.9", 
"rustc_version", "tracing", ] @@ -693,7 +706,7 @@ dependencies = [ "bitflags 1.3.2", "bytes", "futures-util", - "http", + "http 0.2.9", "http-body", "hyper", "itoa", @@ -725,7 +738,7 @@ dependencies = [ "async-trait", "bytes", "futures-util", - "http", + "http 0.2.9", "http-body", "mime", "rustversion", @@ -894,6 +907,16 @@ version = "1.6.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "8c3c1a368f70d6cf7302d78f8f7093da241fb8e8807c05cc9e51a125895a6d5b" +[[package]] +name = "bcder" +version = "0.7.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c627747a6774aab38beb35990d88309481378558875a41da1a4b2e373c906ef0" +dependencies = [ + "bytes", + "smallvec", +] + [[package]] name = "bincode" version = "1.3.3" @@ -922,7 +945,7 @@ dependencies = [ "regex", "rustc-hash", "shlex", - "syn 2.0.32", + "syn 2.0.52", "which", ] @@ -973,9 +996,9 @@ checksum = "14c189c53d098945499cdfa7ecc63567cf3886b3332b312a5b4585d8d3a6a610" [[package]] name = "bytes" -version = "1.4.0" +version = "1.5.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "89b2fd2a0dcf38d7971e2194b6b6eebab45ae01067456a7fd93d5547a61b70be" +checksum = "a2bd12c1caf447e69cd4528f47f94d203fd2582878ecb9e9465484c4148a8223" dependencies = [ "serde", ] @@ -1136,7 +1159,7 @@ dependencies = [ "heck", "proc-macro2", "quote", - "syn 2.0.32", + "syn 2.0.52", ] [[package]] @@ -1145,16 +1168,6 @@ version = "0.5.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "2da6da31387c7e4ef160ffab6d5e7f00c42626fe39aea70a7b0f1773f7dd6c1b" -[[package]] -name = "close_fds" -version = "0.3.2" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "3bc416f33de9d59e79e57560f450d21ff8393adcf1cdfc3e6d8fb93d5f88a2ed" -dependencies = [ - "cfg-if", - "libc", -] - [[package]] name = "colorchoice" version = "1.0.0" @@ -1386,9 +1399,9 @@ dependencies = [ [[package]] name = "crc32c" -version = "0.6.3" +version = "0.6.5" 
source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "3dfea2db42e9927a3845fb268a10a72faed6d416065f77873f05e411457c363e" +checksum = "89254598aa9b9fa608de44b3ae54c810f0f06d755e24c50177f1f8f31ff50ce2" dependencies = [ "rustc_version", ] @@ -1571,7 +1584,7 @@ dependencies = [ "proc-macro2", "quote", "strsim", - "syn 2.0.32", + "syn 2.0.52", ] [[package]] @@ -1582,7 +1595,7 @@ checksum = "29a358ff9f12ec09c3e61fef9b5a9902623a695a46a917b07f269bff1445611a" dependencies = [ "darling_core", "quote", - "syn 2.0.32", + "syn 2.0.52", ] [[package]] @@ -1624,6 +1637,16 @@ dependencies = [ "zeroize", ] +[[package]] +name = "der" +version = "0.7.8" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "fffa369a668c8af7dbf8b5e56c9f744fbd399949ed171606040001947de40b1c" +dependencies = [ + "const-oid", + "zeroize", +] + [[package]] name = "der-parser" version = "8.2.0" @@ -1638,6 +1661,69 @@ dependencies = [ "rusticata-macros", ] +[[package]] +name = "desim" +version = "0.1.0" +dependencies = [ + "anyhow", + "bytes", + "hex", + "parking_lot 0.12.1", + "rand 0.8.5", + "scopeguard", + "smallvec", + "tracing", + "utils", + "workspace_hack", +] + +[[package]] +name = "diesel" +version = "2.1.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "62c6fcf842f17f8c78ecf7c81d75c5ce84436b41ee07e03f490fbb5f5a8731d8" +dependencies = [ + "bitflags 2.4.1", + "byteorder", + "diesel_derives", + "itoa", + "pq-sys", + "r2d2", + "serde_json", +] + +[[package]] +name = "diesel_derives" +version = "2.1.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ef8337737574f55a468005a83499da720f20c65586241ffea339db9ecdfd2b44" +dependencies = [ + "diesel_table_macro_syntax", + "proc-macro2", + "quote", + "syn 2.0.52", +] + +[[package]] +name = "diesel_migrations" +version = "2.1.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = 
"6036b3f0120c5961381b570ee20a02432d7e2d27ea60de9578799cf9156914ac" +dependencies = [ + "diesel", + "migrations_internals", + "migrations_macros", +] + +[[package]] +name = "diesel_table_macro_syntax" +version = "0.1.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "fc5557efc453706fed5e4fa85006fe9817c224c3f480a34c7e5959fd700921c5" +dependencies = [ + "syn 2.0.52", +] + [[package]] name = "digest" version = "0.10.7" @@ -1657,7 +1743,7 @@ checksum = "487585f4d0c6655fe74905e2504d8ad6908e4db67f744eb140876906c2f3175d" dependencies = [ "proc-macro2", "quote", - "syn 2.0.32", + "syn 2.0.52", ] [[package]] @@ -1681,10 +1767,10 @@ version = "0.14.8" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "413301934810f597c1d19ca71c8710e99a3f1ba28a0d2ebc01551a2daeea3c5c" dependencies = [ - "der", + "der 0.6.1", "elliptic-curve", "rfc6979", - "signature", + "signature 1.6.4", ] [[package]] @@ -1701,7 +1787,7 @@ checksum = "e7bb888ab5300a19b8e5bceef25ac745ad065f3c9f7efc6de1b91958110891d3" dependencies = [ "base16ct", "crypto-bigint 0.4.9", - "der", + "der 0.6.1", "digest", "ff", "generic-array", @@ -1749,6 +1835,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "e875f1719c16de097dee81ed675e2d9bb63096823ed3f0ca827b7dea3028bbbb" dependencies = [ "enumset_derive", + "serde", ] [[package]] @@ -1760,7 +1847,7 @@ dependencies = [ "darling", "proc-macro2", "quote", - "syn 2.0.32", + "syn 2.0.52", ] [[package]] @@ -1966,9 +2053,9 @@ dependencies = [ [[package]] name = "futures-channel" -version = "0.3.28" +version = "0.3.30" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "955518d47e09b25bbebc7a18df10b81f0c766eaf4c4f1cccef2fca5f2a4fb5f2" +checksum = "eac8f7d7865dcb88bd4373ab671c8cf4508703796caa2b1985a9ca867b3fcb78" dependencies = [ "futures-core", "futures-sink", @@ -1976,9 +2063,9 @@ dependencies = [ [[package]] name = "futures-core" -version = "0.3.28" +version = "0.3.30" 
source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "4bca583b7e26f571124fe5b7561d49cb2868d79116cfa0eefce955557c6fee8c" +checksum = "dfc6580bb841c5a68e9ef15c77ccc837b40a7504914d52e47b8b0e9bbda25a1d" [[package]] name = "futures-executor" @@ -1993,9 +2080,9 @@ dependencies = [ [[package]] name = "futures-io" -version = "0.3.28" +version = "0.3.30" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "4fff74096e71ed47f8e023204cfd0aa1289cd54ae5430a9523be060cdb849964" +checksum = "a44623e20b9681a318efdd71c299b6b222ed6f231972bfe2f224ebad6311f0c1" [[package]] name = "futures-lite" @@ -2014,26 +2101,26 @@ dependencies = [ [[package]] name = "futures-macro" -version = "0.3.28" +version = "0.3.30" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "89ca545a94061b6365f2c7355b4b32bd20df3ff95f02da9329b34ccc3bd6ee72" +checksum = "87750cf4b7a4c0625b1529e4c543c2182106e4dedc60a2a6455e00d212c489ac" dependencies = [ "proc-macro2", "quote", - "syn 2.0.32", + "syn 2.0.52", ] [[package]] name = "futures-sink" -version = "0.3.28" +version = "0.3.30" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "f43be4fe21a13b9781a69afa4985b0f6ee0e1afab2c6f454a8cf30e2b2237b6e" +checksum = "9fb8e00e87438d937621c1c6269e53f536c14d3fbd6a042bb24879e57d474fb5" [[package]] name = "futures-task" -version = "0.3.28" +version = "0.3.30" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "76d3d132be6c0e6aa1534069c705a74a5997a356c0dc2f86a47765e5617c5b65" +checksum = "38d84fa142264698cdce1a9f9172cf383a0c82de1bddcf3092901442c4097004" [[package]] name = "futures-timer" @@ -2043,9 +2130,9 @@ checksum = "e64b03909df88034c26dc1547e8970b91f98bdb65165d6a4e9110d94263dbb2c" [[package]] name = "futures-util" -version = "0.3.28" +version = "0.3.30" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "26b01e40b772d54cf6c6d721c1d1abd0647a0106a12ecaa1c186273392a69533" +checksum 
= "3d6401deb83407ab3da39eba7e33987a73c3df0c82b4bb5813ee871c19c41d48" dependencies = [ "futures-channel", "futures-core", @@ -2149,7 +2236,7 @@ dependencies = [ "futures-core", "futures-sink", "futures-util", - "http", + "http 0.2.9", "indexmap 2.0.1", "slab", "tokio", @@ -2199,11 +2286,11 @@ dependencies = [ [[package]] name = "hashlink" -version = "0.8.2" +version = "0.8.4" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "0761a1b9491c4f2e3d66aa0f62d0fba0af9a0e2852e4d48ea506632a4b56e6aa" +checksum = "e8094feaf31ff591f651a2664fb9cfd92bba7a60ce3197265e9482ebe753c8f7" dependencies = [ - "hashbrown 0.13.2", + "hashbrown 0.14.0", ] [[package]] @@ -2300,6 +2387,17 @@ dependencies = [ "itoa", ] +[[package]] +name = "http" +version = "1.0.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b32afd38673a8016f7c9ae69e5af41a58f81b1d31689040f2f1959594ce194ea" +dependencies = [ + "bytes", + "fnv", + "itoa", +] + [[package]] name = "http-body" version = "0.4.5" @@ -2307,7 +2405,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "d5f38f16d184e36f2408a55281cd658ecbd3ca05cce6d6510a176eca393e26d1" dependencies = [ "bytes", - "http", + "http 0.2.9", "pin-project-lite", ] @@ -2370,7 +2468,7 @@ dependencies = [ "futures-core", "futures-util", "h2", - "http", + "http 0.2.9", "http-body", "httparse", "httpdate", @@ -2389,13 +2487,13 @@ version = "0.24.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "0646026eb1b3eea4cd9ba47912ea5ce9cc07713d105b1a14698f4e6433d348b7" dependencies = [ - "http", + "http 0.2.9", "hyper", "log", - "rustls", + "rustls 0.21.9", "rustls-native-certs", "tokio", - "tokio-rustls", + "tokio-rustls 0.24.0", ] [[package]] @@ -2633,7 +2731,7 @@ checksum = "5c7ea04a7c5c055c175f189b6dc6ba036fd62306b58c66c9f6389036c503a3f4" dependencies = [ "base64 0.21.1", "js-sys", - "pem 3.0.3", + "pem", "ring 0.17.6", "serde", "serde_json", @@ -2660,6 +2758,16 @@ 
dependencies = [ "libc", ] +[[package]] +name = "lasso" +version = "0.7.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "4644821e1c3d7a560fe13d842d13f587c07348a1a05d3a797152d41c90c56df2" +dependencies = [ + "dashmap", + "hashbrown 0.13.2", +] + [[package]] name = "lazy_static" version = "1.4.0" @@ -2672,6 +2780,17 @@ version = "1.3.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "830d08ce1d1d941e6b30645f1a0eb5643013d835ce3779a5fc208261dbe10f55" +[[package]] +name = "leaky-bucket" +version = "1.0.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8eb491abd89e9794d50f93c8db610a29509123e3fbbc9c8c67a528e9391cd853" +dependencies = [ + "parking_lot 0.12.1", + "tokio", + "tracing", +] + [[package]] name = "libc" version = "0.2.150" @@ -2688,6 +2807,12 @@ dependencies = [ "winapi", ] +[[package]] +name = "libm" +version = "0.2.8" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "4ec2a862134d2a7d32d7983ddcdd1c4923530833c9f2ea1a44fc5fa473989058" + [[package]] name = "linux-raw-sys" version = "0.1.4" @@ -2758,6 +2883,15 @@ version = "2.6.4" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "f665ee40bc4a3c5590afb1e9677db74a508659dfd71e126420da8274909a0167" +[[package]] +name = "memoffset" +version = "0.7.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5de893c32cde5f383baa4c04c5d6dbdd735cfd4a794b0debdb2bb1b421da5ff4" +dependencies = [ + "autocfg", +] + [[package]] name = "memoffset" version = "0.8.0" @@ -2783,10 +2917,35 @@ dependencies = [ "chrono", "libc", "once_cell", + "procfs", "prometheus", + "rand 0.8.5", + "rand_distr", + "twox-hash", "workspace_hack", ] +[[package]] +name = "migrations_internals" +version = "2.1.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0f23f71580015254b020e856feac3df5878c2c7a8812297edd6c0a485ac9dada" +dependencies = [ + "serde", + 
"toml", +] + +[[package]] +name = "migrations_macros" +version = "2.1.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "cce3325ac70e67bbab5bd837a31cae01f1a6db64e0e744a33cb03a543469ef08" +dependencies = [ + "migrations_internals", + "proc-macro2", + "quote", +] + [[package]] name = "mime" version = "0.3.17" @@ -2820,9 +2979,9 @@ dependencies = [ [[package]] name = "mio" -version = "0.8.10" +version = "0.8.11" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "8f3d0b296e374a4e6f3c7b0a1f5a51d748a0d34c85e7dc48fc3fa9a87657fe09" +checksum = "a4a650543ca06a924e8b371db273b2756685faae30f8487da1b56505a8f78b0c" dependencies = [ "libc", "log", @@ -2866,6 +3025,19 @@ dependencies = [ "libc", ] +[[package]] +name = "nix" +version = "0.26.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "598beaf3cc6fdd9a5dfb1630c2800c7acd31df7aaf0f565796fba2b53ca1af1b" +dependencies = [ + "bitflags 1.3.2", + "cfg-if", + "libc", + "memoffset 0.7.1", + "pin-utils", +] + [[package]] name = "nix" version = "0.27.1" @@ -2988,6 +3160,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "578ede34cf02f8924ab9447f50c28075b4d3e5b269972345e7e0372b38c6cdcd" dependencies = [ "autocfg", + "libm", ] [[package]] @@ -3018,7 +3191,7 @@ dependencies = [ "base64 0.13.1", "chrono", "getrandom 0.2.11", - "http", + "http 0.2.9", "rand 0.8.5", "serde", "serde_json", @@ -3081,7 +3254,7 @@ checksum = "a948666b637a0f465e8564c73e89d4dde00d72d4d473cc972f390fc3dcee7d9c" dependencies = [ "proc-macro2", "quote", - "syn 2.0.32", + "syn 2.0.52", ] [[package]] @@ -3120,7 +3293,7 @@ checksum = "c7594ec0e11d8e33faf03530a4c49af7064ebba81c1480e01be67d90b356508b" dependencies = [ "async-trait", "bytes", - "http", + "http 0.2.9", "opentelemetry_api", "reqwest", ] @@ -3133,7 +3306,7 @@ checksum = "7e5e5a5c4135864099f3faafbe939eb4d7f9b80ebf68a8448da961b32a7c1275" dependencies = [ "async-trait", "futures-core", - "http", + 
"http 0.2.9", "opentelemetry-http", "opentelemetry-proto", "opentelemetry-semantic-conventions", @@ -3309,6 +3482,7 @@ name = "pageserver" version = "0.1.0" dependencies = [ "anyhow", + "arc-swap", "async-compression", "async-stream", "async-trait", @@ -3318,7 +3492,6 @@ dependencies = [ "camino-tempfile", "chrono", "clap", - "close_fds", "const_format", "consumption_metrics", "crc32c", @@ -3337,6 +3510,7 @@ dependencies = [ "humantime-serde", "hyper", "itertools", + "leaky-bucket", "md5", "metrics", "nix 0.27.1", @@ -3344,6 +3518,7 @@ dependencies = [ "num_cpus", "once_cell", "pageserver_api", + "pageserver_compaction", "pin-project-lite", "postgres", "postgres-protocol", @@ -3394,10 +3569,13 @@ dependencies = [ "bincode", "byteorder", "bytes", + "chrono", "const_format", "enum-map", "hex", + "humantime", "humantime-serde", + "itertools", "postgres_ffi", "rand 0.8.5", "serde", @@ -3431,6 +3609,53 @@ dependencies = [ "workspace_hack", ] +[[package]] +name = "pageserver_compaction" +version = "0.1.0" +dependencies = [ + "anyhow", + "async-compression", + "async-stream", + "async-trait", + "byteorder", + "bytes", + "chrono", + "clap", + "const_format", + "consumption_metrics", + "criterion", + "crossbeam-utils", + "either", + "fail", + "flate2", + "futures", + "git-version", + "hex", + "hex-literal", + "humantime", + "humantime-serde", + "itertools", + "metrics", + "once_cell", + "pageserver_api", + "pin-project-lite", + "rand 0.8.5", + "smallvec", + "svg_fmt", + "sync_wrapper", + "thiserror", + "tokio", + "tokio-io-timeout", + "tokio-util", + "tracing", + "tracing-error", + "tracing-subscriber", + "url", + "utils", + "walkdir", + "workspace_hack", +] + [[package]] name = "parking" version = "2.1.1" @@ -3511,7 +3736,7 @@ dependencies = [ "parquet", "proc-macro2", "quote", - "syn 2.0.32", + "syn 2.0.52", ] [[package]] @@ -3549,16 +3774,6 @@ version = "0.1.2" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = 
"19b17cddbe7ec3f8bc800887bab5e717348c95ea2ca0b1bf0837fb964dc67099" -[[package]] -name = "pem" -version = "2.0.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "6b13fe415cdf3c8e44518e18a7c95a13431d9bdf6d15367d82b23c377fdd441a" -dependencies = [ - "base64 0.21.1", - "serde", -] - [[package]] name = "pem" version = "3.0.3" @@ -3620,7 +3835,7 @@ checksum = "39407670928234ebc5e6e580247dd567ad73a3578460c5990f9503df207e8f07" dependencies = [ "proc-macro2", "quote", - "syn 2.0.32", + "syn 2.0.52", ] [[package]] @@ -3641,8 +3856,8 @@ version = "0.9.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "9eca2c590a5f85da82668fa685c09ce2888b9430e83299debf1f34b65fd4a4ba" dependencies = [ - "der", - "spki", + "der 0.6.1", + "spki 0.6.0", ] [[package]] @@ -3741,14 +3956,14 @@ dependencies = [ "futures", "once_cell", "pq_proto", - "rustls", - "rustls-pemfile", + "rustls 0.22.2", + "rustls-pemfile 2.1.1", "serde", "thiserror", "tokio", "tokio-postgres", "tokio-postgres-rustls", - "tokio-rustls", + "tokio-rustls 0.25.0", "tracing", "workspace_hack", ] @@ -3795,6 +4010,15 @@ version = "0.2.17" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "5b40af805b3121feab8a3c29f04d8ad262fa8e0561883e7653e024ae4479e6de" +[[package]] +name = "pq-sys" +version = "0.4.8" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "31c0052426df997c0cbd30789eb44ca097e3541717a7b8fa36b1c464ee7edebd" +dependencies = [ + "vcpkg", +] + [[package]] name = "pq_proto" version = "0.1.0" @@ -3804,6 +4028,7 @@ dependencies = [ "pin-project-lite", "postgres-protocol", "rand 0.8.5", + "serde", "thiserror", "tokio", "tracing", @@ -3827,7 +4052,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "3b69d39aab54d069e7f2fe8cb970493e7834601ca2d8c65fd7bbd183578080d1" dependencies = [ "proc-macro2", - "syn 2.0.32", + "syn 2.0.52", ] [[package]] @@ -3838,9 +4063,9 @@ checksum = 
"dc375e1527247fe1a97d8b7156678dfe7c1af2fc075c9a4db3690ecd2a148068" [[package]] name = "proc-macro2" -version = "1.0.66" +version = "1.0.78" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "18fb31db3f9bddb2ea821cde30a9f70117e3f119938b5ee630b7403aa6e2ead9" +checksum = "e2422ad645d89c99f8f3e6b88a9fdeca7fabeac836b1002371c4367c8f984aae" dependencies = [ "unicode-ident", ] @@ -3853,6 +4078,8 @@ checksum = "b1de8dacb0873f77e6aefc6d71e044761fcc68060290f5b1089fcdf84626bb69" dependencies = [ "bitflags 1.3.2", "byteorder", + "chrono", + "flate2", "hex", "lazy_static", "rustix 0.36.16", @@ -3943,6 +4170,7 @@ dependencies = [ "clap", "consumption_metrics", "dashmap", + "env_logger", "futures", "git-version", "hashbrown 0.13.2", @@ -3955,6 +4183,7 @@ dependencies = [ "hyper-tungstenite", "ipnet", "itertools", + "lasso", "md5", "metrics", "native-tls", @@ -3971,6 +4200,7 @@ dependencies = [ "pq_proto", "prometheus", "rand 0.8.5", + "rand_distr", "rcgen", "redis", "regex", @@ -3982,28 +4212,31 @@ dependencies = [ "routerify", "rstest", "rustc-hash", - "rustls", - "rustls-pemfile", + "rustls 0.22.2", + "rustls-pemfile 2.1.1", "scopeguard", "serde", "serde_json", "sha2", + "smallvec", "smol_str", "socket2 0.5.5", "sync_wrapper", "task-local-extensions", "thiserror", - "tls-listener", + "tikv-jemalloc-ctl", + "tikv-jemallocator", "tokio", "tokio-postgres", "tokio-postgres-rustls", - "tokio-rustls", + "tokio-rustls 0.25.0", "tokio-util", "tracing", "tracing-opentelemetry", "tracing-subscriber", "tracing-utils", "url", + "urlencoding", "utils", "uuid", "walkdir", @@ -4024,13 +4257,24 @@ dependencies = [ [[package]] name = "quote" -version = "1.0.32" +version = "1.0.35" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "50f3b39ccfb720540debaa0164757101c08ecb8d326b15358ce76a62c7e85965" +checksum = "291ec9ab5efd934aaf503a6466c5d5251535d108ee747472c3977cc5acc868ef" dependencies = [ "proc-macro2", ] +[[package]] +name = "r2d2" +version 
= "0.8.10" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "51de85fb3fb6524929c8a2eb85e6b6d363de4e8c48f9e2c2eac4944abc181c93" +dependencies = [ + "log", + "parking_lot 0.12.1", + "scheduled-thread-pool", +] + [[package]] name = "rand" version = "0.7.3" @@ -4093,6 +4337,16 @@ dependencies = [ "getrandom 0.2.11", ] +[[package]] +name = "rand_distr" +version = "0.4.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "32cb0b9bc82b0a0876c2dd994a7e7a2683d3e7390ca40e6886785ef0c7e3ee31" +dependencies = [ + "num-traits", + "rand 0.8.5", +] + [[package]] name = "rand_hc" version = "0.2.0" @@ -4126,12 +4380,12 @@ dependencies = [ [[package]] name = "rcgen" -version = "0.11.1" +version = "0.12.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "4954fbc00dcd4d8282c987710e50ba513d351400dbdd00e803a05172a90d8976" +checksum = "48406db8ac1f3cbc7dcdb56ec355343817958a356ff430259bb07baf7607e1e1" dependencies = [ - "pem 2.0.1", - "ring 0.16.20", + "pem", + "ring 0.17.6", "time", "yasna", ] @@ -4149,15 +4403,15 @@ dependencies = [ "itoa", "percent-encoding", "pin-project-lite", - "rustls", + "rustls 0.21.9", "rustls-native-certs", - "rustls-pemfile", + "rustls-pemfile 1.0.2", "rustls-webpki 0.101.7", "ryu", "sha1_smol", "socket2 0.4.9", "tokio", - "tokio-rustls", + "tokio-rustls 0.24.0", "tokio-util", "url", ] @@ -4212,6 +4466,12 @@ dependencies = [ "regex-syntax 0.8.2", ] +[[package]] +name = "regex-lite" +version = "0.1.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "30b661b2f27137bdbc16f00eda72866a92bb28af1753ffbd56744fb6e2e9cd8e" + [[package]] name = "regex-syntax" version = "0.6.29" @@ -4251,6 +4511,7 @@ dependencies = [ "futures", "futures-util", "http-types", + "humantime", "hyper", "itertools", "metrics", @@ -4262,6 +4523,7 @@ dependencies = [ "serde_json", "test-context", "tokio", + "tokio-stream", "tokio-util", "toml_edit", "tracing", @@ -4281,7 +4543,7 @@ 
dependencies = [ "futures-core", "futures-util", "h2", - "http", + "http 0.2.9", "http-body", "hyper", "hyper-rustls", @@ -4295,14 +4557,14 @@ dependencies = [ "once_cell", "percent-encoding", "pin-project-lite", - "rustls", - "rustls-pemfile", + "rustls 0.21.9", + "rustls-pemfile 1.0.2", "serde", "serde_json", "serde_urlencoded", "tokio", "tokio-native-tls", - "tokio-rustls", + "tokio-rustls 0.24.0", "tokio-util", "tower-service", "url", @@ -4322,7 +4584,7 @@ checksum = "4531c89d50effe1fac90d095c8b133c20c5c714204feee0bfc3fd158e784209d" dependencies = [ "anyhow", "async-trait", - "http", + "http 0.2.9", "reqwest", "serde", "task-local-extensions", @@ -4340,7 +4602,7 @@ dependencies = [ "chrono", "futures", "getrandom 0.2.11", - "http", + "http 0.2.9", "hyper", "parking_lot 0.11.2", "reqwest", @@ -4427,7 +4689,7 @@ version = "3.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "496c1d3718081c45ba9c31fbfc07417900aa96f4070ff90dc29961836b7a9945" dependencies = [ - "http", + "http 0.2.9", "hyper", "lazy_static", "percent-encoding", @@ -4468,7 +4730,7 @@ dependencies = [ "regex", "relative-path", "rustc_version", - "syn 2.0.32", + "syn 2.0.52", "unicode-ident", ] @@ -4552,6 +4814,20 @@ dependencies = [ "sct", ] +[[package]] +name = "rustls" +version = "0.22.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e87c9956bd9807afa1f77e0f7594af32566e830e088a5576d27c5b6f30f49d41" +dependencies = [ + "log", + "ring 0.17.6", + "rustls-pki-types", + "rustls-webpki 0.102.2", + "subtle", + "zeroize", +] + [[package]] name = "rustls-native-certs" version = "0.6.2" @@ -4559,7 +4835,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "0167bac7a9f490495f3c33013e7722b53cb087ecbe082fb0c6387c96f634ea50" dependencies = [ "openssl-probe", - "rustls-pemfile", + "rustls-pemfile 1.0.2", "schannel", "security-framework", ] @@ -4573,6 +4849,22 @@ dependencies = [ "base64 0.21.1", ] +[[package]] +name = 
"rustls-pemfile" +version = "2.1.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f48172685e6ff52a556baa527774f61fcaa884f59daf3375c62a3f1cd2549dab" +dependencies = [ + "base64 0.21.1", + "rustls-pki-types", +] + +[[package]] +name = "rustls-pki-types" +version = "1.3.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5ede67b28608b4c60685c7d54122d4400d90f62b40caee7700e700380a390fa8" + [[package]] name = "rustls-webpki" version = "0.100.2" @@ -4593,6 +4885,17 @@ dependencies = [ "untrusted 0.9.0", ] +[[package]] +name = "rustls-webpki" +version = "0.102.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "faaa0a62740bedb9b2ef5afa303da42764c012f743917351dc9a237ea1663610" +dependencies = [ + "ring 0.17.6", + "rustls-pki-types", + "untrusted 0.9.0", +] + [[package]] name = "rustversion" version = "1.0.12" @@ -4635,7 +4938,7 @@ dependencies = [ "serde_with", "thiserror", "tokio", - "tokio-rustls", + "tokio-rustls 0.25.0", "tokio-stream", "tracing", "tracing-appender", @@ -4659,6 +4962,7 @@ dependencies = [ "clap", "const_format", "crc32c", + "desim", "fail", "fs2", "futures", @@ -4674,6 +4978,7 @@ dependencies = [ "postgres_backend", "postgres_ffi", "pq_proto", + "rand 0.8.5", "regex", "remote_storage", "reqwest", @@ -4694,8 +4999,10 @@ dependencies = [ "tokio-util", "toml_edit", "tracing", + "tracing-subscriber", "url", "utils", + "walproposer", "workspace_hack", ] @@ -4728,6 +5035,15 @@ dependencies = [ "windows-sys 0.42.0", ] +[[package]] +name = "scheduled-thread-pool" +version = "0.2.7" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "3cbc66816425a074528352f5789333ecff06ca41b36b0b0efdfbb29edc391a19" +dependencies = [ + "parking_lot 0.12.1", +] + [[package]] name = "scopeguard" version = "1.1.0" @@ -4757,7 +5073,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = 
"3be24c1842290c45df0a7bf069e0c268a747ad05a192f2fd7dcfdbc1cba40928" dependencies = [ "base16ct", - "der", + "der 0.6.1", "generic-array", "pkcs8", "subtle", @@ -4801,7 +5117,7 @@ checksum = "2e95efd0cefa32028cdb9766c96de71d96671072f9fb494dc9fb84c0ef93e52b" dependencies = [ "httpdate", "reqwest", - "rustls", + "rustls 0.21.9", "sentry-backtrace", "sentry-contexts", "sentry-core", @@ -4923,7 +5239,7 @@ checksum = "aafe972d60b0b9bee71a91b92fee2d4fb3c9d7e8f6b179aa99f27203d99a4816" dependencies = [ "proc-macro2", "quote", - "syn 2.0.32", + "syn 2.0.52", ] [[package]] @@ -5004,7 +5320,7 @@ dependencies = [ "darling", "proc-macro2", "quote", - "syn 2.0.32", + "syn 2.0.52", ] [[package]] @@ -5090,6 +5406,15 @@ dependencies = [ "rand_core 0.6.4", ] +[[package]] +name = "signature" +version = "2.2.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "77549399552de45a898a580c1b41d445bf730df867cc44e6c0233bbc4b8329de" +dependencies = [ + "rand_core 0.6.4", +] + [[package]] name = "simple_asn1" version = "0.6.2" @@ -5174,7 +5499,17 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "67cf02bbac7a337dc36e4f5a693db6c21e7863f45070f7064577eb4367a3212b" dependencies = [ "base64ct", - "der", + "der 0.6.1", +] + +[[package]] +name = "spki" +version = "0.7.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d91ed6c858b01f942cd56b37a94b3e0a1798290327d1236e4d9cf4eaca44d29d" +dependencies = [ + "base64ct", + "der 0.7.8", ] [[package]] @@ -5260,9 +5595,9 @@ checksum = "81cdd64d312baedb58e21336b31bc043b77e01cc99033ce76ef539f78e965ebc" [[package]] name = "svg_fmt" -version = "0.4.1" +version = "0.4.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "8fb1df15f412ee2e9dfc1c504260fa695c1c3f10fe9f4a6ee2d2184d7d6450e2" +checksum = "f83ba502a3265efb76efb89b0a2f7782ad6f2675015d4ce37e4b547dda42b499" [[package]] name = "syn" @@ -5277,9 +5612,9 @@ dependencies = [ [[package]] name = "syn" 
-version = "2.0.32" +version = "2.0.52" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "239814284fd6f1a4ffe4ca893952cdd93c224b6a1571c9a9eadd670295c0c9e2" +checksum = "b699d15b36d1f02c3e7c69f8ffef53de37aefae075d8488d4ba1a7788d574a07" dependencies = [ "proc-macro2", "quote", @@ -5394,22 +5729,22 @@ dependencies = [ [[package]] name = "thiserror" -version = "1.0.47" +version = "1.0.57" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "97a802ec30afc17eee47b2855fc72e0c4cd62be9b4efe6591edde0ec5bd68d8f" +checksum = "1e45bcbe8ed29775f228095caf2cd67af7a4ccf756ebff23a306bf3e8b47b24b" dependencies = [ "thiserror-impl", ] [[package]] name = "thiserror-impl" -version = "1.0.47" +version = "1.0.57" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "6bb623b56e39ab7dcd4b1b98bb6c8f8d907ed255b18de254088016b27a8ee19b" +checksum = "a953cb265bef375dae3de6663da4d3804eee9682ea80d8e2542529b73c531c81" dependencies = [ "proc-macro2", "quote", - "syn 2.0.32", + "syn 2.0.52", ] [[package]] @@ -5433,6 +5768,37 @@ dependencies = [ "ordered-float 2.10.1", ] +[[package]] +name = "tikv-jemalloc-ctl" +version = "0.5.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "619bfed27d807b54f7f776b9430d4f8060e66ee138a28632ca898584d462c31c" +dependencies = [ + "libc", + "paste", + "tikv-jemalloc-sys", +] + +[[package]] +name = "tikv-jemalloc-sys" +version = "0.5.4+5.3.0-patched" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9402443cb8fd499b6f327e40565234ff34dbda27460c5b47db0db77443dd85d1" +dependencies = [ + "cc", + "libc", +] + +[[package]] +name = "tikv-jemallocator" +version = "0.5.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "965fe0c26be5c56c94e38ba547249074803efd52adfb66de62107d95aab3eaca" +dependencies = [ + "libc", + "tikv-jemalloc-sys", +] + [[package]] name = "time" version = "0.3.21" @@ -5497,25 +5863,11 @@ version = 
"0.1.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "1f3ccbac311fea05f86f61904b462b55fb3df8837a366dfc601a0161d0532f20" -[[package]] -name = "tls-listener" -version = "0.7.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "81294c017957a1a69794f506723519255879e15a870507faf45dfed288b763dd" -dependencies = [ - "futures-util", - "hyper", - "pin-project-lite", - "thiserror", - "tokio", - "tokio-rustls", -] - [[package]] name = "tokio" -version = "1.34.0" +version = "1.36.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "d0c014766411e834f7af5b8f4cf46257aab4036ca95e9d2c144a10f59ad6f5b9" +checksum = "61285f6515fa018fb2d1e46eb21223fff441ee8db5d0f1435e8ab4f5cdb80931" dependencies = [ "backtrace", "bytes", @@ -5532,9 +5884,10 @@ dependencies = [ [[package]] name = "tokio-epoll-uring" version = "0.1.0" -source = "git+https://github.com/neondatabase/tokio-epoll-uring.git?branch=main#0dd3a2f8bf3239d34a19719ef1a74146c093126f" +source = "git+https://github.com/neondatabase/tokio-epoll-uring.git?branch=main#868d2c42b5d54ca82fead6e8f2f233b69a540d3e" dependencies = [ "futures", + "nix 0.26.4", "once_cell", "scopeguard", "thiserror", @@ -5562,7 +5915,7 @@ checksum = "5b8a1e28f2deaa14e508979454cb3a223b10b938b45af148bc0986de36f1923b" dependencies = [ "proc-macro2", "quote", - "syn 2.0.32", + "syn 2.0.52", ] [[package]] @@ -5600,16 +5953,17 @@ dependencies = [ [[package]] name = "tokio-postgres-rustls" -version = "0.10.0" +version = "0.11.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "dd5831152cb0d3f79ef5523b357319ba154795d64c7078b2daa95a803b54057f" +checksum = "0ea13f22eda7127c827983bdaf0d7fff9df21c8817bab02815ac277a21143677" dependencies = [ "futures", - "ring 0.16.20", - "rustls", + "ring 0.17.6", + "rustls 0.22.2", "tokio", "tokio-postgres", - "tokio-rustls", + "tokio-rustls 0.25.0", + "x509-certificate", ] [[package]] @@ -5618,7 +5972,18 @@ version = 
"0.24.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "e0d409377ff5b1e3ca6437aa86c1eb7d40c134bfec254e44c830defa92669db5" dependencies = [ - "rustls", + "rustls 0.21.9", + "tokio", +] + +[[package]] +name = "tokio-rustls" +version = "0.25.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "775e0c0f0adb3a2f22a00c4745d728b479985fc15ee7ca6a2608388c5569860f" +dependencies = [ + "rustls 0.22.2", + "rustls-pki-types", "tokio", ] @@ -5738,7 +6103,7 @@ dependencies = [ "futures-core", "futures-util", "h2", - "http", + "http 0.2.9", "http-body", "hyper", "hyper-timeout", @@ -5746,9 +6111,9 @@ dependencies = [ "pin-project", "prost", "rustls-native-certs", - "rustls-pemfile", + "rustls-pemfile 1.0.2", "tokio", - "tokio-rustls", + "tokio-rustls 0.24.0", "tokio-stream", "tower", "tower-layer", @@ -5844,7 +6209,7 @@ checksum = "0f57e3ca2a01450b1a921183a9c9cbfda207fd822cef4ccb00a65402cbba7a74" dependencies = [ "proc-macro2", "quote", - "syn 2.0.32", + "syn 2.0.52", ] [[package]] @@ -5953,7 +6318,7 @@ dependencies = [ "byteorder", "bytes", "data-encoding", - "http", + "http 0.2.9", "httparse", "log", "rand 0.8.5", @@ -6060,7 +6425,7 @@ dependencies = [ "base64 0.21.1", "log", "once_cell", - "rustls", + "rustls 0.21.9", "rustls-webpki 0.100.2", "url", "webpki-roots 0.23.1", @@ -6069,8 +6434,9 @@ dependencies = [ [[package]] name = "uring-common" version = "0.1.0" -source = "git+https://github.com/neondatabase/tokio-epoll-uring.git?branch=main#0dd3a2f8bf3239d34a19719ef1a74146c093126f" +source = "git+https://github.com/neondatabase/tokio-epoll-uring.git?branch=main#868d2c42b5d54ca82fead6e8f2f233b69a540d3e" dependencies = [ + "bytes", "io-uring", "libc", ] @@ -6127,6 +6493,7 @@ dependencies = [ "hex-literal", "hyper", "jsonwebtoken", + "leaky-bucket", "metrics", "nix 0.27.1", "once_cell", @@ -6301,7 +6668,7 @@ dependencies = [ "once_cell", "proc-macro2", "quote", - "syn 2.0.32", + "syn 2.0.52", "wasm-bindgen-shared", ] @@ 
-6335,7 +6702,7 @@ checksum = "e128beba882dd1eb6200e1dc92ae6c5dbaa4311aa7bb211ca035779e5efc39f8" dependencies = [ "proc-macro2", "quote", - "syn 2.0.32", + "syn 2.0.52", "wasm-bindgen-backend", "wasm-bindgen-shared", ] @@ -6646,6 +7013,7 @@ dependencies = [ "futures-sink", "futures-util", "getrandom 0.2.11", + "hashbrown 0.13.2", "hashbrown 0.14.0", "hex", "hmac", @@ -6667,19 +7035,18 @@ dependencies = [ "regex-automata 0.4.3", "regex-syntax 0.8.2", "reqwest", - "ring 0.16.20", - "rustls", + "rustls 0.21.9", "scopeguard", "serde", "serde_json", "smallvec", "subtle", "syn 1.0.109", - "syn 2.0.32", + "syn 2.0.52", "time", "time-macros", "tokio", - "tokio-rustls", + "tokio-rustls 0.24.0", "tokio-util", "toml_datetime", "toml_edit", @@ -6690,11 +7057,31 @@ dependencies = [ "tungstenite", "url", "uuid", + "zeroize", "zstd", "zstd-safe", "zstd-sys", ] +[[package]] +name = "x509-certificate" +version = "0.23.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "66534846dec7a11d7c50a74b7cdb208b9a581cad890b7866430d438455847c85" +dependencies = [ + "bcder", + "bytes", + "chrono", + "der 0.7.8", + "hex", + "pem", + "ring 0.17.6", + "signature 2.2.0", + "spki 0.7.3", + "thiserror", + "zeroize", +] + [[package]] name = "x509-parser" version = "0.15.0" @@ -6753,7 +7140,7 @@ checksum = "b3c129550b3e6de3fd0ba67ba5c81818f9805e58b8d7fee80a3a59d2c9fc601a" dependencies = [ "proc-macro2", "quote", - "syn 2.0.32", + "syn 2.0.52", ] [[package]] @@ -6761,6 +7148,20 @@ name = "zeroize" version = "1.6.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "2a0956f1ba7c7909bfb66c2e9e4124ab6f6482560f6628b5aaeba39207c9aad9" +dependencies = [ + "zeroize_derive", +] + +[[package]] +name = "zeroize_derive" +version = "1.4.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ce36e65b0d2999d2aafac989fb249189a141aee1f53c612c1f37d72631959f69" +dependencies = [ + "proc-macro2", + "quote", + "syn 2.0.52", +] [[package]] 
name = "zstd" diff --git a/Cargo.toml b/Cargo.toml index 0254ea24e1..3f98c1946c 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -5,6 +5,7 @@ members = [ "control_plane", "control_plane/attachment_service", "pageserver", + "pageserver/compaction", "pageserver/ctl", "pageserver/client", "pageserver/pagebench", @@ -18,6 +19,7 @@ members = [ "libs/pageserver_api", "libs/postgres_ffi", "libs/safekeeper_api", + "libs/desim", "libs/utils", "libs/consumption_metrics", "libs/postgres_backend", @@ -48,11 +50,12 @@ azure_storage_blobs = "0.18" flate2 = "1.0.26" async-stream = "0.3" async-trait = "0.1" -aws-config = { version = "1.0", default-features = false, features=["rustls"] } -aws-sdk-s3 = "1.0" -aws-smithy-async = { version = "1.0", default-features = false, features=["rt-tokio"] } -aws-smithy-types = "1.0" -aws-credential-types = "1.0" +aws-config = { version = "1.1.4", default-features = false, features=["rustls"] } +aws-sdk-s3 = "1.14" +aws-sdk-secretsmanager = { version = "1.14.0" } +aws-smithy-async = { version = "1.1.4", default-features = false, features=["rt-tokio"] } +aws-smithy-types = "1.1.4" +aws-credential-types = "1.1.4" axum = { version = "0.6.20", features = ["ws"] } base64 = "0.13.0" bincode = "1.3" @@ -64,7 +67,6 @@ camino = "1.1.6" cfg-if = "1.0.0" chrono = { version = "0.4", default-features = false, features = ["clock"] } clap = { version = "4.0", features = ["derive"] } -close_fds = "0.3.2" comfy-table = "6.1" const_format = "0.2" crc32c = "0.6" @@ -80,7 +82,7 @@ futures-core = "0.3" futures-util = "0.3" git-version = "0.3" hashbrown = "0.13" -hashlink = "0.8.1" +hashlink = "0.8.4" hdrhistogram = "7.5.2" hex = "0.4" hex-literal = "0.4" @@ -95,6 +97,8 @@ inotify = "0.10.2" ipnet = "2.9.0" itertools = "0.10" jsonwebtoken = "9" +lasso = "0.7" +leaky-bucket = "1.0.1" libc = "0.2" md5 = "0.7.0" memoffset = "0.8" @@ -112,6 +116,7 @@ parquet = { version = "49.0.0", default-features = false, features = ["zstd"] } parquet_derive = "49.0.0" pbkdf2 = { version = 
"0.12.1", features = ["simple", "std"] } pin-project-lite = "0.2" +procfs = "0.14" prometheus = {version = "0.13", default_features=false, features = ["process"]} # removes protobuf dependency prost = "0.11" rand = "0.8" @@ -124,8 +129,8 @@ reqwest-retry = "0.2.2" routerify = "3" rpds = "0.13" rustc-hash = "1.1.0" -rustls = "0.21" -rustls-pemfile = "1" +rustls = "0.22" +rustls-pemfile = "2" rustls-split = "0.3" scopeguard = "1.1" sysinfo = "0.29.2" @@ -149,12 +154,13 @@ tar = "0.4" task-local-extensions = "0.1.4" test-context = "0.1" thiserror = "1.0" -tls-listener = { version = "0.7", features = ["rustls", "hyper-h1"] } +tikv-jemallocator = "0.5" +tikv-jemalloc-ctl = "0.5" tokio = { version = "1.17", features = ["macros"] } tokio-epoll-uring = { git = "https://github.com/neondatabase/tokio-epoll-uring.git" , branch = "main" } tokio-io-timeout = "1.2.0" -tokio-postgres-rustls = "0.10.0" -tokio-rustls = "0.24" +tokio-postgres-rustls = "0.11.0" +tokio-rustls = "0.25" tokio-stream = "0.1" tokio-tar = "0.3" tokio-test = "0.4.3" @@ -166,7 +172,9 @@ tracing = "0.1" tracing-error = "0.2.0" tracing-opentelemetry = "0.20.0" tracing-subscriber = { version = "0.3", default_features = false, features = ["smallvec", "fmt", "tracing-log", "std", "env-filter", "json"] } +twox-hash = { version = "1.6.3", default-features = false } url = "2.2" +urlencoding = "2.1" uuid = { version = "1.6.1", features = ["v4", "v7", "serde"] } walkdir = "2.3.2" webpki-roots = "0.25" @@ -192,12 +200,14 @@ consumption_metrics = { version = "0.1", path = "./libs/consumption_metrics/" } metrics = { version = "0.1", path = "./libs/metrics/" } pageserver_api = { version = "0.1", path = "./libs/pageserver_api/" } pageserver_client = { path = "./pageserver/client" } +pageserver_compaction = { version = "0.1", path = "./pageserver/compaction/" } postgres_backend = { version = "0.1", path = "./libs/postgres_backend/" } postgres_connection = { version = "0.1", path = "./libs/postgres_connection/" } 
postgres_ffi = { version = "0.1", path = "./libs/postgres_ffi/" } pq_proto = { version = "0.1", path = "./libs/pq_proto/" } remote_storage = { version = "0.1", path = "./libs/remote_storage/" } safekeeper_api = { version = "0.1", path = "./libs/safekeeper_api" } +desim = { version = "0.1", path = "./libs/desim" } storage_broker = { version = "0.1", path = "./storage_broker/" } # Note: main broker code is inside the binary crate, so linking with the library shouldn't be heavy. tenant_size_model = { version = "0.1", path = "./libs/tenant_size_model/" } tracing-utils = { version = "0.1", path = "./libs/tracing-utils/" } @@ -210,7 +220,7 @@ workspace_hack = { version = "0.1", path = "./workspace_hack/" } ## Build dependencies criterion = "0.5.1" -rcgen = "0.11" +rcgen = "0.12" rstest = "0.18" camino-tempfile = "1.0.2" tonic-build = "0.9" diff --git a/Dockerfile b/Dockerfile index 5d5fde4f14..5f82df3e18 100644 --- a/Dockerfile +++ b/Dockerfile @@ -47,12 +47,13 @@ COPY --chown=nonroot . . # Show build caching stats to check if it was used in the end. # Has to be the part of the same RUN since cachepot daemon is killed in the end of this RUN, losing the compilation stats. 
RUN set -e \ - && mold -run cargo build \ + && RUSTFLAGS="-Clinker=clang -Clink-arg=-fuse-ld=mold -Clink-arg=-Wl,--no-rosegment" cargo build \ --bin pg_sni_router \ --bin pageserver \ --bin pagectl \ --bin safekeeper \ --bin storage_broker \ + --bin storage_controller \ --bin proxy \ --bin neon_local \ --locked --release \ @@ -80,6 +81,7 @@ COPY --from=build --chown=neon:neon /home/nonroot/target/release/pageserver COPY --from=build --chown=neon:neon /home/nonroot/target/release/pagectl /usr/local/bin COPY --from=build --chown=neon:neon /home/nonroot/target/release/safekeeper /usr/local/bin COPY --from=build --chown=neon:neon /home/nonroot/target/release/storage_broker /usr/local/bin +COPY --from=build --chown=neon:neon /home/nonroot/target/release/storage_controller /usr/local/bin COPY --from=build --chown=neon:neon /home/nonroot/target/release/proxy /usr/local/bin COPY --from=build --chown=neon:neon /home/nonroot/target/release/neon_local /usr/local/bin @@ -98,6 +100,11 @@ RUN mkdir -p /data/.neon/ && chown -R neon:neon /data/.neon/ \ -c "listen_pg_addr='0.0.0.0:6400'" \ -c "listen_http_addr='0.0.0.0:9898'" +# When running a binary that links with libpq, default to using our most recent postgres version. Binaries +# that want a particular postgres version will select it explicitly: this is just a default. 
+ENV LD_LIBRARY_PATH /usr/local/v16/lib + + VOLUME ["/data"] USER neon EXPOSE 6400 diff --git a/Dockerfile.buildtools b/Dockerfile.build-tools similarity index 99% rename from Dockerfile.buildtools rename to Dockerfile.build-tools index 213aed1679..3a452fec32 100644 --- a/Dockerfile.buildtools +++ b/Dockerfile.build-tools @@ -111,7 +111,7 @@ USER nonroot:nonroot WORKDIR /home/nonroot # Python -ENV PYTHON_VERSION=3.9.2 \ +ENV PYTHON_VERSION=3.9.18 \ PYENV_ROOT=/home/nonroot/.pyenv \ PATH=/home/nonroot/.pyenv/shims:/home/nonroot/.pyenv/bin:/home/nonroot/.poetry/bin:$PATH RUN set -e \ @@ -135,7 +135,7 @@ WORKDIR /home/nonroot # Rust # Please keep the version of llvm (installed above) in sync with rust llvm (`rustc --version --verbose | grep LLVM`) -ENV RUSTC_VERSION=1.75.0 +ENV RUSTC_VERSION=1.76.0 ENV RUSTUP_HOME="/home/nonroot/.rustup" ENV PATH="/home/nonroot/.cargo/bin:${PATH}" RUN curl -sSO https://static.rust-lang.org/rustup/dist/$(uname -m)-unknown-linux-gnu/rustup-init && whoami && \ diff --git a/Dockerfile.compute-node b/Dockerfile.compute-node index 299c4097e8..c73b9ce5c9 100644 --- a/Dockerfile.compute-node +++ b/Dockerfile.compute-node @@ -520,8 +520,7 @@ RUN apt-get update && \ libboost-regex1.74-dev \ libboost-serialization1.74-dev \ libboost-system1.74-dev \ - libeigen3-dev \ - libfreetype6-dev + libeigen3-dev ENV PATH "/usr/local/pgsql/bin/:/usr/local/pgsql/:$PATH" RUN wget https://github.com/rdkit/rdkit/archive/refs/tags/Release_2023_03_3.tar.gz -O rdkit.tar.gz && \ @@ -547,6 +546,7 @@ RUN wget https://github.com/rdkit/rdkit/archive/refs/tags/Release_2023_03_3.tar. -D PostgreSQL_LIBRARY_DIR=`pg_config --libdir` \ -D RDK_INSTALL_INTREE=OFF \ -D RDK_INSTALL_COMIC_FONTS=OFF \ + -D RDK_BUILD_FREETYPE_SUPPORT=OFF \ -D CMAKE_BUILD_TYPE=Release \ . 
&& \ make -j $(getconf _NPROCESSORS_ONLN) && \ @@ -639,8 +639,8 @@ FROM build-deps AS pg-anon-pg-build COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/ ENV PATH "/usr/local/pgsql/bin/:$PATH" -RUN wget https://gitlab.com/dalibo/postgresql_anonymizer/-/archive/1.1.0/postgresql_anonymizer-1.1.0.tar.gz -O pg_anon.tar.gz && \ - echo "08b09d2ff9b962f96c60db7e6f8e79cf7253eb8772516998fc35ece08633d3ad pg_anon.tar.gz" | sha256sum --check && \ +RUN wget https://github.com/neondatabase/postgresql_anonymizer/archive/refs/tags/neon_1.1.1.tar.gz -O pg_anon.tar.gz && \ + echo "321ea8d5c1648880aafde850a2c576e4a9e7b9933a34ce272efc839328999fa9 pg_anon.tar.gz" | sha256sum --check && \ mkdir pg_anon-src && cd pg_anon-src && tar xvzf ../pg_anon.tar.gz --strip-components=1 -C . && \ find /usr/local/pgsql -type f | sed 's|^/usr/local/pgsql/||' > /before.txt &&\ make -j $(getconf _NPROCESSORS_ONLN) install PG_CONFIG=/usr/local/pgsql/bin/pg_config && \ @@ -769,6 +769,40 @@ RUN wget https://github.com/eulerto/wal2json/archive/refs/tags/wal2json_2_5.tar. make -j $(getconf _NPROCESSORS_ONLN) && \ make -j $(getconf _NPROCESSORS_ONLN) install +######################################################################################### +# +# Layer "pg_ivm" +# compile pg_ivm extension +# +######################################################################################### +FROM build-deps AS pg-ivm-build +COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/ + +ENV PATH "/usr/local/pgsql/bin/:$PATH" +RUN wget https://github.com/sraoss/pg_ivm/archive/refs/tags/v1.7.tar.gz -O pg_ivm.tar.gz && \ + echo "ebfde04f99203c7be4b0e873f91104090e2e83e5429c32ac242d00f334224d5e pg_ivm.tar.gz" | sha256sum --check && \ + mkdir pg_ivm-src && cd pg_ivm-src && tar xvzf ../pg_ivm.tar.gz --strip-components=1 -C . 
&& \ + make -j $(getconf _NPROCESSORS_ONLN) && \ + make -j $(getconf _NPROCESSORS_ONLN) install && \ + echo 'trusted = true' >> /usr/local/pgsql/share/extension/pg_ivm.control + +######################################################################################### +# +# Layer "pg_partman" +# compile pg_partman extension +# +######################################################################################### +FROM build-deps AS pg-partman-build +COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/ + +ENV PATH "/usr/local/pgsql/bin/:$PATH" +RUN wget https://github.com/pgpartman/pg_partman/archive/refs/tags/v5.0.1.tar.gz -O pg_partman.tar.gz && \ + echo "75b541733a9659a6c90dbd40fccb904a630a32880a6e3044d0c4c5f4c8a65525 pg_partman.tar.gz" | sha256sum --check && \ + mkdir pg_partman-src && cd pg_partman-src && tar xvzf ../pg_partman.tar.gz --strip-components=1 -C . && \ + make -j $(getconf _NPROCESSORS_ONLN) && \ + make -j $(getconf _NPROCESSORS_ONLN) install && \ + echo 'trusted = true' >> /usr/local/pgsql/share/extension/pg_partman.control + ######################################################################################### # # Layer "neon-pg-ext-build" @@ -809,6 +843,9 @@ COPY --from=pg-roaringbitmap-pg-build /usr/local/pgsql/ /usr/local/pgsql/ COPY --from=pg-semver-pg-build /usr/local/pgsql/ /usr/local/pgsql/ COPY --from=pg-embedding-pg-build /usr/local/pgsql/ /usr/local/pgsql/ COPY --from=wal2json-pg-build /usr/local/pgsql /usr/local/pgsql +COPY --from=pg-anon-pg-build /usr/local/pgsql/ /usr/local/pgsql/ +COPY --from=pg-ivm-build /usr/local/pgsql/ /usr/local/pgsql/ +COPY --from=pg-partman-build /usr/local/pgsql/ /usr/local/pgsql/ COPY pgxn/ pgxn/ RUN make -j $(getconf _NPROCESSORS_ONLN) \ @@ -819,6 +856,10 @@ RUN make -j $(getconf _NPROCESSORS_ONLN) \ PG_CONFIG=/usr/local/pgsql/bin/pg_config \ -C pgxn/neon_utils \ -s install && \ + make -j $(getconf _NPROCESSORS_ONLN) \ + PG_CONFIG=/usr/local/pgsql/bin/pg_config \ + -C pgxn/neon_test_utils \ + -s 
install && \ make -j $(getconf _NPROCESSORS_ONLN) \ PG_CONFIG=/usr/local/pgsql/bin/pg_config \ -C pgxn/neon_rmgr \ @@ -850,7 +891,17 @@ ENV BUILD_TAG=$BUILD_TAG USER nonroot # Copy entire project to get Cargo.* files with proper dependencies for the whole project COPY --chown=nonroot . . -RUN cd compute_tools && cargo build --locked --profile release-line-debug-size-lto +RUN cd compute_tools && mold -run cargo build --locked --profile release-line-debug-size-lto + +######################################################################################### +# +# Final compute-tools image +# +######################################################################################### + +FROM debian:bullseye-slim AS compute-tools-image + +COPY --from=compute-tools /home/nonroot/target/release-line-debug-size-lto/compute_ctl /usr/local/bin/compute_ctl ######################################################################################### # @@ -901,7 +952,7 @@ COPY --from=compute-tools --chown=postgres /home/nonroot/target/release-line-deb # libgeos, libgdal, libsfcgal1, libproj and libprotobuf-c1 for PostGIS # libxml2, libxslt1.1 for xml2 # libzstd1 for zstd -# libboost*, libfreetype6, and zlib1g for rdkit +# libboost* for rdkit # ca-certificates for communicating with s3 by compute_ctl RUN apt update && \ apt install --no-install-recommends -y \ @@ -914,7 +965,6 @@ RUN apt update && \ libboost-serialization1.74.0 \ libboost-system1.74.0 \ libossp-uuid16 \ - libfreetype6 \ libgeos-c1v5 \ libgdal28 \ libproj19 \ @@ -926,7 +976,6 @@ RUN apt update && \ libcurl4-openssl-dev \ locales \ procps \ - zlib1g \ ca-certificates && \ rm -rf /var/lib/apt/lists/* /tmp/* /var/tmp/* && \ localedef -i en_US -c -f UTF-8 -A /usr/share/locale/locale.alias en_US.UTF-8 diff --git a/Dockerfile.compute-tools b/Dockerfile.compute-tools deleted file mode 100644 index cc305cc556..0000000000 --- a/Dockerfile.compute-tools +++ /dev/null @@ -1,32 +0,0 @@ -# First transient image to build 
compute_tools binaries -# NB: keep in sync with rust image version in .github/workflows/build_and_test.yml -ARG REPOSITORY=neondatabase -ARG IMAGE=build-tools -ARG TAG=pinned -ARG BUILD_TAG - -FROM $REPOSITORY/$IMAGE:$TAG AS rust-build -WORKDIR /home/nonroot - -# Enable https://github.com/paritytech/cachepot to cache Rust crates' compilation results in Docker builds. -# Set up cachepot to use an AWS S3 bucket for cache results, to reuse it between `docker build` invocations. -# cachepot falls back to local filesystem if S3 is misconfigured, not failing the build. -ARG RUSTC_WRAPPER=cachepot -ENV AWS_REGION=eu-central-1 -ENV CACHEPOT_S3_KEY_PREFIX=cachepot -ARG CACHEPOT_BUCKET=neon-github-dev -#ARG AWS_ACCESS_KEY_ID -#ARG AWS_SECRET_ACCESS_KEY -ARG BUILD_TAG -ENV BUILD_TAG=$BUILD_TAG - -COPY . . - -RUN set -e \ - && mold -run cargo build -p compute_tools --locked --release \ - && cachepot -s - -# Final image that only has one binary -FROM debian:bullseye-slim - -COPY --from=rust-build /home/nonroot/target/release/compute_ctl /usr/local/bin/compute_ctl diff --git a/Makefile b/Makefile index 004ca3fbcf..f13f080f1a 100644 --- a/Makefile +++ b/Makefile @@ -51,6 +51,8 @@ CARGO_BUILD_FLAGS += $(filter -j1,$(MAKEFLAGS)) CARGO_CMD_PREFIX += $(if $(filter n,$(MAKEFLAGS)),,+) # Force cargo not to print progress bar CARGO_CMD_PREFIX += CARGO_TERM_PROGRESS_WHEN=never CI=1 +# Set PQ_LIB_DIR to make sure `storage_controller` get linked with bundled libpq (through diesel) +CARGO_CMD_PREFIX += PQ_LIB_DIR=$(POSTGRES_INSTALL_DIR)/v16/lib # # Top level Makefile to build Neon and PostgreSQL @@ -157,8 +159,8 @@ neon-pg-ext-%: postgres-% -C $(POSTGRES_INSTALL_DIR)/build/neon-utils-$* \ -f $(ROOT_PROJECT_DIR)/pgxn/neon_utils/Makefile install -.PHONY: neon-pg-ext-clean-% -neon-pg-ext-clean-%: +.PHONY: neon-pg-clean-ext-% +neon-pg-clean-ext-%: $(MAKE) PG_CONFIG=$(POSTGRES_INSTALL_DIR)/$*/bin/pg_config \ -C $(POSTGRES_INSTALL_DIR)/build/neon-$* \ -f $(ROOT_PROJECT_DIR)/pgxn/neon/Makefile 
clean @@ -174,10 +176,10 @@ neon-pg-ext-clean-%: # Build walproposer as a static library. walproposer source code is located # in the pgxn/neon directory. -# +# # We also need to include libpgport.a and libpgcommon.a, because walproposer # uses some functions from those libraries. -# +# # Some object files are removed from libpgport.a and libpgcommon.a because # they depend on openssl and other libraries that are not included in our # Rust build. @@ -214,11 +216,11 @@ neon-pg-ext: \ neon-pg-ext-v15 \ neon-pg-ext-v16 -.PHONY: neon-pg-ext-clean -neon-pg-ext-clean: \ - neon-pg-ext-clean-v14 \ - neon-pg-ext-clean-v15 \ - neon-pg-ext-clean-v16 +.PHONY: neon-pg-clean-ext +neon-pg-clean-ext: \ + neon-pg-clean-ext-v14 \ + neon-pg-clean-ext-v15 \ + neon-pg-clean-ext-v16 # shorthand to build all Postgres versions .PHONY: postgres @@ -247,7 +249,7 @@ postgres-check: \ # This doesn't remove the effects of 'configure'. .PHONY: clean -clean: postgres-clean neon-pg-ext-clean +clean: postgres-clean neon-pg-clean-ext $(CARGO_CMD_PREFIX) cargo clean # This removes everything diff --git a/NOTICE b/NOTICE index c13dc2f0b3..52fc751c41 100644 --- a/NOTICE +++ b/NOTICE @@ -1,5 +1,5 @@ Neon -Copyright 2022 Neon Inc. +Copyright 2022 - 2024 Neon Inc. The PostgreSQL submodules in vendor/ are licensed under the PostgreSQL license. See vendor/postgres-vX/COPYRIGHT for details. diff --git a/README.md b/README.md index 98af1edee6..c44ae695d6 100644 --- a/README.md +++ b/README.md @@ -5,7 +5,7 @@ Neon is a serverless open-source alternative to AWS Aurora Postgres. It separates storage and compute and substitutes the PostgreSQL storage layer by redistributing data across a cluster of nodes. ## Quick start -Try the [Neon Free Tier](https://neon.tech/docs/introduction/technical-preview-free-tier/) to create a serverless Postgres instance. 
Then connect to it with your preferred Postgres client (psql, dbeaver, etc) or use the online [SQL Editor](https://neon.tech/docs/get-started-with-neon/query-with-neon-sql-editor/). See [Connect from any application](https://neon.tech/docs/connect/connect-from-any-app/) for connection instructions. +Try the [Neon Free Tier](https://neon.tech/github) to create a serverless Postgres instance. Then connect to it with your preferred Postgres client (psql, dbeaver, etc) or use the online [SQL Editor](https://neon.tech/docs/get-started-with-neon/query-with-neon-sql-editor/). See [Connect from any application](https://neon.tech/docs/connect/connect-from-any-app/) for connection instructions. Alternatively, compile and run the project [locally](#running-local-installation). @@ -14,8 +14,8 @@ Alternatively, compile and run the project [locally](#running-local-installation A Neon installation consists of compute nodes and the Neon storage engine. Compute nodes are stateless PostgreSQL nodes backed by the Neon storage engine. The Neon storage engine consists of two major components: -- Pageserver. Scalable storage backend for the compute nodes. -- Safekeepers. The safekeepers form a redundant WAL service that received WAL from the compute node, and stores it durably until it has been processed by the pageserver and uploaded to cloud storage. +- Pageserver: Scalable storage backend for the compute nodes. +- Safekeepers: The safekeepers form a redundant WAL service that receives WAL from the compute node, and stores it durably until it has been processed by the pageserver and uploaded to cloud storage. See developer documentation in [SUMMARY.md](/docs/SUMMARY.md) for more information. @@ -81,9 +81,9 @@ The project uses [rust toolchain file](./rust-toolchain.toml) to define the vers This file is automatically picked up by [`rustup`](https://rust-lang.github.io/rustup/overrides.html#the-toolchain-file) that installs (if absent) and uses the toolchain version pinned in the file. 
-rustup users who want to build with another toolchain can use [`rustup override`](https://rust-lang.github.io/rustup/overrides.html#directory-overrides) command to set a specific toolchain for the project's directory. +rustup users who want to build with another toolchain can use the [`rustup override`](https://rust-lang.github.io/rustup/overrides.html#directory-overrides) command to set a specific toolchain for the project's directory. -non-rustup users most probably are not getting the same toolchain automatically from the file, so are responsible to manually verify their toolchain matches the version in the file. +non-rustup users most probably are not getting the same toolchain automatically from the file, so are responsible to manually verify that their toolchain matches the version in the file. Newer rustc versions most probably will work fine, yet older ones might not be supported due to some new features used by the project or the crates. #### Building on Linux @@ -124,7 +124,7 @@ make -j`sysctl -n hw.logicalcpu` -s To run the `psql` client, install the `postgresql-client` package or modify `PATH` and `LD_LIBRARY_PATH` to include `pg_install/bin` and `pg_install/lib`, respectively. To run the integration tests or Python scripts (not required to use the code), install -Python (3.9 or higher), and install python3 packages using `./scripts/pysync` (requires [poetry>=1.3](https://python-poetry.org/)) in the project directory. +Python (3.9 or higher), and install the python3 packages using `./scripts/pysync` (requires [poetry>=1.3](https://python-poetry.org/)) in the project directory. #### Running neon database @@ -166,7 +166,7 @@ Starting postgres at 'postgresql://cloud_admin@127.0.0.1:55432/postgres' 2. 
Now, it is possible to connect to postgres and run some queries: ```text -> psql -p55432 -h 127.0.0.1 -U cloud_admin postgres +> psql -p 55432 -h 127.0.0.1 -U cloud_admin postgres postgres=# CREATE TABLE t(key int primary key, value text); CREATE TABLE postgres=# insert into t values(1,1); @@ -205,7 +205,7 @@ Starting postgres at 'postgresql://cloud_admin@127.0.0.1:55434/postgres' # this new postgres instance will have all the data from 'main' postgres, # but all modifications would not affect data in original postgres -> psql -p55434 -h 127.0.0.1 -U cloud_admin postgres +> psql -p 55434 -h 127.0.0.1 -U cloud_admin postgres postgres=# select * from t; key | value -----+------- @@ -216,7 +216,7 @@ postgres=# insert into t values(2,2); INSERT 0 1 # check that the new change doesn't affect the 'main' postgres -> psql -p55432 -h 127.0.0.1 -U cloud_admin postgres +> psql -p 55432 -h 127.0.0.1 -U cloud_admin postgres postgres=# select * from t; key | value -----+------- @@ -224,12 +224,18 @@ postgres=# select * from t; (1 row) ``` -4. If you want to run tests afterward (see below), you must stop all the running of the pageserver, safekeeper, and postgres instances +4. If you want to run tests afterwards (see below), you must stop all the running pageserver, safekeeper, and postgres instances you have just started. You can terminate them all with one command: ```sh > cargo neon stop ``` +More advanced usages can be found at [Control Plane and Neon Local](./control_plane/README.md). + +#### Handling build failures + +If you encounter errors during setting up the initial tenant, it's best to stop everything (`cargo neon stop`) and remove the `.neon` directory. Then fix the problems, and start the setup again. + ## Running tests Ensure your dependencies are installed as described [here](https://github.com/neondatabase/neon#dependency-installation-notes). 
@@ -243,12 +249,28 @@ CARGO_BUILD_FLAGS="--features=testing" make ``` By default, this runs both debug and release modes, and all supported postgres versions. When -testing locally, it is convenient to run just run one set of permutations, like this: +testing locally, it is convenient to run just one set of permutations, like this: ```sh DEFAULT_PG_VERSION=15 BUILD_TYPE=release ./scripts/pytest ``` +## Flamegraphs + +You may find yourself in need of flamegraphs for software in this repository. +You can use [`flamegraph-rs`](https://github.com/flamegraph-rs/flamegraph) or the original [`flamegraph.pl`](https://github.com/brendangregg/FlameGraph). Your choice! + +>[!IMPORTANT] +> If you're using `lld` or `mold`, you need the `--no-rosegment` linker argument. +> It's a [general thing with Rust / lld / mold](https://crbug.com/919499#c16), not specific to this repository. +> See [this PR for further instructions](https://github.com/neondatabase/neon/pull/6764). + +## Cleanup + +For cleaning up the source tree from build artifacts, run `make clean` in the source directory. + +For removing every artifact from build and configure steps, run `make distclean`, and also consider removing the cargo binaries in the `target` directory, as well as the database in the `.neon` directory. Note that removing the `.neon` directory will remove your database, with all data in it. You have been warned! + ## Documentation [docs](/docs) Contains a top-level overview of all available markdown documentation. 
diff --git a/clippy.toml b/clippy.toml index d788afc84d..5f7dc66152 100644 --- a/clippy.toml +++ b/clippy.toml @@ -3,3 +3,10 @@ disallowed-methods = [ # Allow this for now, to deny it later once we stop using Handle::block_on completely # "tokio::runtime::Handle::block_on", ] + +disallowed-macros = [ + # use std::pin::pin + "futures::pin_mut", + # cannot disallow this, because clippy finds used from tokio macros + #"tokio::pin", +] diff --git a/compute_tools/src/bin/compute_ctl.rs b/compute_tools/src/bin/compute_ctl.rs index a7e10d0aee..117919786e 100644 --- a/compute_tools/src/bin/compute_ctl.rs +++ b/compute_tools/src/bin/compute_ctl.rs @@ -45,7 +45,6 @@ use std::{thread, time::Duration}; use anyhow::{Context, Result}; use chrono::Utc; use clap::Arg; -use nix::sys::signal::{kill, Signal}; use signal_hook::consts::{SIGQUIT, SIGTERM}; use signal_hook::{consts::SIGINT, iterator::Signals}; use tracing::{error, info}; @@ -53,7 +52,9 @@ use url::Url; use compute_api::responses::ComputeStatus; -use compute_tools::compute::{ComputeNode, ComputeState, ParsedSpec, PG_PID, SYNC_SAFEKEEPERS_PID}; +use compute_tools::compute::{ + forward_termination_signal, ComputeNode, ComputeState, ParsedSpec, PG_PID, +}; use compute_tools::configurator::launch_configurator; use compute_tools::extension_server::get_pg_version; use compute_tools::http::api::launch_http_server; @@ -394,6 +395,15 @@ fn main() -> Result<()> { info!("synced safekeepers at lsn {lsn}"); } + let mut state = compute.state.lock().unwrap(); + if state.status == ComputeStatus::TerminationPending { + state.status = ComputeStatus::Terminated; + compute.state_changed.notify_all(); + // we were asked to terminate gracefully, don't exit to avoid restart + delay_exit = true + } + drop(state); + if let Err(err) = compute.check_for_core_dumps() { error!("error while checking for core dumps: {err:?}"); } @@ -523,16 +533,7 @@ fn cli() -> clap::Command { /// wait for termination which would be easy then. 
fn handle_exit_signal(sig: i32) { info!("received {sig} termination signal"); - let ss_pid = SYNC_SAFEKEEPERS_PID.load(Ordering::SeqCst); - if ss_pid != 0 { - let ss_pid = nix::unistd::Pid::from_raw(ss_pid as i32); - kill(ss_pid, Signal::SIGTERM).ok(); - } - let pg_pid = PG_PID.load(Ordering::SeqCst); - if pg_pid != 0 { - let pg_pid = nix::unistd::Pid::from_raw(pg_pid as i32); - kill(pg_pid, Signal::SIGTERM).ok(); - } + forward_termination_signal(); exit(1); } diff --git a/compute_tools/src/compute.rs b/compute_tools/src/compute.rs index 07e0abe6ff..0fa315682d 100644 --- a/compute_tools/src/compute.rs +++ b/compute_tools/src/compute.rs @@ -2,7 +2,7 @@ use std::collections::HashMap; use std::env; use std::fs; use std::io::BufRead; -use std::os::unix::fs::PermissionsExt; +use std::os::unix::fs::{symlink, PermissionsExt}; use std::path::Path; use std::process::{Command, Stdio}; use std::str::FromStr; @@ -17,9 +17,9 @@ use chrono::{DateTime, Utc}; use futures::future::join_all; use futures::stream::FuturesUnordered; use futures::StreamExt; +use nix::unistd::Pid; +use postgres::error::SqlState; use postgres::{Client, NoTls}; -use tokio; -use tokio_postgres; use tracing::{debug, error, info, instrument, warn}; use utils::id::{TenantId, TimelineId}; use utils::lsn::Lsn; @@ -28,6 +28,8 @@ use compute_api::responses::{ComputeMetrics, ComputeStatus}; use compute_api::spec::{ComputeFeature, ComputeMode, ComputeSpec}; use utils::measured_stream::MeasuredReader; +use nix::sys::signal::{kill, Signal}; + use remote_storage::{DownloadError, RemotePath}; use crate::checker::create_availability_check_data; @@ -207,6 +209,7 @@ fn maybe_cgexec(cmd: &str) -> Command { /// Create special neon_superuser role, that's a slightly nerfed version of a real superuser /// that we give to customers +#[instrument(skip_all)] fn create_neon_superuser(spec: &ComputeSpec, client: &mut Client) -> Result<()> { let roles = spec .cluster @@ -319,11 +322,12 @@ impl ComputeNode { // Get basebackup from the 
libpq connection to pageserver using `connstr` and // unarchive it to `pgdata` directory overriding all its previous content. #[instrument(skip_all, fields(%lsn))] - fn get_basebackup(&self, compute_state: &ComputeState, lsn: Lsn) -> Result<()> { + fn try_get_basebackup(&self, compute_state: &ComputeState, lsn: Lsn) -> Result<()> { let spec = compute_state.pspec.as_ref().expect("spec must be set"); let start_time = Instant::now(); - let mut config = postgres::Config::from_str(&spec.pageserver_connstr)?; + let shard0_connstr = spec.pageserver_connstr.split(',').next().unwrap(); + let mut config = postgres::Config::from_str(shard0_connstr)?; // Use the storage auth token from the config file, if given. // Note: this overrides any password set in the connection string. @@ -390,6 +394,34 @@ impl ComputeNode { Ok(()) } + // Gets the basebackup in a retry loop + #[instrument(skip_all, fields(%lsn))] + pub fn get_basebackup(&self, compute_state: &ComputeState, lsn: Lsn) -> Result<()> { + let mut retry_period_ms = 500.0; + let mut attempts = 0; + let max_attempts = 10; + loop { + let result = self.try_get_basebackup(compute_state, lsn); + match result { + Ok(_) => { + return result; + } + Err(ref e) if attempts < max_attempts => { + warn!( + "Failed to get basebackup: {} (attempt {}/{})", + e, attempts, max_attempts + ); + std::thread::sleep(std::time::Duration::from_millis(retry_period_ms as u64)); + retry_period_ms *= 1.5; + } + Err(_) => { + return result; + } + } + attempts += 1; + } + } + pub async fn check_safekeepers_synced_async( &self, compute_state: &ComputeState, @@ -605,6 +637,48 @@ impl ComputeNode { // Update pg_hba.conf received with basebackup. update_pg_hba(pgdata_path)?; + // Place pg_dynshmem under /dev/shm. This allows us to use + // 'dynamic_shared_memory_type = mmap' so that the files are placed in + // /dev/shm, similar to how 'dynamic_shared_memory_type = posix' works. 
+ // + // Why on earth don't we just stick to the 'posix' default, you might + // ask. It turns out that making large allocations with 'posix' doesn't + // work very well with autoscaling. The behavior we want is that: + // + // 1. You can make large DSM allocations, larger than the current RAM + // size of the VM, without errors + // + // 2. If the allocated memory is really used, the VM is scaled up + // automatically to accommodate that + // + // We try to make that possible by having swap in the VM. But with the + // default 'posix' DSM implementation, we fail step 1, even when there's + // plenty of swap available. PostgreSQL uses posix_fallocate() to create + // the shmem segment, which is really just a file in /dev/shm in Linux, + // but posix_fallocate() on tmpfs returns ENOMEM if the size is larger + // than available RAM. + // + // Using 'dynamic_shared_memory_type = mmap' works around that, because + // the Postgres 'mmap' DSM implementation doesn't use + // posix_fallocate(). Instead, it uses repeated calls to write(2) to + // fill the file with zeros. It's weird that that differs between + // 'posix' and 'mmap', but we take advantage of it. When the file is + // filled slowly with write(2), the kernel allows it to grow larger, as + // long as there's swap available. + // + // In short, using 'dynamic_shared_memory_type = mmap' allows one DSM + // segment to be larger than currently available RAM. But because we + // don't want to store it in a real file, which the kernel would try to + // flush to disk, we symlink pg_dynshmem to /dev/shm. + // + // We don't set 'dynamic_shared_memory_type = mmap' here, we let the + // control plane control that option. If 'mmap' is not used, this 
+ // + // See https://github.com/neondatabase/autoscaling/issues/800 + std::fs::remove_dir(pgdata_path.join("pg_dynshmem"))?; + symlink("/dev/shm/", pgdata_path.join("pg_dynshmem"))?; + match spec.mode { ComputeMode::Primary => {} ComputeMode::Replica | ComputeMode::Static(..) => { @@ -649,8 +723,12 @@ impl ComputeNode { // Stop it when it's ready info!("waiting for postgres"); wait_for_postgres(&mut pg, Path::new(pgdata))?; - pg.kill()?; - info!("sent kill signal"); + // SIGQUIT orders postgres to exit immediately. We don't want to SIGKILL + // it to avoid orphaned processes prowling around while datadir is + // wiped. + let pm_pid = Pid::from_raw(pg.id() as i32); + kill(pm_pid, Signal::SIGQUIT)?; + info!("sent SIGQUIT signal"); pg.wait()?; info!("done prewarming"); @@ -691,6 +769,26 @@ impl ComputeNode { Ok((pg, logs_handle)) } + /// Do post configuration of the already started Postgres. This function spawns a background thread to + /// configure the database after applying the compute spec. Currently, it upgrades the neon extension + /// version. In the future, it may upgrade all 3rd-party extensions. + #[instrument(skip_all)] + pub fn post_apply_config(&self) -> Result<()> { + let connstr = self.connstr.clone(); + thread::spawn(move || { + let func = || { + let mut client = Client::connect(connstr.as_str(), NoTls)?; + handle_neon_extension_upgrade(&mut client) + .context("handle_neon_extension_upgrade")?; + Ok::<_, anyhow::Error>(()) + }; + if let Err(err) = func() { + error!("error while post_apply_config: {err:#}"); + } + }); + Ok(()) + } + /// Do initial configuration of the already started Postgres. #[instrument(skip_all)] pub fn apply_config(&self, compute_state: &ComputeState) -> Result<()> { @@ -702,27 +800,34 @@ impl ComputeNode { // but we can create a new one and grant it all privileges. 
let connstr = self.connstr.clone(); let mut client = match Client::connect(connstr.as_str(), NoTls) { - Err(e) => { - info!( - "cannot connect to postgres: {}, retrying with `zenith_admin` username", - e - ); - let mut zenith_admin_connstr = connstr.clone(); + Err(e) => match e.code() { + Some(&SqlState::INVALID_PASSWORD) + | Some(&SqlState::INVALID_AUTHORIZATION_SPECIFICATION) => { + // connect with zenith_admin if cloud_admin could not authenticate + info!( + "cannot connect to postgres: {}, retrying with `zenith_admin` username", + e + ); + let mut zenith_admin_connstr = connstr.clone(); - zenith_admin_connstr - .set_username("zenith_admin") - .map_err(|_| anyhow::anyhow!("invalid connstr"))?; + zenith_admin_connstr + .set_username("zenith_admin") + .map_err(|_| anyhow::anyhow!("invalid connstr"))?; - let mut client = Client::connect(zenith_admin_connstr.as_str(), NoTls)?; - // Disable forwarding so that users don't get a cloud_admin role - client.simple_query("SET neon.forward_ddl = false")?; - client.simple_query("CREATE USER cloud_admin WITH SUPERUSER")?; - client.simple_query("GRANT zenith_admin TO cloud_admin")?; - drop(client); + let mut client = + Client::connect(zenith_admin_connstr.as_str(), NoTls) + .context("broken cloud_admin credential: tried connecting with cloud_admin but could not authenticate, and zenith_admin does not work either")?; + // Disable forwarding so that users don't get a cloud_admin role + client.simple_query("SET neon.forward_ddl = false")?; + client.simple_query("CREATE USER cloud_admin WITH SUPERUSER")?; + client.simple_query("GRANT zenith_admin TO cloud_admin")?; + drop(client); - // reconnect with connstring with expected name - Client::connect(connstr.as_str(), NoTls)? - } + // reconnect with connstring with expected name + Client::connect(connstr.as_str(), NoTls)? 
+ } + _ => return Err(e.into()), + }, Ok(client) => client, }; @@ -736,7 +841,12 @@ impl ComputeNode { handle_roles(spec, &mut client)?; handle_databases(spec, &mut client)?; handle_role_deletions(spec, connstr.as_str(), &mut client)?; - handle_grants(spec, &mut client, connstr.as_str())?; + handle_grants( + spec, + &mut client, + connstr.as_str(), + self.has_feature(ComputeFeature::AnonExtension), + )?; handle_extensions(spec, &mut client)?; handle_extension_neon(&mut client)?; create_availability_check_data(&mut client)?; @@ -744,12 +854,11 @@ impl ComputeNode { // 'Close' connection drop(client); - if self.has_feature(ComputeFeature::Migrations) { - thread::spawn(move || { - let mut client = Client::connect(connstr.as_str(), NoTls)?; - handle_migrations(&mut client) - }); - } + // Run migrations separately to not hold up cold starts + thread::spawn(move || { + let mut client = Client::connect(connstr.as_str(), NoTls)?; + handle_migrations(&mut client) + }); Ok(()) } @@ -811,7 +920,12 @@ impl ComputeNode { handle_roles(&spec, &mut client)?; handle_databases(&spec, &mut client)?; handle_role_deletions(&spec, self.connstr.as_str(), &mut client)?; - handle_grants(&spec, &mut client, self.connstr.as_str())?; + handle_grants( + &spec, + &mut client, + self.connstr.as_str(), + self.has_feature(ComputeFeature::AnonExtension), + )?; handle_extensions(&spec, &mut client)?; handle_extension_neon(&mut client)?; // We can skip handle_migrations here because a new migration can only appear @@ -909,18 +1023,21 @@ impl ComputeNode { let pg_process = self.start_postgres(pspec.storage_auth_token.clone())?; let config_time = Utc::now(); - if pspec.spec.mode == ComputeMode::Primary && !pspec.spec.skip_pg_catalog_updates { - let pgdata_path = Path::new(&self.pgdata); - // temporarily reset max_cluster_size in config - // to avoid the possibility of hitting the limit, while we are applying config: - // creating new extensions, roles, etc... 
- config::compute_ctl_temp_override_create(pgdata_path, "neon.max_cluster_size=-1")?; - self.pg_reload_conf()?; + if pspec.spec.mode == ComputeMode::Primary { + if !pspec.spec.skip_pg_catalog_updates { + let pgdata_path = Path::new(&self.pgdata); + // temporarily reset max_cluster_size in config + // to avoid the possibility of hitting the limit, while we are applying config: + // creating new extensions, roles, etc... + config::compute_ctl_temp_override_create(pgdata_path, "neon.max_cluster_size=-1")?; + self.pg_reload_conf()?; - self.apply_config(&compute_state)?; + self.apply_config(&compute_state)?; - config::compute_ctl_temp_override_remove(pgdata_path)?; - self.pg_reload_conf()?; + config::compute_ctl_temp_override_remove(pgdata_path)?; + self.pg_reload_conf()?; + } + self.post_apply_config()?; } let startup_end_time = Utc::now(); @@ -1241,3 +1358,17 @@ LIMIT 100", Ok(remote_ext_metrics) } } + +pub fn forward_termination_signal() { + let ss_pid = SYNC_SAFEKEEPERS_PID.load(Ordering::SeqCst); + if ss_pid != 0 { + let ss_pid = nix::unistd::Pid::from_raw(ss_pid as i32); + kill(ss_pid, Signal::SIGTERM).ok(); + } + let pg_pid = PG_PID.load(Ordering::SeqCst); + if pg_pid != 0 { + let pg_pid = nix::unistd::Pid::from_raw(pg_pid as i32); + // use 'immediate' shutdown (SIGQUIT): https://www.postgresql.org/docs/current/server-shutdown.html + kill(pg_pid, Signal::SIGQUIT).ok(); + } +} diff --git a/compute_tools/src/config.rs b/compute_tools/src/config.rs index a7ef8cea92..42b8480211 100644 --- a/compute_tools/src/config.rs +++ b/compute_tools/src/config.rs @@ -51,6 +51,9 @@ pub fn write_postgres_conf( if let Some(s) = &spec.pageserver_connstring { writeln!(file, "neon.pageserver_connstring={}", escape_conf_value(s))?; } + if let Some(stripe_size) = spec.shard_stripe_size { + writeln!(file, "neon.stripe_size={stripe_size}")?; + } if !spec.safekeeper_connstrings.is_empty() { writeln!( file, @@ -79,6 +82,12 @@ pub fn write_postgres_conf( ComputeMode::Replica => { // 
hot_standby is 'on' by default, but let's be explicit writeln!(file, "hot_standby=on")?; + + // Inform the replica about the primary state + // Default is 'false' + if let Some(primary_is_running) = spec.primary_is_running { + writeln!(file, "neon.primary_is_running={}", primary_is_running)?; + } } } diff --git a/compute_tools/src/extension_server.rs b/compute_tools/src/extension_server.rs index 2cec12119f..ef1db73982 100644 --- a/compute_tools/src/extension_server.rs +++ b/compute_tools/src/extension_server.rs @@ -71,7 +71,7 @@ More specifically, here is an example ext_index.json } } */ -use anyhow::{self, Result}; +use anyhow::Result; use anyhow::{bail, Context}; use bytes::Bytes; use compute_api::spec::RemoteExtSpec; diff --git a/compute_tools/src/http/api.rs b/compute_tools/src/http/api.rs index fa2c4cff28..128783b477 100644 --- a/compute_tools/src/http/api.rs +++ b/compute_tools/src/http/api.rs @@ -5,6 +5,7 @@ use std::net::SocketAddr; use std::sync::Arc; use std::thread; +use crate::compute::forward_termination_signal; use crate::compute::{ComputeNode, ComputeState, ParsedSpec}; use compute_api::requests::ConfigurationRequest; use compute_api::responses::{ComputeStatus, ComputeStatusResponse, GenericAPIError}; @@ -12,8 +13,6 @@ use compute_api::responses::{ComputeStatus, ComputeStatusResponse, GenericAPIErr use anyhow::Result; use hyper::service::{make_service_fn, service_fn}; use hyper::{Body, Method, Request, Response, Server, StatusCode}; -use num_cpus; -use serde_json; use tokio::task; use tracing::{error, info, warn}; use tracing_utils::http::OtelName; @@ -123,6 +122,17 @@ async fn routes(req: Request, compute: &Arc) -> Response { + info!("serving /terminate POST request"); + match handle_terminate_request(compute).await { + Ok(()) => Response::new(Body::empty()), + Err((msg, code)) => { + error!("error handling /terminate request: {msg}"); + render_json_error(&msg, code) + } + } + } + // download extension files from remote extension storage on demand 
(&Method::POST, route) if route.starts_with("/extension_server/") => { info!("serving {:?} POST request", route); @@ -297,6 +307,49 @@ fn render_json_error(e: &str, status: StatusCode) -> Response { .unwrap() } +async fn handle_terminate_request(compute: &Arc) -> Result<(), (String, StatusCode)> { + { + let mut state = compute.state.lock().unwrap(); + if state.status == ComputeStatus::Terminated { + return Ok(()); + } + if state.status != ComputeStatus::Empty && state.status != ComputeStatus::Running { + let msg = format!( + "invalid compute status for termination request: {:?}", + state.status.clone() + ); + return Err((msg, StatusCode::PRECONDITION_FAILED)); + } + state.status = ComputeStatus::TerminationPending; + compute.state_changed.notify_all(); + drop(state); + } + forward_termination_signal(); + info!("sent signal and notified waiters"); + + // Spawn a blocking thread to wait for compute to become Terminated. + // This is needed to do not block the main pool of workers and + // be able to serve other requests while some particular request + // is waiting for compute to finish configuration. + let c = compute.clone(); + task::spawn_blocking(move || { + let mut state = c.state.lock().unwrap(); + while state.status != ComputeStatus::Terminated { + state = c.state_changed.wait(state).unwrap(); + info!( + "waiting for compute to become Terminated, current status: {:?}", + state.status + ); + } + + Ok(()) + }) + .await + .unwrap()?; + info!("terminated Postgres"); + Ok(()) +} + // Main Hyper HTTP server function that runs it and blocks waiting on it forever. 
#[tokio::main] async fn serve(port: u16, state: Arc) { diff --git a/compute_tools/src/http/openapi_spec.yaml b/compute_tools/src/http/openapi_spec.yaml index cedc6ece8f..d2ec54299f 100644 --- a/compute_tools/src/http/openapi_spec.yaml +++ b/compute_tools/src/http/openapi_spec.yaml @@ -168,6 +168,29 @@ paths: schema: $ref: "#/components/schemas/GenericError" + /terminate: + post: + tags: + - Terminate + summary: Terminate Postgres and wait for it to exit + description: "" + operationId: terminate + responses: + 200: + description: Result + 412: + description: "wrong state" + content: + application/json: + schema: + $ref: "#/components/schemas/GenericError" + 500: + description: "Unexpected error" + content: + application/json: + schema: + $ref: "#/components/schemas/GenericError" + components: securitySchemes: JWT: diff --git a/compute_tools/src/monitor.rs b/compute_tools/src/monitor.rs index f09bd02664..872a3f7750 100644 --- a/compute_tools/src/monitor.rs +++ b/compute_tools/src/monitor.rs @@ -138,6 +138,34 @@ fn watch_compute_activity(compute: &ComputeNode) { } } // + // Don't suspend compute if there is an active logical replication subscription + // + // `where pid is not null` – to filter out read only computes and subscription on branches + // + let logical_subscriptions_query = + "select count(*) from pg_stat_subscription where pid is not null;"; + match cli.query_one(logical_subscriptions_query, &[]) { + Ok(row) => match row.try_get::<&str, i64>("count") { + Ok(num_subscribers) => { + if num_subscribers > 0 { + compute.update_last_active(Some(Utc::now())); + continue; + } + } + Err(e) => { + warn!("failed to parse `pg_stat_subscription` count: {:?}", e); + continue; + } + }, + Err(e) => { + warn!( + "failed to get list of active logical replication subscriptions: {:?}", + e + ); + continue; + } + } + // // Do not suspend compute if autovacuum is running // let autovacuum_count_query = "select count(*) from pg_stat_activity where backend_type = 'autovacuum 
worker'"; diff --git a/compute_tools/src/pg_helpers.rs b/compute_tools/src/pg_helpers.rs index ce704385c6..5deb50d6b7 100644 --- a/compute_tools/src/pg_helpers.rs +++ b/compute_tools/src/pg_helpers.rs @@ -264,9 +264,10 @@ pub fn wait_for_postgres(pg: &mut Child, pgdata: &Path) -> Result<()> { // case we miss some events for some reason. Not strictly necessary, but // better safe than sorry. let (tx, rx) = std::sync::mpsc::channel(); - let (mut watcher, rx): (Box, _) = match notify::recommended_watcher(move |res| { + let watcher_res = notify::recommended_watcher(move |res| { let _ = tx.send(res); - }) { + }); + let (mut watcher, rx): (Box, _) = match watcher_res { Ok(watcher) => (Box::new(watcher), rx), Err(e) => { match e.kind { diff --git a/compute_tools/src/spec.rs b/compute_tools/src/spec.rs index e87dc0b732..ba3a84cda8 100644 --- a/compute_tools/src/spec.rs +++ b/compute_tools/src/spec.rs @@ -302,9 +302,9 @@ pub fn handle_roles(spec: &ComputeSpec, client: &mut Client) -> Result<()> { RoleAction::Create => { // This branch only runs when roles are created through the console, so it is // safe to add more permissions here. BYPASSRLS and REPLICATION are inherited - // from neon_superuser. + // from neon_superuser. (NOTE: REPLICATION has been removed from here for now). let mut query: String = format!( - "CREATE ROLE {} INHERIT CREATEROLE CREATEDB BYPASSRLS REPLICATION IN ROLE neon_superuser", + "CREATE ROLE {} INHERIT CREATEROLE CREATEDB BYPASSRLS IN ROLE neon_superuser", name.pg_quote() ); info!("running role create query: '{}'", &query); @@ -581,7 +581,12 @@ pub fn handle_databases(spec: &ComputeSpec, client: &mut Client) -> Result<()> { /// Grant CREATE ON DATABASE to the database owner and do some other alters and grants /// to allow users creating trusted extensions and re-creating `public` schema, for example. 
#[instrument(skip_all)] -pub fn handle_grants(spec: &ComputeSpec, client: &mut Client, connstr: &str) -> Result<()> { +pub fn handle_grants( + spec: &ComputeSpec, + client: &mut Client, + connstr: &str, + enable_anon_extension: bool, +) -> Result<()> { info!("modifying database permissions"); let existing_dbs = get_existing_dbs(client)?; @@ -650,6 +655,9 @@ pub fn handle_grants(spec: &ComputeSpec, client: &mut Client, connstr: &str) -> // remove this code if possible. The worst thing that could happen is that // user won't be able to use public schema in NEW databases created in the // very OLD project. + // + // Also, alter default permissions so that relations created by extensions can be + // used by neon_superuser without permission issues. let grant_query = "DO $$\n\ BEGIN\n\ IF EXISTS(\n\ @@ -668,6 +676,15 @@ pub fn handle_grants(spec: &ComputeSpec, client: &mut Client, connstr: &str) -> GRANT CREATE ON SCHEMA public TO web_access;\n\ END IF;\n\ END IF;\n\ + IF EXISTS(\n\ + SELECT nspname\n\ + FROM pg_catalog.pg_namespace\n\ + WHERE nspname = 'public'\n\ + )\n\ + THEN\n\ + ALTER DEFAULT PRIVILEGES IN SCHEMA public GRANT ALL ON TABLES TO neon_superuser WITH GRANT OPTION;\n\ + ALTER DEFAULT PRIVILEGES IN SCHEMA public GRANT ALL ON SEQUENCES TO neon_superuser WITH GRANT OPTION;\n\ + END IF;\n\ END\n\ $$;" .to_string(); @@ -678,6 +695,11 @@ pub fn handle_grants(spec: &ComputeSpec, client: &mut Client, connstr: &str) -> inlinify(&grant_query) ); db_client.simple_query(&grant_query)?; + + // it is important to run this after all grants + if enable_anon_extension { + handle_extension_anon(spec, &db.owner, &mut db_client, false)?; + } } Ok(()) @@ -722,7 +744,17 @@ pub fn handle_extension_neon(client: &mut Client) -> Result<()> { // - extension was just installed // - extension was already installed and is up to date let query = "ALTER EXTENSION neon UPDATE"; - info!("update neon extension schema with query: {}", query); + info!("update neon extension version with 
query: {}", query); + client.simple_query(query)?; + + Ok(()) +} + +#[instrument(skip_all)] +pub fn handle_neon_extension_upgrade(client: &mut Client) -> Result<()> { + info!("handle neon extension upgrade"); + let query = "ALTER EXTENSION neon UPDATE"; + info!("update neon extension version with query: {}", query); client.simple_query(query)?; Ok(()) @@ -758,6 +790,33 @@ BEGIN END LOOP; END $$; "#, + r#" +DO $$ +BEGIN + IF (SELECT setting::numeric >= 160000 FROM pg_settings WHERE name = 'server_version_num') THEN + EXECUTE 'GRANT pg_create_subscription TO neon_superuser'; + END IF; +END +$$;"#, + "GRANT pg_monitor TO neon_superuser WITH ADMIN OPTION", + // Don't remove: these are some SQLs that we originally applied in migrations but turned out to execute somewhere else. + "", + "", + "", + "", + // Add new migrations below. + r#" +DO $$ +DECLARE + role_name TEXT; +BEGIN + FOR role_name IN SELECT rolname FROM pg_roles WHERE rolreplication IS TRUE + LOOP + RAISE NOTICE 'EXECUTING ALTER ROLE % NOREPLICATION', quote_ident(role_name); + EXECUTE 'ALTER ROLE ' || quote_ident(role_name) || ' NOREPLICATION'; + END LOOP; +END +$$;"#, ]; let mut query = "CREATE SCHEMA IF NOT EXISTS neon_migration"; @@ -784,8 +843,13 @@ END $$; client.simple_query(query)?; while current_migration < migrations.len() { - info!("Running migration:\n{}\n", migrations[current_migration]); - client.simple_query(migrations[current_migration])?; + let migration = &migrations[current_migration]; + if migration.is_empty() { + info!("Skip migration id={}", current_migration); + } else { + info!("Running migration:\n{}\n", migration); + client.simple_query(migration)?; + } current_migration += 1; } let setval = format!( @@ -801,5 +865,125 @@ END $$; "Ran {} migrations", (migrations.len() - starting_migration_id) ); + + Ok(()) +} + +/// Connect to the database as superuser and pre-create anon extension +/// if it is present in shared_preload_libraries +#[instrument(skip_all)] +pub fn 
handle_extension_anon( + spec: &ComputeSpec, + db_owner: &str, + db_client: &mut Client, + grants_only: bool, +) -> Result<()> { + info!("handle extension anon"); + + if let Some(libs) = spec.cluster.settings.find("shared_preload_libraries") { + if libs.contains("anon") { + if !grants_only { + // check if extension is already initialized using anon.is_initialized() + let query = "SELECT anon.is_initialized()"; + match db_client.query(query, &[]) { + Ok(rows) => { + if !rows.is_empty() { + let is_initialized: bool = rows[0].get(0); + if is_initialized { + info!("anon extension is already initialized"); + return Ok(()); + } + } + } + Err(e) => { + warn!( + "anon extension is_installed check failed with expected error: {}", + e + ); + } + }; + + // Create anon extension if this compute needs it + // Users cannot create it themselves, because superuser is required. + let mut query = "CREATE EXTENSION IF NOT EXISTS anon CASCADE"; + info!("creating anon extension with query: {}", query); + match db_client.query(query, &[]) { + Ok(_) => {} + Err(e) => { + error!("anon extension creation failed with error: {}", e); + return Ok(()); + } + } + + // check that extension is installed + query = "SELECT extname FROM pg_extension WHERE extname = 'anon'"; + let rows = db_client.query(query, &[])?; + if rows.is_empty() { + error!("anon extension is not installed"); + return Ok(()); + } + + // Initialize anon extension + // This also requires superuser privileges, so users cannot do it themselves. 
+ query = "SELECT anon.init()"; + match db_client.query(query, &[]) { + Ok(_) => {} + Err(e) => { + error!("anon.init() failed with error: {}", e); + return Ok(()); + } + } + } + + // check that extension is installed, if not bail early + let query = "SELECT extname FROM pg_extension WHERE extname = 'anon'"; + match db_client.query(query, &[]) { + Ok(rows) => { + if rows.is_empty() { + error!("anon extension is not installed"); + return Ok(()); + } + } + Err(e) => { + error!("anon extension check failed with error: {}", e); + return Ok(()); + } + }; + + let query = format!("GRANT ALL ON SCHEMA anon TO {}", db_owner); + info!("granting anon extension permissions with query: {}", query); + db_client.simple_query(&query)?; + + // Grant permissions to db_owner to use anon extension functions + let query = format!("GRANT ALL ON ALL FUNCTIONS IN SCHEMA anon TO {}", db_owner); + info!("granting anon extension permissions with query: {}", query); + db_client.simple_query(&query)?; + + // This is needed, because some functions are defined as SECURITY DEFINER. + // In Postgres SECURITY DEFINER functions are executed with the privileges + // of the owner. + // In anon extension this it is needed to access some GUCs, which are only accessible to + // superuser. But we've patched postgres to allow db_owner to access them as well. + // So we need to change owner of these functions to db_owner. 
+ let query = format!(" + SELECT 'ALTER FUNCTION '||nsp.nspname||'.'||p.proname||'('||pg_get_function_identity_arguments(p.oid)||') OWNER TO {};' + from pg_proc p + join pg_namespace nsp ON p.pronamespace = nsp.oid + where nsp.nspname = 'anon';", db_owner); + + info!("change anon extension functions owner to db owner"); + db_client.simple_query(&query)?; + + // affects views as well + let query = format!("GRANT ALL ON ALL TABLES IN SCHEMA anon TO {}", db_owner); + info!("granting anon extension permissions with query: {}", query); + db_client.simple_query(&query)?; + + let query = format!("GRANT ALL ON ALL SEQUENCES IN SCHEMA anon TO {}", db_owner); + info!("granting anon extension permissions with query: {}", query); + db_client.simple_query(&query)?; + } + } + Ok(()) } diff --git a/control_plane/README.md b/control_plane/README.md new file mode 100644 index 0000000000..827aba5c1f --- /dev/null +++ b/control_plane/README.md @@ -0,0 +1,26 @@ +# Control Plane and Neon Local + +This crate contains tools to start a Neon development environment locally. This utility can be used with the `cargo neon` command. + +## Example: Start with Postgres 16 + +To create and start a local development environment with Postgres 16, you will need to provide `--pg-version` flag to 3 of the start-up commands. + +```shell +cargo neon init --pg-version 16 +cargo neon start +cargo neon tenant create --set-default --pg-version 16 +cargo neon endpoint create main --pg-version 16 +cargo neon endpoint start main +``` + +## Example: Create Test User and Database + +By default, `cargo neon` starts an endpoint with `cloud_admin` and `postgres` database. If you want to have a role and a database similar to what we have on the cloud service, you can do it with the following commands when starting an endpoint. 
+ +```shell +cargo neon endpoint create main --pg-version 16 --update-catalog true +cargo neon endpoint start main --create-test-user true +``` + +The first command creates `neon_superuser` and necessary roles. The second command creates `test` user and `neondb` database. You will see a connection string that connects you to the test user after running the second command. diff --git a/control_plane/attachment_service/Cargo.toml b/control_plane/attachment_service/Cargo.toml index 743dd806c4..a5fad7216c 100644 --- a/control_plane/attachment_service/Cargo.toml +++ b/control_plane/attachment_service/Cargo.toml @@ -4,17 +4,30 @@ version = "0.1.0" edition.workspace = true license.workspace = true +[[bin]] +name = "storage_controller" +path = "src/main.rs" + +[features] +default = [] +# Enables test-only APIs and behaviors +testing = [] + [dependencies] anyhow.workspace = true +aws-config.workspace = true +aws-sdk-secretsmanager.workspace = true camino.workspace = true clap.workspace = true futures.workspace = true git-version.workspace = true hyper.workspace = true +humantime.workspace = true +once_cell.workspace = true pageserver_api.workspace = true pageserver_client.workspace = true postgres_connection.workspace = true -scopeguard.workspace = true +reqwest.workspace = true serde.workspace = true serde_json.workspace = true thiserror.workspace = true @@ -22,9 +35,9 @@ tokio.workspace = true tokio-util.workspace = true tracing.workspace = true -# TODO: remove this after DB persistence is added, it is only used for -# a parsing function when loading pageservers from neon_local LocalEnv -postgres_backend.workspace = true +diesel = { version = "2.1.4", features = ["serde_json", "postgres", "r2d2"] } +diesel_migrations = { version = "2.1.0" } +r2d2 = { version = "0.8.10" } utils = { path = "../../libs/utils/" } metrics = { path = "../../libs/metrics/" } diff --git a/control_plane/attachment_service/migrations/.keep b/control_plane/attachment_service/migrations/.keep new 
file mode 100644 index 0000000000..e69de29bb2 diff --git a/control_plane/attachment_service/migrations/00000000000000_diesel_initial_setup/down.sql b/control_plane/attachment_service/migrations/00000000000000_diesel_initial_setup/down.sql new file mode 100644 index 0000000000..a9f5260911 --- /dev/null +++ b/control_plane/attachment_service/migrations/00000000000000_diesel_initial_setup/down.sql @@ -0,0 +1,6 @@ +-- This file was automatically created by Diesel to setup helper functions +-- and other internal bookkeeping. This file is safe to edit, any future +-- changes will be added to existing projects as new migrations. + +DROP FUNCTION IF EXISTS diesel_manage_updated_at(_tbl regclass); +DROP FUNCTION IF EXISTS diesel_set_updated_at(); diff --git a/control_plane/attachment_service/migrations/00000000000000_diesel_initial_setup/up.sql b/control_plane/attachment_service/migrations/00000000000000_diesel_initial_setup/up.sql new file mode 100644 index 0000000000..d68895b1a7 --- /dev/null +++ b/control_plane/attachment_service/migrations/00000000000000_diesel_initial_setup/up.sql @@ -0,0 +1,36 @@ +-- This file was automatically created by Diesel to setup helper functions +-- and other internal bookkeeping. This file is safe to edit, any future +-- changes will be added to existing projects as new migrations. 
+ + + + +-- Sets up a trigger for the given table to automatically set a column called +-- `updated_at` whenever the row is modified (unless `updated_at` was included +-- in the modified columns) +-- +-- # Example +-- +-- ```sql +-- CREATE TABLE users (id SERIAL PRIMARY KEY, updated_at TIMESTAMP NOT NULL DEFAULT NOW()); +-- +-- SELECT diesel_manage_updated_at('users'); +-- ``` +CREATE OR REPLACE FUNCTION diesel_manage_updated_at(_tbl regclass) RETURNS VOID AS $$ +BEGIN + EXECUTE format('CREATE TRIGGER set_updated_at BEFORE UPDATE ON %s + FOR EACH ROW EXECUTE PROCEDURE diesel_set_updated_at()', _tbl); +END; +$$ LANGUAGE plpgsql; + +CREATE OR REPLACE FUNCTION diesel_set_updated_at() RETURNS trigger AS $$ +BEGIN + IF ( + NEW IS DISTINCT FROM OLD AND + NEW.updated_at IS NOT DISTINCT FROM OLD.updated_at + ) THEN + NEW.updated_at := current_timestamp; + END IF; + RETURN NEW; +END; +$$ LANGUAGE plpgsql; diff --git a/control_plane/attachment_service/migrations/2024-01-07-211257_create_tenant_shards/down.sql b/control_plane/attachment_service/migrations/2024-01-07-211257_create_tenant_shards/down.sql new file mode 100644 index 0000000000..b875b91c00 --- /dev/null +++ b/control_plane/attachment_service/migrations/2024-01-07-211257_create_tenant_shards/down.sql @@ -0,0 +1 @@ +DROP TABLE tenant_shards; diff --git a/control_plane/attachment_service/migrations/2024-01-07-211257_create_tenant_shards/up.sql b/control_plane/attachment_service/migrations/2024-01-07-211257_create_tenant_shards/up.sql new file mode 100644 index 0000000000..2ffdae6287 --- /dev/null +++ b/control_plane/attachment_service/migrations/2024-01-07-211257_create_tenant_shards/up.sql @@ -0,0 +1,13 @@ +CREATE TABLE tenant_shards ( + tenant_id VARCHAR NOT NULL, + shard_number INTEGER NOT NULL, + shard_count INTEGER NOT NULL, + PRIMARY KEY(tenant_id, shard_number, shard_count), + shard_stripe_size INTEGER NOT NULL, + generation INTEGER NOT NULL, + generation_pageserver BIGINT NOT NULL, + placement_policy VARCHAR 
NOT NULL, + splitting SMALLINT NOT NULL, + -- config is JSON encoded, opaque to the database. + config TEXT NOT NULL +); \ No newline at end of file diff --git a/control_plane/attachment_service/migrations/2024-01-07-212945_create_nodes/down.sql b/control_plane/attachment_service/migrations/2024-01-07-212945_create_nodes/down.sql new file mode 100644 index 0000000000..ec303bc8cf --- /dev/null +++ b/control_plane/attachment_service/migrations/2024-01-07-212945_create_nodes/down.sql @@ -0,0 +1 @@ +DROP TABLE nodes; diff --git a/control_plane/attachment_service/migrations/2024-01-07-212945_create_nodes/up.sql b/control_plane/attachment_service/migrations/2024-01-07-212945_create_nodes/up.sql new file mode 100644 index 0000000000..9be0880fa4 --- /dev/null +++ b/control_plane/attachment_service/migrations/2024-01-07-212945_create_nodes/up.sql @@ -0,0 +1,10 @@ +CREATE TABLE nodes ( + node_id BIGINT PRIMARY KEY NOT NULL, + + scheduling_policy VARCHAR NOT NULL, + + listen_http_addr VARCHAR NOT NULL, + listen_http_port INTEGER NOT NULL, + listen_pg_addr VARCHAR NOT NULL, + listen_pg_port INTEGER NOT NULL +); \ No newline at end of file diff --git a/control_plane/attachment_service/migrations/2024-02-29-094122_generations_null/down.sql b/control_plane/attachment_service/migrations/2024-02-29-094122_generations_null/down.sql new file mode 100644 index 0000000000..503231f69d --- /dev/null +++ b/control_plane/attachment_service/migrations/2024-02-29-094122_generations_null/down.sql @@ -0,0 +1,2 @@ +ALTER TABLE tenant_shards ALTER generation SET NOT NULL; +ALTER TABLE tenant_shards ALTER generation_pageserver SET NOT NULL; diff --git a/control_plane/attachment_service/migrations/2024-02-29-094122_generations_null/up.sql b/control_plane/attachment_service/migrations/2024-02-29-094122_generations_null/up.sql new file mode 100644 index 0000000000..7e1e3cfe90 --- /dev/null +++ b/control_plane/attachment_service/migrations/2024-02-29-094122_generations_null/up.sql @@ -0,0 +1,4 @@ + + 
+ALTER TABLE tenant_shards ALTER generation DROP NOT NULL; +ALTER TABLE tenant_shards ALTER generation_pageserver DROP NOT NULL; \ No newline at end of file diff --git a/control_plane/attachment_service/src/auth.rs b/control_plane/attachment_service/src/auth.rs new file mode 100644 index 0000000000..ef47abf8c7 --- /dev/null +++ b/control_plane/attachment_service/src/auth.rs @@ -0,0 +1,9 @@ +use utils::auth::{AuthError, Claims, Scope}; + +pub fn check_permission(claims: &Claims, required_scope: Scope) -> Result<(), AuthError> { + if claims.scope != required_scope { + return Err(AuthError("Scope mismatch. Permission denied".into())); + } + + Ok(()) +} diff --git a/control_plane/attachment_service/src/compute_hook.rs b/control_plane/attachment_service/src/compute_hook.rs index 02617cd065..bebc62ac2f 100644 --- a/control_plane/attachment_service/src/compute_hook.rs +++ b/control_plane/attachment_service/src/compute_hook.rs @@ -1,72 +1,164 @@ -use std::collections::HashMap; +use std::{collections::HashMap, time::Duration}; -use control_plane::endpoint::ComputeControlPlane; +use control_plane::endpoint::{ComputeControlPlane, EndpointStatus}; use control_plane::local_env::LocalEnv; -use pageserver_api::shard::{ShardCount, ShardIndex, TenantShardId}; +use hyper::{Method, StatusCode}; +use pageserver_api::shard::{ShardCount, ShardNumber, ShardStripeSize, TenantShardId}; use postgres_connection::parse_host_port; -use utils::id::{NodeId, TenantId}; +use serde::{Deserialize, Serialize}; +use tokio_util::sync::CancellationToken; +use utils::{ + backoff::{self}, + id::{NodeId, TenantId}, +}; -pub(super) struct ComputeHookTenant { - shards: Vec<(ShardIndex, NodeId)>, +use crate::service::Config; + +const BUSY_DELAY: Duration = Duration::from_secs(1); +const SLOWDOWN_DELAY: Duration = Duration::from_secs(5); + +pub(crate) const API_CONCURRENCY: usize = 32; + +struct ShardedComputeHookTenant { + stripe_size: ShardStripeSize, + shard_count: ShardCount, + shards: Vec<(ShardNumber, 
NodeId)>, +} + +enum ComputeHookTenant { + Unsharded(NodeId), + Sharded(ShardedComputeHookTenant), } impl ComputeHookTenant { - pub(super) async fn maybe_reconfigure(&mut self, tenant_id: TenantId) -> anyhow::Result<()> { - // Find the highest shard count and drop any shards that aren't - // for that shard count. - let shard_count = self.shards.iter().map(|(k, _v)| k.shard_count).max(); - let Some(shard_count) = shard_count else { - // No shards, nothing to do. - tracing::info!("ComputeHookTenant::maybe_reconfigure: no shards"); - return Ok(()); - }; + /// Construct with at least one shard's information + fn new(tenant_shard_id: TenantShardId, stripe_size: ShardStripeSize, node_id: NodeId) -> Self { + if tenant_shard_id.shard_count.count() > 1 { + Self::Sharded(ShardedComputeHookTenant { + shards: vec![(tenant_shard_id.shard_number, node_id)], + stripe_size, + shard_count: tenant_shard_id.shard_count, + }) + } else { + Self::Unsharded(node_id) + } + } - self.shards.retain(|(k, _v)| k.shard_count == shard_count); - self.shards - .sort_by_key(|(shard, _node_id)| shard.shard_number); - - if self.shards.len() == shard_count.0 as usize || shard_count == ShardCount(0) { - // We have pageservers for all the shards: proceed to reconfigure compute - let env = match LocalEnv::load_config() { - Ok(e) => e, - Err(e) => { - tracing::warn!( - "Couldn't load neon_local config, skipping compute update ({e})" - ); - return Ok(()); - } - }; - let cplane = ComputeControlPlane::load(env.clone()) - .expect("Error loading compute control plane"); - - let compute_pageservers = self - .shards - .iter() - .map(|(_shard, node_id)| { - let ps_conf = env - .get_pageserver_conf(*node_id) - .expect("Unknown pageserver"); - let (pg_host, pg_port) = parse_host_port(&ps_conf.listen_pg_addr) - .expect("Unable to parse listen_pg_addr"); - (pg_host, pg_port.unwrap_or(5432)) - }) - .collect::>(); - - for (endpoint_name, endpoint) in &cplane.endpoints { - if endpoint.tenant_id == tenant_id && 
endpoint.status() == "running" { - tracing::info!("🔁 Reconfiguring endpoint {}", endpoint_name,); - endpoint.reconfigure(compute_pageservers.clone()).await?; + /// Set one shard's location. If stripe size or shard count have changed, Self is reset + /// and drops existing content. + fn update( + &mut self, + tenant_shard_id: TenantShardId, + stripe_size: ShardStripeSize, + node_id: NodeId, + ) { + match self { + Self::Unsharded(existing_node_id) if tenant_shard_id.shard_count.count() == 1 => { + *existing_node_id = node_id + } + Self::Sharded(sharded_tenant) + if sharded_tenant.stripe_size == stripe_size + && sharded_tenant.shard_count == tenant_shard_id.shard_count => + { + if let Some(existing) = sharded_tenant + .shards + .iter() + .position(|s| s.0 == tenant_shard_id.shard_number) + { + sharded_tenant.shards.get_mut(existing).unwrap().1 = node_id; + } else { + sharded_tenant + .shards + .push((tenant_shard_id.shard_number, node_id)); + sharded_tenant.shards.sort_by_key(|s| s.0) } } - } else { - tracing::info!( - "ComputeHookTenant::maybe_reconfigure: not enough shards ({}/{})", - self.shards.len(), - shard_count.0 - ); + _ => { + // Shard count changed: reset struct. + *self = Self::new(tenant_shard_id, stripe_size, node_id); + } } + } +} - Ok(()) +#[derive(Serialize, Deserialize, Debug)] +struct ComputeHookNotifyRequestShard { + node_id: NodeId, + shard_number: ShardNumber, +} + +/// Request body that we send to the control plane to notify it of where a tenant is attached +#[derive(Serialize, Deserialize, Debug)] +struct ComputeHookNotifyRequest { + tenant_id: TenantId, + stripe_size: Option, + shards: Vec, +} + +/// Error type for attempts to call into the control plane compute notification hook +#[derive(thiserror::Error, Debug)] +pub(crate) enum NotifyError { + // Request was not send successfully, e.g. 
transport error + #[error("Sending request: {0}")] + Request(#[from] reqwest::Error), + // Request could not be serviced right now due to ongoing Operation in control plane, but should be possible soon. + #[error("Control plane tenant busy")] + Busy, + // Explicit 429 response asking us to retry less frequently + #[error("Control plane overloaded")] + SlowDown, + // A 503 response indicates the control plane can't handle the request right now + #[error("Control plane unavailable (status {0})")] + Unavailable(StatusCode), + // API returned unexpected non-success status. We will retry, but log a warning. + #[error("Control plane returned unexpected status {0}")] + Unexpected(StatusCode), + // We shutdown while sending + #[error("Shutting down")] + ShuttingDown, + // A response indicates we will never succeed, such as 400 or 404 + #[error("Non-retryable error {0}")] + Fatal(StatusCode), +} + +impl ComputeHookTenant { + fn maybe_reconfigure(&self, tenant_id: TenantId) -> Option { + match self { + Self::Unsharded(node_id) => Some(ComputeHookNotifyRequest { + tenant_id, + shards: vec![ComputeHookNotifyRequestShard { + shard_number: ShardNumber(0), + node_id: *node_id, + }], + stripe_size: None, + }), + Self::Sharded(sharded_tenant) + if sharded_tenant.shards.len() == sharded_tenant.shard_count.count() as usize => + { + Some(ComputeHookNotifyRequest { + tenant_id, + shards: sharded_tenant + .shards + .iter() + .map(|(shard_number, node_id)| ComputeHookNotifyRequestShard { + shard_number: *shard_number, + node_id: *node_id, + }) + .collect(), + stripe_size: Some(sharded_tenant.stripe_size), + }) + } + Self::Sharded(sharded_tenant) => { + // Sharded tenant doesn't yet have information for all its shards + + tracing::info!( + "ComputeHookTenant::maybe_reconfigure: not enough shards ({}/{})", + sharded_tenant.shards.len(), + sharded_tenant.shard_count.count() + ); + None + } + } } } @@ -74,43 +166,297 @@ impl ComputeHookTenant { /// mapping. 
It aggregates updates for the shards in a tenant, and when appropriate reconfigures /// the compute connection string. pub(super) struct ComputeHook { + config: Config, state: tokio::sync::Mutex>, + authorization_header: Option, } impl ComputeHook { - pub(super) fn new() -> Self { + pub(super) fn new(config: Config) -> Self { + let authorization_header = config + .control_plane_jwt_token + .clone() + .map(|jwt| format!("Bearer {}", jwt)); + Self { state: Default::default(), + config, + authorization_header, } } + /// For test environments: use neon_local's LocalEnv to update compute + async fn do_notify_local( + &self, + reconfigure_request: ComputeHookNotifyRequest, + ) -> anyhow::Result<()> { + let env = match LocalEnv::load_config() { + Ok(e) => e, + Err(e) => { + tracing::warn!("Couldn't load neon_local config, skipping compute update ({e})"); + return Ok(()); + } + }; + let cplane = + ComputeControlPlane::load(env.clone()).expect("Error loading compute control plane"); + let ComputeHookNotifyRequest { + tenant_id, + shards, + stripe_size, + } = reconfigure_request; + + let compute_pageservers = shards + .into_iter() + .map(|shard| { + let ps_conf = env + .get_pageserver_conf(shard.node_id) + .expect("Unknown pageserver"); + let (pg_host, pg_port) = parse_host_port(&ps_conf.listen_pg_addr) + .expect("Unable to parse listen_pg_addr"); + (pg_host, pg_port.unwrap_or(5432)) + }) + .collect::>(); + + for (endpoint_name, endpoint) in &cplane.endpoints { + if endpoint.tenant_id == tenant_id && endpoint.status() == EndpointStatus::Running { + tracing::info!("Reconfiguring endpoint {}", endpoint_name,); + endpoint + .reconfigure(compute_pageservers.clone(), stripe_size) + .await?; + } + } + + Ok(()) + } + + async fn do_notify_iteration( + &self, + client: &reqwest::Client, + url: &String, + reconfigure_request: &ComputeHookNotifyRequest, + cancel: &CancellationToken, + ) -> Result<(), NotifyError> { + let req = client.request(Method::PUT, url); + let req = if let 
Some(value) = &self.authorization_header { + req.header(reqwest::header::AUTHORIZATION, value) + } else { + req + }; + + tracing::info!( + "Sending notify request to {} ({:?})", + url, + reconfigure_request + ); + let send_result = req.json(&reconfigure_request).send().await; + let response = match send_result { + Ok(r) => r, + Err(e) => return Err(e.into()), + }; + + // Treat all 2xx responses as success + if response.status() >= StatusCode::OK && response.status() < StatusCode::MULTIPLE_CHOICES { + if response.status() != StatusCode::OK { + // Non-200 2xx response: it doesn't make sense to retry, but this is unexpected, so + // log a warning. + tracing::warn!( + "Unexpected 2xx response code {} from control plane", + response.status() + ); + } + + return Ok(()); + } + + // Error response codes + match response.status() { + StatusCode::TOO_MANY_REQUESTS => { + // TODO: 429 handling should be global: set some state visible to other requests + // so that they will delay before starting, rather than all notifications trying + // once before backing off. 
+ tokio::time::timeout(SLOWDOWN_DELAY, cancel.cancelled()) + .await + .ok(); + Err(NotifyError::SlowDown) + } + StatusCode::LOCKED => { + // Delay our retry if busy: the usual fast exponential backoff in backoff::retry + // is not appropriate + tokio::time::timeout(BUSY_DELAY, cancel.cancelled()) + .await + .ok(); + Err(NotifyError::Busy) + } + StatusCode::SERVICE_UNAVAILABLE + | StatusCode::GATEWAY_TIMEOUT + | StatusCode::BAD_GATEWAY => Err(NotifyError::Unavailable(response.status())), + StatusCode::BAD_REQUEST | StatusCode::UNAUTHORIZED | StatusCode::FORBIDDEN => { + Err(NotifyError::Fatal(response.status())) + } + _ => Err(NotifyError::Unexpected(response.status())), + } + } + + async fn do_notify( + &self, + url: &String, + reconfigure_request: ComputeHookNotifyRequest, + cancel: &CancellationToken, + ) -> Result<(), NotifyError> { + let client = reqwest::Client::new(); + backoff::retry( + || self.do_notify_iteration(&client, url, &reconfigure_request, cancel), + |e| matches!(e, NotifyError::Fatal(_) | NotifyError::Unexpected(_)), + 3, + 10, + "Send compute notification", + cancel, + ) + .await + .ok_or_else(|| NotifyError::ShuttingDown) + .and_then(|x| x) + } + + /// Call this to notify the compute (postgres) tier of new pageservers to use + /// for a tenant. notify() is called by each shard individually, and this function + /// will decide whether an update to the tenant is sent. An update is sent on the + /// condition that: + /// - We know a pageserver for every shard. + /// - All the shards have the same shard_count (i.e. we are not mid-split) + /// + /// Cancellation token enables callers to drop out, e.g. if calling from a Reconciler + /// that is cancelled. + /// + /// This function is fallible, including in the case that the control plane is transiently + /// unavailable. A limited number of retries are done internally to efficiently hide short unavailability + /// periods, but we don't retry forever. 
The **caller** is responsible for handling failures and + /// ensuring that they eventually call again to ensure that the compute is eventually notified of + /// the proper pageserver nodes for a tenant. + #[tracing::instrument(skip_all, fields(tenant_id=%tenant_shard_id.tenant_id, shard_id=%tenant_shard_id.shard_slug(), node_id))] pub(super) async fn notify( &self, tenant_shard_id: TenantShardId, node_id: NodeId, - ) -> anyhow::Result<()> { - tracing::info!("ComputeHook::notify: {}->{}", tenant_shard_id, node_id); + stripe_size: ShardStripeSize, + cancel: &CancellationToken, + ) -> Result<(), NotifyError> { let mut locked = self.state.lock().await; - let entry = locked - .entry(tenant_shard_id.tenant_id) - .or_insert_with(|| ComputeHookTenant { shards: Vec::new() }); - let shard_index = ShardIndex { - shard_count: tenant_shard_id.shard_count, - shard_number: tenant_shard_id.shard_number, + use std::collections::hash_map::Entry; + let tenant = match locked.entry(tenant_shard_id.tenant_id) { + Entry::Vacant(e) => e.insert(ComputeHookTenant::new( + tenant_shard_id, + stripe_size, + node_id, + )), + Entry::Occupied(e) => { + let tenant = e.into_mut(); + tenant.update(tenant_shard_id, stripe_size, node_id); + tenant + } }; - let mut set = false; - for (existing_shard, existing_node) in &mut entry.shards { - if *existing_shard == shard_index { - *existing_node = node_id; - set = true; - } - } - if !set { - entry.shards.push((shard_index, node_id)); - } + let reconfigure_request = tenant.maybe_reconfigure(tenant_shard_id.tenant_id); + let Some(reconfigure_request) = reconfigure_request else { + // The tenant doesn't yet have pageservers for all its shards: we won't notify anything + // until it does. 
+ tracing::info!("Tenant isn't yet ready to emit a notification"); + return Ok(()); + }; - entry.maybe_reconfigure(tenant_shard_id.tenant_id).await + if let Some(notify_url) = &self.config.compute_hook_url { + self.do_notify(notify_url, reconfigure_request, cancel) + .await + } else { + self.do_notify_local(reconfigure_request) + .await + .map_err(|e| { + // This path is for testing only, so munge the error into our prod-style error type. + tracing::error!("Local notification hook failed: {e}"); + NotifyError::Fatal(StatusCode::INTERNAL_SERVER_ERROR) + }) + } + } +} + +#[cfg(test)] +pub(crate) mod tests { + use pageserver_api::shard::{ShardCount, ShardNumber}; + use utils::id::TenantId; + + use super::*; + + #[test] + fn tenant_updates() -> anyhow::Result<()> { + let tenant_id = TenantId::generate(); + let mut tenant_state = ComputeHookTenant::new( + TenantShardId { + tenant_id, + shard_count: ShardCount::new(0), + shard_number: ShardNumber(0), + }, + ShardStripeSize(12345), + NodeId(1), + ); + + // An unsharded tenant is always ready to emit a notification + assert!(tenant_state.maybe_reconfigure(tenant_id).is_some()); + assert_eq!( + tenant_state + .maybe_reconfigure(tenant_id) + .unwrap() + .shards + .len(), + 1 + ); + assert!(tenant_state + .maybe_reconfigure(tenant_id) + .unwrap() + .stripe_size + .is_none()); + + // Writing the first shard of a multi-sharded situation (i.e. 
in a split) + // resets the tenant state and puts it in an non-notifying state (need to + // see all shards) + tenant_state.update( + TenantShardId { + tenant_id, + shard_count: ShardCount::new(2), + shard_number: ShardNumber(1), + }, + ShardStripeSize(32768), + NodeId(1), + ); + assert!(tenant_state.maybe_reconfigure(tenant_id).is_none()); + + // Writing the second shard makes it ready to notify + tenant_state.update( + TenantShardId { + tenant_id, + shard_count: ShardCount::new(2), + shard_number: ShardNumber(0), + }, + ShardStripeSize(32768), + NodeId(1), + ); + + assert!(tenant_state.maybe_reconfigure(tenant_id).is_some()); + assert_eq!( + tenant_state + .maybe_reconfigure(tenant_id) + .unwrap() + .shards + .len(), + 2 + ); + assert_eq!( + tenant_state + .maybe_reconfigure(tenant_id) + .unwrap() + .stripe_size, + Some(ShardStripeSize(32768)) + ); + + Ok(()) } } diff --git a/control_plane/attachment_service/src/http.rs b/control_plane/attachment_service/src/http.rs index 30f6dd66ee..27ba5bdb65 100644 --- a/control_plane/attachment_service/src/http.rs +++ b/control_plane/attachment_service/src/http.rs @@ -1,14 +1,19 @@ use crate::reconciler::ReconcileError; -use crate::service::Service; +use crate::service::{Service, STARTUP_RECONCILE_TIMEOUT}; use hyper::{Body, Request, Response}; use hyper::{StatusCode, Uri}; -use pageserver_api::models::{TenantCreateRequest, TimelineCreateRequest}; +use pageserver_api::models::{ + TenantConfigRequest, TenantCreateRequest, TenantLocationConfigRequest, TenantShardSplitRequest, + TenantTimeTravelRequest, TimelineCreateRequest, +}; use pageserver_api::shard::TenantShardId; +use pageserver_client::mgmt_api; use std::sync::Arc; -use utils::auth::SwappableJwtAuth; -use utils::http::endpoint::{auth_middleware, request_span}; -use utils::http::request::parse_request_param; -use utils::id::TenantId; +use std::time::{Duration, Instant}; +use utils::auth::{Scope, SwappableJwtAuth}; +use utils::http::endpoint::{auth_middleware, 
check_permission_with, request_span}; +use utils::http::request::{must_get_query_param, parse_request_param}; +use utils::id::{TenantId, TimelineId}; use utils::{ http::{ @@ -20,12 +25,12 @@ use utils::{ id::NodeId, }; -use pageserver_api::control_api::{ReAttachRequest, ValidateRequest}; - -use control_plane::attachment_service::{ - AttachHookRequest, InspectRequest, NodeConfigureRequest, NodeRegisterRequest, - TenantShardMigrateRequest, +use pageserver_api::controller_api::{ + NodeConfigureRequest, NodeRegisterRequest, TenantShardMigrateRequest, }; +use pageserver_api::upcall_api::{ReAttachRequest, ValidateRequest}; + +use control_plane::storage_controller::{AttachHookRequest, InspectRequest}; /// State available to HTTP request handlers #[derive(Clone)] @@ -37,7 +42,7 @@ pub struct HttpState { impl HttpState { pub fn new(service: Arc, auth: Option>) -> Self { - let allowlist_routes = ["/status"] + let allowlist_routes = ["/status", "/ready", "/metrics"] .iter() .map(|v| v.parse().unwrap()) .collect::>(); @@ -59,21 +64,18 @@ fn get_state(request: &Request) -> &HttpState { /// Pageserver calls into this on startup, to learn which tenants it should attach async fn handle_re_attach(mut req: Request) -> Result, ApiError> { + check_permissions(&req, Scope::GenerationsApi)?; + let reattach_req = json_request::(&mut req).await?; let state = get_state(&req); - json_response( - StatusCode::OK, - state - .service - .re_attach(reattach_req) - .await - .map_err(ApiError::InternalServerError)?, - ) + json_response(StatusCode::OK, state.service.re_attach(reattach_req).await?) 
} /// Pageserver calls into this before doing deletions, to confirm that it still /// holds the latest generation for the tenants with deletions enqueued async fn handle_validate(mut req: Request) -> Result, ApiError> { + check_permissions(&req, Scope::GenerationsApi)?; + let validate_req = json_request::(&mut req).await?; let state = get_state(&req); json_response(StatusCode::OK, state.service.validate(validate_req)) @@ -83,6 +85,8 @@ async fn handle_validate(mut req: Request) -> Result, ApiEr /// (in the real control plane this is unnecessary, because the same program is managing /// generation numbers and doing attachments). async fn handle_attach_hook(mut req: Request) -> Result, ApiError> { + check_permissions(&req, Scope::Admin)?; + let attach_req = json_request::(&mut req).await?; let state = get_state(&req); @@ -97,6 +101,8 @@ async fn handle_attach_hook(mut req: Request) -> Result, Ap } async fn handle_inspect(mut req: Request) -> Result, ApiError> { + check_permissions(&req, Scope::Admin)?; + let inspect_req = json_request::(&mut req).await?; let state = get_state(&req); @@ -104,44 +110,272 @@ async fn handle_inspect(mut req: Request) -> Result, ApiErr json_response(StatusCode::OK, state.service.inspect(inspect_req)) } -async fn handle_tenant_create(mut req: Request) -> Result, ApiError> { +async fn handle_tenant_create( + service: Arc, + mut req: Request, +) -> Result, ApiError> { + check_permissions(&req, Scope::PageServerApi)?; + let create_req = json_request::(&mut req).await?; - let state = get_state(&req); + json_response( - StatusCode::OK, - state.service.tenant_create(create_req).await?, + StatusCode::CREATED, + service.tenant_create(create_req).await?, ) } -async fn handle_tenant_timeline_create(mut req: Request) -> Result, ApiError> { - let tenant_id: TenantId = parse_request_param(&req, "tenant_id")?; - let create_req = json_request::(&mut req).await?; +// For tenant and timeline deletions, which both implement an "initially return 202, then 
404 once +// we're done" semantic, we wrap with a retry loop to expose a simpler API upstream. This avoids +// needing to track a "deleting" state for tenants. +async fn deletion_wrapper(service: Arc, f: F) -> Result, ApiError> +where + R: std::future::Future> + Send + 'static, + F: Fn(Arc) -> R + Send + Sync + 'static, +{ + let started_at = Instant::now(); + // To keep deletion reasonably snappy for small tenants, initially check after 1 second if deletion + // completed. + let mut retry_period = Duration::from_secs(1); + // On subsequent retries, wait longer. + let max_retry_period = Duration::from_secs(5); + // Enable callers with a 30 second request timeout to reliably get a response + let max_wait = Duration::from_secs(25); - let state = get_state(&req); + loop { + let status = f(service.clone()).await?; + match status { + StatusCode::ACCEPTED => { + tracing::info!("Deletion accepted, waiting to try again..."); + tokio::time::sleep(retry_period).await; + retry_period = max_retry_period; + } + StatusCode::NOT_FOUND => { + tracing::info!("Deletion complete"); + return json_response(StatusCode::OK, ()); + } + _ => { + tracing::warn!("Unexpected status {status}"); + return json_response(status, ()); + } + } + + let now = Instant::now(); + if now + retry_period > started_at + max_wait { + tracing::info!("Deletion timed out waiting for 404"); + // REQUEST_TIMEOUT would be more appropriate, but CONFLICT is already part of + // the pageserver's swagger definition for this endpoint, and has the same desired + // effect of causing the control plane to retry later. 
+ return json_response(StatusCode::CONFLICT, ()); + } + } +} + +async fn handle_tenant_location_config( + service: Arc, + mut req: Request, +) -> Result, ApiError> { + let tenant_id: TenantId = parse_request_param(&req, "tenant_id")?; + check_permissions(&req, Scope::PageServerApi)?; + + let config_req = json_request::(&mut req).await?; json_response( StatusCode::OK, - state - .service + service + .tenant_location_config(tenant_id, config_req) + .await?, + ) +} + +async fn handle_tenant_config_set( + service: Arc, + mut req: Request, +) -> Result, ApiError> { + check_permissions(&req, Scope::PageServerApi)?; + + let config_req = json_request::(&mut req).await?; + + json_response(StatusCode::OK, service.tenant_config_set(config_req).await?) +} + +async fn handle_tenant_config_get( + service: Arc, + req: Request, +) -> Result, ApiError> { + let tenant_id: TenantId = parse_request_param(&req, "tenant_id")?; + check_permissions(&req, Scope::PageServerApi)?; + + json_response(StatusCode::OK, service.tenant_config_get(tenant_id)?) 
+} + +async fn handle_tenant_time_travel_remote_storage( + service: Arc, + mut req: Request, +) -> Result, ApiError> { + let tenant_id: TenantId = parse_request_param(&req, "tenant_id")?; + check_permissions(&req, Scope::PageServerApi)?; + + let time_travel_req = json_request::(&mut req).await?; + + let timestamp_raw = must_get_query_param(&req, "travel_to")?; + let _timestamp = humantime::parse_rfc3339(×tamp_raw).map_err(|_e| { + ApiError::BadRequest(anyhow::anyhow!( + "Invalid time for travel_to: {timestamp_raw:?}" + )) + })?; + + let done_if_after_raw = must_get_query_param(&req, "done_if_after")?; + let _done_if_after = humantime::parse_rfc3339(&done_if_after_raw).map_err(|_e| { + ApiError::BadRequest(anyhow::anyhow!( + "Invalid time for done_if_after: {done_if_after_raw:?}" + )) + })?; + + service + .tenant_time_travel_remote_storage( + &time_travel_req, + tenant_id, + timestamp_raw, + done_if_after_raw, + ) + .await?; + json_response(StatusCode::OK, ()) +} + +async fn handle_tenant_secondary_download( + service: Arc, + req: Request, +) -> Result, ApiError> { + let tenant_id: TenantId = parse_request_param(&req, "tenant_id")?; + service.tenant_secondary_download(tenant_id).await?; + json_response(StatusCode::OK, ()) +} + +async fn handle_tenant_delete( + service: Arc, + req: Request, +) -> Result, ApiError> { + let tenant_id: TenantId = parse_request_param(&req, "tenant_id")?; + check_permissions(&req, Scope::PageServerApi)?; + + deletion_wrapper(service, move |service| async move { + service.tenant_delete(tenant_id).await + }) + .await +} + +async fn handle_tenant_timeline_create( + service: Arc, + mut req: Request, +) -> Result, ApiError> { + let tenant_id: TenantId = parse_request_param(&req, "tenant_id")?; + check_permissions(&req, Scope::PageServerApi)?; + + let create_req = json_request::(&mut req).await?; + json_response( + StatusCode::CREATED, + service .tenant_timeline_create(tenant_id, create_req) .await?, ) } -async fn handle_tenant_locate(req: 
Request) -> Result, ApiError> { +async fn handle_tenant_timeline_delete( + service: Arc, + req: Request, +) -> Result, ApiError> { let tenant_id: TenantId = parse_request_param(&req, "tenant_id")?; - let state = get_state(&req); + check_permissions(&req, Scope::PageServerApi)?; - json_response(StatusCode::OK, state.service.tenant_locate(tenant_id)?) + let timeline_id: TimelineId = parse_request_param(&req, "timeline_id")?; + + deletion_wrapper(service, move |service| async move { + service.tenant_timeline_delete(tenant_id, timeline_id).await + }) + .await +} + +async fn handle_tenant_timeline_passthrough( + service: Arc, + req: Request, +) -> Result, ApiError> { + let tenant_id: TenantId = parse_request_param(&req, "tenant_id")?; + check_permissions(&req, Scope::PageServerApi)?; + + let Some(path) = req.uri().path_and_query() else { + // This should never happen, our request router only calls us if there is a path + return Err(ApiError::BadRequest(anyhow::anyhow!("Missing path"))); + }; + + tracing::info!("Proxying request for tenant {} ({})", tenant_id, path); + + // Find the node that holds shard zero + let (base_url, tenant_shard_id) = service.tenant_shard0_baseurl(tenant_id)?; + + // Callers will always pass an unsharded tenant ID. Before proxying, we must + // rewrite this to a shard-aware shard zero ID. + let path = format!("{}", path); + let tenant_str = tenant_id.to_string(); + let tenant_shard_str = format!("{}", tenant_shard_id); + let path = path.replace(&tenant_str, &tenant_shard_str); + + let client = mgmt_api::Client::new(base_url, service.get_config().jwt_token.as_deref()); + let resp = client.get_raw(path).await.map_err(|_e| + // FIXME: give APiError a proper Unavailable variant. We return 503 here because + // if we can't successfully send a request to the pageserver, we aren't available. 
+ ApiError::ShuttingDown)?; + + // We have a reqest::Response, would like a http::Response + let mut builder = hyper::Response::builder() + .status(resp.status()) + .version(resp.version()); + for (k, v) in resp.headers() { + builder = builder.header(k, v); + } + + let response = builder + .body(Body::wrap_stream(resp.bytes_stream())) + .map_err(|e| ApiError::InternalServerError(e.into()))?; + + Ok(response) +} + +async fn handle_tenant_locate( + service: Arc, + req: Request, +) -> Result, ApiError> { + check_permissions(&req, Scope::Admin)?; + + let tenant_id: TenantId = parse_request_param(&req, "tenant_id")?; + json_response(StatusCode::OK, service.tenant_locate(tenant_id)?) } async fn handle_node_register(mut req: Request) -> Result, ApiError> { + check_permissions(&req, Scope::Admin)?; + let register_req = json_request::(&mut req).await?; let state = get_state(&req); state.service.node_register(register_req).await?; json_response(StatusCode::OK, ()) } +async fn handle_node_list(req: Request) -> Result, ApiError> { + check_permissions(&req, Scope::Admin)?; + + let state = get_state(&req); + json_response(StatusCode::OK, state.service.node_list().await?) +} + +async fn handle_node_drop(req: Request) -> Result, ApiError> { + check_permissions(&req, Scope::Admin)?; + + let state = get_state(&req); + let node_id: NodeId = parse_request_param(&req, "node_id")?; + json_response(StatusCode::OK, state.service.node_drop(node_id).await?) +} + async fn handle_node_configure(mut req: Request) -> Result, ApiError> { + check_permissions(&req, Scope::Admin)?; + let node_id: NodeId = parse_request_param(&req, "node_id")?; let config_req = json_request::(&mut req).await?; if node_id != config_req.node_id { @@ -151,33 +385,131 @@ async fn handle_node_configure(mut req: Request) -> Result, } let state = get_state(&req); - json_response(StatusCode::OK, state.service.node_configure(config_req)?) 
-} - -async fn handle_tenant_shard_migrate(mut req: Request) -> Result, ApiError> { - let tenant_shard_id: TenantShardId = parse_request_param(&req, "tenant_shard_id")?; - let migrate_req = json_request::(&mut req).await?; - let state = get_state(&req); json_response( StatusCode::OK, - state - .service + state.service.node_configure(config_req).await?, + ) +} + +async fn handle_tenant_shard_split( + service: Arc, + mut req: Request, +) -> Result, ApiError> { + check_permissions(&req, Scope::Admin)?; + + let tenant_id: TenantId = parse_request_param(&req, "tenant_id")?; + let split_req = json_request::(&mut req).await?; + + json_response( + StatusCode::OK, + service.tenant_shard_split(tenant_id, split_req).await?, + ) +} + +async fn handle_tenant_shard_migrate( + service: Arc, + mut req: Request, +) -> Result, ApiError> { + check_permissions(&req, Scope::Admin)?; + + let tenant_shard_id: TenantShardId = parse_request_param(&req, "tenant_shard_id")?; + let migrate_req = json_request::(&mut req).await?; + json_response( + StatusCode::OK, + service .tenant_shard_migrate(tenant_shard_id, migrate_req) .await?, ) } +async fn handle_tenant_drop(req: Request) -> Result, ApiError> { + let tenant_id: TenantId = parse_request_param(&req, "tenant_id")?; + check_permissions(&req, Scope::PageServerApi)?; + + let state = get_state(&req); + + json_response(StatusCode::OK, state.service.tenant_drop(tenant_id).await?) 
+} + +async fn handle_tenants_dump(req: Request) -> Result, ApiError> { + check_permissions(&req, Scope::Admin)?; + + let state = get_state(&req); + state.service.tenants_dump() +} + +async fn handle_scheduler_dump(req: Request) -> Result, ApiError> { + check_permissions(&req, Scope::Admin)?; + + let state = get_state(&req); + state.service.scheduler_dump() +} + +async fn handle_consistency_check(req: Request) -> Result, ApiError> { + check_permissions(&req, Scope::Admin)?; + + let state = get_state(&req); + + json_response(StatusCode::OK, state.service.consistency_check().await?) +} + /// Status endpoint is just used for checking that our HTTP listener is up async fn handle_status(_req: Request) -> Result, ApiError> { json_response(StatusCode::OK, ()) } +/// Readiness endpoint indicates when we're done doing startup I/O (e.g. reconciling +/// with remote pageserver nodes). This is intended for use as a kubernetes readiness probe. +async fn handle_ready(req: Request) -> Result, ApiError> { + let state = get_state(&req); + if state.service.startup_complete.is_ready() { + json_response(StatusCode::OK, ()) + } else { + json_response(StatusCode::SERVICE_UNAVAILABLE, ()) + } +} + impl From for ApiError { fn from(value: ReconcileError) -> Self { ApiError::Conflict(format!("Reconciliation error: {}", value)) } } +/// Common wrapper for request handlers that call into Service and will operate on tenants: they must only +/// be allowed to run if Service has finished its initial reconciliation. 
+async fn tenant_service_handler(request: Request, handler: H) -> R::Output +where + R: std::future::Future, ApiError>> + Send + 'static, + H: FnOnce(Arc, Request) -> R + Send + Sync + 'static, +{ + let state = get_state(&request); + let service = state.service.clone(); + + let startup_complete = service.startup_complete.clone(); + if tokio::time::timeout(STARTUP_RECONCILE_TIMEOUT, startup_complete.wait()) + .await + .is_err() + { + // This shouldn't happen: it is the responsibilty of [`Service::startup_reconcile`] to use appropriate + // timeouts around its remote calls, to bound its runtime. + return Err(ApiError::Timeout( + "Timed out waiting for service readiness".into(), + )); + } + + request_span( + request, + |request| async move { handler(service, request).await }, + ) + .await +} + +fn check_permissions(request: &Request, required_scope: Scope) -> Result<(), ApiError> { + check_permission_with(request, |claims| { + crate::auth::check_permission(claims, required_scope) + }) +} + pub fn make_router( service: Arc, auth: Option>, @@ -196,23 +528,88 @@ pub fn make_router( router .data(Arc::new(HttpState::new(service, auth))) + // Non-prefixed generic endpoints (status, metrics) .get("/status", |r| request_span(r, handle_status)) - .post("/re-attach", |r| request_span(r, handle_re_attach)) - .post("/validate", |r| request_span(r, handle_validate)) - .post("/attach-hook", |r| request_span(r, handle_attach_hook)) - .post("/inspect", |r| request_span(r, handle_inspect)) - .post("/node", |r| request_span(r, handle_node_register)) - .put("/node/:node_id/config", |r| { + .get("/ready", |r| request_span(r, handle_ready)) + // Upcalls for the pageserver: point the pageserver's `control_plane_api` config to this prefix + .post("/upcall/v1/re-attach", |r| { + request_span(r, handle_re_attach) + }) + .post("/upcall/v1/validate", |r| request_span(r, handle_validate)) + // Test/dev/debug endpoints + .post("/debug/v1/attach-hook", |r| { + request_span(r, handle_attach_hook) + 
}) + .post("/debug/v1/inspect", |r| request_span(r, handle_inspect)) + .post("/debug/v1/tenant/:tenant_id/drop", |r| { + request_span(r, handle_tenant_drop) + }) + .post("/debug/v1/node/:node_id/drop", |r| { + request_span(r, handle_node_drop) + }) + .get("/debug/v1/tenant", |r| request_span(r, handle_tenants_dump)) + .get("/debug/v1/scheduler", |r| { + request_span(r, handle_scheduler_dump) + }) + .post("/debug/v1/consistency_check", |r| { + request_span(r, handle_consistency_check) + }) + .get("/control/v1/tenant/:tenant_id/locate", |r| { + tenant_service_handler(r, handle_tenant_locate) + }) + // Node operations + .post("/control/v1/node", |r| { + request_span(r, handle_node_register) + }) + .get("/control/v1/node", |r| request_span(r, handle_node_list)) + .put("/control/v1/node/:node_id/config", |r| { request_span(r, handle_node_configure) }) - .post("/tenant", |r| request_span(r, handle_tenant_create)) - .post("/tenant/:tenant_id/timeline", |r| { - request_span(r, handle_tenant_timeline_create) + // Tenant Shard operations + .put("/control/v1/tenant/:tenant_shard_id/migrate", |r| { + tenant_service_handler(r, handle_tenant_shard_migrate) }) - .get("/tenant/:tenant_id/locate", |r| { - request_span(r, handle_tenant_locate) + .put("/control/v1/tenant/:tenant_id/shard_split", |r| { + tenant_service_handler(r, handle_tenant_shard_split) }) - .put("/tenant/:tenant_shard_id/migrate", |r| { - request_span(r, handle_tenant_shard_migrate) + // Tenant operations + // The ^/v1/ endpoints act as a "Virtual Pageserver", enabling shard-naive clients to call into + // this service to manage tenants that actually consist of many tenant shards, as if they are a single entity. 
+ .post("/v1/tenant", |r| { + tenant_service_handler(r, handle_tenant_create) + }) + .delete("/v1/tenant/:tenant_id", |r| { + tenant_service_handler(r, handle_tenant_delete) + }) + .put("/v1/tenant/config", |r| { + tenant_service_handler(r, handle_tenant_config_set) + }) + .get("/v1/tenant/:tenant_id/config", |r| { + tenant_service_handler(r, handle_tenant_config_get) + }) + .put("/v1/tenant/:tenant_id/location_config", |r| { + tenant_service_handler(r, handle_tenant_location_config) + }) + .put("/v1/tenant/:tenant_id/time_travel_remote_storage", |r| { + tenant_service_handler(r, handle_tenant_time_travel_remote_storage) + }) + .post("/v1/tenant/:tenant_id/secondary/download", |r| { + tenant_service_handler(r, handle_tenant_secondary_download) + }) + // Timeline operations + .delete("/v1/tenant/:tenant_id/timeline/:timeline_id", |r| { + tenant_service_handler(r, handle_tenant_timeline_delete) + }) + .post("/v1/tenant/:tenant_id/timeline", |r| { + tenant_service_handler(r, handle_tenant_timeline_create) + }) + // Tenant detail GET passthrough to shard zero + .get("/v1/tenant/:tenant_id", |r| { + tenant_service_handler(r, handle_tenant_timeline_passthrough) + }) + // Timeline GET passthrough to shard zero. Note that the `*` in the URL is a wildcard: any future + // timeline GET APIs will be implicitly included. 
+ .get("/v1/tenant/:tenant_id/timeline*", |r| { + tenant_service_handler(r, handle_tenant_timeline_passthrough) }) } diff --git a/control_plane/attachment_service/src/lib.rs b/control_plane/attachment_service/src/lib.rs index e4ca9aa304..796b465c10 100644 --- a/control_plane/attachment_service/src/lib.rs +++ b/control_plane/attachment_service/src/lib.rs @@ -1,27 +1,19 @@ -use serde::{Deserialize, Serialize}; +use serde::Serialize; use utils::seqwait::MonotonicCounter; +mod auth; mod compute_hook; pub mod http; +pub mod metrics; mod node; pub mod persistence; mod reconciler; mod scheduler; +mod schema; pub mod service; mod tenant_state; -#[derive(Clone, Serialize, Deserialize)] -enum PlacementPolicy { - /// Cheapest way to attach a tenant: just one pageserver, no secondary - Single, - /// Production-ready way to attach a tenant: one attached pageserver and - /// some number of secondaries. - Double(usize), - /// Do not attach to any pageservers - Detached, -} - -#[derive(Ord, PartialOrd, Eq, PartialEq, Copy, Clone)] +#[derive(Ord, PartialOrd, Eq, PartialEq, Copy, Clone, Serialize)] struct Sequence(u64); impl Sequence { @@ -36,6 +28,12 @@ impl std::fmt::Display for Sequence { } } +impl std::fmt::Debug for Sequence { + fn fmt(&self, f: &mut std::fmt::Formatter) -> std::fmt::Result { + write!(f, "{}", self.0) + } +} + impl MonotonicCounter for Sequence { fn cnt_advance(&mut self, v: Sequence) { assert!(*self <= v); @@ -51,9 +49,3 @@ impl Sequence { Sequence(self.0 + 1) } } - -impl Default for PlacementPolicy { - fn default() -> Self { - PlacementPolicy::Double(1) - } -} diff --git a/control_plane/attachment_service/src/main.rs b/control_plane/attachment_service/src/main.rs index 38e51b9a9e..333c3911e3 100644 --- a/control_plane/attachment_service/src/main.rs +++ b/control_plane/attachment_service/src/main.rs @@ -1,26 +1,27 @@ -/// The attachment service mimics the aspects of the control plane API -/// that are required for a pageserver to operate. 
-/// -/// This enables running & testing pageservers without a full-blown -/// deployment of the Neon cloud platform. -/// -use anyhow::anyhow; +use anyhow::{anyhow, Context}; use attachment_service::http::make_router; +use attachment_service::metrics::preinitialize_metrics; use attachment_service::persistence::Persistence; use attachment_service::service::{Config, Service}; +use aws_config::{BehaviorVersion, Region}; use camino::Utf8PathBuf; use clap::Parser; +use diesel::Connection; use metrics::launch_timestamp::LaunchTimestamp; use std::sync::Arc; +use tokio::signal::unix::SignalKind; +use tokio_util::sync::CancellationToken; use utils::auth::{JwtAuth, SwappableJwtAuth}; use utils::logging::{self, LogFormat}; -use utils::signals::{ShutdownSignals, Signal}; use utils::{project_build_tag, project_git_version, tcp_listener}; project_git_version!(GIT_VERSION); project_build_tag!(BUILD_TAG); +use diesel_migrations::{embed_migrations, EmbeddedMigrations}; +pub const MIGRATIONS: EmbeddedMigrations = embed_migrations!("./migrations"); + #[derive(Parser)] #[command(author, version, about, long_about = None)] #[command(arg_required_else_help(true))] @@ -29,21 +30,193 @@ struct Cli { #[arg(short, long)] listen: std::net::SocketAddr, - /// Path to public key for JWT authentication of clients + /// Public key for JWT authentication of clients #[arg(long)] - public_key: Option, + public_key: Option, /// Token for authenticating this service with the pageservers it controls - #[arg(short, long)] + #[arg(long)] jwt_token: Option, + /// Token for authenticating this service with the control plane, when calling + /// the compute notification endpoint + #[arg(long)] + control_plane_jwt_token: Option, + + /// URL to control plane compute notification endpoint + #[arg(long)] + compute_hook_url: Option, + /// Path to the .json file to store state (will be created if it doesn't exist) #[arg(short, long)] - path: Utf8PathBuf, + path: Option, + + /// URL to connect to postgres, like 
postgresql://localhost:1234/attachment_service + #[arg(long)] + database_url: Option, } -#[tokio::main] -async fn main() -> anyhow::Result<()> { +/// Secrets may either be provided on the command line (for testing), or loaded from AWS SecretManager: this +/// type encapsulates the logic to decide which and do the loading. +struct Secrets { + database_url: String, + public_key: Option, + jwt_token: Option, + control_plane_jwt_token: Option, +} + +impl Secrets { + const DATABASE_URL_SECRET: &'static str = "rds-neon-storage-controller-url"; + const PAGESERVER_JWT_TOKEN_SECRET: &'static str = + "neon-storage-controller-pageserver-jwt-token"; + const CONTROL_PLANE_JWT_TOKEN_SECRET: &'static str = + "neon-storage-controller-control-plane-jwt-token"; + const PUBLIC_KEY_SECRET: &'static str = "neon-storage-controller-public-key"; + + const DATABASE_URL_ENV: &'static str = "DATABASE_URL"; + const PAGESERVER_JWT_TOKEN_ENV: &'static str = "PAGESERVER_JWT_TOKEN"; + const CONTROL_PLANE_JWT_TOKEN_ENV: &'static str = "CONTROL_PLANE_JWT_TOKEN"; + const PUBLIC_KEY_ENV: &'static str = "PUBLIC_KEY"; + + /// Load secrets from, in order of preference: + /// - CLI args if database URL is provided on the CLI + /// - Environment variables if DATABASE_URL is set. 
+ /// - AWS Secrets Manager secrets + async fn load(args: &Cli) -> anyhow::Result { + match &args.database_url { + Some(url) => Self::load_cli(url, args), + None => match std::env::var(Self::DATABASE_URL_ENV) { + Ok(database_url) => Self::load_env(database_url), + Err(_) => Self::load_aws_sm().await, + }, + } + } + + fn load_env(database_url: String) -> anyhow::Result { + let public_key = match std::env::var(Self::PUBLIC_KEY_ENV) { + Ok(public_key) => Some(JwtAuth::from_key(public_key).context("Loading public key")?), + Err(_) => None, + }; + Ok(Self { + database_url, + public_key, + jwt_token: std::env::var(Self::PAGESERVER_JWT_TOKEN_ENV).ok(), + control_plane_jwt_token: std::env::var(Self::CONTROL_PLANE_JWT_TOKEN_ENV).ok(), + }) + } + + async fn load_aws_sm() -> anyhow::Result { + let Ok(region) = std::env::var("AWS_REGION") else { + anyhow::bail!("AWS_REGION is not set, cannot load secrets automatically: either set this, or use CLI args to supply secrets"); + }; + let config = aws_config::defaults(BehaviorVersion::v2023_11_09()) + .region(Region::new(region.clone())) + .load() + .await; + + let asm = aws_sdk_secretsmanager::Client::new(&config); + + let Some(database_url) = asm + .get_secret_value() + .secret_id(Self::DATABASE_URL_SECRET) + .send() + .await? + .secret_string() + .map(str::to_string) + else { + anyhow::bail!( + "Database URL secret not found at {region}/{}", + Self::DATABASE_URL_SECRET + ) + }; + + let jwt_token = asm + .get_secret_value() + .secret_id(Self::PAGESERVER_JWT_TOKEN_SECRET) + .send() + .await? + .secret_string() + .map(str::to_string); + if jwt_token.is_none() { + tracing::warn!("No pageserver JWT token set: this will only work if authentication is disabled on the pageserver"); + } + + let control_plane_jwt_token = asm + .get_secret_value() + .secret_id(Self::CONTROL_PLANE_JWT_TOKEN_SECRET) + .send() + .await? 
+ .secret_string() + .map(str::to_string); + if control_plane_jwt_token.is_none() { + tracing::warn!("No control plane JWT token set: this will only work if authentication is disabled on the control plane"); + } + + let public_key = asm + .get_secret_value() + .secret_id(Self::PUBLIC_KEY_SECRET) + .send() + .await? + .secret_string() + .map(str::to_string); + let public_key = match public_key { + Some(key) => Some(JwtAuth::from_key(key)?), + None => { + tracing::warn!( + "No public key set: inccoming HTTP requests will not be authenticated" + ); + None + } + }; + + Ok(Self { + database_url, + public_key, + jwt_token, + control_plane_jwt_token, + }) + } + + fn load_cli(database_url: &str, args: &Cli) -> anyhow::Result { + let public_key = match &args.public_key { + None => None, + Some(key) => Some(JwtAuth::from_key(key.clone()).context("Loading public key")?), + }; + Ok(Self { + database_url: database_url.to_owned(), + public_key, + jwt_token: args.jwt_token.clone(), + control_plane_jwt_token: args.control_plane_jwt_token.clone(), + }) + } +} + +/// Execute the diesel migrations that are built into this binary +async fn migration_run(database_url: &str) -> anyhow::Result<()> { + use diesel::PgConnection; + use diesel_migrations::{HarnessWithOutput, MigrationHarness}; + let mut conn = PgConnection::establish(database_url)?; + + HarnessWithOutput::write_to_stdout(&mut conn) + .run_pending_migrations(MIGRATIONS) + .map(|_| ()) + .map_err(|e| anyhow::anyhow!(e))?; + + Ok(()) +} + +fn main() -> anyhow::Result<()> { + tokio::runtime::Builder::new_current_thread() + // We use spawn_blocking for database operations, so require approximately + // as many blocking threads as we will open database connections.
+ .max_blocking_threads(Persistence::MAX_CONNECTIONS as usize) + .enable_all() + .build() + .unwrap() + .block_on(async_main()) +} + +async fn async_main() -> anyhow::Result<()> { let launch_ts = Box::leak(Box::new(LaunchTimestamp::generate())); logging::init( @@ -52,49 +225,88 @@ async fn main() -> anyhow::Result<()> { logging::Output::Stdout, )?; + preinitialize_metrics(); + let args = Cli::parse(); tracing::info!( "version: {}, launch_timestamp: {}, build_tag {}, state at {}, listening on {}", GIT_VERSION, launch_ts.to_string(), BUILD_TAG, - args.path, + args.path.as_ref().unwrap_or(&Utf8PathBuf::from("")), args.listen ); + let secrets = Secrets::load(&args).await?; + let config = Config { - jwt_token: args.jwt_token, + jwt_token: secrets.jwt_token, + control_plane_jwt_token: secrets.control_plane_jwt_token, + compute_hook_url: args.compute_hook_url, }; - let persistence = Arc::new(Persistence::spawn(&args.path).await); + // After loading secrets & config, but before starting anything else, apply database migrations + migration_run(&secrets.database_url) + .await + .context("Running database migrations")?; - let service = Service::spawn(config, persistence).await?; + let json_path = args.path; + let persistence = Arc::new(Persistence::new(secrets.database_url, json_path.clone())); + + let service = Service::spawn(config, persistence.clone()).await?; let http_listener = tcp_listener::bind(args.listen)?; - let auth = if let Some(public_key_path) = &args.public_key { - let jwt_auth = JwtAuth::from_key_path(public_key_path)?; - Some(Arc::new(SwappableJwtAuth::new(jwt_auth))) - } else { - None - }; - let router = make_router(service, auth) + let auth = secrets + .public_key + .map(|jwt_auth| Arc::new(SwappableJwtAuth::new(jwt_auth))); + let router = make_router(service.clone(), auth) .build() .map_err(|err| anyhow!(err))?; - let service = utils::http::RouterService::new(router).unwrap(); - let server = hyper::Server::from_tcp(http_listener)?.serve(service); + let 
router_service = utils::http::RouterService::new(router).unwrap(); + // Start HTTP server + let server_shutdown = CancellationToken::new(); + let server = hyper::Server::from_tcp(http_listener)? + .serve(router_service) + .with_graceful_shutdown({ + let server_shutdown = server_shutdown.clone(); + async move { + server_shutdown.cancelled().await; + } + }); tracing::info!("Serving on {0}", args.listen); + let server_task = tokio::task::spawn(server); - tokio::task::spawn(server); + // Wait until we receive a signal + let mut sigint = tokio::signal::unix::signal(SignalKind::interrupt())?; + let mut sigquit = tokio::signal::unix::signal(SignalKind::quit())?; + let mut sigterm = tokio::signal::unix::signal(SignalKind::terminate())?; + tokio::select! { + _ = sigint.recv() => {}, + _ = sigterm.recv() => {}, + _ = sigquit.recv() => {}, + } + tracing::info!("Terminating on signal"); - ShutdownSignals::handle(|signal| match signal { - Signal::Interrupt | Signal::Terminate | Signal::Quit => { - tracing::info!("Got {}. Terminating", signal.name()); - // We're just a test helper: no graceful shutdown. - std::process::exit(0); + if json_path.is_some() { + // Write out a JSON dump on shutdown: this is used in compat tests to avoid passing + // full postgres dumps around. 
+ if let Err(e) = persistence.write_tenants_json().await { + tracing::error!("Failed to write JSON on shutdown: {e}") } - })?; + } - Ok(()) + // Stop HTTP server first, so that we don't have to service requests + // while shutting down Service + server_shutdown.cancel(); + if let Err(e) = server_task.await { + tracing::error!("Error joining HTTP server task: {e}") + } + tracing::info!("Joined HTTP server task"); + + service.shutdown().await; + tracing::info!("Service shutdown complete"); + + std::process::exit(0); } diff --git a/control_plane/attachment_service/src/metrics.rs b/control_plane/attachment_service/src/metrics.rs new file mode 100644 index 0000000000..ffe093b9c8 --- /dev/null +++ b/control_plane/attachment_service/src/metrics.rs @@ -0,0 +1,32 @@ +use metrics::{register_int_counter, register_int_counter_vec, IntCounter, IntCounterVec}; +use once_cell::sync::Lazy; + +pub(crate) struct ReconcilerMetrics { + pub(crate) spawned: IntCounter, + pub(crate) complete: IntCounterVec, +} + +impl ReconcilerMetrics { + // Labels used on [`Self::complete`] + pub(crate) const SUCCESS: &'static str = "ok"; + pub(crate) const ERROR: &'static str = "error"; + pub(crate) const CANCEL: &'static str = "cancel"; +} + +pub(crate) static RECONCILER: Lazy = Lazy::new(|| ReconcilerMetrics { + spawned: register_int_counter!( + "storage_controller_reconcile_spawn", + "Count of how many times we spawn a reconcile task", + ) + .expect("failed to define a metric"), + complete: register_int_counter_vec!( + "storage_controller_reconcile_complete", + "Reconciler tasks completed, broken down by success/failure/cancelled", + &["status"], + ) + .expect("failed to define a metric"), +}); + +pub fn preinitialize_metrics() { + Lazy::force(&RECONCILER); +} diff --git a/control_plane/attachment_service/src/node.rs b/control_plane/attachment_service/src/node.rs index efd3f8f49b..27b03608fa 100644 --- a/control_plane/attachment_service/src/node.rs +++
b/control_plane/attachment_service/src/node.rs @@ -1,18 +1,50 @@ -use control_plane::attachment_service::{NodeAvailability, NodeSchedulingPolicy}; -use utils::id::NodeId; +use std::{str::FromStr, time::Duration}; -#[derive(Clone)] +use hyper::StatusCode; +use pageserver_api::{ + controller_api::{ + NodeAvailability, NodeRegisterRequest, NodeSchedulingPolicy, TenantLocateResponseShard, + }, + shard::TenantShardId, +}; +use pageserver_client::mgmt_api; +use serde::Serialize; +use tokio_util::sync::CancellationToken; +use utils::{backoff, id::NodeId}; + +use crate::persistence::NodePersistence; + +/// Represents the in-memory description of a Node. +/// +/// Scheduling statistics are maintained separately in [`crate::scheduler`]. +/// +/// The persistent subset of the Node is defined in [`crate::persistence::NodePersistence`]: the +/// implementation of serialization on this type is only for debug dumps. +#[derive(Clone, Serialize)] pub(crate) struct Node { - pub(crate) id: NodeId, + id: NodeId, - pub(crate) availability: NodeAvailability, - pub(crate) scheduling: NodeSchedulingPolicy, + availability: NodeAvailability, + scheduling: NodeSchedulingPolicy, - pub(crate) listen_http_addr: String, - pub(crate) listen_http_port: u16, + listen_http_addr: String, + listen_http_port: u16, - pub(crate) listen_pg_addr: String, - pub(crate) listen_pg_port: u16, + listen_pg_addr: String, + listen_pg_port: u16, + + // This cancellation token means "stop any RPCs in flight to this node, and don't start + // any more". It is not related to process shutdown. + #[serde(skip)] + cancel: CancellationToken, +} + +/// When updating [`Node::availability`] we use this type to indicate to the caller +/// whether/how they changed it.
+pub(crate) enum AvailabilityTransition { + ToActive, + ToOffline, + Unchanged, } impl Node { @@ -20,6 +52,71 @@ impl Node { format!("http://{}:{}", self.listen_http_addr, self.listen_http_port) } + pub(crate) fn get_id(&self) -> NodeId { + self.id + } + + pub(crate) fn set_scheduling(&mut self, scheduling: NodeSchedulingPolicy) { + self.scheduling = scheduling + } + + /// Does this registration request match `self`? This is used when deciding whether a registration + /// request should be allowed to update an existing record with the same node ID. + pub(crate) fn registration_match(&self, register_req: &NodeRegisterRequest) -> bool { + self.id == register_req.node_id + && self.listen_http_addr == register_req.listen_http_addr + && self.listen_http_port == register_req.listen_http_port + && self.listen_pg_addr == register_req.listen_pg_addr + && self.listen_pg_port == register_req.listen_pg_port + } + + /// For a shard located on this node, populate a response object + /// with this node's address information. + pub(crate) fn shard_location(&self, shard_id: TenantShardId) -> TenantLocateResponseShard { + TenantLocateResponseShard { + shard_id, + node_id: self.id, + listen_http_addr: self.listen_http_addr.clone(), + listen_http_port: self.listen_http_port, + listen_pg_addr: self.listen_pg_addr.clone(), + listen_pg_port: self.listen_pg_port, + } + } + + pub(crate) fn set_availability( + &mut self, + availability: NodeAvailability, + ) -> AvailabilityTransition { + use NodeAvailability::*; + let transition = match (self.availability, availability) { + (Offline, Active) => { + // Give the node a new cancellation token, effectively resetting it to un-cancelled. Any + // users of previously-cloned copies of the node will still see the old cancellation + // state. For example, Reconcilers in flight will have to complete and be spawned + // again to realize that the node has become available. 
+ self.cancel = CancellationToken::new(); + AvailabilityTransition::ToActive + } + (Active, Offline) => { + // Fire the node's cancellation token to cancel any in-flight API requests to it + self.cancel.cancel(); + AvailabilityTransition::ToOffline + } + _ => AvailabilityTransition::Unchanged, + }; + self.availability = availability; + transition + } + + /// Whether we may send API requests to this node. + pub(crate) fn is_available(&self) -> bool { + // When we clone a node, [`Self::availability`] is a snapshot, but [`Self::cancel`] holds + // a reference to the original Node's cancellation status. Checking both of these results + // in a "pessimistic" check where we will consider a Node instance unavailable if it was unavailable + // when we cloned it, or if the original Node instance's cancellation token was fired. + matches!(self.availability, NodeAvailability::Active) && !self.cancel.is_cancelled() + } + /// Is this node elegible to have work scheduled onto it? pub(crate) fn may_schedule(&self) -> bool { match self.availability { @@ -34,4 +131,127 @@ impl Node { NodeSchedulingPolicy::Pause => false, } } + + pub(crate) fn new( + id: NodeId, + listen_http_addr: String, + listen_http_port: u16, + listen_pg_addr: String, + listen_pg_port: u16, + ) -> Self { + Self { + id, + listen_http_addr, + listen_http_port, + listen_pg_addr, + listen_pg_port, + scheduling: NodeSchedulingPolicy::Filling, + // TODO: we shouldn't really call this Active until we've heartbeated it. 
+ availability: NodeAvailability::Active, + cancel: CancellationToken::new(), + } + } + + pub(crate) fn to_persistent(&self) -> NodePersistence { + NodePersistence { + node_id: self.id.0 as i64, + scheduling_policy: self.scheduling.into(), + listen_http_addr: self.listen_http_addr.clone(), + listen_http_port: self.listen_http_port as i32, + listen_pg_addr: self.listen_pg_addr.clone(), + listen_pg_port: self.listen_pg_port as i32, + } + } + + pub(crate) fn from_persistent(np: NodePersistence) -> Self { + Self { + id: NodeId(np.node_id as u64), + // At startup we consider a node offline until proven otherwise. + availability: NodeAvailability::Offline, + scheduling: NodeSchedulingPolicy::from_str(&np.scheduling_policy) + .expect("Bad scheduling policy in DB"), + listen_http_addr: np.listen_http_addr, + listen_http_port: np.listen_http_port as u16, + listen_pg_addr: np.listen_pg_addr, + listen_pg_port: np.listen_pg_port as u16, + cancel: CancellationToken::new(), + } + } + + /// Wrapper for issuing requests to pageserver management API: takes care of generic + /// retry/backoff for retryable HTTP status codes. + /// + /// This will return None to indicate cancellation. Cancellation may happen from + /// the cancellation token passed in, or from Self's cancellation token (i.e. node + /// going offline). 
+ pub(crate) async fn with_client_retries( + &self, + mut op: O, + jwt: &Option, + warn_threshold: u32, + max_retries: u32, + timeout: Duration, + cancel: &CancellationToken, + ) -> Option> + where + O: FnMut(mgmt_api::Client) -> F, + F: std::future::Future>, + { + fn is_fatal(e: &mgmt_api::Error) -> bool { + use mgmt_api::Error::*; + match e { + ReceiveBody(_) | ReceiveErrorBody(_) => false, + ApiError(StatusCode::SERVICE_UNAVAILABLE, _) + | ApiError(StatusCode::GATEWAY_TIMEOUT, _) + | ApiError(StatusCode::REQUEST_TIMEOUT, _) => false, + ApiError(_, _) => true, + Cancelled => true, + } + } + + backoff::retry( + || { + let http_client = reqwest::ClientBuilder::new() + .timeout(timeout) + .build() + .expect("Failed to construct HTTP client"); + + let client = + mgmt_api::Client::from_client(http_client, self.base_url(), jwt.as_deref()); + + let node_cancel_fut = self.cancel.cancelled(); + + let op_fut = op(client); + + async { + tokio::select! { + r = op_fut=> {r}, + _ = node_cancel_fut => { + Err(mgmt_api::Error::Cancelled) + }} + } + }, + is_fatal, + warn_threshold, + max_retries, + &format!( + "Call to node {} ({}:{}) management API", + self.id, self.listen_http_addr, self.listen_http_port + ), + cancel, + ) + .await + } +} + +impl std::fmt::Display for Node { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + write!(f, "{} ({})", self.id, self.listen_http_addr) + } +} + +impl std::fmt::Debug for Node { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + write!(f, "{} ({})", self.id, self.listen_http_addr) + } } diff --git a/control_plane/attachment_service/src/persistence.rs b/control_plane/attachment_service/src/persistence.rs index e944a2e9ed..aa08945834 100644 --- a/control_plane/attachment_service/src/persistence.rs +++ b/control_plane/attachment_service/src/persistence.rs @@ -1,199 +1,250 @@ -use std::{collections::HashMap, str::FromStr}; +pub(crate) mod split_state; +use std::collections::HashMap; +use 
std::str::FromStr; +use std::time::Duration; -use camino::{Utf8Path, Utf8PathBuf}; -use control_plane::{ - attachment_service::{NodeAvailability, NodeSchedulingPolicy}, - local_env::LocalEnv, -}; -use pageserver_api::{ - models::TenantConfig, - shard::{ShardCount, ShardNumber, TenantShardId}, -}; -use postgres_connection::parse_host_port; +use self::split_state::SplitState; +use camino::Utf8Path; +use camino::Utf8PathBuf; +use diesel::pg::PgConnection; +use diesel::prelude::*; +use diesel::Connection; +use pageserver_api::controller_api::{NodeSchedulingPolicy, PlacementPolicy}; +use pageserver_api::models::TenantConfig; +use pageserver_api::shard::{ShardCount, ShardNumber, TenantShardId}; use serde::{Deserialize, Serialize}; -use tracing::info; -use utils::{ - generation::Generation, - id::{NodeId, TenantId}, -}; +use utils::generation::Generation; +use utils::id::{NodeId, TenantId}; -use crate::{node::Node, PlacementPolicy}; +use crate::node::Node; -/// Placeholder for storage. This will be replaced with a database client. +/// ## What do we store? +/// +/// The storage controller service does not store most of its state durably. +/// +/// The essential things to store durably are: +/// - generation numbers, as these must always advance monotonically to ensure data safety. +/// - Tenant's PlacementPolicy and TenantConfig, as the source of truth for these is something external. +/// - Node's scheduling policies, as the source of truth for these is something external. +/// +/// Other things we store durably as an implementation detail: +/// - Node's host/port: this could be avoided it we made nodes emit a self-registering heartbeat, +/// but it is operationally simpler to make this service the authority for which nodes +/// it talks to. 
+/// +/// ## Performance/efficiency +/// +/// The storage controller service does not go via the database for most things: there are +/// a couple of places where we must, and where efficiency matters: +/// - Incrementing generation numbers: the Reconciler has to wait for this to complete +/// before it can attach a tenant, so this acts as a bound on how fast things like +/// failover can happen. +/// - Pageserver re-attach: we will increment many shards' generations when this happens, +/// so it is important to avoid e.g. issuing O(N) queries. +/// +/// Database calls relating to nodes have low performance requirements, as they are very rarely +/// updated, and reads of nodes are always from memory, not the database. We only require that +/// we can UPDATE a node's scheduling mode reasonably quickly to mark a bad node offline. pub struct Persistence { - inner: std::sync::Mutex, -} - -struct Inner { - state: PersistentState, - write_queue_tx: tokio::sync::mpsc::UnboundedSender, + connection_pool: diesel::r2d2::Pool>, + + // In test environments, we support loading+saving a JSON file. This is temporary, for the benefit of + // test_compatibility.py, so that we don't have to commit to making the database contents fully backward/forward + // compatible just yet. 
+ json_path: Option, } +/// Legacy format, for use in JSON compat objects in test environment #[derive(Serialize, Deserialize)] -struct PersistentState { +struct JsonPersistence { tenants: HashMap, } -struct PendingWrite { - bytes: Vec, - done_tx: tokio::sync::oneshot::Sender<()>, +#[derive(thiserror::Error, Debug)] +pub(crate) enum DatabaseError { + #[error(transparent)] + Query(#[from] diesel::result::Error), + #[error(transparent)] + Connection(#[from] diesel::result::ConnectionError), + #[error(transparent)] + ConnectionPool(#[from] r2d2::Error), + #[error("Logical error: {0}")] + Logical(String), } -impl PersistentState { - async fn load(path: &Utf8Path) -> anyhow::Result { - let bytes = tokio::fs::read(path).await?; - let mut decoded = serde_json::from_slice::(&bytes)?; +pub(crate) type DatabaseResult = Result; +impl Persistence { + // The default postgres connection limit is 100. We use up to 99, to leave one free for a human admin under + // normal circumstances. This assumes we have exclusive use of the database cluster to which we connect. + pub const MAX_CONNECTIONS: u32 = 99; + + // We don't want to keep a lot of connections alive: close them down promptly if they aren't being used. + const IDLE_CONNECTION_TIMEOUT: Duration = Duration::from_secs(10); + const MAX_CONNECTION_LIFETIME: Duration = Duration::from_secs(60); + + pub fn new(database_url: String, json_path: Option) -> Self { + let manager = diesel::r2d2::ConnectionManager::::new(database_url); + + // We will use a connection pool: this is primarily to _limit_ our connection count, rather than to optimize time + // to execute queries (database queries are not generally on latency-sensitive paths). 
+ let connection_pool = diesel::r2d2::Pool::builder() + .max_size(Self::MAX_CONNECTIONS) + .max_lifetime(Some(Self::MAX_CONNECTION_LIFETIME)) + .idle_timeout(Some(Self::IDLE_CONNECTION_TIMEOUT)) + // Always keep at least one connection ready to go + .min_idle(Some(1)) + .test_on_check_out(true) + .build(manager) + .expect("Could not build connection pool"); + + Self { + connection_pool, + json_path, + } + } + + /// Call the provided function in a tokio blocking thread, with a Diesel database connection. + async fn with_conn(&self, func: F) -> DatabaseResult + where + F: Fn(&mut PgConnection) -> DatabaseResult + Send + 'static, + R: Send + 'static, + { + let mut conn = self.connection_pool.get()?; + tokio::task::spawn_blocking(move || -> DatabaseResult { func(&mut conn) }) + .await + .expect("Task panic") + } + + /// When a node is first registered, persist it before using it for anything + pub(crate) async fn insert_node(&self, node: &Node) -> DatabaseResult<()> { + let np = node.to_persistent(); + self.with_conn(move |conn| -> DatabaseResult<()> { + diesel::insert_into(crate::schema::nodes::table) + .values(&np) + .execute(conn)?; + Ok(()) + }) + .await + } + + /// At startup, populate the list of nodes which our shards may be placed on + pub(crate) async fn list_nodes(&self) -> DatabaseResult> { + let nodes: Vec = self + .with_conn(move |conn| -> DatabaseResult<_> { + Ok(crate::schema::nodes::table.load::(conn)?) 
+ }) + .await?; + + tracing::info!("list_nodes: loaded {} nodes", nodes.len()); + + Ok(nodes) + } + + pub(crate) async fn update_node( + &self, + input_node_id: NodeId, + input_scheduling: NodeSchedulingPolicy, + ) -> DatabaseResult<()> { + use crate::schema::nodes::dsl::*; + let updated = self + .with_conn(move |conn| { + let updated = diesel::update(nodes) + .filter(node_id.eq(input_node_id.0 as i64)) + .set((scheduling_policy.eq(String::from(input_scheduling)),)) + .execute(conn)?; + Ok(updated) + }) + .await?; + + if updated != 1 { + Err(DatabaseError::Logical(format!( + "Node {input_node_id:?} not found for update", + ))) + } else { + Ok(()) + } + } + + /// At startup, load the high level state for shards, such as their config + policy. This will + /// be enriched at runtime with state discovered on pageservers. + pub(crate) async fn list_tenant_shards(&self) -> DatabaseResult> { + let loaded = self + .with_conn(move |conn| -> DatabaseResult<_> { + Ok(crate::schema::tenant_shards::table.load::(conn)?) + }) + .await?; + + if loaded.is_empty() { + if let Some(path) = &self.json_path { + if tokio::fs::try_exists(path) + .await + .map_err(|e| DatabaseError::Logical(format!("Error stat'ing JSON file: {e}")))? + { + tracing::info!("Importing from legacy JSON format at {path}"); + return self.list_tenant_shards_json(path).await; + } + } + } + Ok(loaded) + } + + /// Shim for automated compatibility tests: load tenants from a JSON file instead of database + pub(crate) async fn list_tenant_shards_json( + &self, + path: &Utf8Path, + ) -> DatabaseResult> { + let bytes = tokio::fs::read(path) + .await + .map_err(|e| DatabaseError::Logical(format!("Failed to load JSON: {e}")))?; + + let mut decoded = serde_json::from_slice::(&bytes) + .map_err(|e| DatabaseError::Logical(format!("Deserialization error: {e}")))?; for (tenant_id, tenant) in &mut decoded.tenants { // Backward compat: an old attachments.json from before PR #6251, replace // empty strings with proper defaults.
if tenant.tenant_id.is_empty() { - tenant.tenant_id = format!("{}", tenant_id); - tenant.config = serde_json::to_string(&TenantConfig::default())?; - tenant.placement_policy = serde_json::to_string(&PlacementPolicy::default())?; + tenant.tenant_id = tenant_id.to_string(); + tenant.config = serde_json::to_string(&TenantConfig::default()) + .map_err(|e| DatabaseError::Logical(format!("Serialization error: {e}")))?; + tenant.placement_policy = serde_json::to_string(&PlacementPolicy::Single) + .map_err(|e| DatabaseError::Logical(format!("Serialization error: {e}")))?; } } - Ok(decoded) + let tenants: Vec = decoded.tenants.into_values().collect(); + + // Synchronize database with what is in the JSON file + self.insert_tenant_shards(tenants.clone()).await?; + + Ok(tenants) } - async fn load_or_new(path: &Utf8Path) -> Self { - match Self::load(path).await { - Ok(s) => { - tracing::info!("Loaded state file at {}", path); - s - } - Err(e) - if e.downcast_ref::() - .map(|e| e.kind() == std::io::ErrorKind::NotFound) - .unwrap_or(false) => - { - tracing::info!("Will create state file at {}", path); - Self { - tenants: HashMap::new(), - } - } - Err(e) => { - panic!("Failed to load state from '{}': {e:#} (maybe your .neon/ dir was written by an older version?)", path) - } - } - } -} - -impl Persistence { - pub async fn spawn(path: &Utf8Path) -> Self { - let (tx, rx) = tokio::sync::mpsc::unbounded_channel(); - let state = PersistentState::load_or_new(path).await; - tokio::spawn(Self::writer_task(rx, path.to_owned())); - Self { - inner: std::sync::Mutex::new(Inner { - state, - write_queue_tx: tx, - }), - } - } - - async fn writer_task( - mut rx: tokio::sync::mpsc::UnboundedReceiver, - path: Utf8PathBuf, - ) { - scopeguard::defer! { - info!("persistence writer task exiting"); + /// For use in testing environments, where we dump out JSON on shutdown. 
+ pub async fn write_tenants_json(&self) -> anyhow::Result<()> { + let Some(path) = &self.json_path else { + anyhow::bail!("Cannot write JSON if path isn't set (test environment bug)"); }; - loop { - match rx.recv().await { - Some(write) => { - tokio::task::spawn_blocking({ - let path = path.clone(); - move || { - let tmp_path = - utils::crashsafe::path_with_suffix_extension(&path, "___new"); - utils::crashsafe::overwrite(&path, &tmp_path, &write.bytes) - } - }) - .await - .expect("spawn_blocking") - .expect("write file"); - let _ = write.done_tx.send(()); // receiver may lose interest any time - } - None => { - return; - } - } - } - } - - /// Perform a modification on our [`PersistentState`]. - /// Return a future that completes once our modification has been persisted. - /// The output of the future is the return value of the `txn`` closure. - async fn mutating_transaction(&self, txn: F) -> R - where - F: FnOnce(&mut PersistentState) -> R, - { - let (ret, done_rx) = { - let mut inner = self.inner.lock().unwrap(); - let ret = txn(&mut inner.state); - let (done_tx, done_rx) = tokio::sync::oneshot::channel(); - let write = PendingWrite { - bytes: serde_json::to_vec(&inner.state).expect("Serialization error"), - done_tx, + tracing::info!("Writing state to {path}..."); + let tenants = self.list_tenant_shards().await?; + let mut tenants_map = HashMap::new(); + for tsp in tenants { + let tenant_shard_id = TenantShardId { + tenant_id: TenantId::from_str(tsp.tenant_id.as_str())?, + shard_number: ShardNumber(tsp.shard_number as u8), + shard_count: ShardCount::new(tsp.shard_count as u8), }; - inner - .write_queue_tx - .send(write) - .expect("writer task always outlives self"); - (ret, done_rx) - }; - // the write task can go away once we start .await'ing - let _: () = done_rx.await.expect("writer task dead, check logs"); - ret - } - /// When registering a node, persist it so that on next start we will be able to - /// iterate over known nodes to synchronize their tenant 
shard states with our observed state. - pub(crate) async fn insert_node(&self, _node: &Node) -> anyhow::Result<()> { - // TODO: node persitence will come with database backend - Ok(()) - } - - /// At startup, we populate the service's list of nodes, and use this list to call into - /// each node to do an initial reconciliation of the state of the world with our in-memory - /// observed state. - pub(crate) async fn list_nodes(&self) -> anyhow::Result> { - let env = LocalEnv::load_config()?; - // TODO: node persitence will come with database backend - - // XXX hack: enable test_backward_compatibility to work by populating our list of - // nodes from LocalEnv when it is not present in persistent storage. Otherwise at - // first startup in the compat test, we may have shards but no nodes. - let mut result = Vec::new(); - tracing::info!( - "Loaded {} pageserver nodes from LocalEnv", - env.pageservers.len() - ); - for ps_conf in env.pageservers { - let (pg_host, pg_port) = - parse_host_port(&ps_conf.listen_pg_addr).expect("Unable to parse listen_pg_addr"); - let (http_host, http_port) = parse_host_port(&ps_conf.listen_http_addr) - .expect("Unable to parse listen_http_addr"); - result.push(Node { - id: ps_conf.id, - listen_pg_addr: pg_host.to_string(), - listen_pg_port: pg_port.unwrap_or(5432), - listen_http_addr: http_host.to_string(), - listen_http_port: http_port.unwrap_or(80), - availability: NodeAvailability::Active, - scheduling: NodeSchedulingPolicy::Active, - }); + tenants_map.insert(tenant_shard_id, tsp); } + let json = serde_json::to_string(&JsonPersistence { + tenants: tenants_map, + })?; - Ok(result) - } + tokio::fs::write(path, &json).await?; + tracing::info!("Wrote {} bytes to {path}...", json.len()); - /// At startup, we populate our map of tenant shards from persistent storage. 
- pub(crate) async fn list_tenant_shards(&self) -> anyhow::Result> { - let inner = self.inner.lock().unwrap(); - Ok(inner.state.tenants.values().cloned().collect()) + Ok(()) } /// Tenants must be persisted before we schedule them for the first time. This enables us @@ -201,22 +252,98 @@ impl Persistence { pub(crate) async fn insert_tenant_shards( &self, shards: Vec, - ) -> anyhow::Result<()> { - self.mutating_transaction(|locked| { - for shard in shards { - let tenant_shard_id = TenantShardId { - tenant_id: TenantId::from_str(shard.tenant_id.as_str())?, - shard_number: ShardNumber(shard.shard_number as u8), - shard_count: ShardCount(shard.shard_count as u8), - }; - - locked.tenants.insert(tenant_shard_id, shard); - } + ) -> DatabaseResult<()> { + use crate::schema::tenant_shards::dsl::*; + self.with_conn(move |conn| -> DatabaseResult<()> { + conn.transaction(|conn| -> QueryResult<()> { + for tenant in &shards { + diesel::insert_into(tenant_shards) + .values(tenant) + .execute(conn)?; + } + Ok(()) + })?; Ok(()) }) .await } + /// Ordering: call this _after_ deleting the tenant on pageservers, but _before_ dropping state for + /// the tenant from memory on this server. 
+ pub(crate) async fn delete_tenant(&self, del_tenant_id: TenantId) -> DatabaseResult<()> { + use crate::schema::tenant_shards::dsl::*; + self.with_conn(move |conn| -> DatabaseResult<()> { + diesel::delete(tenant_shards) + .filter(tenant_id.eq(del_tenant_id.to_string())) + .execute(conn)?; + + Ok(()) + }) + .await + } + + pub(crate) async fn delete_node(&self, del_node_id: NodeId) -> DatabaseResult<()> { + use crate::schema::nodes::dsl::*; + self.with_conn(move |conn| -> DatabaseResult<()> { + diesel::delete(nodes) + .filter(node_id.eq(del_node_id.0 as i64)) + .execute(conn)?; + + Ok(()) + }) + .await + } + + /// When a tenant invokes the /re-attach API, this function is responsible for doing an efficient + /// batched increment of the generations of all tenants whose generation_pageserver is equal to + /// the node that called /re-attach. + #[tracing::instrument(skip_all, fields(node_id))] + pub(crate) async fn re_attach( + &self, + node_id: NodeId, + ) -> DatabaseResult> { + use crate::schema::tenant_shards::dsl::*; + let updated = self + .with_conn(move |conn| { + let rows_updated = diesel::update(tenant_shards) + .filter(generation_pageserver.eq(node_id.0 as i64)) + .set(generation.eq(generation + 1)) + .execute(conn)?; + + tracing::info!("Incremented {} tenants' generations", rows_updated); + + // TODO: UPDATE+SELECT in one query + + let updated = tenant_shards + .filter(generation_pageserver.eq(node_id.0 as i64)) + .select(TenantShardPersistence::as_select()) + .load(conn)?; + Ok(updated) + }) + .await?; + + let mut result = HashMap::new(); + for tsp in updated { + let tenant_shard_id = TenantShardId { + tenant_id: TenantId::from_str(tsp.tenant_id.as_str()) + .map_err(|e| DatabaseError::Logical(format!("Malformed tenant id: {e}")))?, + shard_number: ShardNumber(tsp.shard_number as u8), + shard_count: ShardCount::new(tsp.shard_count as u8), + }; + + let Some(g) = tsp.generation else { + // If the generation_pageserver column was non-NULL, then the generation 
column should also be non-NULL: + // we only set generation_pageserver when setting generation. + return Err(DatabaseError::Logical( + "Generation should always be set after incrementing".to_string(), + )); + }; + result.insert(tenant_shard_id, Generation::new(g as u32)); + } + + Ok(result) + } + /// Reconciler calls this immediately before attaching to a new pageserver, to acquire a unique, monotonically /// advancing generation number. We also store the NodeId for which the generation was issued, so that in /// [`Self::re_attach`] we can do a bulk UPDATE on the generations for that node. @@ -225,67 +352,229 @@ impl Persistence { tenant_shard_id: TenantShardId, node_id: NodeId, ) -> anyhow::Result { - self.mutating_transaction(|locked| { - let Some(shard) = locked.tenants.get_mut(&tenant_shard_id) else { - anyhow::bail!("Tried to increment generation of unknown shard"); - }; + use crate::schema::tenant_shards::dsl::*; + let updated = self + .with_conn(move |conn| { + let updated = diesel::update(tenant_shards) + .filter(tenant_id.eq(tenant_shard_id.tenant_id.to_string())) + .filter(shard_number.eq(tenant_shard_id.shard_number.0 as i32)) + .filter(shard_count.eq(tenant_shard_id.shard_count.literal() as i32)) + .set(( + generation.eq(generation + 1), + generation_pageserver.eq(node_id.0 as i64), + )) + // TODO: only returning() the generation column + .returning(TenantShardPersistence::as_returning()) + .get_result(conn)?; - shard.generation += 1; - shard.generation_pageserver = Some(node_id); + Ok(updated) + }) + .await?; - let gen = Generation::new(shard.generation); - Ok(gen) + // Generation is always non-null in the rseult: if the generation column had been NULL, then we + // should have experienced an SQL Confilict error while executing a query that tries to increment it. 
+ debug_assert!(updated.generation.is_some()); + let Some(g) = updated.generation else { + return Err(DatabaseError::Logical( + "Generation should always be set after incrementing".to_string(), + ) + .into()); + }; + + Ok(Generation::new(g as u32)) + } + + /// For use when updating a persistent property of a tenant, such as its config or placement_policy. + /// + /// Do not use this for settting generation, unless in the special onboarding code path (/location_config) + /// API: use [`Self::increment_generation`] instead. Setting the generation via this route is a one-time thing + /// that we only do the first time a tenant is set to an attached policy via /location_config. + pub(crate) async fn update_tenant_shard( + &self, + tenant_shard_id: TenantShardId, + input_placement_policy: PlacementPolicy, + input_config: TenantConfig, + input_generation: Option, + ) -> DatabaseResult<()> { + use crate::schema::tenant_shards::dsl::*; + + self.with_conn(move |conn| { + let query = diesel::update(tenant_shards) + .filter(tenant_id.eq(tenant_shard_id.tenant_id.to_string())) + .filter(shard_number.eq(tenant_shard_id.shard_number.0 as i32)) + .filter(shard_count.eq(tenant_shard_id.shard_count.literal() as i32)); + + if let Some(input_generation) = input_generation { + // Update includes generation column + query + .set(( + generation.eq(Some(input_generation.into().unwrap() as i32)), + placement_policy + .eq(serde_json::to_string(&input_placement_policy).unwrap()), + config.eq(serde_json::to_string(&input_config).unwrap()), + )) + .execute(conn)?; + } else { + // Update does not include generation column + query + .set(( + placement_policy + .eq(serde_json::to_string(&input_placement_policy).unwrap()), + config.eq(serde_json::to_string(&input_config).unwrap()), + )) + .execute(conn)?; + } + + Ok(()) }) - .await + .await?; + + Ok(()) + } + + pub(crate) async fn update_tenant_config( + &self, + input_tenant_id: TenantId, + input_config: TenantConfig, + ) -> DatabaseResult<()> { 
+ use crate::schema::tenant_shards::dsl::*; + + self.with_conn(move |conn| { + diesel::update(tenant_shards) + .filter(tenant_id.eq(input_tenant_id.to_string())) + .set((config.eq(serde_json::to_string(&input_config).unwrap()),)) + .execute(conn)?; + + Ok(()) + }) + .await?; + + Ok(()) } pub(crate) async fn detach(&self, tenant_shard_id: TenantShardId) -> anyhow::Result<()> { - self.mutating_transaction(|locked| { - let Some(shard) = locked.tenants.get_mut(&tenant_shard_id) else { - anyhow::bail!("Tried to increment generation of unknown shard"); - }; - shard.generation_pageserver = None; - shard.placement_policy = serde_json::to_string(&PlacementPolicy::Detached).unwrap(); + use crate::schema::tenant_shards::dsl::*; + self.with_conn(move |conn| { + let updated = diesel::update(tenant_shards) + .filter(tenant_id.eq(tenant_shard_id.tenant_id.to_string())) + .filter(shard_number.eq(tenant_shard_id.shard_number.0 as i32)) + .filter(shard_count.eq(tenant_shard_id.shard_count.literal() as i32)) + .set(( + generation_pageserver.eq(Option::::None), + placement_policy.eq(serde_json::to_string(&PlacementPolicy::Detached).unwrap()), + )) + .execute(conn)?; + + Ok(updated) + }) + .await?; + + Ok(()) + } + + // When we start shard splitting, we must durably mark the tenant so that + // on restart, we know that we must go through recovery. + // + // We create the child shards here, so that they will be available for increment_generation calls + // if some pageserver holding a child shard needs to restart before the overall tenant split is complete. 
+ pub(crate) async fn begin_shard_split( + &self, + old_shard_count: ShardCount, + split_tenant_id: TenantId, + parent_to_children: Vec<(TenantShardId, Vec)>, + ) -> DatabaseResult<()> { + use crate::schema::tenant_shards::dsl::*; + self.with_conn(move |conn| -> DatabaseResult<()> { + conn.transaction(|conn| -> DatabaseResult<()> { + // Mark parent shards as splitting + + let updated = diesel::update(tenant_shards) + .filter(tenant_id.eq(split_tenant_id.to_string())) + .filter(shard_count.eq(old_shard_count.literal() as i32)) + .set((splitting.eq(1),)) + .execute(conn)?; + if u8::try_from(updated) + .map_err(|_| DatabaseError::Logical( + format!("Overflow existing shard count {} while splitting", updated)) + )? != old_shard_count.count() { + // Perhaps a deletion or another split raced with this attempt to split, mutating + // the parent shards that we intend to split. In this case the split request should fail. + return Err(DatabaseError::Logical( + format!("Unexpected existing shard count {updated} when preparing tenant for split (expected {})", old_shard_count.count()) + )); + } + + // FIXME: spurious clone to sidestep closure move rules + let parent_to_children = parent_to_children.clone(); + + // Insert child shards + for (parent_shard_id, children) in parent_to_children { + let mut parent = crate::schema::tenant_shards::table + .filter(tenant_id.eq(parent_shard_id.tenant_id.to_string())) + .filter(shard_number.eq(parent_shard_id.shard_number.0 as i32)) + .filter(shard_count.eq(parent_shard_id.shard_count.literal() as i32)) + .load::(conn)?; + let parent = if parent.len() != 1 { + return Err(DatabaseError::Logical(format!( + "Parent shard {parent_shard_id} not found" + ))); + } else { + parent.pop().unwrap() + }; + for mut shard in children { + // Carry the parent's generation into the child + shard.generation = parent.generation; + + debug_assert!(shard.splitting == SplitState::Splitting); + diesel::insert_into(tenant_shards) + .values(shard) + 
.execute(conn)?; + } + } + + Ok(()) + })?; + Ok(()) }) .await } - pub(crate) async fn re_attach( + // When we finish shard splitting, we must atomically clean up the old shards + // and insert the new shards, and clear the splitting marker. + pub(crate) async fn complete_shard_split( &self, - node_id: NodeId, - ) -> anyhow::Result> { - self.mutating_transaction(|locked| { - let mut result = HashMap::new(); - for (tenant_shard_id, shard) in locked.tenants.iter_mut() { - if shard.generation_pageserver == Some(node_id) { - shard.generation += 1; - result.insert(*tenant_shard_id, Generation::new(shard.generation)); - } - } - Ok(result) + split_tenant_id: TenantId, + old_shard_count: ShardCount, + ) -> DatabaseResult<()> { + use crate::schema::tenant_shards::dsl::*; + self.with_conn(move |conn| -> DatabaseResult<()> { + conn.transaction(|conn| -> QueryResult<()> { + // Drop parent shards + diesel::delete(tenant_shards) + .filter(tenant_id.eq(split_tenant_id.to_string())) + .filter(shard_count.eq(old_shard_count.literal() as i32)) + .execute(conn)?; + + // Clear sharding flag + let updated = diesel::update(tenant_shards) + .filter(tenant_id.eq(split_tenant_id.to_string())) + .set((splitting.eq(0),)) + .execute(conn)?; + debug_assert!(updated > 0); + + Ok(()) + })?; + + Ok(()) }) .await } - - // TODO: when we start shard splitting, we must durably mark the tenant so that - // on restart, we know that we must go through recovery (list shards that exist - // and pick up where we left off and/or revert to parent shards). - #[allow(dead_code)] - pub(crate) async fn begin_shard_split(&self, _tenant_id: TenantId) -> anyhow::Result<()> { - todo!(); - } - - // TODO: when we finish shard splitting, we must atomically clean up the old shards - // and insert the new shards, and clear the splitting marker. 
- #[allow(dead_code)] - pub(crate) async fn complete_shard_split(&self, _tenant_id: TenantId) -> anyhow::Result<()> { - todo!(); - } } /// Parts of [`crate::tenant_state::TenantState`] that are stored durably -#[derive(Serialize, Deserialize, Clone)] +#[derive(Queryable, Selectable, Insertable, Serialize, Deserialize, Clone, Eq, PartialEq)] +#[diesel(table_name = crate::schema::tenant_shards)] pub(crate) struct TenantShardPersistence { #[serde(default)] pub(crate) tenant_id: String, @@ -296,16 +585,33 @@ pub(crate) struct TenantShardPersistence { #[serde(default)] pub(crate) shard_stripe_size: i32, + // Latest generation number: next time we attach, increment this + // and use the incremented number when attaching. + // + // Generation is only None when first onboarding a tenant, where it may + // be in PlacementPolicy::Secondary and therefore have no valid generation state. + pub(crate) generation: Option, + // Currently attached pageserver #[serde(rename = "pageserver")] - pub(crate) generation_pageserver: Option, - - // Latest generation number: next time we attach, increment this - // and use the incremented number when attaching - pub(crate) generation: u32, + pub(crate) generation_pageserver: Option, #[serde(default)] pub(crate) placement_policy: String, #[serde(default)] + pub(crate) splitting: SplitState, + #[serde(default)] pub(crate) config: String, } + +/// Parts of [`crate::node::Node`] that are stored durably +#[derive(Serialize, Deserialize, Queryable, Selectable, Insertable, Eq, PartialEq)] +#[diesel(table_name = crate::schema::nodes)] +pub(crate) struct NodePersistence { + pub(crate) node_id: i64, + pub(crate) scheduling_policy: String, + pub(crate) listen_http_addr: String, + pub(crate) listen_http_port: i32, + pub(crate) listen_pg_addr: String, + pub(crate) listen_pg_port: i32, +} diff --git a/control_plane/attachment_service/src/persistence/split_state.rs b/control_plane/attachment_service/src/persistence/split_state.rs new file mode 100644 index 
0000000000..bce1a75843 --- /dev/null +++ b/control_plane/attachment_service/src/persistence/split_state.rs @@ -0,0 +1,46 @@ +use diesel::pg::{Pg, PgValue}; +use diesel::{ + deserialize::FromSql, deserialize::FromSqlRow, expression::AsExpression, serialize::ToSql, + sql_types::Int2, +}; +use serde::{Deserialize, Serialize}; + +#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash, PartialOrd, Ord, FromSqlRow, AsExpression)] +#[diesel(sql_type = SplitStateSQLRepr)] +#[derive(Deserialize, Serialize)] +pub enum SplitState { + Idle = 0, + Splitting = 1, +} + +impl Default for SplitState { + fn default() -> Self { + Self::Idle + } +} + +type SplitStateSQLRepr = Int2; + +impl ToSql for SplitState { + fn to_sql<'a>( + &'a self, + out: &'a mut diesel::serialize::Output, + ) -> diesel::serialize::Result { + let raw_value: i16 = *self as i16; + let mut new_out = out.reborrow(); + ToSql::::to_sql(&raw_value, &mut new_out) + } +} + +impl FromSql for SplitState { + fn from_sql(pg_value: PgValue) -> diesel::deserialize::Result { + match FromSql::::from_sql(pg_value).map(|v| match v { + 0 => Some(Self::Idle), + 1 => Some(Self::Splitting), + _ => None, + })? 
{ + Some(v) => Ok(v), + None => Err(format!("Invalid SplitState value, was: {:?}", pg_value.as_bytes()).into()), + } + } +} diff --git a/control_plane/attachment_service/src/reconciler.rs b/control_plane/attachment_service/src/reconciler.rs index d7f4c0406a..603da9bf02 100644 --- a/control_plane/attachment_service/src/reconciler.rs +++ b/control_plane/attachment_service/src/reconciler.rs @@ -1,6 +1,5 @@ use crate::persistence::Persistence; use crate::service; -use control_plane::attachment_service::NodeAvailability; use pageserver_api::models::{ LocationConfig, LocationConfigMode, LocationConfigSecondary, TenantConfig, }; @@ -13,8 +12,9 @@ use tokio_util::sync::CancellationToken; use utils::generation::Generation; use utils::id::{NodeId, TimelineId}; use utils::lsn::Lsn; +use utils::sync::gate::GateGuard; -use crate::compute_hook::ComputeHook; +use crate::compute_hook::{ComputeHook, NotifyError}; use crate::node::Node; use crate::tenant_state::{IntentState, ObservedState, ObservedStateLocation}; @@ -25,21 +25,28 @@ pub(super) struct Reconciler { /// of a tenant's state from when we spawned a reconcile task. pub(super) tenant_shard_id: TenantShardId, pub(crate) shard: ShardIdentity, - pub(crate) generation: Generation, - pub(crate) intent: IntentState, + pub(crate) generation: Option, + pub(crate) intent: TargetState, + + /// Nodes not referenced by [`Self::intent`], from which we should try + /// to detach this tenant shard. + pub(crate) detach: Vec, + pub(crate) config: TenantConfig, pub(crate) observed: ObservedState, pub(crate) service_config: service::Config, - /// A snapshot of the pageservers as they were when we were asked - /// to reconcile. - pub(crate) pageservers: Arc>, - /// A hook to notify the running postgres instances when we change the location - /// of a tenant + /// of a tenant. Use this via [`Self::compute_notify`] to update our failure flag + /// and guarantee eventual retries. 
pub(crate) compute_hook: Arc, + /// To avoid stalling if the cloud control plane is unavailable, we may proceed + /// past failures in [`ComputeHook::notify`], but we _must_ remember that we failed + /// so that we can set [`crate::tenant_state::TenantState::pending_compute_notification`] to ensure a later retry. + pub(crate) compute_notify_failure: bool, + /// A means to abort background reconciliation: it is essential to /// call this when something changes in the original TenantState that /// will make this reconciliation impossible or unnecessary, for @@ -47,12 +54,54 @@ pub(super) struct Reconciler { /// the tenant is changed. pub(crate) cancel: CancellationToken, + /// Reconcilers are registered with a Gate so that during a graceful shutdown we + /// can wait for all the reconcilers to respond to their cancellation tokens. + pub(crate) _gate_guard: GateGuard, + /// Access to persistent storage for updating generation numbers pub(crate) persistence: Arc, } +/// This is a snapshot of [`crate::tenant_state::IntentState`], but it does not do any +/// reference counting for Scheduler. The IntentState is what the scheduler works with, +/// and the TargetState is just the instruction for a particular Reconciler run. 
+#[derive(Debug)] +pub(crate) struct TargetState { + pub(crate) attached: Option, + pub(crate) secondary: Vec, +} + +impl TargetState { + pub(crate) fn from_intent(nodes: &HashMap, intent: &IntentState) -> Self { + Self { + attached: intent.get_attached().map(|n| { + nodes + .get(&n) + .expect("Intent attached referenced non-existent node") + .clone() + }), + secondary: intent + .get_secondary() + .iter() + .map(|n| { + nodes + .get(n) + .expect("Intent secondary referenced non-existent node") + .clone() + }) + .collect(), + } + } +} + #[derive(thiserror::Error, Debug)] -pub enum ReconcileError { +pub(crate) enum ReconcileError { + #[error(transparent)] + Remote(#[from] mgmt_api::Error), + #[error(transparent)] + Notify(#[from] NotifyError), + #[error("Cancelled")] + Cancel, #[error(transparent)] Other(#[from] anyhow::Error), } @@ -60,44 +109,83 @@ pub enum ReconcileError { impl Reconciler { async fn location_config( &mut self, - node_id: NodeId, + node: &Node, config: LocationConfig, flush_ms: Option, - ) -> anyhow::Result<()> { - let node = self - .pageservers - .get(&node_id) - .expect("Pageserver may not be removed while referenced"); + lazy: bool, + ) -> Result<(), ReconcileError> { + self.observed + .locations + .insert(node.get_id(), ObservedStateLocation { conf: None }); + + // TODO: amend locations that use long-polling: they will hit this timeout. 
+ let timeout = Duration::from_secs(25); + + tracing::info!("location_config({node}) calling: {:?}", config); + let tenant_shard_id = self.tenant_shard_id; + let config_ref = &config; + match node + .with_client_retries( + |client| async move { + let config = config_ref.clone(); + client + .location_config(tenant_shard_id, config.clone(), flush_ms, lazy) + .await + }, + &self.service_config.jwt_token, + 1, + 3, + timeout, + &self.cancel, + ) + .await + { + Some(Ok(_)) => {} + Some(Err(e)) => return Err(e.into()), + None => return Err(ReconcileError::Cancel), + }; + tracing::info!("location_config({node}) complete: {:?}", config); self.observed .locations - .insert(node.id, ObservedStateLocation { conf: None }); - - tracing::info!("location_config({}) calling: {:?}", node_id, config); - let client = - mgmt_api::Client::new(node.base_url(), self.service_config.jwt_token.as_deref()); - client - .location_config(self.tenant_shard_id, config.clone(), flush_ms) - .await?; - tracing::info!("location_config({}) complete: {:?}", node_id, config); - - self.observed - .locations - .insert(node.id, ObservedStateLocation { conf: Some(config) }); + .insert(node.get_id(), ObservedStateLocation { conf: Some(config) }); Ok(()) } + fn get_node(&self, node_id: &NodeId) -> Option<&Node> { + if let Some(node) = self.intent.attached.as_ref() { + if node.get_id() == *node_id { + return Some(node); + } + } + + if let Some(node) = self + .intent + .secondary + .iter() + .find(|n| n.get_id() == *node_id) + { + return Some(node); + } + + if let Some(node) = self.detach.iter().find(|n| n.get_id() == *node_id) { + return Some(node); + } + + None + } + async fn maybe_live_migrate(&mut self) -> Result<(), ReconcileError> { - let destination = if let Some(node_id) = self.intent.attached { - match self.observed.locations.get(&node_id) { + let destination = if let Some(node) = &self.intent.attached { + match self.observed.locations.get(&node.get_id()) { Some(conf) => { // We will do a live 
migration only if the intended destination is not // currently in an attached state. match &conf.conf { Some(conf) if conf.mode == LocationConfigMode::Secondary => { // Fall through to do a live migration - node_id + node } None | Some(_) => { // Attached or uncertain: don't do a live migration, proceed @@ -110,7 +198,7 @@ impl Reconciler { None => { // Our destination is not attached: maybe live migrate if some other // node is currently attached. Fall through. - node_id + node } } } else { @@ -123,15 +211,13 @@ impl Reconciler { for (node_id, state) in &self.observed.locations { if let Some(observed_conf) = &state.conf { if observed_conf.mode == LocationConfigMode::AttachedSingle { - let node = self - .pageservers - .get(node_id) - .expect("Nodes may not be removed while referenced"); // We will only attempt live migration if the origin is not offline: this // avoids trying to do it while reconciling after responding to an HA failover. - if !matches!(node.availability, NodeAvailability::Offline) { - origin = Some(*node_id); - break; + if let Some(node) = self.get_node(node_id) { + if node.is_available() { + origin = Some(node.clone()); + break; + } } } } @@ -144,7 +230,7 @@ impl Reconciler { // We have an origin and a destination: proceed to do the live migration tracing::info!("Live migrating {}->{}", origin, destination); - self.live_migrate(origin, destination).await?; + self.live_migrate(origin, destination.clone()).await?; Ok(()) } @@ -152,13 +238,8 @@ impl Reconciler { async fn get_lsns( &self, tenant_shard_id: TenantShardId, - node_id: &NodeId, + node: &Node, ) -> anyhow::Result> { - let node = self - .pageservers - .get(node_id) - .expect("Pageserver may not be removed while referenced"); - let client = mgmt_api::Client::new(node.base_url(), self.service_config.jwt_token.as_deref()); @@ -169,19 +250,27 @@ impl Reconciler { .collect()) } - async fn secondary_download(&self, tenant_shard_id: TenantShardId, node_id: &NodeId) { - let node = self - 
.pageservers - .get(node_id) - .expect("Pageserver may not be removed while referenced"); - - let client = - mgmt_api::Client::new(node.base_url(), self.service_config.jwt_token.as_deref()); - - match client.tenant_secondary_download(tenant_shard_id).await { - Ok(()) => {} - Err(_) => { - tracing::info!(" (skipping, destination wasn't in secondary mode)") + async fn secondary_download( + &self, + tenant_shard_id: TenantShardId, + node: &Node, + ) -> Result<(), ReconcileError> { + match node + .with_client_retries( + |client| async move { client.tenant_secondary_download(tenant_shard_id).await }, + &self.service_config.jwt_token, + 1, + 1, + Duration::from_secs(60), + &self.cancel, + ) + .await + { + None => Err(ReconcileError::Cancel), + Some(Ok(_)) => Ok(()), + Some(Err(e)) => { + tracing::info!(" (skipping destination download: {})", e); + Ok(()) } } } @@ -189,17 +278,14 @@ impl Reconciler { async fn await_lsn( &self, tenant_shard_id: TenantShardId, - pageserver_id: &NodeId, + node: &Node, baseline: HashMap, ) -> anyhow::Result<()> { loop { - let latest = match self.get_lsns(tenant_shard_id, pageserver_id).await { + let latest = match self.get_lsns(tenant_shard_id, node).await { Ok(l) => l, Err(e) => { - println!( - "🕑 Can't get LSNs on pageserver {} yet, waiting ({e})", - pageserver_id - ); + tracing::info!("🕑 Can't get LSNs on node {node} yet, waiting ({e})",); std::thread::sleep(Duration::from_millis(500)); continue; } @@ -209,7 +295,7 @@ impl Reconciler { for (timeline_id, baseline_lsn) in &baseline { match latest.get(timeline_id) { Some(latest_lsn) => { - println!("🕑 LSN origin {baseline_lsn} vs destination {latest_lsn}"); + tracing::info!("🕑 LSN origin {baseline_lsn} vs destination {latest_lsn}"); if latest_lsn < baseline_lsn { any_behind = true; } @@ -224,7 +310,7 @@ impl Reconciler { } if !any_behind { - println!("✅ LSN caught up. Proceeding..."); + tracing::info!("✅ LSN caught up. 
Proceeding..."); break; } else { std::thread::sleep(Duration::from_millis(500)); @@ -236,11 +322,11 @@ impl Reconciler { pub async fn live_migrate( &mut self, - origin_ps_id: NodeId, - dest_ps_id: NodeId, - ) -> anyhow::Result<()> { + origin_ps: Node, + dest_ps: Node, + ) -> Result<(), ReconcileError> { // `maybe_live_migrate` is responsibble for sanity of inputs - assert!(origin_ps_id != dest_ps_id); + assert!(origin_ps.get_id() != dest_ps.get_id()); fn build_location_config( shard: &ShardIdentity, @@ -255,15 +341,12 @@ impl Reconciler { secondary_conf, tenant_conf: config.clone(), shard_number: shard.number.0, - shard_count: shard.count.0, + shard_count: shard.count.literal(), shard_stripe_size: shard.stripe_size.0, } } - tracing::info!( - "🔁 Switching origin pageserver {} to stale mode", - origin_ps_id - ); + tracing::info!("🔁 Switching origin node {origin_ps} to stale mode",); // FIXME: it is incorrect to use self.generation here, we should use the generation // from the ObservedState of the origin pageserver (it might be older than self.generation) @@ -271,55 +354,64 @@ impl Reconciler { &self.shard, &self.config, LocationConfigMode::AttachedStale, - Some(self.generation), + self.generation, None, ); - self.location_config(origin_ps_id, stale_conf, Some(Duration::from_secs(10))) + self.location_config(&origin_ps, stale_conf, Some(Duration::from_secs(10)), false) .await?; - let baseline_lsns = Some(self.get_lsns(self.tenant_shard_id, &origin_ps_id).await?); + let baseline_lsns = Some(self.get_lsns(self.tenant_shard_id, &origin_ps).await?); // If we are migrating to a destination that has a secondary location, warm it up first - if let Some(destination_conf) = self.observed.locations.get(&dest_ps_id) { + if let Some(destination_conf) = self.observed.locations.get(&dest_ps.get_id()) { if let Some(destination_conf) = &destination_conf.conf { if destination_conf.mode == LocationConfigMode::Secondary { - tracing::info!( - "🔁 Downloading latest layers to destination 
pageserver {}", - dest_ps_id, - ); - self.secondary_download(self.tenant_shard_id, &dest_ps_id) - .await; + tracing::info!("🔁 Downloading latest layers to destination node {dest_ps}",); + self.secondary_download(self.tenant_shard_id, &dest_ps) + .await?; } } } // Increment generation before attaching to new pageserver - self.generation = self - .persistence - .increment_generation(self.tenant_shard_id, dest_ps_id) - .await?; + self.generation = Some( + self.persistence + .increment_generation(self.tenant_shard_id, dest_ps.get_id()) + .await?, + ); let dest_conf = build_location_config( &self.shard, &self.config, LocationConfigMode::AttachedMulti, - Some(self.generation), + self.generation, None, ); - tracing::info!("🔁 Attaching to pageserver {}", dest_ps_id); - self.location_config(dest_ps_id, dest_conf, None).await?; + tracing::info!("🔁 Attaching to pageserver {dest_ps}"); + self.location_config(&dest_ps, dest_conf, None, false) + .await?; if let Some(baseline) = baseline_lsns { tracing::info!("🕑 Waiting for LSN to catch up..."); - self.await_lsn(self.tenant_shard_id, &dest_ps_id, baseline) + self.await_lsn(self.tenant_shard_id, &dest_ps, baseline) .await?; } - tracing::info!("🔁 Notifying compute to use pageserver {}", dest_ps_id); - self.compute_hook - .notify(self.tenant_shard_id, dest_ps_id) - .await?; + tracing::info!("🔁 Notifying compute to use pageserver {dest_ps}"); + + // During a live migration it is unhelpful to proceed if we couldn't notify compute: if we detach + // the origin without notifying compute, we will render the tenant unavailable. + while let Err(e) = self.compute_notify().await { + match e { + NotifyError::Fatal(_) => return Err(ReconcileError::Notify(e)), + _ => { + tracing::warn!( + "Live migration blocked by compute notification error, retrying: {e}" + ); + } + } + } // Downgrade the origin to secondary. 
If the tenant's policy is PlacementPolicy::Single, then // this location will be deleted in the general case reconciliation that runs after this. @@ -330,39 +422,81 @@ impl Reconciler { None, Some(LocationConfigSecondary { warm: true }), ); - self.location_config(origin_ps_id, origin_secondary_conf.clone(), None) + self.location_config(&origin_ps, origin_secondary_conf.clone(), None, false) .await?; // TODO: we should also be setting the ObservedState on earlier API calls, in case we fail // partway through. In fact, all location conf API calls should be in a wrapper that sets // the observed state to None, then runs, then sets it to what we wrote. self.observed.locations.insert( - origin_ps_id, + origin_ps.get_id(), ObservedStateLocation { conf: Some(origin_secondary_conf), }, ); - println!( - "🔁 Switching to AttachedSingle mode on pageserver {}", - dest_ps_id - ); + tracing::info!("🔁 Switching to AttachedSingle mode on node {dest_ps}",); let dest_final_conf = build_location_config( &self.shard, &self.config, LocationConfigMode::AttachedSingle, - Some(self.generation), + self.generation, None, ); - self.location_config(dest_ps_id, dest_final_conf.clone(), None) + self.location_config(&dest_ps, dest_final_conf.clone(), None, false) .await?; self.observed.locations.insert( - dest_ps_id, + dest_ps.get_id(), ObservedStateLocation { conf: Some(dest_final_conf), }, ); - println!("✅ Migration complete"); + tracing::info!("✅ Migration complete"); + + Ok(()) + } + + async fn maybe_refresh_observed(&mut self) -> Result<(), ReconcileError> { + // If the attached node has uncertain state, read it from the pageserver before proceeding: this + // is important to avoid spurious generation increments. + // + // We don't need to do this for secondary/detach locations because it's harmless to just PUT their + // location conf, whereas for attached locations it can interrupt clients if we spuriously destroy/recreate + // the `Timeline` object in the pageserver. 
+ + let Some(attached_node) = self.intent.attached.as_ref() else { + // Nothing to do + return Ok(()); + }; + + if matches!( + self.observed.locations.get(&attached_node.get_id()), + Some(ObservedStateLocation { conf: None }) + ) { + let tenant_shard_id = self.tenant_shard_id; + let observed_conf = match attached_node + .with_client_retries( + |client| async move { client.get_location_config(tenant_shard_id).await }, + &self.service_config.jwt_token, + 1, + 1, + Duration::from_secs(5), + &self.cancel, + ) + .await + { + Some(Ok(observed)) => observed, + Some(Err(e)) => return Err(e.into()), + None => return Err(ReconcileError::Cancel), + }; + tracing::info!("Scanned location configuration on {attached_node}: {observed_conf:?}"); + self.observed.locations.insert( + attached_node.get_id(), + ObservedStateLocation { + conf: observed_conf, + }, + ); + } Ok(()) } @@ -374,41 +508,81 @@ impl Reconciler { /// general case reconciliation where we walk through the intent by pageserver /// and call out to the pageserver to apply the desired state. pub(crate) async fn reconcile(&mut self) -> Result<(), ReconcileError> { - // TODO: if any of self.observed is None, call to remote pageservers - // to learn correct state. + // Prepare: if we have uncertain `observed` state for our would-be attachement location, then refresh it + self.maybe_refresh_observed().await?; // Special case: live migration self.maybe_live_migrate().await?; // If the attached pageserver is not attached, do so now. 
- if let Some(node_id) = self.intent.attached { - let mut wanted_conf = - attached_location_conf(self.generation, &self.shard, &self.config); - match self.observed.locations.get(&node_id) { + if let Some(node) = self.intent.attached.as_ref() { + // If we are in an attached policy, then generation must have been set (null generations + // are only present when a tenant is initially loaded with a secondary policy) + debug_assert!(self.generation.is_some()); + let Some(generation) = self.generation else { + return Err(ReconcileError::Other(anyhow::anyhow!( + "Attempted to attach with NULL generation" + ))); + }; + + let mut wanted_conf = attached_location_conf(generation, &self.shard, &self.config); + match self.observed.locations.get(&node.get_id()) { Some(conf) if conf.conf.as_ref() == Some(&wanted_conf) => { // Nothing to do - tracing::info!("Observed configuration already correct.") + tracing::info!(node_id=%node.get_id(), "Observed configuration already correct.") } - _ => { + observed => { // In all cases other than a matching observed configuration, we will // reconcile this location. This includes locations with different configurations, as well // as locations with unknown (None) observed state. - self.generation = self - .persistence - .increment_generation(self.tenant_shard_id, node_id) - .await?; - wanted_conf.generation = self.generation.into(); - tracing::info!("Observed configuration requires update."); - self.location_config(node_id, wanted_conf, None).await?; - if let Err(e) = self - .compute_hook - .notify(self.tenant_shard_id, node_id) - .await - { - tracing::warn!( - "Failed to notify compute of newly attached pageserver {node_id}: {e}" - ); + + // The general case is to increment the generation. However, there are cases + // where this is not necessary: + // - if we are only updating the TenantConf part of the location + // - if we are only changing the attachment mode (e.g. 
going to attachedmulti or attachedstale) + // and the location was already in the correct generation + let increment_generation = match observed { + None => true, + Some(ObservedStateLocation { conf: None }) => true, + Some(ObservedStateLocation { + conf: Some(observed), + }) => { + let generations_match = observed.generation == wanted_conf.generation; + + use LocationConfigMode::*; + let mode_transition_requires_gen_inc = + match (observed.mode, wanted_conf.mode) { + // Usually the short-lived attachment modes (multi and stale) are only used + // in the case of [`Self::live_migrate`], but it is simple to handle them correctly + // here too. Locations are allowed to go Single->Stale and Multi->Single within the same generation. + (AttachedSingle, AttachedStale) => false, + (AttachedMulti, AttachedSingle) => false, + (lhs, rhs) => lhs != rhs, + }; + + !generations_match || mode_transition_requires_gen_inc + } + }; + + if increment_generation { + let generation = self + .persistence + .increment_generation(self.tenant_shard_id, node.get_id()) + .await?; + self.generation = Some(generation); + wanted_conf.generation = generation.into(); } + tracing::info!(node_id=%node.get_id(), "Observed configuration requires update."); + + // Because `node` comes from a ref to &self, clone it before calling into a &mut self + // function: this could be avoided by refactoring the state mutated by location_config into + // a separate type to Self. + let node = node.clone(); + + // Use lazy=true, because we may run many of Self concurrently, and do not want to + // overload the pageserver with logical size calculations. + self.location_config(&node, wanted_conf, None, true).await?; + self.compute_notify().await?; } } } @@ -416,51 +590,76 @@ impl Reconciler { // Configure secondary locations: if these were previously attached this // implicitly downgrades them from attached to secondary. 
let mut changes = Vec::new(); - for node_id in &self.intent.secondary { + for node in &self.intent.secondary { let wanted_conf = secondary_location_conf(&self.shard, &self.config); - match self.observed.locations.get(node_id) { + match self.observed.locations.get(&node.get_id()) { Some(conf) if conf.conf.as_ref() == Some(&wanted_conf) => { // Nothing to do - tracing::info!(%node_id, "Observed configuration already correct.") + tracing::info!(node_id=%node.get_id(), "Observed configuration already correct.") } _ => { // In all cases other than a matching observed configuration, we will // reconcile this location. - tracing::info!(%node_id, "Observed configuration requires update."); - changes.push((*node_id, wanted_conf)) + tracing::info!(node_id=%node.get_id(), "Observed configuration requires update."); + changes.push((node.clone(), wanted_conf)) } } } // Detach any extraneous pageservers that are no longer referenced // by our intent. - let all_pageservers = self.intent.all_pageservers(); - for node_id in self.observed.locations.keys() { - if all_pageservers.contains(node_id) { - // We are only detaching pageservers that aren't used at all. 
- continue; - } - + for node in &self.detach { changes.push(( - *node_id, + node.clone(), LocationConfig { mode: LocationConfigMode::Detached, generation: None, secondary_conf: None, shard_number: self.shard.number.0, - shard_count: self.shard.count.0, + shard_count: self.shard.count.literal(), shard_stripe_size: self.shard.stripe_size.0, tenant_conf: self.config.clone(), }, )); } - for (node_id, conf) in changes { - self.location_config(node_id, conf, None).await?; + for (node, conf) in changes { + if self.cancel.is_cancelled() { + return Err(ReconcileError::Cancel); + } + self.location_config(&node, conf, None, false).await?; } Ok(()) } + + pub(crate) async fn compute_notify(&mut self) -> Result<(), NotifyError> { + // Whenever a particular Reconciler emits a notification, it is always notifying for the intended + // destination. + if let Some(node) = &self.intent.attached { + let result = self + .compute_hook + .notify( + self.tenant_shard_id, + node.get_id(), + self.shard.stripe_size, + &self.cancel, + ) + .await; + if let Err(e) = &result { + // It is up to the caller whether they want to drop out on this error, but they don't have to: + // in general we should avoid letting unavailability of the cloud control plane stop us from + // making progress. + tracing::warn!("Failed to notify compute of attached pageserver {node}: {e}"); + // Set this flag so that in our ReconcileResult we will set the flag on the shard that it + // needs to retry at some point. 
+ self.compute_notify_failure = true; + } + result + } else { + Ok(()) + } + } } pub(crate) fn attached_location_conf( @@ -473,7 +672,7 @@ pub(crate) fn attached_location_conf( generation: generation.into(), secondary_conf: None, shard_number: shard.number.0, - shard_count: shard.count.0, + shard_count: shard.count.literal(), shard_stripe_size: shard.stripe_size.0, tenant_conf: config.clone(), } @@ -488,7 +687,7 @@ pub(crate) fn secondary_location_conf( generation: None, secondary_conf: Some(LocationConfigSecondary { warm: true }), shard_number: shard.number.0, - shard_count: shard.count.0, + shard_count: shard.count.literal(), shard_stripe_size: shard.stripe_size.0, tenant_conf: config.clone(), } diff --git a/control_plane/attachment_service/src/scheduler.rs b/control_plane/attachment_service/src/scheduler.rs index 1966a7ea2a..26a2707e8d 100644 --- a/control_plane/attachment_service/src/scheduler.rs +++ b/control_plane/attachment_service/src/scheduler.rs @@ -1,8 +1,7 @@ -use pageserver_api::shard::TenantShardId; -use std::collections::{BTreeMap, HashMap}; -use utils::{http::error::ApiError, id::NodeId}; - use crate::{node::Node, tenant_state::TenantState}; +use serde::Serialize; +use std::collections::HashMap; +use utils::{http::error::ApiError, id::NodeId}; /// Scenarios in which we cannot find a suitable location for a tenant shard #[derive(thiserror::Error, Debug)] @@ -19,52 +18,203 @@ impl From for ApiError { } } +#[derive(Serialize, Eq, PartialEq)] +struct SchedulerNode { + /// How many shards are currently scheduled on this node, via their [`crate::tenant_state::IntentState`]. + shard_count: usize, + + /// Whether this node is currently elegible to have new shards scheduled (this is derived + /// from a node's availability state and scheduling policy). + may_schedule: bool, +} + +/// This type is responsible for selecting which node is used when a tenant shard needs to choose a pageserver +/// on which to run. 
+/// +/// The type has no persistent state of its own: this is all populated at startup. The Serialize +/// impl is only for debug dumps. +#[derive(Serialize)] pub(crate) struct Scheduler { - tenant_counts: HashMap, + nodes: HashMap, } impl Scheduler { - pub(crate) fn new( - tenants: &BTreeMap, - nodes: &HashMap, - ) -> Self { - let mut tenant_counts = HashMap::new(); - for node_id in nodes.keys() { - tenant_counts.insert(*node_id, 0); + pub(crate) fn new<'a>(nodes: impl Iterator) -> Self { + let mut scheduler_nodes = HashMap::new(); + for node in nodes { + scheduler_nodes.insert( + node.get_id(), + SchedulerNode { + shard_count: 0, + may_schedule: node.may_schedule(), + }, + ); } - for tenant in tenants.values() { - if let Some(ps) = tenant.intent.attached { - let entry = tenant_counts.entry(ps).or_insert(0); - *entry += 1; - } + Self { + nodes: scheduler_nodes, } - - for (node_id, node) in nodes { - if !node.may_schedule() { - tenant_counts.remove(node_id); - } - } - - Self { tenant_counts } } - pub(crate) fn schedule_shard( - &mut self, - hard_exclude: &[NodeId], - ) -> Result { - if self.tenant_counts.is_empty() { + /// For debug/support: check that our internal statistics are in sync with the state of + /// the nodes & tenant shards. + /// + /// If anything is inconsistent, log details and return an error. 
+ pub(crate) fn consistency_check<'a>( + &self, + nodes: impl Iterator, + shards: impl Iterator, + ) -> anyhow::Result<()> { + let mut expect_nodes: HashMap = HashMap::new(); + for node in nodes { + expect_nodes.insert( + node.get_id(), + SchedulerNode { + shard_count: 0, + may_schedule: node.may_schedule(), + }, + ); + } + + for shard in shards { + if let Some(node_id) = shard.intent.get_attached() { + match expect_nodes.get_mut(node_id) { + Some(node) => node.shard_count += 1, + None => anyhow::bail!( + "Tenant {} references nonexistent node {}", + shard.tenant_shard_id, + node_id + ), + } + } + + for node_id in shard.intent.get_secondary() { + match expect_nodes.get_mut(node_id) { + Some(node) => node.shard_count += 1, + None => anyhow::bail!( + "Tenant {} references nonexistent node {}", + shard.tenant_shard_id, + node_id + ), + } + } + } + + for (node_id, expect_node) in &expect_nodes { + let Some(self_node) = self.nodes.get(node_id) else { + anyhow::bail!("Node {node_id} not found in Self") + }; + + if self_node != expect_node { + tracing::error!("Inconsistency detected in scheduling state for node {node_id}"); + tracing::error!("Expected state: {}", serde_json::to_string(expect_node)?); + tracing::error!("Self state: {}", serde_json::to_string(self_node)?); + + anyhow::bail!("Inconsistent state on {node_id}"); + } + } + + if expect_nodes.len() != self.nodes.len() { + // We just checked that all the expected nodes are present. If the lengths don't match, + // it means that we have nodes in Self that are unexpected. + for node_id in self.nodes.keys() { + if !expect_nodes.contains_key(node_id) { + anyhow::bail!("Node {node_id} found in Self but not in expected nodes"); + } + } + } + + Ok(()) + } + + /// Increment the reference count of a node. This reference count is used to guide scheduling + /// decisions, not for memory management: it represents one tenant shard whose IntentState targets + /// this node. 
+ /// + /// It is an error to call this for a node that is not known to the scheduler (i.e. passed into + /// [`Self::new`] or [`Self::node_upsert`]) + pub(crate) fn node_inc_ref(&mut self, node_id: NodeId) { + let Some(node) = self.nodes.get_mut(&node_id) else { + tracing::error!("Scheduler missing node {node_id}"); + debug_assert!(false); + return; + }; + + node.shard_count += 1; + } + + /// Decrement a node's reference count. Inverse of [`Self::node_inc_ref`]. + pub(crate) fn node_dec_ref(&mut self, node_id: NodeId) { + let Some(node) = self.nodes.get_mut(&node_id) else { + debug_assert!(false); + tracing::error!("Scheduler missing node {node_id}"); + return; + }; + + node.shard_count -= 1; + } + + pub(crate) fn node_upsert(&mut self, node: &Node) { + use std::collections::hash_map::Entry::*; + match self.nodes.entry(node.get_id()) { + Occupied(mut entry) => { + entry.get_mut().may_schedule = node.may_schedule(); + } + Vacant(entry) => { + entry.insert(SchedulerNode { + shard_count: 0, + may_schedule: node.may_schedule(), + }); + } + } + } + + pub(crate) fn node_remove(&mut self, node_id: NodeId) { + if self.nodes.remove(&node_id).is_none() { + tracing::warn!(node_id=%node_id, "Removed non-existent node from scheduler"); + } + } + + /// Where we have several nodes to choose from, for example when picking a secondary location + /// to promote to an attached location, this method may be used to pick the best choice based + /// on the scheduler's knowledge of utilization and availability. + /// + /// If the input is empty, or all the nodes are not elegible for scheduling, return None: the + /// caller can pick a node some other way. 
+ pub(crate) fn node_preferred(&self, nodes: &[NodeId]) -> Option { + if nodes.is_empty() { + return None; + } + + let node = nodes + .iter() + .map(|node_id| { + let may_schedule = self + .nodes + .get(node_id) + .map(|n| n.may_schedule) + .unwrap_or(false); + (*node_id, may_schedule) + }) + .max_by_key(|(_n, may_schedule)| *may_schedule); + + // If even the preferred node has may_schedule==false, return None + node.and_then(|(node_id, may_schedule)| if may_schedule { Some(node_id) } else { None }) + } + + pub(crate) fn schedule_shard(&self, hard_exclude: &[NodeId]) -> Result { + if self.nodes.is_empty() { return Err(ScheduleError::NoPageservers); } let mut tenant_counts: Vec<(NodeId, usize)> = self - .tenant_counts + .nodes .iter() .filter_map(|(k, v)| { - if hard_exclude.contains(k) { + if hard_exclude.contains(k) || !v.may_schedule { None } else { - Some((*k, *v)) + Some((*k, v.shard_count)) } }) .collect(); @@ -73,17 +223,106 @@ impl Scheduler { tenant_counts.sort_by_key(|i| (i.1, i.0)); if tenant_counts.is_empty() { - // After applying constraints, no pageservers were left + // After applying constraints, no pageservers were left. We log some detail about + // the state of nodes to help understand why this happened. This is not logged as an error because + // it is legitimately possible for enough nodes to be Offline to prevent scheduling a shard. 
+ tracing::info!("Scheduling failure, while excluding {hard_exclude:?}, node states:"); + for (node_id, node) in &self.nodes { + tracing::info!( + "Node {node_id}: may_schedule={} shards={}", + node.may_schedule, + node.shard_count + ); + } + return Err(ScheduleError::ImpossibleConstraint); } - for (node_id, count) in &tenant_counts { - tracing::info!("tenant_counts[{node_id}]={count}"); - } - let node_id = tenant_counts.first().unwrap().0; - tracing::info!("scheduler selected node {node_id}"); - *self.tenant_counts.get_mut(&node_id).unwrap() += 1; + tracing::info!( + "scheduler selected node {node_id} (elegible nodes {:?}, exclude: {hard_exclude:?})", + tenant_counts.iter().map(|i| i.0 .0).collect::>() + ); + + // Note that we do not update shard count here to reflect the scheduling: that + // is IntentState's job when the scheduled location is used. + Ok(node_id) } } + +#[cfg(test)] +pub(crate) mod test_utils { + + use crate::node::Node; + use std::collections::HashMap; + use utils::id::NodeId; + /// Test helper: synthesize the requested number of nodes, all in active state. + /// + /// Node IDs start at one. 
+ pub(crate) fn make_test_nodes(n: u64) -> HashMap { + (1..n + 1) + .map(|i| { + (NodeId(i), { + let node = Node::new( + NodeId(i), + format!("httphost-{i}"), + 80 + i as u16, + format!("pghost-{i}"), + 5432 + i as u16, + ); + assert!(node.is_available()); + node + }) + }) + .collect() + } +} + +#[cfg(test)] +mod tests { + use super::*; + + use crate::tenant_state::IntentState; + #[test] + fn scheduler_basic() -> anyhow::Result<()> { + let nodes = test_utils::make_test_nodes(2); + + let mut scheduler = Scheduler::new(nodes.values()); + let mut t1_intent = IntentState::new(); + let mut t2_intent = IntentState::new(); + + let scheduled = scheduler.schedule_shard(&[])?; + t1_intent.set_attached(&mut scheduler, Some(scheduled)); + let scheduled = scheduler.schedule_shard(&[])?; + t2_intent.set_attached(&mut scheduler, Some(scheduled)); + + assert_eq!(scheduler.nodes.get(&NodeId(1)).unwrap().shard_count, 1); + assert_eq!(scheduler.nodes.get(&NodeId(2)).unwrap().shard_count, 1); + + let scheduled = scheduler.schedule_shard(&t1_intent.all_pageservers())?; + t1_intent.push_secondary(&mut scheduler, scheduled); + + assert_eq!(scheduler.nodes.get(&NodeId(1)).unwrap().shard_count, 1); + assert_eq!(scheduler.nodes.get(&NodeId(2)).unwrap().shard_count, 2); + + t1_intent.clear(&mut scheduler); + assert_eq!(scheduler.nodes.get(&NodeId(1)).unwrap().shard_count, 0); + assert_eq!(scheduler.nodes.get(&NodeId(2)).unwrap().shard_count, 1); + + if cfg!(debug_assertions) { + // Dropping an IntentState without clearing it causes a panic in debug mode, + // because we have failed to properly update scheduler shard counts. 
+ let result = std::panic::catch_unwind(move || { + drop(t2_intent); + }); + assert!(result.is_err()); + } else { + t2_intent.clear(&mut scheduler); + assert_eq!(scheduler.nodes.get(&NodeId(1)).unwrap().shard_count, 0); + assert_eq!(scheduler.nodes.get(&NodeId(2)).unwrap().shard_count, 0); + } + + Ok(()) + } +} diff --git a/control_plane/attachment_service/src/schema.rs b/control_plane/attachment_service/src/schema.rs new file mode 100644 index 0000000000..76e4e56a66 --- /dev/null +++ b/control_plane/attachment_service/src/schema.rs @@ -0,0 +1,28 @@ +// @generated automatically by Diesel CLI. + +diesel::table! { + nodes (node_id) { + node_id -> Int8, + scheduling_policy -> Varchar, + listen_http_addr -> Varchar, + listen_http_port -> Int4, + listen_pg_addr -> Varchar, + listen_pg_port -> Int4, + } +} + +diesel::table! { + tenant_shards (tenant_id, shard_number, shard_count) { + tenant_id -> Varchar, + shard_number -> Int4, + shard_count -> Int4, + shard_stripe_size -> Int4, + generation -> Nullable, + generation_pageserver -> Nullable, + placement_policy -> Varchar, + splitting -> Int2, + config -> Text, + } +} + +diesel::allow_tables_to_appear_in_same_query!(nodes, tenant_shards,); diff --git a/control_plane/attachment_service/src/service.rs b/control_plane/attachment_service/src/service.rs index c9ed07ae5f..ea301d0372 100644 --- a/control_plane/attachment_service/src/service.rs +++ b/control_plane/attachment_service/src/service.rs @@ -1,88 +1,161 @@ use std::{ - collections::{BTreeMap, HashMap}, + borrow::Cow, + cmp::Ordering, + collections::{BTreeMap, HashMap, HashSet}, str::FromStr, sync::Arc, time::{Duration, Instant}, }; -use control_plane::attachment_service::{ - AttachHookRequest, AttachHookResponse, InspectRequest, InspectResponse, NodeAvailability, - NodeConfigureRequest, NodeRegisterRequest, NodeSchedulingPolicy, TenantCreateResponse, - TenantCreateResponseShard, TenantLocateResponse, TenantLocateResponseShard, - TenantShardMigrateRequest, 
TenantShardMigrateResponse, +use anyhow::Context; +use control_plane::storage_controller::{ + AttachHookRequest, AttachHookResponse, InspectRequest, InspectResponse, }; +use diesel::result::DatabaseErrorKind; +use futures::{stream::FuturesUnordered, StreamExt}; use hyper::StatusCode; use pageserver_api::{ - control_api::{ + controller_api::{ + NodeAvailability, NodeConfigureRequest, NodeRegisterRequest, PlacementPolicy, + TenantCreateResponse, TenantCreateResponseShard, TenantLocateResponse, + TenantShardMigrateRequest, TenantShardMigrateResponse, + }, + models::TenantConfigRequest, +}; +use pageserver_api::{ + models::{ + self, LocationConfig, LocationConfigListResponse, LocationConfigMode, ShardParameters, + TenantConfig, TenantCreateRequest, TenantLocationConfigRequest, + TenantLocationConfigResponse, TenantShardLocation, TenantShardSplitRequest, + TenantShardSplitResponse, TenantTimeTravelRequest, TimelineCreateRequest, TimelineInfo, + }, + shard::{ShardCount, ShardIdentity, ShardNumber, ShardStripeSize, TenantShardId}, + upcall_api::{ ReAttachRequest, ReAttachResponse, ReAttachResponseTenant, ValidateRequest, ValidateResponse, ValidateResponseTenant, }, - models, - models::{ - LocationConfig, LocationConfigMode, ShardParameters, TenantConfig, TenantCreateRequest, - TimelineCreateRequest, TimelineInfo, - }, - shard::{ShardCount, ShardIdentity, ShardNumber, ShardStripeSize, TenantShardId}, }; use pageserver_client::mgmt_api; +use tokio_util::sync::CancellationToken; +use tracing::instrument; use utils::{ + completion::Barrier, generation::Generation, http::error::ApiError, - id::{NodeId, TenantId}, + id::{NodeId, TenantId, TimelineId}, seqwait::SeqWait, + sync::gate::Gate, }; use crate::{ - compute_hook::ComputeHook, - node::Node, - persistence::{Persistence, TenantShardPersistence}, + compute_hook::{self, ComputeHook}, + node::{AvailabilityTransition, Node}, + persistence::{split_state::SplitState, DatabaseError, Persistence, TenantShardPersistence}, + 
reconciler::attached_location_conf, scheduler::Scheduler, tenant_state::{ IntentState, ObservedState, ObservedStateLocation, ReconcileResult, ReconcileWaitError, ReconcilerWaiter, TenantState, }, - PlacementPolicy, Sequence, + Sequence, }; +// For operations that should be quick, like attaching a new tenant +const SHORT_RECONCILE_TIMEOUT: Duration = Duration::from_secs(5); + +// For operations that might be slow, like migrating a tenant with +// some data in it. const RECONCILE_TIMEOUT: Duration = Duration::from_secs(30); +// If we receive a call using Secondary mode initially, it will omit generation. We will initialize +// tenant shards into this generation, and as long as it remains in this generation, we will accept +// input generation from future requests as authoritative. +const INITIAL_GENERATION: Generation = Generation::new(0); + +/// How long [`Service::startup_reconcile`] is allowed to take before it should give +/// up on unresponsive pageservers and proceed. +pub(crate) const STARTUP_RECONCILE_TIMEOUT: Duration = Duration::from_secs(30); + // Top level state available to all HTTP handlers struct ServiceState { tenants: BTreeMap, nodes: Arc>, - compute_hook: Arc, - - result_tx: tokio::sync::mpsc::UnboundedSender, + scheduler: Scheduler, } impl ServiceState { fn new( - result_tx: tokio::sync::mpsc::UnboundedSender, nodes: HashMap, tenants: BTreeMap, + scheduler: Scheduler, ) -> Self { Self { tenants, nodes: Arc::new(nodes), - compute_hook: Arc::new(ComputeHook::new()), - result_tx, + scheduler, } } + + fn parts_mut( + &mut self, + ) -> ( + &mut Arc>, + &mut BTreeMap, + &mut Scheduler, + ) { + (&mut self.nodes, &mut self.tenants, &mut self.scheduler) + } } #[derive(Clone)] pub struct Config { // All pageservers managed by one instance of this service must have - // the same public key. + // the same public key. This JWT token will be used to authenticate + // this service to the pageservers it manages. 
pub jwt_token: Option, + + // This JWT token will be used to authenticate this service to the control plane. + pub control_plane_jwt_token: Option, + + /// Where the compute hook should send notifications of pageserver attachment locations + /// (this URL points to the control plane in prod). If this is None, the compute hook will + /// assume it is running in a test environment and try to update neon_local. + pub compute_hook_url: Option, +} + +impl From for ApiError { + fn from(err: DatabaseError) -> ApiError { + match err { + DatabaseError::Query(e) => ApiError::InternalServerError(e.into()), + // FIXME: ApiError doesn't have an Unavailable variant, but ShuttingDown maps to 503. + DatabaseError::Connection(_) | DatabaseError::ConnectionPool(_) => { + ApiError::ShuttingDown + } + DatabaseError::Logical(reason) => { + ApiError::InternalServerError(anyhow::anyhow!(reason)) + } + } + } } pub struct Service { inner: Arc>, config: Config, persistence: Arc, + compute_hook: Arc, + result_tx: tokio::sync::mpsc::UnboundedSender, + + // Process shutdown will fire this token + cancel: CancellationToken, + + // Background tasks will hold this gate + gate: Gate, + + /// This waits for initial reconciliation with pageservers to complete. Until this barrier + /// passes, it isn't safe to do any actions that mutate tenants. + pub(crate) startup_complete: Barrier, } impl From for ApiError { @@ -95,132 +168,272 @@ impl From for ApiError { } } +#[allow(clippy::large_enum_variant)] +enum TenantCreateOrUpdate { + Create(TenantCreateRequest), + Update(Vec), +} + +struct ShardUpdate { + tenant_shard_id: TenantShardId, + placement_policy: PlacementPolicy, + tenant_config: TenantConfig, + + /// If this is None, generation is not updated. 
+ generation: Option, +} + impl Service { - pub async fn spawn(config: Config, persistence: Arc) -> anyhow::Result> { - let (result_tx, mut result_rx) = tokio::sync::mpsc::unbounded_channel(); - - tracing::info!("Loading nodes from database..."); - let mut nodes = persistence.list_nodes().await?; - tracing::info!("Loaded {} nodes from database.", nodes.len()); - - tracing::info!("Loading shards from database..."); - let tenant_shard_persistence = persistence.list_tenant_shards().await?; - tracing::info!( - "Loaded {} shards from database.", - tenant_shard_persistence.len() - ); - - let mut tenants = BTreeMap::new(); - - for tsp in tenant_shard_persistence { - let tenant_shard_id = TenantShardId { - tenant_id: TenantId::from_str(tsp.tenant_id.as_str())?, - shard_number: ShardNumber(tsp.shard_number as u8), - shard_count: ShardCount(tsp.shard_count as u8), - }; - let shard_identity = if tsp.shard_count == 0 { - ShardIdentity::unsharded() - } else { - ShardIdentity::new( - ShardNumber(tsp.shard_number as u8), - ShardCount(tsp.shard_count as u8), - ShardStripeSize(tsp.shard_stripe_size as u32), - )? - }; - let new_tenant = TenantState { - tenant_shard_id, - shard: shard_identity, - sequence: Sequence::initial(), - // Note that we load generation, but don't care about generation_pageserver. We will either end up finding - // our existing attached location and it will match generation_pageserver, or we will attach somewhere new - // and update generation_pageserver in the process. 
- generation: Generation::new(tsp.generation), - policy: serde_json::from_str(&tsp.placement_policy).unwrap(), - intent: IntentState::new(), - observed: ObservedState::new(), - config: serde_json::from_str(&tsp.config).unwrap(), - reconciler: None, - waiter: Arc::new(SeqWait::new(Sequence::initial())), - error_waiter: Arc::new(SeqWait::new(Sequence::initial())), - last_error: Arc::default(), - }; - - tenants.insert(tenant_shard_id, new_tenant); - } + pub fn get_config(&self) -> &Config { + &self.config + } + /// Called once on startup, this function attempts to contact all pageservers to build an up-to-date + /// view of the world, and determine which pageservers are responsive. + #[instrument(skip_all)] + async fn startup_reconcile(self: &Arc) { // For all tenant shards, a vector of observed states on nodes (where None means // indeterminate, same as in [`ObservedStateLocation`]) - let mut observed = HashMap::new(); + let mut observed: HashMap)>> = + HashMap::new(); - // TODO: issue these requests concurrently - for node in &mut nodes { - let client = mgmt_api::Client::new(node.base_url(), config.jwt_token.as_deref()); + let mut nodes_online = HashSet::new(); - tracing::info!("Scanning shards on node {}...", node.id); - match client.list_location_config().await { - Err(e) => { - tracing::warn!("Could not contact pageserver {} ({e})", node.id); - // TODO: be more tolerant, apply a generous 5-10 second timeout - // TODO: setting a node to Offline is a dramatic thing to do, and can - // prevent neon_local from starting up (it starts this service before - // any pageservers are running). It may make sense to give nodes - // a Pending state to accomodate this situation, and allow (but deprioritize) - // scheduling on Pending nodes. 
- //node.availability = NodeAvailability::Offline; + // Startup reconciliation does I/O to other services: whether they + // are responsive or not, we should aim to finish within our deadline, because: + // - If we don't, a k8s readiness hook watching /ready will kill us. + // - While we're waiting for startup reconciliation, we are not fully + // available for end user operations like creating/deleting tenants and timelines. + // + // We set multiple deadlines to break up the time available between the phases of work: this is + // arbitrary, but avoids a situation where the first phase could burn our entire timeout period. + let start_at = Instant::now(); + let node_scan_deadline = start_at + .checked_add(STARTUP_RECONCILE_TIMEOUT / 2) + .expect("Reconcile timeout is a modest constant"); + + let compute_notify_deadline = start_at + .checked_add((STARTUP_RECONCILE_TIMEOUT / 4) * 3) + .expect("Reconcile timeout is a modest constant"); + + // Accumulate a list of any tenant locations that ought to be detached + let mut cleanup = Vec::new(); + + let node_listings = self.scan_node_locations(node_scan_deadline).await; + for (node_id, list_response) in node_listings { + let tenant_shards = list_response.tenant_shards; + tracing::info!( + "Received {} shard statuses from pageserver {}, setting it to Active", + tenant_shards.len(), + node_id + ); + nodes_online.insert(node_id); + + for (tenant_shard_id, conf_opt) in tenant_shards { + let shard_observations = observed.entry(tenant_shard_id).or_default(); + shard_observations.push((node_id, conf_opt)); + } + } + + // List of tenants for which we will attempt to notify compute of their location at startup + let mut compute_notifications = Vec::new(); + + // Populate intent and observed states for all tenants, based on reported state on pageservers + let shard_count = { + let mut locked = self.inner.write().unwrap(); + let (nodes, tenants, scheduler) = locked.parts_mut(); + + // Mark nodes online if they responded to us: nodes 
are offline by default after a restart. + let mut new_nodes = (**nodes).clone(); + for (node_id, node) in new_nodes.iter_mut() { + if nodes_online.contains(node_id) { + node.set_availability(NodeAvailability::Active); + scheduler.node_upsert(node); } - Ok(listing) => { - tracing::info!( - "Received {} shard statuses from pageserver {}, setting it to Active", - listing.tenant_shards.len(), - node.id - ); - node.availability = NodeAvailability::Active; + } + *nodes = Arc::new(new_nodes); - for (tenant_shard_id, conf_opt) in listing.tenant_shards { - observed.insert(tenant_shard_id, (node.id, conf_opt)); + for (tenant_shard_id, shard_observations) in observed { + for (node_id, observed_loc) in shard_observations { + let Some(tenant_state) = tenants.get_mut(&tenant_shard_id) else { + cleanup.push((tenant_shard_id, node_id)); + continue; + }; + tenant_state + .observed + .locations + .insert(node_id, ObservedStateLocation { conf: observed_loc }); + } + } + + // Populate each tenant's intent state + for (tenant_shard_id, tenant_state) in tenants.iter_mut() { + tenant_state.intent_from_observed(scheduler); + if let Err(e) = tenant_state.schedule(scheduler) { + // Non-fatal error: we are unable to properly schedule the tenant, perhaps because + // not enough pageservers are available. The tenant may well still be available + // to clients. + tracing::error!("Failed to schedule tenant {tenant_shard_id} at startup: {e}"); + } else { + // If we're both intending and observed to be attached at a particular node, we will + // emit a compute notification for this. In the case where our observed state does not + // yet match our intent, we will eventually reconcile, and that will emit a compute notification. 
+ if let Some(attached_at) = tenant_state.stably_attached() { + compute_notifications.push(( + *tenant_shard_id, + attached_at, + tenant_state.shard.stripe_size, + )); } } } - } - let mut cleanup = Vec::new(); + tenants.len() + }; - // Populate intent and observed states for all tenants, based on reported state on pageservers - for (tenant_shard_id, (node_id, observed_loc)) in observed { - let Some(tenant_state) = tenants.get_mut(&tenant_shard_id) else { - cleanup.push((tenant_shard_id, node_id)); - continue; - }; + // TODO: if any tenant's intent now differs from its loaded generation_pageserver, we should clear that + // generation_pageserver in the database. - tenant_state - .observed - .locations - .insert(node_id, ObservedStateLocation { conf: observed_loc }); - } + // Emit compute hook notifications for all tenants which are already stably attached. Other tenants + // will emit compute hook notifications when they reconcile. + // + // Ordering: we must complete these notification attempts before doing any other reconciliation for the + // tenants named here, because otherwise our calls to notify() might race with more recent values + // generated by reconciliation. + let notify_failures = self + .compute_notify_many(compute_notifications, compute_notify_deadline) + .await; - // State of nodes is now frozen, transform to a HashMap. - let mut nodes: HashMap = nodes.into_iter().map(|n| (n.id, n)).collect(); - - // Populate each tenant's intent state - let mut scheduler = Scheduler::new(&tenants, &nodes); - for (tenant_shard_id, tenant_state) in tenants.iter_mut() { - tenant_state.intent_from_observed(); - if let Err(e) = tenant_state.schedule(&mut scheduler) { - // Non-fatal error: we are unable to properly schedule the tenant, perhaps because - // not enough pageservers are available. The tenant may well still be available - // to clients. - tracing::error!("Failed to schedule tenant {tenant_shard_id} at startup: {e}"); + // Compute notify is fallible. 
If it fails here, do not delay overall startup: set the + // flag on these shards that they have a pending notification. + // Update tenant state for any that failed to do their initial compute notify, so that they'll retry later. + { + let mut locked = self.inner.write().unwrap(); + for tenant_shard_id in notify_failures.into_iter() { + if let Some(shard) = locked.tenants.get_mut(&tenant_shard_id) { + shard.pending_compute_notification = true; + } } } - // Clean up any tenants that were found on pageservers but are not known to us. + // Finally, now that the service is up and running, launch reconcile operations for any tenants + // which require it: under normal circumstances this should only include tenants that were in some + // transient state before we restarted, or any tenants whose compute hooks failed above. + let reconcile_tasks = self.reconcile_all(); + // We will not wait for these reconciliation tasks to run here: we're now done with startup and + // normal operations may proceed. + + // Clean up any tenants that were found on pageservers but are not known to us. Do this in the + // background because it does not need to complete in order to proceed with other work. + if !cleanup.is_empty() { + tracing::info!("Cleaning up {} locations in the background", cleanup.len()); + tokio::task::spawn({ + let cleanup_self = self.clone(); + async move { cleanup_self.cleanup_locations(cleanup).await } + }); + } + + tracing::info!("Startup complete, spawned {reconcile_tasks} reconciliation tasks ({shard_count} shards total)"); + } + + /// Used during [`Self::startup_reconcile`]: issue GETs to all nodes concurrently, with a deadline. 
+ /// + /// The result includes only nodes which responded within the deadline + async fn scan_node_locations( + &self, + deadline: Instant, + ) -> HashMap { + let nodes = { + let locked = self.inner.read().unwrap(); + locked.nodes.clone() + }; + + let mut node_results = HashMap::new(); + + let mut node_list_futs = FuturesUnordered::new(); + + for node in nodes.values() { + node_list_futs.push({ + async move { + tracing::info!("Scanning shards on node {node}..."); + let timeout = Duration::from_secs(5); + let response = node + .with_client_retries( + |client| async move { client.list_location_config().await }, + &self.config.jwt_token, + 1, + 5, + timeout, + &self.cancel, + ) + .await; + (node.get_id(), response) + } + }); + } + + loop { + let (node_id, result) = tokio::select! { + next = node_list_futs.next() => { + match next { + Some(result) => result, + None =>{ + // We got results for all our nodes + break; + } + + } + }, + _ = tokio::time::sleep(deadline.duration_since(Instant::now())) => { + // Give up waiting for anyone who hasn't responded: we will yield the results that we have + tracing::info!("Reached deadline while waiting for nodes to respond to location listing requests"); + break; + } + }; + + let Some(list_response) = result else { + tracing::info!("Shutdown during startup_reconcile"); + break; + }; + + match list_response { + Err(e) => { + tracing::warn!("Could not scan node {} ({e})", node_id); + } + Ok(listing) => { + node_results.insert(node_id, listing); + } + } + } + + node_results + } + + /// Used during [`Self::startup_reconcile`]: detach a list of unknown-to-us tenants from pageservers. + /// + /// This is safe to run in the background, because if we don't have this TenantShardId in our map of + /// tenants, then it is probably something incompletely deleted before: we will not fight with any + /// other task trying to attach it. 
+ #[instrument(skip_all)] + async fn cleanup_locations(&self, cleanup: Vec<(TenantShardId, NodeId)>) { + let nodes = self.inner.read().unwrap().nodes.clone(); + for (tenant_shard_id, node_id) in cleanup { // A node reported a tenant_shard_id which is unknown to us: detach it. - let node = nodes - .get_mut(&node_id) - .expect("Always exists: only known nodes are scanned"); + let Some(node) = nodes.get(&node_id) else { + // This is legitimate; we run in the background and [`Self::startup_reconcile`] might have identified + // a location to clean up on a node that has since been removed. + tracing::info!( + "Not cleaning up location {node_id}/{tenant_shard_id}: node not found" + ); + continue; + }; - let client = mgmt_api::Client::new(node.base_url(), config.jwt_token.as_deref()); + if self.cancel.is_cancelled() { + break; + } + + let client = mgmt_api::Client::new(node.base_url(), self.config.jwt_token.as_deref()); match client .location_config( tenant_shard_id, @@ -229,11 +442,12 @@ impl Service { generation: None, secondary_conf: None, shard_number: tenant_shard_id.shard_number.0, - shard_count: tenant_shard_id.shard_count.0, + shard_count: tenant_shard_id.shard_count.literal(), shard_stripe_size: 0, tenant_conf: models::TenantConfig::default(), }, None, + false, ) .await { @@ -251,76 +465,300 @@ impl Service { } } } + } + + /// Used during [`Self::startup_reconcile`]: issue many concurrent compute notifications. + /// + /// Returns a set of any shards for which notifications were not acked within the deadline. + async fn compute_notify_many( + &self, + notifications: Vec<(TenantShardId, NodeId, ShardStripeSize)>, + deadline: Instant, + ) -> HashSet { + let attempt_shards = notifications.iter().map(|i| i.0).collect::>(); + let mut success_shards = HashSet::new(); + + // Construct an async stream of futures to invoke the compute notify function: we do this + // in order to subsequently use .buffered() on the stream to execute with bounded parallelism.
+ let mut stream = futures::stream::iter(notifications.into_iter()) + .map(|(tenant_shard_id, node_id, stripe_size)| { + let compute_hook = self.compute_hook.clone(); + let cancel = self.cancel.clone(); + async move { + if let Err(e) = compute_hook + .notify(tenant_shard_id, node_id, stripe_size, &cancel) + .await + { + tracing::error!( + %tenant_shard_id, + %node_id, + "Failed to notify compute on startup for shard: {e}" + ); + None + } else { + Some(tenant_shard_id) + } + } + }) + .buffered(compute_hook::API_CONCURRENCY); + + loop { + tokio::select! { + next = stream.next() => { + match next { + Some(Some(success_shard)) => { + // A notification succeeded + success_shards.insert(success_shard); + }, + Some(None) => { + // A notification that failed + }, + None => { + tracing::info!("Successfully sent all compute notifications"); + break; + } + } + }, + _ = tokio::time::sleep(deadline.duration_since(Instant::now())) => { + // Give up sending any that didn't succeed yet + tracing::info!("Reached deadline while sending compute notifications"); + break; + } + }; + } + + attempt_shards + .difference(&success_shards) + .cloned() + .collect() + } + + /// Long running background task that periodically wakes up and looks for shards that need + /// reconciliation. Reconciliation is fallible, so any reconciliation tasks that fail during + /// e.g. a tenant create/attach/migrate must eventually be retried: this task is responsible + /// for those retries. + #[instrument(skip_all)] + async fn background_reconcile(&self) { + self.startup_complete.clone().wait().await; + + const BACKGROUND_RECONCILE_PERIOD: Duration = Duration::from_secs(20); + + let mut interval = tokio::time::interval(BACKGROUND_RECONCILE_PERIOD); + while !self.cancel.is_cancelled() { + tokio::select! 
{ + _ = interval.tick() => { self.reconcile_all(); } + _ = self.cancel.cancelled() => return + } + } + } + + /// Apply the contents of a [`ReconcileResult`] to our in-memory state: if the reconciliation + /// was successful, this will update the observed state of the tenant such that subsequent + /// calls to [`TenantState::maybe_reconcile`] will do nothing. + #[instrument(skip_all, fields( + tenant_id=%result.tenant_shard_id.tenant_id, shard_id=%result.tenant_shard_id.shard_slug(), + sequence=%result.sequence + ))] + fn process_result(&self, result: ReconcileResult) { + let mut locked = self.inner.write().unwrap(); + let Some(tenant) = locked.tenants.get_mut(&result.tenant_shard_id) else { + // A reconciliation result might race with removing a tenant: drop results for + // tenants that aren't in our map. + return; + }; + + // Usually generation should only be updated via this path, so the max() isn't + // needed, but it is used to handle out-of-band updates via. e.g. test hook. + tenant.generation = std::cmp::max(tenant.generation, result.generation); + + // If the reconciler signals that it failed to notify compute, set this state on + // the shard so that a future [`TenantState::maybe_reconcile`] will try again. + tenant.pending_compute_notification = result.pending_compute_notification; + + // Let the TenantState know it is idle. + tenant.reconcile_complete(result.sequence); + + match result.result { + Ok(()) => { + for (node_id, loc) in &result.observed.locations { + if let Some(conf) = &loc.conf { + tracing::info!("Updating observed location {}: {:?}", node_id, conf); + } else { + tracing::info!("Setting observed location {} to None", node_id,) + } + } + tenant.observed = result.observed; + tenant.waiter.advance(result.sequence); + } + Err(e) => { + tracing::warn!("Reconcile error: {}", e); + + // Ordering: populate last_error before advancing error_seq, + // so that waiters will see the correct error after waiting. 
+ *(tenant.last_error.lock().unwrap()) = format!("{e}"); + tenant.error_waiter.advance(result.sequence); + + for (node_id, o) in result.observed.locations { + tenant.observed.locations.insert(node_id, o); + } + } + } + } + + async fn process_results( + &self, + mut result_rx: tokio::sync::mpsc::UnboundedReceiver, + ) { + loop { + // Wait for the next result, or for cancellation + let result = tokio::select! { + r = result_rx.recv() => { + match r { + Some(result) => {result}, + None => {break;} + } + } + _ = self.cancel.cancelled() => { + break; + } + }; + + self.process_result(result); + } + } + + pub async fn spawn(config: Config, persistence: Arc) -> anyhow::Result> { + let (result_tx, result_rx) = tokio::sync::mpsc::unbounded_channel(); + + tracing::info!("Loading nodes from database..."); + let nodes = persistence + .list_nodes() + .await? + .into_iter() + .map(Node::from_persistent) + .collect::>(); + let nodes: HashMap = nodes.into_iter().map(|n| (n.get_id(), n)).collect(); + tracing::info!("Loaded {} nodes from database.", nodes.len()); + + tracing::info!("Loading shards from database..."); + let tenant_shard_persistence = persistence.list_tenant_shards().await?; + tracing::info!( + "Loaded {} shards from database.", + tenant_shard_persistence.len() + ); + + let mut tenants = BTreeMap::new(); + + let mut scheduler = Scheduler::new(nodes.values()); + + #[cfg(feature = "testing")] + { + // Hack: insert scheduler state for all nodes referenced by shards, as compatibility + // tests only store the shards, not the nodes. The nodes will be loaded shortly + // after when pageservers start up and register. 
+ let mut node_ids = HashSet::new(); + for tsp in &tenant_shard_persistence { + if let Some(node_id) = tsp.generation_pageserver { + node_ids.insert(node_id); + } + } + for node_id in node_ids { + tracing::info!("Creating node {} in scheduler for tests", node_id); + let node = Node::new( + NodeId(node_id as u64), + "".to_string(), + 123, + "".to_string(), + 123, + ); + + scheduler.node_upsert(&node); + } + } + for tsp in tenant_shard_persistence { + let tenant_shard_id = TenantShardId { + tenant_id: TenantId::from_str(tsp.tenant_id.as_str())?, + shard_number: ShardNumber(tsp.shard_number as u8), + shard_count: ShardCount::new(tsp.shard_count as u8), + }; + let shard_identity = if tsp.shard_count == 0 { + ShardIdentity::unsharded() + } else { + ShardIdentity::new( + ShardNumber(tsp.shard_number as u8), + ShardCount::new(tsp.shard_count as u8), + ShardStripeSize(tsp.shard_stripe_size as u32), + )? + }; + + // We will populate intent properly later in [`Self::startup_reconcile`], initially populate + // it with what we can infer: the node for which a generation was most recently issued. 
+ let mut intent = IntentState::new(); + if let Some(generation_pageserver) = tsp.generation_pageserver { + intent.set_attached(&mut scheduler, Some(NodeId(generation_pageserver as u64))); + } + + let new_tenant = TenantState { + tenant_shard_id, + shard: shard_identity, + sequence: Sequence::initial(), + generation: tsp.generation.map(|g| Generation::new(g as u32)), + policy: serde_json::from_str(&tsp.placement_policy).unwrap(), + intent, + observed: ObservedState::new(), + config: serde_json::from_str(&tsp.config).unwrap(), + reconciler: None, + splitting: tsp.splitting, + waiter: Arc::new(SeqWait::new(Sequence::initial())), + error_waiter: Arc::new(SeqWait::new(Sequence::initial())), + last_error: Arc::default(), + pending_compute_notification: false, + }; + + tenants.insert(tenant_shard_id, new_tenant); + } + + let (startup_completion, startup_complete) = utils::completion::channel(); - let shard_count = tenants.len(); let this = Arc::new(Self { inner: Arc::new(std::sync::RwLock::new(ServiceState::new( - result_tx, nodes, tenants, + nodes, tenants, scheduler, ))), - config, + config: config.clone(), persistence, + compute_hook: Arc::new(ComputeHook::new(config)), + result_tx, + startup_complete: startup_complete.clone(), + cancel: CancellationToken::new(), + gate: Gate::default(), }); let result_task_this = this.clone(); tokio::task::spawn(async move { - while let Some(result) = result_rx.recv().await { - tracing::info!( - "Reconcile result for sequence {}, ok={}", - result.sequence, - result.result.is_ok() - ); - let mut locked = result_task_this.inner.write().unwrap(); - let Some(tenant) = locked.tenants.get_mut(&result.tenant_shard_id) else { - // A reconciliation result might race with removing a tenant: drop results for - // tenants that aren't in our map. - continue; - }; - - // Usually generation should only be updated via this path, so the max() isn't - // needed, but it is used to handle out-of-band updates via. e.g. test hook. 
- tenant.generation = std::cmp::max(tenant.generation, result.generation); - - match result.result { - Ok(()) => { - for (node_id, loc) in &result.observed.locations { - if let Some(conf) = &loc.conf { - tracing::info!( - "Updating observed location {}: {:?}", - node_id, - conf - ); - } else { - tracing::info!("Setting observed location {} to None", node_id,) - } - } - tenant.observed = result.observed; - tenant.waiter.advance(result.sequence); - } - Err(e) => { - tracing::warn!( - "Reconcile error on tenant {}: {}", - tenant.tenant_shard_id, - e - ); - - // Ordering: populate last_error before advancing error_seq, - // so that waiters will see the correct error after waiting. - *(tenant.last_error.lock().unwrap()) = format!("{e}"); - tenant.error_waiter.advance(result.sequence); - - for (node_id, o) in result.observed.locations { - tenant.observed.locations.insert(node_id, o); - } - } - } + // Block shutdown until we're done (we must respect self.cancel) + if let Ok(_gate) = result_task_this.gate.enter() { + result_task_this.process_results(result_rx).await } }); - // Finally, now that the service is up and running, launch reconcile operations for any tenants - // which require it: under normal circumstances this should only include tenants that were in some - // transient state before we restarted. - let reconcile_tasks = this.reconcile_all(); - tracing::info!("Startup complete, spawned {reconcile_tasks} reconciliation tasks ({shard_count} shards total)"); + tokio::task::spawn({ + let this = this.clone(); + // We will block the [`Service::startup_complete`] barrier until [`Self::startup_reconcile`] + // is done. 
+ let startup_completion = startup_completion.clone(); + async move { + // Block shutdown until we're done (we must respect self.cancel) + let Ok(_gate) = this.gate.enter() else { + return; + }; + + this.startup_reconcile().await; + + drop(startup_completion); + + this.background_reconcile().await; + } + }); Ok(this) } @@ -336,30 +774,47 @@ impl Service { let locked = self.inner.write().unwrap(); !locked.tenants.contains_key(&attach_req.tenant_shard_id) }; - if insert { let tsp = TenantShardPersistence { tenant_id: attach_req.tenant_shard_id.tenant_id.to_string(), shard_number: attach_req.tenant_shard_id.shard_number.0 as i32, - shard_count: attach_req.tenant_shard_id.shard_count.0 as i32, + shard_count: attach_req.tenant_shard_id.shard_count.literal() as i32, shard_stripe_size: 0, - generation: 0, + generation: Some(0), generation_pageserver: None, - placement_policy: serde_json::to_string(&PlacementPolicy::default()).unwrap(), + placement_policy: serde_json::to_string(&PlacementPolicy::Single).unwrap(), config: serde_json::to_string(&TenantConfig::default()).unwrap(), + splitting: SplitState::default(), }; - self.persistence.insert_tenant_shards(vec![tsp]).await?; + match self.persistence.insert_tenant_shards(vec![tsp]).await { + Err(e) => match e { + DatabaseError::Query(diesel::result::Error::DatabaseError( + DatabaseErrorKind::UniqueViolation, + _, + )) => { + tracing::info!( + "Raced with another request to insert tenant {}", + attach_req.tenant_shard_id + ) + } + _ => return Err(e.into()), + }, + Ok(()) => { + tracing::info!("Inserted shard {} in database", attach_req.tenant_shard_id); - let mut locked = self.inner.write().unwrap(); - locked.tenants.insert( - attach_req.tenant_shard_id, - TenantState::new( - attach_req.tenant_shard_id, - ShardIdentity::unsharded(), - PlacementPolicy::Single, - ), - ); + let mut locked = self.inner.write().unwrap(); + locked.tenants.insert( + attach_req.tenant_shard_id, + TenantState::new( + attach_req.tenant_shard_id, + 
ShardIdentity::unsharded(), + PlacementPolicy::Single, + ), + ); + tracing::info!("Inserted shard {} in memory", attach_req.tenant_shard_id); + } + } } let new_generation = if let Some(req_node_id) = attach_req.node_id { @@ -374,16 +829,17 @@ impl Service { }; let mut locked = self.inner.write().unwrap(); - let tenant_state = locked - .tenants + let (_nodes, tenants, scheduler) = locked.parts_mut(); + + let tenant_state = tenants .get_mut(&attach_req.tenant_shard_id) .expect("Checked for existence above"); if let Some(new_generation) = new_generation { - tenant_state.generation = new_generation; + tenant_state.generation = Some(new_generation); } else { // This is a detach notification. We must update placement policy to avoid re-attaching - // during background scheduling/reconciliation, or during attachment service restart. + // during background scheduling/reconciliation, or during storage controller restart. assert!(attach_req.node_id.is_none()); tenant_state.policy = PlacementPolicy::Detached; } @@ -395,7 +851,7 @@ impl Service { generation = ?tenant_state.generation, "issuing", ); - } else if let Some(ps_id) = tenant_state.intent.attached { + } else if let Some(ps_id) = tenant_state.intent.get_attached() { tracing::info!( tenant_id = %attach_req.tenant_shard_id, %ps_id, @@ -407,7 +863,9 @@ impl Service { tenant_id = %attach_req.tenant_shard_id, "no-op: tenant already has no pageserver"); } - tenant_state.intent.attached = attach_req.node_id; + tenant_state + .intent + .set_attached(scheduler, attach_req.node_id); tracing::info!( "attach_hook: tenant {} set generation {:?}, pageserver {}", @@ -417,10 +875,32 @@ impl Service { attach_req.node_id.unwrap_or(utils::id::NodeId(0xfffffff)) ); + // Trick the reconciler into not doing anything for this tenant: this helps + // tests that manually configure a tenant on the pageserver, and then call this + // attach hook: they don't want background reconciliation to modify what they + // did to the pageserver.
+ #[cfg(feature = "testing")] + { + if let Some(node_id) = attach_req.node_id { + tenant_state.observed.locations = HashMap::from([( + node_id, + ObservedStateLocation { + conf: Some(attached_location_conf( + tenant_state.generation.unwrap(), + &tenant_state.shard, + &tenant_state.config, + )), + }, + )]); + } else { + tenant_state.observed.locations.clear(); + } + } + Ok(AttachHookResponse { gen: attach_req .node_id - .map(|_| tenant_state.generation.into().unwrap()), + .map(|_| tenant_state.generation.expect("Test hook, not used on tenants that are mid-onboarding with a NULL generation").into().unwrap()), }) } @@ -432,8 +912,8 @@ impl Service { InspectResponse { attachment: tenant_state.and_then(|s| { s.intent - .attached - .map(|ps| (s.generation.into().unwrap(), ps)) + .get_attached() + .map(|ps| (s.generation.expect("Test hook, not used on tenants that are mid-onboarding with a NULL generation").into().unwrap(), ps)) }), } } @@ -441,10 +921,29 @@ impl Service { pub(crate) async fn re_attach( &self, reattach_req: ReAttachRequest, - ) -> anyhow::Result { + ) -> Result { + if let Some(register_req) = reattach_req.register { + self.node_register(register_req).await?; + } + + // Take a re-attach as indication that the node is available: this is a precursor to proper + // heartbeating in https://github.com/neondatabase/neon/issues/6844 + self.node_configure(NodeConfigureRequest { + node_id: reattach_req.node_id, + availability: Some(NodeAvailability::Active), + scheduling: None, + }) + .await?; + // Ordering: we must persist generation number updates before making them visible in the in-memory state let incremented_generations = self.persistence.re_attach(reattach_req.node_id).await?; + tracing::info!( + node_id=%reattach_req.node_id, + "Incremented {} tenant shards' generations", + incremented_generations.len() + ); + // Apply the updated generation to our in-memory state let mut locked = self.inner.write().unwrap(); @@ -457,7 +956,6 @@ impl Service { id: 
tenant_shard_id, gen: new_gen.into().unwrap(), }); - // Apply the new generation number to our in-memory state let shard_state = locked.tenants.get_mut(&tenant_shard_id); let Some(shard_state) = shard_state else { @@ -474,7 +972,34 @@ impl Service { continue; }; - shard_state.generation = std::cmp::max(shard_state.generation, new_gen); + // If [`Persistence::re_attach`] selected this shard, it must have already + // had a generation set. + debug_assert!(shard_state.generation.is_some()); + let Some(old_gen) = shard_state.generation else { + // Should never happen: would only return incremented generation + // for a tenant that already had a non-null generation. + return Err(ApiError::InternalServerError(anyhow::anyhow!( + "Generation must be set while re-attaching" + ))); + }; + shard_state.generation = Some(std::cmp::max(old_gen, new_gen)); + if let Some(observed) = shard_state + .observed + .locations + .get_mut(&reattach_req.node_id) + { + if let Some(conf) = observed.conf.as_mut() { + conf.generation = new_gen.into(); + } + } else { + // This node has no observed state for the shard: perhaps it was offline + // when the pageserver restarted. Insert a None, so that the Reconciler + // will be prompted to learn the location's state before it makes changes. + shard_state + .observed + .locations + .insert(reattach_req.node_id, ObservedStateLocation { conf: None }); + } // TODO: cancel/restart any running reconciliation for this tenant, it might be trying // to call location_conf API with an old generation.
Wait for cancellation to complete @@ -495,7 +1020,7 @@ impl Service { for req_tenant in validate_req.tenants { if let Some(tenant_state) = locked.tenants.get(&req_tenant.id) { - let valid = tenant_state.generation == Generation::new(req_tenant.gen); + let valid = tenant_state.generation == Some(Generation::new(req_tenant.gen)); tracing::info!( "handle_validate: {}(gen {}): valid={valid} (latest {:?})", req_tenant.id, @@ -506,6 +1031,14 @@ impl Service { id: req_tenant.id, valid, }); + } else { + // After tenant deletion, we may approve any validation. This avoids + // spurious warnings on the pageserver if it has pending LSN updates + // at the point a deletion happens. + response.tenants.push(ValidateResponseTenant { + id: req_tenant.id, + valid: true, + }); } } response @@ -515,16 +1048,25 @@ impl Service { &self, create_req: TenantCreateRequest, ) -> Result { - // Shard count 0 is valid: it means create a single shard (ShardCount(0) means "unsharded") - let literal_shard_count = if create_req.shard_parameters.is_unsharded() { - 1 - } else { - create_req.shard_parameters.count.0 - }; + let (response, waiters) = self.do_tenant_create(create_req).await?; + + self.await_waiters(waiters, SHORT_RECONCILE_TIMEOUT).await?; + Ok(response) + } + + pub(crate) async fn do_tenant_create( + &self, + create_req: TenantCreateRequest, + ) -> Result<(TenantCreateResponse, Vec), ApiError> { + // As a default, single is convenient for tests that don't choose a policy. + let placement_policy = create_req + .placement_policy + .clone() + .unwrap_or(PlacementPolicy::Single); // This service expects to handle sharding itself: it is an error to try and directly create // a particular shard here. 
- let tenant_id = if create_req.new_tenant_id.shard_count > ShardCount(1) { + let tenant_id = if !create_req.new_tenant_id.is_unsharded() { return Err(ApiError::BadRequest(anyhow::anyhow!( "Attempted to create a specific shard, this API is for creating the whole tenant" ))); @@ -538,7 +1080,7 @@ impl Service { create_req.shard_parameters.count, ); - let create_ids = (0..literal_shard_count) + let create_ids = (0..create_req.shard_parameters.count.count()) .map(|i| TenantShardId { tenant_id, shard_number: ShardNumber(i), @@ -546,9 +1088,27 @@ impl Service { }) .collect::>(); - // TODO: enable specifying this. Using Single as a default helps legacy tests to work (they - // have no expectation of HA). - let placement_policy: PlacementPolicy = PlacementPolicy::Single; + // If the caller specifies a None generation, it means "start from default". This is different + // to [`Self::tenant_location_config`], where a None generation is used to represent + // an incompletely-onboarded tenant. + let initial_generation = if matches!(placement_policy, PlacementPolicy::Secondary) { + tracing::info!( + "tenant_create: secondary mode, generation is_some={}", + create_req.generation.is_some() + ); + create_req.generation.map(Generation::new) + } else { + tracing::info!( + "tenant_create: not secondary mode, generation is_some={}", + create_req.generation.is_some() + ); + Some( + create_req + .generation + .map(Generation::new) + .unwrap_or(INITIAL_GENERATION), + ) + }; // Ordering: we persist tenant shards before creating them on the pageserver. 
This enables a caller // to clean up after themselves by issuing a tenant deletion if something goes wrong and we restart @@ -558,12 +1118,15 @@ impl Service { .map(|tenant_shard_id| TenantShardPersistence { tenant_id: tenant_shard_id.tenant_id.to_string(), shard_number: tenant_shard_id.shard_number.0 as i32, - shard_count: tenant_shard_id.shard_count.0 as i32, + shard_count: tenant_shard_id.shard_count.literal() as i32, shard_stripe_size: create_req.shard_parameters.stripe_size.0 as i32, - generation: 0, + generation: initial_generation.map(|g| g.into().unwrap() as i32), + // The pageserver is not known until scheduling happens: we will set this column when + // incrementing the generation the first time we attach to a pageserver. generation_pageserver: None, placement_policy: serde_json::to_string(&placement_policy).unwrap(), config: serde_json::to_string(&create_req.config).unwrap(), + splitting: SplitState::default(), }) .collect(); self.persistence @@ -576,16 +1139,16 @@ impl Service { let (waiters, response_shards) = { let mut locked = self.inner.write().unwrap(); + let (nodes, tenants, scheduler) = locked.parts_mut(); let mut response_shards = Vec::new(); - - let mut scheduler = Scheduler::new(&locked.tenants, &locked.nodes); + let mut schcedule_error = None; for tenant_shard_id in create_ids { tracing::info!("Creating shard {tenant_shard_id}..."); use std::collections::btree_map::Entry; - match locked.tenants.entry(tenant_shard_id) { + match tenants.entry(tenant_shard_id) { Entry::Occupied(mut entry) => { tracing::info!( "Tenant shard {tenant_shard_id} already exists while creating" @@ -595,114 +1158,491 @@ impl Service { // attached and secondary locations (independently) away frorm those // pageservers also holding a shard for this tenant. 
- entry.get_mut().schedule(&mut scheduler).map_err(|e| { + entry.get_mut().schedule(scheduler).map_err(|e| { ApiError::Conflict(format!( "Failed to schedule shard {tenant_shard_id}: {e}" )) })?; - response_shards.push(TenantCreateResponseShard { - node_id: entry + if let Some(node_id) = entry.get().intent.get_attached() { + let generation = entry .get() - .intent - .attached - .expect("We just set pageserver if it was None"), - generation: entry.get().generation.into().unwrap(), - }); + .generation + .expect("Generation is set when in attached mode"); + response_shards.push(TenantCreateResponseShard { + shard_id: tenant_shard_id, + node_id: *node_id, + generation: generation.into().unwrap(), + }); + } continue; } Entry::Vacant(entry) => { - let mut state = TenantState::new( + let state = entry.insert(TenantState::new( tenant_shard_id, ShardIdentity::from_params( tenant_shard_id.shard_number, &create_req.shard_parameters, ), placement_policy.clone(), - ); + )); - if let Some(create_gen) = create_req.generation { - state.generation = Generation::new(create_gen); - } + state.generation = initial_generation; state.config = create_req.config.clone(); + if let Err(e) = state.schedule(scheduler) { + schcedule_error = Some(e); + } - state.schedule(&mut scheduler).map_err(|e| { - ApiError::Conflict(format!( - "Failed to schedule shard {tenant_shard_id}: {e}" - )) - })?; - - response_shards.push(TenantCreateResponseShard { - node_id: state - .intent - .attached - .expect("We just set pageserver if it was None"), - generation: state.generation.into().unwrap(), - }); - entry.insert(state) + // Only include shards in result if we are attaching: the purpose + // of the response is to tell the caller where the shards are attached. 
+ if let Some(node_id) = state.intent.get_attached() { + let generation = state + .generation + .expect("Generation is set when in attached mode"); + response_shards.push(TenantCreateResponseShard { + shard_id: tenant_shard_id, + node_id: *node_id, + generation: generation.into().unwrap(), + }); + } } }; } - // Take a snapshot of pageservers - let pageservers = locked.nodes.clone(); + // If we failed to schedule shards, then they are still created in the controller, + // but we return an error to the requester to avoid a silent failure when someone + // tries to e.g. create a tenant whose placement policy requires more nodes than + // are present in the system. We do this here rather than in the above loop, to + // avoid situations where we only create a subset of shards in the tenant. + if let Some(e) = schcedule_error { + return Err(ApiError::Conflict(format!( + "Failed to schedule shard(s): {e}" + ))); + } - let result_tx = locked.result_tx.clone(); - let compute_hook = locked.compute_hook.clone(); - - let waiters = locked - .tenants + let waiters = tenants .range_mut(TenantShardId::tenant_range(tenant_id)) - .filter_map(|(_shard_id, shard)| { - shard.maybe_reconcile( - result_tx.clone(), - &pageservers, - &compute_hook, - &self.config, - &self.persistence, - ) - }) + .filter_map(|(_shard_id, shard)| self.maybe_reconcile_shard(shard, nodes)) .collect::>(); (waiters, response_shards) }; - let deadline = Instant::now().checked_add(Duration::from_secs(5)).unwrap(); + Ok(( + TenantCreateResponse { + shards: response_shards, + }, + waiters, + )) + } + + /// Helper for functions that reconcile a number of shards, and would like to do a timeout-bounded + /// wait for reconciliation to complete before responding. 
+ async fn await_waiters( + &self, + waiters: Vec, + timeout: Duration, + ) -> Result<(), ReconcileWaitError> { + let deadline = Instant::now().checked_add(timeout).unwrap(); for waiter in waiters { let timeout = deadline.duration_since(Instant::now()); waiter.wait_timeout(timeout).await?; } - Ok(TenantCreateResponse { - shards: response_shards, - }) + + Ok(()) } - pub(crate) async fn tenant_timeline_create( + /// Part of [`Self::tenant_location_config`]: dissect an incoming location config request, + /// and transform it into either a tenant creation or a series of shard updates. + fn tenant_location_config_prepare( &self, tenant_id: TenantId, - mut create_req: TimelineCreateRequest, - ) -> Result { - let mut timeline_info = None; + req: TenantLocationConfigRequest, + ) -> TenantCreateOrUpdate { + let mut updates = Vec::new(); + let mut locked = self.inner.write().unwrap(); + let (nodes, tenants, _scheduler) = locked.parts_mut(); - let ensure_waiters = { - let locked = self.inner.write().unwrap(); - tracing::info!( - "Creating timeline {}/{}, have {} pageservers", - tenant_id, - create_req.new_timeline_id, - locked.nodes.len() - ); - - self.ensure_attached(locked, tenant_id) - .map_err(ApiError::InternalServerError)? + // Use location config mode as an indicator of policy. + let placement_policy = match req.config.mode { + LocationConfigMode::Detached => PlacementPolicy::Detached, + LocationConfigMode::Secondary => PlacementPolicy::Secondary, + LocationConfigMode::AttachedMulti + | LocationConfigMode::AttachedSingle + | LocationConfigMode::AttachedStale => { + if nodes.len() > 1 { + PlacementPolicy::Double(1) + } else { + // Convenience for dev/test: if we just have one pageserver, import + // tenants into Single mode so that scheduling will succeed. 
+ PlacementPolicy::Single + } + } }; - let deadline = Instant::now().checked_add(Duration::from_secs(5)).unwrap(); - for waiter in ensure_waiters { - let timeout = deadline.duration_since(Instant::now()); - waiter.wait_timeout(timeout).await?; + let mut create = true; + for (shard_id, shard) in tenants.range_mut(TenantShardId::tenant_range(tenant_id)) { + // Saw an existing shard: this is not a creation + create = false; + + // Shards may have initially been created by a Secondary request, where we + // would have left generation as None. + // + // We only update generation the first time we see an attached-mode request, + // and if there is no existing generation set. The caller is responsible for + // ensuring that no non-storage-controller pageserver ever uses a higher + // generation than they passed in here. + use LocationConfigMode::*; + let set_generation = match req.config.mode { + AttachedMulti | AttachedSingle | AttachedStale if shard.generation.is_none() => { + req.config.generation.map(Generation::new) + } + _ => None, + }; + + if shard.policy != placement_policy + || shard.config != req.config.tenant_conf + || set_generation.is_some() + { + updates.push(ShardUpdate { + tenant_shard_id: *shard_id, + placement_policy: placement_policy.clone(), + tenant_config: req.config.tenant_conf.clone(), + generation: set_generation, + }); + } } + if create { + use LocationConfigMode::*; + let generation = match req.config.mode { + AttachedMulti | AttachedSingle | AttachedStale => req.config.generation, + // If a caller provided a generation in a non-attached request, ignore it + // and leave our generation as None: this enables a subsequent update to set + // the generation when setting an attached mode for the first time. 
+ _ => None, + }; + + TenantCreateOrUpdate::Create( + // Synthesize a creation request + TenantCreateRequest { + new_tenant_id: TenantShardId::unsharded(tenant_id), + generation, + shard_parameters: ShardParameters { + // Must preserve the incoming shard_count to distinguish unsharded (0) + // from single-sharded (1): this distinction appears in the S3 keys of the tenant. + count: req.tenant_id.shard_count, + // We only import un-sharded or single-sharded tenants, so stripe + // size can be made up arbitrarily here. + stripe_size: ShardParameters::DEFAULT_STRIPE_SIZE, + }, + placement_policy: Some(placement_policy), + config: req.config.tenant_conf, + }, + ) + } else { + TenantCreateOrUpdate::Update(updates) + } + } + + /// This API is used by the cloud control plane to migrate unsharded tenants that it created + /// directly with pageservers into this service. + /// + /// Cloud control plane MUST NOT continue issuing GENERATION NUMBERS for this tenant once it + /// has attempted to call this API. Failure to obey this rule may lead to S3 corruption. + /// Think of the first attempt to call this API as a transfer of absolute authority over the + /// tenant's source of generation numbers. + /// + /// The mode in this request provides coarse-grained control of tenants: + /// - Call with mode Attached* to upsert the tenant. 
+ /// - Call with mode Secondary to either onboard a tenant without attaching it, or + /// to set an existing tenant to PlacementPolicy::Secondary + /// - Call with mode Detached to switch to PlacementPolicy::Detached + pub(crate) async fn tenant_location_config( + &self, + tenant_id: TenantId, + req: TenantLocationConfigRequest, + ) -> Result { + if !req.tenant_id.is_unsharded() { + return Err(ApiError::BadRequest(anyhow::anyhow!( + "This API is for importing single-sharded or unsharded tenants" + ))); + } + + // First check if this is a creation or an update + let create_or_update = self.tenant_location_config_prepare(tenant_id, req); + + let mut result = TenantLocationConfigResponse { + shards: Vec::new(), + stripe_size: None, + }; + let waiters = match create_or_update { + TenantCreateOrUpdate::Create(create_req) => { + let (create_resp, waiters) = self.do_tenant_create(create_req).await?; + result.shards = create_resp + .shards + .into_iter() + .map(|s| TenantShardLocation { + node_id: s.node_id, + shard_id: s.shard_id, + }) + .collect(); + waiters + } + TenantCreateOrUpdate::Update(updates) => { + // Persist updates + // Ordering: write to the database before applying changes in-memory, so that + // we will not appear to time-travel backwards on a restart. 
+ for ShardUpdate { + tenant_shard_id, + placement_policy, + tenant_config, + generation, + } in &updates + { + self.persistence + .update_tenant_shard( + *tenant_shard_id, + placement_policy.clone(), + tenant_config.clone(), + *generation, + ) + .await?; + } + + // Apply updates in-memory + let mut waiters = Vec::new(); + { + let mut locked = self.inner.write().unwrap(); + let (nodes, tenants, scheduler) = locked.parts_mut(); + + for ShardUpdate { + tenant_shard_id, + placement_policy, + tenant_config, + generation: update_generation, + } in updates + { + let Some(shard) = tenants.get_mut(&tenant_shard_id) else { + tracing::warn!("Shard {tenant_shard_id} removed while updating"); + continue; + }; + + // Update stripe size + if result.stripe_size.is_none() && shard.shard.count.count() > 1 { + result.stripe_size = Some(shard.shard.stripe_size); + } + + shard.policy = placement_policy; + shard.config = tenant_config; + if let Some(generation) = update_generation { + shard.generation = Some(generation); + } + + shard.schedule(scheduler)?; + + let maybe_waiter = self.maybe_reconcile_shard(shard, nodes); + if let Some(waiter) = maybe_waiter { + waiters.push(waiter); + } + + if let Some(node_id) = shard.intent.get_attached() { + result.shards.push(TenantShardLocation { + shard_id: tenant_shard_id, + node_id: *node_id, + }) + } + } + } + waiters + } + }; + + if let Err(e) = self.await_waiters(waiters, SHORT_RECONCILE_TIMEOUT).await { + // Do not treat a reconcile error as fatal: we have already applied any requested + // Intent changes, and the reconcile can fail for external reasons like unavailable + // compute notification API. In these cases, it is important that we do not + // cause the cloud control plane to retry forever on this API. 
+ tracing::warn!( + "Failed to reconcile after /location_config: {e}, returning success anyway" + ); + } + + // Logging the full result is useful because it lets us cross-check what the cloud control + // plane's tenant_shards table should contain. + tracing::info!("Complete, returning {result:?}"); + + Ok(result) + } + + pub(crate) async fn tenant_config_set(&self, req: TenantConfigRequest) -> Result<(), ApiError> { + let tenant_id = req.tenant_id; + let config = req.config; + + self.persistence + .update_tenant_config(req.tenant_id, config.clone()) + .await?; + + let waiters = { + let mut waiters = Vec::new(); + let mut locked = self.inner.write().unwrap(); + let (nodes, tenants, _scheduler) = locked.parts_mut(); + for (_shard_id, shard) in tenants.range_mut(TenantShardId::tenant_range(tenant_id)) { + shard.config = config.clone(); + if let Some(waiter) = self.maybe_reconcile_shard(shard, nodes) { + waiters.push(waiter); + } + } + waiters + }; + + if let Err(e) = self.await_waiters(waiters, SHORT_RECONCILE_TIMEOUT).await { + // Treat this as success because we have stored the configuration. If e.g. + // a node was unavailable at this time, it should not stop us accepting a + // configuration change. + tracing::warn!(%tenant_id, "Accepted configuration update but reconciliation failed: {e}"); + } + + Ok(()) + } + + pub(crate) fn tenant_config_get( + &self, + tenant_id: TenantId, + ) -> Result, ApiError> { + let config = { + let locked = self.inner.read().unwrap(); + + match locked + .tenants + .range(TenantShardId::tenant_range(tenant_id)) + .next() + { + Some((_tenant_shard_id, shard)) => shard.config.clone(), + None => { + return Err(ApiError::NotFound( + anyhow::anyhow!("Tenant not found").into(), + )) + } + } + }; + + // Unlike the pageserver, we do not have a set of global defaults: the config is + // entirely per-tenant. 
Therefore the distinction between `tenant_specific_overrides` + // and `effective_config` in the response is meaningless, but we retain that syntax + // in order to remain compatible with the pageserver API. + + let response = HashMap::from([ + ( + "tenant_specific_overrides", + serde_json::to_value(&config) + .context("serializing tenant specific overrides") + .map_err(ApiError::InternalServerError)?, + ), + ( + "effective_config", + serde_json::to_value(&config) + .context("serializing effective config") + .map_err(ApiError::InternalServerError)?, + ), + ]); + + Ok(response) + } + + pub(crate) async fn tenant_time_travel_remote_storage( + &self, + time_travel_req: &TenantTimeTravelRequest, + tenant_id: TenantId, + timestamp: Cow<'_, str>, + done_if_after: Cow<'_, str>, + ) -> Result<(), ApiError> { + let node = { + let locked = self.inner.read().unwrap(); + // Just a sanity check to prevent misuse: the API expects that the tenant is fully + // detached everywhere, and nothing writes to S3 storage. Here, we verify that, + // but only at the start of the process, so it's really just to prevent operator + // mistakes. 
+ for (shard_id, shard) in locked.tenants.range(TenantShardId::tenant_range(tenant_id)) { + if shard.intent.get_attached().is_some() || !shard.intent.get_secondary().is_empty() + { + return Err(ApiError::InternalServerError(anyhow::anyhow!( + "We want tenant to be attached in shard with tenant_shard_id={shard_id}" + ))); + } + let maybe_attached = shard + .observed + .locations + .iter() + .filter_map(|(node_id, observed_location)| { + observed_location + .conf + .as_ref() + .map(|loc| (node_id, observed_location, loc.mode)) + }) + .find(|(_, _, mode)| *mode != LocationConfigMode::Detached); + if let Some((node_id, _observed_location, mode)) = maybe_attached { + return Err(ApiError::InternalServerError(anyhow::anyhow!("We observed attached={mode:?} tenant in node_id={node_id} shard with tenant_shard_id={shard_id}"))); + } + } + let scheduler = &locked.scheduler; + // Right now we only perform the operation on a single node without parallelization + // TODO fan out the operation to multiple nodes for better performance + let node_id = scheduler.schedule_shard(&[])?; + let node = locked + .nodes + .get(&node_id) + .expect("Pageservers may not be deleted while lock is active"); + node.clone() + }; + + // The shard count is encoded in the remote storage's URL, so we need to handle all historically used shard counts + let mut counts = time_travel_req + .shard_counts + .iter() + .copied() + .collect::>() + .into_iter() + .collect::>(); + counts.sort_unstable(); + + for count in counts { + let shard_ids = (0..count.count()) + .map(|i| TenantShardId { + tenant_id, + shard_number: ShardNumber(i), + shard_count: count, + }) + .collect::>(); + for tenant_shard_id in shard_ids { + let client = + mgmt_api::Client::new(node.base_url(), self.config.jwt_token.as_deref()); + + tracing::info!("Doing time travel recovery for shard {tenant_shard_id}",); + + client + .tenant_time_travel_remote_storage( + tenant_shard_id, + &timestamp, + &done_if_after, + ) + .await + .map_err(|e| { + 
ApiError::InternalServerError(anyhow::anyhow!( + "Error doing time travel recovery for shard {tenant_shard_id} on node {}: {e}", + node + )) + })?; + } + } + Ok(()) + } + + pub(crate) async fn tenant_secondary_download( + &self, + tenant_id: TenantId, + ) -> Result<(), ApiError> { + // Acquire lock and yield the collection of shard-node tuples which we will send requests onward to let targets = { let locked = self.inner.read().unwrap(); let mut targets = Vec::new(); @@ -710,7 +1650,281 @@ impl Service { for (tenant_shard_id, shard) in locked.tenants.range(TenantShardId::tenant_range(tenant_id)) { - let node_id = shard.intent.attached.ok_or_else(|| { + for node_id in shard.intent.get_secondary() { + let node = locked + .nodes + .get(node_id) + .expect("Pageservers may not be deleted while referenced"); + + targets.push((*tenant_shard_id, node.clone())); + } + } + targets + }; + + // TODO: this API, and the underlying pageserver API, should take a timeout argument so that for long running + // downloads, they can return a clean 202 response instead of the HTTP client timing out. + + // Issue concurrent requests to all shards' locations + let mut futs = FuturesUnordered::new(); + for (tenant_shard_id, node) in targets { + let client = mgmt_api::Client::new(node.base_url(), self.config.jwt_token.as_deref()); + futs.push(async move { + let result = client.tenant_secondary_download(tenant_shard_id).await; + (result, node) + }) + } + + // Handle any errors returned by pageservers. This includes cases like this request racing with + // a scheduling operation, such that the tenant shard we're calling doesn't exist on that pageserver any more, as + // well as more general cases like 503s, 500s, or timeouts. 
+ while let Some((result, node)) = futs.next().await { + let Err(e) = result else { continue }; + + // Secondary downloads are always advisory: if something fails, we nevertheless report success, so that whoever + // is calling us will proceed with whatever migration they're doing, albeit with a slightly less warm cache + // than they had hoped for. + tracing::warn!("Ignoring tenant secondary download error from pageserver {node}: {e}",); + } + + Ok(()) + } + + pub(crate) async fn tenant_delete(&self, tenant_id: TenantId) -> Result { + self.ensure_attached_wait(tenant_id).await?; + + // TODO: refactor into helper + let targets = { + let locked = self.inner.read().unwrap(); + let mut targets = Vec::new(); + + for (tenant_shard_id, shard) in + locked.tenants.range(TenantShardId::tenant_range(tenant_id)) + { + let node_id = shard.intent.get_attached().ok_or_else(|| { + ApiError::InternalServerError(anyhow::anyhow!("Shard not scheduled")) + })?; + let node = locked + .nodes + .get(&node_id) + .expect("Pageservers may not be deleted while referenced"); + + targets.push((*tenant_shard_id, node.clone())); + } + targets + }; + + // Phase 1: delete on the pageservers + let mut any_pending = false; + for (tenant_shard_id, node) in targets { + let client = mgmt_api::Client::new(node.base_url(), self.config.jwt_token.as_deref()); + // TODO: this, like many other places, requires proper retry handling for 503, timeout: those should not + // surface immediately as an error to our caller. + let status = client.tenant_delete(tenant_shard_id).await.map_err(|e| { + ApiError::InternalServerError(anyhow::anyhow!( + "Error deleting shard {tenant_shard_id} on node {node}: {e}", + )) + })?; + tracing::info!( + "Shard {tenant_shard_id} on node {node}, delete returned {}", + status + ); + if status == StatusCode::ACCEPTED { + any_pending = true; + } + } + + if any_pending { + // Caller should call us again later. 
When we eventually see 404s from + // all the shards, we may proceed to delete our records of the tenant. + tracing::info!( + "Tenant {} has some shards pending deletion, returning 202", + tenant_id + ); + return Ok(StatusCode::ACCEPTED); + } + + // Fall through: deletion of the tenant on pageservers is complete, we may proceed to drop + // our in-memory state and database state. + + // Ordering: we delete persistent state first: if we then + // crash, we will drop the in-memory state. + + // Drop persistent state. + self.persistence.delete_tenant(tenant_id).await?; + + // Drop in-memory state + { + let mut locked = self.inner.write().unwrap(); + let (_nodes, tenants, scheduler) = locked.parts_mut(); + + // Dereference Scheduler from shards before dropping them + for (_tenant_shard_id, shard) in + tenants.range_mut(TenantShardId::tenant_range(tenant_id)) + { + shard.intent.clear(scheduler); + } + + tenants.retain(|tenant_shard_id, _shard| tenant_shard_id.tenant_id != tenant_id); + tracing::info!( + "Deleted tenant {tenant_id}, now have {} tenants", + locked.tenants.len() + ); + }; + + // Success is represented as 404, to imitate the existing pageserver deletion API + Ok(StatusCode::NOT_FOUND) + } + + pub(crate) async fn tenant_timeline_create( + &self, + tenant_id: TenantId, + mut create_req: TimelineCreateRequest, + ) -> Result { + tracing::info!( + "Creating timeline {}/{}", + tenant_id, + create_req.new_timeline_id, + ); + + self.ensure_attached_wait(tenant_id).await?; + + // TODO: refuse to do this if shard splitting is in progress + // (https://github.com/neondatabase/neon/issues/6676) + let mut targets = { + let locked = self.inner.read().unwrap(); + let mut targets = Vec::new(); + + for (tenant_shard_id, shard) in + locked.tenants.range(TenantShardId::tenant_range(tenant_id)) + { + let node_id = shard.intent.get_attached().ok_or_else(|| { + ApiError::InternalServerError(anyhow::anyhow!("Shard not scheduled")) + })?; + let node = locked + .nodes + 
.get(&node_id) + .expect("Pageservers may not be deleted while referenced"); + + targets.push((*tenant_shard_id, node.clone())); + } + targets + }; + + if targets.is_empty() { + return Err(ApiError::NotFound( + anyhow::anyhow!("Tenant not found").into(), + )); + }; + let shard_zero = targets.remove(0); + + async fn create_one( + tenant_shard_id: TenantShardId, + node: Node, + jwt: Option, + create_req: TimelineCreateRequest, + ) -> Result { + tracing::info!( + "Creating timeline on shard {}/{}, attached to node {node}", + tenant_shard_id, + create_req.new_timeline_id, + ); + let client = mgmt_api::Client::new(node.base_url(), jwt.as_deref()); + + client + .timeline_create(tenant_shard_id, &create_req) + .await + .map_err(|e| match e { + mgmt_api::Error::ApiError(status, msg) + if status == StatusCode::INTERNAL_SERVER_ERROR + || status == StatusCode::NOT_ACCEPTABLE => + { + // TODO: handle more error codes, e.g. 503 should be passed through. Make a general wrapper + // for pass-through API calls. + ApiError::InternalServerError(anyhow::anyhow!(msg)) + } + _ => ApiError::Conflict(format!("Failed to create timeline: {e}")), + }) + } + + // Because the caller might not provide an explicit LSN, we must do the creation first on a single shard, and then + // use whatever LSN that shard picked when creating on subsequent shards. We arbitrarily use shard zero as the shard + // that will get the first creation request, and propagate the LSN to all the >0 shards. 
+ let timeline_info = create_one( + shard_zero.0, + shard_zero.1, + self.config.jwt_token.clone(), + create_req.clone(), + ) + .await?; + + // Propagate the LSN that shard zero picked, if caller didn't provide one + if create_req.ancestor_timeline_id.is_some() && create_req.ancestor_start_lsn.is_none() { + create_req.ancestor_start_lsn = timeline_info.ancestor_lsn; + } + + // Create timeline on remaining shards with number >0 + if !targets.is_empty() { + // If we had multiple shards, issue requests for the remainder now. + let jwt = self.config.jwt_token.clone(); + self.tenant_for_shards(targets, |tenant_shard_id: TenantShardId, node: Node| { + let create_req = create_req.clone(); + Box::pin(create_one(tenant_shard_id, node, jwt.clone(), create_req)) + }) + .await?; + } + + Ok(timeline_info) + } + + /// Helper for concurrently calling a pageserver API on a number of shards, such as timeline creation. + /// + /// On success, the returned vector contains exactly the same number of elements as the input `locations`. 
+ async fn tenant_for_shards( + &self, + locations: Vec<(TenantShardId, Node)>, + mut req_fn: F, + ) -> Result, ApiError> + where + F: FnMut( + TenantShardId, + Node, + ) + -> std::pin::Pin> + Send>>, + { + let mut futs = FuturesUnordered::new(); + let mut results = Vec::with_capacity(locations.len()); + + for (tenant_shard_id, node) in locations { + futs.push(req_fn(tenant_shard_id, node)); + } + + while let Some(r) = futs.next().await { + results.push(r?); + } + + Ok(results) + } + + pub(crate) async fn tenant_timeline_delete( + &self, + tenant_id: TenantId, + timeline_id: TimelineId, + ) -> Result { + tracing::info!("Deleting timeline {}/{}", tenant_id, timeline_id,); + + self.ensure_attached_wait(tenant_id).await?; + + // TODO: refuse to do this if shard splitting is in progress + // (https://github.com/neondatabase/neon/issues/6676) + let mut targets = { + let locked = self.inner.read().unwrap(); + let mut targets = Vec::new(); + + for (tenant_shard_id, shard) in + locked.tenants.range(TenantShardId::tenant_range(tenant_id)) + { + let node_id = shard.intent.get_attached().ok_or_else(|| { ApiError::InternalServerError(anyhow::anyhow!("Shard not scheduled")) })?; let node = locked @@ -728,49 +1942,96 @@ impl Service { anyhow::anyhow!("Tenant not found").into(), )); } + let shard_zero = targets.remove(0); - for (tenant_shard_id, node) in targets { - // TODO: issue shard timeline creates in parallel, once the 0th is done. 
- - let client = mgmt_api::Client::new(node.base_url(), self.config.jwt_token.as_deref()); - + async fn delete_one( + tenant_shard_id: TenantShardId, + timeline_id: TimelineId, + node: Node, + jwt: Option, + ) -> Result { tracing::info!( - "Creating timeline on shard {}/{}, attached to node {}", - tenant_shard_id, - create_req.new_timeline_id, - node.id + "Deleting timeline on shard {tenant_shard_id}/{timeline_id}, attached to node {node}", ); - let shard_timeline_info = client - .timeline_create(tenant_shard_id, &create_req) + let client = mgmt_api::Client::new(node.base_url(), jwt.as_deref()); + client + .timeline_delete(tenant_shard_id, timeline_id) .await - .map_err(|e| match e { - mgmt_api::Error::ApiError(status, msg) - if status == StatusCode::INTERNAL_SERVER_ERROR - || status == StatusCode::NOT_ACCEPTABLE => - { - // TODO: handle more error codes, e.g. 503 should be passed through. Make a general wrapper - // for pass-through API calls. - ApiError::InternalServerError(anyhow::anyhow!(msg)) - } - _ => ApiError::Conflict(format!("Failed to create timeline: {e}")), - })?; - - if timeline_info.is_none() { - // If the caller specified an ancestor but no ancestor LSN, we are responsible for - // propagating the LSN chosen by the first shard to the other shards: it is important - // that all shards end up with the same ancestor_start_lsn. 
- if create_req.ancestor_timeline_id.is_some() - && create_req.ancestor_start_lsn.is_none() - { - create_req.ancestor_start_lsn = shard_timeline_info.ancestor_lsn; - } - - // We will return the TimelineInfo from the first shard - timeline_info = Some(shard_timeline_info); - } + .map_err(|e| { + ApiError::InternalServerError(anyhow::anyhow!( + "Error deleting timeline {timeline_id} on {tenant_shard_id} on node {node}: {e}", + )) + }) } - Ok(timeline_info.expect("targets cannot be empty")) + + let statuses = self + .tenant_for_shards(targets, |tenant_shard_id: TenantShardId, node: Node| { + Box::pin(delete_one( + tenant_shard_id, + timeline_id, + node, + self.config.jwt_token.clone(), + )) + }) + .await?; + + // If any shards >0 haven't finished deletion yet, don't start deletion on shard zero + if statuses.iter().any(|s| s != &StatusCode::NOT_FOUND) { + return Ok(StatusCode::ACCEPTED); + } + + // Delete shard zero last: this is not strictly necessary, but since a caller's GET on a timeline will be routed + // to shard zero, it gives a more obvious behavior that a GET returns 404 once the deletion is done. + let shard_zero_status = delete_one( + shard_zero.0, + timeline_id, + shard_zero.1, + self.config.jwt_token.clone(), + ) + .await?; + + Ok(shard_zero_status) + } + + /// When you need to send an HTTP request to the pageserver that holds shard0 of a tenant, this + /// function looks it up and returns the url. 
If the tenant isn't found, returns Err(ApiError::NotFound) + pub(crate) fn tenant_shard0_baseurl( + &self, + tenant_id: TenantId, + ) -> Result<(String, TenantShardId), ApiError> { + let locked = self.inner.read().unwrap(); + let Some((tenant_shard_id, shard)) = locked + .tenants + .range(TenantShardId::tenant_range(tenant_id)) + .next() + else { + return Err(ApiError::NotFound( + anyhow::anyhow!("Tenant {tenant_id} not found").into(), + )); + }; + + // TODO: should use the ID last published to compute_hook, rather than the intent: the intent might + // point to somewhere we haven't attached yet. + let Some(node_id) = shard.intent.get_attached() else { + tracing::warn!( + tenant_id=%tenant_shard_id.tenant_id, shard_id=%tenant_shard_id.shard_slug(), + "Shard not scheduled (policy {:?}), cannot generate pass-through URL", + shard.policy + ); + return Err(ApiError::Conflict( + "Cannot call timeline API on non-attached tenant".to_string(), + )); + }; + + let Some(node) = locked.nodes.get(node_id) else { + // This should never happen + return Err(ApiError::InternalServerError(anyhow::anyhow!( + "Shard refers to nonexistent node" + ))); + }; + + Ok((node.base_url(), *tenant_shard_id)) } pub(crate) fn tenant_locate( @@ -788,25 +2049,19 @@ impl Service { for (tenant_shard_id, shard) in locked.tenants.range(TenantShardId::tenant_range(tenant_id)) { - let node_id = shard - .intent - .attached - .ok_or(ApiError::BadRequest(anyhow::anyhow!( - "Cannot locate a tenant that is not attached" - )))?; + let node_id = + shard + .intent + .get_attached() + .ok_or(ApiError::BadRequest(anyhow::anyhow!( + "Cannot locate a tenant that is not attached" + )))?; let node = pageservers .get(&node_id) .expect("Pageservers may not be deleted while referenced"); - result.push(TenantLocateResponseShard { - shard_id: *tenant_shard_id, - node_id, - listen_http_addr: node.listen_http_addr.clone(), - listen_http_port: node.listen_http_port, - listen_pg_addr: node.listen_pg_addr.clone(), - 
listen_pg_port: node.listen_pg_port, - }); + result.push(node.shard_location(*tenant_shard_id)); match &shard_params { None => { @@ -850,6 +2105,350 @@ impl Service { }) } + pub(crate) async fn tenant_shard_split( + &self, + tenant_id: TenantId, + split_req: TenantShardSplitRequest, + ) -> Result { + let mut policy = None; + let mut shard_ident = None; + + // A parent shard which will be split + struct SplitTarget { + parent_id: TenantShardId, + node: Node, + child_ids: Vec, + } + + // Validate input, and calculate which shards we will create + let (old_shard_count, targets) = + { + let locked = self.inner.read().unwrap(); + + let pageservers = locked.nodes.clone(); + + let mut targets = Vec::new(); + + // In case this is a retry, count how many already-split shards we found + let mut children_found = Vec::new(); + let mut old_shard_count = None; + + for (tenant_shard_id, shard) in + locked.tenants.range(TenantShardId::tenant_range(tenant_id)) + { + match shard.shard.count.count().cmp(&split_req.new_shard_count) { + Ordering::Equal => { + // Already split this + children_found.push(*tenant_shard_id); + continue; + } + Ordering::Greater => { + return Err(ApiError::BadRequest(anyhow::anyhow!( + "Requested count {} but already have shards at count {}", + split_req.new_shard_count, + shard.shard.count.count() + ))); + } + Ordering::Less => { + // Fall through: this shard has lower count than requested, + // is a candidate for splitting. + } + } + + match old_shard_count { + None => old_shard_count = Some(shard.shard.count), + Some(old_shard_count) => { + if old_shard_count != shard.shard.count { + // We may hit this case if a caller asked for two splits to + // different sizes, before the first one is complete. + // e.g. 1->2, 2->4, where the 4 call comes while we have a mixture + // of shard_count=1 and shard_count=2 shards in the map. 
+ return Err(ApiError::Conflict( + "Cannot split, currently mid-split".to_string(), + )); + } + } + } + if policy.is_none() { + policy = Some(shard.policy.clone()); + } + if shard_ident.is_none() { + shard_ident = Some(shard.shard); + } + + if tenant_shard_id.shard_count.count() == split_req.new_shard_count { + tracing::info!( + "Tenant shard {} already has shard count {}", + tenant_shard_id, + split_req.new_shard_count + ); + continue; + } + + let node_id = shard.intent.get_attached().ok_or(ApiError::BadRequest( + anyhow::anyhow!("Cannot split a tenant that is not attached"), + ))?; + + let node = pageservers + .get(&node_id) + .expect("Pageservers may not be deleted while referenced"); + + // TODO: if any reconciliation is currently in progress for this shard, wait for it. + + targets.push(SplitTarget { + parent_id: *tenant_shard_id, + node: node.clone(), + child_ids: tenant_shard_id + .split(ShardCount::new(split_req.new_shard_count)), + }); + } + + if targets.is_empty() { + if children_found.len() == split_req.new_shard_count as usize { + return Ok(TenantShardSplitResponse { + new_shards: children_found, + }); + } else { + // No shards found to split, and no existing children found: the + // tenant doesn't exist at all. + return Err(ApiError::NotFound( + anyhow::anyhow!("Tenant {} not found", tenant_id).into(), + )); + } + } + + (old_shard_count, targets) + }; + + // unwrap safety: we would have returned above if we didn't find at least one shard to split + let old_shard_count = old_shard_count.unwrap(); + let shard_ident = if let Some(new_stripe_size) = split_req.new_stripe_size { + // This ShardIdentity will be used as the template for all children, so this implicitly + // applies the new stripe size to the children. 
+ let mut shard_ident = shard_ident.unwrap(); + if shard_ident.count.count() > 1 && shard_ident.stripe_size != new_stripe_size { + return Err(ApiError::BadRequest(anyhow::anyhow!("Attempted to change stripe size ({:?}->{new_stripe_size:?}) on a tenant with multiple shards", shard_ident.stripe_size))); + } + shard_ident.stripe_size = new_stripe_size; + shard_ident + } else { + shard_ident.unwrap() + }; + let policy = policy.unwrap(); + + // FIXME: we have dropped self.inner lock, and not yet written anything to the database: another + // request could occur here, deleting or mutating the tenant. begin_shard_split checks that the + // parent shards exist as expected, but it would be neater to do the above pre-checks within the + // same database transaction rather than pre-check in-memory and then maybe-fail the database write. + // (https://github.com/neondatabase/neon/issues/6676) + + // Before creating any new child shards in memory or on the pageservers, persist them: this + // enables us to ensure that we will always be able to clean up if something goes wrong. This also + // acts as the protection against two concurrent attempts to split: one of them will get a database + // error trying to insert the child shards. + let mut child_tsps = Vec::new(); + for target in &targets { + let mut this_child_tsps = Vec::new(); + for child in &target.child_ids { + let mut child_shard = shard_ident; + child_shard.number = child.shard_number; + child_shard.count = child.shard_count; + + this_child_tsps.push(TenantShardPersistence { + tenant_id: child.tenant_id.to_string(), + shard_number: child.shard_number.0 as i32, + shard_count: child.shard_count.literal() as i32, + shard_stripe_size: shard_ident.stripe_size.0 as i32, + // Note: this generation is a placeholder, [`Persistence::begin_shard_split`] will + // populate the correct generation as part of its transaction, to protect us + // against racing with changes in the state of the parent. 
+ generation: None, + generation_pageserver: Some(target.node.get_id().0 as i64), + placement_policy: serde_json::to_string(&policy).unwrap(), + // TODO: get the config out of the map + config: serde_json::to_string(&TenantConfig::default()).unwrap(), + splitting: SplitState::Splitting, + }); + } + + child_tsps.push((target.parent_id, this_child_tsps)); + } + + if let Err(e) = self + .persistence + .begin_shard_split(old_shard_count, tenant_id, child_tsps) + .await + { + match e { + DatabaseError::Query(diesel::result::Error::DatabaseError( + DatabaseErrorKind::UniqueViolation, + _, + )) => { + // Inserting a child shard violated a unique constraint: we raced with another call to + // this function + tracing::warn!("Conflicting attempt to split {tenant_id}: {e}"); + return Err(ApiError::Conflict("Tenant is already splitting".into())); + } + _ => return Err(ApiError::InternalServerError(e.into())), + } + } + + // Now that I have persisted the splitting state, apply it in-memory. This is infallible, so + // callers may assume that if splitting is set in memory, then it was persisted, and if splitting + // is not set in memory, then it was not persisted. + { + let mut locked = self.inner.write().unwrap(); + for target in &targets { + if let Some(parent_shard) = locked.tenants.get_mut(&target.parent_id) { + parent_shard.splitting = SplitState::Splitting; + } + } + } + + // FIXME: we have now committed the shard split state to the database, so any subsequent + // failure needs to roll it back. We will later wrap this function in logic to roll back + // the split if it fails. + // (https://github.com/neondatabase/neon/issues/6676) + + // TODO: issue split calls concurrently (this only matters once we're splitting + // N>1 shards into M shards -- initially we're usually splitting 1 shard into N). 
+ + for target in &targets { + let SplitTarget { + parent_id, + node, + child_ids, + } = target; + let client = mgmt_api::Client::new(node.base_url(), self.config.jwt_token.as_deref()); + let response = client + .tenant_shard_split( + *parent_id, + TenantShardSplitRequest { + new_shard_count: split_req.new_shard_count, + new_stripe_size: split_req.new_stripe_size, + }, + ) + .await + .map_err(|e| ApiError::Conflict(format!("Failed to split {}: {}", parent_id, e)))?; + + tracing::info!( + "Split {} into {}", + parent_id, + response + .new_shards + .iter() + .map(|s| format!("{:?}", s)) + .collect::>() + .join(",") + ); + + if &response.new_shards != child_ids { + // This should never happen: the pageserver should agree with us on how shard splits work. + return Err(ApiError::InternalServerError(anyhow::anyhow!( + "Splitting shard {} resulted in unexpected IDs: {:?} (expected {:?})", + parent_id, + response.new_shards, + child_ids + ))); + } + } + + // TODO: if the pageserver restarted concurrently with our split API call, + // the actual generation of the child shard might differ from the generation + // we expect it to have. In order for our in-database generation to end up + // correct, we should carry the child generation back in the response and apply it here + // in complete_shard_split (and apply the correct generation in memory) + // (or, we can carry generation in the request and reject the request if + // it doesn't match, but that requires more retry logic on this side) + + self.persistence + .complete_shard_split(tenant_id, old_shard_count) + .await?; + + // Replace all the shards we just split with their children: this phase is infallible. 
+ let mut response = TenantShardSplitResponse { + new_shards: Vec::new(), + }; + let mut child_locations = Vec::new(); + { + let mut locked = self.inner.write().unwrap(); + let (_nodes, tenants, scheduler) = locked.parts_mut(); + for target in targets { + let SplitTarget { + parent_id, + node: _node, + child_ids, + } = target; + let (pageserver, generation, config) = { + let mut old_state = tenants + .remove(&parent_id) + .expect("It was present, we just split it"); + let old_attached = old_state.intent.get_attached().unwrap(); + old_state.intent.clear(scheduler); + let generation = old_state.generation.expect("Shard must have been attached"); + (old_attached, generation, old_state.config.clone()) + }; + + for child in child_ids { + let mut child_shard = shard_ident; + child_shard.number = child.shard_number; + child_shard.count = child.shard_count; + + let mut child_observed: HashMap = HashMap::new(); + child_observed.insert( + pageserver, + ObservedStateLocation { + conf: Some(attached_location_conf(generation, &child_shard, &config)), + }, + ); + + let mut child_state = TenantState::new(child, child_shard, policy.clone()); + child_state.intent = IntentState::single(scheduler, Some(pageserver)); + child_state.observed = ObservedState { + locations: child_observed, + }; + child_state.generation = Some(generation); + child_state.config = config.clone(); + + // The child's TenantState::splitting is intentionally left at the default value of Idle, + // as at this point in the split process we have succeeded and this part is infallible: + // we will never need to do any special recovery from this state. 
+ + child_locations.push((child, pageserver, child_shard.stripe_size)); + + tenants.insert(child, child_state); + response.new_shards.push(child); + } + } + } + + // Send compute notifications for all the new shards + let mut failed_notifications = Vec::new(); + for (child_id, child_ps, stripe_size) in child_locations { + if let Err(e) = self + .compute_hook + .notify(child_id, child_ps, stripe_size, &self.cancel) + .await + { + tracing::warn!("Failed to update compute of {}->{} during split, proceeding anyway to complete split ({e})", + child_id, child_ps); + failed_notifications.push(child_id); + } + } + + // If we failed any compute notifications, make a note to retry later. + if !failed_notifications.is_empty() { + let mut locked = self.inner.write().unwrap(); + for failed in failed_notifications { + if let Some(shard) = locked.tenants.get_mut(&failed) { + shard.pending_compute_notification = true; + } + } + } + + Ok(response) + } + pub(crate) async fn tenant_shard_migrate( &self, tenant_shard_id: TenantShardId, @@ -857,36 +2456,53 @@ impl Service { ) -> Result { let waiter = { let mut locked = self.inner.write().unwrap(); + let (nodes, tenants, scheduler) = locked.parts_mut(); - let result_tx = locked.result_tx.clone(); - let pageservers = locked.nodes.clone(); - let compute_hook = locked.compute_hook.clone(); + let Some(node) = nodes.get(&migrate_req.node_id) else { + return Err(ApiError::BadRequest(anyhow::anyhow!( + "Node {} not found", + migrate_req.node_id + ))); + }; - let Some(shard) = locked.tenants.get_mut(&tenant_shard_id) else { + if !node.is_available() { + // Warn but proceed: the caller may intend to manually adjust the placement of + // a shard even if the node is down, e.g. if intervening during an incident. 
+ tracing::warn!("Migrating to unavailable node {node}"); + } + + let Some(shard) = tenants.get_mut(&tenant_shard_id) else { return Err(ApiError::NotFound( anyhow::anyhow!("Tenant shard not found").into(), )); }; - if shard.intent.attached == Some(migrate_req.node_id) { + if shard.intent.get_attached() == &Some(migrate_req.node_id) { // No-op case: we will still proceed to wait for reconciliation in case it is // incomplete from an earlier update to the intent. tracing::info!("Migrating: intent is unchanged {:?}", shard.intent); } else { - let old_attached = shard.intent.attached; + let old_attached = *shard.intent.get_attached(); match shard.policy { PlacementPolicy::Single => { - shard.intent.secondary.clear(); + shard.intent.clear_secondary(scheduler); + shard.intent.set_attached(scheduler, Some(migrate_req.node_id)); } PlacementPolicy::Double(_n) => { // If our new attached node was a secondary, it no longer should be. - shard.intent.secondary.retain(|s| s != &migrate_req.node_id); + shard.intent.remove_secondary(scheduler, migrate_req.node_id); // If we were already attached to something, demote that to a secondary if let Some(old_attached) = old_attached { - shard.intent.secondary.push(old_attached); + shard.intent.push_secondary(scheduler, old_attached); } + + shard.intent.set_attached(scheduler, Some(migrate_req.node_id)); + } + PlacementPolicy::Secondary => { + shard.intent.clear(scheduler); + shard.intent.push_secondary(scheduler, migrate_req.node_id); } PlacementPolicy::Detached => { return Err(ApiError::BadRequest(anyhow::anyhow!( @@ -894,19 +2510,12 @@ impl Service { ))) } } - shard.intent.attached = Some(migrate_req.node_id); tracing::info!("Migrating: new intent {:?}", shard.intent); shard.sequence = shard.sequence.next(); } - shard.maybe_reconcile( - result_tx, - &pageservers, - &compute_hook, - &self.config, - &self.persistence, - ) + self.maybe_reconcile_shard(shard, nodes) }; if let Some(waiter) = waiter { @@ -918,6 +2527,184 @@ impl Service { 
Ok(TenantShardMigrateResponse {}) } + /// This is for debug/support only: we simply drop all state for a tenant, without + /// detaching or deleting it on pageservers. + pub(crate) async fn tenant_drop(&self, tenant_id: TenantId) -> Result<(), ApiError> { + self.persistence.delete_tenant(tenant_id).await?; + + let mut locked = self.inner.write().unwrap(); + let (_nodes, tenants, scheduler) = locked.parts_mut(); + let mut shards = Vec::new(); + for (tenant_shard_id, _) in tenants.range(TenantShardId::tenant_range(tenant_id)) { + shards.push(*tenant_shard_id); + } + + for shard_id in shards { + if let Some(mut shard) = tenants.remove(&shard_id) { + shard.intent.clear(scheduler); + } + } + + Ok(()) + } + + /// For debug/support: a full JSON dump of TenantStates. Returns a response so that + /// we don't have to make TenantState clonable in the return path. + pub(crate) fn tenants_dump(&self) -> Result, ApiError> { + let serialized = { + let locked = self.inner.read().unwrap(); + let result = locked.tenants.values().collect::>(); + serde_json::to_string(&result).map_err(|e| ApiError::InternalServerError(e.into()))? + }; + + hyper::Response::builder() + .status(hyper::StatusCode::OK) + .header(hyper::header::CONTENT_TYPE, "application/json") + .body(hyper::Body::from(serialized)) + .map_err(|e| ApiError::InternalServerError(e.into())) + } + + /// Check the consistency of in-memory state vs. persistent state, and check that the + /// scheduler's statistics are up to date. + /// + /// These consistency checks expect an **idle** system. If changes are going on while + /// we run, then we can falsely indicate a consistency issue. This is sufficient for end-of-test + /// checks, but not suitable for running continuously in the background in the field. 
+ pub(crate) async fn consistency_check(&self) -> Result<(), ApiError> { + let (mut expect_nodes, mut expect_shards) = { + let locked = self.inner.read().unwrap(); + + locked + .scheduler + .consistency_check(locked.nodes.values(), locked.tenants.values()) + .context("Scheduler checks") + .map_err(ApiError::InternalServerError)?; + + let expect_nodes = locked + .nodes + .values() + .map(|n| n.to_persistent()) + .collect::>(); + + let expect_shards = locked + .tenants + .values() + .map(|t| t.to_persistent()) + .collect::>(); + + // This method can only validate the state of an idle system: if a reconcile is in + // progress, fail out early to avoid giving false errors on state that won't match + // between database and memory until a ReconcileResult is processed. + for t in locked.tenants.values() { + if t.reconciler.is_some() { + return Err(ApiError::InternalServerError(anyhow::anyhow!( + "Shard {} reconciliation in progress", + t.tenant_shard_id + ))); + } + } + + (expect_nodes, expect_shards) + }; + + let mut nodes = self.persistence.list_nodes().await?; + expect_nodes.sort_by_key(|n| n.node_id); + nodes.sort_by_key(|n| n.node_id); + + if nodes != expect_nodes { + tracing::error!("Consistency check failed on nodes."); + tracing::error!( + "Nodes in memory: {}", + serde_json::to_string(&expect_nodes) + .map_err(|e| ApiError::InternalServerError(e.into()))?
+ ); + return Err(ApiError::InternalServerError(anyhow::anyhow!( + "Node consistency failure" + ))); + } + + let mut shards = self.persistence.list_tenant_shards().await?; + shards.sort_by_key(|tsp| (tsp.tenant_id.clone(), tsp.shard_number, tsp.shard_count)); + expect_shards.sort_by_key(|tsp| (tsp.tenant_id.clone(), tsp.shard_number, tsp.shard_count)); + + if shards != expect_shards { + tracing::error!("Consistency check failed on shards."); + tracing::error!( + "Shards in memory: {}", + serde_json::to_string(&expect_shards) + .map_err(|e| ApiError::InternalServerError(e.into()))? + ); + tracing::error!( + "Shards in database: {}", + serde_json::to_string(&shards) + .map_err(|e| ApiError::InternalServerError(e.into()))? + ); + return Err(ApiError::InternalServerError(anyhow::anyhow!( + "Shard consistency failure" + ))); + } + + Ok(()) + } + + /// For debug/support: a JSON dump of the [`Scheduler`]. Returns a response so that + /// we don't have to make the scheduler clonable in the return path. + pub(crate) fn scheduler_dump(&self) -> Result, ApiError> { + let serialized = { + let locked = self.inner.read().unwrap(); + serde_json::to_string(&locked.scheduler) + .map_err(|e| ApiError::InternalServerError(e.into()))? + }; + + hyper::Response::builder() + .status(hyper::StatusCode::OK) + .header(hyper::header::CONTENT_TYPE, "application/json") + .body(hyper::Body::from(serialized)) + .map_err(|e| ApiError::InternalServerError(e.into())) + } + + /// This is for debug/support only: we simply drop all state for a node, without + /// detaching or deleting anything on pageservers. We do not try and re-schedule any + /// tenants that were on this node.
+ /// + /// TODO: proper node deletion API that unhooks things more gracefully + pub(crate) async fn node_drop(&self, node_id: NodeId) -> Result<(), ApiError> { + self.persistence.delete_node(node_id).await?; + + let mut locked = self.inner.write().unwrap(); + + for shard in locked.tenants.values_mut() { + shard.deref_node(node_id); + } + + let mut nodes = (*locked.nodes).clone(); + nodes.remove(&node_id); + locked.nodes = Arc::new(nodes); + + locked.scheduler.node_remove(node_id); + + Ok(()) + } + + pub(crate) async fn node_list(&self) -> Result, ApiError> { + let nodes = { + self.inner + .read() + .unwrap() + .nodes + .values() + .cloned() + .collect::>() + }; + + Ok(nodes) + } + pub(crate) async fn node_register( &self, register_req: NodeRegisterRequest, @@ -928,11 +2715,7 @@ impl Service { if let Some(node) = locked.nodes.get(&register_req.node_id) { // Note that we do not do a total equality of the struct, because we don't require // the availability/scheduling states to agree for a POST to be idempotent.
- let new_node = Node { - id: register_req.node_id, - listen_http_addr: register_req.listen_http_addr, - listen_http_port: register_req.listen_http_port, - listen_pg_addr: register_req.listen_pg_addr, - listen_pg_port: register_req.listen_pg_port, - scheduling: NodeSchedulingPolicy::Filling, - // TODO: we shouldn't really call this Active until we've heartbeated it. - availability: NodeAvailability::Active, - }; + let new_node = Node::new( + register_req.node_id, + register_req.listen_http_addr, + register_req.listen_http_port, + register_req.listen_pg_addr, + register_req.listen_pg_port, + ); + // TODO: idempotency if the node already exists in the database - self.persistence - .insert_node(&new_node) - .await - .map_err(ApiError::InternalServerError)?; + self.persistence.insert_node(&new_node).await?; let mut locked = self.inner.write().unwrap(); let mut new_nodes = (*locked.nodes).clone(); + locked.scheduler.node_upsert(&new_node); new_nodes.insert(register_req.node_id, new_node); locked.nodes = Arc::new(new_nodes); @@ -987,12 +2766,22 @@ impl Service { Ok(()) } - pub(crate) fn node_configure(&self, config_req: NodeConfigureRequest) -> Result<(), ApiError> { - let mut locked = self.inner.write().unwrap(); - let result_tx = locked.result_tx.clone(); - let compute_hook = locked.compute_hook.clone(); + pub(crate) async fn node_configure( + &self, + config_req: NodeConfigureRequest, + ) -> Result<(), ApiError> { + if let Some(scheduling) = config_req.scheduling { + // Scheduling is a persistent part of Node: we must write updates to the database before + // applying them in memory + self.persistence + .update_node(config_req.node_id, scheduling) + .await?; + } - let mut new_nodes = (*locked.nodes).clone(); + let mut locked = self.inner.write().unwrap(); + let (nodes, tenants, scheduler) = locked.parts_mut(); + + let mut new_nodes = (**nodes).clone(); let Some(node) = new_nodes.get_mut(&config_req.node_id) else { return Err(ApiError::NotFound( @@ -1000,90 +2789,82 @@ 
impl Service { )); }; - let mut offline_transition = false; - let mut active_transition = false; - - if let Some(availability) = &config_req.availability { - match (availability, &node.availability) { - (NodeAvailability::Offline, NodeAvailability::Active) => { - tracing::info!("Node {} transition to offline", config_req.node_id); - offline_transition = true; - } - (NodeAvailability::Active, NodeAvailability::Offline) => { - tracing::info!("Node {} transition to active", config_req.node_id); - active_transition = true; - } - _ => { - tracing::info!("Node {} no change during config", config_req.node_id); - // No change - } - }; - node.availability = *availability; - } + let availability_transition = if let Some(availability) = &config_req.availability { + node.set_availability(*availability) + } else { + AvailabilityTransition::Unchanged + }; if let Some(scheduling) = config_req.scheduling { - node.scheduling = scheduling; + node.set_scheduling(scheduling); // TODO: once we have a background scheduling ticker for fill/drain, kick it // to wake up and start working. 
} + // Update the scheduler, in case the eligibility of the node for new shards has changed + scheduler.node_upsert(node); + let new_nodes = Arc::new(new_nodes); - let mut scheduler = Scheduler::new(&locked.tenants, &new_nodes); - if offline_transition { - for (tenant_shard_id, tenant_state) in &mut locked.tenants { - if let Some(observed_loc) = - tenant_state.observed.locations.get_mut(&config_req.node_id) - { - // When a node goes offline, we set its observed configuration to None, indicating unknown: we will - // not assume our knowledge of the node's configuration is accurate until it comes back online - observed_loc.conf = None; - } + match availability_transition { + AvailabilityTransition::ToOffline => { + tracing::info!("Node {} transition to offline", config_req.node_id); + let mut tenants_affected: usize = 0; + for (tenant_shard_id, tenant_state) in tenants { + if let Some(observed_loc) = + tenant_state.observed.locations.get_mut(&config_req.node_id) + { + // When a node goes offline, we set its observed configuration to None, indicating unknown: we will + // not assume our knowledge of the node's configuration is accurate until it comes back online + observed_loc.conf = None; + } - if tenant_state.intent.notify_offline(config_req.node_id) { - tenant_state.sequence = tenant_state.sequence.next(); - match tenant_state.schedule(&mut scheduler) { - Err(e) => { - // It is possible that some tenants will become unschedulable when too many pageservers - // go offline: in this case there isn't much we can do other than make the issue observable. - // TODO: give TenantState a scheduling error attribute to be queried later.
- tracing::warn!(%tenant_shard_id, "Scheduling error when marking pageserver {} offline: {e}", config_req.node_id); - } - Ok(()) => { - tenant_state.maybe_reconcile( - result_tx.clone(), - &new_nodes, - &compute_hook, - &self.config, - &self.persistence, - ); + if tenant_state.intent.demote_attached(config_req.node_id) { + tenant_state.sequence = tenant_state.sequence.next(); + match tenant_state.schedule(scheduler) { + Err(e) => { + // It is possible that some tenants will become unschedulable when too many pageservers + // go offline: in this case there isn't much we can do other than make the issue observable. + // TODO: give TenantState a scheduling error attribute to be queried later. + tracing::warn!(%tenant_shard_id, "Scheduling error when marking pageserver {} offline: {e}", config_req.node_id); + } + Ok(()) => { + if self + .maybe_reconcile_shard(tenant_state, &new_nodes) + .is_some() + { + tenants_affected += 1; + }; + } } } } + tracing::info!( + "Launched {} reconciler tasks for tenants affected by node {} going offline", + tenants_affected, + config_req.node_id + ) } - } - - if active_transition { - // When a node comes back online, we must reconcile any tenant that has a None observed - // location on the node. - for tenant_state in locked.tenants.values_mut() { - if let Some(observed_loc) = - tenant_state.observed.locations.get_mut(&config_req.node_id) - { - if observed_loc.conf.is_none() { - tenant_state.maybe_reconcile( - result_tx.clone(), - &new_nodes, - &compute_hook, - &self.config, - &self.persistence, - ); + AvailabilityTransition::ToActive => { + tracing::info!("Node {} transition to active", config_req.node_id); + // When a node comes back online, we must reconcile any tenant that has a None observed + // location on the node. 
+ for tenant_state in locked.tenants.values_mut() { + if let Some(observed_loc) = + tenant_state.observed.locations.get_mut(&config_req.node_id) + { + if observed_loc.conf.is_none() { + self.maybe_reconcile_shard(tenant_state, &new_nodes); + } } } - } - // TODO: in the background, we should balance work back onto this pageserver + // TODO: in the background, we should balance work back onto this pageserver + } + AvailabilityTransition::Unchanged => { + tracing::info!("Node {} no change during config", config_req.node_id); + } } locked.nodes = new_nodes; @@ -1094,56 +2875,98 @@ impl Service { /// Helper for methods that will try and call pageserver APIs for /// a tenant, such as timeline CRUD: they cannot proceed unless the tenant /// is attached somewhere. - fn ensure_attached( + /// + /// TODO: this doesn't actually ensure attached unless the PlacementPolicy is + /// an attached policy. We should error out if it isn't. + fn ensure_attached_schedule( &self, mut locked: std::sync::RwLockWriteGuard<'_, ServiceState>, tenant_id: TenantId, ) -> Result, anyhow::Error> { let mut waiters = Vec::new(); - let result_tx = locked.result_tx.clone(); - let compute_hook = locked.compute_hook.clone(); - let mut scheduler = Scheduler::new(&locked.tenants, &locked.nodes); - let pageservers = locked.nodes.clone(); + let (nodes, tenants, scheduler) = locked.parts_mut(); - for (_tenant_shard_id, shard) in locked - .tenants - .range_mut(TenantShardId::tenant_range(tenant_id)) - { - shard.schedule(&mut scheduler)?; + for (_tenant_shard_id, shard) in tenants.range_mut(TenantShardId::tenant_range(tenant_id)) { + shard.schedule(scheduler)?; - if let Some(waiter) = shard.maybe_reconcile( - result_tx.clone(), - &pageservers, - &compute_hook, - &self.config, - &self.persistence, - ) { + if let Some(waiter) = self.maybe_reconcile_shard(shard, nodes) { waiters.push(waiter); } } Ok(waiters) } + async fn ensure_attached_wait(&self, tenant_id: TenantId) -> Result<(), ApiError> { + let 
ensure_waiters = { + let locked = self.inner.write().unwrap(); + + // Check if the tenant is splitting: in this case, even if it is attached, + // we must act as if it is not: this blocks e.g. timeline creation/deletion + // operations during the split. + for (_shard_id, shard) in locked.tenants.range(TenantShardId::tenant_range(tenant_id)) { + if !matches!(shard.splitting, SplitState::Idle) { + return Err(ApiError::ResourceUnavailable( + "Tenant shards are currently splitting".into(), + )); + } + } + + self.ensure_attached_schedule(locked, tenant_id) + .map_err(ApiError::InternalServerError)? + }; + + let deadline = Instant::now().checked_add(Duration::from_secs(5)).unwrap(); + for waiter in ensure_waiters { + let timeout = deadline.duration_since(Instant::now()); + waiter.wait_timeout(timeout).await?; + } + + Ok(()) + } + + /// Convenience wrapper around [`TenantState::maybe_reconcile`] that provides + /// all the references to parts of Self that are needed + fn maybe_reconcile_shard( + &self, + shard: &mut TenantState, + nodes: &Arc>, + ) -> Option { + shard.maybe_reconcile( + &self.result_tx, + nodes, + &self.compute_hook, + &self.config, + &self.persistence, + &self.gate, + &self.cancel, + ) + } + /// Check all tenants for pending reconciliation work, and reconcile those in need /// /// Returns how many reconciliation tasks were started fn reconcile_all(&self) -> usize { let mut locked = self.inner.write().unwrap(); - let result_tx = locked.result_tx.clone(); - let compute_hook = locked.compute_hook.clone(); let pageservers = locked.nodes.clone(); locked .tenants .iter_mut() - .filter_map(|(_tenant_shard_id, shard)| { - shard.maybe_reconcile( - result_tx.clone(), - &pageservers, - &compute_hook, - &self.config, - &self.persistence, - ) - }) + .filter_map(|(_tenant_shard_id, shard)| self.maybe_reconcile_shard(shard, &pageservers)) .count() } + + pub async fn shutdown(&self) { + // Note that this already stops processing any results from reconciles: so + // we 
do not expect that our [`TenantState`] objects will reach a neat + // final state. + self.cancel.cancel(); + + // The cancellation tokens in [`crate::reconciler::Reconciler`] are children + // of our cancellation token, so we do not need to explicitly cancel each of + // them. + + // Background tasks and reconcilers hold gate guards: this waits for them all + // to complete. + self.gate.close().await; + } } diff --git a/control_plane/attachment_service/src/tenant_state.rs b/control_plane/attachment_service/src/tenant_state.rs index 5290197d84..3c91e09ac3 100644 --- a/control_plane/attachment_service/src/tenant_state.rs +++ b/control_plane/attachment_service/src/tenant_state.rs @@ -1,27 +1,51 @@ -use std::{collections::HashMap, sync::Arc, time::Duration}; +use std::{ + collections::{HashMap, HashSet}, + sync::Arc, + time::Duration, +}; -use control_plane::attachment_service::NodeAvailability; +use crate::{metrics, persistence::TenantShardPersistence}; +use pageserver_api::controller_api::PlacementPolicy; use pageserver_api::{ models::{LocationConfig, LocationConfigMode, TenantConfig}, shard::{ShardIdentity, TenantShardId}, }; +use serde::Serialize; use tokio::task::JoinHandle; use tokio_util::sync::CancellationToken; +use tracing::{instrument, Instrument}; use utils::{ generation::Generation, id::NodeId, seqwait::{SeqWait, SeqWaitError}, + sync::gate::Gate, }; use crate::{ compute_hook::ComputeHook, node::Node, - persistence::Persistence, - reconciler::{attached_location_conf, secondary_location_conf, ReconcileError, Reconciler}, + persistence::{split_state::SplitState, Persistence}, + reconciler::{ + attached_location_conf, secondary_location_conf, ReconcileError, Reconciler, TargetState, + }, scheduler::{ScheduleError, Scheduler}, - service, PlacementPolicy, Sequence, + service, Sequence, }; +/// Serialization helper +fn read_mutex_content(v: &std::sync::Mutex, serializer: S) -> Result +where + S: serde::ser::Serializer, + T: Clone + std::fmt::Display, +{ + 
serializer.collect_str(&v.lock().unwrap()) +} + +/// In-memory state for a particular tenant shard. +/// +/// This struct implement Serialize for debugging purposes, but is _not_ persisted +/// itself: see [`crate::persistence`] for the subset of tenant shard state that is persisted. +#[derive(Serialize)] pub(crate) struct TenantState { pub(crate) tenant_shard_id: TenantShardId, @@ -33,8 +57,11 @@ pub(crate) struct TenantState { pub(crate) sequence: Sequence, // Latest generation number: next time we attach, increment this - // and use the incremented number when attaching - pub(crate) generation: Generation, + // and use the incremented number when attaching. + // + // None represents an incompletely onboarded tenant via the [`Service::location_config`] + // API, where this tenant may only run in PlacementPolicy::Secondary. + pub(crate) generation: Option, // High level description of how the tenant should be set up. Provided // externally. @@ -56,30 +83,172 @@ pub(crate) struct TenantState { /// If a reconcile task is currently in flight, it may be joined here (it is /// only safe to join if either the result has been received or the reconciler's /// cancellation token has been fired) + #[serde(skip)] pub(crate) reconciler: Option, + /// If a tenant is being split, then all shards with that TenantId will have a + /// SplitState set, this acts as a guard against other operations such as background + /// reconciliation, and timeline creation. + pub(crate) splitting: SplitState, + /// Optionally wait for reconciliation to complete up to a particular /// sequence number. + #[serde(skip)] pub(crate) waiter: std::sync::Arc>, /// Indicates sequence number for which we have encountered an error reconciling. If /// this advances ahead of [`Self::waiter`] then a reconciliation error has occurred, /// and callers should stop waiting for `waiter` and propagate the error. 
+ #[serde(skip)] pub(crate) error_waiter: std::sync::Arc>, /// The most recent error from a reconcile on this tenant /// TODO: generalize to an array of recent events /// TOOD: use a ArcSwap instead of mutex for faster reads? + #[serde(serialize_with = "read_mutex_content")] pub(crate) last_error: std::sync::Arc>, + + /// If we have a pending compute notification that for some reason we weren't able to send, + /// set this to true. If this is set, calls to [`Self::maybe_reconcile`] will run a task to retry + /// sending it. This is the mechanism by which compute notifications are included in the scope + /// of state that we publish externally in an eventually consistent way. + pub(crate) pending_compute_notification: bool, } -#[derive(Default, Clone, Debug)] +#[derive(Default, Clone, Debug, Serialize)] pub(crate) struct IntentState { - pub(crate) attached: Option, - pub(crate) secondary: Vec, + attached: Option, + secondary: Vec, } -#[derive(Default, Clone)] +impl IntentState { + pub(crate) fn new() -> Self { + Self { + attached: None, + secondary: vec![], + } + } + pub(crate) fn single(scheduler: &mut Scheduler, node_id: Option) -> Self { + if let Some(node_id) = node_id { + scheduler.node_inc_ref(node_id); + } + Self { + attached: node_id, + secondary: vec![], + } + } + + pub(crate) fn set_attached(&mut self, scheduler: &mut Scheduler, new_attached: Option) { + if self.attached != new_attached { + if let Some(old_attached) = self.attached.take() { + scheduler.node_dec_ref(old_attached); + } + if let Some(new_attached) = &new_attached { + scheduler.node_inc_ref(*new_attached); + } + self.attached = new_attached; + } + } + + /// Like set_attached, but the node is from [`Self::secondary`]. This swaps the node from + /// secondary to attached while maintaining the scheduler's reference counts. 
+ pub(crate) fn promote_attached( + &mut self, + _scheduler: &mut Scheduler, + promote_secondary: NodeId, + ) { + // If we call this with a node that isn't in secondary, it would cause incorrect + // scheduler reference counting, since we assume the node is already referenced as a secondary. + debug_assert!(self.secondary.contains(&promote_secondary)); + + // TODO: when scheduler starts tracking attached + secondary counts separately, we will + // need to call into it here. + self.secondary.retain(|n| n != &promote_secondary); + self.attached = Some(promote_secondary); + } + + pub(crate) fn push_secondary(&mut self, scheduler: &mut Scheduler, new_secondary: NodeId) { + debug_assert!(!self.secondary.contains(&new_secondary)); + scheduler.node_inc_ref(new_secondary); + self.secondary.push(new_secondary); + } + + /// It is legal to call this with a node that is not currently a secondary: that is a no-op + pub(crate) fn remove_secondary(&mut self, scheduler: &mut Scheduler, node_id: NodeId) { + let index = self.secondary.iter().position(|n| *n == node_id); + if let Some(index) = index { + scheduler.node_dec_ref(node_id); + self.secondary.remove(index); + } + } + + pub(crate) fn clear_secondary(&mut self, scheduler: &mut Scheduler) { + for secondary in self.secondary.drain(..) 
{ + scheduler.node_dec_ref(secondary); + } + } + + /// Remove the last secondary node from the list of secondaries + pub(crate) fn pop_secondary(&mut self, scheduler: &mut Scheduler) { + if let Some(node_id) = self.secondary.pop() { + scheduler.node_dec_ref(node_id); + } + } + + pub(crate) fn clear(&mut self, scheduler: &mut Scheduler) { + if let Some(old_attached) = self.attached.take() { + scheduler.node_dec_ref(old_attached); + } + + self.clear_secondary(scheduler); + } + + pub(crate) fn all_pageservers(&self) -> Vec { + let mut result = Vec::new(); + if let Some(p) = self.attached { + result.push(p) + } + + result.extend(self.secondary.iter().copied()); + + result + } + + pub(crate) fn get_attached(&self) -> &Option { + &self.attached + } + + pub(crate) fn get_secondary(&self) -> &Vec { + &self.secondary + } + + /// If the node is in use as the attached location, demote it into + /// the list of secondary locations. This is used when a node goes offline, + /// and we want to use a different node for attachment, but not permanently + /// forget the location on the offline node. + /// + /// Returns true if a change was made + pub(crate) fn demote_attached(&mut self, node_id: NodeId) -> bool { + if self.attached == Some(node_id) { + // TODO: when scheduler starts tracking attached + secondary counts separately, we will + // need to call into it here. + self.attached = None; + self.secondary.push(node_id); + true + } else { + false + } + } +} + +impl Drop for IntentState { + fn drop(&mut self) { + // Must clear before dropping, to avoid leaving stale refcounts in the Scheduler + debug_assert!(self.attached.is_none() && self.secondary.is_empty()); + } +} + +#[derive(Default, Clone, Serialize)] pub(crate) struct ObservedState { pub(crate) locations: HashMap, } @@ -93,7 +262,7 @@ pub(crate) struct ObservedState { /// what it is (e.g. 
we failed partway through configuring it) /// * Instance exists with conf==Some: this tells us what we last successfully configured on this node, /// and that configuration will still be present unless something external interfered. -#[derive(Clone)] +#[derive(Clone, Serialize)] pub(crate) struct ObservedStateLocation { /// If None, it means we do not know the status of this shard's location on this node, but /// we know that we might have some state on this node. @@ -162,41 +331,11 @@ pub(crate) struct ReconcileResult { pub(crate) result: Result<(), ReconcileError>, pub(crate) tenant_shard_id: TenantShardId, - pub(crate) generation: Generation, + pub(crate) generation: Option, pub(crate) observed: ObservedState, -} -impl IntentState { - pub(crate) fn new() -> Self { - Self { - attached: None, - secondary: vec![], - } - } - pub(crate) fn all_pageservers(&self) -> Vec { - let mut result = Vec::new(); - if let Some(p) = self.attached { - result.push(p) - } - - result.extend(self.secondary.iter().copied()); - - result - } - - /// When a node goes offline, we update intents to avoid using it - /// as their attached pageserver. 
- /// - /// Returns true if a change was made - pub(crate) fn notify_offline(&mut self, node_id: NodeId) -> bool { - if self.attached == Some(node_id) { - self.attached = None; - self.secondary.push(node_id); - true - } else { - false - } - } + /// Set [`TenantState::pending_compute_notification`] from this flag + pub(crate) pending_compute_notification: bool, } impl ObservedState { @@ -217,15 +356,17 @@ impl TenantState { tenant_shard_id, policy, intent: IntentState::default(), - generation: Generation::new(0), + generation: Some(Generation::new(0)), shard, observed: ObservedState::default(), config: TenantConfig::default(), reconciler: None, + splitting: SplitState::Idle, sequence: Sequence(1), waiter: Arc::new(SeqWait::new(Sequence(0))), error_waiter: Arc::new(SeqWait::new(Sequence(0))), last_error: Arc::default(), + pending_compute_notification: false, } } @@ -233,7 +374,7 @@ impl TenantState { /// [`ObservedState`], even if it violates my [`PlacementPolicy`]. Call [`Self::schedule`] next, /// to get an intent state that complies with placement policy. The overall goal is to do scheduling /// in a way that makes use of any configured locations that already exist in the outside world. - pub(crate) fn intent_from_observed(&mut self) { + pub(crate) fn intent_from_observed(&mut self, scheduler: &mut Scheduler) { // Choose an attached location by filtering observed locations, and then sorting to get the highest // generation let mut attached_locs = self @@ -258,69 +399,131 @@ impl TenantState { attached_locs.sort_by_key(|i| i.1); if let Some((node_id, _gen)) = attached_locs.into_iter().last() { - self.intent.attached = Some(*node_id); + self.intent.set_attached(scheduler, Some(*node_id)); } // All remaining observed locations generate secondary intents. 
This includes None // observations, as these may well have some local content on disk that is usable (this // is an edge case that might occur if we restarted during a migration or other change) + // + // We may leave intent.attached empty if we didn't find any attached locations: [`Self::schedule`] + // will take care of promoting one of these secondaries to be attached. self.observed.locations.keys().for_each(|node_id| { if Some(*node_id) != self.intent.attached { - self.intent.secondary.push(*node_id); + self.intent.push_secondary(scheduler, *node_id); } }); } + /// Part of [`Self::schedule`] that is used to choose exactly one node to act as the + /// attached pageserver for a shard. + /// + /// Returns whether we modified it, and the NodeId selected. + fn schedule_attached( + &mut self, + scheduler: &mut Scheduler, + ) -> Result<(bool, NodeId), ScheduleError> { + // No work to do if we already have an attached tenant + if let Some(node_id) = self.intent.attached { + return Ok((false, node_id)); + } + + if let Some(promote_secondary) = scheduler.node_preferred(&self.intent.secondary) { + // Promote a secondary + tracing::debug!("Promoted secondary {} to attached", promote_secondary); + self.intent.promote_attached(scheduler, promote_secondary); + Ok((true, promote_secondary)) + } else { + // Pick a fresh node: either we had no secondaries or none were schedulable + let node_id = scheduler.schedule_shard(&self.intent.secondary)?; + tracing::debug!("Selected {} as attached", node_id); + self.intent.set_attached(scheduler, Some(node_id)); + Ok((true, node_id)) + } + } + pub(crate) fn schedule(&mut self, scheduler: &mut Scheduler) -> Result<(), ScheduleError> { // TODO: before scheduling new nodes, check if any existing content in // self.intent refers to pageservers that are offline, and pick other // pageservers if so. + // TODO: respect the splitting bit on tenants: if they are currently splitting then we may not + // change their attach location. 
+ // Build the set of pageservers already in use by this tenant, to avoid scheduling // more work on the same pageservers we're already using. - let mut used_pageservers = self.intent.all_pageservers(); let mut modified = false; + // Add/remove nodes to fulfil policy use PlacementPolicy::*; match self.policy { Single => { // Should have exactly one attached, and zero secondaries - if self.intent.attached.is_none() { - let node_id = scheduler.schedule_shard(&used_pageservers)?; - self.intent.attached = Some(node_id); - used_pageservers.push(node_id); + if !self.intent.secondary.is_empty() { + self.intent.clear_secondary(scheduler); modified = true; } + + let (modified_attached, _attached_node_id) = self.schedule_attached(scheduler)?; + modified |= modified_attached; + if !self.intent.secondary.is_empty() { - self.intent.secondary.clear(); + self.intent.clear_secondary(scheduler); modified = true; } } Double(secondary_count) => { - // Should have exactly one attached, and N secondaries - if self.intent.attached.is_none() { - let node_id = scheduler.schedule_shard(&used_pageservers)?; - self.intent.attached = Some(node_id); - used_pageservers.push(node_id); + let retain_secondaries = if self.intent.attached.is_none() + && scheduler.node_preferred(&self.intent.secondary).is_some() + { + // If we have no attached, and one of the secondaries is elegible to be promoted, retain + // one more secondary than we usually would, as one of them will become attached futher down this function. 
+ secondary_count + 1 + } else { + secondary_count + }; + + while self.intent.secondary.len() > retain_secondaries { + // We have no particular preference for one secondary location over another: just + // arbitrarily drop from the end + self.intent.pop_secondary(scheduler); modified = true; } + // Should have exactly one attached, and N secondaries + let (modified_attached, attached_node_id) = self.schedule_attached(scheduler)?; + modified |= modified_attached; + + let mut used_pageservers = vec![attached_node_id]; while self.intent.secondary.len() < secondary_count { let node_id = scheduler.schedule_shard(&used_pageservers)?; - self.intent.secondary.push(node_id); + self.intent.push_secondary(scheduler, node_id); used_pageservers.push(node_id); modified = true; } } - Detached => { - // Should have no attached or secondary pageservers - if self.intent.attached.is_some() { - self.intent.attached = None; + Secondary => { + if let Some(node_id) = self.intent.get_attached() { + // Populate secondary by demoting the attached node + self.intent.demote_attached(*node_id); + modified = true; + } else if self.intent.secondary.is_empty() { + // Populate secondary by scheduling a fresh node + let node_id = scheduler.schedule_shard(&[])?; + self.intent.push_secondary(scheduler, node_id); modified = true; } - - if !self.intent.secondary.is_empty() { - self.intent.secondary.clear(); + while self.intent.secondary.len() > 1 { + // We have no particular preference for one secondary location over another: just + // arbitrarily drop from the end + self.intent.pop_secondary(scheduler); + modified = true; + } + } + Detached => { + // Never add locations in this mode + if self.intent.get_attached().is_some() || !self.intent.get_secondary().is_empty() { + self.intent.clear(scheduler); modified = true; } } @@ -333,13 +536,52 @@ impl TenantState { Ok(()) } - fn dirty(&self) -> bool { + /// Query whether the tenant's observed state for attached node matches its intent state, and if so, + 
/// yield the node ID. This is appropriate for emitting compute hook notifications: we are checking that + /// the node in question is not only where we intend to attach, but that the tenant is indeed already attached there. + /// + /// Reconciliation may still be needed for other aspects of state such as secondaries (see [`Self::dirty`]): this + /// funciton should not be used to decide whether to reconcile. + pub(crate) fn stably_attached(&self) -> Option { + if let Some(attach_intent) = self.intent.attached { + match self.observed.locations.get(&attach_intent) { + Some(loc) => match &loc.conf { + Some(conf) => match conf.mode { + LocationConfigMode::AttachedMulti + | LocationConfigMode::AttachedSingle + | LocationConfigMode::AttachedStale => { + // Our intent and observed state agree that this node is in an attached state. + Some(attach_intent) + } + // Our observed config is not an attached state + _ => None, + }, + // Our observed state is None, i.e. in flux + None => None, + }, + // We have no observed state for this node + None => None, + } + } else { + // Our intent is not to attach + None + } + } + + fn dirty(&self, nodes: &Arc>) -> bool { + let mut dirty_nodes = HashSet::new(); + if let Some(node_id) = self.intent.attached { - let wanted_conf = attached_location_conf(self.generation, &self.shard, &self.config); + // Maybe panic: it is a severe bug if we try to attach while generation is null. 
+ let generation = self + .generation + .expect("Attempted to enter attached state without a generation"); + + let wanted_conf = attached_location_conf(generation, &self.shard, &self.config); match self.observed.locations.get(&node_id) { Some(conf) if conf.conf.as_ref() == Some(&wanted_conf) => {} Some(_) | None => { - return true; + dirty_nodes.insert(node_id); } } } @@ -349,21 +591,39 @@ impl TenantState { match self.observed.locations.get(node_id) { Some(conf) if conf.conf.as_ref() == Some(&wanted_conf) => {} Some(_) | None => { - return true; + dirty_nodes.insert(*node_id); } } } - false + for node_id in self.observed.locations.keys() { + if self.intent.attached != Some(*node_id) && !self.intent.secondary.contains(node_id) { + // We have observed state that isn't part of our intent: need to clean it up. + dirty_nodes.insert(*node_id); + } + } + + dirty_nodes.retain(|node_id| { + nodes + .get(node_id) + .map(|n| n.is_available()) + .unwrap_or(false) + }); + + !dirty_nodes.is_empty() } + #[allow(clippy::too_many_arguments)] + #[instrument(skip_all, fields(tenant_id=%self.tenant_shard_id.tenant_id, shard_id=%self.tenant_shard_id.shard_slug()))] pub(crate) fn maybe_reconcile( &mut self, - result_tx: tokio::sync::mpsc::UnboundedSender, + result_tx: &tokio::sync::mpsc::UnboundedSender, pageservers: &Arc>, compute_hook: &Arc, service_config: &service::Config, persistence: &Arc, + gate: &Gate, + cancel: &CancellationToken, ) -> Option { // If there are any ambiguous observed states, and the nodes they refer to are available, // we should reconcile to clean them up. 
@@ -372,22 +632,39 @@ impl TenantState { let node = pageservers .get(node_id) .expect("Nodes may not be removed while referenced"); - if observed_loc.conf.is_none() - && !matches!(node.availability, NodeAvailability::Offline) - { + if observed_loc.conf.is_none() && node.is_available() { dirty_observed = true; break; } } - if !self.dirty() && !dirty_observed { + let active_nodes_dirty = self.dirty(pageservers); + + // Even if there is no pageserver work to be done, if we have a pending notification to computes, + // wake up a reconciler to send it. + let do_reconcile = + active_nodes_dirty || dirty_observed || self.pending_compute_notification; + + if !do_reconcile { tracing::info!("Not dirty, no reconciliation needed."); return None; } + // If we are currently splitting, then never start a reconciler task: the splitting logic + // requires that shards are not interfered with while it runs. Do this check here rather than + // up top, so that we only log this message if we would otherwise have done a reconciliation. + if !matches!(self.splitting, SplitState::Idle) { + tracing::info!("Refusing to reconcile, splitting in progress"); + return None; + } + // Reconcile already in flight for the current sequence? if let Some(handle) = &self.reconciler { if handle.sequence == self.sequence { + tracing::info!( + "Reconciliation already in progress for sequence {:?}", + self.sequence, + ); return Some(ReconcilerWaiter { tenant_shard_id: self.tenant_shard_id, seq_wait: self.waiter.clone(), @@ -398,62 +675,126 @@ impl TenantState { } } + // Build list of nodes from which the reconciler should detach + let mut detach = Vec::new(); + for node_id in self.observed.locations.keys() { + if self.intent.get_attached() != &Some(*node_id) + && !self.intent.secondary.contains(node_id) + { + detach.push( + pageservers + .get(node_id) + .expect("Intent references non-existent pageserver") + .clone(), + ) + } + } + // Reconcile in flight for a stale sequence? 
Our sequence's task will wait for it before // doing our sequence's work. let old_handle = self.reconciler.take(); - let cancel = CancellationToken::new(); + let Ok(gate_guard) = gate.enter() else { + // Shutting down, don't start a reconciler + return None; + }; + + // Advance the sequence before spawning a reconciler, so that sequence waiters + // can distinguish between before+after the reconcile completes. + self.sequence = self.sequence.next(); + + let reconciler_cancel = cancel.child_token(); + let reconciler_intent = TargetState::from_intent(pageservers, &self.intent); let mut reconciler = Reconciler { tenant_shard_id: self.tenant_shard_id, shard: self.shard, generation: self.generation, - intent: self.intent.clone(), + intent: reconciler_intent, + detach, config: self.config.clone(), observed: self.observed.clone(), - pageservers: pageservers.clone(), compute_hook: compute_hook.clone(), service_config: service_config.clone(), - cancel: cancel.clone(), + _gate_guard: gate_guard, + cancel: reconciler_cancel.clone(), persistence: persistence.clone(), + compute_notify_failure: false, }; let reconcile_seq = self.sequence; - tracing::info!("Spawning Reconciler for sequence {}", self.sequence); - let join_handle = tokio::task::spawn(async move { - // Wait for any previous reconcile task to complete before we start - if let Some(old_handle) = old_handle { - old_handle.cancel.cancel(); - if let Err(e) = old_handle.handle.await { - // We can't do much with this other than log it: the task is done, so - // we may proceed with our work. 
- tracing::error!("Unexpected join error waiting for reconcile task: {e}"); + tracing::info!(seq=%reconcile_seq, "Spawning Reconciler for sequence {}", self.sequence); + let must_notify = self.pending_compute_notification; + let reconciler_span = tracing::info_span!(parent: None, "reconciler", seq=%reconcile_seq, + tenant_id=%reconciler.tenant_shard_id.tenant_id, + shard_id=%reconciler.tenant_shard_id.shard_slug()); + metrics::RECONCILER.spawned.inc(); + let result_tx = result_tx.clone(); + let join_handle = tokio::task::spawn( + async move { + // Wait for any previous reconcile task to complete before we start + if let Some(old_handle) = old_handle { + old_handle.cancel.cancel(); + if let Err(e) = old_handle.handle.await { + // We can't do much with this other than log it: the task is done, so + // we may proceed with our work. + tracing::error!("Unexpected join error waiting for reconcile task: {e}"); + } } - } - // Early check for cancellation before doing any work - // TODO: wrap all remote API operations in cancellation check - // as well. - if reconciler.cancel.is_cancelled() { - return; - } + // Early check for cancellation before doing any work + // TODO: wrap all remote API operations in cancellation check + // as well. 
+ if reconciler.cancel.is_cancelled() { + metrics::RECONCILER + .complete + .with_label_values(&[metrics::ReconcilerMetrics::CANCEL]) + .inc(); + return; + } - let result = reconciler.reconcile().await; - result_tx - .send(ReconcileResult { - sequence: reconcile_seq, - result, - tenant_shard_id: reconciler.tenant_shard_id, - generation: reconciler.generation, - observed: reconciler.observed, - }) - .ok(); - }); + // Attempt to make observed state match intent state + let result = reconciler.reconcile().await; + + // If we know we had a pending compute notification from some previous action, send a notification irrespective + // of whether the above reconcile() did any work + if result.is_ok() && must_notify { + // If this fails we will send the need to retry in [`ReconcileResult::pending_compute_notification`] + reconciler.compute_notify().await.ok(); + } + + // Update result counter + match &result { + Ok(_) => metrics::RECONCILER + .complete + .with_label_values(&[metrics::ReconcilerMetrics::SUCCESS]), + Err(ReconcileError::Cancel) => metrics::RECONCILER + .complete + .with_label_values(&[metrics::ReconcilerMetrics::CANCEL]), + Err(_) => metrics::RECONCILER + .complete + .with_label_values(&[metrics::ReconcilerMetrics::ERROR]), + } + .inc(); + + result_tx + .send(ReconcileResult { + sequence: reconcile_seq, + result, + tenant_shard_id: reconciler.tenant_shard_id, + generation: reconciler.generation, + observed: reconciler.observed, + pending_compute_notification: reconciler.compute_notify_failure, + }) + .ok(); + } + .instrument(reconciler_span), + ); self.reconciler = Some(ReconcilerHandle { sequence: self.sequence, handle: join_handle, - cancel, + cancel: reconciler_cancel, }); Some(ReconcilerWaiter { @@ -464,4 +805,181 @@ impl TenantState { seq: self.sequence, }) } + + /// Called when a ReconcileResult has been emitted and the service is updating + /// our state: if the result is from a sequence >= my ReconcileHandle, then drop + /// the handle to indicate 
there is no longer a reconciliation in progress. + pub(crate) fn reconcile_complete(&mut self, sequence: Sequence) { + if let Some(reconcile_handle) = &self.reconciler { + if reconcile_handle.sequence <= sequence { + self.reconciler = None; + } + } + } + + // If we had any state at all referring to this node ID, drop it. Does not + // attempt to reschedule. + pub(crate) fn deref_node(&mut self, node_id: NodeId) { + if self.intent.attached == Some(node_id) { + self.intent.attached = None; + } + + self.intent.secondary.retain(|n| n != &node_id); + + self.observed.locations.remove(&node_id); + + debug_assert!(!self.intent.all_pageservers().contains(&node_id)); + } + + pub(crate) fn to_persistent(&self) -> TenantShardPersistence { + TenantShardPersistence { + tenant_id: self.tenant_shard_id.tenant_id.to_string(), + shard_number: self.tenant_shard_id.shard_number.0 as i32, + shard_count: self.tenant_shard_id.shard_count.literal() as i32, + shard_stripe_size: self.shard.stripe_size.0 as i32, + generation: self.generation.map(|g| g.into().unwrap_or(0) as i32), + generation_pageserver: self.intent.get_attached().map(|n| n.0 as i64), + placement_policy: serde_json::to_string(&self.policy).unwrap(), + config: serde_json::to_string(&self.config).unwrap(), + splitting: SplitState::default(), + } + } +} + +#[cfg(test)] +pub(crate) mod tests { + use pageserver_api::{ + controller_api::NodeAvailability, + shard::{ShardCount, ShardNumber}, + }; + use utils::id::TenantId; + + use crate::scheduler::test_utils::make_test_nodes; + + use super::*; + + fn make_test_tenant_shard(policy: PlacementPolicy) -> TenantState { + let tenant_id = TenantId::generate(); + let shard_number = ShardNumber(0); + let shard_count = ShardCount::new(1); + + let tenant_shard_id = TenantShardId { + tenant_id, + shard_number, + shard_count, + }; + TenantState::new( + tenant_shard_id, + ShardIdentity::new( + shard_number, + shard_count, + pageserver_api::shard::ShardStripeSize(32768), + ) + .unwrap(), + 
policy, + ) + } + + /// Test the scheduling behaviors used when a tenant configured for HA is subject + /// to nodes being marked offline. + #[test] + fn tenant_ha_scheduling() -> anyhow::Result<()> { + // Start with three nodes. Our tenant will only use two. The third one is + // expected to remain unused. + let mut nodes = make_test_nodes(3); + + let mut scheduler = Scheduler::new(nodes.values()); + + let mut tenant_state = make_test_tenant_shard(PlacementPolicy::Double(1)); + tenant_state + .schedule(&mut scheduler) + .expect("we have enough nodes, scheduling should work"); + + // Expect to initially be schedule on to different nodes + assert_eq!(tenant_state.intent.secondary.len(), 1); + assert!(tenant_state.intent.attached.is_some()); + + let attached_node_id = tenant_state.intent.attached.unwrap(); + let secondary_node_id = *tenant_state.intent.secondary.iter().last().unwrap(); + assert_ne!(attached_node_id, secondary_node_id); + + // Notifying the attached node is offline should demote it to a secondary + let changed = tenant_state.intent.demote_attached(attached_node_id); + assert!(changed); + assert!(tenant_state.intent.attached.is_none()); + assert_eq!(tenant_state.intent.secondary.len(), 2); + + // Update the scheduler state to indicate the node is offline + nodes + .get_mut(&attached_node_id) + .unwrap() + .set_availability(NodeAvailability::Offline); + scheduler.node_upsert(nodes.get(&attached_node_id).unwrap()); + + // Scheduling the node should promote the still-available secondary node to attached + tenant_state + .schedule(&mut scheduler) + .expect("active nodes are available"); + assert_eq!(tenant_state.intent.attached.unwrap(), secondary_node_id); + + // The original attached node should have been retained as a secondary + assert_eq!( + *tenant_state.intent.secondary.iter().last().unwrap(), + attached_node_id + ); + + tenant_state.intent.clear(&mut scheduler); + + Ok(()) + } + + #[test] + fn intent_from_observed() -> anyhow::Result<()> { + let 
nodes = make_test_nodes(3); + let mut scheduler = Scheduler::new(nodes.values()); + + let mut tenant_state = make_test_tenant_shard(PlacementPolicy::Double(1)); + + tenant_state.observed.locations.insert( + NodeId(3), + ObservedStateLocation { + conf: Some(LocationConfig { + mode: LocationConfigMode::AttachedMulti, + generation: Some(2), + secondary_conf: None, + shard_number: tenant_state.shard.number.0, + shard_count: tenant_state.shard.count.literal(), + shard_stripe_size: tenant_state.shard.stripe_size.0, + tenant_conf: TenantConfig::default(), + }), + }, + ); + + tenant_state.observed.locations.insert( + NodeId(2), + ObservedStateLocation { + conf: Some(LocationConfig { + mode: LocationConfigMode::AttachedStale, + generation: Some(1), + secondary_conf: None, + shard_number: tenant_state.shard.number.0, + shard_count: tenant_state.shard.count.literal(), + shard_stripe_size: tenant_state.shard.stripe_size.0, + tenant_conf: TenantConfig::default(), + }), + }, + ); + + tenant_state.intent_from_observed(&mut scheduler); + + // The highest generationed attached location gets used as attached + assert_eq!(tenant_state.intent.attached, Some(NodeId(3))); + // Other locations get used as secondary + assert_eq!(tenant_state.intent.secondary, vec![NodeId(2)]); + + scheduler.consistency_check(nodes.values(), [&tenant_state].into_iter())?; + + tenant_state.intent.clear(&mut scheduler); + Ok(()) + } } diff --git a/control_plane/src/attachment_service.rs b/control_plane/src/attachment_service.rs deleted file mode 100644 index 2d43c46270..0000000000 --- a/control_plane/src/attachment_service.rs +++ /dev/null @@ -1,422 +0,0 @@ -use crate::{background_process, local_env::LocalEnv}; -use camino::Utf8PathBuf; -use hyper::Method; -use pageserver_api::{ - models::{ShardParameters, TenantCreateRequest, TimelineCreateRequest, TimelineInfo}, - shard::TenantShardId, -}; -use pageserver_client::mgmt_api::ResponseErrorMessageExt; -use postgres_backend::AuthType; -use 
postgres_connection::parse_host_port; -use serde::{de::DeserializeOwned, Deserialize, Serialize}; -use std::{path::PathBuf, str::FromStr}; -use tracing::instrument; -use utils::{ - auth::{Claims, Scope}, - id::{NodeId, TenantId}, -}; - -pub struct AttachmentService { - env: LocalEnv, - listen: String, - path: PathBuf, - jwt_token: Option, - public_key_path: Option, - client: reqwest::Client, -} - -const COMMAND: &str = "attachment_service"; - -#[derive(Serialize, Deserialize)] -pub struct AttachHookRequest { - pub tenant_shard_id: TenantShardId, - pub node_id: Option, -} - -#[derive(Serialize, Deserialize)] -pub struct AttachHookResponse { - pub gen: Option, -} - -#[derive(Serialize, Deserialize)] -pub struct InspectRequest { - pub tenant_shard_id: TenantShardId, -} - -#[derive(Serialize, Deserialize)] -pub struct InspectResponse { - pub attachment: Option<(u32, NodeId)>, -} - -#[derive(Serialize, Deserialize)] -pub struct TenantCreateResponseShard { - pub node_id: NodeId, - pub generation: u32, -} - -#[derive(Serialize, Deserialize)] -pub struct TenantCreateResponse { - pub shards: Vec, -} - -#[derive(Serialize, Deserialize)] -pub struct NodeRegisterRequest { - pub node_id: NodeId, - - pub listen_pg_addr: String, - pub listen_pg_port: u16, - - pub listen_http_addr: String, - pub listen_http_port: u16, -} - -#[derive(Serialize, Deserialize)] -pub struct NodeConfigureRequest { - pub node_id: NodeId, - - pub availability: Option, - pub scheduling: Option, -} - -#[derive(Serialize, Deserialize, Debug)] -pub struct TenantLocateResponseShard { - pub shard_id: TenantShardId, - pub node_id: NodeId, - - pub listen_pg_addr: String, - pub listen_pg_port: u16, - - pub listen_http_addr: String, - pub listen_http_port: u16, -} - -#[derive(Serialize, Deserialize)] -pub struct TenantLocateResponse { - pub shards: Vec, - pub shard_params: ShardParameters, -} - -/// Explicitly migrating a particular shard is a low level operation -/// TODO: higher level "Reschedule tenant" 
operation where the request -/// specifies some constraints, e.g. asking it to get off particular node(s) -#[derive(Serialize, Deserialize, Debug)] -pub struct TenantShardMigrateRequest { - pub tenant_shard_id: TenantShardId, - pub node_id: NodeId, -} - -#[derive(Serialize, Deserialize, Clone, Copy)] -pub enum NodeAvailability { - // Normal, happy state - Active, - // Offline: Tenants shouldn't try to attach here, but they may assume that their - // secondary locations on this node still exist. Newly added nodes are in this - // state until we successfully contact them. - Offline, -} - -impl FromStr for NodeAvailability { - type Err = anyhow::Error; - - fn from_str(s: &str) -> Result { - match s { - "active" => Ok(Self::Active), - "offline" => Ok(Self::Offline), - _ => Err(anyhow::anyhow!("Unknown availability state '{s}'")), - } - } -} - -/// FIXME: this is a duplicate of the type in the attachment_service crate, because the -/// type needs to be defined with diesel traits in there. -#[derive(Serialize, Deserialize, Clone, Copy)] -pub enum NodeSchedulingPolicy { - Active, - Filling, - Pause, - Draining, -} - -impl FromStr for NodeSchedulingPolicy { - type Err = anyhow::Error; - - fn from_str(s: &str) -> Result { - match s { - "active" => Ok(Self::Active), - "filling" => Ok(Self::Filling), - "pause" => Ok(Self::Pause), - "draining" => Ok(Self::Draining), - _ => Err(anyhow::anyhow!("Unknown scheduling state '{s}'")), - } - } -} - -impl From for String { - fn from(value: NodeSchedulingPolicy) -> String { - use NodeSchedulingPolicy::*; - match value { - Active => "active", - Filling => "filling", - Pause => "pause", - Draining => "draining", - } - .to_string() - } -} - -#[derive(Serialize, Deserialize, Debug)] -pub struct TenantShardMigrateResponse {} - -impl AttachmentService { - pub fn from_env(env: &LocalEnv) -> Self { - let path = env.base_data_dir.join("attachments.json"); - - // Makes no sense to construct this if pageservers aren't going to use it: assume - // 
pageservers have control plane API set - let listen_url = env.control_plane_api.clone().unwrap(); - - let listen = format!( - "{}:{}", - listen_url.host_str().unwrap(), - listen_url.port().unwrap() - ); - - // Assume all pageservers have symmetric auth configuration: this service - // expects to use one JWT token to talk to all of them. - let ps_conf = env - .pageservers - .first() - .expect("Config is validated to contain at least one pageserver"); - let (jwt_token, public_key_path) = match ps_conf.http_auth_type { - AuthType::Trust => (None, None), - AuthType::NeonJWT => { - let jwt_token = env - .generate_auth_token(&Claims::new(None, Scope::PageServerApi)) - .unwrap(); - - // If pageserver auth is enabled, this implicitly enables auth for this service, - // using the same credentials. - let public_key_path = - camino::Utf8PathBuf::try_from(env.base_data_dir.join("auth_public_key.pem")) - .unwrap(); - (Some(jwt_token), Some(public_key_path)) - } - }; - - Self { - env: env.clone(), - path, - listen, - jwt_token, - public_key_path, - client: reqwest::ClientBuilder::new() - .build() - .expect("Failed to construct http client"), - } - } - - fn pid_file(&self) -> Utf8PathBuf { - Utf8PathBuf::from_path_buf(self.env.base_data_dir.join("attachment_service.pid")) - .expect("non-Unicode path") - } - - pub async fn start(&self) -> anyhow::Result<()> { - let path_str = self.path.to_string_lossy(); - - let mut args = vec!["-l", &self.listen, "-p", &path_str] - .into_iter() - .map(|s| s.to_string()) - .collect::>(); - if let Some(jwt_token) = &self.jwt_token { - args.push(format!("--jwt-token={jwt_token}")); - } - - if let Some(public_key_path) = &self.public_key_path { - args.push(format!("--public-key={public_key_path}")); - } - - let result = background_process::start_process( - COMMAND, - &self.env.base_data_dir, - &self.env.attachment_service_bin(), - args, - [( - "NEON_REPO_DIR".to_string(), - self.env.base_data_dir.to_string_lossy().to_string(), - )], - 
background_process::InitialPidFile::Create(self.pid_file()), - || async { - match self.status().await { - Ok(_) => Ok(true), - Err(_) => Ok(false), - } - }, - ) - .await; - - // TODO: shouldn't we bail if we fail to spawn the process? - for ps_conf in &self.env.pageservers { - let (pg_host, pg_port) = - parse_host_port(&ps_conf.listen_pg_addr).expect("Unable to parse listen_pg_addr"); - let (http_host, http_port) = parse_host_port(&ps_conf.listen_http_addr) - .expect("Unable to parse listen_http_addr"); - self.node_register(NodeRegisterRequest { - node_id: ps_conf.id, - listen_pg_addr: pg_host.to_string(), - listen_pg_port: pg_port.unwrap_or(5432), - listen_http_addr: http_host.to_string(), - listen_http_port: http_port.unwrap_or(80), - }) - .await?; - } - - result - } - - pub fn stop(&self, immediate: bool) -> anyhow::Result<()> { - background_process::stop_process(immediate, COMMAND, &self.pid_file()) - } - /// Simple HTTP request wrapper for calling into attachment service - async fn dispatch( - &self, - method: hyper::Method, - path: String, - body: Option, - ) -> anyhow::Result - where - RQ: Serialize + Sized, - RS: DeserializeOwned + Sized, - { - let url = self - .env - .control_plane_api - .clone() - .unwrap() - .join(&path) - .unwrap(); - - let mut builder = self.client.request(method, url); - if let Some(body) = body { - builder = builder.json(&body) - } - if let Some(jwt_token) = &self.jwt_token { - builder = builder.header( - reqwest::header::AUTHORIZATION, - format!("Bearer {jwt_token}"), - ); - } - - let response = builder.send().await?; - let response = response.error_from_body().await?; - - Ok(response - .json() - .await - .map_err(pageserver_client::mgmt_api::Error::ReceiveBody)?) 
- } - - /// Call into the attach_hook API, for use before handing out attachments to pageservers - #[instrument(skip(self))] - pub async fn attach_hook( - &self, - tenant_shard_id: TenantShardId, - pageserver_id: NodeId, - ) -> anyhow::Result> { - let request = AttachHookRequest { - tenant_shard_id, - node_id: Some(pageserver_id), - }; - - let response = self - .dispatch::<_, AttachHookResponse>( - Method::POST, - "attach-hook".to_string(), - Some(request), - ) - .await?; - - Ok(response.gen) - } - - #[instrument(skip(self))] - pub async fn inspect( - &self, - tenant_shard_id: TenantShardId, - ) -> anyhow::Result> { - let request = InspectRequest { tenant_shard_id }; - - let response = self - .dispatch::<_, InspectResponse>(Method::POST, "inspect".to_string(), Some(request)) - .await?; - - Ok(response.attachment) - } - - #[instrument(skip(self))] - pub async fn tenant_create( - &self, - req: TenantCreateRequest, - ) -> anyhow::Result { - self.dispatch(Method::POST, "tenant".to_string(), Some(req)) - .await - } - - #[instrument(skip(self))] - pub async fn tenant_locate(&self, tenant_id: TenantId) -> anyhow::Result { - self.dispatch::<(), _>(Method::GET, format!("tenant/{tenant_id}/locate"), None) - .await - } - - #[instrument(skip(self))] - pub async fn tenant_migrate( - &self, - tenant_shard_id: TenantShardId, - node_id: NodeId, - ) -> anyhow::Result { - self.dispatch( - Method::PUT, - format!("tenant/{tenant_shard_id}/migrate"), - Some(TenantShardMigrateRequest { - tenant_shard_id, - node_id, - }), - ) - .await - } - - #[instrument(skip_all, fields(node_id=%req.node_id))] - pub async fn node_register(&self, req: NodeRegisterRequest) -> anyhow::Result<()> { - self.dispatch::<_, ()>(Method::POST, "node".to_string(), Some(req)) - .await - } - - #[instrument(skip_all, fields(node_id=%req.node_id))] - pub async fn node_configure(&self, req: NodeConfigureRequest) -> anyhow::Result<()> { - self.dispatch::<_, ()>( - Method::PUT, - format!("node/{}/config", req.node_id), - 
Some(req), - ) - .await - } - - #[instrument(skip(self))] - pub async fn status(&self) -> anyhow::Result<()> { - self.dispatch::<(), ()>(Method::GET, "status".to_string(), None) - .await - } - - #[instrument(skip_all, fields(%tenant_id, timeline_id=%req.new_timeline_id))] - pub async fn tenant_timeline_create( - &self, - tenant_id: TenantId, - req: TimelineCreateRequest, - ) -> anyhow::Result { - self.dispatch( - Method::POST, - format!("tenant/{tenant_id}/timeline"), - Some(req), - ) - .await - } -} diff --git a/control_plane/src/background_process.rs b/control_plane/src/background_process.rs index 3ffb8734d0..0e59b28230 100644 --- a/control_plane/src/background_process.rs +++ b/control_plane/src/background_process.rs @@ -72,7 +72,6 @@ where let log_path = datadir.join(format!("{process_name}.log")); let process_log_file = fs::OpenOptions::new() .create(true) - .write(true) .append(true) .open(&log_path) .with_context(|| { @@ -256,7 +255,9 @@ fn fill_remote_storage_secrets_vars(mut cmd: &mut Command) -> &mut Command { for env_key in [ "AWS_ACCESS_KEY_ID", "AWS_SECRET_ACCESS_KEY", - "AWS_SESSION_TOKEN", + "AWS_PROFILE", + // HOME is needed in combination with `AWS_PROFILE` to pick up the SSO sessions. 
+ "HOME", "AZURE_STORAGE_ACCOUNT", "AZURE_STORAGE_ACCESS_KEY", ] { diff --git a/control_plane/src/bin/neon_local.rs b/control_plane/src/bin/neon_local.rs index 279c47398f..6c722f36b4 100644 --- a/control_plane/src/bin/neon_local.rs +++ b/control_plane/src/bin/neon_local.rs @@ -8,14 +8,15 @@ use anyhow::{anyhow, bail, Context, Result}; use clap::{value_parser, Arg, ArgAction, ArgMatches, Command, ValueEnum}; use compute_api::spec::ComputeMode; -use control_plane::attachment_service::{ - AttachmentService, NodeAvailability, NodeConfigureRequest, NodeSchedulingPolicy, -}; use control_plane::endpoint::ComputeControlPlane; use control_plane::local_env::{InitForceMode, LocalEnv}; use control_plane::pageserver::{PageServerNode, PAGESERVER_REMOTE_STORAGE_DIR}; use control_plane::safekeeper::SafekeeperNode; +use control_plane::storage_controller::StorageController; use control_plane::{broker, local_env}; +use pageserver_api::controller_api::{ + NodeAvailability, NodeConfigureRequest, NodeSchedulingPolicy, PlacementPolicy, +}; use pageserver_api::models::{ ShardParameters, TenantCreateRequest, TimelineCreateRequest, TimelineInfo, }; @@ -51,7 +52,7 @@ project_git_version!(GIT_VERSION); const DEFAULT_PG_VERSION: &str = "15"; -const DEFAULT_PAGESERVER_CONTROL_PLANE_API: &str = "http://127.0.0.1:1234/"; +const DEFAULT_PAGESERVER_CONTROL_PLANE_API: &str = "http://127.0.0.1:1234/upcall/v1/"; fn default_conf(num_pageservers: u16) -> String { let mut template = format!( @@ -135,9 +136,9 @@ fn main() -> Result<()> { "tenant" => rt.block_on(handle_tenant(sub_args, &mut env)), "timeline" => rt.block_on(handle_timeline(sub_args, &mut env)), "start" => rt.block_on(handle_start_all(sub_args, &env)), - "stop" => handle_stop_all(sub_args, &env), + "stop" => rt.block_on(handle_stop_all(sub_args, &env)), "pageserver" => rt.block_on(handle_pageserver(sub_args, &env)), - "attachment_service" => rt.block_on(handle_attachment_service(sub_args, &env)), + "storage_controller" => 
rt.block_on(handle_storage_controller(sub_args, &env)), "safekeeper" => rt.block_on(handle_safekeeper(sub_args, &env)), "endpoint" => rt.block_on(handle_endpoint(sub_args, &env)), "mappings" => handle_mappings(sub_args, &mut env), @@ -434,27 +435,33 @@ async fn handle_tenant( let shard_stripe_size: Option = create_match.get_one::("shard-stripe-size").cloned(); + let placement_policy = match create_match.get_one::("placement-policy") { + Some(s) if !s.is_empty() => serde_json::from_str::(s)?, + _ => PlacementPolicy::Single, + }; + let tenant_conf = PageServerNode::parse_config(tenant_conf)?; // If tenant ID was not specified, generate one let tenant_id = parse_tenant_id(create_match)?.unwrap_or_else(TenantId::generate); - // We must register the tenant with the attachment service, so + // We must register the tenant with the storage controller, so // that when the pageserver restarts, it will be re-attached. - let attachment_service = AttachmentService::from_env(env); - attachment_service + let storage_controller = StorageController::from_env(env); + storage_controller .tenant_create(TenantCreateRequest { // Note that ::unsharded here isn't actually because the tenant is unsharded, its because the - // attachment service expecfs a shard-naive tenant_id in this attribute, and the TenantCreateRequest - // type is used both in attachment service (for creating tenants) and in pageserver (for creating shards) + // storage controller expecfs a shard-naive tenant_id in this attribute, and the TenantCreateRequest + // type is used both in storage controller (for creating tenants) and in pageserver (for creating shards) new_tenant_id: TenantShardId::unsharded(tenant_id), generation: None, shard_parameters: ShardParameters { - count: ShardCount(shard_count), + count: ShardCount::new(shard_count), stripe_size: shard_stripe_size .map(ShardStripeSize) .unwrap_or(ShardParameters::DEFAULT_STRIPE_SIZE), }, + placement_policy: Some(placement_policy), config: tenant_conf, }) .await?; 
@@ -469,9 +476,9 @@ async fn handle_tenant( .context("Failed to parse postgres version from the argument string")?; // FIXME: passing None for ancestor_start_lsn is not kosher in a sharded world: we can't have - // different shards picking different start lsns. Maybe we have to teach attachment service + // different shards picking different start lsns. Maybe we have to teach storage controller // to let shard 0 branch first and then propagate the chosen LSN to other shards. - attachment_service + storage_controller .tenant_timeline_create( tenant_id, TimelineCreateRequest { @@ -521,8 +528,8 @@ async fn handle_tenant( let new_pageserver = get_pageserver(env, matches)?; let new_pageserver_id = new_pageserver.conf.id; - let attachment_service = AttachmentService::from_env(env); - attachment_service + let storage_controller = StorageController::from_env(env); + storage_controller .tenant_migrate(tenant_shard_id, new_pageserver_id) .await?; @@ -536,8 +543,8 @@ async fn handle_tenant( let mut tenant_synthetic_size = None; - let attachment_service = AttachmentService::from_env(env); - for shard in attachment_service.tenant_locate(tenant_id).await?.shards { + let storage_controller = StorageController::from_env(env); + for shard in storage_controller.tenant_locate(tenant_id).await?.shards { let pageserver = PageServerNode::from_env(env, env.get_pageserver_conf(shard.node_id)?); @@ -575,6 +582,30 @@ async fn handle_tenant( println!("{tenant_table}"); println!("{shard_table}"); } + Some(("shard-split", matches)) => { + let tenant_id = get_tenant_id(matches, env)?; + let shard_count: u8 = matches.get_one::("shard-count").cloned().unwrap_or(0); + let shard_stripe_size: Option = matches + .get_one::>("shard-stripe-size") + .cloned() + .unwrap(); + + let storage_controller = StorageController::from_env(env); + let result = storage_controller + .tenant_split(tenant_id, shard_count, shard_stripe_size) + .await?; + println!( + "Split tenant {} into shards {}", + tenant_id, + 
result + .new_shards + .iter() + .map(|s| format!("{:?}", s)) + .collect::>() + .join(",") + ); + } + Some((sub_name, _)) => bail!("Unexpected tenant subcommand '{}'", sub_name), None => bail!("no tenant subcommand provided"), } @@ -586,7 +617,7 @@ async fn handle_timeline(timeline_match: &ArgMatches, env: &mut local_env::Local match timeline_match.subcommand() { Some(("list", list_match)) => { - // TODO(sharding): this command shouldn't have to specify a shard ID: we should ask the attachment service + // TODO(sharding): this command shouldn't have to specify a shard ID: we should ask the storage controller // where shard 0 is attached, and query there. let tenant_shard_id = get_tenant_shard_id(list_match, env)?; let timelines = pageserver.timeline_list(&tenant_shard_id).await?; @@ -606,7 +637,7 @@ async fn handle_timeline(timeline_match: &ArgMatches, env: &mut local_env::Local let new_timeline_id_opt = parse_timeline_id(create_match)?; let new_timeline_id = new_timeline_id_opt.unwrap_or(TimelineId::generate()); - let attachment_service = AttachmentService::from_env(env); + let storage_controller = StorageController::from_env(env); let create_req = TimelineCreateRequest { new_timeline_id, ancestor_timeline_id: None, @@ -614,7 +645,7 @@ async fn handle_timeline(timeline_match: &ArgMatches, env: &mut local_env::Local ancestor_start_lsn: None, pg_version: Some(pg_version), }; - let timeline_info = attachment_service + let timeline_info = storage_controller .tenant_timeline_create(tenant_id, create_req) .await?; @@ -632,6 +663,10 @@ async fn handle_timeline(timeline_match: &ArgMatches, env: &mut local_env::Local let name = import_match .get_one::("node-name") .ok_or_else(|| anyhow!("No node name provided"))?; + let update_catalog = import_match + .get_one::("update-catalog") + .cloned() + .unwrap_or_default(); // Parse base inputs let base_tarfile = import_match @@ -674,6 +709,7 @@ async fn handle_timeline(timeline_match: &ArgMatches, env: &mut local_env::Local None, 
pg_version, ComputeMode::Primary, + !update_catalog, )?; println!("Done"); } @@ -698,7 +734,7 @@ async fn handle_timeline(timeline_match: &ArgMatches, env: &mut local_env::Local .transpose() .context("Failed to parse ancestor start Lsn from the request")?; let new_timeline_id = TimelineId::generate(); - let attachment_service = AttachmentService::from_env(env); + let storage_controller = StorageController::from_env(env); let create_req = TimelineCreateRequest { new_timeline_id, ancestor_timeline_id: Some(ancestor_timeline_id), @@ -706,7 +742,7 @@ async fn handle_timeline(timeline_match: &ArgMatches, env: &mut local_env::Local ancestor_start_lsn: start_lsn, pg_version: None, }; - let timeline_info = attachment_service + let timeline_info = storage_controller .tenant_timeline_create(tenant_id, create_req) .await?; @@ -735,7 +771,7 @@ async fn handle_endpoint(ep_match: &ArgMatches, env: &local_env::LocalEnv) -> Re match sub_name { "list" => { - // TODO(sharding): this command shouldn't have to specify a shard ID: we should ask the attachment service + // TODO(sharding): this command shouldn't have to specify a shard ID: we should ask the storage controller // where shard 0 is attached, and query there. 
let tenant_shard_id = get_tenant_shard_id(sub_args, env)?; let timeline_infos = get_timeline_infos(env, &tenant_shard_id) @@ -795,7 +831,7 @@ async fn handle_endpoint(ep_match: &ArgMatches, env: &local_env::LocalEnv) -> Re &endpoint.timeline_id.to_string(), branch_name, lsn_str.as_str(), - endpoint.status(), + &format!("{}", endpoint.status()), ]); } @@ -811,6 +847,10 @@ async fn handle_endpoint(ep_match: &ArgMatches, env: &local_env::LocalEnv) -> Re .get_one::("endpoint_id") .map(String::to_string) .unwrap_or_else(|| format!("ep-{branch_name}")); + let update_catalog = sub_args + .get_one::("update-catalog") + .cloned() + .unwrap_or_default(); let lsn = sub_args .get_one::("lsn") @@ -860,6 +900,7 @@ async fn handle_endpoint(ep_match: &ArgMatches, env: &local_env::LocalEnv) -> Re http_port, pg_version, mode, + !update_catalog, )?; } "start" => { @@ -898,6 +939,11 @@ async fn handle_endpoint(ep_match: &ArgMatches, env: &local_env::LocalEnv) -> Re .get(endpoint_id.as_str()) .ok_or_else(|| anyhow::anyhow!("endpoint {endpoint_id} not found"))?; + let create_test_user = sub_args + .get_one::("create-test-user") + .cloned() + .unwrap_or_default(); + cplane.check_conflicting_endpoints( endpoint.mode, endpoint.tenant_id, @@ -910,21 +956,21 @@ async fn handle_endpoint(ep_match: &ArgMatches, env: &local_env::LocalEnv) -> Re ( vec![(parsed.0, parsed.1.unwrap_or(5432))], // If caller is telling us what pageserver to use, this is not a tenant which is - // full managed by attachment service, therefore not sharded. + // full managed by storage controller, therefore not sharded. ShardParameters::DEFAULT_STRIPE_SIZE, ) } else { // Look up the currently attached location of the tenant, and its striping metadata, // to pass these on to postgres. 
- let attachment_service = AttachmentService::from_env(env); - let locate_result = attachment_service.tenant_locate(endpoint.tenant_id).await?; + let storage_controller = StorageController::from_env(env); + let locate_result = storage_controller.tenant_locate(endpoint.tenant_id).await?; let pageservers = locate_result .shards .into_iter() .map(|shard| { ( Host::parse(&shard.listen_pg_addr) - .expect("Attachment service reported bad hostname"), + .expect("Storage controller reported bad hostname"), shard.listen_pg_port, ) }) @@ -952,6 +998,7 @@ async fn handle_endpoint(ep_match: &ArgMatches, env: &local_env::LocalEnv) -> Re pageservers, remote_ext_config, stripe_size.0 as usize, + create_test_user, ) .await?; } @@ -972,8 +1019,8 @@ async fn handle_endpoint(ep_match: &ArgMatches, env: &local_env::LocalEnv) -> Re pageserver.pg_connection_config.port(), )] } else { - let attachment_service = AttachmentService::from_env(env); - attachment_service + let storage_controller = StorageController::from_env(env); + storage_controller .tenant_locate(endpoint.tenant_id) .await? 
.shards @@ -981,25 +1028,26 @@ async fn handle_endpoint(ep_match: &ArgMatches, env: &local_env::LocalEnv) -> Re .map(|shard| { ( Host::parse(&shard.listen_pg_addr) - .expect("Attachment service reported malformed host"), + .expect("Storage controller reported malformed host"), shard.listen_pg_port, ) }) .collect::>() }; - endpoint.reconfigure(pageservers).await?; + endpoint.reconfigure(pageservers, None).await?; } "stop" => { let endpoint_id = sub_args .get_one::("endpoint_id") .ok_or_else(|| anyhow!("No endpoint ID was provided to stop"))?; let destroy = sub_args.get_flag("destroy"); + let mode = sub_args.get_one::("mode").expect("has a default"); let endpoint = cplane .endpoints .get(endpoint_id.as_str()) .with_context(|| format!("postgres endpoint {endpoint_id} is not found"))?; - endpoint.stop(destroy)?; + endpoint.stop(mode, destroy)?; } _ => bail!("Unexpected endpoint subcommand '{sub_name}'"), @@ -1094,30 +1142,13 @@ async fn handle_pageserver(sub_match: &ArgMatches, env: &local_env::LocalEnv) -> } } - Some(("migrate", subcommand_args)) => { - let pageserver = get_pageserver(env, subcommand_args)?; - //TODO what shutdown strategy should we use here? 
- if let Err(e) = pageserver.stop(false) { - eprintln!("pageserver stop failed: {}", e); - exit(1); - } - - if let Err(e) = pageserver - .start(&pageserver_config_overrides(subcommand_args)) - .await - { - eprintln!("pageserver start failed: {e}"); - exit(1); - } - } - Some(("set-state", subcommand_args)) => { let pageserver = get_pageserver(env, subcommand_args)?; let scheduling = subcommand_args.get_one("scheduling"); let availability = subcommand_args.get_one("availability"); - let attachment_service = AttachmentService::from_env(env); - attachment_service + let storage_controller = StorageController::from_env(env); + storage_controller .node_configure(NodeConfigureRequest { node_id: pageserver.conf.id, scheduling: scheduling.cloned(), @@ -1142,11 +1173,11 @@ async fn handle_pageserver(sub_match: &ArgMatches, env: &local_env::LocalEnv) -> Ok(()) } -async fn handle_attachment_service( +async fn handle_storage_controller( sub_match: &ArgMatches, env: &local_env::LocalEnv, ) -> Result<()> { - let svc = AttachmentService::from_env(env); + let svc = StorageController::from_env(env); match sub_match.subcommand() { Some(("start", _start_match)) => { if let Err(e) = svc.start().await { @@ -1161,13 +1192,13 @@ async fn handle_attachment_service( .map(|s| s.as_str()) == Some("immediate"); - if let Err(e) = svc.stop(immediate) { + if let Err(e) = svc.stop(immediate).await { eprintln!("stop failed: {}", e); exit(1); } } - Some((sub_name, _)) => bail!("Unexpected attachment_service subcommand '{}'", sub_name), - None => bail!("no attachment_service subcommand provided"), + Some((sub_name, _)) => bail!("Unexpected storage_controller subcommand '{}'", sub_name), + None => bail!("no storage_controller subcommand provided"), } Ok(()) } @@ -1252,12 +1283,12 @@ async fn handle_start_all(sub_match: &ArgMatches, env: &local_env::LocalEnv) -> broker::start_broker_process(env).await?; - // Only start the attachment service if the pageserver is configured to need it + // Only start the 
storage controller if the pageserver is configured to need it if env.control_plane_api.is_some() { - let attachment_service = AttachmentService::from_env(env); - if let Err(e) = attachment_service.start().await { - eprintln!("attachment_service start failed: {:#}", e); - try_stop_all(env, true); + let storage_controller = StorageController::from_env(env); + if let Err(e) = storage_controller.start().await { + eprintln!("storage_controller start failed: {:#}", e); + try_stop_all(env, true).await; exit(1); } } @@ -1269,7 +1300,7 @@ async fn handle_start_all(sub_match: &ArgMatches, env: &local_env::LocalEnv) -> .await { eprintln!("pageserver {} start failed: {:#}", ps_conf.id, e); - try_stop_all(env, true); + try_stop_all(env, true).await; exit(1); } } @@ -1278,28 +1309,28 @@ async fn handle_start_all(sub_match: &ArgMatches, env: &local_env::LocalEnv) -> let safekeeper = SafekeeperNode::from_env(env, node); if let Err(e) = safekeeper.start(vec![]).await { eprintln!("safekeeper {} start failed: {:#}", safekeeper.id, e); - try_stop_all(env, false); + try_stop_all(env, false).await; exit(1); } } Ok(()) } -fn handle_stop_all(sub_match: &ArgMatches, env: &local_env::LocalEnv) -> Result<()> { +async fn handle_stop_all(sub_match: &ArgMatches, env: &local_env::LocalEnv) -> Result<()> { let immediate = sub_match.get_one::("stop-mode").map(|s| s.as_str()) == Some("immediate"); - try_stop_all(env, immediate); + try_stop_all(env, immediate).await; Ok(()) } -fn try_stop_all(env: &local_env::LocalEnv, immediate: bool) { +async fn try_stop_all(env: &local_env::LocalEnv, immediate: bool) { // Stop all endpoints match ComputeControlPlane::load(env.clone()) { Ok(cplane) => { for (_k, node) in cplane.endpoints { - if let Err(e) = node.stop(false) { + if let Err(e) = node.stop(if immediate { "immediate" } else { "fast " }, false) { eprintln!("postgres stop failed: {e:#}"); } } @@ -1328,9 +1359,9 @@ fn try_stop_all(env: &local_env::LocalEnv, immediate: bool) { } if 
env.control_plane_api.is_some() { - let attachment_service = AttachmentService::from_env(env); - if let Err(e) = attachment_service.stop(immediate) { - eprintln!("attachment service stop failed: {e:#}"); + let storage_controller = StorageController::from_env(env); + if let Err(e) = storage_controller.stop(immediate).await { + eprintln!("storage controller stop failed: {e:#}"); } } } @@ -1452,6 +1483,18 @@ fn cli() -> Command { .required(false) .default_value("1"); + let update_catalog = Arg::new("update-catalog") + .value_parser(value_parser!(bool)) + .long("update-catalog") + .help("If set, will set up the catalog for neon_superuser") + .required(false); + + let create_test_user = Arg::new("create-test-user") + .value_parser(value_parser!(bool)) + .long("create-test-user") + .help("If set, will create test user `user` and `neondb` database. Requires `update-catalog = true`") + .required(false); + Command::new("Neon CLI") .arg_required_else_help(true) .version(GIT_VERSION) @@ -1512,6 +1555,7 @@ fn cli() -> Command { .arg(Arg::new("end-lsn").long("end-lsn") .help("Lsn the basebackup ends at")) .arg(pg_version_arg.clone()) + .arg(update_catalog.clone()) ) ).subcommand( Command::new("tenant") @@ -1527,6 +1571,7 @@ fn cli() -> Command { .help("Use this tenant in future CLI commands where tenant_id is needed, but not specified")) .arg(Arg::new("shard-count").value_parser(value_parser!(u8)).long("shard-count").action(ArgAction::Set).help("Number of shards in the new tenant (default 1)")) .arg(Arg::new("shard-stripe-size").value_parser(value_parser!(u32)).long("shard-stripe-size").action(ArgAction::Set).help("Sharding stripe size in pages")) + .arg(Arg::new("placement-policy").value_parser(value_parser!(String)).long("placement-policy").action(ArgAction::Set).help("Placement policy shards in this tenant")) ) .subcommand(Command::new("set-default").arg(tenant_id_arg.clone().required(true)) .about("Set a particular tenant as default in future CLI commands where tenant_id is 
needed, but not specified")) @@ -1540,6 +1585,12 @@ fn cli() -> Command { .subcommand(Command::new("status") .about("Human readable summary of the tenant's shards and attachment locations") .arg(tenant_id_arg.clone())) + .subcommand(Command::new("shard-split") + .about("Increase the number of shards in the tenant") + .arg(tenant_id_arg.clone()) + .arg(Arg::new("shard-count").value_parser(value_parser!(u8)).long("shard-count").action(ArgAction::Set).help("Number of shards in the new tenant (default 1)")) + .arg(Arg::new("shard-stripe-size").value_parser(value_parser!(u32)).long("shard-stripe-size").action(ArgAction::Set).help("Sharding stripe size in pages")) + ) ) .subcommand( Command::new("pageserver") @@ -1567,9 +1618,9 @@ fn cli() -> Command { ) ) .subcommand( - Command::new("attachment_service") + Command::new("storage_controller") .arg_required_else_help(true) - .about("Manage attachment_service") + .about("Manage storage_controller") .subcommand(Command::new("start").about("Start local pageserver").arg(pageserver_config_args.clone())) .subcommand(Command::new("stop").about("Stop local pageserver") .arg(stop_mode_arg.clone())) @@ -1616,6 +1667,7 @@ fn cli() -> Command { .required(false)) .arg(pg_version_arg.clone()) .arg(hot_standby_arg.clone()) + .arg(update_catalog) ) .subcommand(Command::new("start") .about("Start postgres.\n If the endpoint doesn't exist yet, it is created.") @@ -1623,6 +1675,7 @@ fn cli() -> Command { .arg(endpoint_pageserver_id_arg.clone()) .arg(safekeepers_arg) .arg(remote_ext_config_args) + .arg(create_test_user) ) .subcommand(Command::new("reconfigure") .about("Reconfigure the endpoint") @@ -1639,7 +1692,16 @@ fn cli() -> Command { .long("destroy") .action(ArgAction::SetTrue) .required(false) - ) + ) + .arg( + Arg::new("mode") + .help("Postgres shutdown mode, passed to \"pg_ctl -m \"") + .long("mode") + .action(ArgAction::Set) + .required(false) + .value_parser(["smart", "fast", "immediate"]) + .default_value("fast") + ) ) ) diff 
--git a/control_plane/src/endpoint.rs b/control_plane/src/endpoint.rs index d3b0366d31..5206222961 100644 --- a/control_plane/src/endpoint.rs +++ b/control_plane/src/endpoint.rs @@ -41,20 +41,25 @@ use std::net::SocketAddr; use std::net::TcpStream; use std::path::PathBuf; use std::process::Command; +use std::str::FromStr; use std::sync::Arc; use std::time::Duration; use anyhow::{anyhow, bail, Context, Result}; +use compute_api::spec::Database; +use compute_api::spec::PgIdent; use compute_api::spec::RemoteExtSpec; +use compute_api::spec::Role; use nix::sys::signal::kill; use nix::sys::signal::Signal; +use pageserver_api::shard::ShardStripeSize; use serde::{Deserialize, Serialize}; use url::Host; use utils::id::{NodeId, TenantId, TimelineId}; -use crate::attachment_service::AttachmentService; use crate::local_env::LocalEnv; use crate::postgresql_conf::PostgresConf; +use crate::storage_controller::StorageController; use compute_api::responses::{ComputeState, ComputeStatus}; use compute_api::spec::{Cluster, ComputeFeature, ComputeMode, ComputeSpec}; @@ -122,6 +127,7 @@ impl ComputeControlPlane { http_port: Option, pg_version: u32, mode: ComputeMode, + skip_pg_catalog_updates: bool, ) -> Result> { let pg_port = pg_port.unwrap_or_else(|| self.get_port()); let http_port = http_port.unwrap_or_else(|| self.get_port() + 1); @@ -140,7 +146,7 @@ impl ComputeControlPlane { // before and after start are the same. So, skip catalog updates, // with this we basically test a case of waking up an idle compute, where // we also skip catalog updates in the cloud. 
- skip_pg_catalog_updates: true, + skip_pg_catalog_updates, features: vec![], }); @@ -155,7 +161,7 @@ impl ComputeControlPlane { http_port, pg_port, pg_version, - skip_pg_catalog_updates: true, + skip_pg_catalog_updates, features: vec![], })?, )?; @@ -184,7 +190,7 @@ impl ComputeControlPlane { v.tenant_id == tenant_id && v.timeline_id == timeline_id && v.mode == mode - && v.status() != "stopped" + && v.status() != EndpointStatus::Stopped }); if let Some((key, _)) = duplicates.next() { @@ -223,6 +229,26 @@ pub struct Endpoint { features: Vec, } +#[derive(PartialEq, Eq)] +pub enum EndpointStatus { + Running, + Stopped, + Crashed, + RunningNoPidfile, +} + +impl std::fmt::Display for EndpointStatus { + fn fmt(&self, writer: &mut std::fmt::Formatter) -> std::fmt::Result { + let s = match self { + Self::Running => "running", + Self::Stopped => "stopped", + Self::Crashed => "crashed", + Self::RunningNoPidfile => "running, no pidfile", + }; + write!(writer, "{}", s) + } +} + impl Endpoint { fn from_dir_entry(entry: std::fs::DirEntry, env: &LocalEnv) -> Result { if !entry.file_type()?.is_dir() { @@ -380,16 +406,16 @@ impl Endpoint { self.endpoint_path().join("pgdata") } - pub fn status(&self) -> &str { + pub fn status(&self) -> EndpointStatus { let timeout = Duration::from_millis(300); let has_pidfile = self.pgdata().join("postmaster.pid").exists(); let can_connect = TcpStream::connect_timeout(&self.pg_address, timeout).is_ok(); match (has_pidfile, can_connect) { - (true, true) => "running", - (false, false) => "stopped", - (true, false) => "crashed", - (false, true) => "running, no pidfile", + (true, true) => EndpointStatus::Running, + (false, false) => EndpointStatus::Stopped, + (true, false) => EndpointStatus::Crashed, + (false, true) => EndpointStatus::RunningNoPidfile, } } @@ -438,7 +464,7 @@ impl Endpoint { } fn wait_for_compute_ctl_to_exit(&self, send_sigterm: bool) -> Result<()> { - // TODO use background_process::stop_process instead + // TODO use 
background_process::stop_process instead: https://github.com/neondatabase/neon/pull/6482 let pidfile_path = self.endpoint_path().join("compute_ctl.pid"); let pid: u32 = std::fs::read_to_string(pidfile_path)?.parse()?; let pid = nix::unistd::Pid::from_raw(pid as i32); @@ -480,8 +506,9 @@ impl Endpoint { pageservers: Vec<(Host, u16)>, remote_ext_config: Option<&String>, shard_stripe_size: usize, + create_test_user: bool, ) -> Result<()> { - if self.status() == "running" { + if self.status() == EndpointStatus::Running { anyhow::bail!("The endpoint is already running"); } @@ -531,8 +558,26 @@ impl Endpoint { cluster_id: None, // project ID: not used name: None, // project name: not used state: None, - roles: vec![], - databases: vec![], + roles: if create_test_user { + vec![Role { + name: PgIdent::from_str("test").unwrap(), + encrypted_password: None, + options: None, + }] + } else { + Vec::new() + }, + databases: if create_test_user { + vec![Database { + name: PgIdent::from_str("neondb").unwrap(), + owner: PgIdent::from_str("test").unwrap(), + options: None, + restrict_conn: false, + invalid: false, + }] + } else { + Vec::new() + }, settings: None, postgresql_conf: Some(postgresql_conf), }, @@ -546,6 +591,7 @@ impl Endpoint { remote_extensions, pgbouncer_settings: None, shard_stripe_size: Some(shard_stripe_size), + primary_is_running: None, }; let spec_path = self.endpoint_path().join("spec.json"); std::fs::write(spec_path, serde_json::to_string_pretty(&spec)?)?; @@ -557,11 +603,16 @@ impl Endpoint { .open(self.endpoint_path().join("compute.log"))?; // Launch compute_ctl - println!("Starting postgres node at '{}'", self.connstr()); + let conn_str = self.connstr("cloud_admin", "postgres"); + println!("Starting postgres node at '{}'", conn_str); + if create_test_user { + let conn_str = self.connstr("test", "neondb"); + println!("Also at '{}'", conn_str); + } let mut cmd = Command::new(self.env.neon_distrib_dir.join("compute_ctl")); cmd.args(["--http-port", 
&self.http_address.port().to_string()]) .args(["--pgdata", self.pgdata().to_str().unwrap()]) - .args(["--connstr", &self.connstr()]) + .args(["--connstr", &conn_str]) .args([ "--spec-path", self.endpoint_path().join("spec.json").to_str().unwrap(), @@ -583,9 +634,21 @@ impl Endpoint { } let child = cmd.spawn()?; + // set up a scopeguard to kill & wait for the child in case we panic or bail below + let child = scopeguard::guard(child, |mut child| { + println!("SIGKILL & wait the started process"); + (|| { + // TODO: use another signal that can be caught by the child so it can clean up any children it spawned + child.kill().context("SIGKILL child")?; + child.wait().context("wait() for child process")?; + anyhow::Ok(()) + })() + .with_context(|| format!("scopeguard kill&wait child {child:?}")) + .unwrap(); + }); // Write down the pid so we can wait for it when we want to stop - // TODO use background_process::start_process instead + // TODO use background_process::start_process instead: https://github.com/neondatabase/neon/pull/6482 let pid = child.id(); let pidfile_path = self.endpoint_path().join("compute_ctl.pid"); std::fs::write(pidfile_path, pid.to_string())?; @@ -593,7 +656,7 @@ impl Endpoint { // Wait for it to start let mut attempt = 0; const ATTEMPT_INTERVAL: Duration = Duration::from_millis(100); - const MAX_ATTEMPTS: u32 = 10 * 30; // Wait up to 30 s + const MAX_ATTEMPTS: u32 = 10 * 90; // Wait up to 1.5 min loop { attempt += 1; match self.get_status().await { @@ -620,7 +683,9 @@ impl Endpoint { } ComputeStatus::Empty | ComputeStatus::ConfigurationPending - | ComputeStatus::Configuration => { + | ComputeStatus::Configuration + | ComputeStatus::TerminationPending + | ComputeStatus::Terminated => { bail!("unexpected compute status: {:?}", state.status) } } @@ -634,6 +699,9 @@ impl Endpoint { std::thread::sleep(ATTEMPT_INTERVAL); } + // disarm the scopeguard, let the child outlive this function (and neon_local invoction) + 
drop(scopeguard::ScopeGuard::into_inner(child)); + Ok(()) } @@ -668,7 +736,11 @@ impl Endpoint { } } - pub async fn reconfigure(&self, mut pageservers: Vec<(Host, u16)>) -> Result<()> { + pub async fn reconfigure( + &self, + mut pageservers: Vec<(Host, u16)>, + stripe_size: Option, + ) -> Result<()> { let mut spec: ComputeSpec = { let spec_path = self.endpoint_path().join("spec.json"); let file = std::fs::File::open(spec_path)?; @@ -678,17 +750,17 @@ impl Endpoint { let postgresql_conf = self.read_postgresql_conf()?; spec.cluster.postgresql_conf = Some(postgresql_conf); - // If we weren't given explicit pageservers, query the attachment service + // If we weren't given explicit pageservers, query the storage controller if pageservers.is_empty() { - let attachment_service = AttachmentService::from_env(&self.env); - let locate_result = attachment_service.tenant_locate(self.tenant_id).await?; + let storage_controller = StorageController::from_env(&self.env); + let locate_result = storage_controller.tenant_locate(self.tenant_id).await?; pageservers = locate_result .shards .into_iter() .map(|shard| { ( Host::parse(&shard.listen_pg_addr) - .expect("Attachment service reported bad hostname"), + .expect("Storage controller reported bad hostname"), shard.listen_pg_port, ) }) @@ -698,8 +770,14 @@ impl Endpoint { let pageserver_connstr = Self::build_pageserver_connstr(&pageservers); assert!(!pageserver_connstr.is_empty()); spec.pageserver_connstring = Some(pageserver_connstr); + if stripe_size.is_some() { + spec.shard_stripe_size = stripe_size.map(|s| s.0 as usize); + } - let client = reqwest::Client::new(); + let client = reqwest::Client::builder() + .timeout(Duration::from_secs(30)) + .build() + .unwrap(); let response = client .post(format!( "http://{}:{}/configure", @@ -726,22 +804,8 @@ impl Endpoint { } } - pub fn stop(&self, destroy: bool) -> Result<()> { - // If we are going to destroy data directory, - // use immediate shutdown mode, otherwise, - // shutdown 
gracefully to leave the data directory sane. - // - // Postgres is always started from scratch, so stop - // without destroy only used for testing and debugging. - // - self.pg_ctl( - if destroy { - &["-m", "immediate", "stop"] - } else { - &["stop"] - }, - &None, - )?; + pub fn stop(&self, mode: &str, destroy: bool) -> Result<()> { + self.pg_ctl(&["-m", mode, "stop"], &None)?; // Also wait for the compute_ctl process to die. It might have some // cleanup work to do after postgres stops, like syncing safekeepers, @@ -762,13 +826,13 @@ impl Endpoint { Ok(()) } - pub fn connstr(&self) -> String { + pub fn connstr(&self, user: &str, db_name: &str) -> String { format!( "postgresql://{}@{}:{}/{}", - "cloud_admin", + user, self.pg_address.ip(), self.pg_address.port(), - "postgres" + db_name ) } } diff --git a/control_plane/src/lib.rs b/control_plane/src/lib.rs index bb79d36bfc..2af272f388 100644 --- a/control_plane/src/lib.rs +++ b/control_plane/src/lib.rs @@ -6,7 +6,6 @@ //! local installations. #![deny(clippy::undocumented_unsafe_blocks)] -pub mod attachment_service; mod background_process; pub mod broker; pub mod endpoint; @@ -14,3 +13,4 @@ pub mod local_env; pub mod pageserver; pub mod postgresql_conf; pub mod safekeeper; +pub mod storage_controller; diff --git a/control_plane/src/local_env.rs b/control_plane/src/local_env.rs index 4460fdd3a6..2e64489432 100644 --- a/control_plane/src/local_env.rs +++ b/control_plane/src/local_env.rs @@ -72,11 +72,16 @@ pub struct LocalEnv { #[serde(default)] pub safekeepers: Vec, - // Control plane location: if None, we will not run attachment_service. If set, this will + // Control plane upcall API for pageserver: if None, we will not run storage_controller If set, this will // be propagated into each pageserver's configuration. #[serde(default)] pub control_plane_api: Option, + // Control plane upcall API for storage controller. If set, this will be propagated into the + // storage controller's configuration. 
+ #[serde(default)] + pub control_plane_compute_hook_api: Option, + /// Keep human-readable aliases in memory (and persist them to config), to hide ZId hex strings from the user. #[serde(default)] // A `HashMap>` would be more appropriate here, @@ -222,8 +227,12 @@ impl LocalEnv { self.neon_distrib_dir.join("pageserver") } - pub fn attachment_service_bin(&self) -> PathBuf { - self.neon_distrib_dir.join("attachment_service") + pub fn storage_controller_bin(&self) -> PathBuf { + // Irrespective of configuration, storage controller binary is always + // run from the same location as neon_local. This means that for compatibility + // tests that run old pageserver/safekeeper, they still run latest storage controller. + let neon_local_bin_dir = env::current_exe().unwrap().parent().unwrap().to_owned(); + neon_local_bin_dir.join("storage_controller") } pub fn safekeeper_bin(&self) -> PathBuf { @@ -403,14 +412,17 @@ impl LocalEnv { // this function is used only for testing purposes in CLI e g generate tokens during init pub fn generate_auth_token(&self, claims: &Claims) -> anyhow::Result { - let private_key_path = if self.private_key_path.is_absolute() { + let private_key_path = self.get_private_key_path(); + let key_data = fs::read(private_key_path)?; + encode_from_key_file(claims, &key_data) + } + + pub fn get_private_key_path(&self) -> PathBuf { + if self.private_key_path.is_absolute() { self.private_key_path.to_path_buf() } else { self.base_data_dir.join(&self.private_key_path) - }; - - let key_data = fs::read(private_key_path)?; - encode_from_key_file(claims, &key_data) + } } // diff --git a/control_plane/src/pageserver.rs b/control_plane/src/pageserver.rs index 1db21c9a37..06ec942895 100644 --- a/control_plane/src/pageserver.rs +++ b/control_plane/src/pageserver.rs @@ -109,12 +109,12 @@ impl PageServerNode { control_plane_api.as_str() )); - // Attachment service uses the same auth as pageserver: if JWT is enabled + // Storage controller uses the same auth as 
pageserver: if JWT is enabled // for us, we will also need it to talk to them. if matches!(self.conf.http_auth_type, AuthType::NeonJWT) { let jwt_token = self .env - .generate_auth_token(&Claims::new(None, Scope::PageServerApi)) + .generate_auth_token(&Claims::new(None, Scope::GenerationsApi)) .unwrap(); overrides.push(format!("control_plane_api_token='{}'", jwt_token)); } @@ -200,6 +200,28 @@ impl PageServerNode { String::from_utf8_lossy(&init_output.stderr), ); + // Write metadata file, used by pageserver on startup to register itself with + // the storage controller + let metadata_path = datadir.join("metadata.json"); + + let (_http_host, http_port) = + parse_host_port(&self.conf.listen_http_addr).expect("Unable to parse listen_http_addr"); + let http_port = http_port.unwrap_or(9898); + // Intentionally hand-craft JSON: this acts as an implicit format compat test + // in case the pageserver-side structure is edited, and reflects the real life + // situation: the metadata is written by some other script. 
+ std::fs::write( + metadata_path, + serde_json::to_vec(&serde_json::json!({ + "host": "localhost", + "port": self.pg_connection_config.port(), + "http_host": "localhost", + "http_port": http_port, + })) + .unwrap(), + ) + .expect("Failed to write metadata file"); + Ok(()) } @@ -244,7 +266,9 @@ impl PageServerNode { } }, ) - .await + .await?; + + Ok(()) } fn pageserver_basic_args<'a>( @@ -329,6 +353,11 @@ impl PageServerNode { .remove("compaction_threshold") .map(|x| x.parse::()) .transpose()?, + compaction_algorithm: settings + .remove("compaction_algorithm") + .map(serde_json::from_str) + .transpose() + .context("Failed to parse 'compaction_algorithm' json")?, gc_horizon: settings .remove("gc_horizon") .map(|x| x.parse::()) @@ -368,12 +397,17 @@ impl PageServerNode { evictions_low_residence_duration_metric_threshold: settings .remove("evictions_low_residence_duration_metric_threshold") .map(|x| x.to_string()), - gc_feedback: settings - .remove("gc_feedback") + heatmap_period: settings.remove("heatmap_period").map(|x| x.to_string()), + lazy_slru_download: settings + .remove("lazy_slru_download") .map(|x| x.parse::()) .transpose() - .context("Failed to parse 'gc_feedback' as bool")?, - heatmap_period: settings.remove("heatmap_period").map(|x| x.to_string()), + .context("Failed to parse 'lazy_slru_download' as bool")?, + timeline_get_throttle: settings + .remove("timeline_get_throttle") + .map(serde_json::from_str) + .transpose() + .context("parse `timeline_get_throttle` from json")?, }; if !settings.is_empty() { bail!("Unrecognized tenant settings: {settings:?}") @@ -395,6 +429,8 @@ impl PageServerNode { generation, config, shard_parameters: ShardParameters::default(), + // Placement policy is not meaningful for creations not done via storage controller + placement_policy: None, }; if !settings.is_empty() { bail!("Unrecognized tenant settings: {settings:?}") @@ -427,6 +463,11 @@ impl PageServerNode { .map(|x| x.parse::()) .transpose() .context("Failed to parse 
'compaction_threshold' as an integer")?, + compaction_algorithm: settings + .remove("compactin_algorithm") + .map(serde_json::from_str) + .transpose() + .context("Failed to parse 'compaction_algorithm' json")?, gc_horizon: settings .remove("gc_horizon") .map(|x| x.parse::()) @@ -468,12 +509,17 @@ impl PageServerNode { evictions_low_residence_duration_metric_threshold: settings .remove("evictions_low_residence_duration_metric_threshold") .map(|x| x.to_string()), - gc_feedback: settings - .remove("gc_feedback") + heatmap_period: settings.remove("heatmap_period").map(|x| x.to_string()), + lazy_slru_download: settings + .remove("lazy_slru_download") .map(|x| x.parse::()) .transpose() - .context("Failed to parse 'gc_feedback' as bool")?, - heatmap_period: settings.remove("heatmap_period").map(|x| x.to_string()), + .context("Failed to parse 'lazy_slru_download' as bool")?, + timeline_get_throttle: settings + .remove("timeline_get_throttle") + .map(serde_json::from_str) + .transpose() + .context("parse `timeline_get_throttle` from json")?, } }; @@ -493,10 +539,11 @@ impl PageServerNode { tenant_shard_id: TenantShardId, config: LocationConfig, flush_ms: Option, + lazy: bool, ) -> anyhow::Result<()> { Ok(self .http_client - .location_config(tenant_shard_id, config, flush_ms) + .location_config(tenant_shard_id, config, flush_ms, lazy) .await?) 
} @@ -561,7 +608,7 @@ impl PageServerNode { eprintln!("connection error: {}", e); } }); - tokio::pin!(client); + let client = std::pin::pin!(client); // Init base reader let (start_lsn, base_tarfile_path) = base; diff --git a/control_plane/src/storage_controller.rs b/control_plane/src/storage_controller.rs new file mode 100644 index 0000000000..d7673f1b26 --- /dev/null +++ b/control_plane/src/storage_controller.rs @@ -0,0 +1,547 @@ +use crate::{background_process, local_env::LocalEnv}; +use camino::{Utf8Path, Utf8PathBuf}; +use hyper::Method; +use pageserver_api::{ + controller_api::{ + NodeConfigureRequest, NodeRegisterRequest, TenantCreateResponse, TenantLocateResponse, + TenantShardMigrateRequest, TenantShardMigrateResponse, + }, + models::{ + TenantCreateRequest, TenantShardSplitRequest, TenantShardSplitResponse, + TimelineCreateRequest, TimelineInfo, + }, + shard::{ShardStripeSize, TenantShardId}, +}; +use pageserver_client::mgmt_api::ResponseErrorMessageExt; +use postgres_backend::AuthType; +use serde::{de::DeserializeOwned, Deserialize, Serialize}; +use std::{fs, str::FromStr}; +use tokio::process::Command; +use tracing::instrument; +use url::Url; +use utils::{ + auth::{encode_from_key_file, Claims, Scope}, + id::{NodeId, TenantId}, +}; + +pub struct StorageController { + env: LocalEnv, + listen: String, + path: Utf8PathBuf, + private_key: Option>, + public_key: Option, + postgres_port: u16, + client: reqwest::Client, +} + +const COMMAND: &str = "storage_controller"; + +const STORAGE_CONTROLLER_POSTGRES_VERSION: u32 = 16; + +#[derive(Serialize, Deserialize)] +pub struct AttachHookRequest { + pub tenant_shard_id: TenantShardId, + pub node_id: Option, +} + +#[derive(Serialize, Deserialize)] +pub struct AttachHookResponse { + pub gen: Option, +} + +#[derive(Serialize, Deserialize)] +pub struct InspectRequest { + pub tenant_shard_id: TenantShardId, +} + +#[derive(Serialize, Deserialize)] +pub struct InspectResponse { + pub attachment: Option<(u32, NodeId)>, +} + 
+impl StorageController { + pub fn from_env(env: &LocalEnv) -> Self { + let path = Utf8PathBuf::from_path_buf(env.base_data_dir.clone()) + .unwrap() + .join("attachments.json"); + + // Makes no sense to construct this if pageservers aren't going to use it: assume + // pageservers have control plane API set + let listen_url = env.control_plane_api.clone().unwrap(); + + let listen = format!( + "{}:{}", + listen_url.host_str().unwrap(), + listen_url.port().unwrap() + ); + + // Convention: NeonEnv in python tests reserves the next port after the control_plane_api + // port, for use by our captive postgres. + let postgres_port = listen_url + .port() + .expect("Control plane API setting should always have a port") + + 1; + + // Assume all pageservers have symmetric auth configuration: this service + // expects to use one JWT token to talk to all of them. + let ps_conf = env + .pageservers + .first() + .expect("Config is validated to contain at least one pageserver"); + let (private_key, public_key) = match ps_conf.http_auth_type { + AuthType::Trust => (None, None), + AuthType::NeonJWT => { + let private_key_path = env.get_private_key_path(); + let private_key = fs::read(private_key_path).expect("failed to read private key"); + + // If pageserver auth is enabled, this implicitly enables auth for this service, + // using the same credentials. + let public_key_path = + camino::Utf8PathBuf::try_from(env.base_data_dir.join("auth_public_key.pem")) + .unwrap(); + + // This service takes keys as a string rather than as a path to a file/dir: read the key into memory. + let public_key = if std::fs::metadata(&public_key_path) + .expect("Can't stat public key") + .is_dir() + { + // Our config may specify a directory: this is for the pageserver's ability to handle multiple + // keys. We only use one key at a time, so, arbitrarily load the first one in the directory. 
+ let mut dir = + std::fs::read_dir(&public_key_path).expect("Can't readdir public key path"); + let dent = dir + .next() + .expect("Empty key dir") + .expect("Error reading key dir"); + + std::fs::read_to_string(dent.path()).expect("Can't read public key") + } else { + std::fs::read_to_string(&public_key_path).expect("Can't read public key") + }; + (Some(private_key), Some(public_key)) + } + }; + + Self { + env: env.clone(), + path, + listen, + private_key, + public_key, + postgres_port, + client: reqwest::ClientBuilder::new() + .build() + .expect("Failed to construct http client"), + } + } + + fn pid_file(&self) -> Utf8PathBuf { + Utf8PathBuf::from_path_buf(self.env.base_data_dir.join("storage_controller.pid")) + .expect("non-Unicode path") + } + + /// PIDFile for the postgres instance used to store storage controller state + fn postgres_pid_file(&self) -> Utf8PathBuf { + Utf8PathBuf::from_path_buf( + self.env + .base_data_dir + .join("storage_controller_postgres.pid"), + ) + .expect("non-Unicode path") + } + + /// Find the directory containing postgres binaries, such as `initdb` and `pg_ctl` + /// + /// This usually uses STORAGE_CONTROLLER_POSTGRES_VERSION of postgres, but will fall back + /// to other versions if that one isn't found. Some automated tests create circumstances + /// where only one version is available in pg_distrib_dir, such as `test_remote_extensions`. + pub async fn get_pg_bin_dir(&self) -> anyhow::Result { + let prefer_versions = [STORAGE_CONTROLLER_POSTGRES_VERSION, 15, 14]; + + for v in prefer_versions { + let path = Utf8PathBuf::from_path_buf(self.env.pg_bin_dir(v)?).unwrap(); + if tokio::fs::try_exists(&path).await? 
{ + return Ok(path); + } + } + + // Fall through + anyhow::bail!( + "Postgres binaries not found in {}", + self.env.pg_distrib_dir.display() + ); + } + + /// Readiness check for our postgres process + async fn pg_isready(&self, pg_bin_dir: &Utf8Path) -> anyhow::Result { + let bin_path = pg_bin_dir.join("pg_isready"); + let args = ["-h", "localhost", "-p", &format!("{}", self.postgres_port)]; + let exitcode = Command::new(bin_path).args(args).spawn()?.wait().await?; + + Ok(exitcode.success()) + } + + /// Create our database if it doesn't exist, and run migrations. + /// + /// This function is equivalent to the `diesel setup` command in the diesel CLI. We implement + /// the same steps by hand to avoid imposing a dependency on installing diesel-cli for developers + /// who just want to run `cargo neon_local` without knowing about diesel. + /// + /// Returns the database url + pub async fn setup_database(&self) -> anyhow::Result { + const DB_NAME: &str = "storage_controller"; + let database_url = format!("postgresql://localhost:{}/{DB_NAME}", self.postgres_port); + + let pg_bin_dir = self.get_pg_bin_dir().await?; + let createdb_path = pg_bin_dir.join("createdb"); + let output = Command::new(&createdb_path) + .args([ + "-h", + "localhost", + "-p", + &format!("{}", self.postgres_port), + DB_NAME, + ]) + .output() + .await + .expect("Failed to spawn createdb"); + + if !output.status.success() { + let stderr = String::from_utf8(output.stderr).expect("Non-UTF8 output from createdb"); + if stderr.contains("already exists") { + tracing::info!("Database {DB_NAME} already exists"); + } else { + anyhow::bail!("createdb failed with status {}: {stderr}", output.status); + } + } + + Ok(database_url) + } + + pub async fn start(&self) -> anyhow::Result<()> { + // Start a vanilla Postgres process used by the storage controller for persistence. 
+ let pg_data_path = Utf8PathBuf::from_path_buf(self.env.base_data_dir.clone()) + .unwrap() + .join("storage_controller_db"); + let pg_bin_dir = self.get_pg_bin_dir().await?; + let pg_log_path = pg_data_path.join("postgres.log"); + + if !tokio::fs::try_exists(&pg_data_path).await? { + // Initialize empty database + let initdb_path = pg_bin_dir.join("initdb"); + let mut child = Command::new(&initdb_path) + .args(["-D", pg_data_path.as_ref()]) + .spawn() + .expect("Failed to spawn initdb"); + let status = child.wait().await?; + if !status.success() { + anyhow::bail!("initdb failed with status {status}"); + } + + tokio::fs::write( + &pg_data_path.join("postgresql.conf"), + format!("port = {}", self.postgres_port), + ) + .await?; + }; + + println!("Starting storage controller database..."); + let db_start_args = [ + "-w", + "-D", + pg_data_path.as_ref(), + "-l", + pg_log_path.as_ref(), + "start", + ]; + + background_process::start_process( + "storage_controller_db", + &self.env.base_data_dir, + pg_bin_dir.join("pg_ctl").as_std_path(), + db_start_args, + [], + background_process::InitialPidFile::Create(self.postgres_pid_file()), + || self.pg_isready(&pg_bin_dir), + ) + .await?; + + // Run migrations on every startup, in case something changed. 
+ let database_url = self.setup_database().await?; + + let mut args = vec![ + "-l", + &self.listen, + "-p", + self.path.as_ref(), + "--database-url", + &database_url, + ] + .into_iter() + .map(|s| s.to_string()) + .collect::>(); + if let Some(private_key) = &self.private_key { + let claims = Claims::new(None, Scope::PageServerApi); + let jwt_token = + encode_from_key_file(&claims, private_key).expect("failed to generate jwt token"); + args.push(format!("--jwt-token={jwt_token}")); + } + + if let Some(public_key) = &self.public_key { + args.push(format!("--public-key=\"{public_key}\"")); + } + + if let Some(control_plane_compute_hook_api) = &self.env.control_plane_compute_hook_api { + args.push(format!( + "--compute-hook-url={control_plane_compute_hook_api}" + )); + } + + background_process::start_process( + COMMAND, + &self.env.base_data_dir, + &self.env.storage_controller_bin(), + args, + [( + "NEON_REPO_DIR".to_string(), + self.env.base_data_dir.to_string_lossy().to_string(), + )], + background_process::InitialPidFile::Create(self.pid_file()), + || async { + match self.ready().await { + Ok(_) => Ok(true), + Err(_) => Ok(false), + } + }, + ) + .await?; + + Ok(()) + } + + pub async fn stop(&self, immediate: bool) -> anyhow::Result<()> { + background_process::stop_process(immediate, COMMAND, &self.pid_file())?; + + let pg_data_path = self.env.base_data_dir.join("storage_controller_db"); + let pg_bin_dir = self.get_pg_bin_dir().await?; + + println!("Stopping storage controller database..."); + let pg_stop_args = ["-D", &pg_data_path.to_string_lossy(), "stop"]; + let stop_status = Command::new(pg_bin_dir.join("pg_ctl")) + .args(pg_stop_args) + .spawn()? + .wait() + .await?; + if !stop_status.success() { + let pg_status_args = ["-D", &pg_data_path.to_string_lossy(), "status"]; + let status_exitcode = Command::new(pg_bin_dir.join("pg_ctl")) + .args(pg_status_args) + .spawn()? 
+ .wait() + .await?; + + // pg_ctl status returns this exit code if postgres is not running: in this case it is + // fine that stop failed. Otherwise it is an error that stop failed. + const PG_STATUS_NOT_RUNNING: i32 = 3; + if Some(PG_STATUS_NOT_RUNNING) == status_exitcode.code() { + println!("Storage controller database is already stopped"); + return Ok(()); + } else { + anyhow::bail!("Failed to stop storage controller database: {stop_status}") + } + } + + Ok(()) + } + + fn get_claims_for_path(path: &str) -> anyhow::Result> { + let category = match path.find('/') { + Some(idx) => &path[..idx], + None => path, + }; + + match category { + "status" | "ready" => Ok(None), + "control" | "debug" => Ok(Some(Claims::new(None, Scope::Admin))), + "v1" => Ok(Some(Claims::new(None, Scope::PageServerApi))), + _ => Err(anyhow::anyhow!("Failed to determine claims for {}", path)), + } + } + + /// Simple HTTP request wrapper for calling into storage controller + async fn dispatch( + &self, + method: hyper::Method, + path: String, + body: Option, + ) -> anyhow::Result + where + RQ: Serialize + Sized, + RS: DeserializeOwned + Sized, + { + // The configured URL has the /upcall path prefix for pageservers to use: we will strip that out + // for general purpose API access. + let listen_url = self.env.control_plane_api.clone().unwrap(); + let url = Url::from_str(&format!( + "http://{}:{}/{path}", + listen_url.host_str().unwrap(), + listen_url.port().unwrap() + )) + .unwrap(); + + let mut builder = self.client.request(method, url); + if let Some(body) = body { + builder = builder.json(&body) + } + if let Some(private_key) = &self.private_key { + println!("Getting claims for path {}", path); + if let Some(required_claims) = Self::get_claims_for_path(&path)? 
{ + println!("Got claims {:?} for path {}", required_claims, path); + let jwt_token = encode_from_key_file(&required_claims, private_key)?; + builder = builder.header( + reqwest::header::AUTHORIZATION, + format!("Bearer {jwt_token}"), + ); + } + } + + let response = builder.send().await?; + let response = response.error_from_body().await?; + + Ok(response + .json() + .await + .map_err(pageserver_client::mgmt_api::Error::ReceiveBody)?) + } + + /// Call into the attach_hook API, for use before handing out attachments to pageservers + #[instrument(skip(self))] + pub async fn attach_hook( + &self, + tenant_shard_id: TenantShardId, + pageserver_id: NodeId, + ) -> anyhow::Result> { + let request = AttachHookRequest { + tenant_shard_id, + node_id: Some(pageserver_id), + }; + + let response = self + .dispatch::<_, AttachHookResponse>( + Method::POST, + "debug/v1/attach-hook".to_string(), + Some(request), + ) + .await?; + + Ok(response.gen) + } + + #[instrument(skip(self))] + pub async fn inspect( + &self, + tenant_shard_id: TenantShardId, + ) -> anyhow::Result> { + let request = InspectRequest { tenant_shard_id }; + + let response = self + .dispatch::<_, InspectResponse>( + Method::POST, + "debug/v1/inspect".to_string(), + Some(request), + ) + .await?; + + Ok(response.attachment) + } + + #[instrument(skip(self))] + pub async fn tenant_create( + &self, + req: TenantCreateRequest, + ) -> anyhow::Result { + self.dispatch(Method::POST, "v1/tenant".to_string(), Some(req)) + .await + } + + #[instrument(skip(self))] + pub async fn tenant_locate(&self, tenant_id: TenantId) -> anyhow::Result { + self.dispatch::<(), _>( + Method::GET, + format!("control/v1/tenant/{tenant_id}/locate"), + None, + ) + .await + } + + #[instrument(skip(self))] + pub async fn tenant_migrate( + &self, + tenant_shard_id: TenantShardId, + node_id: NodeId, + ) -> anyhow::Result { + self.dispatch( + Method::PUT, + format!("control/v1/tenant/{tenant_shard_id}/migrate"), + Some(TenantShardMigrateRequest { + 
tenant_shard_id, + node_id, + }), + ) + .await + } + + #[instrument(skip(self), fields(%tenant_id, %new_shard_count))] + pub async fn tenant_split( + &self, + tenant_id: TenantId, + new_shard_count: u8, + new_stripe_size: Option, + ) -> anyhow::Result { + self.dispatch( + Method::PUT, + format!("control/v1/tenant/{tenant_id}/shard_split"), + Some(TenantShardSplitRequest { + new_shard_count, + new_stripe_size, + }), + ) + .await + } + + #[instrument(skip_all, fields(node_id=%req.node_id))] + pub async fn node_register(&self, req: NodeRegisterRequest) -> anyhow::Result<()> { + self.dispatch::<_, ()>(Method::POST, "control/v1/node".to_string(), Some(req)) + .await + } + + #[instrument(skip_all, fields(node_id=%req.node_id))] + pub async fn node_configure(&self, req: NodeConfigureRequest) -> anyhow::Result<()> { + self.dispatch::<_, ()>( + Method::PUT, + format!("control/v1/node/{}/config", req.node_id), + Some(req), + ) + .await + } + + #[instrument(skip(self))] + pub async fn ready(&self) -> anyhow::Result<()> { + self.dispatch::<(), ()>(Method::GET, "ready".to_string(), None) + .await + } + + #[instrument(skip_all, fields(%tenant_id, timeline_id=%req.new_timeline_id))] + pub async fn tenant_timeline_create( + &self, + tenant_id: TenantId, + req: TimelineCreateRequest, + ) -> anyhow::Result { + self.dispatch( + Method::POST, + format!("v1/tenant/{tenant_id}/timeline"), + Some(req), + ) + .await + } +} diff --git a/diesel.toml b/diesel.toml new file mode 100644 index 0000000000..30ed4444d7 --- /dev/null +++ b/diesel.toml @@ -0,0 +1,9 @@ +# For documentation on how to configure this file, +# see https://diesel.rs/guides/configuring-diesel-cli + +[print_schema] +file = "control_plane/attachment_service/src/schema.rs" +custom_type_derives = ["diesel::query_builder::QueryId"] + +[migrations_directory] +dir = "control_plane/attachment_service/migrations" diff --git a/docs/authentication.md b/docs/authentication.md index f768b04c5b..522c5481b4 100644 --- 
a/docs/authentication.md +++ b/docs/authentication.md @@ -70,6 +70,9 @@ Should only be used e.g. for status check/tenant creation/list. Should only be used e.g. for status check. Currently also used for connection from any pageserver to any safekeeper. +"generations_api": Provides access to the upcall APIs served by the storage controller or the control plane. + +"admin": Provides access to the control plane and admin APIs of the storage controller. ### CLI CLI generates a key pair during call to `neon_local init` with the following commands: diff --git a/docs/docker.md b/docs/docker.md index 9761cc4346..cbf68be3a7 100644 --- a/docs/docker.md +++ b/docs/docker.md @@ -21,7 +21,7 @@ We build all images after a successful `release` tests run and push automaticall ## Docker Compose example -You can see a [docker compose](https://docs.docker.com/compose/) example to create a neon cluster in [/docker-compose/docker-compose.yml](/docker-compose/docker-compose.yml). It creates the following conatainers. +You can see a [docker compose](https://docs.docker.com/compose/) example to create a neon cluster in [/docker-compose/docker-compose.yml](/docker-compose/docker-compose.yml). It creates the following containers. - pageserver x 1 - safekeeper x 3 @@ -38,7 +38,7 @@ You can specify version of neon cluster using following environment values. - TAG: the tag version of [docker image](https://registry.hub.docker.com/r/neondatabase/neon/tags) (default is latest), which is tagged in [CI test](/.github/workflows/build_and_test.yml) ``` $ cd docker-compose/ -$ docker-compose down # remove the conainers if exists +$ docker-compose down # remove the containers if exists $ PG_VERSION=15 TAG=2937 docker-compose up --build -d # You can specify the postgres and image version Creating network "dockercompose_default" with the default driver Creating docker-compose_storage_broker_1 ... 
done diff --git a/docs/pageserver-storage.md b/docs/pageserver-storage.md index 77e7ff35bc..9902f6b930 100644 --- a/docs/pageserver-storage.md +++ b/docs/pageserver-storage.md @@ -64,7 +64,7 @@ Storage. The LayerMap tracks what layers exist in a timeline. -Currently, the layer map is just a resizeable array (Vec). On a GetPage@LSN or +Currently, the layer map is just a resizable array (Vec). On a GetPage@LSN or other read request, the layer map scans through the array to find the right layer that contains the data for the requested page. The read-code in LayeredTimeline is aware of the ancestor, and returns data from the ancestor timeline if it's diff --git a/docs/pageserver-thread-mgmt.md b/docs/pageserver-thread-mgmt.md index c911d2c53d..5d862415eb 100644 --- a/docs/pageserver-thread-mgmt.md +++ b/docs/pageserver-thread-mgmt.md @@ -22,7 +22,7 @@ timeline to shutdown. It will also wait for them to finish. A task registered in the task registry can check if it has been requested to shut down, by calling `is_shutdown_requested()`. There's -also a `shudown_watcher()` Future that can be used with `tokio::select!` +also a `shutdown_watcher()` Future that can be used with `tokio::select!` or similar, to wake up on shutdown. diff --git a/docs/pageserver-walredo.md b/docs/pageserver-walredo.md index 1de9c177cc..7b366ff616 100644 --- a/docs/pageserver-walredo.md +++ b/docs/pageserver-walredo.md @@ -74,4 +74,4 @@ somewhat wasteful, but because most WAL records only affect one page, the overhead is acceptable. The WAL redo always happens for one particular page. If the WAL record -coantains changes to other pages, they are ignored. +contains changes to other pages, they are ignored. 
diff --git a/docs/rfcs/002-storage.md b/docs/rfcs/002-storage.md index f99683cf09..d11b750e73 100644 --- a/docs/rfcs/002-storage.md +++ b/docs/rfcs/002-storage.md @@ -1,4 +1,4 @@ -# Zenith storage node — alternative +# Neon storage node — alternative ## **Design considerations** diff --git a/docs/rfcs/003-laptop-cli.md b/docs/rfcs/003-laptop-cli.md index 1a549c2df5..003a05bd16 100644 --- a/docs/rfcs/003-laptop-cli.md +++ b/docs/rfcs/003-laptop-cli.md @@ -1,6 +1,6 @@ # Command line interface (end-user) -Zenith CLI as it is described here mostly resides on the same conceptual level as pg_ctl/initdb/pg_recvxlog/etc and replaces some of them in an opinionated way. I would also suggest bundling our patched postgres inside zenith distribution at least at the start. +Neon CLI as it is described here mostly resides on the same conceptual level as pg_ctl/initdb/pg_recvxlog/etc and replaces some of them in an opinionated way. I would also suggest bundling our patched postgres inside neon distribution at least at the start. This proposal is focused on managing local installations. For cluster operations, different tooling would be needed. The point of integration between the two is storage URL: no matter how complex cluster setup is it may provide an endpoint where the user may push snapshots. 
@@ -8,40 +8,40 @@ The most important concept here is a snapshot, which can be created/pushed/pulle # Possible usage scenarios -## Install zenith, run a postgres +## Install neon, run a postgres ``` -> brew install pg-zenith -> zenith pg create # creates pgdata with default pattern pgdata$i -> zenith pg list +> brew install pg-neon +> neon pg create # creates pgdata with default pattern pgdata$i +> neon pg list ID PGDATA USED STORAGE ENDPOINT -primary1 pgdata1 0G zenith-local localhost:5432 +primary1 pgdata1 0G neon-local localhost:5432 ``` -## Import standalone postgres to zenith +## Import standalone postgres to neon ``` -> zenith snapshot import --from=basebackup://replication@localhost:5432/ oldpg +> neon snapshot import --from=basebackup://replication@localhost:5432/ oldpg [====================------------] 60% | 20MB/s -> zenith snapshot list +> neon snapshot list ID SIZE PARENT oldpg 5G - -> zenith pg create --snapshot oldpg +> neon pg create --snapshot oldpg Started postgres on localhost:5432 -> zenith pg list +> neon pg list ID PGDATA USED STORAGE ENDPOINT -primary1 pgdata1 5G zenith-local localhost:5432 +primary1 pgdata1 5G neon-local localhost:5432 -> zenith snapshot destroy oldpg +> neon snapshot destroy oldpg Ok ``` Also, we may start snapshot import implicitly by looking at snapshot schema ``` -> zenith pg create --snapshot basebackup://replication@localhost:5432/ +> neon pg create --snapshot basebackup://replication@localhost:5432/ Downloading snapshot... Done. Started postgres on localhost:5432 Destroying snapshot... Done. @@ -52,39 +52,39 @@ Destroying snapshot... Done. Since we may export the whole snapshot as one big file (tar of basebackup, maybe with some manifest) it may be shared over conventional means: http, ssh, [git+lfs](https://docs.github.com/en/github/managing-large-files/about-git-large-file-storage). 
``` -> zenith pg create --snapshot http://learn-postgres.com/movies_db.zenith movies +> neon pg create --snapshot http://learn-postgres.com/movies_db.neon movies ``` ## Create snapshot and push it to the cloud ``` -> zenith snapshot create pgdata1@snap1 -> zenith snapshot push --to ssh://stas@zenith.tech pgdata1@snap1 +> neon snapshot create pgdata1@snap1 +> neon snapshot push --to ssh://stas@neon.tech pgdata1@snap1 ``` ## Rollback database to the snapshot -One way to rollback the database is just to init a new database from the snapshot and destroy the old one. But creating a new database from a snapshot would require a copy of that snapshot which is time consuming operation. Another option that would be cool to support is the ability to create the copy-on-write database from the snapshot without copying data, and store updated pages in a separate location, however that way would have performance implications. So to properly rollback the database to the older state we have `zenith pg checkout`. +One way to rollback the database is just to init a new database from the snapshot and destroy the old one. But creating a new database from a snapshot would require a copy of that snapshot which is time consuming operation. Another option that would be cool to support is the ability to create the copy-on-write database from the snapshot without copying data, and store updated pages in a separate location, however that way would have performance implications. So to properly rollback the database to the older state we have `neon pg checkout`. 
``` -> zenith pg list +> neon pg list ID PGDATA USED STORAGE ENDPOINT -primary1 pgdata1 5G zenith-local localhost:5432 +primary1 pgdata1 5G neon-local localhost:5432 -> zenith snapshot create pgdata1@snap1 +> neon snapshot create pgdata1@snap1 -> zenith snapshot list +> neon snapshot list ID SIZE PARENT oldpg 5G - pgdata1@snap1 6G - pgdata1@CURRENT 6G - -> zenith pg checkout pgdata1@snap1 +> neon pg checkout pgdata1@snap1 Stopping postgres on pgdata1. Rolling back pgdata1@CURRENT to pgdata1@snap1. Starting postgres on pgdata1. -> zenith snapshot list +> neon snapshot list ID SIZE PARENT oldpg 5G - pgdata1@snap1 6G - @@ -99,7 +99,7 @@ Some notes: pgdata1@CURRENT -- implicit snapshot representing the current state PITR area acts like a continuous snapshot where you can reset the database to any point in time within this area (by area I mean some TTL period or some size limit, both possibly infinite). ``` -> zenith pitr create --storage s3tank --ttl 30d --name pitr_last_month +> neon pitr create --storage s3tank --ttl 30d --name pitr_last_month ``` Resetting the database to some state in past would require creating a snapshot on some lsn / time in this pirt area. @@ -108,29 +108,29 @@ Resetting the database to some state in past would require creating a snapshot o ## storage -Storage is either zenith pagestore or s3. Users may create a database in a pagestore and create/move *snapshots* and *pitr regions* in both pagestore and s3. Storage is a concept similar to `git remote`. After installation, I imagine one local storage is available by default. +Storage is either neon pagestore or s3. Users may create a database in a pagestore and create/move *snapshots* and *pitr regions* in both pagestore and s3. Storage is a concept similar to `git remote`. After installation, I imagine one local storage is available by default. 
-**zenith storage attach** -t [native|s3] -c key=value -n name +**neon storage attach** -t [native|s3] -c key=value -n name -Attaches/initializes storage. For --type=s3, user credentials and path should be provided. For --type=native we may support --path=/local/path and --url=zenith.tech/stas/mystore. Other possible term for native is 'zstore'. +Attaches/initializes storage. For --type=s3, user credentials and path should be provided. For --type=native we may support --path=/local/path and --url=neon.tech/stas/mystore. Other possible term for native is 'zstore'. -**zenith storage list** +**neon storage list** Show currently attached storages. For example: ``` -> zenith storage list +> neon storage list NAME USED TYPE OPTIONS PATH -local 5.1G zenith-local /opt/zenith/store/local -local.compr 20.4G zenith-local compression=on /opt/zenith/store/local.compr -zcloud 60G zenith-remote zenith.tech/stas/mystore +local 5.1G neon-local /opt/neon/store/local +local.compr 20.4G neon-local compression=on /opt/neon/store/local.compr +zcloud 60G neon-remote neon.tech/stas/mystore s3tank 80G S3 ``` -**zenith storage detach** +**neon storage detach** -**zenith storage show** +**neon storage show** @@ -140,29 +140,29 @@ Manages postgres data directories and can start postgres instances with proper c Pg is a term for a single postgres running on some data. I'm trying to avoid separation of datadir management and postgres instance management -- both that concepts bundled here together. -**zenith pg create** [--no-start --snapshot --cow] -s storage-name -n pgdata +**neon pg create** [--no-start --snapshot --cow] -s storage-name -n pgdata Creates (initializes) new data directory in given storage and starts postgres. I imagine that storage for this operation may be only local and data movement to remote location happens through snapshots/pitr. --no-start: just init datadir without creating ---snapshot snap: init from the snapshot. 
Snap is a name or URL (zenith.tech/stas/mystore/snap1) +--snapshot snap: init from the snapshot. Snap is a name or URL (neon.tech/stas/mystore/snap1) --cow: initialize Copy-on-Write data directory on top of some snapshot (makes sense if it is a snapshot of currently running a database) -**zenith pg destroy** +**neon pg destroy** -**zenith pg start** [--replica] pgdata +**neon pg start** [--replica] pgdata Start postgres with proper extensions preloaded/installed. -**zenith pg checkout** +**neon pg checkout** Rollback data directory to some previous snapshot. -**zenith pg stop** pg_id +**neon pg stop** pg_id -**zenith pg list** +**neon pg list** ``` ROLE PGDATA USED STORAGE ENDPOINT @@ -173,7 +173,7 @@ primary my_pg2 3.2G local.compr localhost:5435 - my_pg3 9.2G local.compr - ``` -**zenith pg show** +**neon pg show** ``` my_pg: @@ -194,7 +194,7 @@ my_pg: ``` -**zenith pg start-rest/graphql** pgdata +**neon pg start-rest/graphql** pgdata Starts REST/GraphQL proxy on top of postgres master. Not sure we should do that, just an idea. @@ -203,35 +203,35 @@ Starts REST/GraphQL proxy on top of postgres master. Not sure we should do that, Snapshot creation is cheap -- no actual data is copied, we just start retaining old pages. Snapshot size means the amount of retained data, not all data. Snapshot name looks like pgdata_name@tag_name. tag_name is set by the user during snapshot creation. There are some reserved tag names: CURRENT represents the current state of the data directory; HEAD{i} represents the data directory state that resided in the database before i-th checkout. -**zenith snapshot create** pgdata_name@snap_name +**neon snapshot create** pgdata_name@snap_name Creates a new snapshot in the same storage where pgdata_name exists. -**zenith snapshot push** --to url pgdata_name@snap_name +**neon snapshot push** --to url pgdata_name@snap_name -Produces binary stream of a given snapshot. 
Under the hood starts temp read-only postgres over this snapshot and sends basebackup stream. Receiving side should start `zenith snapshot recv` before push happens. If url has some special schema like zenith:// receiving side may require auth start `zenith snapshot recv` on the go. +Produces binary stream of a given snapshot. Under the hood starts temp read-only postgres over this snapshot and sends basebackup stream. Receiving side should start `neon snapshot recv` before push happens. If url has some special schema like neon:// receiving side may require auth start `neon snapshot recv` on the go. -**zenith snapshot recv** +**neon snapshot recv** Starts a port listening for a basebackup stream, prints connection info to stdout (so that user may use that in push command), and expects data on that socket. -**zenith snapshot pull** --from url or path +**neon snapshot pull** --from url or path -Connects to a remote zenith/s3/file and pulls snapshot. The remote site should be zenith service or files in our format. +Connects to a remote neon/s3/file and pulls snapshot. The remote site should be neon service or files in our format. -**zenith snapshot import** --from basebackup://<...> or path +**neon snapshot import** --from basebackup://<...> or path Creates a new snapshot out of running postgres via basebackup protocol or basebackup files. -**zenith snapshot export** +**neon snapshot export** -Starts read-only postgres over this snapshot and exports data in some format (pg_dump, or COPY TO on some/all tables). One of the options may be zenith own format which is handy for us (but I think just tar of basebackup would be okay). +Starts read-only postgres over this snapshot and exports data in some format (pg_dump, or COPY TO on some/all tables). One of the options may be neon own format which is handy for us (but I think just tar of basebackup would be okay). 
-**zenith snapshot diff** snap1 snap2 +**neon snapshot diff** snap1 snap2 Shows size of data changed between two snapshots. We also may provide options to diff schema/data in tables. To do that start temp read-only postgreses. -**zenith snapshot destroy** +**neon snapshot destroy** ## pitr @@ -239,7 +239,7 @@ Pitr represents wal stream and ttl policy for that stream XXX: any suggestions on a better name? -**zenith pitr create** name +**neon pitr create** name --ttl = inf | period @@ -247,21 +247,21 @@ XXX: any suggestions on a better name? --storage = storage_name -**zenith pitr extract-snapshot** pitr_name --lsn xxx +**neon pitr extract-snapshot** pitr_name --lsn xxx Creates a snapshot out of some lsn in PITR area. The obtained snapshot may be managed with snapshot routines (move/send/export) -**zenith pitr gc** pitr_name +**neon pitr gc** pitr_name Force garbage collection on some PITR area. -**zenith pitr list** +**neon pitr list** -**zenith pitr destroy** +**neon pitr destroy** ## console -**zenith console** +**neon console** Opens browser targeted at web console with the more or less same functionality as described here. diff --git a/docs/rfcs/004-durability.md b/docs/rfcs/004-durability.md index d4716156d1..6b83c77403 100644 --- a/docs/rfcs/004-durability.md +++ b/docs/rfcs/004-durability.md @@ -6,7 +6,7 @@ When do we consider the WAL record as durable, so that we can acknowledge the commit to the client and be reasonably certain that we will not lose the transaction? -Zenith uses a group of WAL safekeeper nodes to hold the generated WAL. +Neon uses a group of WAL safekeeper nodes to hold the generated WAL. A WAL record is considered durable, when it has been written to a majority of WAL safekeeper nodes. In this document, I use 5 safekeepers, because I have five fingers. 
A WAL record is durable, diff --git a/docs/rfcs/005-zenith_local.md b/docs/rfcs/005-zenith_local.md index e36d0a9ae3..6c283d7a37 100644 --- a/docs/rfcs/005-zenith_local.md +++ b/docs/rfcs/005-zenith_local.md @@ -1,23 +1,23 @@ -# Zenith local +# Neon local -Here I list some objectives to keep in mind when discussing zenith-local design and a proposal that brings all components together. Your comments on both parts are very welcome. +Here I list some objectives to keep in mind when discussing neon-local design and a proposal that brings all components together. Your comments on both parts are very welcome. #### Why do we need it? - For distribution - this easy to use binary will help us to build adoption among developers. - For internal use - to test all components together. -In my understanding, we consider it to be just a mock-up version of zenith-cloud. +In my understanding, we consider it to be just a mock-up version of neon-cloud. > Question: How much should we care about durability and security issues for a local setup? #### Why is it better than a simple local postgres? -- Easy one-line setup. As simple as `cargo install zenith && zenith start` +- Easy one-line setup. As simple as `cargo install neon && neon start` - Quick and cheap creation of compute nodes over the same storage. > Question: How can we describe a use-case for this feature? -- Zenith-local can work with S3 directly. +- Neon-local can work with S3 directly. - Push and pull images (snapshots) to remote S3 to exchange data with other users. @@ -31,50 +31,50 @@ Ideally, just one binary that incorporates all elements we need. #### Components: -- **zenith-CLI** - interface for end-users. Turns commands to REST requests and handles responses to show them in a user-friendly way. 
-CLI proposal is here https://github.com/libzenith/rfcs/blob/003-laptop-cli.md/003-laptop-cli.md -WIP code is here: https://github.com/libzenith/postgres/tree/main/pageserver/src/bin/cli +- **neon-CLI** - interface for end-users. Turns commands to REST requests and handles responses to show them in a user-friendly way. +CLI proposal is here https://github.com/neondatabase/rfcs/blob/003-laptop-cli.md/003-laptop-cli.md +WIP code is here: https://github.com/neondatabase/postgres/tree/main/pageserver/src/bin/cli -- **zenith-console** - WEB UI with same functionality as CLI. +- **neon-console** - WEB UI with same functionality as CLI. >Note: not for the first release. -- **zenith-local** - entrypoint. Service that starts all other components and handles REST API requests. See REST API proposal below. - > Idea: spawn all other components as child processes, so that we could shutdown everything by stopping zenith-local. +- **neon-local** - entrypoint. Service that starts all other components and handles REST API requests. See REST API proposal below. + > Idea: spawn all other components as child processes, so that we could shutdown everything by stopping neon-local. -- **zenith-pageserver** - consists of a storage and WAL-replaying service (modified PG in current implementation). +- **neon-pageserver** - consists of a storage and WAL-replaying service (modified PG in current implementation). > Question: Probably, for local setup we should be able to bypass page-storage and interact directly with S3 to avoid double caching in shared buffers and page-server? -WIP code is here: https://github.com/libzenith/postgres/tree/main/pageserver/src +WIP code is here: https://github.com/neondatabase/postgres/tree/main/pageserver/src -- **zenith-S3** - stores base images of the database and WAL in S3 object storage. Import and export images from/to zenith. +- **neon-S3** - stores base images of the database and WAL in S3 object storage. Import and export images from/to neon. 
> Question: How should it operate in a local setup? Will we manage it ourselves or ask user to provide credentials for existing S3 object storage (i.e. minio)? > Question: Do we use it together with local page store or they are interchangeable? WIP code is ??? -- **zenith-safekeeper** - receives WAL from postgres, stores it durably, answers to Postgres that "sync" is succeed. +- **neon-safekeeper** - receives WAL from postgres, stores it durably, answers to Postgres that "sync" has succeeded. > Question: How should it operate in a local setup? In my understanding it should push WAL directly to S3 (if we use it) or store all data locally (if we use local page storage). The latter option seems meaningless (extra overhead and no gain), but it is still good to test the system. -WIP code is here: https://github.com/libzenith/postgres/tree/main/src/bin/safekeeper +WIP code is here: https://github.com/neondatabase/postgres/tree/main/src/bin/safekeeper -- **zenith-computenode** - bottomless PostgreSQL, ideally upstream, but for a start - our modified version. User can quickly create and destroy them and work with it as a regular postgres database. +- **neon-computenode** - bottomless PostgreSQL, ideally upstream, but for a start - our modified version. User can quickly create and destroy them and work with it as a regular postgres database. - WIP code is in main branch and here: https://github.com/libzenith/postgres/commits/compute_node + WIP code is in main branch and here: https://github.com/neondatabase/postgres/commits/compute_node #### REST API: Service endpoint: `http://localhost:3000` Resources: -- /storages - Where data lives: zenith-pageserver or zenith-s3 -- /pgs - Postgres - zenith-computenode +- /storages - Where data lives: neon-pageserver or neon-s3 +- /pgs - Postgres - neon-computenode - /snapshots - snapshots **TODO** ->Question: Do we want to extend this API to manage zenith components? I.e. start page-server, manage safekeepers and so on?
Or they will be hardcoded to just start once and for all? +>Question: Do we want to extend this API to manage neon components? I.e. start page-server, manage safekeepers and so on? Or they will be hardcoded to just start once and for all? Methods and their mapping to CLI: -- /storages - zenith-pageserver or zenith-s3 +- /storages - neon-pageserver or neon-s3 CLI | REST API ------------- | ------------- @@ -84,7 +84,7 @@ storage list | GET /storages storage show -n name | GET /storages/:storage_name -- /pgs - zenith-computenode +- /pgs - neon-computenode CLI | REST API ------------- | ------------- diff --git a/docs/rfcs/006-laptop-cli-v2-CLI.md b/docs/rfcs/006-laptop-cli-v2-CLI.md index 84dc932211..5030ecc7e7 100644 --- a/docs/rfcs/006-laptop-cli-v2-CLI.md +++ b/docs/rfcs/006-laptop-cli-v2-CLI.md @@ -1,45 +1,45 @@ -Zenith CLI allows you to operate database clusters (catalog clusters) and their commit history locally and in the cloud. Since ANSI calls them catalog clusters and cluster is a loaded term in the modern infrastructure we will call it "catalog". +Neon CLI allows you to operate database clusters (catalog clusters) and their commit history locally and in the cloud. Since ANSI calls them catalog clusters and cluster is a loaded term in the modern infrastructure we will call it "catalog". # CLI v2 (after chatting with Carl) -Zenith introduces the notion of a repository. +Neon introduces the notion of a repository. 
```bash -zenith init -zenith clone zenith://zenith.tech/piedpiper/northwind -- clones a repo to the northwind directory +neon init +neon clone neon://neon.tech/piedpiper/northwind -- clones a repo to the northwind directory ``` Once you have a cluster catalog you can explore it ```bash -zenith log -- returns a list of commits -zenith status -- returns if there are changes in the catalog that can be committed -zenith commit -- commits the changes and generates a new commit hash -zenith branch experimental -- creates a branch called testdb based on a given commit hash +neon log -- returns a list of commits +neon status -- returns if there are changes in the catalog that can be committed +neon commit -- commits the changes and generates a new commit hash +neon branch experimental -- creates a branch called experimental based on a given commit hash ``` To make changes in the catalog you need to run compute nodes ```bash -- here is how you a compute node -zenith start /home/pipedpiper/northwind:main -- starts a compute instance -zenith start zenith://zenith.tech/northwind:main -- starts a compute instance in the cloud +neon start /home/pipedpiper/northwind:main -- starts a compute instance +neon start neon://neon.tech/northwind:main -- starts a compute instance in the cloud -- you can start a compute node against any hash or branch -zenith start /home/pipedpiper/northwind:experimental --port 8008 -- start another compute instance (on different port) +neon start /home/pipedpiper/northwind:experimental --port 8008 -- start another compute instance (on different port) -- you can start a compute node against any hash or branch -zenith start /home/pipedpiper/northwind: --port 8009 -- start another compute instance (on different port) +neon start /home/pipedpiper/northwind: --port 8009 -- start another compute instance (on different port) -- After running some DML you can run --- zenith status and see how there are two WAL streams one on top of +-- neon status and see how there are
two WAL streams one on top of -- the main branch -zenith status +neon status -- and another on top of the experimental branch -zenith status -b experimental +neon status -b experimental -- you can commit each branch separately -zenith commit main +neon commit main -- or -zenith commit -c /home/pipedpiper/northwind:experimental +neon commit -c /home/pipedpiper/northwind:experimental ``` Starting compute instances against cloud environments @@ -47,18 +47,18 @@ Starting compute instances against cloud environments ```bash -- you can start a compute instance against the cloud environment -- in this case all of the changes will be streamed into the cloud -zenith start https://zenith:tech/pipedpiper/northwind:main -zenith start https://zenith:tech/pipedpiper/northwind:main -zenith status -c https://zenith:tech/pipedpiper/northwind:main -zenith commit -c https://zenith:tech/pipedpiper/northwind:main -zenith branch -c https://zenith:tech/pipedpiper/northwind: experimental +neon start https://neon.tech/pipedpiper/northwind:main +neon start https://neon.tech/pipedpiper/northwind:main +neon status -c https://neon.tech/pipedpiper/northwind:main +neon commit -c https://neon.tech/pipedpiper/northwind:main +neon branch -c https://neon.tech/pipedpiper/northwind: experimental ``` Pushing data into the cloud ```bash -- pull all the commits from the cloud -zenith pull +neon pull -- push all the commits to the cloud -zenith push +neon push ``` diff --git a/docs/rfcs/006-laptop-cli-v2-repository-structure.md b/docs/rfcs/006-laptop-cli-v2-repository-structure.md index e6e6e172ad..749a940313 100644 --- a/docs/rfcs/006-laptop-cli-v2-repository-structure.md +++ b/docs/rfcs/006-laptop-cli-v2-repository-structure.md @@ -1,14 +1,14 @@ # Repository format -A Zenith repository is similar to a traditional PostgreSQL backup +A Neon repository is similar to a traditional PostgreSQL backup archive, like a WAL-G bucket or pgbarman backup catalogue.
It holds multiple versions of a PostgreSQL database cluster. -The distinguishing feature is that you can launch a Zenith Postgres +The distinguishing feature is that you can launch a Neon Postgres server directly against a branch in the repository, without having to -"restore" it first. Also, Zenith manages the storage automatically, +"restore" it first. Also, Neon manages the storage automatically, there is no separation between full and incremental backups nor WAL -archive. Zenith relies heavily on the WAL, and uses concepts similar +archive. Neon relies heavily on the WAL, and uses concepts similar to incremental backups and WAL archiving internally, but it is hidden from the user. @@ -19,15 +19,15 @@ efficient. Just something to get us started. The repository directory looks like this: - .zenith/timelines/4543be3daeab2ed4e58a285cbb8dd1fce6970f8c/wal/ - .zenith/timelines/4543be3daeab2ed4e58a285cbb8dd1fce6970f8c/snapshots// - .zenith/timelines/4543be3daeab2ed4e58a285cbb8dd1fce6970f8c/history + .neon/timelines/4543be3daeab2ed4e58a285cbb8dd1fce6970f8c/wal/ + .neon/timelines/4543be3daeab2ed4e58a285cbb8dd1fce6970f8c/snapshots// + .neon/timelines/4543be3daeab2ed4e58a285cbb8dd1fce6970f8c/history - .zenith/refs/branches/mybranch - .zenith/refs/tags/foo - .zenith/refs/tags/bar + .neon/refs/branches/mybranch + .neon/refs/tags/foo + .neon/refs/tags/bar - .zenith/datadirs/ + .neon/datadirs/ ### Timelines @@ -39,7 +39,7 @@ All WAL is generated on a timeline. You can launch a read-only node against a tag or arbitrary LSN on a timeline, but in order to write, you need to create a timeline. -Each timeline is stored in a directory under .zenith/timelines. It +Each timeline is stored in a directory under .neon/timelines. It consists of a WAL archive, containing all the WAL in the standard PostgreSQL format, under the wal/ subdirectory. @@ -66,18 +66,18 @@ contains the UUID of the timeline (and LSN, for tags). ### Datadirs -.zenith/datadirs contains PostgreSQL data directories. 
You can launch +.neon/datadirs contains PostgreSQL data directories. You can launch a Postgres instance on one of them with: ``` - postgres -D .zenith/datadirs/4543be3daeab2ed4e58a285cbb8dd1fce6970f8c + postgres -D .neon/datadirs/4543be3daeab2ed4e58a285cbb8dd1fce6970f8c ``` All the actual data is kept in the timeline directories, under -.zenith/timelines. The data directories are only needed for active +.neon/timelines. The data directories are only needed for active PostgreQSL instances. After an instance is stopped, the data directory -can be safely removed. "zenith start" will recreate it quickly from -the data in .zenith/timelines, if it's missing. +can be safely removed. "neon start" will recreate it quickly from +the data in .neon/timelines, if it's missing. ## Version 2 @@ -103,14 +103,14 @@ more advanced. The exact format is TODO. But it should support: ### Garbage collection -When you run "zenith gc", old timelines that are no longer needed are +When you run "neon gc", old timelines that are no longer needed are removed. That involves collecting the list of "unreachable" objects, starting from the named branches and tags. Also, if enough WAL has been generated on a timeline since last snapshot, a new snapshot or delta is created. -### zenith push/pull +### neon push/pull Compare the tags and branches on both servers, and copy missing ones. For each branch, compare the timeline it points to in both servers. If @@ -123,7 +123,7 @@ every time you start up an instance? Then you would detect that the timelines have diverged. That would match with the "epoch" concept that we have in the WAL safekeeper -### zenith checkout/commit +### neon checkout/commit In this format, there is no concept of a "working tree", and hence no concept of checking out or committing. All modifications are done on @@ -134,7 +134,7 @@ You can easily fork off a temporary timeline to emulate a "working tree". 
You can later remove it and have it garbage collected, or to "commit", re-point the branch to the new timeline. -If we want to have a worktree and "zenith checkout/commit" concept, we can +If we want to have a worktree and "neon checkout/commit" concept, we can emulate that with a temporary timeline. Create the temporary timeline at -"zenith checkout", and have "zenith commit" modify the branch to point to +"neon checkout", and have "neon commit" modify the branch to point to the new timeline. diff --git a/docs/rfcs/007-serverless-on-laptop.md b/docs/rfcs/007-serverless-on-laptop.md index e6355f4a03..96f117bfe9 100644 --- a/docs/rfcs/007-serverless-on-laptop.md +++ b/docs/rfcs/007-serverless-on-laptop.md @@ -4,27 +4,27 @@ How it works now 1. Create repository, start page server on it ``` -$ zenith init +$ neon init ... created main branch -new zenith repository was created in .zenith +new neon repository was created in .neon -$ zenith pageserver start -Starting pageserver at '127.0.0.1:64000' in .zenith +$ neon pageserver start +Starting pageserver at '127.0.0.1:64000' in .neon Page server started ``` 2. Create a branch, and start a Postgres instance on it ``` -$ zenith branch heikki main +$ neon branch heikki main branching at end of WAL: 0/15ECF68 -$ zenith pg create heikki +$ neon pg create heikki Initializing Postgres on timeline 76cf9279915be7797095241638e64644... -Extracting base backup to create postgres instance: path=.zenith/pgdatadirs/pg1 port=55432 +Extracting base backup to create postgres instance: path=.neon/pgdatadirs/pg1 port=55432 -$ zenith pg start pg1 +$ neon pg start pg1 Starting postgres node at 'host=127.0.0.1 port=55432 user=heikki' waiting for server to start.... done server started @@ -52,20 +52,20 @@ serverless on your laptop, so that the workflow becomes just: 1. Create repository, start page server on it (same as before) ``` -$ zenith init +$ neon init ... 
created main branch -new zenith repository was created in .zenith +new neon repository was created in .neon -$ zenith pageserver start -Starting pageserver at '127.0.0.1:64000' in .zenith +$ neon pageserver start +Starting pageserver at '127.0.0.1:64000' in .neon Page server started ``` 2. Create branch ``` -$ zenith branch heikki main +$ neon branch heikki main branching at end of WAL: 0/15ECF68 ``` diff --git a/docs/rfcs/008-push-pull.md b/docs/rfcs/008-push-pull.md index 272628e1ce..a36932222a 100644 --- a/docs/rfcs/008-push-pull.md +++ b/docs/rfcs/008-push-pull.md @@ -7,22 +7,22 @@ Here is a proposal about implementing push/pull mechanics between pageservers. W The origin represents connection info for some remote pageserver. Let's use here same commands as git uses except using explicit list subcommand (git uses `origin -v` for that). ``` -zenith origin add -zenith origin list -zenith origin remove +neon origin add +neon origin list +neon origin remove ``` Connection URI a string of form `postgresql://user:pass@hostname:port` (https://www.postgresql.org/docs/13/libpq-connect.html#id-1.7.3.8.3.6). We can start with libpq password auth and later add support for client certs or require ssh as transport or invent some other kind of transport. -Behind the scenes, this commands may update toml file inside .zenith directory. +Behind the scenes, these commands may update toml file inside .neon directory.
## Push ### Pushing branch ``` -zenith push mybranch cloudserver # push to eponymous branch in cloudserver -zenith push mybranch cloudserver:otherbranch # push to a different branch in cloudserver +neon push mybranch cloudserver # push to eponymous branch in cloudserver +neon push mybranch cloudserver:otherbranch # push to a different branch in cloudserver ``` Exact mechanics would be slightly different in the following situations: diff --git a/docs/rfcs/009-snapshot-first-storage-cli.md b/docs/rfcs/009-snapshot-first-storage-cli.md index 0acbd68f86..bbd0f75fe2 100644 --- a/docs/rfcs/009-snapshot-first-storage-cli.md +++ b/docs/rfcs/009-snapshot-first-storage-cli.md @@ -2,7 +2,7 @@ While working on export/import commands, I understood that they fit really well We may think about backups as snapshots in a different format (i.e plain pgdata format, basebackup tar format, WAL-G format (if they want to support it) and so on). They use same storage API, the only difference is the code that packs/unpacks files. -Even if zenith aims to maintains durability using it's own snapshots, backups will be useful for uploading data from postgres to zenith. +Even if neon aims to maintain durability using its own snapshots, backups will be useful for uploading data from postgres to neon. So here is an attempt to design consistent CLI for different usage scenarios: @@ -16,8 +16,8 @@ Save`storage_dest` and other parameters in config.
Push snapshots to `storage_dest` in background. ``` -//I.e. we want to start zenith on top of existing $PGDATA and use s3 as a persistent storage. -zenith init --snapshot_path=FILE_PREFIX --snapshot_format=pgdata --storage_dest=S3_PREFIX -zenith start +//I.e. we want to start neon on top of existing $PGDATA and use s3 as a persistent storage. +neon init --snapshot_path=FILE_PREFIX --snapshot_format=pgdata --storage_dest=S3_PREFIX +neon start ``` How to pass credentials needed for `snapshot_path`? #### 4. Export. Manually push snapshot to `snapshot_path` which differs from `storage_dest` -Optionally set `snapshot_format`, which can be plain pgdata format or zenith format. +Optionally set `snapshot_format`, which can be plain pgdata format or neon format. ``` -zenith export --snapshot_path=FILE_PREFIX --snapshot_format=pgdata +neon export --snapshot_path=FILE_PREFIX --snapshot_format=pgdata ``` #### Notes and questions - safekeeper s3_offload should use same (similar) syntax for storage. How to set it in UI? -- Why do we need `zenith init` as a separate command? Can't we init everything at first start? +- Why do we need `neon init` as a separate command? Can't we init everything at first start? - We can think of better names for all options. - Export to plain postgres format will be useless, if we are not 100% compatible on page level. I can recall at least one such difference - PD_WAL_LOGGED flag in pages. diff --git a/docs/rfcs/013-term-history.md b/docs/rfcs/013-term-history.md index 7e815abf73..2f3ccbc09b 100644 --- a/docs/rfcs/013-term-history.md +++ b/docs/rfcs/013-term-history.md @@ -9,7 +9,7 @@ receival and this might lag behind `term`; safekeeper switches to epoch `n` when it has received all committed log records from all `< n` terms. This roughly corresponds to proposed in -https://github.com/zenithdb/rfcs/pull/3/files +https://github.com/neondatabase/rfcs/pull/3/files This makes our biggest our difference from Raft. 
In Raft, every log record is diff --git a/docs/rfcs/014-safekeepers-gossip.md b/docs/rfcs/014-safekeepers-gossip.md index 3d6cc04b94..ff38a0a0ef 100644 --- a/docs/rfcs/014-safekeepers-gossip.md +++ b/docs/rfcs/014-safekeepers-gossip.md @@ -1,6 +1,6 @@ # Safekeeper gossip -Extracted from this [PR](https://github.com/zenithdb/rfcs/pull/13) +Extracted from this [PR](https://github.com/neondatabase/rfcs/pull/13) ## Motivation diff --git a/docs/rfcs/015-storage-messaging.md b/docs/rfcs/015-storage-messaging.md index a415b90459..7702311d65 100644 --- a/docs/rfcs/015-storage-messaging.md +++ b/docs/rfcs/015-storage-messaging.md @@ -2,7 +2,7 @@ Created on 19.01.22 -Initially created [here](https://github.com/zenithdb/rfcs/pull/16) by @kelvich. +Initially created [here](https://github.com/neondatabase/rfcs/pull/16) by @kelvich. That it is an alternative to (014-safekeeper-gossip)[] @@ -292,4 +292,4 @@ But with an etcd we are in a bit different situation: 1. We don't need persistency and strong consistency guarantees for the data we store in the etcd 2. etcd uses Grpc as a protocol, and messages are pretty simple -So it looks like implementing in-mem store with etcd interface is straightforward thing _if we will want that in future_. At the same time, we can avoid implementing it right now, and we will be able to run local zenith installation with etcd running somewhere in the background (as opposed to building and running console, which in turn requires Postgres). +So it looks like implementing in-mem store with etcd interface is straightforward thing _if we will want that in future_. At the same time, we can avoid implementing it right now, and we will be able to run local neon installation with etcd running somewhere in the background (as opposed to building and running console, which in turn requires Postgres). 
diff --git a/docs/rfcs/017-console-split.md b/docs/rfcs/017-console-split.md new file mode 100644 index 0000000000..8036920610 --- /dev/null +++ b/docs/rfcs/017-console-split.md @@ -0,0 +1,420 @@ +# Splitting cloud console + +Created on 17.06.2022 + +## Summary + +Currently we have `cloud` repository that contains code implementing public API for our clients as well as code for managing storage and internal infrastructure services. We can split everything user-related from everything storage-related to make it easier to test and maintain. + +This RFC proposes to introduce a new control-plane service with HTTP API. The overall architecture will look like this: + +```markup +. x + external area x internal area + (our clients) x (our services) + x + x ┌───────────────────────┐ + x ┌───────────────┐ > ┌─────────────────────┐ │ Storage (EC2) │ + x │ console db │ > │ control-plane db │ │ │ + x └───────────────┘ > └─────────────────────┘ │ - safekeepers │ + x ▲ > ▲ │ - pageservers │ + x │ > │ │ │ +┌──────────────────┐ x ┌───────┴───────┐ > │ │ Dependencies │ +│ browser UI ├──►│ │ > ┌──────────┴──────────┐ │ │ +└──────────────────┘ x │ │ > │ │ │ - etcd │ + x │ console ├───────►│ control-plane ├────►│ - S3 │ +┌──────────────────┐ x │ │ > │ (deployed in k8s) │ │ - more? 
│ +│public API clients├──►│ │ > │ │ │ │ +└──────────────────┘ x └───────┬───────┘ > └──────────┬──────────┘ └───────────────────────┘ + x │ > ▲ │ ▲ + x │ > │ │ │ + x ┌───────┴───────┐ > │ │ ┌───────────┴───────────┐ + x │ dependencies │ > │ │ │ │ + x │- analytics │ > │ └───────────────►│ computes │ + x │- auth │ > │ │ (deployed in k8s) │ + x │- billing │ > │ │ │ + x └───────────────┘ > │ └───────────────────────┘ + x > │ ▲ + x > ┌─────┴───────────────┐ │ +┌──────────────────┐ x > │ │ │ +│ │ x > │ proxy ├─────────────────┘ +│ postgres ├───────────────────────────►│ (deployed in k8s) │ +│ users │ x > │ │ +│ │ x > └─────────────────────┘ +└──────────────────┘ x > + > + > + closed-source > open-source + > + > +``` + +Notes: + +- diagram is simplified in the less-important places +- directed arrows are strict and mean that connections in the reverse direction are forbidden + +This split is quite complex and this RFC proposes several smaller steps to achieve the larger goal: + +1. Start by refactoring the console code, the goal is to have console and control-plane code in the different directories without dependencies on each other. +2. Do similar refactoring for tables in the console database, remove queries selecting data from both console and control-plane; move control-plane tables to a separate database. +3. Implement control-plane HTTP API serving on a separate TCP port; make all console→control-plane calls to go through that HTTP API. +4. Move control-plane source code to the neon repo; start control-plane as a separate service. + +## Motivation + +These are the two most important problems we want to solve: + +- Publish open-source implementation of all our cloud/storage features +- Make a unified control-plane that is used in all cloud (serverless) and local (tests) setups + +Right now we have some closed-source code in the cloud repo. 
That code contains implementation for running Neon computes in k8s and without that code it’s impossible to automatically scale PostgreSQL computes. That means that we don’t have an open-source serverless PostgreSQL at the moment. + +After splitting and open-sourcing control-plane service we will have source code and Docker images for all storage services. That control-plane service should have HTTP API for creating and managing tenants (including all our storage features), while proxy will listen for incoming connections and create computes on-demand. + +Improving our test suite is an important task, but requires a lot of prerequisites and may require a separate RFC. Possible implementation of that is described in the section [Next steps](#next-steps). + +Another piece of motivation can be a better involvement of storage development team into a control-plane. By splitting control-plane from the console, it can be more convenient to test and develop control-plane with paying less attention to “business” features, such as user management, billing and analytics. + +For example, console currently requires authentication providers such as GitHub OAuth to work at all, as well as nodejs to be able to build it locally. It will be more convenient to build and run it locally without these requirements. + +## Proposed implementation + +### Current state of things + +Let’s start with defining the current state of things at the moment of this proposal. We have three repositories containing source code: + +- open-source `postgres` — our fork of postgres +- open-source `neon` — our main repository for storage source code +- closed-source `cloud` — mostly console backend and UI frontend + +This proposal aims not to change anything in the existing code in `neon` and `postgres` repositories, but to create control-plane service and move its source code from `cloud` to the `neon` repository.
That means that we need to split code in `cloud` repo only, and will consider only this repository for exploring its source code. + +Let’s look at the miscellaneous things in the `cloud` repo which are NOT part of the console application, i.e. NOT the Go source code that is compiled to the `./console` binary. There we have: + +- command-line tools, such as cloudbench, neonadmin +- markdown documentation +- cloud operations scripts (helm, terraform, ansible) +- configs and other things +- e2e python tests +- incidents playbooks +- UI frontend +- Make build scripts, code generation scripts +- database migrations +- swagger definitions + +And also let’s take a look at what we have in the console source code, which is the service we’d like to split: + +- API Servers + - Public API v2 + - Management API v2 + - Public API v1 + - Admin API v1 (same port as Public API v1) + - Management API v1 +- Workers + - Monitor Compute Activity + - Watch Failed Operations + - Availability Checker + - Business Metrics Collector +- Internal Services + - Auth Middleware, UserIsAdmin, Cookies + - Cable Websocket Server + - Admin Services + - Global Settings, Operations, Pageservers, Platforms, Projects, Safekeepers, Users + - Authenticate Proxy + - API Keys + - App Controller, serving UI HTML + - Auth Controller + - Branches + - Projects + - Psql Connect + Passwordless login + - Users + - Cloud Metrics + - User Metrics + - Invites + - Pageserver/Safekeeper management + - Operations, k8s/docker/common logic + - Platforms, Regions + - Project State + - Projects Roles, SCRAM + - Global Settings +- Other things + - segment analytics integration + - sentry integration + - other common utilities packages + +### Drawing the splitting line + +The most challenging and the most important thing is to define the line that will split new control-plane service from the existing cloud service. If we don’t get it right, then we can end up with having a lot more issues without many benefits. 
+ +We propose to define that line as follows: + +- everything user-related stays in the console service +- everything storage-related should be in the control-plane service +- something that falls in between should be decided where to go, but most likely should stay in the console service +- some similar parts should be in both services, such as admin/management/db_migrations + +We call user-related all requests that can be connected to some user. The general idea is don’t have any user_id in the control-plane service and operate exclusively on tenant_id+timeline_id, the same way as existing storage services work now (compute, safekeeper, pageserver). + +Storage-related things can be defined as doing any of the following: + +- using k8s API +- doing requests to any of the storage services (proxy, compute, safekeeper, pageserver, etc..) +- tracking current status of tenants/timelines, managing lifetime of computes + +Based on that idea, we can say that new control-plane service should have the following components: + +- single HTTP API for everything + - Create and manage tenants and timelines + - Manage global settings and storage configuration (regions, platforms, safekeepers, pageservers) + - Admin API for storage health inspection and debugging +- Workers + - Monitor Compute Activity + - Watch Failed Operations + - Availability Checker +- Internal Services + - Admin Services + - Global Settings, Operations, Pageservers, Platforms, Tenants, Safekeepers + - Authenticate Proxy + - Branches + - Psql Connect + - Cloud Metrics + - Pageserver/Safekeeper management + - Operations, k8s/docker/common logic + - Platforms, Regions + - Tenant State + - Compute Roles, SCRAM + - Global Settings + +--- + +And other components should probably stay in the console service: + +- API Servers (no changes here) + - Public API v2 + - Management API v2 + - Public API v1 + - Admin API v1 (same port as Public API v1) + - Management API v1 +- Workers + - Business Metrics Collector +- 
Internal Services + - Auth Middleware, UserIsAdmin, Cookies + - Cable Websocket Server + - Admin Services + - Users admin stays the same + - Other admin services can redirect requests to the control-plane + - API Keys + - App Controller, serving UI HTML + - Auth Controller + - Projects + - User Metrics + - Invites + - Users + - Passwordless login +- Other things + - segment analytics integration + - sentry integration + - other common utilities packages + +There are also miscellaneous things that are useful for all kinds of services. So we can say that these things can be in both services: + +- markdown documentation +- e2e python tests +- make build scripts, code generation scripts +- database migrations +- swagger definitions + +The single entrypoint to the storage should be control-plane API. After we define that API, we can have code-generated implementation for the client and for the server. The general idea is to move code implementing storage components from the console to the API implementation inside the new control-plane service. 
+ +After the code is moved to the new service, we can fill the created void by making API calls to the new service: + +- authorization of the client +- mapping user_id + project_id to the tenant_id +- calling the control-plane API + +### control-plane API + +Currently we have the following projects API in the console: + +``` +GET /projects/{project_id} +PATCH /projects/{project_id} +POST /projects/{project_id}/branches +GET /projects/{project_id}/databases +POST /projects/{project_id}/databases +GET /projects/{project_id}/databases/{database_id} +PUT /projects/{project_id}/databases/{database_id} +DELETE /projects/{project_id}/databases/{database_id} +POST /projects/{project_id}/delete +GET /projects/{project_id}/issue_token +GET /projects/{project_id}/operations +GET /projects/{project_id}/operations/{operation_id} +POST /projects/{project_id}/query +GET /projects/{project_id}/roles +POST /projects/{project_id}/roles +GET /projects/{project_id}/roles/{role_name} +DELETE /projects/{project_id}/roles/{role_name} +POST /projects/{project_id}/roles/{role_name}/reset_password +POST /projects/{project_id}/start +POST /projects/{project_id}/stop +POST /psql_session/{psql_session_id} +``` + +It looks fine and we probably already have clients relying on it. So we should not change it, at least for now. 
But most of these endpoints (if not all) are related to storage, and it can suggest us what control-plane API should look like: + +``` +GET /tenants/{tenant_id} +PATCH /tenants/{tenant_id} +POST /tenants/{tenant_id}/branches +GET /tenants/{tenant_id}/databases +POST /tenants/{tenant_id}/databases +GET /tenants/{tenant_id}/databases/{database_id} +PUT /tenants/{tenant_id}/databases/{database_id} +DELETE /tenants/{tenant_id}/databases/{database_id} +POST /tenants/{tenant_id}/delete +GET /tenants/{tenant_id}/issue_token +GET /tenants/{tenant_id}/operations +GET /tenants/{tenant_id}/operations/{operation_id} +POST /tenants/{tenant_id}/query +GET /tenants/{tenant_id}/roles +POST /tenants/{tenant_id}/roles +GET /tenants/{tenant_id}/roles/{role_name} +DELETE /tenants/{tenant_id}/roles/{role_name} +POST /tenants/{tenant_id}/roles/{role_name}/reset_password +POST /tenants/{tenant_id}/start +POST /tenants/{tenant_id}/stop +POST /psql_session/{psql_session_id} +``` + +One of the options here is to use gRPC instead of the HTTP, which has some useful features, but there are some strong points towards using plain HTTP: + +- HTTP API is easier to use for the clients +- we already have HTTP API in pageserver/safekeeper/console +- we probably want control-plane API to be similar to the console API, available in the cloud + +### Getting updates from the storage + +There can be some valid cases, when we would like to know what is changed in the storage. For example, console might want to know when user has queried and started compute and when compute was scaled to zero after that, to know how much user should pay for the service. Another example is to get info about reaching the disk space limits. Yet another example is to do analytics, such as how many users had at least one active project in a month. + +All of the above cases can happen without using the console, just by accessing compute through the proxy. 
+ +To solve this, we can have a log of events occurring in the storage (event logs). That is very similar to operations table we have right now, the only difference is that events are immutable and we cannot change them after saving to the database. For example, we might want to have events for the following activities: + +- We finished processing some HTTP API query, such as resetting the password +- We changed some state, such as started or stopped a compute +- Operation is created +- Operation is started for the first time +- Operation is failed for the first time +- Operation is finished + +Once we save these events to the database, we can create HTTP API to subscribe to these events. That API can look like this: + +``` +GET /events/ + +{ + "events": [...], + "next_cursor": 123 +} +``` + +It should be possible to replay event logs from some point of time, to get a state of almost anything from the storage services. That means that if we maintain some state in the control-plane database and we have a reason to have the same state in the console database, it is possible by polling events from the control-plane API and changing the state in the console database according to the events. + +### Next steps + +After implementing control-plane HTTP API and starting control-plane as a separate service, we might want to think of exploiting benefits of the new architecture, such as reorganizing test infrastructure. Possible options are listed in the [Next steps](#next-steps-1). + +## Non Goals + +RFC doesn’t cover the actual cloud deployment scripts and schemas, such as terraform, ansible, k8s yaml’s and so on. + +## Impacted components + +Mostly console, but can also affect some storage service. + +## Scalability + +We should support starting several instances of the new control-plane service at the same time. + +At the same time, it should be possible to use only single instance of control-plane, which can be useful for local tests. 
+ +## Security implications + +New control-plane service is an internal service, so no external requests can reach it. But at the same time, it contains API to do absolutely anything with any of the tenants. That means that bad internal actor can potentially read and write all of the tenants. To make this safer, we can have one of these: + +- Simple option is to protect all requests with a single private key, so that no one can make requests without having that one key. +- Another option is to have a separate token for every tenant and store these tokens in another secure place. This way it’s harder to access all tenants at once, because they have different tokens. + +## Alternative implementation + +There was an idea to create a k8s operator for managing storage services and computes, but author of this RFC is not really familiar with it. + +Regarding less alternative ideas, there are other options for the name of the new control-plane service: + +- storage-ctl +- cloud +- cloud-ctl + +## Pros/cons of proposed approaches (TODO) + +Pros: + +- All storage features are completely open-source +- Better tests coverage, less difference between cloud and local setups +- Easier to develop storage and cloud features, because there is no need to setup console for that +- Easier to deploy storage-only services to any cloud + +Cons: + +- All storage features are completely open-source +- Distributed services mean more code to connect different services and potential network issues +- Console needs to have a dependency on storage API, there can be complications with developing new feature in a branch +- More code to JOIN data from different services (console and control-plane) + +## Definition of Done + +We have a new control-plane service running in the k8s. Source code for that control-plane service is located in the open-source neon repo. + +## Next steps + +After we’ve reached DoD, we can make further improvements.
+ +First thing that can benefit from the split is local testing. The same control-plane service can implement starting computes as a local processes instead of k8s deployments. If it will also support starting pageservers/safekeepers/proxy for the local setup, then it can completely replace `./neon_local` binary, which is currently used for testing. The local testing environment can look like this: + +``` +┌─────────────────────┐ ┌───────────────────────┐ +│ │ │ Storage (local) │ +│ control-plane db │ │ │ +│ (local process) │ │ - safekeepers │ +│ │ │ - pageservers │ +└──────────▲──────────┘ │ │ + │ │ Dependencies │ +┌──────────┴──────────┐ │ │ +│ │ │ - etcd │ +│ control-plane ├────►│ - S3 │ +│ (local process) │ │ - more? │ +│ │ │ │ +└──────────┬──────────┘ └───────────────────────┘ + ▲ │ ▲ + │ │ │ + │ │ ┌───────────┴───────────┐ + │ │ │ │ + │ └───────────────►│ computes │ + │ │ (local processes) │ + │ │ │ +┌──────┴──────────────┐ └───────────────────────┘ +│ │ ▲ +│ proxy │ │ +│ (local process) ├─────────────────┘ +│ │ +└─────────────────────┘ +``` + +The key thing here is that control-plane local service have the same API and almost the same implementation as the one deployed in the k8s. This allows to run the same e2e tests against both cloud and local setups. + +For the python test_runner tests everything can stay mostly the same. To do that, we just need to replace `./neon_local` cli commands with API calls to the control-plane. + +The benefit here will be in having fast local tests that are really close to our cloud setup. Bugs in k8s queries are still cannot be found when running computes as a local processes, but it should be really easy to start k8s locally (for example in k3s) and run the same tests with control-plane connected to the local k8s. + +Talking about console and UI tests, after the split there should be a way to test these without spinning up all the storage locally. New control-plane service has a well-defined API, allowing us to mock it. 
This way we can create UI tests to verify the right calls are issued after specific UI interactions and verify that we render correct messages when API returns errors. \ No newline at end of file diff --git a/docs/rfcs/018-storage-messaging-2.md b/docs/rfcs/018-storage-messaging-2.md index 364f62dd2e..2419dd5fc5 100644 --- a/docs/rfcs/018-storage-messaging-2.md +++ b/docs/rfcs/018-storage-messaging-2.md @@ -78,7 +78,7 @@ with grpc streams and tokio mpsc channels. The implementation description is at It is just 500 lines of code and core functionality is complete. 1-1 pub sub gives about 120k received messages per second; having multiple subscribers in -different connecitons quickly scales to 1 million received messages per second. +different connections quickly scales to 1 million received messages per second. I had concerns about many concurrent streams in singe connection, but 2^20 subscribers still work (though eat memory, with 10 publishers 20GB are consumed; in this implementation each publisher holds full copy of all subscribers). There @@ -95,12 +95,12 @@ other members, with best-effort this is simple. ### Security implications Communication happens in a private network that is not exposed to users; -additionaly we can add auth to the broker. +additionally we can add auth to the broker. ## Alternative: get existing pub-sub We could take some existing pub sub solution, e.g. RabbitMQ, Redis. But in this -case IMV simplicity of our own outweights external dependency costs (RabbitMQ is +case IMV simplicity of our own outweighs external dependency costs (RabbitMQ is much more complicated and needs VM; Redis Rust client maintenance is not ideal...). Also note that projects like CockroachDB and TiDB are based on gRPC as well. 
diff --git a/docs/rfcs/019-tenant-timeline-lifecycles.md b/docs/rfcs/019-tenant-timeline-lifecycles.md index 2734bf17b9..558b5335e7 100644 --- a/docs/rfcs/019-tenant-timeline-lifecycles.md +++ b/docs/rfcs/019-tenant-timeline-lifecycles.md @@ -74,7 +74,7 @@ TenantMaintenanceGuard: Like ActiveTenantGuard, but can be held even when the tenant is not in Active state. Used for operations like attach/detach. Perhaps allow only one such guard on a Tenant at a time. -Similarly for Timelines. We don't currentl have a "state" on Timeline, but I think +Similarly for Timelines. We don't currently have a "state" on Timeline, but I think we need at least two states: Active and Stopping. The Stopping state is used at deletion, to prevent new TimelineActiveGuards from appearing, while you wait for existing TimelineActiveGuards to die out. @@ -85,7 +85,7 @@ have a TenantActiveGuard, and the tenant's state changes from Active to Stopping, the is_shutdown_requested() function should return true, and shutdown_watcher() future should return. -This signaling doesn't neessarily need to cover all cases. For example, if you +This signaling doesn't necessarily need to cover all cases. For example, if you have a block of code in spawn_blocking(), it might be acceptable if is_shutdown_requested() doesn't return true even though the tenant is in Stopping state, as long as the code finishes reasonably fast. diff --git a/docs/rfcs/020-pageserver-s3-coordination.md b/docs/rfcs/020-pageserver-s3-coordination.md index 5e2912ba99..90ba3a6f4d 100644 --- a/docs/rfcs/020-pageserver-s3-coordination.md +++ b/docs/rfcs/020-pageserver-s3-coordination.md @@ -37,7 +37,7 @@ sequenceDiagram ``` At this point it is not possible to restore from index, it contains L2 which -is no longer available in s3 and doesnt contain L3 added by compaction by the +is no longer available in s3 and doesn't contain L3 added by compaction by the first pageserver. 
So if any of the pageservers restart initial sync will fail (or in on-demand world it will fail a bit later during page request from missing layer) @@ -74,7 +74,7 @@ One possible solution for relocation case is to orchestrate background jobs from outside. The oracle who runs migration can turn off background jobs on PS1 before migration and then run migration -> enable them on PS2. The problem comes if migration fails. In this case in order to resume background jobs -oracle needs to guarantee that PS2 doesnt run background jobs and if it doesnt +oracle needs to guarantee that PS2 doesn't run background jobs and if it doesn't respond then PS1 is stuck unable to run compaction/gc. This cannot be solved without human ensuring that no upload from PS2 can happen. In order to be able to resolve this automatically CAS is required on S3 side so pageserver can @@ -128,7 +128,7 @@ During discussion it seems that we converged on the approach consisting of: whether we need to apply change to the index state or not. - Responsibility for running background jobs is assigned externally. Pageserver keeps locally persistent flag for each tenant that indicates whether this - pageserver is considered as primary one or not. TODO what happends if we + pageserver is considered as primary one or not. TODO what happens if we crash and cannot start for some extended period of time? Control plane can assign ownership to some other pageserver. Pageserver needs some way to check if its still the blessed one. 
Maybe by explicit request to control plane on @@ -138,7 +138,7 @@ Requirement for deterministic layer generation was considered overly strict because of two reasons: - It can limit possible optimizations e g when pageserver wants to reshuffle - some data locally and doesnt want to coordinate this + some data locally and doesn't want to coordinate this - The deterministic algorithm itself can change so during deployments for some time there will be two different version running at the same time which can cause non determinism @@ -164,7 +164,7 @@ sequenceDiagram CP->>PS1: Yes deactivate CP PS1->>S3: Fetch PS1 index. - note over PS1: Continue operations, start backround jobs + note over PS1: Continue operations, start background jobs note over PS1,PS2: PS1 starts up and still and is not a leader anymore PS1->>CP: Am I still the leader for Tenant X? CP->>PS1: No @@ -203,7 +203,7 @@ sequenceDiagram ### Eviction When two pageservers operate on a tenant for extended period of time follower -doesnt perform write operations in s3. When layer is evicted follower relies +doesn't perform write operations in s3. When layer is evicted follower relies on updates from primary to get info about layers it needs to cover range for evicted layer. diff --git a/docs/rfcs/022-pageserver-delete-from-s3.md b/docs/rfcs/022-pageserver-delete-from-s3.md index 260e549670..c237a3edb8 100644 --- a/docs/rfcs/022-pageserver-delete-from-s3.md +++ b/docs/rfcs/022-pageserver-delete-from-s3.md @@ -4,7 +4,7 @@ Created on 08.03.23 ## Motivation -Currently we dont delete pageserver part of the data from s3 when project is deleted. (The same is true for safekeepers, but this outside of the scope of this RFC). +Currently we don't delete pageserver part of the data from s3 when project is deleted. (The same is true for safekeepers, but this outside of the scope of this RFC). 
This RFC aims to spin a discussion to come to a robust deletion solution that wont put us in into a corner for features like postponed deletion (when we keep data for user to be able to restore a project if it was deleted by accident) @@ -75,9 +75,9 @@ Remote one is needed for cases when pageserver is lost during deletion so other Why local mark file is needed? -If we dont have one, we have two choices, delete local data before deleting the remote part or do that after. +If we don't have one, we have two choices, delete local data before deleting the remote part or do that after. -If we delete local data before remote then during restart pageserver wont pick up remote tenant at all because nothing is available locally (pageserver looks for remote conuterparts of locally available tenants). +If we delete local data before remote then during restart pageserver wont pick up remote tenant at all because nothing is available locally (pageserver looks for remote counterparts of locally available tenants). If we delete local data after remote then at the end of the sequence when remote mark file is deleted if pageserver restart happens then the state is the same to situation when pageserver just missing data on remote without knowing the fact that this data is intended to be deleted. In this case the current behavior is upload everything local-only to remote. @@ -145,7 +145,7 @@ sequenceDiagram CP->>PS: Retry delete tenant PS->>CP: Not modified else Mark is missing - note over PS: Continue to operate the tenant as if deletion didnt happen + note over PS: Continue to operate the tenant as if deletion didn't happen note over CP: Eventually console should
retry delete request @@ -168,7 +168,7 @@ sequenceDiagram PS->>CP: True ``` -Similar sequence applies when both local and remote marks were persisted but Control Plane still didnt receive a response. +Similar sequence applies when both local and remote marks were persisted but Control Plane still didn't receive a response. If pageserver crashes after both mark files were deleted then it will reply to control plane status poll request with 404 which should be treated by control plane as success. @@ -187,7 +187,7 @@ If pageseserver is lost then the deleted tenant should be attached to different ##### Restrictions for tenant that is in progress of being deleted -I propose to add another state to tenant/timeline - PendingDelete. This state shouldnt allow executing any operations aside from polling the deletion status. +I propose to add another state to tenant/timeline - PendingDelete. This state shouldn't allow executing any operations aside from polling the deletion status. #### Summary @@ -237,7 +237,7 @@ New branch gets created PS1 starts up (is it possible or we just recycle it?) PS1 is unaware of the new branch. It can either fall back to s3 ls, or ask control plane. -So here comes the dependency of storage on control plane. During restart storage needs to know which timelines are valid for operation. If there is nothing on s3 that can answer that question storage neeeds to ask control plane. +So here comes the dependency of storage on control plane. During restart storage needs to know which timelines are valid for operation. If there is nothing on s3 that can answer that question storage needs to ask control plane. ### Summary @@ -250,7 +250,7 @@ Cons: Pros: -- Easier to reason about if you dont have to account for pageserver restarts +- Easier to reason about if you don't have to account for pageserver restarts ### Extra notes @@ -262,7 +262,7 @@ Delayed deletion can be done with both approaches. 
As discussed with Anna (@step After discussion in comments I see that we settled on two options (though a bit different from ones described in rfc). First one is the same - pageserver owns as much as possible. The second option is that pageserver owns markers thing, but actual deletion happens in control plane by repeatedly calling ls + delete. -To my mind the only benefit of the latter approach is possible code reuse between safekeepers and pageservers. Otherwise poking around integrating s3 library into control plane, configuring shared knowledge abouth paths in s3 - are the downsides. Another downside of relying on control plane is the testing process. Control plane resides in different repository so it is quite hard to test pageserver related changes there. e2e test suite there doesnt support shutting down pageservers, which are separate docker containers there instead of just processes. +To my mind the only benefit of the latter approach is possible code reuse between safekeepers and pageservers. Otherwise poking around integrating s3 library into control plane, configuring shared knowledge about paths in s3 - are the downsides. Another downside of relying on control plane is the testing process. Control plane resides in different repository so it is quite hard to test pageserver related changes there. e2e test suite there doesn't support shutting down pageservers, which are separate docker containers there instead of just processes. With pageserver owning everything we still give the retry logic to control plane but its easier to duplicate if needed compared to sharing inner s3 workings. We will have needed tests for retry logic in neon repo. 
diff --git a/docs/rfcs/023-the-state-of-pageserver-tenant-relocation.md b/docs/rfcs/023-the-state-of-pageserver-tenant-relocation.md index 836c91fb25..97e62bf8c6 100644 --- a/docs/rfcs/023-the-state-of-pageserver-tenant-relocation.md +++ b/docs/rfcs/023-the-state-of-pageserver-tenant-relocation.md @@ -75,7 +75,7 @@ sequenceDiagram ``` At this point it is not possible to restore the state from index, it contains L2 which -is no longer available in s3 and doesnt contain L3 added by compaction by the +is no longer available in s3 and doesn't contain L3 added by compaction by the first pageserver. So if any of the pageservers restart, initial sync will fail (or in on-demand world it will fail a bit later during page request from missing layer) @@ -171,7 +171,7 @@ sequenceDiagram Another problem is a possibility of concurrent branch creation calls. -I e during migration create_branch can be called on old pageserver and newly created branch wont be seen on new pageserver. Prior art includes prototyping an approach of trying to mirror such branches, but currently it lost its importance, because now attach is fast because we dont need to download all data, and additionally to the best of my knowledge of control plane internals (cc @ololobus to confirm) operations on one project are executed sequentially, so it is not possible to have such case. So branch create operation will be executed only when relocation is completed. As a safety measure we can forbid branch creation for tenants that are in readonly remote state. +I e during migration create_branch can be called on old pageserver and newly created branch wont be seen on new pageserver. 
Prior art includes prototyping an approach of trying to mirror such branches, but currently it lost its importance, because now attach is fast because we don't need to download all data, and additionally to the best of my knowledge of control plane internals (cc @ololobus to confirm) operations on one project are executed sequentially, so it is not possible to have such case. So branch create operation will be executed only when relocation is completed. As a safety measure we can forbid branch creation for tenants that are in readonly remote state. ## Simplistic approach diff --git a/docs/rfcs/024-extension-loading.md b/docs/rfcs/024-extension-loading.md index 26ba4f7927..7e243b23e3 100644 --- a/docs/rfcs/024-extension-loading.md +++ b/docs/rfcs/024-extension-loading.md @@ -55,7 +55,7 @@ When PostgreSQL requests a file, `compute_ctl` downloads it. PostgreSQL requests files in the following cases: - When loading a preload library set in `local_preload_libraries` - When explicitly loading a library with `LOAD` -- Wnen creating extension with `CREATE EXTENSION` (download sql scripts, (optional) extension data files and (optional) library files))) +- When creating extension with `CREATE EXTENSION` (download sql scripts, (optional) extension data files and (optional) library files))) #### Summary diff --git a/docs/rfcs/025-generation-numbers.md b/docs/rfcs/025-generation-numbers.md index 6a0131c66a..dfc8529d2d 100644 --- a/docs/rfcs/025-generation-numbers.md +++ b/docs/rfcs/025-generation-numbers.md @@ -26,7 +26,7 @@ plane guarantee prevents robust response to failures, as if a pageserver is unre we may not detach from it. The mechanism in this RFC fixes this, by making it safe to attach to a new, different pageserver even if an unresponsive pageserver may be running. 
-Futher, lack of safety during split-brain conditions blocks two important features where occasional +Further, lack of safety during split-brain conditions blocks two important features where occasional split-brain conditions are part of the design assumptions: - seamless tenant migration ([RFC PR](https://github.com/neondatabase/neon/pull/5029)) @@ -490,11 +490,11 @@ The above makes it safe for control plane to change the assignment of tenant to pageserver in control plane while a timeline creation is ongoing. The reason is that the creation request against the new assigned pageserver uses a new generation number. However, care must be taken by control plane -to ensure that a "timeline creation successul" response from some pageserver +to ensure that a "timeline creation successful" response from some pageserver is checked for the pageserver's generation for that timeline's tenant still being the latest. If it is not the latest, the response does not constitute a successful timeline creation. It is acceptable to discard such responses, the scrubber will clean up the S3 state. -It is better to issue a timelien deletion request to the stale attachment. +It is better to issue a timeline deletion request to the stale attachment. #### Timeline Deletion @@ -633,7 +633,7 @@ As outlined in the Part 1 on correctness, it is critical that deletions are only executed once the key is not referenced anywhere in S3. This property is obviously upheld by the scheme above. -#### We Accept Object Leakage In Acceptable Circumcstances +#### We Accept Object Leakage In Acceptable Circumstances If we crash in the flow above between (2) and (3), we lose track of unreferenced object. Further, enqueuing a single to the persistent queue may not be durable immediately to amortize cost of flush to disk.
diff --git a/docs/rfcs/026-pageserver-s3-mvcc.md b/docs/rfcs/026-pageserver-s3-mvcc.md index 2a8c925781..473d5a2bd0 100644 --- a/docs/rfcs/026-pageserver-s3-mvcc.md +++ b/docs/rfcs/026-pageserver-s3-mvcc.md @@ -162,7 +162,7 @@ struct Tenant { ... txns: HashMap, - // the most recently started txn's id; only most recently sarted can win + // the most recently started txn's id; only most recently started can win next_winner_txn: Option, } struct Transaction { @@ -186,7 +186,7 @@ A transaction T in state Committed has subsequent transactions that may or may n So, for garbage collection, we need to assess transactions in state Committed and RejectAcknowledged: -- Commited: delete objects on the deadlist. +- Committed: delete objects on the deadlist. - We don’t need a LIST request here, the deadlist is sufficient. So, it’s really cheap. - This is **not true MVCC garbage collection**; by deleting the objects on Committed transaction T ’s deadlist, we might delete data referenced by other transactions that were concurrent with T, i.e., they started while T was still open. However, the fact that T is committed means that the other transactions are RejectPending or RejectAcknowledged, so, they don’t matter. Pageservers executing these doomed RejectPending transactions must handle 404 for GETs gracefully, e.g., by trying to commit txn so they observe the rejection they’re destined to get anyways. 404’s for RejectAcknowledged is handled below. - RejectAcknowledged: delete all objects created in that txn, and discard deadlists. @@ -242,15 +242,15 @@ If a pageserver is unresponsive from Control Plane’s / Compute’s perspective At this point, availability is restored and user pain relieved. -What’s left is to somehow close the doomed transaction of the unresponsive pageserver, so that it beomes RejectAcknowledged, and GC can make progress. 
Since S3 is cheap, we can afford to wait a really long time here, especially if we put a soft bound on the amount of data a transaction may produce before it must commit. Procedure: +What’s left is to somehow close the doomed transaction of the unresponsive pageserver, so that it becomes RejectAcknowledged, and GC can make progress. Since S3 is cheap, we can afford to wait a really long time here, especially if we put a soft bound on the amount of data a transaction may produce before it must commit. Procedure: 1. Ensure the unresponsive pageserver is taken out of rotation for new attachments. That probably should happen as part of the routine above. 2. Make a human operator investigate decide what to do (next morning, NO ONCALL ALERT): 1. Inspect the instance, investigate logs, understand root cause. 2. Try to re-establish connectivity between pageserver and Control Plane so that pageserver can retry commits, get rejected, ack rejection ⇒ enable GC. - 3. Use below procedure to decomission pageserver. + 3. Use below procedure to decommission pageserver. -### Decomissioning A Pageserver (Dead or Alive-but-Unrespsonive) +### Decommissioning A Pageserver (Dead or Alive-but-Unresponsive) The solution, enabled by this proposal: @@ -310,7 +310,7 @@ Issues that we discussed: 1. In abstract terms, this proposal provides a linearized history for a given S3 prefix. 2. In concrete terms, this proposal provides a linearized history per tenant. 3. There can be multiple writers at a given time, but only one of them will win to become part of the linearized history. -4. ************************************************************************************Alternative ideas mentioned during meetings that should be turned into a written prospoal like this one:************************************************************************************ +4. 
************************************************************************************Alternative ideas mentioned during meetings that should be turned into a written proposal like this one:************************************************************************************ 1. @Dmitry Rodionov : having linearized storage of index_part.json in some database that allows serializable transactions / atomic compare-and-swap PUT 2. @Dmitry Rodionov : 3. @Stas : something like this scheme, but somehow find a way to equate attachment duration with transaction duration, without losing work if pageserver dies months after attachment. diff --git a/docs/rfcs/027-crash-consistent-layer-map-through-index-part.md b/docs/rfcs/027-crash-consistent-layer-map-through-index-part.md index 2c6b46eabe..e18b7c16c9 100644 --- a/docs/rfcs/027-crash-consistent-layer-map-through-index-part.md +++ b/docs/rfcs/027-crash-consistent-layer-map-through-index-part.md @@ -54,7 +54,7 @@ If the compaction algorithm doesn't change between the two compaction runs, is d *However*: 1. the file size of the overwritten L1s may not be identical, and 2. the bit pattern of the overwritten L1s may not be identical, and, -3. in the future, we may want to make the compaction code non-determinstic, influenced by past access patterns, or otherwise change it, resulting in L1 overwrites with a different set of delta records than before the overwrite +3. in the future, we may want to make the compaction code non-deterministic, influenced by past access patterns, or otherwise change it, resulting in L1 overwrites with a different set of delta records than before the overwrite The items above are a problem for the [split-brain protection RFC](https://github.com/neondatabase/neon/pull/4919) because it assumes that layer files in S3 are only ever deleted, but never replaced (overPUTted). 
@@ -63,7 +63,7 @@ But node B based its world view on the version of node A's `index_part.json` fro That earlier `index_part.json`` contained the file size of the pre-overwrite L1. If the overwritten L1 has a different file size, node B will refuse to read data from the overwritten L1. Effectively, the data in the L1 has become inaccessible to node B. -If node B already uploaded an index part itself, all subsequent attachments will use node B's index part, and run into the same probem. +If node B already uploaded an index part itself, all subsequent attachments will use node B's index part, and run into the same problem. If we ever introduce checksums instead of checking just the file size, then a mismatching bit pattern (2) will cause similar problems. @@ -121,7 +121,7 @@ Multi-object changes that previously created and removed files in timeline dir a * atomic `index_part.json` update in S3, as per guarantee that S3 PUT is atomic * local timeline dir state: * irrelevant for layer map content => irrelevant for atomic updates / crash consistency - * if we crash after index part PUT, local layer files will be used, so, no on-demand downloads neede for them + * if we crash after index part PUT, local layer files will be used, so, no on-demand downloads needed for them * if we crash before index part PUT, local layer files will be deleted ## Trade-Offs @@ -140,7 +140,7 @@ Assuming upload queue allows for unlimited queue depth (that's what it does toda * wal ingest: currently unbounded * L0 => L1 compaction: CPU time proportional to `O(sum(L0 size))` and upload work proportional to `O()` * Compaction threshold is 10 L0s and each L0 can be up to 256M in size. Target size for L1 is 128M. - * In practive, most L0s are tiny due to 10minute `DEFAULT_CHECKPOINT_TIMEOUT`. + * In practice, most L0s are tiny due to 10minute `DEFAULT_CHECKPOINT_TIMEOUT`. 
* image layer generation: CPU time `O(sum(input data))` + upload work `O(sum(new image layer size))` * I have no intuition how expensive / long-running it is in reality. * gc: `update_gc_info`` work (not substantial, AFAIK) @@ -158,7 +158,7 @@ Pageserver crashes are very rare ; it would likely be acceptable to re-do the lo However, regular pageserver restart happen frequently, e.g., during weekly deploys. In general, pageserver restart faces the problem of tenants that "take too long" to shut down. -They are a problem because other tenants that shut down quickly are unavailble while we wait for the slow tenants to shut down. +They are a problem because other tenants that shut down quickly are unavailable while we wait for the slow tenants to shut down. We currently allot 10 seconds for graceful shutdown until we SIGKILL the pageserver process (as per `pageserver.service` unit file). A longer budget would expose tenants that are done early to a longer downtime. A short budget would risk throwing away more work that'd have to be re-done after restart. @@ -236,7 +236,7 @@ tenants/$tenant/timelines/$timeline/$key_and_lsn_range tenants/$tenant/timelines/$timeline/$layer_file_id-$key_and_lsn_range ``` -To guarantee uniqueness, the unqiue number is a sequence number, stored in `index_part.json`. +To guarantee uniqueness, the unique number is a sequence number, stored in `index_part.json`. This alternative does not solve atomic layer map updates. In our crash-during-compaction scenario above, the compaction run after the crash will not overwrite the L1s, but write/PUT new files with new sequence numbers. @@ -246,11 +246,11 @@ We'd need to write a deduplication pass that checks if perfectly overlapping lay However, this alternative is appealing because it systematically prevents overwrites at a lower level than this RFC. So, this alternative is sufficient for the needs of the split-brain safety RFC (immutable layer files locally and in S3). 
-But it doesn't solve the problems with crash-during-compaction outlined earlier in this RFC, and in fact, makes it much more accute. +But it doesn't solve the problems with crash-during-compaction outlined earlier in this RFC, and in fact, makes it much more acute. The proposed design in this RFC addresses both. So, if this alternative sounds appealing, we should implement the proposal in this RFC first, then implement this alternative on top. -That way, we avoid a phase where the crash-during-compaction problem is accute. +That way, we avoid a phase where the crash-during-compaction problem is acute. ## Related issues diff --git a/docs/rfcs/028-pageserver-migration.md b/docs/rfcs/028-pageserver-migration.md index f708f641aa..17ef9aef52 100644 --- a/docs/rfcs/028-pageserver-migration.md +++ b/docs/rfcs/028-pageserver-migration.md @@ -596,4 +596,4 @@ pageservers are updated to be aware of it. As well as simplifying implementation, putting heatmaps in S3 will be useful for future analytics purposes -- gathering aggregated statistics on activity -pattersn across many tenants may be done directly from data in S3. +patterns across many tenants may be done directly from data in S3. diff --git a/docs/rfcs/029-pageserver-wal-disaster-recovery.md b/docs/rfcs/029-pageserver-wal-disaster-recovery.md index 15ebd72bfe..229e40100e 100644 --- a/docs/rfcs/029-pageserver-wal-disaster-recovery.md +++ b/docs/rfcs/029-pageserver-wal-disaster-recovery.md @@ -147,7 +147,7 @@ Separating corrupt writes from non-corrupt ones is a hard problem in general, and if the application was involved in making the corrupt write, a recovery would also involve the application. Therefore, corruption that has made it into the WAL is outside of the scope of this feature. However, the WAL replay can be -issued to right before the point in time where the corruption occured. Then the +issued to right before the point in time where the corruption occurred. 
Then the data loss is isolated to post-corruption writes only. ## Impacted components (e.g. pageserver, safekeeper, console, etc) @@ -161,7 +161,7 @@ limits and billing we apply to existing timelines. ## Proposed implementation -The first problem to keep in mind is the reproducability of `initdb`. +The first problem to keep in mind is the reproducibility of `initdb`. So an initial step would be to upload `initdb` snapshots to S3. After that, we'd have the endpoint spawn a background process which diff --git a/docs/rfcs/030-vectored-timeline-get.md b/docs/rfcs/030-vectored-timeline-get.md index d4017471b7..093a964f38 100644 --- a/docs/rfcs/030-vectored-timeline-get.md +++ b/docs/rfcs/030-vectored-timeline-get.md @@ -69,7 +69,7 @@ However, unlike above, an ideal solution will * This means, read each `DiskBtree` page at most once. * Facilitate merging of the reads we issue to the OS and eventually NVMe. -Each of these items above represents a signficant amount of work. +Each of these items above represents a significant amount of work. ## Performance diff --git a/docs/synthetic-size.md b/docs/synthetic-size.md index 407d7b525a..3acb4e18cb 100644 --- a/docs/synthetic-size.md +++ b/docs/synthetic-size.md @@ -21,7 +21,7 @@ implementation where we keep more data than we would need to, do not change the synthetic size or incur any costs to the user. The synthetic size is calculated for the whole project. It is not -straighforward to attribute size to individual branches. See "What is +straightforward to attribute size to individual branches. See "What is the size of an individual branch?" for discussion on those difficulties. @@ -248,7 +248,7 @@ and truncate the WAL. Synthetic size is calculated for the whole project, and includes all branches. There is no such thing as the size of a branch, because it -is not straighforward to attribute the parts of size to individual +is not straightforward to attribute the parts of size to individual branches. 
## Example: attributing size to branches diff --git a/libs/compute_api/src/responses.rs b/libs/compute_api/src/responses.rs index 92bbf79cd4..fd0c90d447 100644 --- a/libs/compute_api/src/responses.rs +++ b/libs/compute_api/src/responses.rs @@ -52,6 +52,10 @@ pub enum ComputeStatus { // compute will exit soon or is waiting for // control-plane to terminate it. Failed, + // Termination requested + TerminationPending, + // Terminated Postgres + Terminated, } fn rfc3339_serialize(x: &Option>, s: S) -> Result diff --git a/libs/compute_api/src/spec.rs b/libs/compute_api/src/spec.rs index 5361d14004..71ae66c45c 100644 --- a/libs/compute_api/src/spec.rs +++ b/libs/compute_api/src/spec.rs @@ -79,6 +79,12 @@ pub struct ComputeSpec { // Stripe size for pageserver sharding, in pages #[serde(default)] pub shard_stripe_size: Option, + + // When we are starting a new replica in hot standby mode, + // we need to know if the primary is running. + // This is used to determine if replica should wait for + // RUNNING_XACTS from primary or not. + pub primary_is_running: Option, } /// Feature flag to signal `compute_ctl` to enable certain experimental functionality. @@ -90,8 +96,8 @@ pub enum ComputeFeature { /// track short-lived connections as user activity. ActivityMonitorExperimental, - /// Enable running migrations - Migrations, + /// Pre-install and initialize anon extension for every database in the cluster + AnonExtension, /// This is a special feature flag that is used to represent unknown feature flags. /// Basically all unknown to enum flags are represented as this one. 
See unit test diff --git a/libs/desim/Cargo.toml b/libs/desim/Cargo.toml new file mode 100644 index 0000000000..6f442d8243 --- /dev/null +++ b/libs/desim/Cargo.toml @@ -0,0 +1,18 @@ +[package] +name = "desim" +version = "0.1.0" +edition.workspace = true +license.workspace = true + +[dependencies] +anyhow.workspace = true +rand.workspace = true +tracing.workspace = true +bytes.workspace = true +utils.workspace = true +parking_lot.workspace = true +hex.workspace = true +scopeguard.workspace = true +smallvec = { workspace = true, features = ["write"] } + +workspace_hack.workspace = true diff --git a/libs/desim/README.md b/libs/desim/README.md new file mode 100644 index 0000000000..80568ebb1b --- /dev/null +++ b/libs/desim/README.md @@ -0,0 +1,7 @@ +# Discrete Event SIMulator + +This is a library for running simulations of distributed systems. The main idea is borrowed from [FoundationDB](https://www.youtube.com/watch?v=4fFDFbi3toc). + +Each node runs as a separate thread. This library was not optimized for speed yet, but it's already much faster than running usual integration tests in real time, because it uses virtual simulation time and can fast-forward time to skip intervals where all nodes are doing nothing but sleeping or waiting for something. + +The original purpose for this library is to test walproposer and safekeeper implementation working together, in scenarios close to the real world environment. This simulator is deterministic and can inject failures in networking without waiting minutes of wall-time to trigger timeout, which makes it easier to find bugs in our consensus implementation compared to using integration tests.
diff --git a/libs/desim/src/chan.rs b/libs/desim/src/chan.rs new file mode 100644 index 0000000000..6661d59871 --- /dev/null +++ b/libs/desim/src/chan.rs @@ -0,0 +1,108 @@ +use std::{collections::VecDeque, sync::Arc}; + +use parking_lot::{Mutex, MutexGuard}; + +use crate::executor::{self, PollSome, Waker}; + +/// FIFO channel with blocking send and receive. Can be cloned and shared between threads. +/// Blocking functions should be used only from threads that are managed by the executor. +pub struct Chan { + shared: Arc>, +} + +impl Clone for Chan { + fn clone(&self) -> Self { + Chan { + shared: self.shared.clone(), + } + } +} + +impl Default for Chan { + fn default() -> Self { + Self::new() + } +} + +impl Chan { + pub fn new() -> Chan { + Chan { + shared: Arc::new(State { + queue: Mutex::new(VecDeque::new()), + waker: Waker::new(), + }), + } + } + + /// Get a message from the front of the queue, block if the queue is empty. + /// If not called from the executor thread, it can block forever. + pub fn recv(&self) -> T { + self.shared.recv() + } + + /// Panic if the queue is empty. + pub fn must_recv(&self) -> T { + self.shared + .try_recv() + .expect("message should've been ready") + } + + /// Get a message from the front of the queue, return None if the queue is empty. + /// Never blocks. + pub fn try_recv(&self) -> Option { + self.shared.try_recv() + } + + /// Send a message to the back of the queue. 
+ pub fn send(&self, t: T) { + self.shared.send(t); + } +} + +struct State { + queue: Mutex>, + waker: Waker, +} + +impl State { + fn send(&self, t: T) { + self.queue.lock().push_back(t); + self.waker.wake_all(); + } + + fn try_recv(&self) -> Option { + let mut q = self.queue.lock(); + q.pop_front() + } + + fn recv(&self) -> T { + // interrupt the receiver to prevent consuming everything at once + executor::yield_me(0); + + let mut queue = self.queue.lock(); + if let Some(t) = queue.pop_front() { + return t; + } + loop { + self.waker.wake_me_later(); + if let Some(t) = queue.pop_front() { + return t; + } + MutexGuard::unlocked(&mut queue, || { + executor::yield_me(-1); + }); + } + } +} + +impl PollSome for Chan { + /// Schedules a wakeup for the current thread. + fn wake_me(&self) { + self.shared.waker.wake_me_later(); + } + + /// Checks if chan has any pending messages. + fn has_some(&self) -> bool { + !self.shared.queue.lock().is_empty() + } +} diff --git a/libs/desim/src/executor.rs b/libs/desim/src/executor.rs new file mode 100644 index 0000000000..9d44bd7741 --- /dev/null +++ b/libs/desim/src/executor.rs @@ -0,0 +1,483 @@ +use std::{ + panic::AssertUnwindSafe, + sync::{ + atomic::{AtomicBool, AtomicU32, AtomicU8, Ordering}, + mpsc, Arc, OnceLock, + }, + thread::JoinHandle, +}; + +use tracing::{debug, error, trace}; + +use crate::time::Timing; + +/// Stores status of the running threads. Threads are registered in the runtime upon creation +/// and deregistered upon termination. +pub struct Runtime { + // stores handles to all threads that are currently running + threads: Vec, + // stores current time and pending wakeups + clock: Arc, + // thread counter + thread_counter: AtomicU32, + // Thread step counter -- how many times all threads has been actually + // stepped (note that all world/time/executor/thread have slightly different + // meaning of steps). For observability. + pub step_counter: u64, +} + +impl Runtime { + /// Init new runtime, no running threads. 
+ pub fn new(clock: Arc) -> Self { + Self { + threads: Vec::new(), + clock, + thread_counter: AtomicU32::new(0), + step_counter: 0, + } + } + + /// Spawn a new thread and register it in the runtime. + pub fn spawn(&mut self, f: F) -> ExternalHandle + where + F: FnOnce() + Send + 'static, + { + let (tx, rx) = mpsc::channel(); + + let clock = self.clock.clone(); + let tid = self.thread_counter.fetch_add(1, Ordering::SeqCst); + debug!("spawning thread-{}", tid); + + let join = std::thread::spawn(move || { + let _guard = tracing::info_span!("", tid).entered(); + + let res = std::panic::catch_unwind(AssertUnwindSafe(|| { + with_thread_context(|ctx| { + assert!(ctx.clock.set(clock).is_ok()); + ctx.id.store(tid, Ordering::SeqCst); + tx.send(ctx.clone()).expect("failed to send thread context"); + // suspend thread to put it to `threads` in sleeping state + ctx.yield_me(0); + }); + + // start user-provided function + f(); + })); + debug!("thread finished"); + + if let Err(e) = res { + with_thread_context(|ctx| { + if !ctx.allow_panic.load(std::sync::atomic::Ordering::SeqCst) { + error!("thread panicked, terminating the process: {:?}", e); + std::process::exit(1); + } + + debug!("thread panicked: {:?}", e); + let mut result = ctx.result.lock(); + if result.0 == -1 { + *result = (256, format!("thread panicked: {:?}", e)); + } + }); + } + + with_thread_context(|ctx| { + ctx.finish_me(); + }); + }); + + let ctx = rx.recv().expect("failed to receive thread context"); + let handle = ThreadHandle::new(ctx.clone(), join); + + self.threads.push(handle); + + ExternalHandle { ctx } + } + + /// Returns true if there are any unfinished activity, such as running thread or pending events. + /// Otherwise returns false, which means all threads are blocked forever. + pub fn step(&mut self) -> bool { + trace!("runtime step"); + + // have we run any thread? 
+ let mut ran = false; + + self.threads.retain(|thread: &ThreadHandle| { + let res = thread.ctx.wakeup.compare_exchange( + PENDING_WAKEUP, + NO_WAKEUP, + Ordering::SeqCst, + Ordering::SeqCst, + ); + if res.is_err() { + // thread has no pending wakeups, leaving as is + return true; + } + ran = true; + + trace!("entering thread-{}", thread.ctx.tid()); + let status = thread.step(); + self.step_counter += 1; + trace!( + "out of thread-{} with status {:?}", + thread.ctx.tid(), + status + ); + + if status == Status::Sleep { + true + } else { + trace!("thread has finished"); + // removing the thread from the list + false + } + }); + + if !ran { + trace!("no threads were run, stepping clock"); + if let Some(ctx_to_wake) = self.clock.step() { + trace!("waking up thread-{}", ctx_to_wake.tid()); + ctx_to_wake.inc_wake(); + } else { + return false; + } + } + + true + } + + /// Kill all threads. This is done by setting a flag in each thread context and waking it up. + pub fn crash_all_threads(&mut self) { + for thread in self.threads.iter() { + thread.ctx.crash_stop(); + } + + // all threads should be finished after a few steps + while !self.threads.is_empty() { + self.step(); + } + } +} + +impl Drop for Runtime { + fn drop(&mut self) { + debug!("dropping the runtime"); + self.crash_all_threads(); + } +} + +#[derive(Clone)] +pub struct ExternalHandle { + ctx: Arc, +} + +impl ExternalHandle { + /// Returns true if thread has finished execution. + pub fn is_finished(&self) -> bool { + let status = self.ctx.mutex.lock(); + *status == Status::Finished + } + + /// Returns exitcode and message, which is available after thread has finished execution. + pub fn result(&self) -> (i32, String) { + let result = self.ctx.result.lock(); + result.clone() + } + + /// Returns thread id. + pub fn id(&self) -> u32 { + self.ctx.id.load(Ordering::SeqCst) + } + + /// Sets a flag to crash thread on the next wakeup. 
+ pub fn crash_stop(&self) { + self.ctx.crash_stop(); + } +} + +struct ThreadHandle { + ctx: Arc, + _join: JoinHandle<()>, +} + +impl ThreadHandle { + /// Create a new [`ThreadHandle`] and wait until thread will enter [`Status::Sleep`] state. + fn new(ctx: Arc, join: JoinHandle<()>) -> Self { + let mut status = ctx.mutex.lock(); + // wait until thread will go into the first yield + while *status != Status::Sleep { + ctx.condvar.wait(&mut status); + } + drop(status); + + Self { ctx, _join: join } + } + + /// Allows thread to execute one step of its execution. + /// Returns [`Status`] of the thread after the step. + fn step(&self) -> Status { + let mut status = self.ctx.mutex.lock(); + assert!(matches!(*status, Status::Sleep)); + + *status = Status::Running; + self.ctx.condvar.notify_all(); + + while *status == Status::Running { + self.ctx.condvar.wait(&mut status); + } + + *status + } +} + +#[derive(Clone, Copy, Debug, PartialEq, Eq)] +enum Status { + /// Thread is running. + Running, + /// Waiting for event to complete, will be resumed by the executor step, once wakeup flag is set. + Sleep, + /// Thread finished execution. 
+ Finished, +} + +const NO_WAKEUP: u8 = 0; +const PENDING_WAKEUP: u8 = 1; + +pub struct ThreadContext { + id: AtomicU32, + // used to block thread until it is woken up + mutex: parking_lot::Mutex, + condvar: parking_lot::Condvar, + // used as a flag to indicate runtime that thread is ready to be woken up + wakeup: AtomicU8, + clock: OnceLock>, + // execution result, set by exit() call + result: parking_lot::Mutex<(i32, String)>, + // determines if process should be killed on receiving panic + allow_panic: AtomicBool, + // acts as a signal that thread should crash itself on the next wakeup + crash_request: AtomicBool, +} + +impl ThreadContext { + pub(crate) fn new() -> Self { + Self { + id: AtomicU32::new(0), + mutex: parking_lot::Mutex::new(Status::Running), + condvar: parking_lot::Condvar::new(), + wakeup: AtomicU8::new(NO_WAKEUP), + clock: OnceLock::new(), + result: parking_lot::Mutex::new((-1, String::new())), + allow_panic: AtomicBool::new(false), + crash_request: AtomicBool::new(false), + } + } +} + +// Functions for executor to control thread execution. +impl ThreadContext { + /// Set atomic flag to indicate that thread is ready to be woken up. + fn inc_wake(&self) { + self.wakeup.store(PENDING_WAKEUP, Ordering::SeqCst); + } + + /// Internal function used for event queues. + pub(crate) fn schedule_wakeup(self: &Arc, after_ms: u64) { + self.clock + .get() + .unwrap() + .schedule_wakeup(after_ms, self.clone()); + } + + fn tid(&self) -> u32 { + self.id.load(Ordering::SeqCst) + } + + fn crash_stop(&self) { + let status = self.mutex.lock(); + if *status == Status::Finished { + debug!( + "trying to crash thread-{}, which is already finished", + self.tid() + ); + return; + } + assert!(matches!(*status, Status::Sleep)); + drop(status); + + self.allow_panic.store(true, Ordering::SeqCst); + self.crash_request.store(true, Ordering::SeqCst); + // set a wakeup + self.inc_wake(); + // it will panic on the next wakeup + } +} + +// Internal functions. 
+impl ThreadContext { + /// Blocks thread until it's woken up by the executor. If `after_ms` is 0, is will be + /// woken on the next step. If `after_ms` > 0, wakeup is scheduled after that time. + /// Otherwise wakeup is not scheduled inside `yield_me`, and should be arranged before + /// calling this function. + fn yield_me(self: &Arc, after_ms: i64) { + let mut status = self.mutex.lock(); + assert!(matches!(*status, Status::Running)); + + match after_ms.cmp(&0) { + std::cmp::Ordering::Less => { + // block until something wakes us up + } + std::cmp::Ordering::Equal => { + // tell executor that we are ready to be woken up + self.inc_wake(); + } + std::cmp::Ordering::Greater => { + // schedule wakeup + self.clock + .get() + .unwrap() + .schedule_wakeup(after_ms as u64, self.clone()); + } + } + + *status = Status::Sleep; + self.condvar.notify_all(); + + // wait until executor wakes us up + while *status != Status::Running { + self.condvar.wait(&mut status); + } + + if self.crash_request.load(Ordering::SeqCst) { + panic!("crashed by request"); + } + } + + /// Called only once, exactly before thread finishes execution. + fn finish_me(&self) { + let mut status = self.mutex.lock(); + assert!(matches!(*status, Status::Running)); + + *status = Status::Finished; + { + let mut result = self.result.lock(); + if result.0 == -1 { + *result = (0, "finished normally".to_owned()); + } + } + self.condvar.notify_all(); + } +} + +/// Invokes the given closure with a reference to the current thread [`ThreadContext`]. +#[inline(always)] +fn with_thread_context(f: impl FnOnce(&Arc) -> T) -> T { + thread_local!(static THREAD_DATA: Arc = Arc::new(ThreadContext::new())); + THREAD_DATA.with(f) +} + +/// Waker is used to wake up threads that are blocked on condition. +/// It keeps track of contexts [`Arc`] and can increment the counter +/// of several contexts to send a notification. 
+pub struct Waker { + // contexts that are waiting for a notification + contexts: parking_lot::Mutex; 8]>>, +} + +impl Default for Waker { + fn default() -> Self { + Self::new() + } +} + +impl Waker { + pub fn new() -> Self { + Self { + contexts: parking_lot::Mutex::new(smallvec::SmallVec::new()), + } + } + + /// Subscribe current thread to receive a wake notification later. + pub fn wake_me_later(&self) { + with_thread_context(|ctx| { + self.contexts.lock().push(ctx.clone()); + }); + } + + /// Wake up all threads that are waiting for a notification and clear the list. + pub fn wake_all(&self) { + let mut v = self.contexts.lock(); + for ctx in v.iter() { + ctx.inc_wake(); + } + v.clear(); + } +} + +/// See [`ThreadContext::yield_me`]. +pub fn yield_me(after_ms: i64) { + with_thread_context(|ctx| ctx.yield_me(after_ms)) +} + +/// Get current time. +pub fn now() -> u64 { + with_thread_context(|ctx| ctx.clock.get().unwrap().now()) +} + +pub fn exit(code: i32, msg: String) { + with_thread_context(|ctx| { + ctx.allow_panic.store(true, Ordering::SeqCst); + let mut result = ctx.result.lock(); + *result = (code, msg); + panic!("exit"); + }); +} + +pub(crate) fn get_thread_ctx() -> Arc { + with_thread_context(|ctx| ctx.clone()) +} + +/// Trait for polling channels until they have something. +pub trait PollSome { + /// Schedule wakeup for message arrival. + fn wake_me(&self); + + /// Check if channel has a ready message. + fn has_some(&self) -> bool; +} + +/// Blocks current thread until one of the channels has a ready message. Returns +/// index of the channel that has a message. If timeout is reached, returns None. +/// +/// Negative timeout means block forever. Zero timeout means check channels and return +/// immediately. Positive timeout means block until timeout is reached. 
+pub fn epoll_chans(chans: &[Box], timeout: i64) -> Option { + let deadline = if timeout < 0 { + 0 + } else { + now() + timeout as u64 + }; + + loop { + for chan in chans { + chan.wake_me() + } + + for (i, chan) in chans.iter().enumerate() { + if chan.has_some() { + return Some(i); + } + } + + if timeout < 0 { + // block until wakeup + yield_me(-1); + } else { + let current_time = now(); + if current_time >= deadline { + return None; + } + + yield_me((deadline - current_time) as i64); + } + } +} diff --git a/libs/desim/src/lib.rs b/libs/desim/src/lib.rs new file mode 100644 index 0000000000..14f5a885c5 --- /dev/null +++ b/libs/desim/src/lib.rs @@ -0,0 +1,8 @@ +pub mod chan; +pub mod executor; +pub mod network; +pub mod node_os; +pub mod options; +pub mod proto; +pub mod time; +pub mod world; diff --git a/libs/desim/src/network.rs b/libs/desim/src/network.rs new file mode 100644 index 0000000000..e15a714daa --- /dev/null +++ b/libs/desim/src/network.rs @@ -0,0 +1,451 @@ +use std::{ + cmp::Ordering, + collections::{BinaryHeap, VecDeque}, + fmt::{self, Debug}, + ops::DerefMut, + sync::{mpsc, Arc}, +}; + +use parking_lot::{ + lock_api::{MappedMutexGuard, MutexGuard}, + Mutex, RawMutex, +}; +use rand::rngs::StdRng; +use tracing::debug; + +use crate::{ + executor::{self, ThreadContext}, + options::NetworkOptions, + proto::NetEvent, + proto::NodeEvent, +}; + +use super::{chan::Chan, proto::AnyMessage}; + +pub struct NetworkTask { + options: Arc, + connections: Mutex>, + /// min-heap of connections having something to deliver. 
+ events: Mutex>, + task_context: Arc, +} + +impl NetworkTask { + pub fn start_new(options: Arc, tx: mpsc::Sender>) { + let ctx = executor::get_thread_ctx(); + let task = Arc::new(Self { + options, + connections: Mutex::new(Vec::new()), + events: Mutex::new(BinaryHeap::new()), + task_context: ctx, + }); + + // send the task upstream + tx.send(task.clone()).unwrap(); + + // start the task + task.start(); + } + + pub fn start_new_connection(self: &Arc, rng: StdRng, dst_accept: Chan) -> TCP { + let now = executor::now(); + let connection_id = self.connections.lock().len(); + + let vc = VirtualConnection { + connection_id, + dst_accept, + dst_sockets: [Chan::new(), Chan::new()], + state: Mutex::new(ConnectionState { + buffers: [NetworkBuffer::new(None), NetworkBuffer::new(Some(now))], + rng, + }), + }; + vc.schedule_timeout(self); + vc.send_connect(self); + + let recv_chan = vc.dst_sockets[0].clone(); + self.connections.lock().push(vc); + + TCP { + net: self.clone(), + conn_id: connection_id, + dir: 0, + recv_chan, + } + } +} + +// private functions +impl NetworkTask { + /// Schedule to wakeup network task (self) `after_ms` later to deliver + /// messages of connection `id`. + fn schedule(&self, id: usize, after_ms: u64) { + self.events.lock().push(Event { + time: executor::now() + after_ms, + conn_id: id, + }); + self.task_context.schedule_wakeup(after_ms); + } + + /// Get locked connection `id`. 
+ fn get(&self, id: usize) -> MappedMutexGuard<'_, RawMutex, VirtualConnection> { + MutexGuard::map(self.connections.lock(), |connections| { + connections.get_mut(id).unwrap() + }) + } + + fn collect_pending_events(&self, now: u64, vec: &mut Vec) { + vec.clear(); + let mut events = self.events.lock(); + while let Some(event) = events.peek() { + if event.time > now { + break; + } + let event = events.pop().unwrap(); + vec.push(event); + } + } + + fn start(self: &Arc) { + debug!("started network task"); + + let mut events = Vec::new(); + loop { + let now = executor::now(); + self.collect_pending_events(now, &mut events); + + for event in events.drain(..) { + let conn = self.get(event.conn_id); + conn.process(self); + } + + // block until wakeup + executor::yield_me(-1); + } + } +} + +// 0 - from node(0) to node(1) +// 1 - from node(1) to node(0) +type MessageDirection = u8; + +fn sender_str(dir: MessageDirection) -> &'static str { + match dir { + 0 => "client", + 1 => "server", + _ => unreachable!(), + } +} + +fn receiver_str(dir: MessageDirection) -> &'static str { + match dir { + 0 => "server", + 1 => "client", + _ => unreachable!(), + } +} + +/// Virtual connection between two nodes. +/// Node 0 is the creator of the connection (client), +/// and node 1 is the acceptor (server). +struct VirtualConnection { + connection_id: usize, + /// one-off chan, used to deliver Accept message to dst + dst_accept: Chan, + /// message sinks + dst_sockets: [Chan; 2], + state: Mutex, +} + +struct ConnectionState { + buffers: [NetworkBuffer; 2], + rng: StdRng, +} + +impl VirtualConnection { + /// Notify the future about the possible timeout. + fn schedule_timeout(&self, net: &NetworkTask) { + if let Some(timeout) = net.options.keepalive_timeout { + net.schedule(self.connection_id, timeout); + } + } + + /// Send the handshake (Accept) to the server. 
+ fn send_connect(&self, net: &NetworkTask) { + let now = executor::now(); + let mut state = self.state.lock(); + let delay = net.options.connect_delay.delay(&mut state.rng); + let buffer = &mut state.buffers[0]; + assert!(buffer.buf.is_empty()); + assert!(!buffer.recv_closed); + assert!(!buffer.send_closed); + assert!(buffer.last_recv.is_none()); + + let delay = if let Some(ms) = delay { + ms + } else { + debug!("NET: TCP #{} dropped connect", self.connection_id); + buffer.send_closed = true; + return; + }; + + // Send a message into the future. + buffer + .buf + .push_back((now + delay, AnyMessage::InternalConnect)); + net.schedule(self.connection_id, delay); + } + + /// Transmit some of the messages from the buffer to the nodes. + fn process(&self, net: &Arc) { + let now = executor::now(); + + let mut state = self.state.lock(); + + for direction in 0..2 { + self.process_direction( + net, + state.deref_mut(), + now, + direction as MessageDirection, + &self.dst_sockets[direction ^ 1], + ); + } + + // Close the one side of the connection by timeout if the node + // has not received any messages for a long time. + if let Some(timeout) = net.options.keepalive_timeout { + let mut to_close = [false, false]; + for direction in 0..2 { + let buffer = &mut state.buffers[direction]; + if buffer.recv_closed { + continue; + } + if let Some(last_recv) = buffer.last_recv { + if now - last_recv >= timeout { + debug!( + "NET: connection {} timed out at {}", + self.connection_id, + receiver_str(direction as MessageDirection) + ); + let node_idx = direction ^ 1; + to_close[node_idx] = true; + } + } + } + drop(state); + + for (node_idx, should_close) in to_close.iter().enumerate() { + if *should_close { + self.close(node_idx); + } + } + } + } + + /// Process messages in the buffer in the given direction. 
+ fn process_direction( + &self, + net: &Arc, + state: &mut ConnectionState, + now: u64, + direction: MessageDirection, + to_socket: &Chan, + ) { + let buffer = &mut state.buffers[direction as usize]; + if buffer.recv_closed { + assert!(buffer.buf.is_empty()); + } + + while !buffer.buf.is_empty() && buffer.buf.front().unwrap().0 <= now { + let msg = buffer.buf.pop_front().unwrap().1; + + buffer.last_recv = Some(now); + self.schedule_timeout(net); + + if let AnyMessage::InternalConnect = msg { + // TODO: assert to_socket is the server + let server_to_client = TCP { + net: net.clone(), + conn_id: self.connection_id, + dir: direction ^ 1, + recv_chan: to_socket.clone(), + }; + // special case, we need to deliver new connection to a separate channel + self.dst_accept.send(NodeEvent::Accept(server_to_client)); + } else { + to_socket.send(NetEvent::Message(msg)); + } + } + } + + /// Try to send a message to the buffer, optionally dropping it and + /// determining delivery timestamp. + fn send(&self, net: &NetworkTask, direction: MessageDirection, msg: AnyMessage) { + let now = executor::now(); + let mut state = self.state.lock(); + + let (delay, close) = if let Some(ms) = net.options.send_delay.delay(&mut state.rng) { + (ms, false) + } else { + (0, true) + }; + + let buffer = &mut state.buffers[direction as usize]; + if buffer.send_closed { + debug!( + "NET: TCP #{} dropped message {:?} (broken pipe)", + self.connection_id, msg + ); + return; + } + + if close { + debug!( + "NET: TCP #{} dropped message {:?} (pipe just broke)", + self.connection_id, msg + ); + buffer.send_closed = true; + return; + } + + if buffer.recv_closed { + debug!( + "NET: TCP #{} dropped message {:?} (recv closed)", + self.connection_id, msg + ); + return; + } + + // Send a message into the future. + buffer.buf.push_back((now + delay, msg)); + net.schedule(self.connection_id, delay); + } + + /// Close the connection. 
Only one side of the connection will be closed, + /// and no further messages will be delivered. The other side will not be notified. + fn close(&self, node_idx: usize) { + let mut state = self.state.lock(); + let recv_buffer = &mut state.buffers[1 ^ node_idx]; + if recv_buffer.recv_closed { + debug!( + "NET: TCP #{} closed twice at {}", + self.connection_id, + sender_str(node_idx as MessageDirection), + ); + return; + } + + debug!( + "NET: TCP #{} closed at {}", + self.connection_id, + sender_str(node_idx as MessageDirection), + ); + recv_buffer.recv_closed = true; + for msg in recv_buffer.buf.drain(..) { + debug!( + "NET: TCP #{} dropped message {:?} (closed)", + self.connection_id, msg + ); + } + + let send_buffer = &mut state.buffers[node_idx]; + send_buffer.send_closed = true; + drop(state); + + // TODO: notify the other side? + + self.dst_sockets[node_idx].send(NetEvent::Closed); + } +} + +struct NetworkBuffer { + /// Messages paired with time of delivery + buf: VecDeque<(u64, AnyMessage)>, + /// True if the connection is closed on the receiving side, + /// i.e. no more messages from the buffer will be delivered. + recv_closed: bool, + /// True if the connection is closed on the sending side, + /// i.e. no more messages will be added to the buffer. + send_closed: bool, + /// Last time a message was delivered from the buffer. + /// If None, it means that the server is the receiver and + /// it has not yet aware of this connection (i.e. has not + /// received the Accept). + last_recv: Option, +} + +impl NetworkBuffer { + fn new(last_recv: Option) -> Self { + Self { + buf: VecDeque::new(), + recv_closed: false, + send_closed: false, + last_recv, + } + } +} + +/// Single end of a bidirectional network stream without reordering (TCP-like). +/// Reads are implemented using channels, writes go to the buffer inside VirtualConnection. 
+pub struct TCP { + net: Arc, + conn_id: usize, + dir: MessageDirection, + recv_chan: Chan, +} + +impl Debug for TCP { + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + write!(f, "TCP #{} ({})", self.conn_id, sender_str(self.dir),) + } +} + +impl TCP { + /// Send a message to the other side. It's guaranteed that it will not arrive + /// before the arrival of all messages sent earlier. + pub fn send(&self, msg: AnyMessage) { + let conn = self.net.get(self.conn_id); + conn.send(&self.net, self.dir, msg); + } + + /// Get a channel to receive incoming messages. + pub fn recv_chan(&self) -> Chan { + self.recv_chan.clone() + } + + pub fn connection_id(&self) -> usize { + self.conn_id + } + + pub fn close(&self) { + let conn = self.net.get(self.conn_id); + conn.close(self.dir as usize); + } +} +struct Event { + time: u64, + conn_id: usize, +} + +// BinaryHeap is a max-heap, and we want a min-heap. Reverse the ordering here +// to get that. +impl PartialOrd for Event { + fn partial_cmp(&self, other: &Self) -> Option { + Some(self.cmp(other)) + } +} + +impl Ord for Event { + fn cmp(&self, other: &Self) -> Ordering { + (other.time, other.conn_id).cmp(&(self.time, self.conn_id)) + } +} + +impl PartialEq for Event { + fn eq(&self, other: &Self) -> bool { + (other.time, other.conn_id) == (self.time, self.conn_id) + } +} + +impl Eq for Event {} diff --git a/libs/desim/src/node_os.rs b/libs/desim/src/node_os.rs new file mode 100644 index 0000000000..7744a9f5e1 --- /dev/null +++ b/libs/desim/src/node_os.rs @@ -0,0 +1,54 @@ +use std::sync::Arc; + +use rand::Rng; + +use crate::proto::NodeEvent; + +use super::{ + chan::Chan, + network::TCP, + world::{Node, NodeId, World}, +}; + +/// Abstraction with all functions (aka syscalls) available to the node. +#[derive(Clone)] +pub struct NodeOs { + world: Arc, + internal: Arc, +} + +impl NodeOs { + pub fn new(world: Arc, internal: Arc) -> NodeOs { + NodeOs { world, internal } + } + + /// Get the node id. 
+ pub fn id(&self) -> NodeId { + self.internal.id + } + + /// Opens a bidirectional connection with the other node. Always successful. + pub fn open_tcp(&self, dst: NodeId) -> TCP { + self.world.open_tcp(dst) + } + + /// Returns a channel to receive node events (socket Accept and internal messages). + pub fn node_events(&self) -> Chan { + self.internal.node_events() + } + + /// Get current time. + pub fn now(&self) -> u64 { + self.world.now() + } + + /// Generate a random number in range [0, max). + pub fn random(&self, max: u64) -> u64 { + self.internal.rng.lock().gen_range(0..max) + } + + /// Append a new event to the world event log. + pub fn log_event(&self, data: String) { + self.internal.log_event(data) + } +} diff --git a/libs/desim/src/options.rs b/libs/desim/src/options.rs new file mode 100644 index 0000000000..5da7c2c482 --- /dev/null +++ b/libs/desim/src/options.rs @@ -0,0 +1,50 @@ +use rand::{rngs::StdRng, Rng}; + +/// Describes random delays and failures. Delay will be uniformly distributed in [min, max]. +/// Connection failure will occur with the probablity fail_prob. +#[derive(Clone, Debug)] +pub struct Delay { + pub min: u64, + pub max: u64, + pub fail_prob: f64, // [0; 1] +} + +impl Delay { + /// Create a struct with no delay, no failures. + pub fn empty() -> Delay { + Delay { + min: 0, + max: 0, + fail_prob: 0.0, + } + } + + /// Create a struct with a fixed delay. + pub fn fixed(ms: u64) -> Delay { + Delay { + min: ms, + max: ms, + fail_prob: 0.0, + } + } + + /// Generate a random delay in range [min, max]. Return None if the + /// message should be dropped. + pub fn delay(&self, rng: &mut StdRng) -> Option { + if rng.gen_bool(self.fail_prob) { + return None; + } + Some(rng.gen_range(self.min..=self.max)) + } +} + +/// Describes network settings. All network packets will be subjected to the same delays and failures. 
+#[derive(Clone, Debug)] +pub struct NetworkOptions { + /// Connection will be automatically closed after this timeout if no data is received. + pub keepalive_timeout: Option, + /// New connections will be delayed by this amount of time. + pub connect_delay: Delay, + /// Each message will be delayed by this amount of time. + pub send_delay: Delay, +} diff --git a/libs/desim/src/proto.rs b/libs/desim/src/proto.rs new file mode 100644 index 0000000000..92a7e8a27d --- /dev/null +++ b/libs/desim/src/proto.rs @@ -0,0 +1,63 @@ +use std::fmt::Debug; + +use bytes::Bytes; +use utils::lsn::Lsn; + +use crate::{network::TCP, world::NodeId}; + +/// Internal node events. +#[derive(Debug)] +pub enum NodeEvent { + Accept(TCP), + Internal(AnyMessage), +} + +/// Events that are coming from a network socket. +#[derive(Clone, Debug)] +pub enum NetEvent { + Message(AnyMessage), + Closed, +} + +/// Custom events generated throughout the simulation. Can be used by the test to verify the correctness. +#[derive(Debug)] +pub struct SimEvent { + pub time: u64, + pub node: NodeId, + pub data: String, +} + +/// Umbrella type for all possible flavours of messages. These events can be sent over network +/// or to an internal node events channel. +#[derive(Clone)] +pub enum AnyMessage { + /// Not used, empty placeholder. + None, + /// Used internally for notifying node about new incoming connection. 
+ InternalConnect, + Just32(u32), + ReplCell(ReplCell), + Bytes(Bytes), + LSN(u64), +} + +impl Debug for AnyMessage { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + match self { + AnyMessage::None => write!(f, "None"), + AnyMessage::InternalConnect => write!(f, "InternalConnect"), + AnyMessage::Just32(v) => write!(f, "Just32({})", v), + AnyMessage::ReplCell(v) => write!(f, "ReplCell({:?})", v), + AnyMessage::Bytes(v) => write!(f, "Bytes({})", hex::encode(v)), + AnyMessage::LSN(v) => write!(f, "LSN({})", Lsn(*v)), + } + } +} + +/// Used in reliable_copy_test.rs +#[derive(Clone, Debug)] +pub struct ReplCell { + pub value: u32, + pub client_id: u32, + pub seqno: u32, +} diff --git a/libs/desim/src/time.rs b/libs/desim/src/time.rs new file mode 100644 index 0000000000..7bb71db95c --- /dev/null +++ b/libs/desim/src/time.rs @@ -0,0 +1,129 @@ +use std::{ + cmp::Ordering, + collections::BinaryHeap, + ops::DerefMut, + sync::{ + atomic::{AtomicU32, AtomicU64}, + Arc, + }, +}; + +use parking_lot::Mutex; +use tracing::trace; + +use crate::executor::ThreadContext; + +/// Holds current time and all pending wakeup events. +pub struct Timing { + /// Current world's time. + current_time: AtomicU64, + /// Pending timers. + queue: Mutex>, + /// Global nonce. Makes picking events from binary heap queue deterministic + /// by appending a number to events with the same timestamp. + nonce: AtomicU32, + /// Used to schedule fake events. + fake_context: Arc, +} + +impl Default for Timing { + fn default() -> Self { + Self::new() + } +} + +impl Timing { + /// Create a new empty clock with time set to 0. + pub fn new() -> Timing { + Timing { + current_time: AtomicU64::new(0), + queue: Mutex::new(BinaryHeap::new()), + nonce: AtomicU32::new(0), + fake_context: Arc::new(ThreadContext::new()), + } + } + + /// Return the current world's time. + pub fn now(&self) -> u64 { + self.current_time.load(std::sync::atomic::Ordering::SeqCst) + } + + /// Tick-tock the global clock. 
Return the event ready to be processed + /// or move the clock forward and then return the event. + pub(crate) fn step(&self) -> Option> { + let mut queue = self.queue.lock(); + + if queue.is_empty() { + // no future events + return None; + } + + if !self.is_event_ready(queue.deref_mut()) { + let next_time = queue.peek().unwrap().time; + self.current_time + .store(next_time, std::sync::atomic::Ordering::SeqCst); + trace!("rewind time to {}", next_time); + assert!(self.is_event_ready(queue.deref_mut())); + } + + Some(queue.pop().unwrap().wake_context) + } + + /// Append an event to the queue, to wakeup the thread in `ms` milliseconds. + pub(crate) fn schedule_wakeup(&self, ms: u64, wake_context: Arc) { + self.nonce.fetch_add(1, std::sync::atomic::Ordering::SeqCst); + let nonce = self.nonce.load(std::sync::atomic::Ordering::SeqCst); + self.queue.lock().push(Pending { + time: self.now() + ms, + nonce, + wake_context, + }) + } + + /// Append a fake event to the queue, to prevent clocks from skipping this time. + pub fn schedule_fake(&self, ms: u64) { + self.queue.lock().push(Pending { + time: self.now() + ms, + nonce: 0, + wake_context: self.fake_context.clone(), + }); + } + + /// Return true if there is a ready event. + fn is_event_ready(&self, queue: &mut BinaryHeap) -> bool { + queue.peek().map_or(false, |x| x.time <= self.now()) + } + + /// Clear all pending events. + pub(crate) fn clear(&self) { + self.queue.lock().clear(); + } +} + +struct Pending { + time: u64, + nonce: u32, + wake_context: Arc, +} + +// BinaryHeap is a max-heap, and we want a min-heap. Reverse the ordering here +// to get that. 
+impl PartialOrd for Pending { + fn partial_cmp(&self, other: &Self) -> Option { + Some(self.cmp(other)) + } +} + +impl Ord for Pending { + fn cmp(&self, other: &Self) -> Ordering { + (other.time, other.nonce).cmp(&(self.time, self.nonce)) + } +} + +impl PartialEq for Pending { + fn eq(&self, other: &Self) -> bool { + (other.time, other.nonce) == (self.time, self.nonce) + } +} + +impl Eq for Pending {} diff --git a/libs/desim/src/world.rs b/libs/desim/src/world.rs new file mode 100644 index 0000000000..7d60be04b5 --- /dev/null +++ b/libs/desim/src/world.rs @@ -0,0 +1,180 @@ +use parking_lot::Mutex; +use rand::{rngs::StdRng, SeedableRng}; +use std::{ + ops::DerefMut, + sync::{mpsc, Arc}, +}; + +use crate::{ + executor::{ExternalHandle, Runtime}, + network::NetworkTask, + options::NetworkOptions, + proto::{NodeEvent, SimEvent}, + time::Timing, +}; + +use super::{chan::Chan, network::TCP, node_os::NodeOs}; + +pub type NodeId = u32; + +/// World contains simulation state. +pub struct World { + nodes: Mutex>>, + /// Random number generator. + rng: Mutex, + /// Internal event log. + events: Mutex>, + /// Separate task that processes all network messages. + network_task: Arc, + /// Runtime for running threads and moving time. + runtime: Mutex, + /// To get current time. 
+ timing: Arc, +} + +impl World { + pub fn new(seed: u64, options: Arc) -> World { + let timing = Arc::new(Timing::new()); + let mut runtime = Runtime::new(timing.clone()); + + let (tx, rx) = mpsc::channel(); + + runtime.spawn(move || { + // create and start network background thread, and send it back via the channel + NetworkTask::start_new(options, tx) + }); + + // wait for the network task to start + while runtime.step() {} + + let network_task = rx.recv().unwrap(); + + World { + nodes: Mutex::new(Vec::new()), + rng: Mutex::new(StdRng::seed_from_u64(seed)), + events: Mutex::new(Vec::new()), + network_task, + runtime: Mutex::new(runtime), + timing, + } + } + + pub fn step(&self) -> bool { + self.runtime.lock().step() + } + + pub fn get_thread_step_count(&self) -> u64 { + self.runtime.lock().step_counter + } + + /// Create a new random number generator. + pub fn new_rng(&self) -> StdRng { + let mut rng = self.rng.lock(); + StdRng::from_rng(rng.deref_mut()).unwrap() + } + + /// Create a new node. + pub fn new_node(self: &Arc) -> Arc { + let mut nodes = self.nodes.lock(); + let id = nodes.len() as NodeId; + let node = Arc::new(Node::new(id, self.clone(), self.new_rng())); + nodes.push(node.clone()); + node + } + + /// Get an internal node state by id. + fn get_node(&self, id: NodeId) -> Option> { + let nodes = self.nodes.lock(); + let num = id as usize; + if num < nodes.len() { + Some(nodes[num].clone()) + } else { + None + } + } + + pub fn stop_all(&self) { + self.runtime.lock().crash_all_threads(); + } + + /// Returns a writable end of a TCP connection, to send src->dst messages. + pub fn open_tcp(self: &Arc, dst: NodeId) -> TCP { + // TODO: replace unwrap() with /dev/null socket. + let dst = self.get_node(dst).unwrap(); + let dst_accept = dst.node_events.lock().clone(); + + let rng = self.new_rng(); + self.network_task.start_new_connection(rng, dst_accept) + } + + /// Get current time. 
+ pub fn now(&self) -> u64 { + self.timing.now() + } + + /// Get a copy of the internal clock. + pub fn clock(&self) -> Arc { + self.timing.clone() + } + + pub fn add_event(&self, node: NodeId, data: String) { + let time = self.now(); + self.events.lock().push(SimEvent { time, node, data }); + } + + pub fn take_events(&self) -> Vec { + let mut events = self.events.lock(); + let mut res = Vec::new(); + std::mem::swap(&mut res, &mut events); + res + } + + pub fn deallocate(&self) { + self.stop_all(); + self.timing.clear(); + self.nodes.lock().clear(); + } +} + +/// Internal node state. +pub struct Node { + pub id: NodeId, + node_events: Mutex>, + world: Arc, + pub(crate) rng: Mutex, +} + +impl Node { + pub fn new(id: NodeId, world: Arc, rng: StdRng) -> Node { + Node { + id, + node_events: Mutex::new(Chan::new()), + world, + rng: Mutex::new(rng), + } + } + + /// Spawn a new thread with this node context. + pub fn launch(self: &Arc, f: impl FnOnce(NodeOs) + Send + 'static) -> ExternalHandle { + let node = self.clone(); + let world = self.world.clone(); + self.world.runtime.lock().spawn(move || { + f(NodeOs::new(world, node.clone())); + }) + } + + /// Returns a channel to receive Accepts and internal messages. + pub fn node_events(&self) -> Chan { + self.node_events.lock().clone() + } + + /// This will drop all in-flight Accept messages. + pub fn replug_node_events(&self, chan: Chan) { + *self.node_events.lock() = chan; + } + + /// Append event to the world's log. + pub fn log_event(&self, data: String) { + self.world.add_event(self.id, data) + } +} diff --git a/libs/desim/tests/reliable_copy_test.rs b/libs/desim/tests/reliable_copy_test.rs new file mode 100644 index 0000000000..cf7bff8f5a --- /dev/null +++ b/libs/desim/tests/reliable_copy_test.rs @@ -0,0 +1,244 @@ +//! Simple test to verify that simulator is working. 
+#[cfg(test)] +mod reliable_copy_test { + use anyhow::Result; + use desim::executor::{self, PollSome}; + use desim::options::{Delay, NetworkOptions}; + use desim::proto::{NetEvent, NodeEvent, ReplCell}; + use desim::world::{NodeId, World}; + use desim::{node_os::NodeOs, proto::AnyMessage}; + use parking_lot::Mutex; + use std::sync::Arc; + use tracing::info; + + /// Disk storage trait and implementation. + pub trait Storage { + fn flush_pos(&self) -> u32; + fn flush(&mut self) -> Result<()>; + fn write(&mut self, t: T); + } + + #[derive(Clone)] + pub struct SharedStorage { + pub state: Arc>>, + } + + impl SharedStorage { + pub fn new() -> Self { + Self { + state: Arc::new(Mutex::new(InMemoryStorage::new())), + } + } + } + + impl Storage for SharedStorage { + fn flush_pos(&self) -> u32 { + self.state.lock().flush_pos + } + + fn flush(&mut self) -> Result<()> { + executor::yield_me(0); + self.state.lock().flush() + } + + fn write(&mut self, t: T) { + executor::yield_me(0); + self.state.lock().write(t); + } + } + + pub struct InMemoryStorage { + pub data: Vec, + pub flush_pos: u32, + } + + impl InMemoryStorage { + pub fn new() -> Self { + Self { + data: Vec::new(), + flush_pos: 0, + } + } + + pub fn flush(&mut self) -> Result<()> { + self.flush_pos = self.data.len() as u32; + Ok(()) + } + + pub fn write(&mut self, t: T) { + self.data.push(t); + } + } + + /// Server implementation. 
+ pub fn run_server(os: NodeOs, mut storage: Box>) { + info!("started server"); + + let node_events = os.node_events(); + let mut epoll_vec: Vec> = vec![Box::new(node_events.clone())]; + let mut sockets = vec![]; + + loop { + let index = executor::epoll_chans(&epoll_vec, -1).unwrap(); + + if index == 0 { + let node_event = node_events.must_recv(); + info!("got node event: {:?}", node_event); + if let NodeEvent::Accept(tcp) = node_event { + tcp.send(AnyMessage::Just32(storage.flush_pos())); + epoll_vec.push(Box::new(tcp.recv_chan())); + sockets.push(tcp); + } + continue; + } + + let recv_chan = sockets[index - 1].recv_chan(); + let socket = &sockets[index - 1]; + + let event = recv_chan.must_recv(); + info!("got event: {:?}", event); + if let NetEvent::Message(AnyMessage::ReplCell(cell)) = event { + if cell.seqno != storage.flush_pos() { + info!("got out of order data: {:?}", cell); + continue; + } + storage.write(cell.value); + storage.flush().unwrap(); + socket.send(AnyMessage::Just32(storage.flush_pos())); + } + } + } + + /// Client copies all data from array to the remote node. 
+ pub fn run_client(os: NodeOs, data: &[ReplCell], dst: NodeId) { + info!("started client"); + + let mut delivered = 0; + + let mut sock = os.open_tcp(dst); + let mut recv_chan = sock.recv_chan(); + + while delivered < data.len() { + let num = &data[delivered]; + info!("sending data: {:?}", num.clone()); + sock.send(AnyMessage::ReplCell(num.clone())); + + // loop { + let event = recv_chan.recv(); + match event { + NetEvent::Message(AnyMessage::Just32(flush_pos)) => { + if flush_pos == 1 + delivered as u32 { + delivered += 1; + } + } + NetEvent::Closed => { + info!("connection closed, reestablishing"); + sock = os.open_tcp(dst); + recv_chan = sock.recv_chan(); + } + _ => {} + } + + // } + } + + let sock = os.open_tcp(dst); + for num in data { + info!("sending data: {:?}", num.clone()); + sock.send(AnyMessage::ReplCell(num.clone())); + } + + info!("sent all data and finished client"); + } + + /// Run test simulations. + #[test] + fn sim_example_reliable_copy() { + utils::logging::init( + utils::logging::LogFormat::Test, + utils::logging::TracingErrorLayerEnablement::Disabled, + utils::logging::Output::Stdout, + ) + .expect("logging init failed"); + + let delay = Delay { + min: 1, + max: 60, + fail_prob: 0.4, + }; + + let network = NetworkOptions { + keepalive_timeout: Some(50), + connect_delay: delay.clone(), + send_delay: delay.clone(), + }; + + for seed in 0..20 { + let u32_data: [u32; 5] = [1, 2, 3, 4, 5]; + let data = u32_to_cells(&u32_data, 1); + let world = Arc::new(World::new(seed, Arc::new(network.clone()))); + + start_simulation(Options { + world, + time_limit: 1_000_000, + client_fn: Box::new(move |os, server_id| run_client(os, &data, server_id)), + u32_data, + }); + } + } + + pub struct Options { + pub world: Arc, + pub time_limit: u64, + pub u32_data: [u32; 5], + pub client_fn: Box, + } + + pub fn start_simulation(options: Options) { + let world = options.world; + + let client_node = world.new_node(); + let server_node = world.new_node(); + let server_id 
= server_node.id; + + // start the client thread + client_node.launch(move |os| { + let client_fn = options.client_fn; + client_fn(os, server_id); + }); + + // start the server thread + let shared_storage = SharedStorage::new(); + let server_storage = shared_storage.clone(); + server_node.launch(move |os| run_server(os, Box::new(server_storage))); + + while world.step() && world.now() < options.time_limit {} + + let disk_data = shared_storage.state.lock().data.clone(); + assert!(verify_data(&disk_data, &options.u32_data[..])); + } + + pub fn u32_to_cells(data: &[u32], client_id: u32) -> Vec { + let mut res = Vec::new(); + for (i, _) in data.iter().enumerate() { + res.push(ReplCell { + client_id, + seqno: i as u32, + value: data[i], + }); + } + res + } + + fn verify_data(disk_data: &[u32], data: &[u32]) -> bool { + if disk_data.len() != data.len() { + return false; + } + for i in 0..data.len() { + if disk_data[i] != data[i] { + return false; + } + } + true + } +} diff --git a/libs/metrics/Cargo.toml b/libs/metrics/Cargo.toml index d4323ae766..f6a49a0166 100644 --- a/libs/metrics/Cargo.toml +++ b/libs/metrics/Cargo.toml @@ -9,5 +9,13 @@ prometheus.workspace = true libc.workspace = true once_cell.workspace = true chrono.workspace = true +twox-hash.workspace = true workspace_hack.workspace = true + +[target.'cfg(target_os = "linux")'.dependencies] +procfs.workspace = true + +[dev-dependencies] +rand = "0.8" +rand_distr = "0.4.3" diff --git a/libs/metrics/src/hll.rs b/libs/metrics/src/hll.rs new file mode 100644 index 0000000000..46a623b0e2 --- /dev/null +++ b/libs/metrics/src/hll.rs @@ -0,0 +1,523 @@ +//! HyperLogLog is an algorithm for the count-distinct problem, +//! approximating the number of distinct elements in a multiset. +//! Calculating the exact cardinality of the distinct elements +//! of a multiset requires an amount of memory proportional to +//! the cardinality, which is impractical for very large data sets. +//! 
Probabilistic cardinality estimators, such as the HyperLogLog algorithm, +//! use significantly less memory than this, but can only approximate the cardinality. + +use std::{ + collections::HashMap, + hash::{BuildHasher, BuildHasherDefault, Hash, Hasher}, + sync::{atomic::AtomicU8, Arc, RwLock}, +}; + +use prometheus::{ + core::{self, Describer}, + proto, Opts, +}; +use twox_hash::xxh3; + +/// Create an [`HyperLogLogVec`] and registers to default registry. +#[macro_export(local_inner_macros)] +macro_rules! register_hll_vec { + ($N:literal, $OPTS:expr, $LABELS_NAMES:expr $(,)?) => {{ + let hll_vec = $crate::HyperLogLogVec::<$N>::new($OPTS, $LABELS_NAMES).unwrap(); + $crate::register(Box::new(hll_vec.clone())).map(|_| hll_vec) + }}; + + ($N:literal, $NAME:expr, $HELP:expr, $LABELS_NAMES:expr $(,)?) => {{ + $crate::register_hll_vec!($N, $crate::opts!($NAME, $HELP), $LABELS_NAMES) + }}; +} + +/// Create an [`HyperLogLog`] and registers to default registry. +#[macro_export(local_inner_macros)] +macro_rules! register_hll { + ($N:literal, $OPTS:expr $(,)?) => {{ + let hll = $crate::HyperLogLog::<$N>::with_opts($OPTS).unwrap(); + $crate::register(Box::new(hll.clone())).map(|_| hll) + }}; + + ($N:literal, $NAME:expr, $HELP:expr $(,)?) => {{ + $crate::register_hll!($N, $crate::opts!($NAME, $HELP), $LABELS_NAMES) + }}; +} + +/// HLL is a probabilistic cardinality measure. +/// +/// How to use this time-series for a metric name `my_metrics_total_hll`: +/// +/// ```promql +/// # harmonic mean +/// 1 / ( +/// sum ( +/// 2 ^ -( +/// # HLL merge operation +/// max (my_metrics_total_hll{}) by (hll_shard, other_labels...) 
+/// ) +/// ) without (hll_shard) +/// ) +/// * alpha +/// * shards_count +/// * shards_count +/// ``` +/// +/// If you want an estimate over time, you can use the following query: +/// +/// ```promql +/// # harmonic mean +/// 1 / ( +/// sum ( +/// 2 ^ -( +/// # HLL merge operation +/// max ( +/// max_over_time(my_metrics_total_hll{}[$__rate_interval]) +/// ) by (hll_shard, other_labels...) +/// ) +/// ) without (hll_shard) +/// ) +/// * alpha +/// * shards_count +/// * shards_count +/// ``` +/// +/// In the case of low cardinality, you might want to use the linear counting approximation: +/// +/// ```promql +/// # LinearCounting(m, V) = m log (m / V) +/// shards_count * ln(shards_count / +/// # calculate V = how many shards contain a 0 +/// count(max (proxy_connecting_endpoints{}) by (hll_shard, protocol) == 0) without (hll_shard) +/// ) +/// ``` +/// +/// See for estimates on alpha +#[derive(Clone)] +pub struct HyperLogLogVec { + core: Arc>, +} + +struct HyperLogLogVecCore { + pub children: RwLock, BuildHasherDefault>>, + pub desc: core::Desc, + pub opts: Opts, +} + +impl core::Collector for HyperLogLogVec { + fn desc(&self) -> Vec<&core::Desc> { + vec![&self.core.desc] + } + + fn collect(&self) -> Vec { + let mut m = proto::MetricFamily::default(); + m.set_name(self.core.desc.fq_name.clone()); + m.set_help(self.core.desc.help.clone()); + m.set_field_type(proto::MetricType::GAUGE); + + let mut metrics = Vec::new(); + for child in self.core.children.read().unwrap().values() { + child.core.collect_into(&mut metrics); + } + m.set_metric(metrics); + + vec![m] + } +} + +impl HyperLogLogVec { + /// Create a new [`HyperLogLogVec`] based on the provided + /// [`Opts`] and partitioned by the given label names. At least one label name must be + /// provided. 
+ pub fn new(opts: Opts, label_names: &[&str]) -> prometheus::Result { + assert!(N.is_power_of_two()); + let variable_names = label_names.iter().map(|s| (*s).to_owned()).collect(); + let opts = opts.variable_labels(variable_names); + + let desc = opts.describe()?; + let v = HyperLogLogVecCore { + children: RwLock::new(HashMap::default()), + desc, + opts, + }; + + Ok(Self { core: Arc::new(v) }) + } + + /// `get_metric_with_label_values` returns the [`HyperLogLog

`] for the given slice + /// of label values (same order as the VariableLabels in Desc). If that combination of + /// label values is accessed for the first time, a new [`HyperLogLog

`] is created. + /// + /// An error is returned if the number of label values is not the same as the + /// number of VariableLabels in Desc. + pub fn get_metric_with_label_values( + &self, + vals: &[&str], + ) -> prometheus::Result> { + self.core.get_metric_with_label_values(vals) + } + + /// `with_label_values` works as `get_metric_with_label_values`, but panics if an error + /// occurs. + pub fn with_label_values(&self, vals: &[&str]) -> HyperLogLog { + self.get_metric_with_label_values(vals).unwrap() + } +} + +impl HyperLogLogVecCore { + pub fn get_metric_with_label_values( + &self, + vals: &[&str], + ) -> prometheus::Result> { + let h = self.hash_label_values(vals)?; + + if let Some(metric) = self.children.read().unwrap().get(&h).cloned() { + return Ok(metric); + } + + self.get_or_create_metric(h, vals) + } + + pub(crate) fn hash_label_values(&self, vals: &[&str]) -> prometheus::Result { + if vals.len() != self.desc.variable_labels.len() { + return Err(prometheus::Error::InconsistentCardinality { + expect: self.desc.variable_labels.len(), + got: vals.len(), + }); + } + + let mut h = xxh3::Hash64::default(); + for val in vals { + h.write(val.as_bytes()); + } + + Ok(h.finish()) + } + + fn get_or_create_metric( + &self, + hash: u64, + label_values: &[&str], + ) -> prometheus::Result> { + let mut children = self.children.write().unwrap(); + // Check exist first. + if let Some(metric) = children.get(&hash).cloned() { + return Ok(metric); + } + + let metric = HyperLogLog::with_opts_and_label_values(&self.opts, label_values)?; + children.insert(hash, metric.clone()); + Ok(metric) + } +} + +/// HLL is a probabilistic cardinality measure. +/// +/// How to use this time-series for a metric name `my_metrics_total_hll`: +/// +/// ```promql +/// # harmonic mean +/// 1 / ( +/// sum ( +/// 2 ^ -( +/// # HLL merge operation +/// max (my_metrics_total_hll{}) by (hll_shard, other_labels...) 
+/// ) +/// ) without (hll_shard) +/// ) +/// * alpha +/// * shards_count +/// * shards_count +/// ``` +/// +/// If you want an estimate over time, you can use the following query: +/// +/// ```promql +/// # harmonic mean +/// 1 / ( +/// sum ( +/// 2 ^ -( +/// # HLL merge operation +/// max ( +/// max_over_time(my_metrics_total_hll{}[$__rate_interval]) +/// ) by (hll_shard, other_labels...) +/// ) +/// ) without (hll_shard) +/// ) +/// * alpha +/// * shards_count +/// * shards_count +/// ``` +/// +/// In the case of low cardinality, you might want to use the linear counting approximation: +/// +/// ```promql +/// # LinearCounting(m, V) = m log (m / V) +/// shards_count * ln(shards_count / +/// # calculate V = how many shards contain a 0 +/// count(max (proxy_connecting_endpoints{}) by (hll_shard, protocol) == 0) without (hll_shard) +/// ) +/// ``` +/// +/// See for estimates on alpha +#[derive(Clone)] +pub struct HyperLogLog { + core: Arc>, +} + +impl HyperLogLog { + /// Create a [`HyperLogLog`] with the `name` and `help` arguments. + pub fn new, S2: Into>(name: S1, help: S2) -> prometheus::Result { + assert!(N.is_power_of_two()); + let opts = Opts::new(name, help); + Self::with_opts(opts) + } + + /// Create a [`HyperLogLog`] with the `opts` options. + pub fn with_opts(opts: Opts) -> prometheus::Result { + Self::with_opts_and_label_values(&opts, &[]) + } + + fn with_opts_and_label_values(opts: &Opts, label_values: &[&str]) -> prometheus::Result { + let desc = opts.describe()?; + let labels = make_label_pairs(&desc, label_values)?; + + let v = HyperLogLogCore { + shards: [0; N].map(AtomicU8::new), + desc, + labels, + }; + Ok(Self { core: Arc::new(v) }) + } + + pub fn measure(&self, item: &impl Hash) { + // changing the hasher will break compatibility with previous measurements. 
+ self.record(BuildHasherDefault::::default().hash_one(item)); + } + + fn record(&self, hash: u64) { + let p = N.ilog2() as u8; + let j = hash & (N as u64 - 1); + let rho = (hash >> p).leading_zeros() as u8 + 1 - p; + self.core.shards[j as usize].fetch_max(rho, std::sync::atomic::Ordering::Relaxed); + } +} + +struct HyperLogLogCore { + shards: [AtomicU8; N], + desc: core::Desc, + labels: Vec, +} + +impl core::Collector for HyperLogLog { + fn desc(&self) -> Vec<&core::Desc> { + vec![&self.core.desc] + } + + fn collect(&self) -> Vec { + let mut m = proto::MetricFamily::default(); + m.set_name(self.core.desc.fq_name.clone()); + m.set_help(self.core.desc.help.clone()); + m.set_field_type(proto::MetricType::GAUGE); + + let mut metrics = Vec::new(); + self.core.collect_into(&mut metrics); + m.set_metric(metrics); + + vec![m] + } +} + +impl HyperLogLogCore { + fn collect_into(&self, metrics: &mut Vec) { + self.shards.iter().enumerate().for_each(|(i, x)| { + let mut shard_label = proto::LabelPair::default(); + shard_label.set_name("hll_shard".to_owned()); + shard_label.set_value(format!("{i}")); + + // We reset the counter to 0 so we can perform a cardinality measure over any time slice in prometheus. + + // This seems like it would be a race condition, + // but HLL is not impacted by a write in one shard happening in between. + // This is because in PromQL we will be implementing a harmonic mean of all buckets. + // we will also merge samples in a time series using `max by (hll_shard)`. + + // TODO: maybe we shouldn't reset this on every collect, instead, only after a time window. + // this would mean that a dev port-forwarding the metrics url won't break the sampling. 
+ let v = x.swap(0, std::sync::atomic::Ordering::Relaxed); + + let mut m = proto::Metric::default(); + let mut c = proto::Gauge::default(); + c.set_value(v as f64); + m.set_gauge(c); + + let mut labels = Vec::with_capacity(self.labels.len() + 1); + labels.extend_from_slice(&self.labels); + labels.push(shard_label); + + m.set_label(labels); + metrics.push(m); + }) + } +} + +fn make_label_pairs( + desc: &core::Desc, + label_values: &[&str], +) -> prometheus::Result> { + if desc.variable_labels.len() != label_values.len() { + return Err(prometheus::Error::InconsistentCardinality { + expect: desc.variable_labels.len(), + got: label_values.len(), + }); + } + + let total_len = desc.variable_labels.len() + desc.const_label_pairs.len(); + if total_len == 0 { + return Ok(vec![]); + } + + if desc.variable_labels.is_empty() { + return Ok(desc.const_label_pairs.clone()); + } + + let mut label_pairs = Vec::with_capacity(total_len); + for (i, n) in desc.variable_labels.iter().enumerate() { + let mut label_pair = proto::LabelPair::default(); + label_pair.set_name(n.clone()); + label_pair.set_value(label_values[i].to_owned()); + label_pairs.push(label_pair); + } + + for label_pair in &desc.const_label_pairs { + label_pairs.push(label_pair.clone()); + } + label_pairs.sort(); + Ok(label_pairs) +} + +#[cfg(test)] +mod tests { + use std::collections::HashSet; + + use prometheus::{proto, Opts}; + use rand::{rngs::StdRng, Rng, SeedableRng}; + use rand_distr::{Distribution, Zipf}; + + use crate::HyperLogLogVec; + + fn collect(hll: &HyperLogLogVec<32>) -> Vec { + let mut metrics = vec![]; + hll.core + .children + .read() + .unwrap() + .values() + .for_each(|c| c.core.collect_into(&mut metrics)); + metrics + } + fn get_cardinality(metrics: &[proto::Metric], filter: impl Fn(&proto::Metric) -> bool) -> f64 { + let mut buckets = [0.0; 32]; + for metric in metrics.chunks_exact(32) { + if filter(&metric[0]) { + for (i, m) in metric.iter().enumerate() { + buckets[i] = f64::max(buckets[i], 
m.get_gauge().get_value()); + } + } + } + + buckets + .into_iter() + .map(|f| 2.0f64.powf(-f)) + .sum::() + .recip() + * 0.697 + * 32.0 + * 32.0 + } + + fn test_cardinality(n: usize, dist: impl Distribution) -> ([usize; 3], [f64; 3]) { + let hll = HyperLogLogVec::<32>::new(Opts::new("foo", "bar"), &["x"]).unwrap(); + + let mut iter = StdRng::seed_from_u64(0x2024_0112).sample_iter(dist); + let mut set_a = HashSet::new(); + let mut set_b = HashSet::new(); + + for x in iter.by_ref().take(n) { + set_a.insert(x.to_bits()); + hll.with_label_values(&["a"]).measure(&x.to_bits()); + } + for x in iter.by_ref().take(n) { + set_b.insert(x.to_bits()); + hll.with_label_values(&["b"]).measure(&x.to_bits()); + } + let merge = &set_a | &set_b; + + let metrics = collect(&hll); + let len = get_cardinality(&metrics, |_| true); + let len_a = get_cardinality(&metrics, |l| l.get_label()[0].get_value() == "a"); + let len_b = get_cardinality(&metrics, |l| l.get_label()[0].get_value() == "b"); + + ([merge.len(), set_a.len(), set_b.len()], [len, len_a, len_b]) + } + + #[test] + fn test_cardinality_small() { + let (actual, estimate) = test_cardinality(100, Zipf::new(100, 1.2f64).unwrap()); + + assert_eq!(actual, [46, 30, 32]); + assert!(51.3 < estimate[0] && estimate[0] < 51.4); + assert!(44.0 < estimate[1] && estimate[1] < 44.1); + assert!(39.0 < estimate[2] && estimate[2] < 39.1); + } + + #[test] + fn test_cardinality_medium() { + let (actual, estimate) = test_cardinality(10000, Zipf::new(10000, 1.2f64).unwrap()); + + assert_eq!(actual, [2529, 1618, 1629]); + assert!(2309.1 < estimate[0] && estimate[0] < 2309.2); + assert!(1566.6 < estimate[1] && estimate[1] < 1566.7); + assert!(1629.5 < estimate[2] && estimate[2] < 1629.6); + } + + #[test] + fn test_cardinality_large() { + let (actual, estimate) = test_cardinality(1_000_000, Zipf::new(1_000_000, 1.2f64).unwrap()); + + assert_eq!(actual, [129077, 79579, 79630]); + assert!(126067.2 < estimate[0] && estimate[0] < 126067.3); + assert!(83076.8 
< estimate[1] && estimate[1] < 83076.9); + assert!(64251.2 < estimate[2] && estimate[2] < 64251.3); + } + + #[test] + fn test_cardinality_small2() { + let (actual, estimate) = test_cardinality(100, Zipf::new(200, 0.8f64).unwrap()); + + assert_eq!(actual, [92, 58, 60]); + assert!(116.1 < estimate[0] && estimate[0] < 116.2); + assert!(81.7 < estimate[1] && estimate[1] < 81.8); + assert!(69.3 < estimate[2] && estimate[2] < 69.4); + } + + #[test] + fn test_cardinality_medium2() { + let (actual, estimate) = test_cardinality(10000, Zipf::new(20000, 0.8f64).unwrap()); + + assert_eq!(actual, [8201, 5131, 5051]); + assert!(6846.4 < estimate[0] && estimate[0] < 6846.5); + assert!(5239.1 < estimate[1] && estimate[1] < 5239.2); + assert!(4292.8 < estimate[2] && estimate[2] < 4292.9); + } + + #[test] + fn test_cardinality_large2() { + let (actual, estimate) = test_cardinality(1_000_000, Zipf::new(2_000_000, 0.8f64).unwrap()); + + assert_eq!(actual, [777847, 482069, 482246]); + assert!(699437.4 < estimate[0] && estimate[0] < 699437.5); + assert!(374948.9 < estimate[1] && estimate[1] < 374949.0); + assert!(434609.7 < estimate[2] && estimate[2] < 434609.8); + } +} diff --git a/libs/metrics/src/lib.rs b/libs/metrics/src/lib.rs index d09ba11344..22b0a18933 100644 --- a/libs/metrics/src/lib.rs +++ b/libs/metrics/src/lib.rs @@ -28,7 +28,10 @@ use prometheus::{Registry, Result}; pub mod launch_timestamp; mod wrappers; pub use wrappers::{CountedReader, CountedWriter}; -pub mod metric_vec_duration; +mod hll; +pub use hll::{HyperLogLog, HyperLogLogVec}; +#[cfg(target_os = "linux")] +pub mod more_process_metrics; pub type UIntGauge = GenericGauge; pub type UIntGaugeVec = GenericGaugeVec; @@ -111,7 +114,6 @@ pub fn set_build_info_metric(revision: &str, build_tag: &str) { // performed by the process. // We know the size of the block, so we can determine the I/O bytes out of it. // The value might be not 100% exact, but should be fine for Prometheus metrics in this case. 
-#[allow(clippy::unnecessary_cast)] fn update_rusage_metrics() { let rusage_stats = get_rusage_stats(); @@ -198,6 +200,11 @@ impl GenericCounterPairVec

{ pub fn with_label_values(&self, vals: &[&str]) -> GenericCounterPair

{ self.get_metric_with_label_values(vals).unwrap() } + + pub fn remove_label_values(&self, res: &mut [Result<()>; 2], vals: &[&str]) { + res[0] = self.inc.remove_label_values(vals); + res[1] = self.dec.remove_label_values(vals); + } } impl GenericCounterPair

{ @@ -244,6 +251,15 @@ impl GenericCounterPair

{ } } +impl Clone for GenericCounterPair

{ + fn clone(&self) -> Self { + Self { + inc: self.inc.clone(), + dec: self.dec.clone(), + } + } +} + /// Guard returned by [`GenericCounterPair::guard`] pub struct GenericCounterPairGuard(GenericCounter

); diff --git a/libs/metrics/src/metric_vec_duration.rs b/libs/metrics/src/metric_vec_duration.rs deleted file mode 100644 index e9a0a65570..0000000000 --- a/libs/metrics/src/metric_vec_duration.rs +++ /dev/null @@ -1,23 +0,0 @@ -//! Helpers for observing duration on `HistogramVec` / `CounterVec` / `GaugeVec` / `MetricVec`. - -use std::{future::Future, time::Instant}; - -pub trait DurationResultObserver { - fn observe_result(&self, res: &Result, duration: std::time::Duration); -} - -pub async fn observe_async_block_duration_by_result< - T, - E, - F: Future>, - O: DurationResultObserver, ->( - observer: &O, - block: F, -) -> Result { - let start = Instant::now(); - let result = block.await; - let duration = start.elapsed(); - observer.observe_result(&result, duration); - result -} diff --git a/libs/metrics/src/more_process_metrics.rs b/libs/metrics/src/more_process_metrics.rs new file mode 100644 index 0000000000..920724fdec --- /dev/null +++ b/libs/metrics/src/more_process_metrics.rs @@ -0,0 +1,54 @@ +//! process metrics that the [`::prometheus`] crate doesn't provide. + +// This module has heavy inspiration from the prometheus crate's `process_collector.rs`. 
+ +use crate::UIntGauge; + +pub struct Collector { + descs: Vec, + vmlck: crate::UIntGauge, +} + +const NMETRICS: usize = 1; + +impl prometheus::core::Collector for Collector { + fn desc(&self) -> Vec<&prometheus::core::Desc> { + self.descs.iter().collect() + } + + fn collect(&self) -> Vec { + let Ok(myself) = procfs::process::Process::myself() else { + return vec![]; + }; + let mut mfs = Vec::with_capacity(NMETRICS); + if let Ok(status) = myself.status() { + if let Some(vmlck) = status.vmlck { + self.vmlck.set(vmlck); + mfs.extend(self.vmlck.collect()) + } + } + mfs + } +} + +impl Collector { + pub fn new() -> Self { + let mut descs = Vec::new(); + + let vmlck = + UIntGauge::new("libmetrics_process_status_vmlck", "/proc/self/status vmlck").unwrap(); + descs.extend( + prometheus::core::Collector::desc(&vmlck) + .into_iter() + .cloned(), + ); + + Self { descs, vmlck } + } +} + +impl Default for Collector { + fn default() -> Self { + Self::new() + } +} diff --git a/libs/pageserver_api/Cargo.toml b/libs/pageserver_api/Cargo.toml index 96c6c10d3e..3bba89c76d 100644 --- a/libs/pageserver_api/Cargo.toml +++ b/libs/pageserver_api/Cargo.toml @@ -18,8 +18,11 @@ enum-map.workspace = true strum.workspace = true strum_macros.workspace = true hex.workspace = true +humantime.workspace = true thiserror.workspace = true humantime-serde.workspace = true +chrono.workspace = true +itertools.workspace = true workspace_hack.workspace = true diff --git a/libs/pageserver_api/src/controller_api.rs b/libs/pageserver_api/src/controller_api.rs new file mode 100644 index 0000000000..c172354e9f --- /dev/null +++ b/libs/pageserver_api/src/controller_api.rs @@ -0,0 +1,167 @@ +use std::str::FromStr; + +/// Request/response types for the storage controller +/// API (`/control/v1` prefix). 
Implemented by the server +/// in [`attachment_service::http`] +use serde::{Deserialize, Serialize}; +use utils::id::NodeId; + +use crate::{models::ShardParameters, shard::TenantShardId}; + +#[derive(Serialize, Deserialize)] +pub struct TenantCreateResponseShard { + pub shard_id: TenantShardId, + pub node_id: NodeId, + pub generation: u32, +} + +#[derive(Serialize, Deserialize)] +pub struct TenantCreateResponse { + pub shards: Vec, +} + +#[derive(Serialize, Deserialize)] +pub struct NodeRegisterRequest { + pub node_id: NodeId, + + pub listen_pg_addr: String, + pub listen_pg_port: u16, + + pub listen_http_addr: String, + pub listen_http_port: u16, +} + +#[derive(Serialize, Deserialize)] +pub struct NodeConfigureRequest { + pub node_id: NodeId, + + pub availability: Option, + pub scheduling: Option, +} + +#[derive(Serialize, Deserialize, Debug)] +pub struct TenantLocateResponseShard { + pub shard_id: TenantShardId, + pub node_id: NodeId, + + pub listen_pg_addr: String, + pub listen_pg_port: u16, + + pub listen_http_addr: String, + pub listen_http_port: u16, +} + +#[derive(Serialize, Deserialize)] +pub struct TenantLocateResponse { + pub shards: Vec, + pub shard_params: ShardParameters, +} + +/// Explicitly migrating a particular shard is a low level operation +/// TODO: higher level "Reschedule tenant" operation where the request +/// specifies some constraints, e.g. asking it to get off particular node(s) +#[derive(Serialize, Deserialize, Debug)] +pub struct TenantShardMigrateRequest { + pub tenant_shard_id: TenantShardId, + pub node_id: NodeId, +} + +#[derive(Serialize, Deserialize, Clone, Copy, Eq, PartialEq)] +pub enum NodeAvailability { + // Normal, happy state + Active, + // Offline: Tenants shouldn't try to attach here, but they may assume that their + // secondary locations on this node still exist. Newly added nodes are in this + // state until we successfully contact them. 
+ Offline, +} + +impl FromStr for NodeAvailability { + type Err = anyhow::Error; + + fn from_str(s: &str) -> Result { + match s { + "active" => Ok(Self::Active), + "offline" => Ok(Self::Offline), + _ => Err(anyhow::anyhow!("Unknown availability state '{s}'")), + } + } +} + +#[derive(Serialize, Deserialize, Clone, Copy, Eq, PartialEq)] +pub enum NodeSchedulingPolicy { + Active, + Filling, + Pause, + Draining, +} + +impl FromStr for NodeSchedulingPolicy { + type Err = anyhow::Error; + + fn from_str(s: &str) -> Result { + match s { + "active" => Ok(Self::Active), + "filling" => Ok(Self::Filling), + "pause" => Ok(Self::Pause), + "draining" => Ok(Self::Draining), + _ => Err(anyhow::anyhow!("Unknown scheduling state '{s}'")), + } + } +} + +impl From for String { + fn from(value: NodeSchedulingPolicy) -> String { + use NodeSchedulingPolicy::*; + match value { + Active => "active", + Filling => "filling", + Pause => "pause", + Draining => "draining", + } + .to_string() + } +} + +/// Controls how tenant shards are mapped to locations on pageservers, e.g. whether +/// to create secondary locations. +#[derive(Clone, Serialize, Deserialize, Debug, PartialEq, Eq)] +pub enum PlacementPolicy { + /// Cheapest way to attach a tenant: just one pageserver, no secondary + Single, + /// Production-ready way to attach a tenant: one attached pageserver and + /// some number of secondaries. + Double(usize), + /// Create one secondary mode locations. This is useful when onboarding + /// a tenant, or for an idle tenant that we might want to bring online quickly. + Secondary, + + /// Do not attach to any pageservers. This is appropriate for tenants that + /// have been idle for a long time, where we do not mind some delay in making + /// them available in future. 
+ Detached, +} + +#[derive(Serialize, Deserialize, Debug)] +pub struct TenantShardMigrateResponse {} + +#[cfg(test)] +mod test { + use super::*; + use serde_json; + + /// Check stability of PlacementPolicy's serialization + #[test] + fn placement_policy_encoding() -> anyhow::Result<()> { + let v = PlacementPolicy::Double(1); + let encoded = serde_json::to_string(&v)?; + assert_eq!(encoded, "{\"Double\":1}"); + assert_eq!(serde_json::from_str::(&encoded)?, v); + + let v = PlacementPolicy::Single; + let encoded = serde_json::to_string(&v)?; + assert_eq!(encoded, "\"Single\""); + assert_eq!(serde_json::from_str::(&encoded)?, v); + Ok(()) + } +} diff --git a/libs/pageserver_api/src/keyspace.rs b/libs/pageserver_api/src/keyspace.rs index 2316acb616..05fa4562e1 100644 --- a/libs/pageserver_api/src/keyspace.rs +++ b/libs/pageserver_api/src/keyspace.rs @@ -2,6 +2,7 @@ use postgres_ffi::BLCKSZ; use std::ops::Range; use crate::key::Key; +use itertools::Itertools; /// /// Represents a set of Keys, in a compact form. @@ -63,16 +64,111 @@ impl KeySpace { KeyPartitioning { parts } } + /// Merge another keyspace into the current one. + /// Note: the keyspaces must not ovelap (enforced via assertions) + pub fn merge(&mut self, other: &KeySpace) { + let all_ranges = self + .ranges + .iter() + .merge_by(other.ranges.iter(), |lhs, rhs| lhs.start < rhs.start); + + let mut accum = KeySpaceAccum::new(); + let mut prev: Option<&Range> = None; + for range in all_ranges { + if let Some(prev) = prev { + let overlap = + std::cmp::max(range.start, prev.start) < std::cmp::min(range.end, prev.end); + assert!( + !overlap, + "Attempt to merge ovelapping keyspaces: {:?} overlaps {:?}", + prev, range + ); + } + + accum.add_range(range.clone()); + prev = Some(range); + } + + self.ranges = accum.to_keyspace().ranges; + } + + /// Remove all keys in `other` from `self`. + /// This can involve splitting or removing of existing ranges. 
+ pub fn remove_overlapping_with(&mut self, other: &KeySpace) { + let (self_start, self_end) = match (self.start(), self.end()) { + (Some(start), Some(end)) => (start, end), + _ => { + // self is empty + return; + } + }; + + // Key spaces are sorted by definition, so skip ahead to the first + // potentially intersecting range. Similarly, ignore ranges that start + // after the current keyspace ends. + let other_ranges = other + .ranges + .iter() + .skip_while(|range| self_start >= range.end) + .take_while(|range| self_end > range.start); + + for range in other_ranges { + while let Some(overlap_at) = self.overlaps_at(range) { + let overlapped = self.ranges[overlap_at].clone(); + + if overlapped.start < range.start && overlapped.end <= range.end { + // Higher part of the range is completely overlapped. + self.ranges[overlap_at].end = range.start; + } + if overlapped.start >= range.start && overlapped.end > range.end { + // Lower part of the range is completely overlapped. + self.ranges[overlap_at].start = range.end; + } + if overlapped.start < range.start && overlapped.end > range.end { + // Middle part of the range is overlapped. 
+ self.ranges[overlap_at].end = range.start; + self.ranges + .insert(overlap_at + 1, range.end..overlapped.end); + } + if overlapped.start >= range.start && overlapped.end <= range.end { + // Whole range is overlapped + self.ranges.remove(overlap_at); + } + } + } + } + + pub fn start(&self) -> Option { + self.ranges.first().map(|range| range.start) + } + + pub fn end(&self) -> Option { + self.ranges.last().map(|range| range.end) + } + + #[allow(unused)] + pub fn total_size(&self) -> usize { + self.ranges + .iter() + .map(|range| key_range_size(range) as usize) + .sum() + } + + fn overlaps_at(&self, range: &Range) -> Option { + match self.ranges.binary_search_by_key(&range.end, |r| r.start) { + Ok(0) => None, + Err(0) => None, + Ok(index) if self.ranges[index - 1].end > range.start => Some(index - 1), + Err(index) if self.ranges[index - 1].end > range.start => Some(index - 1), + _ => None, + } + } + /// /// Check if key space contains overlapping range /// pub fn overlaps(&self, range: &Range) -> bool { - match self.ranges.binary_search_by_key(&range.end, |r| r.start) { - Ok(0) => false, - Err(0) => false, - Ok(index) => self.ranges[index - 1].end > range.start, - Err(index) => self.ranges[index - 1].end > range.start, - } + self.overlaps_at(range).is_some() } } @@ -152,16 +248,7 @@ impl KeySpaceAccum { } pub fn consume_keyspace(&mut self) -> KeySpace { - if let Some(accum) = self.accum.take() { - self.ranges.push(accum); - } - - let mut prev_accum = KeySpaceAccum::new(); - std::mem::swap(self, &mut prev_accum); - - KeySpace { - ranges: prev_accum.ranges, - } + std::mem::take(self).to_keyspace() } pub fn size(&self) -> u64 { @@ -211,8 +298,16 @@ impl KeySpaceRandomAccum { } KeySpace { ranges } } + + pub fn consume_keyspace(&mut self) -> KeySpace { + let mut prev_accum = KeySpaceRandomAccum::new(); + std::mem::swap(self, &mut prev_accum); + + prev_accum.to_keyspace() + } } +#[inline(always)] pub fn key_range_size(key_range: &Range) -> u32 { let start = 
key_range.start; let end = key_range.end; @@ -441,4 +536,118 @@ mod tests { // xxxxxxxxxxx assert!(ks.overlaps(&kr(0..30))); // XXXXX This fails currently! } + + #[test] + fn test_remove_full_overlapps() { + let mut key_space1 = KeySpace { + ranges: vec![ + Key::from_i128(1)..Key::from_i128(4), + Key::from_i128(5)..Key::from_i128(8), + Key::from_i128(10)..Key::from_i128(12), + ], + }; + let key_space2 = KeySpace { + ranges: vec![ + Key::from_i128(2)..Key::from_i128(3), + Key::from_i128(6)..Key::from_i128(7), + Key::from_i128(11)..Key::from_i128(13), + ], + }; + key_space1.remove_overlapping_with(&key_space2); + assert_eq!( + key_space1.ranges, + vec![ + Key::from_i128(1)..Key::from_i128(2), + Key::from_i128(3)..Key::from_i128(4), + Key::from_i128(5)..Key::from_i128(6), + Key::from_i128(7)..Key::from_i128(8), + Key::from_i128(10)..Key::from_i128(11) + ] + ); + } + + #[test] + fn test_remove_partial_overlaps() { + // Test partial ovelaps + let mut key_space1 = KeySpace { + ranges: vec![ + Key::from_i128(1)..Key::from_i128(5), + Key::from_i128(7)..Key::from_i128(10), + Key::from_i128(12)..Key::from_i128(15), + ], + }; + let key_space2 = KeySpace { + ranges: vec![ + Key::from_i128(3)..Key::from_i128(6), + Key::from_i128(8)..Key::from_i128(11), + Key::from_i128(14)..Key::from_i128(17), + ], + }; + key_space1.remove_overlapping_with(&key_space2); + assert_eq!( + key_space1.ranges, + vec![ + Key::from_i128(1)..Key::from_i128(3), + Key::from_i128(7)..Key::from_i128(8), + Key::from_i128(12)..Key::from_i128(14), + ] + ); + } + + #[test] + fn test_remove_no_overlaps() { + let mut key_space1 = KeySpace { + ranges: vec![ + Key::from_i128(1)..Key::from_i128(5), + Key::from_i128(7)..Key::from_i128(10), + Key::from_i128(12)..Key::from_i128(15), + ], + }; + let key_space2 = KeySpace { + ranges: vec![ + Key::from_i128(6)..Key::from_i128(7), + Key::from_i128(11)..Key::from_i128(12), + Key::from_i128(15)..Key::from_i128(17), + ], + }; + key_space1.remove_overlapping_with(&key_space2); 
+ assert_eq!( + key_space1.ranges, + vec![ + Key::from_i128(1)..Key::from_i128(5), + Key::from_i128(7)..Key::from_i128(10), + Key::from_i128(12)..Key::from_i128(15), + ] + ); + } + + #[test] + fn test_remove_one_range_overlaps_multiple() { + let mut key_space1 = KeySpace { + ranges: vec![ + Key::from_i128(1)..Key::from_i128(3), + Key::from_i128(3)..Key::from_i128(6), + Key::from_i128(6)..Key::from_i128(10), + Key::from_i128(12)..Key::from_i128(15), + Key::from_i128(17)..Key::from_i128(20), + Key::from_i128(20)..Key::from_i128(30), + Key::from_i128(30)..Key::from_i128(40), + ], + }; + let key_space2 = KeySpace { + ranges: vec![Key::from_i128(9)..Key::from_i128(19)], + }; + key_space1.remove_overlapping_with(&key_space2); + assert_eq!( + key_space1.ranges, + vec![ + Key::from_i128(1)..Key::from_i128(3), + Key::from_i128(3)..Key::from_i128(6), + Key::from_i128(6)..Key::from_i128(9), + Key::from_i128(19)..Key::from_i128(20), + Key::from_i128(20)..Key::from_i128(30), + Key::from_i128(30)..Key::from_i128(40), + ] + ); + } } diff --git a/libs/pageserver_api/src/lib.rs b/libs/pageserver_api/src/lib.rs index b236b93428..1b948d60c3 100644 --- a/libs/pageserver_api/src/lib.rs +++ b/libs/pageserver_api/src/lib.rs @@ -2,13 +2,14 @@ #![deny(clippy::undocumented_unsafe_blocks)] use const_format::formatcp; -/// Public API types -pub mod control_api; +pub mod controller_api; pub mod key; pub mod keyspace; pub mod models; pub mod reltag; pub mod shard; +/// Public API types +pub mod upcall_api; pub const DEFAULT_PG_LISTEN_PORT: u16 = 64000; pub const DEFAULT_PG_LISTEN_ADDR: &str = formatcp!("127.0.0.1:{DEFAULT_PG_LISTEN_PORT}"); diff --git a/libs/pageserver_api/src/models.rs b/libs/pageserver_api/src/models.rs index 86d2c2a7ca..a96cc09158 100644 --- a/libs/pageserver_api/src/models.rs +++ b/libs/pageserver_api/src/models.rs @@ -1,4 +1,7 @@ pub mod partitioning; +pub mod utilization; + +pub use utilization::PageserverUtilization; use std::{ collections::HashMap, @@ -8,9 +11,9 @@ use 
std::{ }; use byteorder::{BigEndian, ReadBytesExt}; +use postgres_ffi::BLCKSZ; use serde::{Deserialize, Serialize}; use serde_with::serde_as; -use strum_macros; use utils::{ completion, history_buffer::HistoryBufferWithDropCounter, @@ -18,6 +21,7 @@ use utils::{ lsn::Lsn, }; +use crate::controller_api::PlacementPolicy; use crate::{ reltag::RelTag, shard::{ShardCount, ShardStripeSize, TenantShardId}, @@ -179,7 +183,7 @@ pub enum TimelineState { Broken { reason: String, backtrace: String }, } -#[derive(Serialize, Deserialize)] +#[derive(Serialize, Deserialize, Clone)] pub struct TimelineCreateRequest { pub new_timeline_id: TimelineId, #[serde(default)] @@ -191,6 +195,23 @@ pub struct TimelineCreateRequest { pub pg_version: Option, } +#[derive(Serialize, Deserialize)] +pub struct TenantShardSplitRequest { + pub new_shard_count: u8, + + // A tenant's stripe size is only meaningful the first time their shard count goes + // above 1: therefore during a split from 1->N shards, we may modify the stripe size. + // + // If this is set while the stripe count is being increased from an already >1 value, + // then the request will fail with 400. + pub new_stripe_size: Option, +} + +#[derive(Serialize, Deserialize)] +pub struct TenantShardSplitResponse { + pub new_shards: Vec, +} + /// Parameters that apply to all shards in a tenant. Used during tenant creation. 
#[derive(Serialize, Deserialize, Debug)] #[serde(deny_unknown_fields)] @@ -203,14 +224,14 @@ impl ShardParameters { pub const DEFAULT_STRIPE_SIZE: ShardStripeSize = ShardStripeSize(256 * 1024 / 8); pub fn is_unsharded(&self) -> bool { - self.count == ShardCount(0) + self.count.is_unsharded() } } impl Default for ShardParameters { fn default() -> Self { Self { - count: ShardCount(0), + count: ShardCount::new(0), stripe_size: Self::DEFAULT_STRIPE_SIZE, } } @@ -229,6 +250,11 @@ pub struct TenantCreateRequest { #[serde(skip_serializing_if = "ShardParameters::is_unsharded")] pub shard_parameters: ShardParameters, + // This parameter is only meaningful in requests sent to the storage controller + #[serde(default)] + #[serde(skip_serializing_if = "Option::is_none")] + pub placement_policy: Option, + #[serde(flatten)] pub config: TenantConfig, // as we have a flattened field, we should reject all unknown fields in it } @@ -258,6 +284,8 @@ pub struct TenantConfig { pub compaction_target_size: Option, pub compaction_period: Option, pub compaction_threshold: Option, + // defer parsing compaction_algorithm, like eviction_policy + pub compaction_algorithm: Option, pub gc_horizon: Option, pub gc_period: Option, pub image_creation_threshold: Option, @@ -269,8 +297,9 @@ pub struct TenantConfig { pub eviction_policy: Option, pub min_resident_size_override: Option, pub evictions_low_residence_duration_metric_threshold: Option, - pub gc_feedback: Option, pub heatmap_period: Option, + pub lazy_slru_download: Option, + pub timeline_get_throttle: Option, } #[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize)] @@ -278,6 +307,7 @@ pub struct TenantConfig { pub enum EvictionPolicy { NoEviction, LayerAccessThreshold(EvictionPolicyLayerAccessThreshold), + OnlyImitiate(EvictionPolicyLayerAccessThreshold), } impl EvictionPolicy { @@ -285,10 +315,18 @@ impl EvictionPolicy { match self { EvictionPolicy::NoEviction => "NoEviction", EvictionPolicy::LayerAccessThreshold(_) => 
"LayerAccessThreshold", + EvictionPolicy::OnlyImitiate(_) => "OnlyImitiate", } } } +#[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize)] +#[serde(tag = "kind")] +pub enum CompactionAlgorithm { + Legacy, + Tiered, +} + #[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize)] pub struct EvictionPolicyLayerAccessThreshold { #[serde(with = "humantime_serde")] @@ -297,10 +335,39 @@ pub struct EvictionPolicyLayerAccessThreshold { pub threshold: Duration, } +#[derive(Debug, Serialize, Deserialize, Clone, PartialEq, Eq)] +pub struct ThrottleConfig { + pub task_kinds: Vec, // TaskKind + pub initial: usize, + #[serde(with = "humantime_serde")] + pub refill_interval: Duration, + pub refill_amount: NonZeroUsize, + pub max: usize, + pub fair: bool, +} + +impl ThrottleConfig { + pub fn disabled() -> Self { + Self { + task_kinds: vec![], // effectively disables the throttle + // other values don't matter with emtpy `task_kinds`. + initial: 0, + refill_interval: Duration::from_millis(1), + refill_amount: NonZeroUsize::new(1).unwrap(), + max: 1, + fair: true, + } + } + /// The requests per second allowed by the given config. + pub fn steady_rps(&self) -> f64 { + (self.refill_amount.get() as f64) / (self.refill_interval.as_secs_f64()) + } +} + /// A flattened analog of a `pagesever::tenant::LocationMode`, which /// lists out all possible states (and the virtual "Detached" state) /// in a flat form rather than using rust-style enums. 
-#[derive(Serialize, Deserialize, Debug, Clone, Eq, PartialEq)] +#[derive(Serialize, Deserialize, Debug, Clone, Copy, Eq, PartialEq)] pub enum LocationConfigMode { AttachedSingle, AttachedMulti, @@ -364,6 +431,27 @@ pub struct TenantLocationConfigRequest { pub config: LocationConfig, // as we have a flattened field, we should reject all unknown fields in it } +#[derive(Serialize, Deserialize, Debug)] +#[serde(deny_unknown_fields)] +pub struct TenantTimeTravelRequest { + pub shard_counts: Vec, +} + +#[derive(Serialize, Deserialize, Debug)] +#[serde(deny_unknown_fields)] +pub struct TenantShardLocation { + pub shard_id: TenantShardId, + pub node_id: NodeId, +} + +#[derive(Serialize, Deserialize, Debug)] +#[serde(deny_unknown_fields)] +pub struct TenantLocationConfigResponse { + pub shards: Vec, + // If the shards' ShardCount count is >1, stripe_size will be set. + pub stripe_size: Option, +} + #[derive(Serialize, Deserialize, Debug)] #[serde(deny_unknown_fields)] pub struct TenantConfigRequest { @@ -439,6 +527,8 @@ pub struct TenantDetails { #[serde(flatten)] pub tenant_info: TenantInfo, + pub walredo: Option, + pub timelines: Vec, } @@ -467,6 +557,8 @@ pub struct TimelineInfo { pub current_logical_size: u64, pub current_logical_size_is_accurate: bool, + pub directory_entries_counts: Vec, + /// Sum of the size of all layer files. /// If a layer is present in both local FS and S3, it counts only once. 
pub current_physical_size: Option, // is None when timeline is Unloaded @@ -626,6 +718,33 @@ pub struct TimelineGcRequest { pub gc_horizon: Option, } +#[derive(Debug, Clone, Serialize, Deserialize)] +pub struct WalRedoManagerStatus { + pub last_redo_at: Option>, + pub pid: Option, +} + +pub mod virtual_file { + #[derive( + Copy, + Clone, + PartialEq, + Eq, + Hash, + strum_macros::EnumString, + strum_macros::Display, + serde_with::DeserializeFromStr, + serde_with::SerializeDisplay, + Debug, + )] + #[strum(serialize_all = "kebab-case")] + pub enum IoEngineKind { + StdFs, + #[cfg(target_os = "linux")] + TokioEpollUring, + } +} + // Wrapped in libpq CopyData #[derive(PartialEq, Eq, Debug)] pub enum PagestreamFeMessage { @@ -633,6 +752,7 @@ pub enum PagestreamFeMessage { Nblocks(PagestreamNblocksRequest), GetPage(PagestreamGetPageRequest), DbSize(PagestreamDbSizeRequest), + GetSlruSegment(PagestreamGetSlruSegmentRequest), } // Wrapped in libpq CopyData @@ -643,6 +763,7 @@ pub enum PagestreamBeMessage { GetPage(PagestreamGetPageResponse), Error(PagestreamErrorResponse), DbSize(PagestreamDbSizeResponse), + GetSlruSegment(PagestreamGetSlruSegmentResponse), } // Keep in sync with `pagestore_client.h` @@ -653,6 +774,7 @@ enum PagestreamBeMessageTag { GetPage = 102, Error = 103, DbSize = 104, + GetSlruSegment = 105, } impl TryFrom for PagestreamBeMessageTag { type Error = u8; @@ -663,6 +785,7 @@ impl TryFrom for PagestreamBeMessageTag { 102 => Ok(PagestreamBeMessageTag::GetPage), 103 => Ok(PagestreamBeMessageTag::Error), 104 => Ok(PagestreamBeMessageTag::DbSize), + 105 => Ok(PagestreamBeMessageTag::GetSlruSegment), _ => Err(value), } } @@ -697,6 +820,14 @@ pub struct PagestreamDbSizeRequest { pub dbnode: u32, } +#[derive(Debug, PartialEq, Eq)] +pub struct PagestreamGetSlruSegmentRequest { + pub latest: bool, + pub lsn: Lsn, + pub kind: u8, + pub segno: u32, +} + #[derive(Debug)] pub struct PagestreamExistsResponse { pub exists: bool, @@ -712,6 +843,11 @@ pub struct 
PagestreamGetPageResponse { pub page: Bytes, } +#[derive(Debug)] +pub struct PagestreamGetSlruSegmentResponse { + pub segment: Bytes, +} + #[derive(Debug)] pub struct PagestreamErrorResponse { pub message: String, @@ -775,6 +911,14 @@ impl PagestreamFeMessage { bytes.put_u64(req.lsn.0); bytes.put_u32(req.dbnode); } + + Self::GetSlruSegment(req) => { + bytes.put_u8(4); + bytes.put_u8(u8::from(req.latest)); + bytes.put_u64(req.lsn.0); + bytes.put_u8(req.kind); + bytes.put_u32(req.segno); + } } bytes.into() @@ -825,6 +969,14 @@ impl PagestreamFeMessage { lsn: Lsn::from(body.read_u64::()?), dbnode: body.read_u32::()?, })), + 4 => Ok(PagestreamFeMessage::GetSlruSegment( + PagestreamGetSlruSegmentRequest { + latest: body.read_u8()? != 0, + lsn: Lsn::from(body.read_u64::()?), + kind: body.read_u8()?, + segno: body.read_u32::()?, + }, + )), _ => bail!("unknown smgr message tag: {:?}", msg_tag), } } @@ -860,6 +1012,12 @@ impl PagestreamBeMessage { bytes.put_u8(Tag::DbSize as u8); bytes.put_i64(resp.db_size); } + + Self::GetSlruSegment(resp) => { + bytes.put_u8(Tag::GetSlruSegment as u8); + bytes.put_u32((resp.segment.len() / BLCKSZ as usize) as u32); + bytes.put(&resp.segment[..]); + } } bytes.into() @@ -900,6 +1058,14 @@ impl PagestreamBeMessage { let db_size = buf.read_i64::()?; Self::DbSize(PagestreamDbSizeResponse { db_size }) } + Tag::GetSlruSegment => { + let n_blocks = buf.read_u32::()?; + let mut segment = vec![0; n_blocks as usize * BLCKSZ as usize]; + buf.read_exact(&mut segment)?; + Self::GetSlruSegment(PagestreamGetSlruSegmentResponse { + segment: segment.into(), + }) + } }; let remaining = buf.into_inner(); if !remaining.is_empty() { @@ -918,13 +1084,13 @@ impl PagestreamBeMessage { Self::GetPage(_) => "GetPage", Self::Error(_) => "Error", Self::DbSize(_) => "DbSize", + Self::GetSlruSegment(_) => "GetSlruSegment", } } } #[cfg(test)] mod tests { - use bytes::Buf; use serde_json::json; use super::*; diff --git a/libs/pageserver_api/src/models/utilization.rs 
b/libs/pageserver_api/src/models/utilization.rs new file mode 100644 index 0000000000..7195a12395 --- /dev/null +++ b/libs/pageserver_api/src/models/utilization.rs @@ -0,0 +1,70 @@ +use std::time::SystemTime; + +/// Pageserver current utilization and scoring for how good candidate the pageserver would be for +/// the next tenant. +/// +/// See and maintain pageserver openapi spec for `/v1/utilization_score` as the truth. +/// +/// `format: int64` fields must use `ser_saturating_u63` because openapi generated clients might +/// not handle full u64 values properly. +#[derive(serde::Serialize, Debug)] +pub struct PageserverUtilization { + /// Used disk space + #[serde(serialize_with = "ser_saturating_u63")] + pub disk_usage_bytes: u64, + /// Free disk space + #[serde(serialize_with = "ser_saturating_u63")] + pub free_space_bytes: u64, + /// Lower is better score for how good candidate for a next tenant would this pageserver be. + #[serde(serialize_with = "ser_saturating_u63")] + pub utilization_score: u64, + /// When was this snapshot captured, pageserver local time. + /// + /// Use millis to give confidence that the value is regenerated often enough. + #[serde(serialize_with = "ser_rfc3339_millis")] + pub captured_at: SystemTime, +} + +fn ser_rfc3339_millis( + ts: &SystemTime, + serializer: S, +) -> Result { + serializer.collect_str(&humantime::format_rfc3339_millis(*ts)) +} + +/// openapi knows only `format: int64`, so avoid outputting a non-parseable value by generated clients. +/// +/// Instead of newtype, use this because a newtype would get require handling deserializing values +/// with the highest bit set which is properly parsed by serde formats, but would create a +/// conundrum on how to handle and again serialize such values at type level. It will be a few +/// years until we can use more than `i64::MAX` bytes on a disk. 
+fn ser_saturating_u63(value: &u64, serializer: S) -> Result { + const MAX_FORMAT_INT64: u64 = i64::MAX as u64; + + let value = (*value).min(MAX_FORMAT_INT64); + + serializer.serialize_u64(value) +} + +#[cfg(test)] +mod tests { + use std::time::Duration; + + use super::*; + + #[test] + fn u64_max_is_serialized_as_u63_max() { + let doc = PageserverUtilization { + disk_usage_bytes: u64::MAX, + free_space_bytes: 0, + utilization_score: u64::MAX, + captured_at: SystemTime::UNIX_EPOCH + Duration::from_secs(1708509779), + }; + + let s = serde_json::to_string(&doc).unwrap(); + + let expected = r#"{"disk_usage_bytes":9223372036854775807,"free_space_bytes":0,"utilization_score":9223372036854775807,"captured_at":"2024-02-21T10:02:59.000Z"}"#; + + assert_eq!(s, expected); + } +} diff --git a/libs/pageserver_api/src/reltag.rs b/libs/pageserver_api/src/reltag.rs index 3f37af600d..38693ab847 100644 --- a/libs/pageserver_api/src/reltag.rs +++ b/libs/pageserver_api/src/reltag.rs @@ -123,9 +123,12 @@ impl RelTag { PartialOrd, Ord, strum_macros::EnumIter, + strum_macros::FromRepr, + enum_map::Enum, )] +#[repr(u8)] pub enum SlruKind { - Clog, + Clog = 0, MultiXactMembers, MultiXactOffsets, } diff --git a/libs/pageserver_api/src/shard.rs b/libs/pageserver_api/src/shard.rs index e27aad8156..a2a9165184 100644 --- a/libs/pageserver_api/src/shard.rs +++ b/libs/pageserver_api/src/shard.rs @@ -6,17 +6,47 @@ use crate::{ }; use hex::FromHex; use serde::{Deserialize, Serialize}; -use thiserror; use utils::id::TenantId; #[derive(Ord, PartialOrd, Eq, PartialEq, Clone, Copy, Serialize, Deserialize, Debug, Hash)] pub struct ShardNumber(pub u8); #[derive(Ord, PartialOrd, Eq, PartialEq, Clone, Copy, Serialize, Deserialize, Debug, Hash)] -pub struct ShardCount(pub u8); +pub struct ShardCount(u8); impl ShardCount { pub const MAX: Self = Self(u8::MAX); + + /// The internal value of a ShardCount may be zero, which means "1 shard, but use + /// legacy format for TenantShardId that excludes the shard 
suffix", also known + /// as `TenantShardId::unsharded`. + /// + /// This method returns the actual number of shards, i.e. if our internal value is + /// zero, we return 1 (unsharded tenants have 1 shard). + pub fn count(&self) -> u8 { + if self.0 > 0 { + self.0 + } else { + 1 + } + } + + /// The literal internal value: this is **not** the number of shards in the + /// tenant, as we have a special zero value for legacy unsharded tenants. Use + /// [`Self::count`] if you want to know the cardinality of shards. + pub fn literal(&self) -> u8 { + self.0 + } + + pub fn is_unsharded(&self) -> bool { + self.0 == 0 + } + + /// `v` may be zero, or the number of shards in the tenant. `v` is what + /// [`Self::literal`] would return. + pub fn new(val: u8) -> Self { + Self(val) + } } impl ShardNumber { @@ -86,14 +116,38 @@ impl TenantShardId { } pub fn is_unsharded(&self) -> bool { - self.shard_number == ShardNumber(0) && self.shard_count == ShardCount(0) + self.shard_number == ShardNumber(0) && self.shard_count.is_unsharded() } + + /// Convenience for dropping the tenant_id and just getting the ShardIndex: this + /// is useful when logging from code that is already in a span that includes tenant ID, to + /// keep messages reasonably terse. pub fn to_index(&self) -> ShardIndex { ShardIndex { shard_number: self.shard_number, shard_count: self.shard_count, } } + + /// Calculate the children of this TenantShardId when splitting the overall tenant into + /// the given number of shards. + pub fn split(&self, new_shard_count: ShardCount) -> Vec { + let effective_old_shard_count = std::cmp::max(self.shard_count.0, 1); + let mut child_shards = Vec::new(); + for shard_number in 0..ShardNumber(new_shard_count.0).0 { + // Key mapping is based on a round robin mapping of key hash modulo shard count, + // so our child shards are the ones which the same keys would map to. 
+ if shard_number % effective_old_shard_count == self.shard_number.0 { + child_shards.push(TenantShardId { + tenant_id: self.tenant_id, + shard_number: ShardNumber(shard_number), + shard_count: new_shard_count, + }) + } + } + + child_shards + } } /// Formatting helper @@ -447,10 +501,12 @@ impl ShardIdentity { pub fn is_key_disposable(&self, key: &Key) -> bool { if key_is_shard0(key) { // Q: Why can't we dispose of shard0 content if we're not shard 0? - // A: because the WAL ingestion logic currently ingests some shard 0 - // content on all shards, even though it's only read on shard 0. If we - // dropped it, then subsequent WAL ingest to these keys would encounter - // an error. + // A1: because the WAL ingestion logic currently ingests some shard 0 + // content on all shards, even though it's only read on shard 0. If we + // dropped it, then subsequent WAL ingest to these keys would encounter + // an error. + // A2: because key_is_shard0 also covers relation size keys, which are written + // on all shards even though they're only maintained accurately on shard 0. 
false } else { !self.is_key_local(key) @@ -599,10 +655,7 @@ fn key_to_shard_number(count: ShardCount, stripe_size: ShardStripeSize, key: &Ke #[cfg(test)] mod tests { - use std::str::FromStr; - - use bincode; - use utils::{id::TenantId, Hex}; + use utils::Hex; use super::*; @@ -793,4 +846,108 @@ mod tests { let shard = key_to_shard_number(ShardCount(10), DEFAULT_STRIPE_SIZE, &key); assert_eq!(shard, ShardNumber(8)); } + + #[test] + fn shard_id_split() { + let tenant_id = TenantId::generate(); + let parent = TenantShardId::unsharded(tenant_id); + + // Unsharded into 2 + assert_eq!( + parent.split(ShardCount(2)), + vec![ + TenantShardId { + tenant_id, + shard_count: ShardCount(2), + shard_number: ShardNumber(0) + }, + TenantShardId { + tenant_id, + shard_count: ShardCount(2), + shard_number: ShardNumber(1) + } + ] + ); + + // Unsharded into 4 + assert_eq!( + parent.split(ShardCount(4)), + vec![ + TenantShardId { + tenant_id, + shard_count: ShardCount(4), + shard_number: ShardNumber(0) + }, + TenantShardId { + tenant_id, + shard_count: ShardCount(4), + shard_number: ShardNumber(1) + }, + TenantShardId { + tenant_id, + shard_count: ShardCount(4), + shard_number: ShardNumber(2) + }, + TenantShardId { + tenant_id, + shard_count: ShardCount(4), + shard_number: ShardNumber(3) + } + ] + ); + + // count=1 into 2 (check this works the same as unsharded.) 
+ let parent = TenantShardId { + tenant_id, + shard_count: ShardCount(1), + shard_number: ShardNumber(0), + }; + assert_eq!( + parent.split(ShardCount(2)), + vec![ + TenantShardId { + tenant_id, + shard_count: ShardCount(2), + shard_number: ShardNumber(0) + }, + TenantShardId { + tenant_id, + shard_count: ShardCount(2), + shard_number: ShardNumber(1) + } + ] + ); + + // count=2 into count=8 + let parent = TenantShardId { + tenant_id, + shard_count: ShardCount(2), + shard_number: ShardNumber(1), + }; + assert_eq!( + parent.split(ShardCount(8)), + vec![ + TenantShardId { + tenant_id, + shard_count: ShardCount(8), + shard_number: ShardNumber(1) + }, + TenantShardId { + tenant_id, + shard_count: ShardCount(8), + shard_number: ShardNumber(3) + }, + TenantShardId { + tenant_id, + shard_count: ShardCount(8), + shard_number: ShardNumber(5) + }, + TenantShardId { + tenant_id, + shard_count: ShardCount(8), + shard_number: ShardNumber(7) + }, + ] + ); + } } diff --git a/libs/pageserver_api/src/control_api.rs b/libs/pageserver_api/src/upcall_api.rs similarity index 71% rename from libs/pageserver_api/src/control_api.rs rename to libs/pageserver_api/src/upcall_api.rs index 0acc3a7bb0..5472948091 100644 --- a/libs/pageserver_api/src/control_api.rs +++ b/libs/pageserver_api/src/upcall_api.rs @@ -6,11 +6,18 @@ use serde::{Deserialize, Serialize}; use utils::id::NodeId; -use crate::shard::TenantShardId; +use crate::{controller_api::NodeRegisterRequest, shard::TenantShardId}; +/// Upcall message sent by the pageserver to the configured `control_plane_api` on +/// startup. #[derive(Serialize, Deserialize)] pub struct ReAttachRequest { pub node_id: NodeId, + + /// Optional inline self-registration: this is useful with the storage controller, + /// if the node already has a node_id set. 
+ #[serde(skip_serializing_if = "Option::is_none", default)] + pub register: Option, } #[derive(Serialize, Deserialize)] diff --git a/libs/postgres_backend/src/lib.rs b/libs/postgres_backend/src/lib.rs index 73d25619c3..260018ad89 100644 --- a/libs/postgres_backend/src/lib.rs +++ b/libs/postgres_backend/src/lib.rs @@ -6,7 +6,6 @@ #![deny(clippy::undocumented_unsafe_blocks)] use anyhow::Context; use bytes::Bytes; -use futures::pin_mut; use serde::{Deserialize, Serialize}; use std::io::ErrorKind; use std::net::SocketAddr; @@ -378,8 +377,7 @@ impl PostgresBackend { &mut self, cx: &mut std::task::Context<'_>, ) -> Poll> { - let flush_fut = self.flush(); - pin_mut!(flush_fut); + let flush_fut = std::pin::pin!(self.flush()); flush_fut.poll(cx) } diff --git a/libs/postgres_backend/tests/simple_select.rs b/libs/postgres_backend/tests/simple_select.rs index e046fa5260..80df9db858 100644 --- a/libs/postgres_backend/tests/simple_select.rs +++ b/libs/postgres_backend/tests/simple_select.rs @@ -72,14 +72,19 @@ async fn simple_select() { } } -static KEY: Lazy = Lazy::new(|| { +static KEY: Lazy> = Lazy::new(|| { let mut cursor = Cursor::new(include_bytes!("key.pem")); - rustls::PrivateKey(rustls_pemfile::rsa_private_keys(&mut cursor).unwrap()[0].clone()) + let key = rustls_pemfile::rsa_private_keys(&mut cursor) + .next() + .unwrap() + .unwrap(); + rustls::pki_types::PrivateKeyDer::Pkcs1(key) }); -static CERT: Lazy = Lazy::new(|| { +static CERT: Lazy> = Lazy::new(|| { let mut cursor = Cursor::new(include_bytes!("cert.pem")); - rustls::Certificate(rustls_pemfile::certs(&mut cursor).unwrap()[0].clone()) + let cert = rustls_pemfile::certs(&mut cursor).next().unwrap().unwrap(); + cert }); // test that basic select with ssl works @@ -88,9 +93,8 @@ async fn simple_select_ssl() { let (client_sock, server_sock) = make_tcp_pair().await; let server_cfg = rustls::ServerConfig::builder() - .with_safe_defaults() .with_no_client_auth() - .with_single_cert(vec![CERT.clone()], KEY.clone()) + 
.with_single_cert(vec![CERT.clone()], KEY.clone_key()) .unwrap(); let tls_config = Some(Arc::new(server_cfg)); let pgbackend = @@ -102,10 +106,9 @@ async fn simple_select_ssl() { }); let client_cfg = rustls::ClientConfig::builder() - .with_safe_defaults() .with_root_certificates({ let mut store = rustls::RootCertStore::empty(); - store.add(&CERT).unwrap(); + store.add(CERT.clone()).unwrap(); store }) .with_no_client_auth(); diff --git a/libs/postgres_ffi/src/lib.rs b/libs/postgres_ffi/src/lib.rs index d10ebfe277..aa6845b9b1 100644 --- a/libs/postgres_ffi/src/lib.rs +++ b/libs/postgres_ffi/src/lib.rs @@ -3,7 +3,7 @@ #![allow(non_snake_case)] // bindgen creates some unsafe code with no doc comments. #![allow(clippy::missing_safety_doc)] -// noted at 1.63 that in many cases there's a u32 -> u32 transmutes in bindgen code. +// noted at 1.63 that in many cases there's u32 -> u32 transmutes in bindgen code. #![allow(clippy::useless_transmute)] // modules included with the postgres_ffi macro depend on the types of the specific version's // types, and trigger a too eager lint. 
diff --git a/libs/postgres_ffi/src/pg_constants.rs b/libs/postgres_ffi/src/pg_constants.rs index d59e0e4a15..2701ddf5e0 100644 --- a/libs/postgres_ffi/src/pg_constants.rs +++ b/libs/postgres_ffi/src/pg_constants.rs @@ -80,6 +80,9 @@ pub const XLOG_XACT_ABORT: u8 = 0x20; pub const XLOG_XACT_COMMIT_PREPARED: u8 = 0x30; pub const XLOG_XACT_ABORT_PREPARED: u8 = 0x40; +// From standbydefs.h +pub const XLOG_RUNNING_XACTS: u8 = 0x10; + // From srlu.h pub const SLRU_PAGES_PER_SEGMENT: u32 = 32; pub const SLRU_SEG_SIZE: usize = BLCKSZ as usize * SLRU_PAGES_PER_SEGMENT as usize; diff --git a/libs/postgres_ffi/src/xlog_utils.rs b/libs/postgres_ffi/src/xlog_utils.rs index 56ce9c901e..4a66a0ab1d 100644 --- a/libs/postgres_ffi/src/xlog_utils.rs +++ b/libs/postgres_ffi/src/xlog_utils.rs @@ -119,11 +119,6 @@ pub fn generate_pg_control( // Generate new pg_control needed for bootstrap checkpoint.redo = normalize_lsn(lsn, WAL_SEGMENT_SIZE).0; - //reset some fields we don't want to preserve - //TODO Check this. - //We may need to determine the value from twophase data. - checkpoint.oldestActiveXid = 0; - //save new values in pg_control pg_control.checkPoint = 0; pg_control.checkPointCopy = checkpoint; @@ -207,10 +202,16 @@ pub fn find_end_of_wal( let seg_offs = curr_lsn.segment_offset(wal_seg_size); segment.seek(SeekFrom::Start(seg_offs as u64))?; // loop inside segment - loop { + while curr_lsn.segment_number(wal_seg_size) == segno { let bytes_read = segment.read(&mut buf)?; if bytes_read == 0 { - break; // EOF + debug!( + "find_end_of_wal reached end at {:?}, EOF in segment {:?} at offset {}", + result, + seg_file_path, + curr_lsn.segment_offset(wal_seg_size) + ); + return Ok(result); } curr_lsn += bytes_read as u64; decoder.feed_bytes(&buf[0..bytes_read]); @@ -425,11 +426,11 @@ pub fn generate_wal_segment(segno: u64, system_id: u64, lsn: Lsn) -> Result, max_keys_per_list_response: Option, concurrency_limiter: ConcurrencyLimiter, + // Per-request timeout. Accessible for tests. 
+ pub timeout: Duration, } impl AzureBlobStorage { - pub fn new(azure_config: &AzureConfig) -> Result { + pub fn new(azure_config: &AzureConfig, timeout: Duration) -> Result { debug!( "Creating azure remote storage for azure container {}", azure_config.container_name @@ -78,6 +80,7 @@ impl AzureBlobStorage { prefix_in_container: azure_config.prefix_in_container.to_owned(), max_keys_per_list_response, concurrency_limiter: ConcurrencyLimiter::new(azure_config.concurrency_limit.get()), + timeout, }) } @@ -120,8 +123,11 @@ impl AzureBlobStorage { async fn download_for_builder( &self, builder: GetBlobBuilder, + cancel: &CancellationToken, ) -> Result { - let mut response = builder.into_stream(); + let kind = RequestKind::Get; + + let _permit = self.permit(kind, cancel).await?; let mut etag = None; let mut last_modified = None; @@ -129,39 +135,70 @@ impl AzureBlobStorage { // TODO give proper streaming response instead of buffering into RAM // https://github.com/neondatabase/neon/issues/5563 - let mut bufs = Vec::new(); - while let Some(part) = response.next().await { - let part = part.map_err(to_download_error)?; - let etag_str: &str = part.blob.properties.etag.as_ref(); - if etag.is_none() { - etag = Some(etag.unwrap_or_else(|| etag_str.to_owned())); + let download = async { + let response = builder + // convert to concrete Pageable + .into_stream() + // convert to TryStream + .into_stream() + .map_err(to_download_error); + + // apply per request timeout + let response = tokio_stream::StreamExt::timeout(response, self.timeout); + + // flatten + let response = response.map(|res| match res { + Ok(res) => res, + Err(_elapsed) => Err(DownloadError::Timeout), + }); + + let mut response = std::pin::pin!(response); + + let mut bufs = Vec::new(); + while let Some(part) = response.next().await { + let part = part?; + let etag_str: &str = part.blob.properties.etag.as_ref(); + if etag.is_none() { + etag = Some(etag.unwrap_or_else(|| etag_str.to_owned())); + } + if 
last_modified.is_none() { + last_modified = Some(part.blob.properties.last_modified.into()); + } + if let Some(blob_meta) = part.blob.metadata { + metadata.extend(blob_meta.iter().map(|(k, v)| (k.to_owned(), v.to_owned()))); + } + let data = part + .data + .collect() + .await + .map_err(|e| DownloadError::Other(e.into()))?; + bufs.push(data); } - if last_modified.is_none() { - last_modified = Some(part.blob.properties.last_modified.into()); - } - if let Some(blob_meta) = part.blob.metadata { - metadata.extend(blob_meta.iter().map(|(k, v)| (k.to_owned(), v.to_owned()))); - } - let data = part - .data - .collect() - .await - .map_err(|e| DownloadError::Other(e.into()))?; - bufs.push(data); + Ok(Download { + download_stream: Box::pin(futures::stream::iter(bufs.into_iter().map(Ok))), + etag, + last_modified, + metadata: Some(StorageMetadata(metadata)), + }) + }; + + tokio::select! { + bufs = download => bufs, + _ = cancel.cancelled() => Err(DownloadError::Cancelled), } - Ok(Download { - download_stream: Box::pin(futures::stream::iter(bufs.into_iter().map(Ok))), - etag, - last_modified, - metadata: Some(StorageMetadata(metadata)), - }) } - async fn permit(&self, kind: RequestKind) -> tokio::sync::SemaphorePermit<'_> { - self.concurrency_limiter - .acquire(kind) - .await - .expect("semaphore is never closed") + async fn permit( + &self, + kind: RequestKind, + cancel: &CancellationToken, + ) -> Result, Cancelled> { + let acquire = self.concurrency_limiter.acquire(kind); + + tokio::select! 
{ + permit = acquire => Ok(permit.expect("never closed")), + _ = cancel.cancelled() => Err(Cancelled), + } } } @@ -190,53 +227,88 @@ impl RemoteStorage for AzureBlobStorage { &self, prefix: Option<&RemotePath>, mode: ListingMode, + max_keys: Option, + cancel: &CancellationToken, ) -> anyhow::Result { - // get the passed prefix or if it is not set use prefix_in_bucket value - let list_prefix = prefix - .map(|p| self.relative_path_to_name(p)) - .or_else(|| self.prefix_in_container.clone()) - .map(|mut p| { - // required to end with a separator - // otherwise request will return only the entry of a prefix - if matches!(mode, ListingMode::WithDelimiter) - && !p.ends_with(REMOTE_STORAGE_PREFIX_SEPARATOR) - { - p.push(REMOTE_STORAGE_PREFIX_SEPARATOR); - } - p + let _permit = self.permit(RequestKind::List, cancel).await?; + + let op = async { + // get the passed prefix or if it is not set use prefix_in_bucket value + let list_prefix = prefix + .map(|p| self.relative_path_to_name(p)) + .or_else(|| self.prefix_in_container.clone()) + .map(|mut p| { + // required to end with a separator + // otherwise request will return only the entry of a prefix + if matches!(mode, ListingMode::WithDelimiter) + && !p.ends_with(REMOTE_STORAGE_PREFIX_SEPARATOR) + { + p.push(REMOTE_STORAGE_PREFIX_SEPARATOR); + } + p + }); + + let mut builder = self.client.list_blobs(); + + if let ListingMode::WithDelimiter = mode { + builder = builder.delimiter(REMOTE_STORAGE_PREFIX_SEPARATOR.to_string()); + } + + if let Some(prefix) = list_prefix { + builder = builder.prefix(Cow::from(prefix.to_owned())); + } + + if let Some(limit) = self.max_keys_per_list_response { + builder = builder.max_results(MaxResults::new(limit)); + } + + let response = builder.into_stream(); + let response = response.into_stream().map_err(to_download_error); + let response = tokio_stream::StreamExt::timeout(response, self.timeout); + let response = response.map(|res| match res { + Ok(res) => res, + Err(_elapsed) => 
Err(DownloadError::Timeout), }); - let mut builder = self.client.list_blobs(); + let mut response = std::pin::pin!(response); - if let ListingMode::WithDelimiter = mode { - builder = builder.delimiter(REMOTE_STORAGE_PREFIX_SEPARATOR.to_string()); + let mut res = Listing::default(); + + let mut max_keys = max_keys.map(|mk| mk.get()); + while let Some(entry) = response.next().await { + let entry = entry?; + let prefix_iter = entry + .blobs + .prefixes() + .map(|prefix| self.name_to_relative_path(&prefix.name)); + res.prefixes.extend(prefix_iter); + + let blob_iter = entry + .blobs + .blobs() + .map(|k| self.name_to_relative_path(&k.name)); + + for key in blob_iter { + res.keys.push(key); + + if let Some(mut mk) = max_keys { + assert!(mk > 0); + mk -= 1; + if mk == 0 { + return Ok(res); // limit reached + } + max_keys = Some(mk); + } + } + } + + Ok(res) + }; + + tokio::select! { + res = op => res, + _ = cancel.cancelled() => Err(DownloadError::Cancelled), } - - if let Some(prefix) = list_prefix { - builder = builder.prefix(Cow::from(prefix.to_owned())); - } - - if let Some(limit) = self.max_keys_per_list_response { - builder = builder.max_results(MaxResults::new(limit)); - } - - let mut response = builder.into_stream(); - let mut res = Listing::default(); - while let Some(l) = response.next().await { - let entry = l.map_err(to_download_error)?; - let prefix_iter = entry - .blobs - .prefixes() - .map(|prefix| self.name_to_relative_path(&prefix.name)); - res.prefixes.extend(prefix_iter); - - let blob_iter = entry - .blobs - .blobs() - .map(|k| self.name_to_relative_path(&k.name)); - res.keys.extend(blob_iter); - } - Ok(res) } async fn upload( @@ -245,35 +317,52 @@ impl RemoteStorage for AzureBlobStorage { data_size_bytes: usize, to: &RemotePath, metadata: Option, + cancel: &CancellationToken, ) -> anyhow::Result<()> { - let _permit = self.permit(RequestKind::Put).await; - let blob_client = self.client.blob_client(self.relative_path_to_name(to)); + let _permit = 
self.permit(RequestKind::Put, cancel).await?; - let from: Pin> + Send + Sync + 'static>> = - Box::pin(from); + let op = async { + let blob_client = self.client.blob_client(self.relative_path_to_name(to)); - let from = NonSeekableStream::new(from, data_size_bytes); + let from: Pin> + Send + Sync + 'static>> = + Box::pin(from); - let body = azure_core::Body::SeekableStream(Box::new(from)); + let from = NonSeekableStream::new(from, data_size_bytes); - let mut builder = blob_client.put_block_blob(body); + let body = azure_core::Body::SeekableStream(Box::new(from)); - if let Some(metadata) = metadata { - builder = builder.metadata(to_azure_metadata(metadata)); + let mut builder = blob_client.put_block_blob(body); + + if let Some(metadata) = metadata { + builder = builder.metadata(to_azure_metadata(metadata)); + } + + let fut = builder.into_future(); + let fut = tokio::time::timeout(self.timeout, fut); + + match fut.await { + Ok(Ok(_response)) => Ok(()), + Ok(Err(azure)) => Err(azure.into()), + Err(_timeout) => Err(TimeoutOrCancel::Cancel.into()), + } + }; + + tokio::select! 
{ + res = op => res, + _ = cancel.cancelled() => Err(TimeoutOrCancel::Cancel.into()), } - - let _response = builder.into_future().await?; - - Ok(()) } - async fn download(&self, from: &RemotePath) -> Result { - let _permit = self.permit(RequestKind::Get).await; + async fn download( + &self, + from: &RemotePath, + cancel: &CancellationToken, + ) -> Result { let blob_client = self.client.blob_client(self.relative_path_to_name(from)); let builder = blob_client.get(); - self.download_for_builder(builder).await + self.download_for_builder(builder, cancel).await } async fn download_byte_range( @@ -281,8 +370,8 @@ impl RemoteStorage for AzureBlobStorage { from: &RemotePath, start_inclusive: u64, end_exclusive: Option, + cancel: &CancellationToken, ) -> Result { - let _permit = self.permit(RequestKind::Get).await; let blob_client = self.client.blob_client(self.relative_path_to_name(from)); let mut builder = blob_client.get(); @@ -294,82 +383,113 @@ impl RemoteStorage for AzureBlobStorage { }; builder = builder.range(range); - self.download_for_builder(builder).await + self.download_for_builder(builder, cancel).await } - async fn delete(&self, path: &RemotePath) -> anyhow::Result<()> { - let _permit = self.permit(RequestKind::Delete).await; - let blob_client = self.client.blob_client(self.relative_path_to_name(path)); + async fn delete(&self, path: &RemotePath, cancel: &CancellationToken) -> anyhow::Result<()> { + self.delete_objects(std::array::from_ref(path), cancel) + .await + } - let builder = blob_client.delete(); + async fn delete_objects<'a>( + &self, + paths: &'a [RemotePath], + cancel: &CancellationToken, + ) -> anyhow::Result<()> { + let _permit = self.permit(RequestKind::Delete, cancel).await?; - match builder.into_future().await { - Ok(_response) => Ok(()), - Err(e) => { - if let Some(http_err) = e.as_http_error() { - if http_err.status() == StatusCode::NotFound { - return Ok(()); + let op = async { + // TODO batch requests are also not supported by the SDK + // 
https://github.com/Azure/azure-sdk-for-rust/issues/1068 + // https://github.com/Azure/azure-sdk-for-rust/issues/1249 + for path in paths { + let blob_client = self.client.blob_client(self.relative_path_to_name(path)); + + let request = blob_client.delete().into_future(); + + let res = tokio::time::timeout(self.timeout, request).await; + + match res { + Ok(Ok(_response)) => continue, + Ok(Err(e)) => { + if let Some(http_err) = e.as_http_error() { + if http_err.status() == StatusCode::NotFound { + continue; + } + } + return Err(e.into()); } + Err(_elapsed) => return Err(TimeoutOrCancel::Timeout.into()), } - Err(anyhow::Error::new(e)) } + + Ok(()) + }; + + tokio::select! { + res = op => res, + _ = cancel.cancelled() => Err(TimeoutOrCancel::Cancel.into()), } } - async fn delete_objects<'a>(&self, paths: &'a [RemotePath]) -> anyhow::Result<()> { - // Permit is already obtained by inner delete function + async fn copy( + &self, + from: &RemotePath, + to: &RemotePath, + cancel: &CancellationToken, + ) -> anyhow::Result<()> { + let _permit = self.permit(RequestKind::Copy, cancel).await?; - // TODO batch requests are also not supported by the SDK - // https://github.com/Azure/azure-sdk-for-rust/issues/1068 - // https://github.com/Azure/azure-sdk-for-rust/issues/1249 - for path in paths { - self.delete(path).await?; - } - Ok(()) - } + let timeout = tokio::time::sleep(self.timeout); - async fn copy(&self, from: &RemotePath, to: &RemotePath) -> anyhow::Result<()> { - let _permit = self.permit(RequestKind::Copy).await; - let blob_client = self.client.blob_client(self.relative_path_to_name(to)); + let mut copy_status = None; - let source_url = format!( - "{}/{}", - self.client.url()?, - self.relative_path_to_name(from) - ); - let builder = blob_client.copy(Url::from_str(&source_url)?); + let op = async { + let blob_client = self.client.blob_client(self.relative_path_to_name(to)); - let result = builder.into_future().await?; + let source_url = format!( + "{}/{}", + 
self.client.url()?, + self.relative_path_to_name(from) + ); - let mut copy_status = result.copy_status; - let start_time = Instant::now(); - const MAX_WAIT_TIME: Duration = Duration::from_secs(60); - loop { - match copy_status { - CopyStatus::Aborted => { - anyhow::bail!("Received abort for copy from {from} to {to}."); + let builder = blob_client.copy(Url::from_str(&source_url)?); + let copy = builder.into_future(); + + let result = copy.await?; + + copy_status = Some(result.copy_status); + loop { + match copy_status.as_ref().expect("we always set it to Some") { + CopyStatus::Aborted => { + anyhow::bail!("Received abort for copy from {from} to {to}."); + } + CopyStatus::Failed => { + anyhow::bail!("Received failure response for copy from {from} to {to}."); + } + CopyStatus::Success => return Ok(()), + CopyStatus::Pending => (), } - CopyStatus::Failed => { - anyhow::bail!("Received failure response for copy from {from} to {to}."); - } - CopyStatus::Success => return Ok(()), - CopyStatus::Pending => (), + // The copy is taking longer. Waiting a second and then re-trying. + // TODO estimate time based on copy_progress and adjust time based on that + tokio::time::sleep(Duration::from_millis(1000)).await; + let properties = blob_client.get_properties().into_future().await?; + let Some(status) = properties.blob.properties.copy_status else { + tracing::warn!("copy_status for copy is None!, from={from}, to={to}"); + return Ok(()); + }; + copy_status = Some(status); } - // The copy is taking longer. Waiting a second and then re-trying. 
- // TODO estimate time based on copy_progress and adjust time based on that - tokio::time::sleep(Duration::from_millis(1000)).await; - let properties = blob_client.get_properties().into_future().await?; - let Some(status) = properties.blob.properties.copy_status else { - tracing::warn!("copy_status for copy is None!, from={from}, to={to}"); - return Ok(()); - }; - if start_time.elapsed() > MAX_WAIT_TIME { - anyhow::bail!("Copy from from {from} to {to} took longer than limit MAX_WAIT_TIME={}s. copy_pogress={:?}.", - MAX_WAIT_TIME.as_secs_f32(), - properties.blob.properties.copy_progress, - ); - } - copy_status = status; + }; + + tokio::select! { + res = op => res, + _ = cancel.cancelled() => Err(anyhow::Error::new(TimeoutOrCancel::Cancel)), + _ = timeout => { + let e = anyhow::Error::new(TimeoutOrCancel::Timeout); + let e = e.context(format!("Timeout, last status: {copy_status:?}")); + Err(e) + }, } } @@ -378,13 +498,11 @@ impl RemoteStorage for AzureBlobStorage { _prefix: Option<&RemotePath>, _timestamp: SystemTime, _done_if_after: SystemTime, - _cancel: CancellationToken, - ) -> anyhow::Result<()> { + _cancel: &CancellationToken, + ) -> Result<(), TimeTravelError> { // TODO use Azure point in time recovery feature for this // https://learn.microsoft.com/en-us/azure/storage/blobs/point-in-time-restore-overview - Err(anyhow::anyhow!( - "time travel recovery for azure blob storage is not implemented" - )) + Err(TimeTravelError::Unimplemented) } } diff --git a/libs/remote_storage/src/error.rs b/libs/remote_storage/src/error.rs new file mode 100644 index 0000000000..66422853e1 --- /dev/null +++ b/libs/remote_storage/src/error.rs @@ -0,0 +1,200 @@ +/// Reasons for downloads or listings to fail. +#[derive(Debug)] +pub enum DownloadError { + /// Validation or other error happened due to user input. + BadInput(anyhow::Error), + /// The file was not found in the remote storage. 
+ NotFound, + /// A cancellation token aborted the download, typically during + /// tenant detach or process shutdown. + Cancelled, + /// A timeout happened while executing the request. Possible reasons: + /// - stuck tcp connection + /// + /// Concurrency control is not timed within timeout. + Timeout, + /// The file was found in the remote storage, but the download failed. + Other(anyhow::Error), +} + +impl std::fmt::Display for DownloadError { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + match self { + DownloadError::BadInput(e) => { + write!(f, "Failed to download a remote file due to user input: {e}") + } + DownloadError::NotFound => write!(f, "No file found for the remote object id given"), + DownloadError::Cancelled => write!(f, "Cancelled, shutting down"), + DownloadError::Timeout => write!(f, "timeout"), + DownloadError::Other(e) => write!(f, "Failed to download a remote file: {e:?}"), + } + } +} + +impl std::error::Error for DownloadError {} + +impl DownloadError { + /// Returns true if the error should not be retried with backoff + pub fn is_permanent(&self) -> bool { + use DownloadError::*; + match self { + BadInput(_) | NotFound | Cancelled => true, + Timeout | Other(_) => false, + } + } +} + +impl From for DownloadError { + fn from(value: std::io::Error) -> Self { + let needs_unwrap = value.kind() == std::io::ErrorKind::Other + && value + .get_ref() + .and_then(|x| x.downcast_ref::()) + .is_some(); + + if needs_unwrap { + *value + .into_inner() + .expect("just checked") + .downcast::() + .expect("just checked") + } else { + DownloadError::Other(value.into()) + } + } +} + +#[derive(Debug)] +pub enum TimeTravelError { + /// Validation or other error happened due to user input. + BadInput(anyhow::Error), + /// The used remote storage does not have time travel recovery implemented + Unimplemented, + /// The number of versions/deletion markers is above our limit. 
+ TooManyVersions, + /// A cancellation token aborted the process, typically during + /// request closure or process shutdown. + Cancelled, + /// Other errors + Other(anyhow::Error), +} + +impl std::fmt::Display for TimeTravelError { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + match self { + TimeTravelError::BadInput(e) => { + write!( + f, + "Failed to time travel recover a prefix due to user input: {e}" + ) + } + TimeTravelError::Unimplemented => write!( + f, + "time travel recovery is not implemented for the current storage backend" + ), + TimeTravelError::Cancelled => write!(f, "Cancelled, shutting down"), + TimeTravelError::TooManyVersions => { + write!(f, "Number of versions/delete markers above limit") + } + TimeTravelError::Other(e) => write!(f, "Failed to time travel recover a prefix: {e:?}"), + } + } +} + +impl std::error::Error for TimeTravelError {} + +/// Plain cancelled error. +/// +/// By design this type does not not implement `std::error::Error` so it cannot be put as the root +/// cause of `std::io::Error` or `anyhow::Error`. It should never need to be exposed out of this +/// crate. +/// +/// It exists to implement permit acquiring in `{Download,TimeTravel}Error` and `anyhow::Error` returning +/// operations and ensuring that those get converted to proper versions with just `?`. +#[derive(Debug)] +pub(crate) struct Cancelled; + +impl From for anyhow::Error { + fn from(_: Cancelled) -> Self { + anyhow::Error::new(TimeoutOrCancel::Cancel) + } +} + +impl From for TimeTravelError { + fn from(_: Cancelled) -> Self { + TimeTravelError::Cancelled + } +} + +impl From for TimeoutOrCancel { + fn from(_: Cancelled) -> Self { + TimeoutOrCancel::Cancel + } +} + +impl From for DownloadError { + fn from(_: Cancelled) -> Self { + DownloadError::Cancelled + } +} + +/// This type is used at as the root cause for timeouts and cancellations with `anyhow::Error` returning +/// RemoteStorage methods. 
+/// +/// For use with `utils::backoff::retry` and `anyhow::Error` returning operations there is +/// `TimeoutOrCancel::caused_by_cancel` method to query "proper form" errors. +#[derive(Debug)] +pub enum TimeoutOrCancel { + Timeout, + Cancel, +} + +impl std::fmt::Display for TimeoutOrCancel { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + use TimeoutOrCancel::*; + match self { + Timeout => write!(f, "timeout"), + Cancel => write!(f, "cancel"), + } + } +} + +impl std::error::Error for TimeoutOrCancel {} + +impl TimeoutOrCancel { + /// Returns true if the error was caused by [`TimeoutOrCancel::Cancel`]. + pub fn caused_by_cancel(error: &anyhow::Error) -> bool { + error + .root_cause() + .downcast_ref::() + .is_some_and(Self::is_cancel) + } + + pub fn is_cancel(&self) -> bool { + matches!(self, TimeoutOrCancel::Cancel) + } + + pub fn is_timeout(&self) -> bool { + matches!(self, TimeoutOrCancel::Timeout) + } +} + +/// This conversion is used when [`crate::support::DownloadStream`] notices a cancellation or +/// timeout to wrap it in an `std::io::Error`. 
+impl From for std::io::Error { + fn from(value: TimeoutOrCancel) -> Self { + let e = DownloadError::from(value); + std::io::Error::other(e) + } +} + +impl From for DownloadError { + fn from(value: TimeoutOrCancel) -> Self { + use TimeoutOrCancel::*; + + match value { + Timeout => DownloadError::Timeout, + Cancel => DownloadError::Cancelled, + } + } +} diff --git a/libs/remote_storage/src/lib.rs b/libs/remote_storage/src/lib.rs index bf9c51ad1a..b0b69f9155 100644 --- a/libs/remote_storage/src/lib.rs +++ b/libs/remote_storage/src/lib.rs @@ -10,12 +10,19 @@ #![deny(clippy::undocumented_unsafe_blocks)] mod azure_blob; +mod error; mod local_fs; mod s3_bucket; mod simulate_failures; +mod support; use std::{ - collections::HashMap, fmt::Debug, num::NonZeroUsize, pin::Pin, sync::Arc, time::SystemTime, + collections::HashMap, + fmt::Debug, + num::{NonZeroU32, NonZeroUsize}, + pin::Pin, + sync::Arc, + time::{Duration, SystemTime}, }; use anyhow::{bail, Context}; @@ -35,6 +42,8 @@ pub use self::{ }; use s3_bucket::RequestKind; +pub use error::{DownloadError, TimeTravelError, TimeoutOrCancel}; + /// Currently, sync happens with AWS S3, that has two limits on requests per second: /// ~200 RPS for IAM services /// @@ -152,9 +161,10 @@ pub trait RemoteStorage: Send + Sync + 'static { async fn list_prefixes( &self, prefix: Option<&RemotePath>, + cancel: &CancellationToken, ) -> Result, DownloadError> { let result = self - .list(prefix, ListingMode::WithDelimiter) + .list(prefix, ListingMode::WithDelimiter, None, cancel) .await? .prefixes; Ok(result) @@ -170,8 +180,18 @@ pub trait RemoteStorage: Send + Sync + 'static { /// whereas, /// list_prefixes("foo/bar/") = ["cat", "dog"] /// See `test_real_s3.rs` for more details. - async fn list_files(&self, prefix: Option<&RemotePath>) -> anyhow::Result> { - let result = self.list(prefix, ListingMode::NoDelimiter).await?.keys; + /// + /// max_keys limits max number of keys returned; None means unlimited. 
+ async fn list_files( + &self, + prefix: Option<&RemotePath>, + max_keys: Option, + cancel: &CancellationToken, + ) -> Result, DownloadError> { + let result = self + .list(prefix, ListingMode::NoDelimiter, max_keys, cancel) + .await? + .keys; Ok(result) } @@ -179,9 +199,14 @@ pub trait RemoteStorage: Send + Sync + 'static { &self, prefix: Option<&RemotePath>, _mode: ListingMode, - ) -> anyhow::Result; + max_keys: Option, + cancel: &CancellationToken, + ) -> Result; /// Streams the local file contents into remote into the remote storage entry. + /// + /// If the operation fails because of timeout or cancellation, the root cause of the error will be + /// set to `TimeoutOrCancel`. async fn upload( &self, from: impl Stream> + Send + Sync + 'static, @@ -190,27 +215,61 @@ pub trait RemoteStorage: Send + Sync + 'static { data_size_bytes: usize, to: &RemotePath, metadata: Option, + cancel: &CancellationToken, ) -> anyhow::Result<()>; - /// Streams the remote storage entry contents into the buffered writer given, returns the filled writer. + /// Streams the remote storage entry contents. + /// + /// The returned download stream will obey initial timeout and cancellation signal by erroring + /// on whichever happens first. Only one of the reasons will fail the stream, which is usually + /// enough for `tokio::io::copy_buf` usage. If needed the error can be filtered out. + /// /// Returns the metadata, if any was stored with the file previously. - async fn download(&self, from: &RemotePath) -> Result; + async fn download( + &self, + from: &RemotePath, + cancel: &CancellationToken, + ) -> Result; - /// Streams a given byte range of the remote storage entry contents into the buffered writer given, returns the filled writer. + /// Streams a given byte range of the remote storage entry contents. + /// + /// The returned download stream will obey initial timeout and cancellation signal by erroring + /// on whichever happens first. 
Only one of the reasons will fail the stream, which is usually + /// enough for `tokio::io::copy_buf` usage. If needed the error can be filtered out. + /// /// Returns the metadata, if any was stored with the file previously. async fn download_byte_range( &self, from: &RemotePath, start_inclusive: u64, end_exclusive: Option, + cancel: &CancellationToken, ) -> Result; - async fn delete(&self, path: &RemotePath) -> anyhow::Result<()>; + /// Delete a single path from remote storage. + /// + /// If the operation fails because of timeout or cancellation, the root cause of the error will be + /// set to `TimeoutOrCancel`. In such situation it is unknown if the deletion went through. + async fn delete(&self, path: &RemotePath, cancel: &CancellationToken) -> anyhow::Result<()>; - async fn delete_objects<'a>(&self, paths: &'a [RemotePath]) -> anyhow::Result<()>; + /// Delete a multiple paths from remote storage. + /// + /// If the operation fails because of timeout or cancellation, the root cause of the error will be + /// set to `TimeoutOrCancel`. In such situation it is unknown which deletions, if any, went + /// through. + async fn delete_objects<'a>( + &self, + paths: &'a [RemotePath], + cancel: &CancellationToken, + ) -> anyhow::Result<()>; /// Copy a remote object inside a bucket from one path to another. 
- async fn copy(&self, from: &RemotePath, to: &RemotePath) -> anyhow::Result<()>; + async fn copy( + &self, + from: &RemotePath, + to: &RemotePath, + cancel: &CancellationToken, + ) -> anyhow::Result<()>; /// Resets the content of everything with the given prefix to the given state async fn time_travel_recover( @@ -218,11 +277,17 @@ pub trait RemoteStorage: Send + Sync + 'static { prefix: Option<&RemotePath>, timestamp: SystemTime, done_if_after: SystemTime, - cancel: CancellationToken, - ) -> anyhow::Result<()>; + cancel: &CancellationToken, + ) -> Result<(), TimeTravelError>; } -pub type DownloadStream = Pin> + Unpin + Send + Sync>>; +/// DownloadStream is sensitive to the timeout and cancellation used with the original +/// [`RemoteStorage::download`] request. The type yields `std::io::Result` to be compatible +/// with `tokio::io::copy_buf`. +// This has 'static because safekeepers do not use cancellation tokens (yet) +pub type DownloadStream = + Pin> + Send + Sync + 'static>>; + pub struct Download { pub download_stream: DownloadStream, /// The last time the file was modified (`last-modified` HTTP header) @@ -241,34 +306,6 @@ impl Debug for Download { } } -#[derive(Debug)] -pub enum DownloadError { - /// Validation or other error happened due to user input. - BadInput(anyhow::Error), - /// The file was not found in the remote storage. - NotFound, - /// A cancellation token aborted the download, typically during - /// tenant detach or process shutdown. - Cancelled, - /// The file was found in the remote storage, but the download failed. 
- Other(anyhow::Error), -} - -impl std::fmt::Display for DownloadError { - fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { - match self { - DownloadError::BadInput(e) => { - write!(f, "Failed to download a remote file due to user input: {e}") - } - DownloadError::Cancelled => write!(f, "Cancelled, shutting down"), - DownloadError::NotFound => write!(f, "No file found for the remote object id given"), - DownloadError::Other(e) => write!(f, "Failed to download a remote file: {e:?}"), - } - } -} - -impl std::error::Error for DownloadError {} - /// Every storage, currently supported. /// Serves as a simple way to pass around the [`RemoteStorage`] without dealing with generics. #[derive(Clone)] @@ -285,24 +322,33 @@ impl GenericRemoteStorage> { &self, prefix: Option<&RemotePath>, mode: ListingMode, + max_keys: Option, + cancel: &CancellationToken, ) -> anyhow::Result { match self { - Self::LocalFs(s) => s.list(prefix, mode).await, - Self::AwsS3(s) => s.list(prefix, mode).await, - Self::AzureBlob(s) => s.list(prefix, mode).await, - Self::Unreliable(s) => s.list(prefix, mode).await, + Self::LocalFs(s) => s.list(prefix, mode, max_keys, cancel).await, + Self::AwsS3(s) => s.list(prefix, mode, max_keys, cancel).await, + Self::AzureBlob(s) => s.list(prefix, mode, max_keys, cancel).await, + Self::Unreliable(s) => s.list(prefix, mode, max_keys, cancel).await, } } // A function for listing all the files in a "directory" // Example: // list_files("foo/bar") = ["foo/bar/a.txt", "foo/bar/b.txt"] - pub async fn list_files(&self, folder: Option<&RemotePath>) -> anyhow::Result> { + // + // max_keys limits max number of keys returned; None means unlimited. 
+ pub async fn list_files( + &self, + folder: Option<&RemotePath>, + max_keys: Option, + cancel: &CancellationToken, + ) -> Result, DownloadError> { match self { - Self::LocalFs(s) => s.list_files(folder).await, - Self::AwsS3(s) => s.list_files(folder).await, - Self::AzureBlob(s) => s.list_files(folder).await, - Self::Unreliable(s) => s.list_files(folder).await, + Self::LocalFs(s) => s.list_files(folder, max_keys, cancel).await, + Self::AwsS3(s) => s.list_files(folder, max_keys, cancel).await, + Self::AzureBlob(s) => s.list_files(folder, max_keys, cancel).await, + Self::Unreliable(s) => s.list_files(folder, max_keys, cancel).await, } } @@ -312,36 +358,43 @@ impl GenericRemoteStorage> { pub async fn list_prefixes( &self, prefix: Option<&RemotePath>, + cancel: &CancellationToken, ) -> Result, DownloadError> { match self { - Self::LocalFs(s) => s.list_prefixes(prefix).await, - Self::AwsS3(s) => s.list_prefixes(prefix).await, - Self::AzureBlob(s) => s.list_prefixes(prefix).await, - Self::Unreliable(s) => s.list_prefixes(prefix).await, + Self::LocalFs(s) => s.list_prefixes(prefix, cancel).await, + Self::AwsS3(s) => s.list_prefixes(prefix, cancel).await, + Self::AzureBlob(s) => s.list_prefixes(prefix, cancel).await, + Self::Unreliable(s) => s.list_prefixes(prefix, cancel).await, } } + /// See [`RemoteStorage::upload`] pub async fn upload( &self, from: impl Stream> + Send + Sync + 'static, data_size_bytes: usize, to: &RemotePath, metadata: Option, + cancel: &CancellationToken, ) -> anyhow::Result<()> { match self { - Self::LocalFs(s) => s.upload(from, data_size_bytes, to, metadata).await, - Self::AwsS3(s) => s.upload(from, data_size_bytes, to, metadata).await, - Self::AzureBlob(s) => s.upload(from, data_size_bytes, to, metadata).await, - Self::Unreliable(s) => s.upload(from, data_size_bytes, to, metadata).await, + Self::LocalFs(s) => s.upload(from, data_size_bytes, to, metadata, cancel).await, + Self::AwsS3(s) => s.upload(from, data_size_bytes, to, metadata, 
cancel).await, + Self::AzureBlob(s) => s.upload(from, data_size_bytes, to, metadata, cancel).await, + Self::Unreliable(s) => s.upload(from, data_size_bytes, to, metadata, cancel).await, } } - pub async fn download(&self, from: &RemotePath) -> Result { + pub async fn download( + &self, + from: &RemotePath, + cancel: &CancellationToken, + ) -> Result { match self { - Self::LocalFs(s) => s.download(from).await, - Self::AwsS3(s) => s.download(from).await, - Self::AzureBlob(s) => s.download(from).await, - Self::Unreliable(s) => s.download(from).await, + Self::LocalFs(s) => s.download(from, cancel).await, + Self::AwsS3(s) => s.download(from, cancel).await, + Self::AzureBlob(s) => s.download(from, cancel).await, + Self::Unreliable(s) => s.download(from, cancel).await, } } @@ -350,61 +403,79 @@ impl GenericRemoteStorage> { from: &RemotePath, start_inclusive: u64, end_exclusive: Option, + cancel: &CancellationToken, ) -> Result { match self { Self::LocalFs(s) => { - s.download_byte_range(from, start_inclusive, end_exclusive) + s.download_byte_range(from, start_inclusive, end_exclusive, cancel) .await } Self::AwsS3(s) => { - s.download_byte_range(from, start_inclusive, end_exclusive) + s.download_byte_range(from, start_inclusive, end_exclusive, cancel) .await } Self::AzureBlob(s) => { - s.download_byte_range(from, start_inclusive, end_exclusive) + s.download_byte_range(from, start_inclusive, end_exclusive, cancel) .await } Self::Unreliable(s) => { - s.download_byte_range(from, start_inclusive, end_exclusive) + s.download_byte_range(from, start_inclusive, end_exclusive, cancel) .await } } } - pub async fn delete(&self, path: &RemotePath) -> anyhow::Result<()> { + /// See [`RemoteStorage::delete`] + pub async fn delete( + &self, + path: &RemotePath, + cancel: &CancellationToken, + ) -> anyhow::Result<()> { match self { - Self::LocalFs(s) => s.delete(path).await, - Self::AwsS3(s) => s.delete(path).await, - Self::AzureBlob(s) => s.delete(path).await, - Self::Unreliable(s) => 
s.delete(path).await, + Self::LocalFs(s) => s.delete(path, cancel).await, + Self::AwsS3(s) => s.delete(path, cancel).await, + Self::AzureBlob(s) => s.delete(path, cancel).await, + Self::Unreliable(s) => s.delete(path, cancel).await, } } - pub async fn delete_objects<'a>(&self, paths: &'a [RemotePath]) -> anyhow::Result<()> { + /// See [`RemoteStorage::delete_objects`] + pub async fn delete_objects( + &self, + paths: &[RemotePath], + cancel: &CancellationToken, + ) -> anyhow::Result<()> { match self { - Self::LocalFs(s) => s.delete_objects(paths).await, - Self::AwsS3(s) => s.delete_objects(paths).await, - Self::AzureBlob(s) => s.delete_objects(paths).await, - Self::Unreliable(s) => s.delete_objects(paths).await, + Self::LocalFs(s) => s.delete_objects(paths, cancel).await, + Self::AwsS3(s) => s.delete_objects(paths, cancel).await, + Self::AzureBlob(s) => s.delete_objects(paths, cancel).await, + Self::Unreliable(s) => s.delete_objects(paths, cancel).await, } } - pub async fn copy_object(&self, from: &RemotePath, to: &RemotePath) -> anyhow::Result<()> { + /// See [`RemoteStorage::copy`] + pub async fn copy_object( + &self, + from: &RemotePath, + to: &RemotePath, + cancel: &CancellationToken, + ) -> anyhow::Result<()> { match self { - Self::LocalFs(s) => s.copy(from, to).await, - Self::AwsS3(s) => s.copy(from, to).await, - Self::AzureBlob(s) => s.copy(from, to).await, - Self::Unreliable(s) => s.copy(from, to).await, + Self::LocalFs(s) => s.copy(from, to, cancel).await, + Self::AwsS3(s) => s.copy(from, to, cancel).await, + Self::AzureBlob(s) => s.copy(from, to, cancel).await, + Self::Unreliable(s) => s.copy(from, to, cancel).await, } } + /// See [`RemoteStorage::time_travel_recover`]. 
pub async fn time_travel_recover( &self, prefix: Option<&RemotePath>, timestamp: SystemTime, done_if_after: SystemTime, - cancel: CancellationToken, - ) -> anyhow::Result<()> { + cancel: &CancellationToken, + ) -> Result<(), TimeTravelError> { match self { Self::LocalFs(s) => { s.time_travel_recover(prefix, timestamp, done_if_after, cancel) @@ -428,20 +499,26 @@ impl GenericRemoteStorage> { impl GenericRemoteStorage { pub fn from_config(storage_config: &RemoteStorageConfig) -> anyhow::Result { + let timeout = storage_config.timeout; Ok(match &storage_config.storage { - RemoteStorageKind::LocalFs(root) => { - info!("Using fs root '{root}' as a remote storage"); - Self::LocalFs(LocalFs::new(root.clone())?) + RemoteStorageKind::LocalFs(path) => { + info!("Using fs root '{path}' as a remote storage"); + Self::LocalFs(LocalFs::new(path.clone(), timeout)?) } RemoteStorageKind::AwsS3(s3_config) => { - info!("Using s3 bucket '{}' in region '{}' as a remote storage, prefix in bucket: '{:?}', bucket endpoint: '{:?}'", + // The profile and access key id are only printed here for debugging purposes, + // their values don't indicate the eventually taken choice for auth. 
+ let profile = std::env::var("AWS_PROFILE").unwrap_or_else(|_| "".into()); + let access_key_id = + std::env::var("AWS_ACCESS_KEY_ID").unwrap_or_else(|_| "".into()); + info!("Using s3 bucket '{}' in region '{}' as a remote storage, prefix in bucket: '{:?}', bucket endpoint: '{:?}', profile: {profile}, access_key_id: {access_key_id}", s3_config.bucket_name, s3_config.bucket_region, s3_config.prefix_in_bucket, s3_config.endpoint); - Self::AwsS3(Arc::new(S3Bucket::new(s3_config)?)) + Self::AwsS3(Arc::new(S3Bucket::new(s3_config, timeout)?)) } RemoteStorageKind::AzureContainer(azure_config) => { info!("Using azure container '{}' in region '{}' as a remote storage, prefix in container: '{:?}'", azure_config.container_name, azure_config.container_region, azure_config.prefix_in_container); - Self::AzureBlob(Arc::new(AzureBlobStorage::new(azure_config)?)) + Self::AzureBlob(Arc::new(AzureBlobStorage::new(azure_config, timeout)?)) } }) } @@ -450,18 +527,15 @@ impl GenericRemoteStorage { Self::Unreliable(Arc::new(UnreliableWrapper::new(s, fail_first))) } - /// Takes storage object contents and its size and uploads to remote storage, - /// mapping `from_path` to the corresponding remote object id in the storage. - /// - /// The storage object does not have to be present on the `from_path`, - /// this path is used for the remote object id conversion only. + /// See [`RemoteStorage::upload`], which this method calls with `None` as metadata. 
pub async fn upload_storage_object( &self, from: impl Stream> + Send + Sync + 'static, from_size_bytes: usize, to: &RemotePath, + cancel: &CancellationToken, ) -> anyhow::Result<()> { - self.upload(from, from_size_bytes, to, None) + self.upload(from, from_size_bytes, to, None, cancel) .await .with_context(|| { format!("Failed to upload data of length {from_size_bytes} to storage path {to:?}") @@ -474,10 +548,11 @@ impl GenericRemoteStorage { &self, byte_range: Option<(u64, Option)>, from: &RemotePath, + cancel: &CancellationToken, ) -> Result { match byte_range { - Some((start, end)) => self.download_byte_range(from, start, end).await, - None => self.download(from).await, + Some((start, end)) => self.download_byte_range(from, start, end, cancel).await, + None => self.download(from, cancel).await, } } } @@ -492,6 +567,9 @@ pub struct StorageMetadata(HashMap); pub struct RemoteStorageConfig { /// The storage connection configuration. pub storage: RemoteStorageKind, + /// A common timeout enforced for all requests after concurrency limiter permit has been + /// acquired. + pub timeout: Duration, } /// A kind of a remote storage to connect to, with its connection configuration. @@ -576,6 +654,8 @@ impl Debug for AzureConfig { } impl RemoteStorageConfig { + pub const DEFAULT_TIMEOUT: Duration = std::time::Duration::from_secs(120); + pub fn from_toml(toml: &toml_edit::Item) -> anyhow::Result> { let local_path = toml.get("local_path"); let bucket_name = toml.get("bucket_name"); @@ -605,6 +685,27 @@ impl RemoteStorageConfig { .map(|endpoint| parse_toml_string("endpoint", endpoint)) .transpose()?; + let timeout = toml + .get("timeout") + .map(|timeout| { + timeout + .as_str() + .ok_or_else(|| anyhow::Error::msg("timeout was not a string")) + }) + .transpose() + .and_then(|timeout| { + timeout + .map(humantime::parse_duration) + .transpose() + .map_err(anyhow::Error::new) + }) + .context("parse timeout")? 
+ .unwrap_or(Self::DEFAULT_TIMEOUT); + + if timeout < Duration::from_secs(1) { + bail!("timeout was specified as {timeout:?} which is too low"); + } + let storage = match ( local_path, bucket_name, @@ -666,7 +767,7 @@ impl RemoteStorageConfig { } }; - Ok(Some(RemoteStorageConfig { storage })) + Ok(Some(RemoteStorageConfig { storage, timeout })) } } @@ -762,4 +863,24 @@ mod tests { let err = RemotePath::new(Utf8Path::new("/")).expect_err("Should fail on absolute paths"); assert_eq!(err.to_string(), "Path \"/\" is not relative"); } + + #[test] + fn parse_localfs_config_with_timeout() { + let input = "local_path = '.' +timeout = '5s'"; + + let toml = input.parse::().unwrap(); + + let config = RemoteStorageConfig::from_toml(toml.as_item()) + .unwrap() + .expect("it exists"); + + assert_eq!( + config, + RemoteStorageConfig { + storage: RemoteStorageKind::LocalFs(Utf8PathBuf::from(".")), + timeout: Duration::from_secs(5) + } + ); + } } diff --git a/libs/remote_storage/src/local_fs.rs b/libs/remote_storage/src/local_fs.rs index 34a6658a69..478ad81dc1 100644 --- a/libs/remote_storage/src/local_fs.rs +++ b/libs/remote_storage/src/local_fs.rs @@ -4,7 +4,14 @@ //! This storage used in tests, but can also be used in cases when a certain persistent //! volume is mounted to the local FS. 
-use std::{borrow::Cow, future::Future, io::ErrorKind, pin::Pin, time::SystemTime}; +use std::{ + borrow::Cow, + future::Future, + io::ErrorKind, + num::NonZeroU32, + pin::Pin, + time::{Duration, SystemTime}, +}; use anyhow::{bail, ensure, Context}; use bytes::Bytes; @@ -18,7 +25,9 @@ use tokio_util::{io::ReaderStream, sync::CancellationToken}; use tracing::*; use utils::{crashsafe::path_with_suffix_extension, fs_ext::is_directory_empty}; -use crate::{Download, DownloadError, DownloadStream, Listing, ListingMode, RemotePath}; +use crate::{ + Download, DownloadError, Listing, ListingMode, RemotePath, TimeTravelError, TimeoutOrCancel, +}; use super::{RemoteStorage, StorageMetadata}; @@ -27,12 +36,13 @@ const LOCAL_FS_TEMP_FILE_SUFFIX: &str = "___temp"; #[derive(Debug, Clone)] pub struct LocalFs { storage_root: Utf8PathBuf, + timeout: Duration, } impl LocalFs { /// Attempts to create local FS storage, along with its root directory. /// Storage root will be created (if does not exist) and transformed into an absolute path (if passed as relative). 
- pub fn new(mut storage_root: Utf8PathBuf) -> anyhow::Result { + pub fn new(mut storage_root: Utf8PathBuf, timeout: Duration) -> anyhow::Result { if !storage_root.exists() { std::fs::create_dir_all(&storage_root).with_context(|| { format!("Failed to create all directories in the given root path {storage_root:?}") @@ -44,7 +54,10 @@ impl LocalFs { })?; } - Ok(Self { storage_root }) + Ok(Self { + storage_root, + timeout, + }) } // mirrors S3Bucket::s3_object_to_relative_path @@ -155,76 +168,14 @@ impl LocalFs { Ok(files) } -} -impl RemoteStorage for LocalFs { - async fn list( - &self, - prefix: Option<&RemotePath>, - mode: ListingMode, - ) -> Result { - let mut result = Listing::default(); - - if let ListingMode::NoDelimiter = mode { - let keys = self - .list_recursive(prefix) - .await - .map_err(DownloadError::Other)?; - - result.keys = keys - .into_iter() - .filter(|k| { - let path = k.with_base(&self.storage_root); - !path.is_dir() - }) - .collect(); - - return Ok(result); - } - - let path = match prefix { - Some(prefix) => Cow::Owned(prefix.with_base(&self.storage_root)), - None => Cow::Borrowed(&self.storage_root), - }; - - let prefixes_to_filter = get_all_files(path.as_ref(), false) - .await - .map_err(DownloadError::Other)?; - - // filter out empty directories to mirror s3 behavior. - for prefix in prefixes_to_filter { - if prefix.is_dir() - && is_directory_empty(&prefix) - .await - .map_err(DownloadError::Other)? 
- { - continue; - } - - let stripped = prefix - .strip_prefix(&self.storage_root) - .context("Failed to strip prefix") - .and_then(RemotePath::new) - .expect( - "We list files for storage root, hence should be able to remote the prefix", - ); - - if prefix.is_dir() { - result.prefixes.push(stripped); - } else { - result.keys.push(stripped); - } - } - - Ok(result) - } - - async fn upload( + async fn upload0( &self, data: impl Stream> + Send + Sync, data_size_bytes: usize, to: &RemotePath, metadata: Option, + cancel: &CancellationToken, ) -> anyhow::Result<()> { let target_file_path = to.with_base(&self.storage_root); create_target_directory(&target_file_path).await?; @@ -259,9 +210,26 @@ impl RemoteStorage for LocalFs { let mut buffer_to_read = data.take(from_size_bytes); // alternatively we could just write the bytes to a file, but local_fs is a testing utility - let bytes_read = io::copy_buf(&mut buffer_to_read, &mut destination) - .await - .with_context(|| { + let copy = io::copy_buf(&mut buffer_to_read, &mut destination); + + let bytes_read = tokio::select! { + biased; + _ = cancel.cancelled() => { + let file = destination.into_inner(); + // wait for the inflight operation(s) to complete so that there could be a next + // attempt right away and our writes are not directed to their file. + file.into_std().await; + + // TODO: leave the temp or not? leaving is probably less racy. enabled truncate at + // least. + fs::remove_file(temp_file_path).await.context("remove temp_file_path after cancellation or timeout")?; + return Err(TimeoutOrCancel::Cancel.into()); + } + read = copy => read, + }; + + let bytes_read = + bytes_read.with_context(|| { format!( "Failed to upload file (write temp) to the local storage at '{temp_file_path}'", ) @@ -293,6 +261,9 @@ impl RemoteStorage for LocalFs { })?; if let Some(storage_metadata) = metadata { + // FIXME: we must not be using metadata much, since this would forget the old metadata + // for new writes? 
or perhaps metadata is sticky; could consider removing if it's never + // used. let storage_metadata_path = storage_metadata_path(&target_file_path); fs::write( &storage_metadata_path, @@ -309,8 +280,131 @@ impl RemoteStorage for LocalFs { Ok(()) } +} - async fn download(&self, from: &RemotePath) -> Result { +impl RemoteStorage for LocalFs { + async fn list( + &self, + prefix: Option<&RemotePath>, + mode: ListingMode, + max_keys: Option, + cancel: &CancellationToken, + ) -> Result { + let op = async { + let mut result = Listing::default(); + + if let ListingMode::NoDelimiter = mode { + let keys = self + .list_recursive(prefix) + .await + .map_err(DownloadError::Other)?; + + result.keys = keys + .into_iter() + .filter(|k| { + let path = k.with_base(&self.storage_root); + !path.is_dir() + }) + .collect(); + + if let Some(max_keys) = max_keys { + result.keys.truncate(max_keys.get() as usize); + } + + return Ok(result); + } + + let path = match prefix { + Some(prefix) => Cow::Owned(prefix.with_base(&self.storage_root)), + None => Cow::Borrowed(&self.storage_root), + }; + + let prefixes_to_filter = get_all_files(path.as_ref(), false) + .await + .map_err(DownloadError::Other)?; + + // filter out empty directories to mirror s3 behavior. + for prefix in prefixes_to_filter { + if prefix.is_dir() + && is_directory_empty(&prefix) + .await + .map_err(DownloadError::Other)? + { + continue; + } + + let stripped = prefix + .strip_prefix(&self.storage_root) + .context("Failed to strip prefix") + .and_then(RemotePath::new) + .expect( + "We list files for storage root, hence should be able to remote the prefix", + ); + + if prefix.is_dir() { + result.prefixes.push(stripped); + } else { + result.keys.push(stripped); + } + } + + Ok(result) + }; + + let timeout = async { + tokio::time::sleep(self.timeout).await; + Err(DownloadError::Timeout) + }; + + let cancelled = async { + cancel.cancelled().await; + Err(DownloadError::Cancelled) + }; + + tokio::select! 
{ + res = op => res, + res = timeout => res, + res = cancelled => res, + } + } + + async fn upload( + &self, + data: impl Stream> + Send + Sync, + data_size_bytes: usize, + to: &RemotePath, + metadata: Option, + cancel: &CancellationToken, + ) -> anyhow::Result<()> { + let cancel = cancel.child_token(); + + let op = self.upload0(data, data_size_bytes, to, metadata, &cancel); + let mut op = std::pin::pin!(op); + + // race the upload0 to the timeout; if it goes over, do a graceful shutdown + let (res, timeout) = tokio::select! { + res = &mut op => (res, false), + _ = tokio::time::sleep(self.timeout) => { + cancel.cancel(); + (op.await, true) + } + }; + + match res { + Err(e) if timeout && TimeoutOrCancel::caused_by_cancel(&e) => { + // we caused this cancel (or they happened simultaneously) -- swap it out to + // Timeout + Err(TimeoutOrCancel::Timeout.into()) + } + res => res, + } + } + + async fn download( + &self, + from: &RemotePath, + cancel: &CancellationToken, + ) -> Result { let target_path = from.with_base(&self.storage_root); if file_exists(&target_path).map_err(DownloadError::BadInput)? 
{ let source = ReaderStream::new( @@ -328,6 +422,10 @@ impl RemoteStorage for LocalFs { .read_storage_metadata(&target_path) .await .map_err(DownloadError::Other)?; + + let cancel_or_timeout = crate::support::cancel_or_timeout(self.timeout, cancel.clone()); + let source = crate::support::DownloadStream::new(cancel_or_timeout, source); + Ok(Download { metadata, last_modified: None, @@ -344,6 +442,7 @@ impl RemoteStorage for LocalFs { from: &RemotePath, start_inclusive: u64, end_exclusive: Option, + cancel: &CancellationToken, ) -> Result { if let Some(end_exclusive) = end_exclusive { if end_exclusive <= start_inclusive { @@ -363,34 +462,43 @@ impl RemoteStorage for LocalFs { format!("Failed to open source file {target_path:?} to use in the download") }) .map_err(DownloadError::Other)?; + + let len = source + .metadata() + .await + .context("query file length") + .map_err(DownloadError::Other)? + .len(); + source .seek(io::SeekFrom::Start(start_inclusive)) .await .context("Failed to seek to the range start in a local storage file") .map_err(DownloadError::Other)?; + let metadata = self .read_storage_metadata(&target_path) .await .map_err(DownloadError::Other)?; - let download_stream: DownloadStream = match end_exclusive { - Some(end_exclusive) => Box::pin(ReaderStream::new( - source.take(end_exclusive - start_inclusive), - )), - None => Box::pin(ReaderStream::new(source)), - }; + let source = source.take(end_exclusive.unwrap_or(len) - start_inclusive); + let source = ReaderStream::new(source); + + let cancel_or_timeout = crate::support::cancel_or_timeout(self.timeout, cancel.clone()); + let source = crate::support::DownloadStream::new(cancel_or_timeout, source); + Ok(Download { metadata, last_modified: None, etag: None, - download_stream, + download_stream: Box::pin(source), }) } else { Err(DownloadError::NotFound) } } - async fn delete(&self, path: &RemotePath) -> anyhow::Result<()> { + async fn delete(&self, path: &RemotePath, _cancel: &CancellationToken) -> 
anyhow::Result<()> { let file_path = path.with_base(&self.storage_root); match fs::remove_file(&file_path).await { Ok(()) => Ok(()), @@ -402,14 +510,23 @@ impl RemoteStorage for LocalFs { } } - async fn delete_objects<'a>(&self, paths: &'a [RemotePath]) -> anyhow::Result<()> { + async fn delete_objects<'a>( + &self, + paths: &'a [RemotePath], + cancel: &CancellationToken, + ) -> anyhow::Result<()> { for path in paths { - self.delete(path).await? + self.delete(path, cancel).await? } Ok(()) } - async fn copy(&self, from: &RemotePath, to: &RemotePath) -> anyhow::Result<()> { + async fn copy( + &self, + from: &RemotePath, + to: &RemotePath, + _cancel: &CancellationToken, + ) -> anyhow::Result<()> { let from_path = from.with_base(&self.storage_root); let to_path = to.with_base(&self.storage_root); create_target_directory(&to_path).await?; @@ -423,15 +540,14 @@ impl RemoteStorage for LocalFs { Ok(()) } - #[allow(clippy::diverging_sub_expression)] async fn time_travel_recover( &self, _prefix: Option<&RemotePath>, _timestamp: SystemTime, _done_if_after: SystemTime, - _cancel: CancellationToken, - ) -> anyhow::Result<()> { - unimplemented!() + _cancel: &CancellationToken, + ) -> Result<(), TimeTravelError> { + Err(TimeTravelError::Unimplemented) } } @@ -507,20 +623,17 @@ fn file_exists(file_path: &Utf8Path) -> anyhow::Result { mod fs_tests { use super::*; - use bytes::Bytes; use camino_tempfile::tempdir; - use futures_util::Stream; use std::{collections::HashMap, io::Write}; - async fn read_and_assert_remote_file_contents( + async fn read_and_check_metadata( storage: &LocalFs, - #[allow(clippy::ptr_arg)] - // have to use &Utf8PathBuf due to `storage.local_path` parameter requirements remote_storage_path: &RemotePath, expected_metadata: Option<&StorageMetadata>, ) -> anyhow::Result { + let cancel = CancellationToken::new(); let download = storage - .download(remote_storage_path) + .download(remote_storage_path, &cancel) .await .map_err(|e| anyhow::anyhow!("Download failed: 
{e}"))?; ensure!( @@ -535,16 +648,16 @@ mod fs_tests { #[tokio::test] async fn upload_file() -> anyhow::Result<()> { - let storage = create_storage()?; + let (storage, cancel) = create_storage()?; - let target_path_1 = upload_dummy_file(&storage, "upload_1", None).await?; + let target_path_1 = upload_dummy_file(&storage, "upload_1", None, &cancel).await?; assert_eq!( storage.list_all().await?, vec![target_path_1.clone()], "Should list a single file after first upload" ); - let target_path_2 = upload_dummy_file(&storage, "upload_2", None).await?; + let target_path_2 = upload_dummy_file(&storage, "upload_2", None, &cancel).await?; assert_eq!( list_files_sorted(&storage).await?, vec![target_path_1.clone(), target_path_2.clone()], @@ -556,7 +669,7 @@ mod fs_tests { #[tokio::test] async fn upload_file_negatives() -> anyhow::Result<()> { - let storage = create_storage()?; + let (storage, cancel) = create_storage()?; let id = RemotePath::new(Utf8Path::new("dummy"))?; let content = Bytes::from_static(b"12345"); @@ -565,36 +678,36 @@ mod fs_tests { // Check that you get an error if the size parameter doesn't match the actual // size of the stream. storage - .upload(content(), 0, &id, None) + .upload(content(), 0, &id, None, &cancel) .await .expect_err("upload with zero size succeeded"); storage - .upload(content(), 4, &id, None) + .upload(content(), 4, &id, None, &cancel) .await .expect_err("upload with too short size succeeded"); storage - .upload(content(), 6, &id, None) + .upload(content(), 6, &id, None, &cancel) .await .expect_err("upload with too large size succeeded"); // Correct size is 5, this should succeed. 
- storage.upload(content(), 5, &id, None).await?; + storage.upload(content(), 5, &id, None, &cancel).await?; Ok(()) } - fn create_storage() -> anyhow::Result { + fn create_storage() -> anyhow::Result<(LocalFs, CancellationToken)> { let storage_root = tempdir()?.path().to_path_buf(); - LocalFs::new(storage_root) + LocalFs::new(storage_root, Duration::from_secs(120)).map(|s| (s, CancellationToken::new())) } #[tokio::test] async fn download_file() -> anyhow::Result<()> { - let storage = create_storage()?; + let (storage, cancel) = create_storage()?; let upload_name = "upload_1"; - let upload_target = upload_dummy_file(&storage, upload_name, None).await?; + let upload_target = upload_dummy_file(&storage, upload_name, None, &cancel).await?; - let contents = read_and_assert_remote_file_contents(&storage, &upload_target, None).await?; + let contents = read_and_check_metadata(&storage, &upload_target, None).await?; assert_eq!( dummy_contents(upload_name), contents, @@ -602,7 +715,7 @@ mod fs_tests { ); let non_existing_path = "somewhere/else"; - match storage.download(&RemotePath::new(Utf8Path::new(non_existing_path))?).await { + match storage.download(&RemotePath::new(Utf8Path::new(non_existing_path))?, &cancel).await { Err(DownloadError::NotFound) => {} // Should get NotFound for non existing keys other => panic!("Should get a NotFound error when downloading non-existing storage files, but got: {other:?}"), } @@ -611,12 +724,12 @@ mod fs_tests { #[tokio::test] async fn download_file_range_positive() -> anyhow::Result<()> { - let storage = create_storage()?; + let (storage, cancel) = create_storage()?; let upload_name = "upload_1"; - let upload_target = upload_dummy_file(&storage, upload_name, None).await?; + let upload_target = upload_dummy_file(&storage, upload_name, None, &cancel).await?; let full_range_download_contents = - read_and_assert_remote_file_contents(&storage, &upload_target, None).await?; + read_and_check_metadata(&storage, &upload_target, None).await?; 
assert_eq!( dummy_contents(upload_name), full_range_download_contents, @@ -627,7 +740,12 @@ mod fs_tests { let (first_part_local, second_part_local) = uploaded_bytes.split_at(3); let first_part_download = storage - .download_byte_range(&upload_target, 0, Some(first_part_local.len() as u64)) + .download_byte_range( + &upload_target, + 0, + Some(first_part_local.len() as u64), + &cancel, + ) .await?; assert!( first_part_download.metadata.is_none(), @@ -645,6 +763,7 @@ mod fs_tests { &upload_target, first_part_local.len() as u64, Some((first_part_local.len() + second_part_local.len()) as u64), + &cancel, ) .await?; assert!( @@ -658,14 +777,30 @@ mod fs_tests { "Second part bytes should be returned when requested" ); + let suffix_bytes = storage + .download_byte_range(&upload_target, 13, None, &cancel) + .await? + .download_stream; + let suffix_bytes = aggregate(suffix_bytes).await?; + let suffix = std::str::from_utf8(&suffix_bytes)?; + assert_eq!(upload_name, suffix); + + let all_bytes = storage + .download_byte_range(&upload_target, 0, None, &cancel) + .await? 
+ .download_stream; + let all_bytes = aggregate(all_bytes).await?; + let all_bytes = std::str::from_utf8(&all_bytes)?; + assert_eq!(dummy_contents("upload_1"), all_bytes); + Ok(()) } #[tokio::test] async fn download_file_range_negative() -> anyhow::Result<()> { - let storage = create_storage()?; + let (storage, cancel) = create_storage()?; let upload_name = "upload_1"; - let upload_target = upload_dummy_file(&storage, upload_name, None).await?; + let upload_target = upload_dummy_file(&storage, upload_name, None, &cancel).await?; let start = 1_000_000_000; let end = start + 1; @@ -674,6 +809,7 @@ mod fs_tests { &upload_target, start, Some(end), // exclusive end + &cancel, ) .await { @@ -690,7 +826,7 @@ mod fs_tests { let end = 234; assert!(start > end, "Should test an incorrect range"); match storage - .download_byte_range(&upload_target, start, Some(end)) + .download_byte_range(&upload_target, start, Some(end), &cancel) .await { Ok(_) => panic!("Should not allow downloading wrong ranges"), @@ -707,15 +843,15 @@ mod fs_tests { #[tokio::test] async fn delete_file() -> anyhow::Result<()> { - let storage = create_storage()?; + let (storage, cancel) = create_storage()?; let upload_name = "upload_1"; - let upload_target = upload_dummy_file(&storage, upload_name, None).await?; + let upload_target = upload_dummy_file(&storage, upload_name, None, &cancel).await?; - storage.delete(&upload_target).await?; + storage.delete(&upload_target, &cancel).await?; assert!(storage.list_all().await?.is_empty()); storage - .delete(&upload_target) + .delete(&upload_target, &cancel) .await .expect("Should allow deleting non-existing storage files"); @@ -724,17 +860,17 @@ mod fs_tests { #[tokio::test] async fn file_with_metadata() -> anyhow::Result<()> { - let storage = create_storage()?; + let (storage, cancel) = create_storage()?; let upload_name = "upload_1"; let metadata = StorageMetadata(HashMap::from([ ("one".to_string(), "1".to_string()), ("two".to_string(), "2".to_string()), ])); let 
upload_target = - upload_dummy_file(&storage, upload_name, Some(metadata.clone())).await?; + upload_dummy_file(&storage, upload_name, Some(metadata.clone()), &cancel).await?; let full_range_download_contents = - read_and_assert_remote_file_contents(&storage, &upload_target, Some(&metadata)).await?; + read_and_check_metadata(&storage, &upload_target, Some(&metadata)).await?; assert_eq!( dummy_contents(upload_name), full_range_download_contents, @@ -745,7 +881,12 @@ mod fs_tests { let (first_part_local, _) = uploaded_bytes.split_at(3); let partial_download_with_metadata = storage - .download_byte_range(&upload_target, 0, Some(first_part_local.len() as u64)) + .download_byte_range( + &upload_target, + 0, + Some(first_part_local.len() as u64), + &cancel, + ) .await?; let first_part_remote = aggregate(partial_download_with_metadata.download_stream).await?; assert_eq!( @@ -766,16 +907,20 @@ mod fs_tests { #[tokio::test] async fn list() -> anyhow::Result<()> { // No delimiter: should recursively list everything - let storage = create_storage()?; - let child = upload_dummy_file(&storage, "grandparent/parent/child", None).await?; - let uncle = upload_dummy_file(&storage, "grandparent/uncle", None).await?; + let (storage, cancel) = create_storage()?; + let child = upload_dummy_file(&storage, "grandparent/parent/child", None, &cancel).await?; + let uncle = upload_dummy_file(&storage, "grandparent/uncle", None, &cancel).await?; - let listing = storage.list(None, ListingMode::NoDelimiter).await?; + let listing = storage + .list(None, ListingMode::NoDelimiter, None, &cancel) + .await?; assert!(listing.prefixes.is_empty()); assert_eq!(listing.keys, [uncle.clone(), child.clone()].to_vec()); // Delimiter: should only go one deep - let listing = storage.list(None, ListingMode::WithDelimiter).await?; + let listing = storage + .list(None, ListingMode::WithDelimiter, None, &cancel) + .await?; assert_eq!( listing.prefixes, @@ -788,6 +933,8 @@ mod fs_tests { .list( 
Some(&RemotePath::from_string("timelines/some_timeline/grandparent").unwrap()), ListingMode::WithDelimiter, + None, + &cancel, ) .await?; assert_eq!( @@ -800,10 +947,75 @@ mod fs_tests { Ok(()) } + #[tokio::test] + async fn overwrite_shorter_file() -> anyhow::Result<()> { + let (storage, cancel) = create_storage()?; + + let path = RemotePath::new("does/not/matter/file".into())?; + + let body = Bytes::from_static(b"long file contents is long"); + { + let len = body.len(); + let body = + futures::stream::once(futures::future::ready(std::io::Result::Ok(body.clone()))); + storage.upload(body, len, &path, None, &cancel).await?; + } + + let read = aggregate(storage.download(&path, &cancel).await?.download_stream).await?; + assert_eq!(body, read); + + let shorter = Bytes::from_static(b"shorter body"); + { + let len = shorter.len(); + let body = + futures::stream::once(futures::future::ready(std::io::Result::Ok(shorter.clone()))); + storage.upload(body, len, &path, None, &cancel).await?; + } + + let read = aggregate(storage.download(&path, &cancel).await?.download_stream).await?; + assert_eq!(shorter, read); + Ok(()) + } + + #[tokio::test] + async fn cancelled_upload_can_later_be_retried() -> anyhow::Result<()> { + let (storage, cancel) = create_storage()?; + + let path = RemotePath::new("does/not/matter/file".into())?; + + let body = Bytes::from_static(b"long file contents is long"); + { + let len = body.len(); + let body = + futures::stream::once(futures::future::ready(std::io::Result::Ok(body.clone()))); + let cancel = cancel.child_token(); + cancel.cancel(); + let e = storage + .upload(body, len, &path, None, &cancel) + .await + .unwrap_err(); + + assert!(TimeoutOrCancel::caused_by_cancel(&e)); + } + + { + let len = body.len(); + let body = + futures::stream::once(futures::future::ready(std::io::Result::Ok(body.clone()))); + storage.upload(body, len, &path, None, &cancel).await?; + } + + let read = aggregate(storage.download(&path, 
&cancel).await?.download_stream).await?; + assert_eq!(body, read); + + Ok(()) + } + async fn upload_dummy_file( storage: &LocalFs, name: &str, metadata: Option, + cancel: &CancellationToken, ) -> anyhow::Result { let from_path = storage .storage_root @@ -825,7 +1037,9 @@ mod fs_tests { let file = tokio_util::io::ReaderStream::new(file); - storage.upload(file, size, &relative_path, metadata).await?; + storage + .upload(file, size, &relative_path, metadata, cancel) + .await?; Ok(relative_path) } diff --git a/libs/remote_storage/src/s3_bucket.rs b/libs/remote_storage/src/s3_bucket.rs index 4909b8522b..438f45fbde 100644 --- a/libs/remote_storage/src/s3_bucket.rs +++ b/libs/remote_storage/src/s3_bucket.rs @@ -7,10 +7,11 @@ use std::{ borrow::Cow, collections::HashMap, + num::NonZeroU32, pin::Pin, sync::Arc, task::{Context, Poll}, - time::SystemTime, + time::{Duration, SystemTime}, }; use anyhow::{anyhow, Context as _}; @@ -45,8 +46,9 @@ use utils::backoff; use super::StorageMetadata; use crate::{ - ConcurrencyLimiter, Download, DownloadError, Listing, ListingMode, RemotePath, RemoteStorage, - S3Config, MAX_KEYS_PER_DELETE, REMOTE_STORAGE_PREFIX_SEPARATOR, + error::Cancelled, support::PermitCarrying, ConcurrencyLimiter, Download, DownloadError, + Listing, ListingMode, RemotePath, RemoteStorage, S3Config, TimeTravelError, TimeoutOrCancel, + MAX_KEYS_PER_DELETE, REMOTE_STORAGE_PREFIX_SEPARATOR, }; pub(super) mod metrics; @@ -61,9 +63,10 @@ pub struct S3Bucket { prefix_in_bucket: Option, max_keys_per_list_response: Option, concurrency_limiter: ConcurrencyLimiter, + // Per-request timeout. Accessible for tests. + pub timeout: Duration, } -#[derive(Default)] struct GetObjectRequest { bucket: String, key: String, @@ -71,7 +74,7 @@ struct GetObjectRequest { } impl S3Bucket { /// Creates the S3 storage, errors if incorrect AWS S3 configuration provided. 
- pub fn new(aws_config: &S3Config) -> anyhow::Result { + pub fn new(aws_config: &S3Config, timeout: Duration) -> anyhow::Result { tracing::debug!( "Creating s3 remote storage for S3 bucket {}", aws_config.bucket_name @@ -151,6 +154,7 @@ impl S3Bucket { max_keys_per_list_response: aws_config.max_keys_per_list_response, prefix_in_bucket, concurrency_limiter: ConcurrencyLimiter::new(aws_config.concurrency_limit.get()), + timeout, }) } @@ -184,40 +188,55 @@ impl S3Bucket { } } - async fn permit(&self, kind: RequestKind) -> tokio::sync::SemaphorePermit<'_> { + async fn permit( + &self, + kind: RequestKind, + cancel: &CancellationToken, + ) -> Result, Cancelled> { let started_at = start_counting_cancelled_wait(kind); - let permit = self - .concurrency_limiter - .acquire(kind) - .await - .expect("semaphore is never closed"); + let acquire = self.concurrency_limiter.acquire(kind); + + let permit = tokio::select! { + permit = acquire => permit.expect("semaphore is never closed"), + _ = cancel.cancelled() => return Err(Cancelled), + }; let started_at = ScopeGuard::into_inner(started_at); metrics::BUCKET_METRICS .wait_seconds .observe_elapsed(kind, started_at); - permit + Ok(permit) } - async fn owned_permit(&self, kind: RequestKind) -> tokio::sync::OwnedSemaphorePermit { + async fn owned_permit( + &self, + kind: RequestKind, + cancel: &CancellationToken, + ) -> Result { let started_at = start_counting_cancelled_wait(kind); - let permit = self - .concurrency_limiter - .acquire_owned(kind) - .await - .expect("semaphore is never closed"); + let acquire = self.concurrency_limiter.acquire_owned(kind); + + let permit = tokio::select! 
{ + permit = acquire => permit.expect("semaphore is never closed"), + _ = cancel.cancelled() => return Err(Cancelled), + }; let started_at = ScopeGuard::into_inner(started_at); metrics::BUCKET_METRICS .wait_seconds .observe_elapsed(kind, started_at); - permit + Ok(permit) } - async fn download_object(&self, request: GetObjectRequest) -> Result { + async fn download_object( + &self, + request: GetObjectRequest, + cancel: &CancellationToken, + ) -> Result { let kind = RequestKind::Get; - let permit = self.owned_permit(kind).await; + + let permit = self.owned_permit(kind, cancel).await?; let started_at = start_measuring_requests(kind); @@ -227,29 +246,18 @@ impl S3Bucket { .bucket(request.bucket) .key(request.key) .set_range(request.range) - .send() - .await; + .send(); + + let get_object = tokio::select! { + res = get_object => res, + _ = tokio::time::sleep(self.timeout) => return Err(DownloadError::Timeout), + _ = cancel.cancelled() => return Err(DownloadError::Cancelled), + }; let started_at = ScopeGuard::into_inner(started_at); - match get_object { - Ok(object_output) => { - let metadata = object_output.metadata().cloned().map(StorageMetadata); - let etag = object_output.e_tag.clone(); - let last_modified = object_output.last_modified.and_then(|t| t.try_into().ok()); - - let body = object_output.body; - let body = ByteStreamAsStream::from(body); - let body = PermitCarrying::new(permit, body); - let body = TimedDownload::new(started_at, body); - - Ok(Download { - metadata, - etag, - last_modified, - download_stream: Box::pin(body), - }) - } + let object_output = match get_object { + Ok(object_output) => object_output, Err(SdkError::ServiceError(e)) if matches!(e.err(), GetObjectError::NoSuchKey(_)) => { // Count this in the AttemptOutcome::Ok bucket, because 404 is not // an error: we expect to sometimes fetch an object and find it missing, @@ -259,7 +267,7 @@ impl S3Bucket { AttemptOutcome::Ok, started_at, ); - Err(DownloadError::NotFound) + return 
Err(DownloadError::NotFound); } Err(e) => { metrics::BUCKET_METRICS.req_seconds.observe_elapsed( @@ -268,42 +276,76 @@ impl S3Bucket { started_at, ); - Err(DownloadError::Other( + return Err(DownloadError::Other( anyhow::Error::new(e).context("download s3 object"), - )) + )); } - } + }; + + // even if we would have no timeout left, continue anyways. the caller can decide to ignore + // the errors considering timeouts and cancellation. + let remaining = self.timeout.saturating_sub(started_at.elapsed()); + + let metadata = object_output.metadata().cloned().map(StorageMetadata); + let etag = object_output.e_tag; + let last_modified = object_output.last_modified.and_then(|t| t.try_into().ok()); + + let body = object_output.body; + let body = ByteStreamAsStream::from(body); + let body = PermitCarrying::new(permit, body); + let body = TimedDownload::new(started_at, body); + + let cancel_or_timeout = crate::support::cancel_or_timeout(remaining, cancel.clone()); + let body = crate::support::DownloadStream::new(cancel_or_timeout, body); + + Ok(Download { + metadata, + etag, + last_modified, + download_stream: Box::pin(body), + }) } async fn delete_oids( &self, - kind: RequestKind, + _permit: &tokio::sync::SemaphorePermit<'_>, delete_objects: &[ObjectIdentifier], + cancel: &CancellationToken, ) -> anyhow::Result<()> { + let kind = RequestKind::Delete; + let mut cancel = std::pin::pin!(cancel.cancelled()); + for chunk in delete_objects.chunks(MAX_KEYS_PER_DELETE) { let started_at = start_measuring_requests(kind); - let resp = self + let req = self .client .delete_objects() .bucket(self.bucket_name.clone()) .delete( Delete::builder() .set_objects(Some(chunk.to_vec())) - .build()?, + .build() + .context("build request")?, ) - .send() - .await; + .send(); + + let resp = tokio::select! 
{ + resp = req => resp, + _ = tokio::time::sleep(self.timeout) => return Err(TimeoutOrCancel::Timeout.into()), + _ = &mut cancel => return Err(TimeoutOrCancel::Cancel.into()), + }; let started_at = ScopeGuard::into_inner(started_at); metrics::BUCKET_METRICS .req_seconds .observe_elapsed(kind, &resp, started_at); - let resp = resp?; + let resp = resp.context("request deletion")?; metrics::BUCKET_METRICS .deleted_objects_total .inc_by(chunk.len() as u64); + if let Some(errors) = resp.errors { // Log a bounded number of the errors within the response: // these requests can carry 1000 keys so logging each one @@ -319,9 +361,10 @@ impl S3Bucket { ); } - return Err(anyhow::format_err!( - "Failed to delete {} objects", - errors.len() + return Err(anyhow::anyhow!( + "Failed to delete {}/{} objects", + errors.len(), + chunk.len(), )); } } @@ -354,33 +397,6 @@ impl Stream for ByteStreamAsStream { // sense and Stream::size_hint does not really } -pin_project_lite::pin_project! { - /// An `AsyncRead` adapter which carries a permit for the lifetime of the value. - struct PermitCarrying { - permit: tokio::sync::OwnedSemaphorePermit, - #[pin] - inner: S, - } -} - -impl PermitCarrying { - fn new(permit: tokio::sync::OwnedSemaphorePermit, inner: S) -> Self { - Self { permit, inner } - } -} - -impl>> Stream for PermitCarrying { - type Item = ::Item; - - fn poll_next(self: Pin<&mut Self>, cx: &mut Context<'_>) -> Poll> { - self.project().inner.poll_next(cx) - } - - fn size_hint(&self) -> (usize, Option) { - self.inner.size_hint() - } -} - pin_project_lite::pin_project! { /// Times and tracks the outcome of the request. 
struct TimedDownload { @@ -435,8 +451,12 @@ impl RemoteStorage for S3Bucket { &self, prefix: Option<&RemotePath>, mode: ListingMode, + max_keys: Option, + cancel: &CancellationToken, ) -> Result { let kind = RequestKind::List; + // s3 sdk wants i32 + let mut max_keys = max_keys.map(|mk| mk.get() as i32); let mut result = Listing::default(); // get the passed prefix or if it is not set use prefix_in_bucket value @@ -454,27 +474,41 @@ impl RemoteStorage for S3Bucket { p }); + let _permit = self.permit(kind, cancel).await?; + let mut continuation_token = None; loop { - let _guard = self.permit(kind).await; let started_at = start_measuring_requests(kind); + // min of two Options, returning Some if one is value and another is + // None (None is smaller than anything, so plain min doesn't work). + let request_max_keys = self + .max_keys_per_list_response + .into_iter() + .chain(max_keys.into_iter()) + .min(); let mut request = self .client .list_objects_v2() .bucket(self.bucket_name.clone()) .set_prefix(list_prefix.clone()) .set_continuation_token(continuation_token) - .set_max_keys(self.max_keys_per_list_response); + .set_max_keys(request_max_keys); if let ListingMode::WithDelimiter = mode { request = request.delimiter(REMOTE_STORAGE_PREFIX_SEPARATOR.to_string()); } - let response = request - .send() - .await + let request = request.send(); + + let response = tokio::select! 
{ + res = request => res, + _ = tokio::time::sleep(self.timeout) => return Err(DownloadError::Timeout), + _ = cancel.cancelled() => return Err(DownloadError::Cancelled), + }; + + let response = response .context("Failed to list S3 prefixes") .map_err(DownloadError::Other); @@ -496,6 +530,14 @@ impl RemoteStorage for S3Bucket { let object_path = object.key().expect("response does not contain a key"); let remote_path = self.s3_object_to_relative_path(object_path); result.keys.push(remote_path); + if let Some(mut mk) = max_keys { + assert!(mk > 0); + mk -= 1; + if mk == 0 { + return Ok(result); // limit reached + } + max_keys = Some(mk); + } } result.prefixes.extend( @@ -519,16 +561,17 @@ impl RemoteStorage for S3Bucket { from_size_bytes: usize, to: &RemotePath, metadata: Option, + cancel: &CancellationToken, ) -> anyhow::Result<()> { let kind = RequestKind::Put; - let _guard = self.permit(kind).await; + let _permit = self.permit(kind, cancel).await?; let started_at = start_measuring_requests(kind); let body = Body::wrap_stream(from); let bytes_stream = ByteStream::new(SdkBody::from_body_0_4(body)); - let res = self + let upload = self .client .put_object() .bucket(self.bucket_name.clone()) @@ -536,22 +579,40 @@ impl RemoteStorage for S3Bucket { .set_metadata(metadata.map(|m| m.0)) .content_length(from_size_bytes.try_into()?) .body(bytes_stream) - .send() - .await; + .send(); - let started_at = ScopeGuard::into_inner(started_at); - metrics::BUCKET_METRICS - .req_seconds - .observe_elapsed(kind, &res, started_at); + let upload = tokio::time::timeout(self.timeout, upload); - res?; + let res = tokio::select! { + res = upload => res, + _ = cancel.cancelled() => return Err(TimeoutOrCancel::Cancel.into()), + }; - Ok(()) + if let Ok(inner) = &res { + // do not incl. 
timeouts as errors in metrics but cancellations + let started_at = ScopeGuard::into_inner(started_at); + metrics::BUCKET_METRICS + .req_seconds + .observe_elapsed(kind, inner, started_at); + } + + match res { + Ok(Ok(_put)) => Ok(()), + Ok(Err(sdk)) => Err(sdk.into()), + Err(_timeout) => Err(TimeoutOrCancel::Timeout.into()), + } } - async fn copy(&self, from: &RemotePath, to: &RemotePath) -> anyhow::Result<()> { + async fn copy( + &self, + from: &RemotePath, + to: &RemotePath, + cancel: &CancellationToken, + ) -> anyhow::Result<()> { let kind = RequestKind::Copy; - let _guard = self.permit(kind).await; + let _permit = self.permit(kind, cancel).await?; + + let timeout = tokio::time::sleep(self.timeout); let started_at = start_measuring_requests(kind); @@ -562,14 +623,19 @@ impl RemoteStorage for S3Bucket { self.relative_path_to_s3_object(from) ); - let res = self + let op = self .client .copy_object() .bucket(self.bucket_name.clone()) .key(self.relative_path_to_s3_object(to)) .copy_source(copy_source) - .send() - .await; + .send(); + + let res = tokio::select! 
{ + res = op => res, + _ = timeout => return Err(TimeoutOrCancel::Timeout.into()), + _ = cancel.cancelled() => return Err(TimeoutOrCancel::Cancel.into()), + }; let started_at = ScopeGuard::into_inner(started_at); metrics::BUCKET_METRICS @@ -581,14 +647,21 @@ impl RemoteStorage for S3Bucket { Ok(()) } - async fn download(&self, from: &RemotePath) -> Result { + async fn download( + &self, + from: &RemotePath, + cancel: &CancellationToken, + ) -> Result { // if prefix is not none then download file `prefix/from` // if prefix is none then download file `from` - self.download_object(GetObjectRequest { - bucket: self.bucket_name.clone(), - key: self.relative_path_to_s3_object(from), - range: None, - }) + self.download_object( + GetObjectRequest { + bucket: self.bucket_name.clone(), + key: self.relative_path_to_s3_object(from), + range: None, + }, + cancel, + ) .await } @@ -597,6 +670,7 @@ impl RemoteStorage for S3Bucket { from: &RemotePath, start_inclusive: u64, end_exclusive: Option, + cancel: &CancellationToken, ) -> Result { // S3 accepts ranges as https://www.w3.org/Protocols/rfc2616/rfc2616-sec14.html#sec14.35 // and needs both ends to be exclusive @@ -606,31 +680,39 @@ impl RemoteStorage for S3Bucket { None => format!("bytes={start_inclusive}-"), }); - self.download_object(GetObjectRequest { - bucket: self.bucket_name.clone(), - key: self.relative_path_to_s3_object(from), - range, - }) + self.download_object( + GetObjectRequest { + bucket: self.bucket_name.clone(), + key: self.relative_path_to_s3_object(from), + range, + }, + cancel, + ) .await } - async fn delete_objects<'a>(&self, paths: &'a [RemotePath]) -> anyhow::Result<()> { - let kind = RequestKind::Delete; - let _guard = self.permit(kind).await; + async fn delete_objects<'a>( + &self, + paths: &'a [RemotePath], + cancel: &CancellationToken, + ) -> anyhow::Result<()> { + let kind = RequestKind::Delete; + let permit = self.permit(kind, cancel).await?; let mut delete_objects = Vec::with_capacity(paths.len()); 
for path in paths { let obj_id = ObjectIdentifier::builder() .set_key(Some(self.relative_path_to_s3_object(path))) - .build()?; + .build() + .context("convert path to oid")?; delete_objects.push(obj_id); } - self.delete_oids(kind, &delete_objects).await + self.delete_oids(&permit, &delete_objects, cancel).await } - async fn delete(&self, path: &RemotePath) -> anyhow::Result<()> { + async fn delete(&self, path: &RemotePath, cancel: &CancellationToken) -> anyhow::Result<()> { let paths = std::array::from_ref(path); - self.delete_objects(paths).await + self.delete_objects(paths, cancel).await } async fn time_travel_recover( @@ -638,10 +720,10 @@ impl RemoteStorage for S3Bucket { prefix: Option<&RemotePath>, timestamp: SystemTime, done_if_after: SystemTime, - cancel: CancellationToken, - ) -> anyhow::Result<()> { + cancel: &CancellationToken, + ) -> Result<(), TimeTravelError> { let kind = RequestKind::TimeTravel; - let _guard = self.permit(kind).await; + let permit = self.permit(kind, cancel).await?; let timestamp = DateTime::from(timestamp); let done_if_after = DateTime::from(done_if_after); @@ -655,77 +737,120 @@ impl RemoteStorage for S3Bucket { let warn_threshold = 3; let max_retries = 10; - let is_permanent = |_e: &_| false; + let is_permanent = |e: &_| matches!(e, TimeTravelError::Cancelled); - let list = backoff::retry( - || async { - Ok(self - .client - .list_object_versions() - .bucket(self.bucket_name.clone()) - .set_prefix(prefix.clone()) - .send() - .await?) 
- }, - is_permanent, - warn_threshold, - max_retries, - "listing object versions for time_travel_recover", - backoff::Cancel::new(cancel.clone(), || anyhow!("Cancelled")), - ) - .await?; + let mut key_marker = None; + let mut version_id_marker = None; + let mut versions_and_deletes = Vec::new(); - if list.is_truncated().unwrap_or_default() { - anyhow::bail!("Received truncated ListObjectVersions response for prefix={prefix:?}"); + loop { + let response = backoff::retry( + || async { + let op = self + .client + .list_object_versions() + .bucket(self.bucket_name.clone()) + .set_prefix(prefix.clone()) + .set_key_marker(key_marker.clone()) + .set_version_id_marker(version_id_marker.clone()) + .send(); + + tokio::select! { + res = op => res.map_err(|e| TimeTravelError::Other(e.into())), + _ = cancel.cancelled() => Err(TimeTravelError::Cancelled), + } + }, + is_permanent, + warn_threshold, + max_retries, + "listing object versions for time_travel_recover", + cancel, + ) + .await + .ok_or_else(|| TimeTravelError::Cancelled) + .and_then(|x| x)?; + + tracing::trace!( + " Got List response version_id_marker={:?}, key_marker={:?}", + response.version_id_marker, + response.key_marker + ); + let versions = response + .versions + .unwrap_or_default() + .into_iter() + .map(VerOrDelete::from_version); + let deletes = response + .delete_markers + .unwrap_or_default() + .into_iter() + .map(VerOrDelete::from_delete_marker); + itertools::process_results(versions.chain(deletes), |n_vds| { + versions_and_deletes.extend(n_vds) + }) + .map_err(TimeTravelError::Other)?; + fn none_if_empty(v: Option) -> Option { + v.filter(|v| !v.is_empty()) + } + version_id_marker = none_if_empty(response.next_version_id_marker); + key_marker = none_if_empty(response.next_key_marker); + if version_id_marker.is_none() { + // The final response is not supposed to be truncated + if response.is_truncated.unwrap_or_default() { + return Err(TimeTravelError::Other(anyhow::anyhow!( + "Received truncated 
ListObjectVersions response for prefix={prefix:?}" + ))); + } + break; + } + // Limit the number of versions deletions, mostly so that we don't + // keep requesting forever if the list is too long, as we'd put the + // list in RAM. + // Building a list of 100k entries that reaches the limit roughly takes + // 40 seconds, and roughly corresponds to tenants of 2 TiB physical size. + const COMPLEXITY_LIMIT: usize = 100_000; + if versions_and_deletes.len() >= COMPLEXITY_LIMIT { + return Err(TimeTravelError::TooManyVersions); + } } - let mut versions_deletes = list - .versions() - .iter() - .map(VerOrDelete::Version) - .chain(list.delete_markers().iter().map(VerOrDelete::DeleteMarker)) - .collect::>(); + tracing::info!( + "Built list for time travel with {} versions and deletions", + versions_and_deletes.len() + ); - versions_deletes.sort_by_key(|vd| (vd.key(), vd.last_modified())); + // Work on the list of references instead of the objects directly, + // otherwise we get lifetime errors in the sort_by_key call below. + let mut versions_and_deletes = versions_and_deletes.iter().collect::>(); + + versions_and_deletes.sort_by_key(|vd| (&vd.key, &vd.last_modified)); let mut vds_for_key = HashMap::<_, Vec<_>>::new(); - for vd in versions_deletes { - let last_modified = vd.last_modified(); - let version_id = vd.version_id(); - let key = vd.key(); - let (Some(last_modified), Some(version_id), Some(key)) = - (last_modified, version_id, key) - else { - anyhow::bail!( - "One (or more) of last_modified, key, and id is None. \ - Is versioning enabled in the bucket? last_modified={:?} key={:?} version_id={:?}", - last_modified, key, version_id, - ); - }; + for vd in &versions_and_deletes { + let VerOrDelete { + version_id, key, .. 
+ } = &vd; if version_id == "null" { - anyhow::bail!("Received ListVersions response for key={key} with version_id='null', \ - indicating either disabled versioning, or legacy objects with null version id values"); + return Err(TimeTravelError::Other(anyhow!("Received ListVersions response for key={key} with version_id='null', \ + indicating either disabled versioning, or legacy objects with null version id values"))); } tracing::trace!( - "Parsing version key={key} version_id={version_id} is_delete={}", - matches!(vd, VerOrDelete::DeleteMarker(_)) + "Parsing version key={key} version_id={version_id} kind={:?}", + vd.kind ); - vds_for_key - .entry(key) - .or_default() - .push((vd, last_modified, version_id)); + vds_for_key.entry(key).or_default().push(vd); } for (key, versions) in vds_for_key { - let (last_vd, last_last_modified, _version_id) = versions.last().unwrap(); - if last_last_modified > &&done_if_after { + let last_vd = versions.last().unwrap(); + if last_vd.last_modified > done_if_after { tracing::trace!("Key {key} has version later than done_if_after, skipping"); continue; } // the version we want to restore to. let version_to_restore_to = - match versions.binary_search_by_key(×tamp, |tpl| *tpl.1) { + match versions.binary_search_by_key(×tamp, |tpl| tpl.last_modified) { Ok(v) => v, Err(e) => e, }; @@ -743,7 +868,11 @@ impl RemoteStorage for S3Bucket { do_delete = true; } else { match &versions[version_to_restore_to - 1] { - (VerOrDelete::Version(_), _last_modified, version_id) => { + VerOrDelete { + kind: VerOrDeleteKind::Version, + version_id, + .. + } => { tracing::trace!("Copying old version {version_id} for {key}..."); // Restore the state to the last version by copying let source_id = @@ -751,37 +880,60 @@ impl RemoteStorage for S3Bucket { backoff::retry( || async { - Ok(self + let op = self .client .copy_object() .bucket(self.bucket_name.clone()) .key(key) .copy_source(&source_id) - .send() - .await?) + .send(); + + tokio::select! 
{ + res = op => res.map_err(|e| TimeTravelError::Other(e.into())), + _ = cancel.cancelled() => Err(TimeTravelError::Cancelled), + } }, is_permanent, warn_threshold, max_retries, - "listing object versions for time_travel_recover", - backoff::Cancel::new(cancel.clone(), || anyhow!("Cancelled")), + "copying object version for time_travel_recover", + cancel, ) - .await?; + .await + .ok_or_else(|| TimeTravelError::Cancelled) + .and_then(|x| x)?; + tracing::info!(%version_id, %key, "Copied old version in S3"); } - (VerOrDelete::DeleteMarker(_), _last_modified, _version_id) => { + VerOrDelete { + kind: VerOrDeleteKind::DeleteMarker, + .. + } => { do_delete = true; } } }; if do_delete { - if matches!(last_vd, VerOrDelete::DeleteMarker(_)) { + if matches!(last_vd.kind, VerOrDeleteKind::DeleteMarker) { // Key has since been deleted (but there was some history), no need to do anything tracing::trace!("Key {key} already deleted, skipping."); } else { tracing::trace!("Deleting {key}..."); - let oid = ObjectIdentifier::builder().key(key.to_owned()).build()?; - self.delete_oids(kind, &[oid]).await?; + let oid = ObjectIdentifier::builder() + .key(key.to_owned()) + .build() + .map_err(|e| TimeTravelError::Other(e.into()))?; + + self.delete_oids(&permit, &[oid], cancel) + .await + .map_err(|e| { + // delete_oid0 will use TimeoutOrCancel + if TimeoutOrCancel::caused_by_cancel(&e) { + TimeTravelError::Cancelled + } else { + TimeTravelError::Other(e) + } + })?; } } } @@ -811,29 +963,59 @@ fn start_measuring_requests( }) } -enum VerOrDelete<'a> { - Version(&'a ObjectVersion), - DeleteMarker(&'a DeleteMarkerEntry), +// Save RAM and only store the needed data instead of the entire ObjectVersion/DeleteMarkerEntry +struct VerOrDelete { + kind: VerOrDeleteKind, + last_modified: DateTime, + version_id: String, + key: String, } -impl<'a> VerOrDelete<'a> { - fn last_modified(&self) -> Option<&'a DateTime> { - match self { - VerOrDelete::Version(v) => v.last_modified(), - 
VerOrDelete::DeleteMarker(v) => v.last_modified(), - } +#[derive(Debug)] +enum VerOrDeleteKind { + Version, + DeleteMarker, +} + +impl VerOrDelete { + fn with_kind( + kind: VerOrDeleteKind, + last_modified: Option, + version_id: Option, + key: Option, + ) -> anyhow::Result { + let lvk = (last_modified, version_id, key); + let (Some(last_modified), Some(version_id), Some(key)) = lvk else { + anyhow::bail!( + "One (or more) of last_modified, key, and id is None. \ + Is versioning enabled in the bucket? last_modified={:?}, version_id={:?}, key={:?}", + lvk.0, + lvk.1, + lvk.2, + ); + }; + Ok(Self { + kind, + last_modified, + version_id, + key, + }) } - fn version_id(&self) -> Option<&'a str> { - match self { - VerOrDelete::Version(v) => v.version_id(), - VerOrDelete::DeleteMarker(v) => v.version_id(), - } + fn from_version(v: ObjectVersion) -> anyhow::Result { + Self::with_kind( + VerOrDeleteKind::Version, + v.last_modified, + v.version_id, + v.key, + ) } - fn key(&self) -> Option<&'a str> { - match self { - VerOrDelete::Version(v) => v.key(), - VerOrDelete::DeleteMarker(v) => v.key(), - } + fn from_delete_marker(v: DeleteMarkerEntry) -> anyhow::Result { + Self::with_kind( + VerOrDeleteKind::DeleteMarker, + v.last_modified, + v.version_id, + v.key, + ) } } @@ -858,7 +1040,7 @@ mod tests { Some("test/prefix/"), Some("/test/prefix/"), ]; - let expected_outputs = vec![ + let expected_outputs = [ vec!["", "some/path", "some/path"], vec!["/", "/some/path", "/some/path"], vec![ @@ -887,7 +1069,8 @@ mod tests { concurrency_limit: NonZeroUsize::new(100).unwrap(), max_keys_per_list_response: Some(5), }; - let storage = S3Bucket::new(&config).expect("remote storage init"); + let storage = + S3Bucket::new(&config, std::time::Duration::ZERO).expect("remote storage init"); for (test_path_idx, test_path) in all_paths.iter().enumerate() { let result = storage.relative_path_to_s3_object(test_path); let expected = expected_outputs[prefix_idx][test_path_idx]; diff --git 
a/libs/remote_storage/src/simulate_failures.rs b/libs/remote_storage/src/simulate_failures.rs index fc4c4b315b..f5344d3ae2 100644 --- a/libs/remote_storage/src/simulate_failures.rs +++ b/libs/remote_storage/src/simulate_failures.rs @@ -4,6 +4,7 @@ use bytes::Bytes; use futures::stream::Stream; use std::collections::HashMap; +use std::num::NonZeroU32; use std::sync::Mutex; use std::time::SystemTime; use std::{collections::hash_map::Entry, sync::Arc}; @@ -11,7 +12,7 @@ use tokio_util::sync::CancellationToken; use crate::{ Download, DownloadError, GenericRemoteStorage, Listing, ListingMode, RemotePath, RemoteStorage, - StorageMetadata, + StorageMetadata, TimeTravelError, }; pub struct UnreliableWrapper { @@ -60,7 +61,7 @@ impl UnreliableWrapper { /// On the first attempts of this operation, return an error. After 'attempts_to_fail' /// attempts, let the operation go ahead, and clear the counter. /// - fn attempt(&self, op: RemoteOp) -> Result { + fn attempt(&self, op: RemoteOp) -> anyhow::Result { let mut attempts = self.attempts.lock().unwrap(); match attempts.entry(op) { @@ -78,22 +79,27 @@ impl UnreliableWrapper { } else { let error = anyhow::anyhow!("simulated failure of remote operation {:?}", e.key()); - Err(DownloadError::Other(error)) + Err(error) } } Entry::Vacant(e) => { let error = anyhow::anyhow!("simulated failure of remote operation {:?}", e.key()); e.insert(1); - Err(DownloadError::Other(error)) + Err(error) } } } - async fn delete_inner(&self, path: &RemotePath, attempt: bool) -> anyhow::Result<()> { + async fn delete_inner( + &self, + path: &RemotePath, + attempt: bool, + cancel: &CancellationToken, + ) -> anyhow::Result<()> { if attempt { self.attempt(RemoteOp::Delete(path.clone()))?; } - self.inner.delete(path).await + self.inner.delete(path, cancel).await } } @@ -104,23 +110,34 @@ impl RemoteStorage for UnreliableWrapper { async fn list_prefixes( &self, prefix: Option<&RemotePath>, + cancel: &CancellationToken, ) -> Result, DownloadError> { - 
self.attempt(RemoteOp::ListPrefixes(prefix.cloned()))?; - self.inner.list_prefixes(prefix).await + self.attempt(RemoteOp::ListPrefixes(prefix.cloned())) + .map_err(DownloadError::Other)?; + self.inner.list_prefixes(prefix, cancel).await } - async fn list_files(&self, folder: Option<&RemotePath>) -> anyhow::Result> { - self.attempt(RemoteOp::ListPrefixes(folder.cloned()))?; - self.inner.list_files(folder).await + async fn list_files( + &self, + folder: Option<&RemotePath>, + max_keys: Option, + cancel: &CancellationToken, + ) -> Result, DownloadError> { + self.attempt(RemoteOp::ListPrefixes(folder.cloned())) + .map_err(DownloadError::Other)?; + self.inner.list_files(folder, max_keys, cancel).await } async fn list( &self, prefix: Option<&RemotePath>, mode: ListingMode, + max_keys: Option, + cancel: &CancellationToken, ) -> Result { - self.attempt(RemoteOp::ListPrefixes(prefix.cloned()))?; - self.inner.list(prefix, mode).await + self.attempt(RemoteOp::ListPrefixes(prefix.cloned())) + .map_err(DownloadError::Other)?; + self.inner.list(prefix, mode, max_keys, cancel).await } async fn upload( @@ -131,14 +148,22 @@ impl RemoteStorage for UnreliableWrapper { data_size_bytes: usize, to: &RemotePath, metadata: Option, + cancel: &CancellationToken, ) -> anyhow::Result<()> { self.attempt(RemoteOp::Upload(to.clone()))?; - self.inner.upload(data, data_size_bytes, to, metadata).await + self.inner + .upload(data, data_size_bytes, to, metadata, cancel) + .await } - async fn download(&self, from: &RemotePath) -> Result { - self.attempt(RemoteOp::Download(from.clone()))?; - self.inner.download(from).await + async fn download( + &self, + from: &RemotePath, + cancel: &CancellationToken, + ) -> Result { + self.attempt(RemoteOp::Download(from.clone())) + .map_err(DownloadError::Other)?; + self.inner.download(from, cancel).await } async fn download_byte_range( @@ -146,26 +171,32 @@ impl RemoteStorage for UnreliableWrapper { from: &RemotePath, start_inclusive: u64, end_exclusive: Option, + 
cancel: &CancellationToken, ) -> Result { // Note: We treat any download_byte_range as an "attempt" of the same // operation. We don't pay attention to the ranges. That's good enough // for now. - self.attempt(RemoteOp::Download(from.clone()))?; + self.attempt(RemoteOp::Download(from.clone())) + .map_err(DownloadError::Other)?; self.inner - .download_byte_range(from, start_inclusive, end_exclusive) + .download_byte_range(from, start_inclusive, end_exclusive, cancel) .await } - async fn delete(&self, path: &RemotePath) -> anyhow::Result<()> { - self.delete_inner(path, true).await + async fn delete(&self, path: &RemotePath, cancel: &CancellationToken) -> anyhow::Result<()> { + self.delete_inner(path, true, cancel).await } - async fn delete_objects<'a>(&self, paths: &'a [RemotePath]) -> anyhow::Result<()> { + async fn delete_objects<'a>( + &self, + paths: &'a [RemotePath], + cancel: &CancellationToken, + ) -> anyhow::Result<()> { self.attempt(RemoteOp::DeleteObjects(paths.to_vec()))?; let mut error_counter = 0; for path in paths { // Dont record attempt because it was already recorded above - if (self.delete_inner(path, false).await).is_err() { + if (self.delete_inner(path, false, cancel).await).is_err() { error_counter += 1; } } @@ -178,11 +209,16 @@ impl RemoteStorage for UnreliableWrapper { Ok(()) } - async fn copy(&self, from: &RemotePath, to: &RemotePath) -> anyhow::Result<()> { + async fn copy( + &self, + from: &RemotePath, + to: &RemotePath, + cancel: &CancellationToken, + ) -> anyhow::Result<()> { // copy is equivalent to download + upload self.attempt(RemoteOp::Download(from.clone()))?; self.attempt(RemoteOp::Upload(to.clone()))?; - self.inner.copy_object(from, to).await + self.inner.copy_object(from, to, cancel).await } async fn time_travel_recover( @@ -190,9 +226,10 @@ impl RemoteStorage for UnreliableWrapper { prefix: Option<&RemotePath>, timestamp: SystemTime, done_if_after: SystemTime, - cancel: CancellationToken, - ) -> anyhow::Result<()> { - 
self.attempt(RemoteOp::TimeTravelRecover(prefix.map(|p| p.to_owned())))?; + cancel: &CancellationToken, + ) -> Result<(), TimeTravelError> { + self.attempt(RemoteOp::TimeTravelRecover(prefix.map(|p| p.to_owned()))) + .map_err(TimeTravelError::Other)?; self.inner .time_travel_recover(prefix, timestamp, done_if_after, cancel) .await diff --git a/libs/remote_storage/src/support.rs b/libs/remote_storage/src/support.rs new file mode 100644 index 0000000000..d146b5445b --- /dev/null +++ b/libs/remote_storage/src/support.rs @@ -0,0 +1,175 @@ +use std::{ + future::Future, + pin::Pin, + task::{Context, Poll}, + time::Duration, +}; + +use bytes::Bytes; +use futures_util::Stream; +use tokio_util::sync::CancellationToken; + +use crate::TimeoutOrCancel; + +pin_project_lite::pin_project! { + /// An `AsyncRead` adapter which carries a permit for the lifetime of the value. + pub(crate) struct PermitCarrying { + permit: tokio::sync::OwnedSemaphorePermit, + #[pin] + inner: S, + } +} + +impl PermitCarrying { + pub(crate) fn new(permit: tokio::sync::OwnedSemaphorePermit, inner: S) -> Self { + Self { permit, inner } + } +} + +impl Stream for PermitCarrying { + type Item = ::Item; + + fn poll_next(self: Pin<&mut Self>, cx: &mut Context<'_>) -> Poll> { + self.project().inner.poll_next(cx) + } + + fn size_hint(&self) -> (usize, Option) { + self.inner.size_hint() + } +} + +pin_project_lite::pin_project! { + pub(crate) struct DownloadStream { + hit: bool, + #[pin] + cancellation: F, + #[pin] + inner: S, + } +} + +impl DownloadStream { + pub(crate) fn new(cancellation: F, inner: S) -> Self { + Self { + cancellation, + hit: false, + inner, + } + } +} + +/// See documentation on [`crate::DownloadStream`] on rationale why `std::io::Error` is used. 
+impl Stream for DownloadStream +where + std::io::Error: From, + F: Future, + S: Stream>, +{ + type Item = ::Item; + + fn poll_next(self: Pin<&mut Self>, cx: &mut Context<'_>) -> Poll> { + let this = self.project(); + + if !*this.hit { + if let Poll::Ready(e) = this.cancellation.poll(cx) { + *this.hit = true; + + // most likely this will be a std::io::Error wrapping a DownloadError + let e = Err(std::io::Error::from(e)); + return Poll::Ready(Some(e)); + } + } + + this.inner.poll_next(cx) + } + + fn size_hint(&self) -> (usize, Option) { + self.inner.size_hint() + } +} + +/// Fires only on the first cancel or timeout, not on both. +pub(crate) async fn cancel_or_timeout( + timeout: Duration, + cancel: CancellationToken, +) -> TimeoutOrCancel { + tokio::select! { + _ = tokio::time::sleep(timeout) => TimeoutOrCancel::Timeout, + _ = cancel.cancelled() => TimeoutOrCancel::Cancel, + } +} + +#[cfg(test)] +mod tests { + use super::*; + use crate::DownloadError; + use futures::stream::StreamExt; + + #[tokio::test(start_paused = true)] + async fn cancelled_download_stream() { + let inner = futures::stream::pending(); + let timeout = Duration::from_secs(120); + let cancel = CancellationToken::new(); + + let stream = DownloadStream::new(cancel_or_timeout(timeout, cancel.clone()), inner); + let mut stream = std::pin::pin!(stream); + + let mut first = stream.next(); + + tokio::select! { + _ = &mut first => unreachable!("we haven't yet cancelled nor is timeout passed"), + _ = tokio::time::sleep(Duration::from_secs(1)) => {}, + } + + cancel.cancel(); + + let e = first.await.expect("there must be some").unwrap_err(); + assert!(matches!(e.kind(), std::io::ErrorKind::Other), "{e:?}"); + let inner = e.get_ref().expect("inner should be set"); + assert!( + inner + .downcast_ref::() + .is_some_and(|e| matches!(e, DownloadError::Cancelled)), + "{inner:?}" + ); + let e = DownloadError::from(e); + assert!(matches!(e, DownloadError::Cancelled), "{e:?}"); + + tokio::select! 
{ + _ = stream.next() => unreachable!("no timeout ever happens as we were already cancelled"), + _ = tokio::time::sleep(Duration::from_secs(121)) => {}, + } + } + + #[tokio::test(start_paused = true)] + async fn timeouted_download_stream() { + let inner = futures::stream::pending(); + let timeout = Duration::from_secs(120); + let cancel = CancellationToken::new(); + + let stream = DownloadStream::new(cancel_or_timeout(timeout, cancel.clone()), inner); + let mut stream = std::pin::pin!(stream); + + // because the stream uses 120s timeout and we are paused, we advance to 120s right away. + let first = stream.next(); + + let e = first.await.expect("there must be some").unwrap_err(); + assert!(matches!(e.kind(), std::io::ErrorKind::Other), "{e:?}"); + let inner = e.get_ref().expect("inner should be set"); + assert!( + inner + .downcast_ref::() + .is_some_and(|e| matches!(e, DownloadError::Timeout)), + "{inner:?}" + ); + let e = DownloadError::from(e); + assert!(matches!(e, DownloadError::Timeout), "{e:?}"); + + cancel.cancel(); + + tokio::select! 
{ + _ = stream.next() => unreachable!("no cancellation ever happens because we already timed out"), + _ = tokio::time::sleep(Duration::from_secs(121)) => {}, + } + } +} diff --git a/libs/remote_storage/tests/common/mod.rs b/libs/remote_storage/tests/common/mod.rs index bca117ed1a..da9dc08d8d 100644 --- a/libs/remote_storage/tests/common/mod.rs +++ b/libs/remote_storage/tests/common/mod.rs @@ -10,6 +10,7 @@ use futures::stream::Stream; use once_cell::sync::OnceCell; use remote_storage::{Download, GenericRemoteStorage, RemotePath}; use tokio::task::JoinSet; +use tokio_util::sync::CancellationToken; use tracing::{debug, error, info}; static LOGGING_DONE: OnceCell<()> = OnceCell::new(); @@ -58,8 +59,12 @@ pub(crate) async fn upload_simple_remote_data( ) -> ControlFlow, HashSet> { info!("Creating {upload_tasks_count} remote files"); let mut upload_tasks = JoinSet::new(); + let cancel = CancellationToken::new(); + for i in 1..upload_tasks_count + 1 { let task_client = Arc::clone(client); + let cancel = cancel.clone(); + upload_tasks.spawn(async move { let blob_path = PathBuf::from(format!("folder{}/blob_{}.txt", i / 7, i)); let blob_path = RemotePath::new( @@ -69,7 +74,9 @@ pub(crate) async fn upload_simple_remote_data( debug!("Creating remote item {i} at path {blob_path:?}"); let (data, len) = upload_stream(format!("remote blob data {i}").into_bytes().into()); - task_client.upload(data, len, &blob_path, None).await?; + task_client + .upload(data, len, &blob_path, None, &cancel) + .await?; Ok::<_, anyhow::Error>(blob_path) }); @@ -107,13 +114,15 @@ pub(crate) async fn cleanup( "Removing {} objects from the remote storage during cleanup", objects_to_delete.len() ); + let cancel = CancellationToken::new(); let mut delete_tasks = JoinSet::new(); for object_to_delete in objects_to_delete { let task_client = Arc::clone(client); + let cancel = cancel.clone(); delete_tasks.spawn(async move { debug!("Deleting remote item at path {object_to_delete:?}"); task_client - 
.delete(&object_to_delete) + .delete(&object_to_delete, &cancel) .await .with_context(|| format!("{object_to_delete:?} removal")) }); @@ -141,8 +150,12 @@ pub(crate) async fn upload_remote_data( ) -> ControlFlow { info!("Creating {upload_tasks_count} remote files"); let mut upload_tasks = JoinSet::new(); + let cancel = CancellationToken::new(); + for i in 1..upload_tasks_count + 1 { let task_client = Arc::clone(client); + let cancel = cancel.clone(); + upload_tasks.spawn(async move { let prefix = format!("{base_prefix_str}/sub_prefix_{i}/"); let blob_prefix = RemotePath::new(Utf8Path::new(&prefix)) @@ -152,7 +165,9 @@ pub(crate) async fn upload_remote_data( let (data, data_len) = upload_stream(format!("remote blob data {i}").into_bytes().into()); - task_client.upload(data, data_len, &blob_path, None).await?; + task_client + .upload(data, data_len, &blob_path, None, &cancel) + .await?; Ok::<_, anyhow::Error>((blob_prefix, blob_path)) }); diff --git a/libs/remote_storage/tests/common/tests.rs b/libs/remote_storage/tests/common/tests.rs index abccc24c97..72f6f956e0 100644 --- a/libs/remote_storage/tests/common/tests.rs +++ b/libs/remote_storage/tests/common/tests.rs @@ -1,9 +1,10 @@ use anyhow::Context; use camino::Utf8Path; use remote_storage::RemotePath; -use std::collections::HashSet; use std::sync::Arc; +use std::{collections::HashSet, num::NonZeroU32}; use test_context::test_context; +use tokio_util::sync::CancellationToken; use tracing::debug; use crate::common::{download_to_vec, upload_stream, wrap_stream}; @@ -45,13 +46,15 @@ async fn pagination_should_work(ctx: &mut MaybeEnabledStorageWithTestBlobs) -> a } }; + let cancel = CancellationToken::new(); + let test_client = Arc::clone(&ctx.enabled.client); let expected_remote_prefixes = ctx.remote_prefixes.clone(); let base_prefix = RemotePath::new(Utf8Path::new(ctx.enabled.base_prefix)) .context("common_prefix construction")?; let root_remote_prefixes = test_client - .list_prefixes(None) + .list_prefixes(None, 
&cancel) .await .context("client list root prefixes failure")? .into_iter() @@ -62,7 +65,7 @@ async fn pagination_should_work(ctx: &mut MaybeEnabledStorageWithTestBlobs) -> a ); let nested_remote_prefixes = test_client - .list_prefixes(Some(&base_prefix)) + .list_prefixes(Some(&base_prefix), &cancel) .await .context("client list nested prefixes failure")? .into_iter() @@ -99,11 +102,12 @@ async fn list_files_works(ctx: &mut MaybeEnabledStorageWithSimpleTestBlobs) -> a anyhow::bail!("S3 init failed: {e:?}") } }; + let cancel = CancellationToken::new(); let test_client = Arc::clone(&ctx.enabled.client); let base_prefix = RemotePath::new(Utf8Path::new("folder1")).context("common_prefix construction")?; let root_files = test_client - .list_files(None) + .list_files(None, None, &cancel) .await .context("client list root files failure")? .into_iter() @@ -113,8 +117,17 @@ async fn list_files_works(ctx: &mut MaybeEnabledStorageWithSimpleTestBlobs) -> a ctx.remote_blobs.clone(), "remote storage list_files on root mismatches with the uploads." ); + + // Test that max_keys limit works. In total there are about 21 files (see + // upload_simple_remote_data call in test_real_s3.rs). + let limited_root_files = test_client + .list_files(None, Some(NonZeroU32::new(2).unwrap()), &cancel) + .await + .context("client list root files failure")?; + assert_eq!(limited_root_files.len(), 2); + let nested_remote_files = test_client - .list_files(Some(&base_prefix)) + .list_files(Some(&base_prefix), None, &cancel) .await .context("client list nested files failure")? 
.into_iter() @@ -141,12 +154,17 @@ async fn delete_non_exising_works(ctx: &mut MaybeEnabledStorage) -> anyhow::Resu MaybeEnabledStorage::Disabled => return Ok(()), }; + let cancel = CancellationToken::new(); + let path = RemotePath::new(Utf8Path::new( format!("{}/for_sure_there_is_nothing_there_really", ctx.base_prefix).as_str(), )) .with_context(|| "RemotePath conversion")?; - ctx.client.delete(&path).await.expect("should succeed"); + ctx.client + .delete(&path, &cancel) + .await + .expect("should succeed"); Ok(()) } @@ -159,6 +177,8 @@ async fn delete_objects_works(ctx: &mut MaybeEnabledStorage) -> anyhow::Result<( MaybeEnabledStorage::Disabled => return Ok(()), }; + let cancel = CancellationToken::new(); + let path1 = RemotePath::new(Utf8Path::new(format!("{}/path1", ctx.base_prefix).as_str())) .with_context(|| "RemotePath conversion")?; @@ -169,21 +189,21 @@ async fn delete_objects_works(ctx: &mut MaybeEnabledStorage) -> anyhow::Result<( .with_context(|| "RemotePath conversion")?; let (data, len) = upload_stream("remote blob data1".as_bytes().into()); - ctx.client.upload(data, len, &path1, None).await?; + ctx.client.upload(data, len, &path1, None, &cancel).await?; let (data, len) = upload_stream("remote blob data2".as_bytes().into()); - ctx.client.upload(data, len, &path2, None).await?; + ctx.client.upload(data, len, &path2, None, &cancel).await?; let (data, len) = upload_stream("remote blob data3".as_bytes().into()); - ctx.client.upload(data, len, &path3, None).await?; + ctx.client.upload(data, len, &path3, None, &cancel).await?; - ctx.client.delete_objects(&[path1, path2]).await?; + ctx.client.delete_objects(&[path1, path2], &cancel).await?; - let prefixes = ctx.client.list_prefixes(None).await?; + let prefixes = ctx.client.list_prefixes(None, &cancel).await?; assert_eq!(prefixes.len(), 1); - ctx.client.delete_objects(&[path3]).await?; + ctx.client.delete_objects(&[path3], &cancel).await?; Ok(()) } @@ -195,6 +215,8 @@ async fn upload_download_works(ctx: &mut 
MaybeEnabledStorage) -> anyhow::Result< return Ok(()); }; + let cancel = CancellationToken::new(); + let path = RemotePath::new(Utf8Path::new(format!("{}/file", ctx.base_prefix).as_str())) .with_context(|| "RemotePath conversion")?; @@ -202,47 +224,56 @@ async fn upload_download_works(ctx: &mut MaybeEnabledStorage) -> anyhow::Result< let (data, len) = wrap_stream(orig.clone()); - ctx.client.upload(data, len, &path, None).await?; + ctx.client.upload(data, len, &path, None, &cancel).await?; // Normal download request - let dl = ctx.client.download(&path).await?; + let dl = ctx.client.download(&path, &cancel).await?; let buf = download_to_vec(dl).await?; assert_eq!(&buf, &orig); // Full range (end specified) let dl = ctx .client - .download_byte_range(&path, 0, Some(len as u64)) + .download_byte_range(&path, 0, Some(len as u64), &cancel) .await?; let buf = download_to_vec(dl).await?; assert_eq!(&buf, &orig); // partial range (end specified) - let dl = ctx.client.download_byte_range(&path, 4, Some(10)).await?; + let dl = ctx + .client + .download_byte_range(&path, 4, Some(10), &cancel) + .await?; let buf = download_to_vec(dl).await?; assert_eq!(&buf, &orig[4..10]); // partial range (end beyond real end) let dl = ctx .client - .download_byte_range(&path, 8, Some(len as u64 * 100)) + .download_byte_range(&path, 8, Some(len as u64 * 100), &cancel) .await?; let buf = download_to_vec(dl).await?; assert_eq!(&buf, &orig[8..]); // Partial range (end unspecified) - let dl = ctx.client.download_byte_range(&path, 4, None).await?; + let dl = ctx + .client + .download_byte_range(&path, 4, None, &cancel) + .await?; let buf = download_to_vec(dl).await?; assert_eq!(&buf, &orig[4..]); // Full range (end unspecified) - let dl = ctx.client.download_byte_range(&path, 0, None).await?; + let dl = ctx + .client + .download_byte_range(&path, 0, None, &cancel) + .await?; let buf = download_to_vec(dl).await?; assert_eq!(&buf, &orig); debug!("Cleanup: deleting file at path {path:?}"); ctx.client 
- .delete(&path) + .delete(&path, &cancel) .await .with_context(|| format!("{path:?} removal"))?; @@ -256,6 +287,8 @@ async fn copy_works(ctx: &mut MaybeEnabledStorage) -> anyhow::Result<()> { return Ok(()); }; + let cancel = CancellationToken::new(); + let path = RemotePath::new(Utf8Path::new( format!("{}/file_to_copy", ctx.base_prefix).as_str(), )) @@ -269,18 +302,18 @@ async fn copy_works(ctx: &mut MaybeEnabledStorage) -> anyhow::Result<()> { let (data, len) = wrap_stream(orig.clone()); - ctx.client.upload(data, len, &path, None).await?; + ctx.client.upload(data, len, &path, None, &cancel).await?; // Normal download request - ctx.client.copy_object(&path, &path_dest).await?; + ctx.client.copy_object(&path, &path_dest, &cancel).await?; - let dl = ctx.client.download(&path_dest).await?; + let dl = ctx.client.download(&path_dest, &cancel).await?; let buf = download_to_vec(dl).await?; assert_eq!(&buf, &orig); debug!("Cleanup: deleting file at path {path:?}"); ctx.client - .delete_objects(&[path.clone(), path_dest.clone()]) + .delete_objects(&[path.clone(), path_dest.clone()], &cancel) .await .with_context(|| format!("{path:?} removal"))?; diff --git a/libs/remote_storage/tests/test_real_azure.rs b/libs/remote_storage/tests/test_real_azure.rs index 6f9a1ec6f7..6adddf52a9 100644 --- a/libs/remote_storage/tests/test_real_azure.rs +++ b/libs/remote_storage/tests/test_real_azure.rs @@ -1,9 +1,9 @@ -use std::collections::HashSet; use std::env; use std::num::NonZeroUsize; use std::ops::ControlFlow; use std::sync::Arc; use std::time::UNIX_EPOCH; +use std::{collections::HashSet, time::Duration}; use anyhow::Context; use remote_storage::{ @@ -39,6 +39,17 @@ impl EnabledAzure { base_prefix: BASE_PREFIX, } } + + #[allow(unused)] // this will be needed when moving the timeout integration tests back + fn configure_request_timeout(&mut self, timeout: Duration) { + match Arc::get_mut(&mut self.client).expect("outer Arc::get_mut") { + GenericRemoteStorage::AzureBlob(azure) => { + 
let azure = Arc::get_mut(azure).expect("inner Arc::get_mut"); + azure.timeout = timeout; + } + _ => unreachable!(), + } + } } enum MaybeEnabledStorage { @@ -213,6 +224,7 @@ fn create_azure_client( concurrency_limit: NonZeroUsize::new(100).unwrap(), max_keys_per_list_response, }), + timeout: Duration::from_secs(120), }; Ok(Arc::new( GenericRemoteStorage::from_config(&remote_storage_config).context("remote storage init")?, diff --git a/libs/remote_storage/tests/test_real_s3.rs b/libs/remote_storage/tests/test_real_s3.rs index 9e1b989e4d..d8b9824d99 100644 --- a/libs/remote_storage/tests/test_real_s3.rs +++ b/libs/remote_storage/tests/test_real_s3.rs @@ -1,4 +1,6 @@ use std::env; +use std::fmt::{Debug, Display}; +use std::future::Future; use std::num::NonZeroUsize; use std::ops::ControlFlow; use std::sync::Arc; @@ -8,11 +10,14 @@ use std::{collections::HashSet, time::SystemTime}; use crate::common::{download_to_vec, upload_stream}; use anyhow::Context; use camino::Utf8Path; +use futures_util::StreamExt; use remote_storage::{ - GenericRemoteStorage, RemotePath, RemoteStorageConfig, RemoteStorageKind, S3Config, + DownloadError, GenericRemoteStorage, RemotePath, RemoteStorageConfig, RemoteStorageKind, + S3Config, }; use test_context::test_context; use test_context::AsyncTestContext; +use tokio::io::AsyncBufReadExt; use tokio_util::sync::CancellationToken; use tracing::info; @@ -22,9 +27,9 @@ mod common; mod tests_s3; use common::{cleanup, ensure_logging_ready, upload_remote_data, upload_simple_remote_data}; +use utils::backoff; const ENABLE_REAL_S3_REMOTE_STORAGE_ENV_VAR_NAME: &str = "ENABLE_REAL_S3_REMOTE_STORAGE"; - const BASE_PREFIX: &str = "test"; #[test_context(MaybeEnabledStorage)] @@ -39,6 +44,26 @@ async fn s3_time_travel_recovery_works(ctx: &mut MaybeEnabledStorage) -> anyhow: // to take the time from S3 response headers. 
const WAIT_TIME: Duration = Duration::from_millis(3_000); + async fn retry(op: O) -> Result + where + E: Display + Debug + 'static, + O: FnMut() -> F, + F: Future>, + { + let warn_threshold = 3; + let max_retries = 10; + backoff::retry( + op, + |_e| false, + warn_threshold, + max_retries, + "test retry", + &CancellationToken::new(), + ) + .await + .expect("never cancelled") + } + async fn time_point() -> SystemTime { tokio::time::sleep(WAIT_TIME).await; let ret = SystemTime::now(); @@ -46,15 +71,19 @@ async fn s3_time_travel_recovery_works(ctx: &mut MaybeEnabledStorage) -> anyhow: ret } - async fn list_files(client: &Arc) -> anyhow::Result> { - Ok(client - .list_files(None) + async fn list_files( + client: &Arc, + cancel: &CancellationToken, + ) -> anyhow::Result> { + Ok(retry(|| client.list_files(None, None, cancel)) .await .context("list root files failure")? .into_iter() .collect::>()) } + let cancel = CancellationToken::new(); + let path1 = RemotePath::new(Utf8Path::new(format!("{}/path1", ctx.base_prefix).as_str())) .with_context(|| "RemotePath conversion")?; @@ -64,24 +93,31 @@ async fn s3_time_travel_recovery_works(ctx: &mut MaybeEnabledStorage) -> anyhow: let path3 = RemotePath::new(Utf8Path::new(format!("{}/path3", ctx.base_prefix).as_str())) .with_context(|| "RemotePath conversion")?; - let (data, len) = upload_stream("remote blob data1".as_bytes().into()); - ctx.client.upload(data, len, &path1, None).await?; + retry(|| { + let (data, len) = upload_stream("remote blob data1".as_bytes().into()); + ctx.client.upload(data, len, &path1, None, &cancel) + }) + .await?; - let t0_files = list_files(&ctx.client).await?; + let t0_files = list_files(&ctx.client, &cancel).await?; let t0 = time_point().await; println!("at t0: {t0_files:?}"); let old_data = "remote blob data2"; - let (data, len) = upload_stream(old_data.as_bytes().into()); - ctx.client.upload(data, len, &path2, None).await?; - let t1_files = list_files(&ctx.client).await?; + retry(|| { + let (data, 
len) = upload_stream(old_data.as_bytes().into()); + ctx.client.upload(data, len, &path2, None, &cancel) + }) + .await?; + + let t1_files = list_files(&ctx.client, &cancel).await?; let t1 = time_point().await; println!("at t1: {t1_files:?}"); // A little check to ensure that our clock is not too far off from the S3 clock { - let dl = ctx.client.download(&path2).await?; + let dl = retry(|| ctx.client.download(&path2, &cancel)).await?; let last_modified = dl.last_modified.unwrap(); let half_wt = WAIT_TIME.mul_f32(0.5); let t0_hwt = t0 + half_wt; @@ -92,52 +128,60 @@ async fn s3_time_travel_recovery_works(ctx: &mut MaybeEnabledStorage) -> anyhow: } } - let (data, len) = upload_stream("remote blob data3".as_bytes().into()); - ctx.client.upload(data, len, &path3, None).await?; + retry(|| { + let (data, len) = upload_stream("remote blob data3".as_bytes().into()); + ctx.client.upload(data, len, &path3, None, &cancel) + }) + .await?; let new_data = "new remote blob data2"; - let (data, len) = upload_stream(new_data.as_bytes().into()); - ctx.client.upload(data, len, &path2, None).await?; - ctx.client.delete(&path1).await?; + retry(|| { + let (data, len) = upload_stream(new_data.as_bytes().into()); + ctx.client.upload(data, len, &path2, None, &cancel) + }) + .await?; - let t2_files = list_files(&ctx.client).await?; + retry(|| ctx.client.delete(&path1, &cancel)).await?; + let t2_files = list_files(&ctx.client, &cancel).await?; let t2 = time_point().await; println!("at t2: {t2_files:?}"); // No changes after recovery to t2 (no-op) let t_final = time_point().await; ctx.client - .time_travel_recover(None, t2, t_final, CancellationToken::new()) + .time_travel_recover(None, t2, t_final, &cancel) .await?; - let t2_files_recovered = list_files(&ctx.client).await?; + let t2_files_recovered = list_files(&ctx.client, &cancel).await?; println!("after recovery to t2: {t2_files_recovered:?}"); assert_eq!(t2_files, t2_files_recovered); - let path2_recovered_t2 = 
download_to_vec(ctx.client.download(&path2).await?).await?; + let path2_recovered_t2 = download_to_vec(ctx.client.download(&path2, &cancel).await?).await?; assert_eq!(path2_recovered_t2, new_data.as_bytes()); // after recovery to t1: path1 is back, path2 has the old content let t_final = time_point().await; ctx.client - .time_travel_recover(None, t1, t_final, CancellationToken::new()) + .time_travel_recover(None, t1, t_final, &cancel) .await?; - let t1_files_recovered = list_files(&ctx.client).await?; + let t1_files_recovered = list_files(&ctx.client, &cancel).await?; println!("after recovery to t1: {t1_files_recovered:?}"); assert_eq!(t1_files, t1_files_recovered); - let path2_recovered_t1 = download_to_vec(ctx.client.download(&path2).await?).await?; + let path2_recovered_t1 = download_to_vec(ctx.client.download(&path2, &cancel).await?).await?; assert_eq!(path2_recovered_t1, old_data.as_bytes()); // after recovery to t0: everything is gone except for path1 let t_final = time_point().await; ctx.client - .time_travel_recover(None, t0, t_final, CancellationToken::new()) + .time_travel_recover(None, t0, t_final, &cancel) .await?; - let t0_files_recovered = list_files(&ctx.client).await?; + let t0_files_recovered = list_files(&ctx.client, &cancel).await?; println!("after recovery to t0: {t0_files_recovered:?}"); assert_eq!(t0_files, t0_files_recovered); // cleanup - ctx.client.delete_objects(&[path1, path2, path3]).await?; + + let paths = &[path1, path2, path3]; + retry(|| ctx.client.delete_objects(paths, &cancel)).await?; Ok(()) } @@ -158,6 +202,16 @@ impl EnabledS3 { base_prefix: BASE_PREFIX, } } + + fn configure_request_timeout(&mut self, timeout: Duration) { + match Arc::get_mut(&mut self.client).expect("outer Arc::get_mut") { + GenericRemoteStorage::AwsS3(s3) => { + let s3 = Arc::get_mut(s3).expect("inner Arc::get_mut"); + s3.timeout = timeout; + } + _ => unreachable!(), + } + } } enum MaybeEnabledStorage { @@ -331,8 +385,174 @@ fn create_s3_client( 
concurrency_limit: NonZeroUsize::new(100).unwrap(), max_keys_per_list_response, }), + timeout: RemoteStorageConfig::DEFAULT_TIMEOUT, }; Ok(Arc::new( GenericRemoteStorage::from_config(&remote_storage_config).context("remote storage init")?, )) } + +#[test_context(MaybeEnabledStorage)] +#[tokio::test] +async fn download_is_timeouted(ctx: &mut MaybeEnabledStorage) { + let MaybeEnabledStorage::Enabled(ctx) = ctx else { + return; + }; + + let cancel = CancellationToken::new(); + + let path = RemotePath::new(Utf8Path::new( + format!("{}/file_to_copy", ctx.base_prefix).as_str(), + )) + .unwrap(); + + let len = upload_large_enough_file(&ctx.client, &path, &cancel).await; + + let timeout = std::time::Duration::from_secs(5); + + ctx.configure_request_timeout(timeout); + + let started_at = std::time::Instant::now(); + let mut stream = ctx + .client + .download(&path, &cancel) + .await + .expect("download succeeds") + .download_stream; + + if started_at.elapsed().mul_f32(0.9) >= timeout { + tracing::warn!( + elapsed_ms = started_at.elapsed().as_millis(), + "timeout might be too low, consumed most of it during headers" + ); + } + + let first = stream + .next() + .await + .expect("should have the first blob") + .expect("should have succeeded"); + + tracing::info!(len = first.len(), "downloaded first chunk"); + + assert!( + first.len() < len, + "uploaded file is too small, we downloaded all on first chunk" + ); + + tokio::time::sleep(timeout).await; + + { + let started_at = std::time::Instant::now(); + let next = stream + .next() + .await + .expect("stream should not have ended yet"); + + tracing::info!( + next.is_err = next.is_err(), + elapsed_ms = started_at.elapsed().as_millis(), + "received item after timeout" + ); + + let e = next.expect_err("expected an error, but got a chunk?"); + + let inner = e.get_ref().expect("std::io::Error::inner should be set"); + assert!( + inner + .downcast_ref::() + .is_some_and(|e| matches!(e, DownloadError::Timeout)), + "{inner:?}" + ); + } + + 
ctx.configure_request_timeout(RemoteStorageConfig::DEFAULT_TIMEOUT); + + ctx.client.delete_objects(&[path], &cancel).await.unwrap() +} + +#[test_context(MaybeEnabledStorage)] +#[tokio::test] +async fn download_is_cancelled(ctx: &mut MaybeEnabledStorage) { + let MaybeEnabledStorage::Enabled(ctx) = ctx else { + return; + }; + + let cancel = CancellationToken::new(); + + let path = RemotePath::new(Utf8Path::new( + format!("{}/file_to_copy", ctx.base_prefix).as_str(), + )) + .unwrap(); + + let file_len = upload_large_enough_file(&ctx.client, &path, &cancel).await; + + { + let stream = ctx + .client + .download(&path, &cancel) + .await + .expect("download succeeds") + .download_stream; + + let mut reader = std::pin::pin!(tokio_util::io::StreamReader::new(stream)); + + let first = reader.fill_buf().await.expect("should have the first blob"); + + let len = first.len(); + tracing::info!(len, "downloaded first chunk"); + + assert!( + first.len() < file_len, + "uploaded file is too small, we downloaded all on first chunk" + ); + + reader.consume(len); + + cancel.cancel(); + + let next = reader.fill_buf().await; + + let e = next.expect_err("expected an error, but got a chunk?"); + + let inner = e.get_ref().expect("std::io::Error::inner should be set"); + assert!( + inner + .downcast_ref::() + .is_some_and(|e| matches!(e, DownloadError::Cancelled)), + "{inner:?}" + ); + + let e = DownloadError::from(e); + + assert!(matches!(e, DownloadError::Cancelled), "{e:?}"); + } + + let cancel = CancellationToken::new(); + + ctx.client.delete_objects(&[path], &cancel).await.unwrap(); +} + +/// Upload a long enough file so that we cannot download it in single chunk +/// +/// For s3 the first chunk seems to be less than 10kB, so this has a bit of a safety margin +async fn upload_large_enough_file( + client: &GenericRemoteStorage, + path: &RemotePath, + cancel: &CancellationToken, +) -> usize { + let header = bytes::Bytes::from_static("remote blob data content".as_bytes()); + let body = 
bytes::Bytes::from(vec![0u8; 1024]); + let contents = std::iter::once(header).chain(std::iter::repeat(body).take(128)); + + let len = contents.clone().fold(0, |acc, next| acc + next.len()); + + let contents = futures::stream::iter(contents.map(std::io::Result::Ok)); + + client + .upload(contents, len, path, None, cancel) + .await + .expect("upload succeeds"); + + len +} diff --git a/libs/utils/Cargo.toml b/libs/utils/Cargo.toml index 7dee7e3963..de27ae4e28 100644 --- a/libs/utils/Cargo.toml +++ b/libs/utils/Cargo.toml @@ -25,6 +25,7 @@ hyper = { workspace = true, features = ["full"] } fail.workspace = true futures = { workspace = true} jsonwebtoken.workspace = true +leaky-bucket.workspace = true nix.workspace = true once_cell.workspace = true pin-project-lite.workspace = true diff --git a/libs/utils/benches/benchmarks.rs b/libs/utils/benches/benchmarks.rs index 98d839ca55..44eb36387c 100644 --- a/libs/utils/benches/benchmarks.rs +++ b/libs/utils/benches/benchmarks.rs @@ -1,5 +1,3 @@ -#![allow(unused)] - use criterion::{criterion_group, criterion_main, Criterion}; use utils::id; diff --git a/libs/utils/src/auth.rs b/libs/utils/src/auth.rs index 66b1f6e866..03e65f74fe 100644 --- a/libs/utils/src/auth.rs +++ b/libs/utils/src/auth.rs @@ -1,7 +1,6 @@ // For details about authentication see docs/authentication.md use arc_swap::ArcSwap; -use serde; use std::{borrow::Cow, fmt::Display, fs, sync::Arc}; use anyhow::Result; @@ -29,6 +28,11 @@ pub enum Scope { // Should only be used e.g. for status check. // Currently also used for connection from any pageserver to any safekeeper. SafekeeperData, + // The scope used by pageservers in upcalls to storage controller and cloud control plane + #[serde(rename = "generations_api")] + GenerationsApi, + // Allows access to control plane managment API and some storage controller endpoints. + Admin, } /// JWT payload. 
See docs/authentication.md for the format @@ -127,6 +131,10 @@ impl JwtAuth { Ok(Self::new(decoding_keys)) } + pub fn from_key(key: String) -> Result { + Ok(Self::new(vec![DecodingKey::from_ed_pem(key.as_bytes())?])) + } + /// Attempt to decode the token with the internal decoding keys. /// /// The function tries the stored decoding keys in succession, @@ -197,12 +205,11 @@ MC4CAQAwBQYDK2VwBCIEID/Drmc1AA6U/znNRWpF3zEGegOATQxfkdWxitcOMsIH // "scope": "tenant", // "tenant_id": "3d1f7595b468230304e0b73cecbcb081", // "iss": "neon.controlplane", - // "exp": 1709200879, // "iat": 1678442479 // } // ``` // - let encoded_eddsa = "eyJhbGciOiJFZERTQSIsInR5cCI6IkpXVCJ9.eyJzY29wZSI6InRlbmFudCIsInRlbmFudF9pZCI6IjNkMWY3NTk1YjQ2ODIzMDMwNGUwYjczY2VjYmNiMDgxIiwiaXNzIjoibmVvbi5jb250cm9scGxhbmUiLCJleHAiOjE3MDkyMDA4NzksImlhdCI6MTY3ODQ0MjQ3OX0.U3eA8j-uU-JnhzeO3EDHRuXLwkAUFCPxtGHEgw6p7Ccc3YRbFs2tmCdbD9PZEXP-XsxSeBQi1FY0YPcT3NXADw"; + let encoded_eddsa = "eyJhbGciOiJFZERTQSIsInR5cCI6IkpXVCJ9.eyJzY29wZSI6InRlbmFudCIsInRlbmFudF9pZCI6IjNkMWY3NTk1YjQ2ODIzMDMwNGUwYjczY2VjYmNiMDgxIiwiaXNzIjoibmVvbi5jb250cm9scGxhbmUiLCJpYXQiOjE2Nzg0NDI0Nzl9.rNheBnluMJNgXzSTTJoTNIGy4P_qe0JUHl_nVEGuDCTgHOThPVr552EnmKccrCKquPeW3c2YUk0Y9Oh4KyASAw"; // Check it can be validated with the public key let auth = JwtAuth::new(vec![DecodingKey::from_ed_pem(TEST_PUB_KEY_ED25519).unwrap()]); diff --git a/libs/utils/src/backoff.rs b/libs/utils/src/backoff.rs index d50ad39585..096c7e5854 100644 --- a/libs/utils/src/backoff.rs +++ b/libs/utils/src/backoff.rs @@ -37,69 +37,53 @@ pub fn exponential_backoff_duration_seconds(n: u32, base_increment: f64, max_sec } } -/// Configure cancellation for a retried operation: when to cancel (the token), and -/// what kind of error to return on cancellation -pub struct Cancel -where - E: Display + Debug + 'static, - CF: Fn() -> E, -{ - token: CancellationToken, - on_cancel: CF, -} - -impl Cancel -where - E: Display + Debug + 'static, - CF: Fn() -> E, -{ - pub fn new(token: 
CancellationToken, on_cancel: CF) -> Self { - Self { token, on_cancel } - } -} - -/// retries passed operation until one of the following conditions are met: -/// Encountered error is considered as permanent (non-retryable) -/// Retries have been exhausted. -/// `is_permanent` closure should be used to provide distinction between permanent/non-permanent errors -/// When attempts cross `warn_threshold` function starts to emit log warnings. +/// Retries passed operation until one of the following conditions are met: +/// - encountered error is considered as permanent (non-retryable) +/// - retries have been exhausted +/// - cancellation token has been cancelled +/// +/// `is_permanent` closure should be used to provide distinction between permanent/non-permanent +/// errors. When attempts cross `warn_threshold` function starts to emit log warnings. /// `description` argument is added to log messages. Its value should identify the `op` is doing -/// `cancel` argument is required: any time we are looping on retry, we should be using a CancellationToken -/// to drop out promptly on shutdown. -pub async fn retry( +/// `cancel` cancels new attempts and the backoff sleep. +/// +/// If attempts fail, they are being logged with `{:#}` which works for anyhow, but does not work +/// for any other error type. Final failed attempt is logged with `{:?}`. +/// +/// Returns `None` if cancellation was noticed during backoff or the terminal result. +pub async fn retry( mut op: O, is_permanent: impl Fn(&E) -> bool, warn_threshold: u32, max_retries: u32, description: &str, - cancel: Cancel, -) -> Result + cancel: &CancellationToken, +) -> Option> where // Not std::error::Error because anyhow::Error doesnt implement it. 
// For context see https://github.com/dtolnay/anyhow/issues/63 E: Display + Debug + 'static, O: FnMut() -> F, F: Future>, - CF: Fn() -> E, { let mut attempts = 0; loop { - if cancel.token.is_cancelled() { - return Err((cancel.on_cancel)()); + if cancel.is_cancelled() { + return None; } let result = op().await; - match result { + match &result { Ok(_) => { if attempts > 0 { tracing::info!("{description} succeeded after {attempts} retries"); } - return result; + return Some(result); } // These are "permanent" errors that should not be retried. - Err(ref e) if is_permanent(e) => { - return result; + Err(e) if is_permanent(e) => { + return Some(result); } // Assume that any other failure might be transient, and the operation might // succeed if we just keep trying. @@ -109,12 +93,12 @@ where Err(err) if attempts < max_retries => { tracing::warn!("{description} failed, will retry (attempt {attempts}): {err:#}"); } - Err(ref err) => { + Err(err) => { // Operation failed `max_attempts` times. Time to give up. 
tracing::warn!( "{description} still failed after {attempts} retries, giving up: {err:?}" ); - return result; + return Some(result); } } // sleep and retry @@ -122,7 +106,7 @@ where attempts, DEFAULT_BASE_BACKOFF_SECONDS, DEFAULT_MAX_BACKOFF_SECONDS, - &cancel.token, + cancel, ) .await; attempts += 1; @@ -131,11 +115,9 @@ where #[cfg(test)] mod tests { - use std::io; - - use tokio::sync::Mutex; - use super::*; + use std::io; + use tokio::sync::Mutex; #[test] fn backoff_defaults_produce_growing_backoff_sequence() { @@ -166,7 +148,7 @@ mod tests { #[tokio::test(start_paused = true)] async fn retry_always_error() { let count = Mutex::new(0); - let err_result = retry( + retry( || async { *count.lock().await += 1; Result::<(), io::Error>::Err(io::Error::from(io::ErrorKind::Other)) @@ -175,11 +157,11 @@ mod tests { 1, 1, "work", - Cancel::new(CancellationToken::new(), || -> io::Error { unreachable!() }), + &CancellationToken::new(), ) - .await; - - assert!(err_result.is_err()); + .await + .expect("not cancelled") + .expect_err("it can only fail"); assert_eq!(*count.lock().await, 2); } @@ -201,10 +183,11 @@ mod tests { 2, 2, "work", - Cancel::new(CancellationToken::new(), || -> io::Error { unreachable!() }), + &CancellationToken::new(), ) .await - .unwrap(); + .expect("not cancelled") + .expect("success on second try"); } #[tokio::test(start_paused = true)] @@ -224,10 +207,11 @@ mod tests { 2, 2, "work", - Cancel::new(CancellationToken::new(), || -> io::Error { unreachable!() }), + &CancellationToken::new(), ) .await - .unwrap_err(); + .expect("was not cancellation") + .expect_err("it was permanent error"); assert_eq!(*count.lock().await, 1); } diff --git a/libs/utils/src/completion.rs b/libs/utils/src/completion.rs index ca6827c9b8..2fef8d35df 100644 --- a/libs/utils/src/completion.rs +++ b/libs/utils/src/completion.rs @@ -4,7 +4,9 @@ use tokio_util::task::{task_tracker::TaskTrackerToken, TaskTracker}; /// /// Can be cloned, moved and kept around in futures as "guard 
objects". #[derive(Clone)] -pub struct Completion(TaskTrackerToken); +pub struct Completion { + _token: TaskTrackerToken, +} /// Barrier will wait until all clones of [`Completion`] have been dropped. #[derive(Clone)] @@ -27,6 +29,11 @@ impl Barrier { b.wait().await } } + + /// Return true if a call to wait() would complete immediately + pub fn is_ready(&self) -> bool { + futures::future::FutureExt::now_or_never(self.0.wait()).is_some() + } } impl PartialEq for Barrier { @@ -44,5 +51,5 @@ pub fn channel() -> (Completion, Barrier) { tracker.close(); let token = tracker.token(); - (Completion(token), Barrier(tracker)) + (Completion { _token: token }, Barrier(tracker)) } diff --git a/libs/utils/src/crashsafe.rs b/libs/utils/src/crashsafe.rs index 0c6855d17b..756b19138c 100644 --- a/libs/utils/src/crashsafe.rs +++ b/libs/utils/src/crashsafe.rs @@ -112,14 +112,66 @@ pub async fn fsync_async(path: impl AsRef) -> Result<(), std::io::Erro tokio::fs::File::open(path.as_ref()).await?.sync_all().await } -/// Writes a file to the specified `final_path` in a crash safe fasion +pub async fn fsync_async_opt( + path: impl AsRef, + do_fsync: bool, +) -> Result<(), std::io::Error> { + if do_fsync { + fsync_async(path.as_ref()).await?; + } + Ok(()) +} + +/// Like postgres' durable_rename, renames file issuing fsyncs do make it +/// durable. After return, file and rename are guaranteed to be persisted. /// -/// The file is first written to the specified tmp_path, and in a second -/// step, the tmp path is renamed to the final path. As renames are -/// atomic, a crash during the write operation will never leave behind a -/// partially written file. +/// Unlike postgres, it only does fsyncs to 1) file to be renamed to make +/// contents durable; 2) its directory entry to make rename durable 3) again to +/// already renamed file, which is not required by standards but postgres does +/// it, let's stick to that. 
Postgres additionally fsyncs newpath *before* +/// rename if it exists to ensure that at least one of the files survives, but +/// current callers don't need that. /// -/// NB: an async variant of this code exists in Pageserver's VirtualFile. +/// virtual_file.rs has similar code, but it doesn't use vfs. +/// +/// Useful links: +/// +/// +pub async fn durable_rename( + old_path: impl AsRef, + new_path: impl AsRef, + do_fsync: bool, +) -> io::Result<()> { + // first fsync the file + fsync_async_opt(old_path.as_ref(), do_fsync).await?; + + // Time to do the real deal. + tokio::fs::rename(old_path.as_ref(), new_path.as_ref()).await?; + + // Postgres'ish fsync of renamed file. + fsync_async_opt(new_path.as_ref(), do_fsync).await?; + + // Now fsync the parent + let parent = match new_path.as_ref().parent() { + Some(p) => p, + None => Utf8Path::new("./"), // assume current dir if there is no parent + }; + fsync_async_opt(parent, do_fsync).await?; + + Ok(()) +} + +/// Writes a file to the specified `final_path` in a crash safe fasion, using [`std::fs`]. +/// +/// The file is first written to the specified `tmp_path`, and in a second +/// step, the `tmp_path` is renamed to the `final_path`. Intermediary fsync +/// and atomic rename guarantee that, if we crash at any point, there will never +/// be a partially written file at `final_path` (but maybe at `tmp_path`). +/// +/// Callers are responsible for serializing calls of this function for a given `final_path`. +/// If they don't, there may be an error due to conflicting `tmp_path`, or there will +/// be no error and the content of `final_path` will be the "winner" caller's `content`. +/// I.e., the atomticity guarantees still hold. pub fn overwrite( final_path: &Utf8Path, tmp_path: &Utf8Path, @@ -139,17 +191,14 @@ pub fn overwrite( .open(tmp_path)?; file.write_all(content)?; file.sync_all()?; - drop(file); // before the rename, that's important! 
- // renames are atomic + drop(file); // don't keep the fd open for longer than we have to + std::fs::rename(tmp_path, final_path)?; - // Only open final path parent dirfd now, so that this operation only - // ever holds one VirtualFile fd at a time. That's important because - // the current `find_victim_slot` impl might pick the same slot for both - // VirtualFile., and it eventually does a blocking write lock instead of - // try_lock. + let final_parent_dirfd = std::fs::OpenOptions::new() .read(true) .open(final_path_parent)?; + final_parent_dirfd.sync_all()?; Ok(()) } diff --git a/libs/utils/src/generation.rs b/libs/utils/src/generation.rs index 46eadee1da..af15cee924 100644 --- a/libs/utils/src/generation.rs +++ b/libs/utils/src/generation.rs @@ -45,7 +45,7 @@ impl Generation { Self::Broken } - pub fn new(v: u32) -> Self { + pub const fn new(v: u32) -> Self { Self::Valid(v) } @@ -54,12 +54,10 @@ impl Generation { } #[track_caller] - pub fn get_suffix(&self) -> String { + pub fn get_suffix(&self) -> impl std::fmt::Display { match self { - Self::Valid(v) => { - format!("-{:08x}", v) - } - Self::None => "".into(), + Self::Valid(v) => GenerationFileSuffix(Some(*v)), + Self::None => GenerationFileSuffix(None), Self::Broken => { panic!("Tried to use a broken generation"); } @@ -90,6 +88,7 @@ impl Generation { } } + #[track_caller] pub fn next(&self) -> Generation { match self { Self::Valid(n) => Self::Valid(*n + 1), @@ -107,6 +106,18 @@ impl Generation { } } +struct GenerationFileSuffix(Option); + +impl std::fmt::Display for GenerationFileSuffix { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + if let Some(g) = self.0 { + write!(f, "-{g:08x}") + } else { + Ok(()) + } + } +} + impl Serialize for Generation { fn serialize(&self, serializer: S) -> Result where @@ -164,4 +175,24 @@ mod test { assert!(Generation::none() < Generation::new(0)); assert!(Generation::none() < Generation::new(1)); } + + #[test] + fn suffix_is_stable() { + use 
std::fmt::Write as _; + + // the suffix must remain stable through-out the pageserver remote storage evolution and + // not be changed accidentially without thinking about migration + let examples = [ + (line!(), Generation::None, ""), + (line!(), Generation::Valid(0), "-00000000"), + (line!(), Generation::Valid(u32::MAX), "-ffffffff"), + ]; + + let mut s = String::new(); + for (line, gen, expected) in examples { + s.clear(); + write!(s, "{}", &gen.get_suffix()).expect("string grows"); + assert_eq!(s, expected, "example on {line}"); + } + } } diff --git a/libs/utils/src/http/endpoint.rs b/libs/utils/src/http/endpoint.rs index 550ab10700..a60971abf0 100644 --- a/libs/utils/src/http/endpoint.rs +++ b/libs/utils/src/http/endpoint.rs @@ -9,7 +9,7 @@ use metrics::{register_int_counter, Encoder, IntCounter, TextEncoder}; use once_cell::sync::Lazy; use routerify::ext::RequestExt; use routerify::{Middleware, RequestInfo, Router, RouterBuilder}; -use tracing::{self, debug, info, info_span, warn, Instrument}; +use tracing::{debug, info, info_span, warn, Instrument}; use std::future::Future; use std::str::FromStr; @@ -156,6 +156,10 @@ pub struct ChannelWriter { buffer: BytesMut, pub tx: mpsc::Sender>, written: usize, + /// Time spent waiting for the channel to make progress. It is not the same as time to upload a + /// buffer because we cannot know anything about that, but this should allow us to understand + /// the actual time taken without the time spent `std::thread::park`ed. 
+ wait_time: std::time::Duration, } impl ChannelWriter { @@ -168,6 +172,7 @@ impl ChannelWriter { buffer: BytesMut::with_capacity(buf_len).split_off(buf_len / 2), tx, written: 0, + wait_time: std::time::Duration::ZERO, } } @@ -180,6 +185,8 @@ impl ChannelWriter { tracing::trace!(n, "flushing"); let ready = self.buffer.split().freeze(); + let wait_started_at = std::time::Instant::now(); + // not ideal to call from blocking code to block_on, but we are sure that this // operation does not spawn_blocking other tasks let res: Result<(), ()> = tokio::runtime::Handle::current().block_on(async { @@ -192,6 +199,9 @@ impl ChannelWriter { // sending it to the client. Ok(()) }); + + self.wait_time += wait_started_at.elapsed(); + if res.is_err() { return Err(std::io::ErrorKind::BrokenPipe.into()); } @@ -202,6 +212,10 @@ impl ChannelWriter { pub fn flushed_bytes(&self) -> usize { self.written } + + pub fn wait_time(&self) -> std::time::Duration { + self.wait_time + } } impl std::io::Write for ChannelWriter { @@ -252,22 +266,52 @@ async fn prometheus_metrics_handler(_req: Request) -> Result { tracing::info!( bytes = writer.flushed_bytes(), - elapsed_ms = started_at.elapsed().as_millis(), + total_ms = total.as_millis(), + spawning_ms = spawned_in.as_millis(), + collection_ms = collected_in.as_millis(), + encoding_ms = encoded_in.as_millis(), "responded /metrics" ); } Err(e) => { - tracing::warn!("failed to write out /metrics response: {e:#}"); + // there is a chance that this error is not the BrokenPipe we generate in the writer + // for "closed connection", but it is highly unlikely. + tracing::warn!( + after_bytes = writer.flushed_bytes(), + total_ms = total.as_millis(), + spawning_ms = spawned_in.as_millis(), + collection_ms = collected_in.as_millis(), + encoding_ms = encoded_in.as_millis(), + "failed to write out /metrics response: {e:?}" + ); // semantics of this error are quite... unclear. 
we want to error the stream out to // abort the response to somehow notify the client that we failed. // diff --git a/libs/utils/src/lsn.rs b/libs/utils/src/lsn.rs index b3269ae049..1aebe91428 100644 --- a/libs/utils/src/lsn.rs +++ b/libs/utils/src/lsn.rs @@ -415,7 +415,6 @@ mod tests { use super::*; - use serde::ser::Serialize; use serde_assert::{Deserializer, Serializer, Token, Tokens}; #[test] diff --git a/libs/utils/src/pageserver_feedback.rs b/libs/utils/src/pageserver_feedback.rs index c9fbdde928..bc8fa7362e 100644 --- a/libs/utils/src/pageserver_feedback.rs +++ b/libs/utils/src/pageserver_feedback.rs @@ -123,6 +123,12 @@ impl PageserverFeedback { rf.replytime = *PG_EPOCH - Duration::from_micros(-raw_time as u64); } } + b"shard_number" => { + let len = buf.get_i32(); + // TODO: this will be implemented in the next update, + // for now, we just skip the value. + buf.advance(len as usize); + } _ => { let len = buf.get_i32(); warn!( diff --git a/libs/utils/src/seqwait.rs b/libs/utils/src/seqwait.rs index effc9c67b5..b7301776eb 100644 --- a/libs/utils/src/seqwait.rs +++ b/libs/utils/src/seqwait.rs @@ -1,6 +1,6 @@ #![warn(missing_docs)] -use std::cmp::{Eq, Ordering, PartialOrd}; +use std::cmp::{Eq, Ordering}; use std::collections::BinaryHeap; use std::fmt::Debug; use std::mem; @@ -249,7 +249,6 @@ where mod tests { use super::*; use std::sync::Arc; - use std::time::Duration; impl MonotonicCounter for i32 { fn cnt_advance(&mut self, val: i32) { diff --git a/libs/utils/src/simple_rcu.rs b/libs/utils/src/simple_rcu.rs index dc4a599111..ecc5353be3 100644 --- a/libs/utils/src/simple_rcu.rs +++ b/libs/utils/src/simple_rcu.rs @@ -221,7 +221,7 @@ impl RcuWaitList { #[cfg(test)] mod tests { use super::*; - use std::sync::{Arc, Mutex}; + use std::sync::Mutex; use std::time::Duration; #[tokio::test] diff --git a/libs/utils/src/sync/gate.rs b/libs/utils/src/sync/gate.rs index abc3842da8..c34176af57 100644 --- a/libs/utils/src/sync/gate.rs +++ b/libs/utils/src/sync/gate.rs @@ 
-1,4 +1,10 @@ -use std::{sync::Arc, time::Duration}; +use std::{ + sync::{ + atomic::{AtomicBool, Ordering}, + Arc, + }, + time::Duration, +}; /// Gates are a concurrency helper, primarily used for implementing safe shutdown. /// @@ -6,62 +12,70 @@ use std::{sync::Arc, time::Duration}; /// the resource calls `close()` when they want to ensure that all holders of guards /// have released them, and that no future guards will be issued. pub struct Gate { - /// Each caller of enter() takes one unit from the semaphore. In close(), we - /// take all the units to ensure all GateGuards are destroyed. - sem: Arc, - - /// For observability only: a name that will be used to log warnings if a particular - /// gate is holding up shutdown - name: String, + inner: Arc, } impl std::fmt::Debug for Gate { fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { - write!(f, "Gate<{}>", self.name) + f.debug_struct("Gate") + // use this for identification + .field("ptr", &Arc::as_ptr(&self.inner)) + .field("inner", &self.inner) + .finish() + } +} + +struct GateInner { + sem: tokio::sync::Semaphore, + closing: std::sync::atomic::AtomicBool, +} + +impl std::fmt::Debug for GateInner { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + let avail = self.sem.available_permits(); + + let guards = u32::try_from(avail) + .ok() + // the sem only supports 32-bit ish amount, but lets play it safe + .and_then(|x| Gate::MAX_UNITS.checked_sub(x)); + + let closing = self.closing.load(Ordering::Relaxed); + + if let Some(guards) = guards { + f.debug_struct("Gate") + .field("remaining_guards", &guards) + .field("closing", &closing) + .finish() + } else { + f.debug_struct("Gate") + .field("avail_permits", &avail) + .field("closing", &closing) + .finish() + } } } /// RAII guard for a [`Gate`]: as long as this exists, calls to [`Gate::close`] will /// not complete. 
#[derive(Debug)] -pub struct GateGuard(tokio::sync::OwnedSemaphorePermit); +pub struct GateGuard { + // Record the span where the gate was entered, so that we can identify who was blocking Gate::close + span_at_enter: tracing::Span, + gate: Arc, +} -/// Observability helper: every `warn_period`, emit a log warning that we're still waiting on this gate -async fn warn_if_stuck( - fut: Fut, - name: &str, - warn_period: std::time::Duration, -) -> ::Output { - let started = std::time::Instant::now(); - - let mut fut = std::pin::pin!(fut); - - let mut warned = false; - let ret = loop { - match tokio::time::timeout(warn_period, &mut fut).await { - Ok(ret) => break ret, - Err(_) => { - tracing::warn!( - gate = name, - elapsed_ms = started.elapsed().as_millis(), - "still waiting, taking longer than expected..." - ); - warned = true; - } +impl Drop for GateGuard { + fn drop(&mut self) { + if self.gate.closing.load(Ordering::Relaxed) { + self.span_at_enter.in_scope( + || tracing::info!(gate = ?Arc::as_ptr(&self.gate), "kept the gate from closing"), + ); } - }; - // If we emitted a warning for slowness, also emit a message when we complete, so that - // someone debugging a shutdown can know for sure whether we have moved past this operation. - if warned { - tracing::info!( - gate = name, - elapsed_ms = started.elapsed().as_millis(), - "completed, after taking longer than expected" - ) + // when the permit was acquired, it was forgotten to allow us to manage it's lifecycle + // manually, so "return" the permit now. 
+ self.gate.sem.add_permits(1); } - - ret } #[derive(Debug)] @@ -69,15 +83,19 @@ pub enum GateError { GateClosed, } -impl Gate { - const MAX_UNITS: u32 = u32::MAX; - - pub fn new(name: String) -> Self { +impl Default for Gate { + fn default() -> Self { Self { - sem: Arc::new(tokio::sync::Semaphore::new(Self::MAX_UNITS as usize)), - name, + inner: Arc::new(GateInner { + sem: tokio::sync::Semaphore::new(Self::MAX_UNITS as usize), + closing: AtomicBool::new(false), + }), } } +} + +impl Gate { + const MAX_UNITS: u32 = u32::MAX; /// Acquire a guard that will prevent close() calls from completing. If close() /// was already called, this will return an error which should be interpreted @@ -88,11 +106,23 @@ impl Gate { /// to avoid blocking close() indefinitely: typically types that contain a Gate will /// also contain a CancellationToken. pub fn enter(&self) -> Result { - self.sem - .clone() - .try_acquire_owned() - .map(GateGuard) - .map_err(|_| GateError::GateClosed) + let permit = self + .inner + .sem + .try_acquire() + .map_err(|_| GateError::GateClosed)?; + + // we now have the permit, let's disable the normal raii functionality and leave + // "returning" the permit to our GateGuard::drop. + // + // this is done to avoid the need for multiple Arcs (one for semaphore, next for other + // fields). + permit.forget(); + + Ok(GateGuard { + span_at_enter: tracing::Span::current(), + gate: self.inner.clone(), + }) } /// Types with a shutdown() method and a gate should call this method at the @@ -102,48 +132,88 @@ impl Gate { /// important that the holders of such guards are respecting a CancellationToken which has /// been cancelled before entering this function. 
pub async fn close(&self) { - warn_if_stuck(self.do_close(), &self.name, Duration::from_millis(1000)).await + let started_at = std::time::Instant::now(); + let mut do_close = std::pin::pin!(self.do_close()); + + let nag_after = Duration::from_secs(1); + + let Err(_timeout) = tokio::time::timeout(nag_after, &mut do_close).await else { + return; + }; + + tracing::info!( + gate = ?self.as_ptr(), + elapsed_ms = started_at.elapsed().as_millis(), + "closing is taking longer than expected" + ); + + // close operation is not trying to be cancellation safe as pageserver does not need it. + // + // note: "closing" is not checked in Gate::enter -- it exists just for observability, + // dropping of GateGuard after this will log who they were. + self.inner.closing.store(true, Ordering::Relaxed); + + do_close.await; + + tracing::info!( + gate = ?self.as_ptr(), + elapsed_ms = started_at.elapsed().as_millis(), + "close completed" + ); + } + + /// Used as an identity of a gate. This identity will be resolved to something useful when + /// it's actually closed in a hopefully sensible `tracing::Span` which will describe it even + /// more. + /// + /// `GateGuard::drop` also logs this pointer when it has realized it has been keeping the gate + /// open for too long. + fn as_ptr(&self) -> *const GateInner { + Arc::as_ptr(&self.inner) } /// Check if [`Self::close()`] has finished waiting for all [`Self::enter()`] users to finish. This /// is usually analoguous for "Did shutdown finish?" for types that include a Gate, whereas checking /// the CancellationToken on such types is analogous to "Did shutdown start?" 
pub fn close_complete(&self) -> bool { - self.sem.is_closed() + self.inner.sem.is_closed() } + #[tracing::instrument(level = tracing::Level::DEBUG, skip_all, fields(gate = ?self.as_ptr()))] async fn do_close(&self) { - tracing::debug!(gate = self.name, "Closing Gate..."); - match self.sem.acquire_many(Self::MAX_UNITS).await { - Ok(_units) => { + tracing::debug!("Closing Gate..."); + + match self.inner.sem.acquire_many(Self::MAX_UNITS).await { + Ok(_permit) => { // While holding all units, close the semaphore. All subsequent calls to enter() will fail. - self.sem.close(); + self.inner.sem.close(); } - Err(_) => { + Err(_closed) => { // Semaphore closed: we are the only function that can do this, so it indicates a double-call. // This is legal. Timeline::shutdown for example is not protected from being called more than // once. - tracing::debug!(gate = self.name, "Double close") + tracing::debug!("Double close") } } - tracing::debug!(gate = self.name, "Closed Gate.") + tracing::debug!("Closed Gate.") } } #[cfg(test)] mod tests { - use futures::FutureExt; - use super::*; #[tokio::test] - async fn test_idle_gate() { - // Having taken no gates, we should not be blocked in close - let gate = Gate::new("test".to_string()); + async fn close_unused() { + // Having taken no guards, we should not be blocked in close + let gate = Gate::default(); gate.close().await; + } + #[tokio::test] + async fn close_idle() { // If a guard is dropped before entering, close should not be blocked - let gate = Gate::new("test".to_string()); + let gate = Gate::default(); let guard = gate.enter().unwrap(); drop(guard); gate.close().await; @@ -152,25 +222,30 @@ mod tests { gate.enter().expect_err("enter should fail after close"); } - #[tokio::test] - async fn test_busy_gate() { - let gate = Gate::new("test".to_string()); + #[tokio::test(start_paused = true)] + async fn close_busy_gate() { + let gate = Gate::default(); + let forever = Duration::from_secs(24 * 7 * 365); - let guard = 
gate.enter().unwrap(); + let guard = + tracing::info_span!("i am holding back the gate").in_scope(|| gate.enter().unwrap()); let mut close_fut = std::pin::pin!(gate.close()); - // Close should be blocked - assert!(close_fut.as_mut().now_or_never().is_none()); + // Close should be waiting for guards to drop + tokio::time::timeout(forever, &mut close_fut) + .await + .unwrap_err(); // Attempting to enter() should fail, even though close isn't done yet. gate.enter() .expect_err("enter should fail after entering close"); + // this will now log, which we cannot verify except manually drop(guard); // Guard is gone, close should finish - assert!(close_fut.as_mut().now_or_never().is_some()); + close_fut.await; // Attempting to enter() is still forbidden gate.enter().expect_err("enter should fail finishing close"); diff --git a/libs/utils/src/sync/heavier_once_cell.rs b/libs/utils/src/sync/heavier_once_cell.rs index 0ccaf4e716..703a6dfd52 100644 --- a/libs/utils/src/sync/heavier_once_cell.rs +++ b/libs/utils/src/sync/heavier_once_cell.rs @@ -69,37 +69,44 @@ impl OnceCell { F: FnOnce(InitPermit) -> Fut, Fut: std::future::Future>, { - let sem = { + loop { + let sem = { + let guard = self.inner.lock().unwrap(); + if guard.value.is_some() { + return Ok(Guard(guard)); + } + guard.init_semaphore.clone() + }; + + { + let permit = { + // increment the count for the duration of queued + let _guard = CountWaitingInitializers::start(self); + sem.acquire().await + }; + + let Ok(permit) = permit else { + let guard = self.inner.lock().unwrap(); + if !Arc::ptr_eq(&sem, &guard.init_semaphore) { + // there was a take_and_deinit in between + continue; + } + assert!( + guard.value.is_some(), + "semaphore got closed, must be initialized" + ); + return Ok(Guard(guard)); + }; + + permit.forget(); + } + + let permit = InitPermit(sem); + let (value, _permit) = factory(permit).await?; + let guard = self.inner.lock().unwrap(); - if guard.value.is_some() { - return Ok(Guard(guard)); - } - 
guard.init_semaphore.clone() - }; - let permit = { - // increment the count for the duration of queued - let _guard = CountWaitingInitializers::start(self); - sem.acquire_owned().await - }; - - match permit { - Ok(permit) => { - let permit = InitPermit(permit); - let (value, _permit) = factory(permit).await?; - - let guard = self.inner.lock().unwrap(); - - Ok(Self::set0(value, guard)) - } - Err(_closed) => { - let guard = self.inner.lock().unwrap(); - assert!( - guard.value.is_some(), - "semaphore got closed, must be initialized" - ); - return Ok(Guard(guard)); - } + return Ok(Self::set0(value, guard)); } } @@ -197,28 +204,41 @@ impl<'a, T> Guard<'a, T> { /// [`OnceCell::get_or_init`] will wait on it to complete. pub fn take_and_deinit(&mut self) -> (T, InitPermit) { let mut swapped = Inner::default(); - let permit = swapped - .init_semaphore - .clone() - .try_acquire_owned() - .expect("we just created this"); + let sem = swapped.init_semaphore.clone(); + // acquire and forget right away, moving the control over to InitPermit + sem.try_acquire().expect("we just created this").forget(); std::mem::swap(&mut *self.0, &mut swapped); swapped .value - .map(|v| (v, InitPermit(permit))) + .map(|v| (v, InitPermit(sem))) .expect("guard is not created unless value has been initialized") } } /// Type held by OnceCell (de)initializing task. -pub struct InitPermit(tokio::sync::OwnedSemaphorePermit); +/// +/// On drop, this type will return the permit. 
+pub struct InitPermit(Arc); + +impl Drop for InitPermit { + fn drop(&mut self) { + assert_eq!( + self.0.available_permits(), + 0, + "InitPermit should only exist as the unique permit" + ); + self.0.add_permits(1); + } +} #[cfg(test)] mod tests { + use futures::Future; + use super::*; use std::{ convert::Infallible, - sync::atomic::{AtomicUsize, Ordering}, + pin::{pin, Pin}, time::Duration, }; @@ -380,4 +400,85 @@ mod tests { .unwrap(); assert_eq!(*g, "now initialized"); } + + #[tokio::test(start_paused = true)] + async fn reproduce_init_take_deinit_race() { + init_take_deinit_scenario(|cell, factory| { + Box::pin(async { + cell.get_or_init(factory).await.unwrap(); + }) + }) + .await; + } + + type BoxedInitFuture = Pin>>>; + type BoxedInitFunction = Box BoxedInitFuture>; + + /// Reproduce an assertion failure. + /// + /// This has interesting generics to be generic between `get_or_init` and `get_mut_or_init`. + /// We currently only have one, but the structure is kept. + async fn init_take_deinit_scenario(init_way: F) + where + F: for<'a> Fn( + &'a OnceCell<&'static str>, + BoxedInitFunction<&'static str, Infallible>, + ) -> Pin + 'a>>, + { + let cell = OnceCell::default(); + + // acquire the init_semaphore only permit to drive initializing tasks in order to waiting + // on the same semaphore. + let permit = cell + .inner + .lock() + .unwrap() + .init_semaphore + .clone() + .try_acquire_owned() + .unwrap(); + + let mut t1 = pin!(init_way( + &cell, + Box::new(|permit| Box::pin(async move { Ok(("t1", permit)) })), + )); + + let mut t2 = pin!(init_way( + &cell, + Box::new(|permit| Box::pin(async move { Ok(("t2", permit)) })), + )); + + // drive t2 first to the init_semaphore -- the timeout will be hit once t2 future can + // no longer make progress + tokio::select! { + _ = &mut t2 => unreachable!("it cannot get permit"), + _ = tokio::time::sleep(Duration::from_secs(3600 * 24 * 7 * 365)) => {} + } + + // followed by t1 in the init_semaphore + tokio::select! 
{ + _ = &mut t1 => unreachable!("it cannot get permit"), + _ = tokio::time::sleep(Duration::from_secs(3600 * 24 * 7 * 365)) => {} + } + + // now let t2 proceed and initialize + drop(permit); + t2.await; + + let (s, permit) = { cell.get().unwrap().take_and_deinit() }; + assert_eq!("t2", s); + + // now originally t1 would see the semaphore it has as closed. it cannot yet get a permit from + // the new one. + tokio::select! { + _ = &mut t1 => unreachable!("it cannot get permit"), + _ = tokio::time::sleep(Duration::from_secs(3600 * 24 * 7 * 365)) => {} + } + + // only now we get to initialize it + drop(permit); + t1.await; + + assert_eq!("t1", *cell.get().unwrap()); + } } diff --git a/libs/utils/src/tracing_span_assert.rs b/libs/utils/src/tracing_span_assert.rs index db17f7d8cd..d24c81ad0b 100644 --- a/libs/utils/src/tracing_span_assert.rs +++ b/libs/utils/src/tracing_span_assert.rs @@ -20,13 +20,13 @@ //! //! // Then, in the main code: //! -//! let span = tracing::info_span!("TestSpan", test_id = 1); +//! let span = tracing::info_span!("TestSpan", tenant_id = 1); //! let _guard = span.enter(); //! //! // ... down the call stack //! -//! use utils::tracing_span_assert::{check_fields_present, MultiNameExtractor}; -//! let extractor = MultiNameExtractor::new("TestExtractor", ["test", "test_id"]); +//! use utils::tracing_span_assert::{check_fields_present, ConstExtractor}; +//! let extractor = ConstExtractor::new("tenant_id"); //! if let Err(missing) = check_fields_present!([&extractor]) { //! // if you copypaste this to a custom assert method, remember to add #[track_caller] //! // to get the "user" code location for the panic. 
@@ -45,27 +45,26 @@ pub enum ExtractionResult { } pub trait Extractor: Send + Sync + std::fmt::Debug { - fn name(&self) -> &str; + fn id(&self) -> &str; fn extract(&self, fields: &tracing::field::FieldSet) -> ExtractionResult; } #[derive(Debug)] -pub struct MultiNameExtractor { - name: &'static str, - field_names: [&'static str; L], +pub struct ConstExtractor { + field_name: &'static str, } -impl MultiNameExtractor { - pub fn new(name: &'static str, field_names: [&'static str; L]) -> MultiNameExtractor { - MultiNameExtractor { name, field_names } +impl ConstExtractor { + pub const fn new(field_name: &'static str) -> ConstExtractor { + ConstExtractor { field_name } } } -impl Extractor for MultiNameExtractor { - fn name(&self) -> &str { - self.name +impl Extractor for ConstExtractor { + fn id(&self) -> &str { + self.field_name } fn extract(&self, fields: &tracing::field::FieldSet) -> ExtractionResult { - if fields.iter().any(|f| self.field_names.contains(&f.name())) { + if fields.iter().any(|f| f.name() == self.field_name) { ExtractionResult::Present } else { ExtractionResult::Absent @@ -203,19 +202,19 @@ mod tests { } impl<'a> fmt::Debug for MemoryIdentity<'a> { fn fmt(&self, f: &mut fmt::Formatter<'_>) -> std::fmt::Result { - write!(f, "{:p}: {}", self.as_ptr(), self.0.name()) + write!(f, "{:p}: {}", self.as_ptr(), self.0.id()) } } struct Setup { _current_thread_subscriber_guard: tracing::subscriber::DefaultGuard, - tenant_extractor: MultiNameExtractor<2>, - timeline_extractor: MultiNameExtractor<2>, + tenant_extractor: ConstExtractor, + timeline_extractor: ConstExtractor, } fn setup_current_thread() -> Setup { - let tenant_extractor = MultiNameExtractor::new("TenantId", ["tenant_id", "tenant"]); - let timeline_extractor = MultiNameExtractor::new("TimelineId", ["timeline_id", "timeline"]); + let tenant_extractor = ConstExtractor::new("tenant_id"); + let timeline_extractor = ConstExtractor::new("timeline_id"); let registry = tracing_subscriber::registry() 
.with(tracing_subscriber::fmt::layer()) @@ -343,12 +342,12 @@ mod tests { let span = tracing::info_span!("foo", e = "some value"); let _guard = span.enter(); - let extractor = MultiNameExtractor::new("E", ["e"]); + let extractor = ConstExtractor::new("e"); let res = check_fields_present0([&extractor]); assert!(matches!(res, Ok(Summary::Unconfigured)), "{res:?}"); // similarly for a not found key - let extractor = MultiNameExtractor::new("F", ["foobar"]); + let extractor = ConstExtractor::new("foobar"); let res = check_fields_present0([&extractor]); assert!(matches!(res, Ok(Summary::Unconfigured)), "{res:?}"); } @@ -368,16 +367,14 @@ mod tests { // normally this would work, but without any tracing-subscriber configured, both // check_field_present find nothing let _guard = subspan.enter(); - let extractors: [&dyn Extractor; 2] = [ - &MultiNameExtractor::new("E", ["e"]), - &MultiNameExtractor::new("F", ["f"]), - ]; + let extractors: [&dyn Extractor; 2] = + [&ConstExtractor::new("e"), &ConstExtractor::new("f")]; let res = check_fields_present0(extractors); assert!(matches!(res, Ok(Summary::Unconfigured)), "{res:?}"); // similarly for a not found key - let extractor = MultiNameExtractor::new("G", ["g"]); + let extractor = ConstExtractor::new("g"); let res = check_fields_present0([&extractor]); assert!(matches!(res, Ok(Summary::Unconfigured)), "{res:?}"); } @@ -410,7 +407,7 @@ mod tests { let span = tracing::info_span!("foo", e = "some value"); let _guard = span.enter(); - let extractors: [&dyn Extractor; 1] = [&MultiNameExtractor::new("E", ["e"])]; + let extractors: [&dyn Extractor; 1] = [&ConstExtractor::new("e")]; if span.is_disabled() { // the tests are running single threaded, or we got lucky and no other tests subscriber diff --git a/libs/walproposer/build.rs b/libs/walproposer/build.rs index fd09030dbd..3126b170a4 100644 --- a/libs/walproposer/build.rs +++ b/libs/walproposer/build.rs @@ -34,6 +34,9 @@ fn main() -> anyhow::Result<()> { 
println!("cargo:rustc-link-lib=static=walproposer"); println!("cargo:rustc-link-search={walproposer_lib_search_str}"); + // Rebuild crate when libwalproposer.a changes + println!("cargo:rerun-if-changed={walproposer_lib_search_str}/libwalproposer.a"); + let pg_config_bin = pg_install_abs.join("v16").join("bin").join("pg_config"); let inc_server_path: String = if pg_config_bin.exists() { let output = Command::new(pg_config_bin) @@ -79,6 +82,7 @@ fn main() -> anyhow::Result<()> { .allowlist_function("WalProposerBroadcast") .allowlist_function("WalProposerPoll") .allowlist_function("WalProposerFree") + .allowlist_function("SafekeeperStateDesiredEvents") .allowlist_var("DEBUG5") .allowlist_var("DEBUG4") .allowlist_var("DEBUG3") diff --git a/libs/walproposer/src/api_bindings.rs b/libs/walproposer/src/api_bindings.rs index 1f7bf952dc..f5ed6ebb97 100644 --- a/libs/walproposer/src/api_bindings.rs +++ b/libs/walproposer/src/api_bindings.rs @@ -22,6 +22,7 @@ use crate::bindings::WalProposerExecStatusType; use crate::bindings::WalproposerShmemState; use crate::bindings::XLogRecPtr; use crate::walproposer::ApiImpl; +use crate::walproposer::StreamingCallback; use crate::walproposer::WaitResult; extern "C" fn get_shmem_state(wp: *mut WalProposer) -> *mut WalproposerShmemState { @@ -36,7 +37,8 @@ extern "C" fn start_streaming(wp: *mut WalProposer, startpos: XLogRecPtr) { unsafe { let callback_data = (*(*wp).config).callback_data; let api = callback_data as *mut Box; - (*api).start_streaming(startpos) + let callback = StreamingCallback::new(wp); + (*api).start_streaming(startpos, &callback); } } @@ -134,19 +136,18 @@ extern "C" fn conn_async_read( unsafe { let callback_data = (*(*(*sk).wp).config).callback_data; let api = callback_data as *mut Box; - let (res, result) = (*api).conn_async_read(&mut (*sk)); // This function has guarantee that returned buf will be valid until // the next call. So we can store a Vec in each Safekeeper and reuse // it on the next call. 
let mut inbuf = take_vec_u8(&mut (*sk).inbuf).unwrap_or_default(); - inbuf.clear(); - inbuf.extend_from_slice(res); + + let result = (*api).conn_async_read(&mut (*sk), &mut inbuf); // Put a Vec back to sk->inbuf and return data ptr. + *amount = inbuf.len() as i32; *buf = store_vec_u8(&mut (*sk).inbuf, inbuf); - *amount = res.len() as i32; result } @@ -182,6 +183,10 @@ extern "C" fn recovery_download(wp: *mut WalProposer, sk: *mut Safekeeper) -> bo unsafe { let callback_data = (*(*(*sk).wp).config).callback_data; let api = callback_data as *mut Box; + + // currently `recovery_download` is always called right after election + (*api).after_election(&mut (*wp)); + (*api).recovery_download(&mut (*wp), &mut (*sk)) } } @@ -277,7 +282,8 @@ extern "C" fn wait_event_set( } WaitResult::Timeout => { *event_sk = std::ptr::null_mut(); - *events = crate::bindings::WL_TIMEOUT; + // WaitEventSetWait returns 0 for timeout. + *events = 0; 0 } WaitResult::Network(sk, event_mask) => { @@ -318,11 +324,11 @@ extern "C" fn finish_sync_safekeepers(wp: *mut WalProposer, lsn: XLogRecPtr) { } } -extern "C" fn process_safekeeper_feedback(wp: *mut WalProposer, commit_lsn: XLogRecPtr) { +extern "C" fn process_safekeeper_feedback(wp: *mut WalProposer) { unsafe { let callback_data = (*(*wp).config).callback_data; let api = callback_data as *mut Box; - (*api).process_safekeeper_feedback(&mut (*wp), commit_lsn) + (*api).process_safekeeper_feedback(&mut (*wp)) } } @@ -340,7 +346,7 @@ extern "C" fn log_internal( } } -#[derive(Debug)] +#[derive(Debug, PartialEq)] pub enum Level { Debug5, Debug4, diff --git a/libs/walproposer/src/walproposer.rs b/libs/walproposer/src/walproposer.rs index 7251545792..734967da3f 100644 --- a/libs/walproposer/src/walproposer.rs +++ b/libs/walproposer/src/walproposer.rs @@ -1,13 +1,13 @@ use std::ffi::CString; use postgres_ffi::WAL_SEGMENT_SIZE; -use utils::id::TenantTimelineId; +use utils::{id::TenantTimelineId, lsn::Lsn}; use crate::{ api_bindings::{create_api, 
take_vec_u8, Level}, bindings::{ - NeonWALReadResult, Safekeeper, WalProposer, WalProposerConfig, WalProposerCreate, - WalProposerFree, WalProposerStart, + NeonWALReadResult, Safekeeper, WalProposer, WalProposerBroadcast, WalProposerConfig, + WalProposerCreate, WalProposerFree, WalProposerPoll, WalProposerStart, }, }; @@ -16,11 +16,11 @@ use crate::{ /// /// Refer to `pgxn/neon/walproposer.h` for documentation. pub trait ApiImpl { - fn get_shmem_state(&self) -> &mut crate::bindings::WalproposerShmemState { + fn get_shmem_state(&self) -> *mut crate::bindings::WalproposerShmemState { todo!() } - fn start_streaming(&self, _startpos: u64) { + fn start_streaming(&self, _startpos: u64, _callback: &StreamingCallback) { todo!() } @@ -70,7 +70,11 @@ pub trait ApiImpl { todo!() } - fn conn_async_read(&self, _sk: &mut Safekeeper) -> (&[u8], crate::bindings::PGAsyncReadResult) { + fn conn_async_read( + &self, + _sk: &mut Safekeeper, + _vec: &mut Vec, + ) -> crate::bindings::PGAsyncReadResult { todo!() } @@ -138,7 +142,7 @@ pub trait ApiImpl { todo!() } - fn process_safekeeper_feedback(&self, _wp: &mut WalProposer, _commit_lsn: u64) { + fn process_safekeeper_feedback(&mut self, _wp: &mut WalProposer) { todo!() } @@ -151,12 +155,14 @@ pub trait ApiImpl { } } +#[derive(Debug)] pub enum WaitResult { Latch, Timeout, Network(*mut Safekeeper, u32), } +#[derive(Clone)] pub struct Config { /// Tenant and timeline id pub ttid: TenantTimelineId, @@ -242,6 +248,24 @@ impl Drop for Wrapper { } } +pub struct StreamingCallback { + wp: *mut WalProposer, +} + +impl StreamingCallback { + pub fn new(wp: *mut WalProposer) -> StreamingCallback { + StreamingCallback { wp } + } + + pub fn broadcast(&self, startpos: Lsn, endpos: Lsn) { + unsafe { WalProposerBroadcast(self.wp, startpos.0, endpos.0) } + } + + pub fn poll(&self) { + unsafe { WalProposerPoll(self.wp) } + } +} + #[cfg(test)] mod tests { use core::panic; @@ -344,14 +368,13 @@ mod tests { fn conn_async_read( &self, _: &mut 
crate::bindings::Safekeeper, - ) -> (&[u8], crate::bindings::PGAsyncReadResult) { + vec: &mut Vec, + ) -> crate::bindings::PGAsyncReadResult { println!("conn_async_read"); let reply = self.next_safekeeper_reply(); println!("conn_async_read result: {:?}", reply); - ( - reply, - crate::bindings::PGAsyncReadResult_PG_ASYNC_READ_SUCCESS, - ) + vec.extend_from_slice(reply); + crate::bindings::PGAsyncReadResult_PG_ASYNC_READ_SUCCESS } fn conn_blocking_write(&self, _: &mut crate::bindings::Safekeeper, buf: &[u8]) -> bool { @@ -453,9 +476,12 @@ mod tests { event_mask: 0, }), expected_messages: vec![ - // Greeting(ProposerGreeting { protocol_version: 2, pg_version: 160001, proposer_id: [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], system_id: 0, timeline_id: 9e4c8f36063c6c6e93bc20d65a820f3d, tenant_id: 9e4c8f36063c6c6e93bc20d65a820f3d, tli: 1, wal_seg_size: 16777216 }) + // TODO: When updating Postgres versions, this test will cause + // problems. Postgres version in message needs updating. + // + // Greeting(ProposerGreeting { protocol_version: 2, pg_version: 160002, proposer_id: [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], system_id: 0, timeline_id: 9e4c8f36063c6c6e93bc20d65a820f3d, tenant_id: 9e4c8f36063c6c6e93bc20d65a820f3d, tli: 1, wal_seg_size: 16777216 }) vec![ - 103, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0, 0, 1, 113, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 103, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0, 0, 2, 113, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 158, 76, 143, 54, 6, 60, 108, 110, 147, 188, 32, 214, 90, 130, 15, 61, 158, 76, 143, 54, 6, 60, 108, 110, 147, 188, 32, 214, 90, 130, 15, 61, 1, 0, 0, 0, 0, 0, 0, 1, diff --git a/pageserver/Cargo.toml b/pageserver/Cargo.toml index e44501d1ed..5adeaffe1a 100644 --- a/pageserver/Cargo.toml +++ b/pageserver/Cargo.toml @@ -12,6 +12,7 @@ testing = ["fail/failpoints"] [dependencies] anyhow.workspace = true +arc-swap.workspace = true async-compression.workspace = true async-stream.workspace = true 
async-trait.workspace = true @@ -21,7 +22,6 @@ camino.workspace = true camino-tempfile.workspace = true chrono = { workspace = true, features = ["serde"] } clap = { workspace = true, features = ["string"] } -close_fds.workspace = true const_format.workspace = true consumption_metrics.workspace = true crc32c.workspace = true @@ -36,6 +36,7 @@ humantime.workspace = true humantime-serde.workspace = true hyper.workspace = true itertools.workspace = true +leaky-bucket.workspace = true md5.workspace = true nix.workspace = true # hack to get the number of worker threads tokio uses @@ -72,6 +73,7 @@ url.workspace = true walkdir.workspace = true metrics.workspace = true pageserver_api.workspace = true +pageserver_compaction.workspace = true postgres_connection.workspace = true postgres_ffi.workspace = true pq_proto.workspace = true @@ -83,7 +85,7 @@ workspace_hack.workspace = true reqwest.workspace = true rpds.workspace = true enum-map.workspace = true -enumset.workspace = true +enumset = { workspace = true, features = ["serde"]} strum.workspace = true strum_macros.workspace = true diff --git a/pageserver/benches/bench_walredo.rs b/pageserver/benches/bench_walredo.rs index 4837626086..47c8bd75c6 100644 --- a/pageserver/benches/bench_walredo.rs +++ b/pageserver/benches/bench_walredo.rs @@ -6,14 +6,28 @@ //! There are two sets of inputs; `short` and `medium`. They were collected on postgres v14 by //! logging what happens when a sequential scan is requested on a small table, then picking out two //! suitable from logs. +//! +//! +//! Reference data (git blame to see commit) on an i3en.3xlarge +// ```text +//! short/short/1 time: [39.175 µs 39.348 µs 39.536 µs] +//! short/short/2 time: [51.227 µs 51.487 µs 51.755 µs] +//! short/short/4 time: [76.048 µs 76.362 µs 76.674 µs] +//! short/short/8 time: [128.94 µs 129.82 µs 130.74 µs] +//! short/short/16 time: [227.84 µs 229.00 µs 230.28 µs] +//! short/short/32 time: [455.97 µs 457.81 µs 459.90 µs] +//! 
short/short/64 time: [902.46 µs 904.84 µs 907.32 µs] +//! short/short/128 time: [1.7416 ms 1.7487 ms 1.7561 ms] +//! `` -use std::sync::{Arc, Barrier}; +use std::sync::Arc; use bytes::{Buf, Bytes}; use pageserver::{ config::PageServerConf, repository::Key, walrecord::NeonWalRecord, walredo::PostgresRedoManager, }; use pageserver_api::shard::TenantShardId; +use tokio::task::JoinSet; use utils::{id::TenantId, lsn::Lsn}; use criterion::{criterion_group, criterion_main, BenchmarkId, Criterion}; @@ -39,11 +53,11 @@ fn redo_scenarios(c: &mut Criterion) { .build() .unwrap(); tracing::info!("executing first"); - short().execute(rt.handle(), &manager).unwrap(); + rt.block_on(short().execute(&manager)).unwrap(); tracing::info!("first executed"); } - let thread_counts = [1, 2, 4, 8, 16]; + let thread_counts = [1, 2, 4, 8, 16, 32, 64, 128]; let mut group = c.benchmark_group("short"); group.sampling_mode(criterion::SamplingMode::Flat); @@ -74,114 +88,69 @@ fn redo_scenarios(c: &mut Criterion) { drop(group); } -/// Sets up `threads` number of requesters to `request_redo`, with the given input. +/// Sets up a multi-threaded tokio runtime with default worker thread count, +/// then, spawn `requesters` tasks that repeatedly: +/// - get input from `input_factory()` +/// - call `manager.request_redo()` with their input +/// +/// This stress-tests the scalability of a single walredo manager at high tokio-level concurrency. +/// +/// Using tokio's default worker thread count means the results will differ on machines +/// with different core counts. We don't care about that, the performance will always +/// be different on different hardware. To compare performance of different software versions, +/// use the same hardware. 
fn add_multithreaded_walredo_requesters( b: &mut criterion::Bencher, - threads: u32, + nrequesters: usize, manager: &Arc, input_factory: fn() -> Request, ) { - assert_ne!(threads, 0); + assert_ne!(nrequesters, 0); - if threads == 1 { - let rt = tokio::runtime::Builder::new_current_thread() - .enable_all() - .build() - .unwrap(); - let handle = rt.handle(); - b.iter_batched_ref( - || Some(input_factory()), - |input| execute_all(input.take(), handle, manager), - criterion::BatchSize::PerIteration, - ); - } else { - let (work_tx, work_rx) = std::sync::mpsc::sync_channel(threads as usize); + let rt = tokio::runtime::Builder::new_multi_thread() + .enable_all() + .build() + .unwrap(); - let work_rx = std::sync::Arc::new(std::sync::Mutex::new(work_rx)); + let barrier = Arc::new(tokio::sync::Barrier::new(nrequesters + 1)); - let barrier = Arc::new(Barrier::new(threads as usize + 1)); - - let jhs = (0..threads) - .map(|_| { - std::thread::spawn({ - let manager = manager.clone(); - let barrier = barrier.clone(); - let work_rx = work_rx.clone(); - move || { - let rt = tokio::runtime::Builder::new_current_thread() - .enable_all() - .build() - .unwrap(); - let handle = rt.handle(); - loop { - // queue up and wait if we want to go another round - if work_rx.lock().unwrap().recv().is_err() { - break; - } - - let input = Some(input_factory()); - - barrier.wait(); - - execute_all(input, handle, &manager).unwrap(); - - barrier.wait(); - } - } - }) - }) - .collect::>(); - - let _jhs = JoinOnDrop(jhs); - - b.iter_batched( - || { - for _ in 0..threads { - work_tx.send(()).unwrap() - } - }, - |()| { - // start the work - barrier.wait(); - - // wait for work to complete - barrier.wait(); - }, - criterion::BatchSize::PerIteration, - ); - - drop(work_tx); + let mut requesters = JoinSet::new(); + for _ in 0..nrequesters { + let _entered = rt.enter(); + let manager = manager.clone(); + let barrier = barrier.clone(); + requesters.spawn(async move { + loop { + let input = input_factory(); + 
barrier.wait().await; + let page = input.execute(&manager).await.unwrap(); + assert_eq!(page.remaining(), 8192); + barrier.wait().await; + } + }); } -} -struct JoinOnDrop(Vec>); + let do_one_iteration = || { + rt.block_on(async { + barrier.wait().await; + // wait for work to complete + barrier.wait().await; + }) + }; -impl Drop for JoinOnDrop { - // it's not really needless because we want join all then check for panicks - #[allow(clippy::needless_collect)] - fn drop(&mut self) { - // first join all - let results = self.0.drain(..).map(|jh| jh.join()).collect::>(); - // then check the results; panicking here is not great, but it does get the message across - // to the user, and sets an exit value. - results.into_iter().try_for_each(|res| res).unwrap(); - } -} + b.iter_batched( + || { + // warmup + do_one_iteration(); + }, + |()| { + // work loop + do_one_iteration(); + }, + criterion::BatchSize::PerIteration, + ); -fn execute_all( - input: I, - handle: &tokio::runtime::Handle, - manager: &PostgresRedoManager, -) -> anyhow::Result<()> -where - I: IntoIterator, -{ - // just fire all requests as fast as possible - input.into_iter().try_for_each(|req| { - let page = req.execute(handle, manager)?; - assert_eq!(page.remaining(), 8192); - anyhow::Ok(()) - }) + rt.block_on(requesters.shutdown()); } criterion_group!(benches, redo_scenarios); @@ -493,11 +462,7 @@ struct Request { } impl Request { - fn execute( - self, - rt: &tokio::runtime::Handle, - manager: &PostgresRedoManager, - ) -> anyhow::Result { + async fn execute(self, manager: &PostgresRedoManager) -> anyhow::Result { let Request { key, lsn, @@ -506,6 +471,8 @@ impl Request { pg_version, } = self; - rt.block_on(manager.request_redo(key, lsn, base_img, records, pg_version)) + manager + .request_redo(key, lsn, base_img, records, pg_version) + .await } } diff --git a/pageserver/client/src/mgmt_api.rs b/pageserver/client/src/mgmt_api.rs index 077c3909e1..732eb951c9 100644 --- a/pageserver/client/src/mgmt_api.rs +++ 
b/pageserver/client/src/mgmt_api.rs @@ -7,7 +7,7 @@ use utils::{ pub mod util; -#[derive(Debug)] +#[derive(Debug, Clone)] pub struct Client { mgmt_api_endpoint: String, authorization_header: Option, @@ -24,6 +24,9 @@ pub enum Error { #[error("pageserver API: {1}")] ApiError(StatusCode, String), + + #[error("Cancelled")] + Cancelled, } pub type Result = std::result::Result; @@ -56,10 +59,18 @@ pub enum ForceAwaitLogicalSize { impl Client { pub fn new(mgmt_api_endpoint: String, jwt: Option<&str>) -> Self { + Self::from_client(reqwest::Client::new(), mgmt_api_endpoint, jwt) + } + + pub fn from_client( + client: reqwest::Client, + mgmt_api_endpoint: String, + jwt: Option<&str>, + ) -> Self { Self { mgmt_api_endpoint, authorization_header: jwt.map(|jwt| format!("Bearer {jwt}")), - client: reqwest::Client::new(), + client, } } @@ -69,6 +80,25 @@ impl Client { resp.json().await.map_err(Error::ReceiveBody) } + /// Get an arbitrary path and returning a streaming Response. This function is suitable + /// for pass-through/proxy use cases where we don't care what the response content looks + /// like. + /// + /// Use/add one of the properly typed methods below if you know aren't proxying, and + /// know what kind of response you expect. + pub async fn get_raw(&self, path: String) -> Result { + debug_assert!(path.starts_with('/')); + let uri = format!("{}{}", self.mgmt_api_endpoint, path); + + let req = self.client.request(Method::GET, uri); + let req = if let Some(value) = &self.authorization_header { + req.header(reqwest::header::AUTHORIZATION, value) + } else { + req + }; + req.send().await.map_err(Error::ReceiveBody) + } + pub async fn tenant_details( &self, tenant_shard_id: TenantShardId, @@ -171,6 +201,39 @@ impl Client { .map_err(Error::ReceiveBody) } + /// The tenant deletion API can return 202 if deletion is incomplete, or + /// 404 if it is complete. Callers are responsible for checking the status + /// code and retrying. Error codes other than 404 will return Err(). 
+ pub async fn tenant_delete(&self, tenant_shard_id: TenantShardId) -> Result { + let uri = format!("{}/v1/tenant/{tenant_shard_id}", self.mgmt_api_endpoint); + + match self.request(Method::DELETE, &uri, ()).await { + Err(Error::ApiError(status_code, msg)) => { + if status_code == StatusCode::NOT_FOUND { + Ok(StatusCode::NOT_FOUND) + } else { + Err(Error::ApiError(status_code, msg)) + } + } + Err(e) => Err(e), + Ok(response) => Ok(response.status()), + } + } + + pub async fn tenant_time_travel_remote_storage( + &self, + tenant_shard_id: TenantShardId, + timestamp: &str, + done_if_after: &str, + ) -> Result<()> { + let uri = format!( + "{}/v1/tenant/{tenant_shard_id}/time_travel_remote_storage?travel_to={timestamp}&done_if_after={done_if_after}", + self.mgmt_api_endpoint + ); + self.request(Method::PUT, &uri, ()).await?; + Ok(()) + } + pub async fn tenant_config(&self, req: &TenantConfigRequest) -> Result<()> { let uri = format!("{}/v1/tenant/config", self.mgmt_api_endpoint); self.request(Method::PUT, &uri, req).await?; @@ -191,21 +254,30 @@ impl Client { tenant_shard_id: TenantShardId, config: LocationConfig, flush_ms: Option, + lazy: bool, ) -> Result<()> { let req_body = TenantLocationConfigRequest { tenant_id: tenant_shard_id, config, }; - let path = format!( + + let mut path = reqwest::Url::parse(&format!( "{}/v1/tenant/{}/location_config", self.mgmt_api_endpoint, tenant_shard_id - ); - let path = if let Some(flush_ms) = flush_ms { - format!("{}?flush_ms={}", path, flush_ms.as_millis()) - } else { - path - }; - self.request(Method::PUT, &path, &req_body).await?; + )) + // Should always work: mgmt_api_endpoint is configuration, not user input. 
+ .expect("Cannot build URL"); + + if lazy { + path.query_pairs_mut().append_pair("lazy", "true"); + } + + if let Some(flush_ms) = flush_ms { + path.query_pairs_mut() + .append_pair("flush_ms", &format!("{}", flush_ms.as_millis())); + } + + self.request(Method::PUT, path, &req_body).await?; Ok(()) } @@ -218,6 +290,21 @@ impl Client { .map_err(Error::ReceiveBody) } + pub async fn get_location_config( + &self, + tenant_shard_id: TenantShardId, + ) -> Result> { + let path = format!( + "{}/v1/location_config/{tenant_shard_id}", + self.mgmt_api_endpoint + ); + self.request(Method::GET, &path, ()) + .await? + .json() + .await + .map_err(Error::ReceiveBody) + } + pub async fn timeline_create( &self, tenant_shard_id: TenantShardId, @@ -234,6 +321,32 @@ impl Client { .map_err(Error::ReceiveBody) } + /// The timeline deletion API can return 201 if deletion is incomplete, or + /// 404 if it is complete. Callers are responsible for checking the status + /// code and retrying. Error codes other than 404 will return Err(). 
+ pub async fn timeline_delete( + &self, + tenant_shard_id: TenantShardId, + timeline_id: TimelineId, + ) -> Result { + let uri = format!( + "{}/v1/tenant/{tenant_shard_id}/timeline/{timeline_id}", + self.mgmt_api_endpoint + ); + + match self.request(Method::DELETE, &uri, ()).await { + Err(Error::ApiError(status_code, msg)) => { + if status_code == StatusCode::NOT_FOUND { + Ok(StatusCode::NOT_FOUND) + } else { + Err(Error::ApiError(status_code, msg)) + } + } + Err(e) => Err(e), + Ok(response) => Ok(response.status()), + } + } + pub async fn tenant_reset(&self, tenant_shard_id: TenantShardId) -> Result<()> { let uri = format!( "{}/v1/tenant/{}/reset", @@ -246,6 +359,22 @@ impl Client { .map_err(Error::ReceiveBody) } + pub async fn tenant_shard_split( + &self, + tenant_shard_id: TenantShardId, + req: TenantShardSplitRequest, + ) -> Result { + let uri = format!( + "{}/v1/tenant/{}/shard_split", + self.mgmt_api_endpoint, tenant_shard_id + ); + self.request(Method::PUT, &uri, req) + .await? + .json() + .await + .map_err(Error::ReceiveBody) + } + pub async fn timeline_list( &self, tenant_shard_id: &TenantShardId, @@ -275,4 +404,16 @@ impl Client { .await .map_err(Error::ReceiveBody) } + + pub async fn put_io_engine( + &self, + engine: &pageserver_api::models::virtual_file::IoEngineKind, + ) -> Result<()> { + let uri = format!("{}/v1/io_engine", self.mgmt_api_endpoint); + self.request(Method::PUT, uri, engine) + .await? 
+ .json() + .await + .map_err(Error::ReceiveBody) + } } diff --git a/pageserver/client/src/page_service.rs b/pageserver/client/src/page_service.rs index ff542670f1..49175b3b90 100644 --- a/pageserver/client/src/page_service.rs +++ b/pageserver/client/src/page_service.rs @@ -156,7 +156,8 @@ impl PagestreamClient { PagestreamBeMessage::Error(e) => anyhow::bail!("Error: {:?}", e), PagestreamBeMessage::Exists(_) | PagestreamBeMessage::Nblocks(_) - | PagestreamBeMessage::DbSize(_) => { + | PagestreamBeMessage::DbSize(_) + | PagestreamBeMessage::GetSlruSegment(_) => { anyhow::bail!( "unexpected be message kind in response to getpage request: {}", msg.kind() diff --git a/pageserver/compaction/Cargo.toml b/pageserver/compaction/Cargo.toml new file mode 100644 index 0000000000..47f318db63 --- /dev/null +++ b/pageserver/compaction/Cargo.toml @@ -0,0 +1,54 @@ +[package] +name = "pageserver_compaction" +version = "0.1.0" +edition.workspace = true +license.workspace = true + +[features] +default = [] + +[dependencies] +anyhow.workspace = true +async-compression.workspace = true +async-stream.workspace = true +async-trait.workspace = true +byteorder.workspace = true +bytes.workspace = true +chrono = { workspace = true, features = ["serde"] } +clap = { workspace = true, features = ["string"] } +const_format.workspace = true +consumption_metrics.workspace = true +crossbeam-utils.workspace = true +either.workspace = true +flate2.workspace = true +fail.workspace = true +futures.workspace = true +git-version.workspace = true +hex.workspace = true +humantime.workspace = true +humantime-serde.workspace = true +itertools.workspace = true +once_cell.workspace = true +pageserver_api.workspace = true +pin-project-lite.workspace = true +rand.workspace = true +smallvec = { workspace = true, features = ["write"] } +svg_fmt.workspace = true +sync_wrapper.workspace = true +thiserror.workspace = true +tokio = { workspace = true, features = ["process", "sync", "fs", "rt", "io-util", "time"] } 
+tokio-io-timeout.workspace = true +tokio-util.workspace = true +tracing.workspace = true +tracing-error.workspace = true +tracing-subscriber.workspace = true +url.workspace = true +walkdir.workspace = true +metrics.workspace = true +utils.workspace = true +workspace_hack.workspace = true + +[dev-dependencies] +criterion.workspace = true +hex-literal.workspace = true +tokio = { workspace = true, features = ["process", "sync", "fs", "rt", "io-util", "time", "test-util"] } diff --git a/pageserver/compaction/TODO.md b/pageserver/compaction/TODO.md new file mode 100644 index 0000000000..85523ad5b3 --- /dev/null +++ b/pageserver/compaction/TODO.md @@ -0,0 +1,51 @@ +# TODO + +- If the key space can be perfectly partitioned at some key, perform planning on each + partition separately. For example, if we are compacting a level with layers like this: + + ``` + : + +--+ +----+ : +------+ + | | | | : | | + +--+ +----+ : +------+ + : + +-----+ +-+ : +--------+ + | | | | : | | + +-----+ +-+ : +--------+ + : + ``` + + At the dotted line, there is a natural split in the key space, such that all + layers are either on the left or the right of it. We can compact the + partitions separately. We could choose to create image layers for one + partition but not the other one, for example. + +- All the layers don't have to be exactly the same size, we can choose to cut a + layer short or stretch it a little larger than the target size, if it helps + the overall system. We can help perfect partitions (see previous bullet point) + to happen more frequently, by choosing the cut points wisely. For example, try + to cut layers at boundaries of underlying image layers. And "snap to grid", + i.e. don't cut layers at any key, but e.g. only when key % 10000 = 0. + +- Avoid rewriting layers when we'd just create an identical layer to an input + layer. + +- Parallelism. 
The code is already split up into planning and execution, so that + we first split up the compaction work into "Jobs", and then execute them. + It would be straightforward to execute multiple jobs in parallel. + +- Materialize extra pages in delta layers during compaction. This would reduce + read amplification. There has been the idea of partial image layers. Materializing + extra pages in the delta layers achieves the same goal, without introducing a new + concept. + +## Simulator + +- Expand the simulator for more workloads +- Automate a test suite that runs the simulator with different workloads and + spits out a table of results +- Model read amplification +- More sanity checking. One idea is to keep a reference count of each + MockRecord, i.e. use Arc<MockRecord> instead of plain MockRecord, and panic if + a MockRecord that is newer than PITR horizon is completely dropped. That would + indicate that the record was lost. diff --git a/pageserver/compaction/src/bin/compaction-simulator.rs b/pageserver/compaction/src/bin/compaction-simulator.rs new file mode 100644 index 0000000000..1fd69407d3 --- /dev/null +++ b/pageserver/compaction/src/bin/compaction-simulator.rs @@ -0,0 +1,214 @@ +use clap::{Parser, Subcommand}; +use pageserver_compaction::simulator::MockTimeline; +use rand::Rng; +use std::io::Write; +use std::path::{Path, PathBuf}; +use std::sync::OnceLock; + +use utils::project_git_version; + +project_git_version!(GIT_VERSION); + +#[derive(Parser)] +#[command( + version = GIT_VERSION, + about = "Neon Pageserver compaction simulator", + long_about = "A developer tool to visualize and test compaction" +)] +#[command(propagate_version = true)] +struct CliOpts { + #[command(subcommand)] + command: Commands, +} + +#[derive(Subcommand)] +enum Commands { + RunSuite, + Simulate(SimulateCmd), +} + +#[derive(Clone, clap::ValueEnum)] +enum Distribution { + Uniform, + HotCold, +} + +/// Read and update pageserver metadata file +#[derive(Parser)] +struct SimulateCmd { + 
distribution: Distribution, + + /// Number of records to digest + num_records: u64, + /// Record length + record_len: u64, + + // Logical database size in MB + logical_size: u64, +} + +async fn simulate(cmd: &SimulateCmd, results_path: &Path) -> anyhow::Result<()> { + let mut executor = MockTimeline::new(); + + // Convert the logical size in MB into a key range. + let key_range = 0..((cmd.logical_size * 1024 * 1024) / 8192); + //let key_range = u64::MIN..u64::MAX; + println!( + "starting simulation with key range {:016X}-{:016X}", + key_range.start, key_range.end + ); + + // helper function to print progress indicator + let print_progress = |i| -> anyhow::Result<()> { + if i == 0 || (i + 1) % 10000 == 0 || i == cmd.num_records - 1 { + print!( + "\ringested {} / {} records, {} MiB / {} MiB...", + i + 1, + cmd.num_records, + (i + 1) * cmd.record_len / (1_000_000), + cmd.num_records * cmd.record_len / (1_000_000), + ); + std::io::stdout().flush()?; + } + Ok(()) + }; + + match cmd.distribution { + Distribution::Uniform => { + for i in 0..cmd.num_records { + executor.ingest_uniform(1, cmd.record_len, &key_range)?; + executor.compact_if_needed().await?; + + print_progress(i)?; + } + } + Distribution::HotCold => { + let splitpoint = key_range.start + (key_range.end - key_range.start) / 10; + let hot_key_range = 0..splitpoint; + let cold_key_range = splitpoint..key_range.end; + + for i in 0..cmd.num_records { + let chosen_range = if rand::thread_rng().gen_bool(0.9) { + &hot_key_range + } else { + &cold_key_range + }; + executor.ingest_uniform(1, cmd.record_len, chosen_range)?; + executor.compact_if_needed().await?; + + print_progress(i)?; + } + } + } + println!("done!"); + executor.flush_l0(); + executor.compact_if_needed().await?; + let stats = executor.stats()?; + + // Print the stats to stdout, and also to a file + print!("{stats}"); + std::fs::write(results_path.join("stats.txt"), stats)?; + + let animation_path = results_path.join("compaction-animation.html"); + 
executor.draw_history(std::fs::File::create(&animation_path)?)?; + println!( + "animation: file://{}", + animation_path.canonicalize()?.display() + ); + + Ok(()) +} + +async fn run_suite_cmd(results_path: &Path, workload: &SimulateCmd) -> anyhow::Result<()> { + std::fs::create_dir(results_path)?; + + set_log_file(File::create(results_path.join("log"))?); + let result = simulate(workload, results_path).await; + set_log_stdout(); + result +} + +async fn run_suite() -> anyhow::Result<()> { + let top_results_path = PathBuf::from(format!( + "compaction-suite-results.{}", + std::time::SystemTime::UNIX_EPOCH.elapsed()?.as_secs() + )); + std::fs::create_dir(&top_results_path)?; + + let workload = SimulateCmd { + distribution: Distribution::Uniform, + // Generate 20 GB of WAL + record_len: 1_000, + num_records: 20_000_000, + // Logical size 5 GB + logical_size: 5_000, + }; + + run_suite_cmd(&top_results_path.join("uniform-20GB-5GB"), &workload).await?; + + println!( + "All tests finished. Results in {}", + top_results_path.display() + ); + Ok(()) +} + +use std::fs::File; +use std::io::Stdout; +use std::sync::Mutex; +use tracing_subscriber::fmt::writer::EitherWriter; +use tracing_subscriber::fmt::MakeWriter; + +static LOG_FILE: OnceLock>> = OnceLock::new(); +fn get_log_output() -> &'static Mutex> { + LOG_FILE.get_or_init(|| std::sync::Mutex::new(EitherWriter::B(std::io::stdout()))) +} + +fn set_log_file(f: File) { + *get_log_output().lock().unwrap() = EitherWriter::A(f); +} + +fn set_log_stdout() { + *get_log_output().lock().unwrap() = EitherWriter::B(std::io::stdout()); +} + +fn init_logging() -> anyhow::Result<()> { + // We fall back to printing all spans at info-level or above if + // the RUST_LOG environment variable is not set. + let rust_log_env_filter = || { + tracing_subscriber::EnvFilter::try_from_default_env() + .unwrap_or_else(|_| tracing_subscriber::EnvFilter::new("info")) + }; + + // NB: the order of the with() calls does not matter. 
+ // See https://docs.rs/tracing-subscriber/0.3.16/tracing_subscriber/layer/index.html#per-layer-filtering + use tracing_subscriber::prelude::*; + tracing_subscriber::registry() + .with({ + let log_layer = tracing_subscriber::fmt::layer() + .with_target(false) + .with_ansi(false) + .with_writer(|| get_log_output().make_writer()); + log_layer.with_filter(rust_log_env_filter()) + }) + .init(); + + Ok(()) +} + +#[tokio::main] +async fn main() -> anyhow::Result<()> { + let cli = CliOpts::parse(); + + init_logging()?; + + match cli.command { + Commands::Simulate(cmd) => { + simulate(&cmd, &PathBuf::from("/tmp/compactions.html")).await?; + } + Commands::RunSuite => { + run_suite().await?; + } + }; + Ok(()) +} diff --git a/pageserver/compaction/src/compact_tiered.rs b/pageserver/compaction/src/compact_tiered.rs new file mode 100644 index 0000000000..60fc7ac925 --- /dev/null +++ b/pageserver/compaction/src/compact_tiered.rs @@ -0,0 +1,857 @@ +//! # Tiered compaction algorithm. +//! +//! Read all the input delta files, and write a new set of delta files that +//! include all the input WAL records. See retile_deltas(). +//! +//! In a "normal" LSM tree, you get to remove any values that are overwritten by +//! later values, but in our system, we keep all the history. So the reshuffling +//! doesn't remove any garbage, it just reshuffles the records to reduce read +//! amplification, i.e. the number of files that you need to access to find the +//! WAL records for a given key. +//! +//! If the new delta files would be very "narrow", i.e. each file would cover +//! only a narrow key range, then we create a new set of image files +//! instead. The current threshold is that if the estimated total size of the +//! image layers is smaller than the size of the deltas, then we create image +//! layers. That amounts to 2x storage amplification, and it means that the +//! distance of image layers in LSN dimension is roughly equal to the logical +//! database size. 
For example, if the logical database size is 10 GB, we would +//! generate new image layers every 10 GB of WAL. +use futures::StreamExt; +use tracing::{debug, info}; + +use std::collections::{HashSet, VecDeque}; +use std::ops::Range; + +use crate::helpers::{accum_key_values, keyspace_total_size, merge_delta_keys, overlaps_with}; +use crate::interface::*; +use utils::lsn::Lsn; + +use crate::identify_levels::identify_level; + +/// Main entry point to compaction. +/// +/// The starting point is a cutoff LSN (`end_lsn`). The compaction is run on +/// everything below that point, that needs compaction. The cutoff LSN must +/// partition the layers so that there are no layers that span across that +/// LSN. To start compaction at the top of the tree, pass the end LSN of the +/// written last L0 layer. +pub async fn compact_tiered( + executor: &mut E, + end_lsn: Lsn, + target_file_size: u64, + fanout: u64, + ctx: &E::RequestContext, +) -> anyhow::Result<()> { + assert!(fanout >= 2); + // Start at L0 + let mut current_level_no = 0; + let mut current_level_target_height = target_file_size; + loop { + // end LSN +1 to include possible image layers exactly at 'end_lsn'. + let all_layers = executor + .get_layers( + &(E::Key::MIN..E::Key::MAX), + &(Lsn(u64::MIN)..end_lsn + 1), + ctx, + ) + .await?; + info!( + "Compacting L{}, total # of layers: {}", + current_level_no, + all_layers.len() + ); + + // Identify the range of LSNs that belong to this level. We assume that + // each file in this level spans an LSN range up to 1.75x target file + // size. That should give us enough slop that if we created a slightly + // oversized L0 layer, e.g. because flushing the in-memory layer was + // delayed for some reason, we don't consider the oversized layer to + // belong to L1. But not too much slop, that we don't accidentally + // "skip" levels. 
+ let max_height = (current_level_target_height as f64 * 1.75) as u64; + let Some(level) = identify_level(all_layers, end_lsn, max_height).await? else { + break; + }; + + // Calculate the height of this level. If the # of tiers exceeds the + // fanout parameter, it's time to compact it. + let depth = level.depth(); + info!( + "Level {} identified as LSN range {}-{}: depth {}", + current_level_no, level.lsn_range.start, level.lsn_range.end, depth + ); + for l in &level.layers { + debug!("LEVEL {} layer: {}", current_level_no, l.short_id()); + } + if depth < fanout { + debug!( + level = current_level_no, + depth = depth, + fanout, + "too few deltas to compact" + ); + break; + } + + compact_level( + &level.lsn_range, + &level.layers, + executor, + target_file_size, + ctx, + ) + .await?; + if target_file_size == u64::MAX { + break; + } + current_level_no += 1; + current_level_target_height = current_level_target_height.saturating_mul(fanout); + } + Ok(()) +} + +async fn compact_level( + lsn_range: &Range, + layers: &[E::Layer], + executor: &mut E, + target_file_size: u64, + ctx: &E::RequestContext, +) -> anyhow::Result { + let mut layer_fragments = Vec::new(); + for l in layers { + layer_fragments.push(LayerFragment::new(l.clone())); + } + + let mut state = LevelCompactionState { + target_file_size, + _lsn_range: lsn_range.clone(), + layers: layer_fragments, + jobs: Vec::new(), + job_queue: Vec::new(), + next_level: false, + executor, + }; + + let first_job = CompactionJob { + key_range: E::Key::MIN..E::Key::MAX, + lsn_range: lsn_range.clone(), + strategy: CompactionStrategy::Divide, + input_layers: state + .layers + .iter() + .enumerate() + .map(|i| LayerId(i.0)) + .collect(), + completed: false, + }; + + state.jobs.push(first_job); + state.job_queue.push(JobId(0)); + state.execute(ctx).await?; + + info!( + "compaction completed! 
Need to process next level: {}", + state.next_level + ); + + Ok(state.next_level) +} + +/// Blackboard that keeps track of the state of all the jobs and work remaining +struct LevelCompactionState<'a, E> +where + E: CompactionJobExecutor, +{ + // parameters + target_file_size: u64, + + _lsn_range: Range, + layers: Vec>, + + // job queue + jobs: Vec>, + job_queue: Vec, + + /// If false, no need to compact levels below this + next_level: bool, + + /// Interface to the outside world + executor: &'a mut E, +} + +#[derive(Debug, Clone, Copy, Hash, PartialEq, Eq)] +struct LayerId(usize); +#[derive(Debug, Clone, Copy, Hash, PartialEq, Eq)] +struct JobId(usize); + +struct PendingJobSet { + pending: HashSet, + completed: HashSet, +} + +impl PendingJobSet { + fn new() -> Self { + PendingJobSet { + pending: HashSet::new(), + completed: HashSet::new(), + } + } + + fn complete_job(&mut self, job_id: JobId) { + self.pending.remove(&job_id); + self.completed.insert(job_id); + } + + fn all_completed(&self) -> bool { + self.pending.is_empty() + } +} + +// When we decide to rewrite a set of layers, LayerFragment is used to keep +// track which new layers supersede an old layer. When all the stakeholder jobs +// have completed, this layer can be deleted. +struct LayerFragment +where + E: CompactionJobExecutor, +{ + layer: E::Layer, + + // If we will write new layers to replace this one, this keeps track of the + // jobs that need to complete before this layer can be deleted. As the jobs + // complete, they are moved from 'pending' to 'completed' set. Once the + // 'pending' set becomes empty, the layer can be deleted. + // + // If None, this layer is not rewritten and must not be deleted. 
+ deletable_after: Option, + + deleted: bool, +} + +impl LayerFragment +where + E: CompactionJobExecutor, +{ + fn new(layer: E::Layer) -> Self { + LayerFragment { + layer, + deletable_after: None, + deleted: false, + } + } +} + +#[derive(PartialEq)] +enum CompactionStrategy { + Divide, + CreateDelta, + CreateImage, +} + +struct CompactionJob { + key_range: Range, + lsn_range: Range, + + strategy: CompactionStrategy, + + input_layers: Vec, + + completed: bool, +} + +impl<'a, E> LevelCompactionState<'a, E> +where + E: CompactionJobExecutor, +{ + /// Main loop of the executor. + /// + /// In each iteration, we take the next job from the queue, and execute it. + /// The execution might add new jobs to the queue. Keep going until the + /// queue is empty. + /// + /// Initially, the job queue consists of one Divide job over the whole + /// level. On first call, it is divided into smaller jobs. + async fn execute(&mut self, ctx: &E::RequestContext) -> anyhow::Result<()> { + // TODO: this would be pretty straightforward to parallelize with FuturesUnordered + while let Some(next_job_id) = self.job_queue.pop() { + info!("executing job {}", next_job_id.0); + self.execute_job(next_job_id, ctx).await?; + } + + // all done! + Ok(()) + } + + async fn execute_job(&mut self, job_id: JobId, ctx: &E::RequestContext) -> anyhow::Result<()> { + let job = &self.jobs[job_id.0]; + match job.strategy { + CompactionStrategy::Divide => { + self.divide_job(job_id, ctx).await?; + Ok(()) + } + CompactionStrategy::CreateDelta => { + let mut deltas: Vec = Vec::new(); + let mut layer_ids: Vec = Vec::new(); + for layer_id in &job.input_layers { + let layer = &self.layers[layer_id.0].layer; + if let Some(dl) = self.executor.downcast_delta_layer(layer).await? { + deltas.push(dl.clone()); + layer_ids.push(*layer_id); + } + } + + self.executor + .create_delta(&job.lsn_range, &job.key_range, &deltas, ctx) + .await?; + self.jobs[job_id.0].completed = true; + + // did we complete any fragments? 
+ for layer_id in layer_ids { + let l = &mut self.layers[layer_id.0]; + if let Some(deletable_after) = l.deletable_after.as_mut() { + deletable_after.complete_job(job_id); + if deletable_after.all_completed() { + self.executor.delete_layer(&l.layer, ctx).await?; + l.deleted = true; + } + } + } + + self.next_level = true; + + Ok(()) + } + CompactionStrategy::CreateImage => { + self.executor + .create_image(job.lsn_range.end, &job.key_range, ctx) + .await?; + self.jobs[job_id.0].completed = true; + + // TODO: we could check if any layers < PITR horizon became deletable + Ok(()) + } + } + } + + fn push_job(&mut self, job: CompactionJob) -> JobId { + let job_id = JobId(self.jobs.len()); + self.jobs.push(job); + self.job_queue.push(job_id); + job_id + } + + /// Take a partition of the key space, and decide how to compact it. + /// + /// TODO: Currently, this is called exactly once for the level, and we + /// decide whether to create new image layers to cover the whole level, or + /// write a new set of deltas. In the future, this should try to partition + /// the key space, and make the decision separately for each partition. + async fn divide_job(&mut self, job_id: JobId, ctx: &E::RequestContext) -> anyhow::Result<()> { + let job = &self.jobs[job_id.0]; + assert!(job.strategy == CompactionStrategy::Divide); + + // Check for dummy cases + if job.input_layers.is_empty() { + return Ok(()); + } + + let job = &self.jobs[job_id.0]; + assert!(job.strategy == CompactionStrategy::Divide); + + // Would it be better to create images for this partition? 
+ // Decide based on the average density of the level + let keyspace_size = keyspace_total_size( + &self + .executor + .get_keyspace(&job.key_range, job.lsn_range.end, ctx) + .await?, + ) * 8192; + + let wal_size = job + .input_layers + .iter() + .filter(|layer_id| self.layers[layer_id.0].layer.is_delta()) + .map(|layer_id| self.layers[layer_id.0].layer.file_size()) + .sum::(); + if keyspace_size < wal_size { + // seems worth it + info!( + "covering with images, because keyspace_size is {}, size of deltas between {}-{} is {}", + keyspace_size, job.lsn_range.start, job.lsn_range.end, wal_size + ); + self.cover_with_images(job_id, ctx).await + } else { + // do deltas + info!( + "coverage not worth it, keyspace_size {}, wal_size {}", + keyspace_size, wal_size + ); + self.retile_deltas(job_id, ctx).await + } + } + + // LSN + // ^ + // | + // | ###|###|##### + // | +--+-----+--+ +--+-----+--+ + // | | | | | | | | | + // | +--+--+--+--+ +--+--+--+--+ + // | | | | | | | + // | +---+-+-+---+ ==> +---+-+-+---+ + // | | | | | | | | | + // | +---+-+-++--+ +---+-+-++--+ + // | | | | | | | | | + // | +-----+--+--+ +-----+--+--+ + // | + // +--------------> key + // + async fn cover_with_images( + &mut self, + job_id: JobId, + ctx: &E::RequestContext, + ) -> anyhow::Result<()> { + let job = &self.jobs[job_id.0]; + assert!(job.strategy == CompactionStrategy::Divide); + + // XXX: do we still need the "holes" stuff? + + let mut new_jobs = Vec::new(); + + // Slide a window through the keyspace + let keyspace = self + .executor + .get_keyspace(&job.key_range, job.lsn_range.end, ctx) + .await?; + + let mut window = KeyspaceWindow::new( + E::Key::MIN..E::Key::MAX, + keyspace, + self.target_file_size / 8192, + ); + while let Some(key_range) = window.choose_next_image() { + new_jobs.push(CompactionJob:: { + key_range, + lsn_range: job.lsn_range.clone(), + strategy: CompactionStrategy::CreateImage, + input_layers: Vec::new(), // XXX: Is it OK for this to be empty for image layer? 
+ completed: false, + }); + } + + for j in new_jobs.into_iter().rev() { + let _job_id = self.push_job(j); + + // TODO: image layers don't let us delete anything. unless < PITR horizon + //let j = &self.jobs[job_id.0]; + // for layer_id in j.input_layers.iter() { + // self.layers[layer_id.0].pending_stakeholders.insert(job_id); + //} + } + + Ok(()) + } + + // Merge the contents of all the input delta layers into a new set + // of delta layers, based on the current partitioning. + // + // We split the new delta layers on the key dimension. We iterate through + // the key space, and for each key, check if including the next key to the + // current output layer we're building would cause the layer to become too + // large. If so, dump the current output layer and start new one. It's + // possible that there is a single key with so many page versions that + // storing all of them in a single layer file would be too large. In that + // case, we also split on the LSN dimension. + // + // LSN + // ^ + // | + // | +-----------+ +--+--+--+--+ + // | | | | | | | | + // | +-----------+ | | | | | + // | | | | | | | | + // | +-----------+ ==> | | | | | + // | | | | | | | | + // | +-----------+ | | | | | + // | | | | | | | | + // | +-----------+ +--+--+--+--+ + // | + // +--------------> key + // + // + // If one key (X) has a lot of page versions: + // + // LSN + // ^ + // | (X) + // | +-----------+ +--+--+--+--+ + // | | | | | | | | + // | +-----------+ | | +--+ | + // | | | | | | | | + // | +-----------+ ==> | | | | | + // | | | | | +--+ | + // | +-----------+ | | | | | + // | | | | | | | | + // | +-----------+ +--+--+--+--+ + // | + // +--------------> key + // + // TODO: this actually divides the layers into fixed-size chunks, not + // based on the partitioning. + // + // TODO: we should also opportunistically materialize and + // garbage collect what we can. 
+ async fn retile_deltas( + &mut self, + job_id: JobId, + ctx: &E::RequestContext, + ) -> anyhow::Result<()> { + let job = &self.jobs[job_id.0]; + assert!(job.strategy == CompactionStrategy::Divide); + + // Sweep the key space left to right, running an estimate of how much + // disk size and keyspace we have accumulated + // + // Once the disk size reaches the target threshold, stop and think. + // If we have accumulated only a narrow band of keyspace, create an + // image layer. Otherwise write a delta layer. + + // FIXME: deal with the case of lots of values for same key + + // FIXME: we are ignoring images here. Did we already divide the work + // so that we won't encounter them here? + + let mut deltas: Vec = Vec::new(); + for layer_id in &job.input_layers { + let l = &self.layers[layer_id.0]; + if let Some(dl) = self.executor.downcast_delta_layer(&l.layer).await? { + deltas.push(dl.clone()); + } + } + // Open stream + let key_value_stream = std::pin::pin!(merge_delta_keys::(deltas.as_slice(), ctx)); + let mut new_jobs = Vec::new(); + + // Slide a window through the keyspace + let mut key_accum = std::pin::pin!(accum_key_values(key_value_stream)); + let mut all_in_window: bool = false; + let mut window = Window::new(); + loop { + if all_in_window && window.elems.is_empty() { + // All done! + break; + } + if let Some(key_range) = window.choose_next_delta(self.target_file_size, !all_in_window) + { + let batch_layers: Vec = job + .input_layers + .iter() + .filter(|layer_id| { + overlaps_with(self.layers[layer_id.0].layer.key_range(), &key_range) + }) + .cloned() + .collect(); + assert!(!batch_layers.is_empty()); + new_jobs.push(CompactionJob { + key_range, + lsn_range: job.lsn_range.clone(), + strategy: CompactionStrategy::CreateDelta, + input_layers: batch_layers, + completed: false, + }); + } else { + assert!(!all_in_window); + if let Some(next_key) = key_accum.next().await.transpose()? 
{ + window.feed(next_key.key, next_key.size); + } else { + all_in_window = true; + } + } + } + + // All the input files are rewritten. Set up the tracking for when they can + // be deleted. + for layer_id in job.input_layers.iter() { + let l = &mut self.layers[layer_id.0]; + assert!(l.deletable_after.is_none()); + l.deletable_after = Some(PendingJobSet::new()); + } + for j in new_jobs.into_iter().rev() { + let job_id = self.push_job(j); + let j = &self.jobs[job_id.0]; + for layer_id in j.input_layers.iter() { + self.layers[layer_id.0] + .deletable_after + .as_mut() + .unwrap() + .pending + .insert(job_id); + } + } + + Ok(()) + } +} + +// Sliding window through keyspace and values +// This is used by cover_with_images to decide on good split points +struct KeyspaceWindow { + head: KeyspaceWindowHead, + + start_pos: KeyspaceWindowPos, +} +struct KeyspaceWindowHead { + // overall key range to cover + key_range: Range, + + keyspace: Vec>, + target_keysize: u64, +} + +#[derive(Clone)] +struct KeyspaceWindowPos { + end_key: K, + + keyspace_idx: usize, + + accum_keysize: u64, +} +impl KeyspaceWindowPos { + fn reached_end(&self, w: &KeyspaceWindowHead) -> bool { + self.keyspace_idx == w.keyspace.len() + } + + // Advance the cursor until it reaches 'target_keysize'. + fn advance_until_size(&mut self, w: &KeyspaceWindowHead, max_size: u64) { + while self.accum_keysize < max_size && !self.reached_end(w) { + let curr_range = &w.keyspace[self.keyspace_idx]; + if self.end_key < curr_range.start { + // skip over any unused space + self.end_key = curr_range.start; + } + + // We're now within 'curr_range'. Can we advance past it completely? 
+ let distance = K::key_range_size(&(self.end_key..curr_range.end)); + if (self.accum_keysize + distance as u64) < max_size { + // oh yeah, it fits + self.end_key = curr_range.end; + self.keyspace_idx += 1; + self.accum_keysize += distance as u64; + } else { + // advance within the range + let skip_key = self.end_key.skip_some(); + let distance = K::key_range_size(&(self.end_key..skip_key)); + if (self.accum_keysize + distance as u64) < max_size { + self.end_key = skip_key; + self.accum_keysize += distance as u64; + } else { + self.end_key = self.end_key.next(); + self.accum_keysize += 1; + } + } + } + } +} + +impl KeyspaceWindow +where + K: CompactionKey, +{ + fn new(key_range: Range, keyspace: CompactionKeySpace, target_keysize: u64) -> Self { + assert!(keyspace.first().unwrap().start >= key_range.start); + + let start_key = key_range.start; + let start_pos = KeyspaceWindowPos:: { + end_key: start_key, + keyspace_idx: 0, + accum_keysize: 0, + }; + Self { + head: KeyspaceWindowHead:: { + key_range, + keyspace, + target_keysize, + }, + start_pos, + } + } + + fn choose_next_image(&mut self) -> Option> { + if self.start_pos.keyspace_idx == self.head.keyspace.len() { + // we've reached the end + return None; + } + + let mut next_pos = self.start_pos.clone(); + next_pos.advance_until_size( + &self.head, + self.start_pos.accum_keysize + self.head.target_keysize, + ); + + // See if we can gobble up the rest of the keyspace if we stretch out the layer, up to + // 1.25x target size + let mut end_pos = next_pos.clone(); + end_pos.advance_until_size( + &self.head, + self.start_pos.accum_keysize + (self.head.target_keysize * 5 / 4), + ); + if end_pos.reached_end(&self.head) { + // gobble up any unused keyspace between the last used key and end of the range + assert!(end_pos.end_key <= self.head.key_range.end); + end_pos.end_key = self.head.key_range.end; + next_pos = end_pos; + } + + let start_key = self.start_pos.end_key; + self.start_pos = next_pos; + 
Some(start_key..self.start_pos.end_key) + } +} + +// Take previous partitioning, based on the image layers below. +// +// Candidate is at the front: +// +// Consider stretching an image layer to next divider? If it's close enough, +// that's the image candidate +// +// If it's too far, consider splitting at a reasonable point +// +// Is the image candidate smaller than the equivalent delta? If so, +// split off the image. Otherwise, split off one delta. +// Try to snap off the delta at a reasonable point + +struct WindowElement { + start_key: K, // inclusive + last_key: K, // inclusive + accum_size: u64, +} + +// Sliding window through keyspace and values +// +// This is used to decide what layer to write next, from the beginning of the window. +struct Window { + elems: VecDeque>, + + // last key that was split off, inclusive + splitoff_key: Option, + splitoff_size: u64, +} + +impl Window +where + K: CompactionKey, +{ + fn new() -> Self { + Self { + elems: VecDeque::new(), + splitoff_key: None, + splitoff_size: 0, + } + } + + fn feed(&mut self, key: K, size: u64) { + let last_size; + if let Some(last) = self.elems.back_mut() { + assert!(last.last_key <= key); + if key == last.last_key { + last.accum_size += size; + return; + } + last_size = last.accum_size; + } else { + last_size = 0; + } + // This is a new key. 
+ let elem = WindowElement { + start_key: key, + last_key: key, + accum_size: last_size + size, + }; + self.elems.push_back(elem); + } + + fn remain_size(&self) -> u64 { + self.elems.back().unwrap().accum_size - self.splitoff_size + } + + fn peek_size(&self) -> u64 { + self.elems.front().unwrap().accum_size - self.splitoff_size + } + + fn commit_upto(&mut self, mut upto: usize) { + while upto > 1 { + let popped = self.elems.pop_front().unwrap(); + self.elems.front_mut().unwrap().start_key = popped.start_key; + upto -= 1; + } + } + + fn find_size_split(&self, target_size: u64) -> usize { + self.elems + .partition_point(|elem| elem.accum_size - self.splitoff_size < target_size) + } + + fn pop(&mut self) { + let first = self.elems.pop_front().unwrap(); + self.splitoff_size = first.accum_size; + + self.splitoff_key = Some(first.last_key); + } + + // the difference between delta and image is that an image covers + // any unused keyspace before and after, while a delta tries to + // minimize that. TODO: difference not implemented + fn pop_delta(&mut self) -> Range { + let first = self.elems.front().unwrap(); + let key_range = first.start_key..first.last_key.next(); + + self.pop(); + key_range + } + + // Prerequisite: we have enough input in the window + // + // On return None, the caller should feed more data and call again + fn choose_next_delta(&mut self, target_size: u64, has_more: bool) -> Option> { + if has_more && self.elems.is_empty() { + // Starting up + return None; + } + + // If we still have an undersized candidate, just keep going + while self.peek_size() < target_size { + if self.elems.len() > 1 { + self.commit_upto(2); + } else if has_more { + return None; + } else { + break; + } + } + + // Ensure we have enough input in the window to make a good decision + if has_more && self.remain_size() < target_size * 5 / 4 { + return None; + } + + // The candidate on the front is now large enough, for a delta. + // And we have enough data in the window to decide. 
+ + // If we're willing to stretch it up to 1.25 target size, could we + // gobble up the rest of the work? This avoids creating very small + // "tail" layers at the end of the keyspace + if !has_more && self.remain_size() < target_size * 5 / 3 { + self.commit_upto(self.elems.len()); + } else { + let delta_split_at = self.find_size_split(target_size); + self.commit_upto(delta_split_at); + + // If it's still not large enough, request the caller to fill the window + if self.elems.len() == 1 && has_more { + return None; + } + } + Some(self.pop_delta()) + } +} diff --git a/pageserver/compaction/src/helpers.rs b/pageserver/compaction/src/helpers.rs new file mode 100644 index 0000000000..22a410b4af --- /dev/null +++ b/pageserver/compaction/src/helpers.rs @@ -0,0 +1,242 @@ +//! This file contains generic utility functions over the interface types, +//! which could be handy for any compaction implementation. +use crate::interface::*; + +use futures::future::BoxFuture; +use futures::{Stream, StreamExt}; +use itertools::Itertools; +use pin_project_lite::pin_project; +use std::collections::BinaryHeap; +use std::collections::VecDeque; +use std::future::Future; +use std::ops::{DerefMut, Range}; +use std::pin::Pin; +use std::task::{ready, Poll}; + +pub fn keyspace_total_size(keyspace: &CompactionKeySpace) -> u64 +where + K: CompactionKey, +{ + keyspace.iter().map(|r| K::key_range_size(r) as u64).sum() +} + +pub fn overlaps_with(a: &Range, b: &Range) -> bool { + !(a.end <= b.start || b.end <= a.start) +} + +pub fn union_to_keyspace(a: &mut CompactionKeySpace, b: CompactionKeySpace) { + let x = std::mem::take(a); + let mut all_ranges_iter = [x.into_iter(), b.into_iter()] + .into_iter() + .kmerge_by(|a, b| a.start < b.start); + let mut ranges = Vec::new(); + if let Some(first) = all_ranges_iter.next() { + let (mut start, mut end) = (first.start, first.end); + + for r in all_ranges_iter { + assert!(r.start >= start); + if r.start > end { + ranges.push(start..end); + start = r.start; 
+ end = r.end; + } else if r.end > end { + end = r.end; + } + } + ranges.push(start..end); + } + *a = ranges +} + +pub fn intersect_keyspace( + a: &CompactionKeySpace, + r: &Range, +) -> CompactionKeySpace { + let mut ranges: Vec> = Vec::new(); + + for x in a.iter() { + if x.end <= r.start { + continue; + } + if x.start >= r.end { + break; + } + ranges.push(x.clone()) + } + + // trim the ends + if let Some(first) = ranges.first_mut() { + first.start = std::cmp::max(first.start, r.start); + } + if let Some(last) = ranges.last_mut() { + last.end = std::cmp::min(last.end, r.end); + } + ranges +} + +/// Create a stream that iterates through all DeltaEntrys among all input +/// layers, in key-lsn order. +/// +/// This is public because the create_delta() implementation likely wants to use this too +/// TODO: move to a more shared place +pub fn merge_delta_keys<'a, E: CompactionJobExecutor>( + layers: &'a [E::DeltaLayer], + ctx: &'a E::RequestContext, +) -> MergeDeltaKeys<'a, E> { + // Use a binary heap to merge the layers. Each input layer is initially + // represented by a LazyLoadLayer::Unloaded element, which uses the start of + // the layer's key range as the key. The first time a layer reaches the top + // of the heap, all the keys of the layer are loaded into a sorted vector. + // + // This helps to keep the memory usage reasonable: we only need to hold in + // memory the DeltaEntrys of the layers that overlap with the "current" key. 
+ let mut heap: BinaryHeap> = BinaryHeap::new(); + for l in layers { + heap.push(LazyLoadLayer::Unloaded(l)); + } + MergeDeltaKeys { + heap, + ctx, + load_future: None, + } +} + +enum LazyLoadLayer<'a, E: CompactionJobExecutor> { + Loaded(VecDeque<>::DeltaEntry<'a>>), + Unloaded(&'a E::DeltaLayer), +} +impl<'a, E: CompactionJobExecutor> LazyLoadLayer<'a, E> { + fn key(&self) -> E::Key { + match self { + Self::Loaded(entries) => entries.front().unwrap().key(), + Self::Unloaded(dl) => dl.key_range().start, + } + } +} +impl<'a, E: CompactionJobExecutor> PartialOrd for LazyLoadLayer<'a, E> { + fn partial_cmp(&self, other: &Self) -> Option { + Some(self.cmp(other)) + } +} +impl<'a, E: CompactionJobExecutor> Ord for LazyLoadLayer<'a, E> { + fn cmp(&self, other: &Self) -> std::cmp::Ordering { + // reverse order so that we get a min-heap + other.key().cmp(&self.key()) + } +} +impl<'a, E: CompactionJobExecutor> PartialEq for LazyLoadLayer<'a, E> { + fn eq(&self, other: &Self) -> bool { + self.key().eq(&other.key()) + } +} +impl<'a, E: CompactionJobExecutor> Eq for LazyLoadLayer<'a, E> {} + +type LoadFuture<'a, E> = BoxFuture<'a, anyhow::Result>>; + +// Stream returned by `merge_delta_keys` +pin_project! 
{ +#[allow(clippy::type_complexity)] +pub struct MergeDeltaKeys<'a, E: CompactionJobExecutor> { + heap: BinaryHeap>, + + #[pin] + load_future: Option>::DeltaEntry<'a>>>, + + ctx: &'a E::RequestContext, +} +} + +impl<'a, E> Stream for MergeDeltaKeys<'a, E> +where + E: CompactionJobExecutor + 'a, +{ + type Item = anyhow::Result<>::DeltaEntry<'a>>; + + fn poll_next( + self: Pin<&mut Self>, + cx: &mut std::task::Context<'_>, + ) -> Poll::Item>> { + let mut this = self.project(); + loop { + if let Some(mut load_future) = this.load_future.as_mut().as_pin_mut() { + // We are waiting for loading the keys to finish + match ready!(load_future.as_mut().poll(cx)) { + Ok(entries) => { + this.load_future.set(None); + *this.heap.peek_mut().unwrap() = + LazyLoadLayer::Loaded(VecDeque::from(entries)); + } + Err(e) => { + return Poll::Ready(Some(Err(e))); + } + } + } + + // If the topmost layer in the heap hasn't been loaded yet, start + // loading it. Otherwise return the next entry from it and update + // the layer's position in the heap (this decreaseKey operation is + // performed implicitly when `top` is dropped). + if let Some(mut top) = this.heap.peek_mut() { + match top.deref_mut() { + LazyLoadLayer::Unloaded(ref mut l) => { + let fut = l.load_keys(this.ctx); + this.load_future.set(Some(fut)); + continue; + } + LazyLoadLayer::Loaded(ref mut entries) => { + let result = entries.pop_front().unwrap(); + if entries.is_empty() { + std::collections::binary_heap::PeekMut::pop(top); + } + return Poll::Ready(Some(Ok(result))); + } + } + } else { + return Poll::Ready(None); + } + } + } +} + +// Accumulate values at key boundaries +pub struct KeySize { + pub key: K, + pub num_values: u64, + pub size: u64, +} + +pub fn accum_key_values<'a, I, K, D, E>(input: I) -> impl Stream, E>> +where + K: Eq, + I: Stream>, + D: CompactionDeltaEntry<'a, K>, +{ + async_stream::try_stream! 
{ + // Initialize the state from the first value + let mut input = std::pin::pin!(input); + + if let Some(first) = input.next().await { + let first = first?; + let mut accum: KeySize = KeySize { + key: first.key(), + num_values: 1, + size: first.size(), + }; + while let Some(this) = input.next().await { + let this = this?; + if this.key() == accum.key { + accum.size += this.size(); + accum.num_values += 1; + } else { + yield accum; + accum = KeySize { + key: this.key(), + num_values: 1, + size: this.size(), + }; + } + } + yield accum; + } + } +} diff --git a/pageserver/compaction/src/identify_levels.rs b/pageserver/compaction/src/identify_levels.rs new file mode 100644 index 0000000000..98dd46925c --- /dev/null +++ b/pageserver/compaction/src/identify_levels.rs @@ -0,0 +1,375 @@ +//! An LSM tree consists of multiple levels, each exponentially larger than the +//! previous level. And each level consists of multiple "tiers". With tiered +//! compaction, a level is compacted when it has accumulated more than N tiers, +//! forming one tier on the next level. +//! +//! In the pageserver, we don't explicitly track the levels and tiers. Instead, +//! we identify them by looking at the shapes of the layers. It's an easy task +//! for a human, but it's not straightforward to come up with the exact +//! rules. Especially if there are cases like interrupted, half-finished +//! compactions, or highly skewed data distributions that have let us "skip" +//! some levels. It's not critical to classify all cases correctly; at worst we +//! delay some compaction work, and suffer from more read amplification, or we +//! perform some unnecessary compaction work. +//! +//! `identify_level` performs that shape-matching. +//! +//! It returns a Level struct, which has `depth()` function to count the number +//! of "tiers" in the level. The tier count is the max depth of stacked layers +//! within the level. That's a good measure, because the point of compacting is +//! 
to reduce read amplification, and the depth is what determines that. +//! +//! One interesting effect of this is that if we generate very small delta +//! layers at L0, e.g. because the L0 layers are flushed by timeout rather than +//! because they reach the target size, the L0 compaction will combine them to +//! one larger file. But if the combined file is still smaller than the target +//! file size, the file will still be considered to be part of L0 at the next +//! iteration. + +use anyhow::bail; +use std::collections::BTreeSet; +use std::ops::Range; +use utils::lsn::Lsn; + +use crate::interface::*; + +use tracing::{info, trace}; + +pub struct Level { + pub lsn_range: Range, + pub layers: Vec, +} + +/// Identify an LSN < `end_lsn` that partitions the LSN space, so that there are +/// no layers that cross the boundary LSN. +/// +/// A further restriction is that all layers in the returned partition cover at +/// most 'lsn_max_size' LSN bytes. +pub async fn identify_level( + all_layers: Vec, + end_lsn: Lsn, + lsn_max_size: u64, +) -> anyhow::Result>> +where + K: CompactionKey, + L: CompactionLayer + Clone, +{ + // filter out layers that are above the `end_lsn`, they are completely irrelevant. + let mut layers = Vec::new(); + for l in all_layers { + if l.lsn_range().start < end_lsn && l.lsn_range().end > end_lsn { + // shouldn't happen. Indicates that the caller passed a bogus + // end_lsn. + bail!("identify_level() called with end_lsn that does not partition the LSN space: end_lsn {} intersects with layer {}", end_lsn, l.short_id()); + } + // include image layers sitting exactly at `end_lsn`. + let is_image = !l.is_delta(); + if (is_image && l.lsn_range().start > end_lsn) + || (!is_image && l.lsn_range().start >= end_lsn) + { + continue; + } + layers.push(l); + } + // All the remaining layers either belong to this level, or are below it. 
+ info!( + "identify level at {}, size {}, num layers below: {}", + end_lsn, + lsn_max_size, + layers.len() + ); + if layers.is_empty() { + return Ok(None); + } + + // Walk the ranges in LSN order. + // + // ----- end_lsn + // | + // | + // v + // + layers.sort_by_key(|l| l.lsn_range().end); + let mut candidate_start_lsn = end_lsn; + let mut candidate_layers: Vec = Vec::new(); + let mut current_best_start_lsn = end_lsn; + let mut current_best_layers: Vec = Vec::new(); + let mut iter = layers.into_iter(); + loop { + let Some(l) = iter.next_back() else { + // Reached end. Accept the last candidate + current_best_start_lsn = candidate_start_lsn; + current_best_layers.extend_from_slice(&std::mem::take(&mut candidate_layers)); + break; + }; + trace!( + "inspecting {} for candidate {}, current best {}", + l.short_id(), + candidate_start_lsn, + current_best_start_lsn + ); + + let r = l.lsn_range(); + + // Image layers don't restrict our choice of cutoff LSN + if l.is_delta() { + // Is this candidate workable? In other words, are there any + // delta layers that span across this LSN + // + // Valid: Not valid: + // + + + // | | + + // + <- candidate + | <- candidate + // + + + // | + // + + if r.end <= candidate_start_lsn { + // Hooray, there are no crossing LSNs. And we have visited + // through all the layers within candidate..end_lsn. The + // current candidate can be accepted. + current_best_start_lsn = r.end; + current_best_layers.extend_from_slice(&std::mem::take(&mut candidate_layers)); + candidate_start_lsn = r.start; + } + + // Is it small enough to be considered part of this level? + if r.end.0 - r.start.0 > lsn_max_size { + // Too large, this layer belongs to next level. Stop. + trace!( + "too large {}, size {} vs {}", + l.short_id(), + r.end.0 - r.start.0, + lsn_max_size + ); + break; + } + + // If this crosses the candidate lsn, push it down. 
+ if r.start < candidate_start_lsn { + trace!( + "layer {} prevents from stopping at {}", + l.short_id(), + candidate_start_lsn + ); + candidate_start_lsn = r.start; + } + } + + // Include this layer in our candidate + candidate_layers.push(l); + } + + Ok(if current_best_start_lsn == end_lsn { + // empty level + None + } else { + Some(Level { + lsn_range: current_best_start_lsn..end_lsn, + layers: current_best_layers, + }) + }) +} + +impl Level { + /// Count the number of deltas stacked on each other. + pub fn depth(&self) -> u64 + where + K: CompactionKey, + L: CompactionLayer, + { + struct Event { + key: K, + layer_idx: usize, + start: bool, + } + let mut events: Vec> = Vec::new(); + for (idx, l) in self.layers.iter().enumerate() { + events.push(Event { + key: l.key_range().start, + layer_idx: idx, + start: true, + }); + events.push(Event { + key: l.key_range().end, + layer_idx: idx, + start: false, + }); + } + events.sort_by_key(|e| (e.key, e.start)); + + // Sweep the key space left to right. Stop at each distinct key, and + // count the number of deltas on top of the highest image at that key. + // + // This is a little inefficient, as we walk through the active_set on + // every key. We could increment/decrement a counter on each step + // instead, but that'd require a bit more complex bookkeeping. 
+ let mut active_set: BTreeSet<(Lsn, bool, usize)> = BTreeSet::new(); + let mut max_depth = 0; + let mut events_iter = events.iter().peekable(); + while let Some(e) = events_iter.next() { + let l = &self.layers[e.layer_idx]; + let is_image = !l.is_delta(); + + // update the active set + if e.start { + active_set.insert((l.lsn_range().end, is_image, e.layer_idx)); + } else { + active_set.remove(&(l.lsn_range().end, is_image, e.layer_idx)); + } + + // recalculate depth if this was the last event at this point + let more_events_at_this_key = events_iter + .peek() + .map_or(false, |next_e| next_e.key == e.key); + if !more_events_at_this_key { + let mut active_depth = 0; + for (_end_lsn, is_image, _idx) in active_set.iter().rev() { + if *is_image { + break; + } + active_depth += 1; + } + if active_depth > max_depth { + max_depth = active_depth; + } + } + } + debug_assert_eq!(active_set, BTreeSet::new()); + max_depth + } +} + +#[cfg(test)] +mod tests { + use super::*; + use crate::simulator::{Key, MockDeltaLayer, MockImageLayer, MockLayer}; + use std::sync::{Arc, Mutex}; + + fn delta(key_range: Range, lsn_range: Range) -> MockLayer { + MockLayer::Delta(Arc::new(MockDeltaLayer { + key_range, + lsn_range, + // identify_level() doesn't pay attention to the rest of the fields + file_size: 0, + deleted: Mutex::new(false), + records: vec![], + })) + } + + fn image(key_range: Range, lsn: Lsn) -> MockLayer { + MockLayer::Image(Arc::new(MockImageLayer { + key_range, + lsn_range: lsn..(lsn + 1), + // identify_level() doesn't pay attention to the rest of the fields + file_size: 0, + deleted: Mutex::new(false), + })) + } + + #[tokio::test] + async fn test_identify_level() -> anyhow::Result<()> { + let layers = vec![ + delta(Key::MIN..Key::MAX, Lsn(0x8000)..Lsn(0x9000)), + delta(Key::MIN..Key::MAX, Lsn(0x5000)..Lsn(0x7000)), + delta(Key::MIN..Key::MAX, Lsn(0x4000)..Lsn(0x5000)), + delta(Key::MIN..Key::MAX, Lsn(0x3000)..Lsn(0x4000)), + delta(Key::MIN..Key::MAX, 
Lsn(0x2000)..Lsn(0x3000)), + delta(Key::MIN..Key::MAX, Lsn(0x1000)..Lsn(0x2000)), + ]; + + // All layers fit in the max file size + let level = identify_level(layers.clone(), Lsn(0x10000), 0x2000) + .await? + .unwrap(); + assert_eq!(level.depth(), 6); + + // Same LSN with smaller max file size. The second layer from the top is larger + // and belongs to next level. + let level = identify_level(layers.clone(), Lsn(0x10000), 0x1000) + .await? + .unwrap(); + assert_eq!(level.depth(), 1); + + // Call with a smaller LSN + let level = identify_level(layers.clone(), Lsn(0x3000), 0x1000) + .await? + .unwrap(); + assert_eq!(level.depth(), 2); + + // Call with an LSN that doesn't partition the space + let result = identify_level(layers, Lsn(0x6000), 0x1000).await; + assert!(result.is_err()); + Ok(()) + } + + #[tokio::test] + async fn test_overlapping_lsn_ranges() -> anyhow::Result<()> { + // The files LSN ranges overlap, so even though there are more files that + // fit under the file size, they are not included in the level because they + // overlap so that we'd need to include the oldest file, too, which is + // larger + let layers = vec![ + delta(Key::MIN..Key::MAX, Lsn(0x4000)..Lsn(0x5000)), + delta(Key::MIN..Key::MAX, Lsn(0x3000)..Lsn(0x4000)), // overlap + delta(Key::MIN..Key::MAX, Lsn(0x2500)..Lsn(0x3500)), // overlap + delta(Key::MIN..Key::MAX, Lsn(0x2000)..Lsn(0x3000)), // overlap + delta(Key::MIN..Key::MAX, Lsn(0x1000)..Lsn(0x2500)), // larger + ]; + + let level = identify_level(layers.clone(), Lsn(0x10000), 0x1000) + .await? + .unwrap(); + assert_eq!(level.depth(), 1); + + Ok(()) + } + + #[tokio::test] + async fn test_depth_nonoverlapping() -> anyhow::Result<()> { + // The key ranges don't overlap, so depth is only 1. + let layers = vec![ + delta(4000..5000, Lsn(0x6000)..Lsn(0x7000)), + delta(3000..4000, Lsn(0x7000)..Lsn(0x8000)), + delta(1000..2000, Lsn(0x8000)..Lsn(0x9000)), + ]; + + let level = identify_level(layers.clone(), Lsn(0x10000), 0x2000) + .await? 
+ .unwrap(); + assert_eq!(level.layers.len(), 3); + assert_eq!(level.depth(), 1); + + // Staggered. The 1st and 3rd layer don't overlap with each other. + let layers = vec![ + delta(1000..2000, Lsn(0x8000)..Lsn(0x9000)), + delta(1500..2500, Lsn(0x7000)..Lsn(0x8000)), + delta(2000..3000, Lsn(0x6000)..Lsn(0x7000)), + ]; + + let level = identify_level(layers.clone(), Lsn(0x10000), 0x2000) + .await? + .unwrap(); + assert_eq!(level.layers.len(), 3); + assert_eq!(level.depth(), 2); + Ok(()) + } + + #[tokio::test] + async fn test_depth_images() -> anyhow::Result<()> { + let layers: Vec = vec![ + delta(1000..2000, Lsn(0x8000)..Lsn(0x9000)), + delta(1500..2500, Lsn(0x7000)..Lsn(0x8000)), + delta(2000..3000, Lsn(0x6000)..Lsn(0x7000)), + // This covers the same key range as the 2nd delta layer. The depth + // in that key range is therefore 0. + image(1500..2500, Lsn(0x9000)), + ]; + + let level = identify_level(layers.clone(), Lsn(0x10000), 0x2000) + .await? + .unwrap(); + assert_eq!(level.layers.len(), 4); + assert_eq!(level.depth(), 1); + Ok(()) + } +} diff --git a/pageserver/compaction/src/interface.rs b/pageserver/compaction/src/interface.rs new file mode 100644 index 0000000000..2bb2e749c0 --- /dev/null +++ b/pageserver/compaction/src/interface.rs @@ -0,0 +1,166 @@ +//! This is what the compaction implementation needs to know about +//! layers, keyspace etc. +//! +//! All the heavy lifting is done by the create_image and create_delta +//! functions that the implementor provides. +use async_trait::async_trait; +use futures::Future; +use pageserver_api::{key::Key, keyspace::key_range_size}; +use std::ops::Range; +use utils::lsn::Lsn; + +/// Public interface. This is the main thing that the implementor needs to provide +pub trait CompactionJobExecutor { + // Type system. + // + // We assume that there are two kinds of layers, deltas and images. The + // compaction doesn't distinguish whether they are stored locally or + // remotely. 
+ // + // The keyspace is defined by the CompactionKey trait. + type Key: CompactionKey; + + type Layer: CompactionLayer + Clone; + type DeltaLayer: CompactionDeltaLayer + Clone; + type ImageLayer: CompactionImageLayer + Clone; + + // This is passed through to all the interface functions. The compaction + // implementation doesn't do anything with it, but it might be useful for + // the interface implementation. + type RequestContext: CompactionRequestContext; + + // ---- + // Functions that the planner uses to support its decisions + // ---- + + /// Return all layers that overlap the given bounding box. + fn get_layers( + &mut self, + key_range: &Range, + lsn_range: &Range, + ctx: &Self::RequestContext, + ) -> impl Future>> + Send; + + fn get_keyspace( + &mut self, + key_range: &Range, + lsn: Lsn, + ctx: &Self::RequestContext, + ) -> impl Future>> + Send; + + /// NB: This is a pretty expensive operation. In the real pageserver + /// implementation, it downloads the layer, and keeps it resident + /// until the DeltaLayer is dropped. + fn downcast_delta_layer( + &self, + layer: &Self::Layer, + ) -> impl Future>> + Send; + + // ---- + // Functions to execute the plan + // ---- + + /// Create a new image layer, materializing all the values in the key range, + /// at given 'lsn'. + fn create_image( + &mut self, + lsn: Lsn, + key_range: &Range, + ctx: &Self::RequestContext, + ) -> impl Future> + Send; + + /// Create a new delta layer, containing all the values from 'input_layers' + /// in the given key and LSN range. + fn create_delta( + &mut self, + lsn_range: &Range, + key_range: &Range, + input_layers: &[Self::DeltaLayer], + ctx: &Self::RequestContext, + ) -> impl Future> + Send; + + /// Delete a layer. The compaction implementation will call this only after + /// all the create_image() or create_delta() calls that deletion of this + /// layer depends on have finished. 
But if the implementor has extra lazy + /// background tasks, like uploading the index json file to remote storage, + /// it is the implementation's responsibility to track those. + fn delete_layer( + &mut self, + layer: &Self::Layer, + ctx: &Self::RequestContext, + ) -> impl Future> + Send; +} + +pub trait CompactionKey: std::cmp::Ord + Clone + Copy + std::fmt::Display { + const MIN: Self; + const MAX: Self; + + /// Calculate distance between key_range.start and key_range.end. + /// + /// This returns u32, for compatibility with Repository::key. If the + /// distance is larger, return u32::MAX. + fn key_range_size(key_range: &Range) -> u32; + + // return "self + 1" + fn next(&self) -> Self; + + // return "self + N". The amount to skip (N) + // is left to the implementation. + // FIXME: why not just "add(u32)" ? This is hard to use + fn skip_some(&self) -> Self; +} + +impl CompactionKey for Key { + const MIN: Self = Self::MIN; + const MAX: Self = Self::MAX; + + fn key_range_size(r: &std::ops::Range) -> u32 { + key_range_size(r) + } + fn next(&self) -> Key { + (self as &Key).next() + } + fn skip_some(&self) -> Key { + self.add(128) + } +} + +/// Contiguous ranges of keys that belong to the key space. In key order, and +/// with no overlap. +pub type CompactionKeySpace = Vec>; + +/// Functions needed from all layers. +pub trait CompactionLayer { + fn key_range(&self) -> &Range; + fn lsn_range(&self) -> &Range; + + fn file_size(&self) -> u64; + + /// For debugging, short human-readable representation of the layer. E.g. filename. + fn short_id(&self) -> String; + + fn is_delta(&self) -> bool; +} + +#[async_trait] +pub trait CompactionDeltaLayer: CompactionLayer { + type DeltaEntry<'a>: CompactionDeltaEntry<'a, E::Key> + where + Self: 'a; + + /// Return all keys in this delta layer. 
+ async fn load_keys<'a>( + &self, + ctx: &E::RequestContext, + ) -> anyhow::Result>>; +} + +pub trait CompactionImageLayer: CompactionLayer {} + +pub trait CompactionDeltaEntry<'a, K> { + fn key(&self) -> K; + fn lsn(&self) -> Lsn; + fn size(&self) -> u64; +} + +pub trait CompactionRequestContext {} diff --git a/pageserver/compaction/src/lib.rs b/pageserver/compaction/src/lib.rs new file mode 100644 index 0000000000..2d6d673de5 --- /dev/null +++ b/pageserver/compaction/src/lib.rs @@ -0,0 +1,12 @@ +// The main module implementing the compaction algorithm +pub mod compact_tiered; +pub(crate) mod identify_levels; + +// Traits that the caller of the compaction needs to implement +pub mod interface; + +// Utility functions, useful for the implementation +pub mod helpers; + +// A simulator with mock implementations of 'interface' +pub mod simulator; diff --git a/pageserver/compaction/src/simulator.rs b/pageserver/compaction/src/simulator.rs new file mode 100644 index 0000000000..def7983e75 --- /dev/null +++ b/pageserver/compaction/src/simulator.rs @@ -0,0 +1,612 @@ +mod draw; + +use draw::{LayerTraceEvent, LayerTraceFile, LayerTraceOp}; + +use async_trait::async_trait; +use futures::StreamExt; +use rand::Rng; +use tracing::info; + +use utils::lsn::Lsn; + +use std::fmt::Write; +use std::ops::Range; +use std::sync::Arc; +use std::sync::Mutex; + +use crate::helpers::{merge_delta_keys, overlaps_with}; + +use crate::interface; +use crate::interface::CompactionLayer; + +// +// Implementation for the CompactionExecutor interface +// +pub struct MockTimeline { + // Parameters for the compaction algorithm + pub target_file_size: u64, + tiers_per_level: u64, + + num_l0_flushes: u64, + last_compact_at_flush: u64, + last_flush_lsn: Lsn, + + // In-memory layer + records: Vec, + total_len: u64, + start_lsn: Lsn, + end_lsn: Lsn, + + // Current keyspace at `end_lsn`. This is updated on every ingested record. 
+ keyspace: KeySpace, + + // historic keyspaces + old_keyspaces: Vec<(Lsn, KeySpace)>, + + // "on-disk" layers + pub live_layers: Vec, + + num_deleted_layers: u64, + + // Statistics + wal_ingested: u64, + bytes_written: u64, + bytes_deleted: u64, + layers_created: u64, + layers_deleted: u64, + + // All the events - creation and deletion of files - are collected + // in 'history'. It is used to draw the SVG animation at the end. + time: u64, + history: Vec, +} + +type KeySpace = interface::CompactionKeySpace; + +pub struct MockRequestContext {} +impl interface::CompactionRequestContext for MockRequestContext {} + +pub type Key = u64; + +impl interface::CompactionKey for Key { + const MIN: Self = u64::MIN; + const MAX: Self = u64::MAX; + + fn key_range_size(key_range: &Range) -> u32 { + std::cmp::min(key_range.end - key_range.start, u32::MAX as u64) as u32 + } + + fn next(&self) -> Self { + self + 1 + } + fn skip_some(&self) -> Self { + // round up to next xx + self + 100 + } +} + +#[derive(Clone)] +pub struct MockRecord { + lsn: Lsn, + key: Key, + len: u64, +} + +impl interface::CompactionDeltaEntry<'_, Key> for MockRecord { + fn key(&self) -> Key { + self.key + } + fn lsn(&self) -> Lsn { + self.lsn + } + fn size(&self) -> u64 { + self.len + } +} + +pub struct MockDeltaLayer { + pub key_range: Range, + pub lsn_range: Range, + + pub file_size: u64, + + pub deleted: Mutex, + + pub records: Vec, +} + +impl interface::CompactionLayer for Arc { + fn key_range(&self) -> &Range { + &self.key_range + } + fn lsn_range(&self) -> &Range { + &self.lsn_range + } + + fn file_size(&self) -> u64 { + self.file_size + } + + fn short_id(&self) -> String { + format!( + "{:016X}-{:016X}__{:08X}-{:08X}", + self.key_range.start, self.key_range.end, self.lsn_range.start.0, self.lsn_range.end.0 + ) + } + + fn is_delta(&self) -> bool { + true + } +} + +#[async_trait] +impl interface::CompactionDeltaLayer for Arc { + type DeltaEntry<'a> = MockRecord; + + async fn load_keys<'a>(&self, _ctx: 
&MockRequestContext) -> anyhow::Result> { + Ok(self.records.clone()) + } +} + +pub struct MockImageLayer { + pub key_range: Range, + pub lsn_range: Range, + + pub file_size: u64, + + pub deleted: Mutex, +} + +impl interface::CompactionImageLayer for Arc {} + +impl interface::CompactionLayer for Arc { + fn key_range(&self) -> &Range { + &self.key_range + } + fn lsn_range(&self) -> &Range { + &self.lsn_range + } + + fn file_size(&self) -> u64 { + self.file_size + } + + fn short_id(&self) -> String { + format!( + "{:016X}-{:016X}__{:08X}", + self.key_range.start, self.key_range.end, self.lsn_range.start.0, + ) + } + + fn is_delta(&self) -> bool { + false + } +} + +impl MockTimeline { + pub fn new() -> Self { + MockTimeline { + target_file_size: 256 * 1024 * 1024, + tiers_per_level: 4, + + num_l0_flushes: 0, + last_compact_at_flush: 0, + last_flush_lsn: Lsn(0), + + records: Vec::new(), + total_len: 0, + start_lsn: Lsn(1000), + end_lsn: Lsn(1000), + keyspace: KeySpace::new(), + + old_keyspaces: vec![], + + live_layers: vec![], + + num_deleted_layers: 0, + + wal_ingested: 0, + bytes_written: 0, + bytes_deleted: 0, + layers_created: 0, + layers_deleted: 0, + + time: 0, + history: Vec::new(), + } + } + + pub async fn compact(&mut self) -> anyhow::Result<()> { + let ctx = MockRequestContext {}; + + crate::compact_tiered::compact_tiered( + self, + self.last_flush_lsn, + self.target_file_size, + self.tiers_per_level, + &ctx, + ) + .await?; + + Ok(()) + } + + // Ingest one record to the timeline + pub fn ingest_record(&mut self, key: Key, len: u64) { + self.records.push(MockRecord { + lsn: self.end_lsn, + key, + len, + }); + self.total_len += len; + self.end_lsn += len; + + if self.total_len > self.target_file_size { + self.flush_l0(); + } + } + + pub async fn compact_if_needed(&mut self) -> anyhow::Result<()> { + if self.num_l0_flushes - self.last_compact_at_flush >= self.tiers_per_level { + self.compact().await?; + self.last_compact_at_flush = self.num_l0_flushes; + } + 
Ok(()) + } + + pub fn flush_l0(&mut self) { + if self.records.is_empty() { + return; + } + + let mut records = std::mem::take(&mut self.records); + records.sort_by_key(|rec| rec.key); + + let lsn_range = self.start_lsn..self.end_lsn; + let new_layer = Arc::new(MockDeltaLayer { + key_range: Key::MIN..Key::MAX, + lsn_range: lsn_range.clone(), + file_size: self.total_len, + records, + deleted: Mutex::new(false), + }); + info!("flushed L0 layer {}", new_layer.short_id()); + self.live_layers.push(MockLayer::from(&new_layer)); + + // reset L0 + self.start_lsn = self.end_lsn; + self.total_len = 0; + self.records = Vec::new(); + + self.layers_created += 1; + self.bytes_written += new_layer.file_size; + + self.time += 1; + self.history.push(LayerTraceEvent { + time_rel: self.time, + op: LayerTraceOp::Flush, + file: LayerTraceFile { + filename: new_layer.short_id(), + key_range: new_layer.key_range.clone(), + lsn_range: new_layer.lsn_range.clone(), + }, + }); + + self.num_l0_flushes += 1; + self.last_flush_lsn = self.end_lsn; + } + + // Ingest `num_records' records to the timeline, with random keys + // uniformly distributed in `key_range` + pub fn ingest_uniform( + &mut self, + num_records: u64, + len: u64, + key_range: &Range, + ) -> anyhow::Result<()> { + crate::helpers::union_to_keyspace(&mut self.keyspace, vec![key_range.clone()]); + let mut rng = rand::thread_rng(); + for _ in 0..num_records { + self.ingest_record(rng.gen_range(key_range.clone()), len); + self.wal_ingested += len; + } + Ok(()) + } + + pub fn stats(&self) -> anyhow::Result { + let mut s = String::new(); + + writeln!(s, "STATISTICS:")?; + writeln!( + s, + "WAL ingested: {:>10} MB", + self.wal_ingested / (1024 * 1024) + )?; + writeln!( + s, + "size created: {:>10} MB", + self.bytes_written / (1024 * 1024) + )?; + writeln!( + s, + "size deleted: {:>10} MB", + self.bytes_deleted / (1024 * 1024) + )?; + writeln!(s, "files created: {:>10}", self.layers_created)?; + writeln!(s, "files deleted: {:>10}", 
self.layers_deleted)?; + writeln!( + s, + "write amp: {:>10.2}", + self.bytes_written as f64 / self.wal_ingested as f64 + )?; + writeln!( + s, + "storage amp: {:>10.2}", + (self.bytes_written - self.bytes_deleted) as f64 / self.wal_ingested as f64 + )?; + + Ok(s) + } + + pub fn draw_history(&self, output: W) -> anyhow::Result<()> { + draw::draw_history(&self.history, output) + } +} + +impl Default for MockTimeline { + fn default() -> Self { + Self::new() + } +} + +#[derive(Clone)] +pub enum MockLayer { + Delta(Arc), + Image(Arc), +} + +impl interface::CompactionLayer for MockLayer { + fn key_range(&self) -> &Range { + match self { + MockLayer::Delta(this) => this.key_range(), + MockLayer::Image(this) => this.key_range(), + } + } + fn lsn_range(&self) -> &Range { + match self { + MockLayer::Delta(this) => this.lsn_range(), + MockLayer::Image(this) => this.lsn_range(), + } + } + fn file_size(&self) -> u64 { + match self { + MockLayer::Delta(this) => this.file_size(), + MockLayer::Image(this) => this.file_size(), + } + } + fn short_id(&self) -> String { + match self { + MockLayer::Delta(this) => this.short_id(), + MockLayer::Image(this) => this.short_id(), + } + } + + fn is_delta(&self) -> bool { + match self { + MockLayer::Delta(_) => true, + MockLayer::Image(_) => false, + } + } +} + +impl MockLayer { + fn is_deleted(&self) -> bool { + let guard = match self { + MockLayer::Delta(this) => this.deleted.lock().unwrap(), + MockLayer::Image(this) => this.deleted.lock().unwrap(), + }; + *guard + } + fn mark_deleted(&self) { + let mut deleted_guard = match self { + MockLayer::Delta(this) => this.deleted.lock().unwrap(), + MockLayer::Image(this) => this.deleted.lock().unwrap(), + }; + assert!(!*deleted_guard, "layer already deleted"); + *deleted_guard = true; + } +} + +impl From<&Arc> for MockLayer { + fn from(l: &Arc) -> Self { + MockLayer::Delta(l.clone()) + } +} + +impl From<&Arc> for MockLayer { + fn from(l: &Arc) -> Self { + MockLayer::Image(l.clone()) + } +} + +impl 
interface::CompactionJobExecutor for MockTimeline { + type Key = Key; + type Layer = MockLayer; + type DeltaLayer = Arc; + type ImageLayer = Arc; + type RequestContext = MockRequestContext; + + async fn get_layers( + &mut self, + key_range: &Range, + lsn_range: &Range, + _ctx: &Self::RequestContext, + ) -> anyhow::Result> { + // Clear any deleted layers from our vec + self.live_layers.retain(|l| !l.is_deleted()); + + let layers: Vec = self + .live_layers + .iter() + .filter(|l| { + overlaps_with(l.lsn_range(), lsn_range) && overlaps_with(l.key_range(), key_range) + }) + .cloned() + .collect(); + + Ok(layers) + } + + async fn get_keyspace( + &mut self, + key_range: &Range, + _lsn: Lsn, + _ctx: &Self::RequestContext, + ) -> anyhow::Result> { + // find it in the levels + if self.old_keyspaces.is_empty() { + Ok(crate::helpers::intersect_keyspace( + &self.keyspace, + key_range, + )) + } else { + // not implemented + + // The mock implementation only allows requesting the + // keyspace at the level's end LSN. That's all that the + // current implementation needs. 
+ panic!("keyspace not available for requested lsn"); + } + } + + async fn downcast_delta_layer( + &self, + layer: &MockLayer, + ) -> anyhow::Result>> { + Ok(match layer { + MockLayer::Delta(l) => Some(l.clone()), + MockLayer::Image(_) => None, + }) + } + + async fn create_image( + &mut self, + lsn: Lsn, + key_range: &Range, + ctx: &MockRequestContext, + ) -> anyhow::Result<()> { + let keyspace = self.get_keyspace(key_range, lsn, ctx).await?; + + let mut accum_size: u64 = 0; + for r in keyspace { + accum_size += r.end - r.start; + } + + let new_layer = Arc::new(MockImageLayer { + key_range: key_range.clone(), + lsn_range: lsn..lsn, + file_size: accum_size * 8192, + deleted: Mutex::new(false), + }); + info!( + "created image layer, size {}: {}", + new_layer.file_size, + new_layer.short_id() + ); + self.live_layers.push(MockLayer::Image(new_layer.clone())); + + // update stats + self.bytes_written += new_layer.file_size; + self.layers_created += 1; + + self.time += 1; + self.history.push(LayerTraceEvent { + time_rel: self.time, + op: LayerTraceOp::CreateImage, + file: LayerTraceFile { + filename: new_layer.short_id(), + key_range: new_layer.key_range.clone(), + lsn_range: new_layer.lsn_range.clone(), + }, + }); + + Ok(()) + } + + async fn create_delta( + &mut self, + lsn_range: &Range, + key_range: &Range, + input_layers: &[Arc], + ctx: &MockRequestContext, + ) -> anyhow::Result<()> { + let mut key_value_stream = + std::pin::pin!(merge_delta_keys::(input_layers, ctx)); + let mut records: Vec = Vec::new(); + let mut total_len = 2; + while let Some(delta_entry) = key_value_stream.next().await { + let delta_entry: MockRecord = delta_entry?; + if key_range.contains(&delta_entry.key) && lsn_range.contains(&delta_entry.lsn) { + total_len += delta_entry.len; + records.push(delta_entry); + } + } + let total_records = records.len(); + let new_layer = Arc::new(MockDeltaLayer { + key_range: key_range.clone(), + lsn_range: lsn_range.clone(), + file_size: total_len, + records, + 
deleted: Mutex::new(false), + }); + info!( + "created delta layer, recs {}, size {}: {}", + total_records, + total_len, + new_layer.short_id() + ); + self.live_layers.push(MockLayer::Delta(new_layer.clone())); + + // update stats + self.bytes_written += total_len; + self.layers_created += 1; + + self.time += 1; + self.history.push(LayerTraceEvent { + time_rel: self.time, + op: LayerTraceOp::CreateDelta, + file: LayerTraceFile { + filename: new_layer.short_id(), + key_range: new_layer.key_range.clone(), + lsn_range: new_layer.lsn_range.clone(), + }, + }); + + Ok(()) + } + + async fn delete_layer( + &mut self, + layer: &Self::Layer, + _ctx: &MockRequestContext, + ) -> anyhow::Result<()> { + info!("deleting layer: {}", layer.short_id()); + self.num_deleted_layers += 1; + self.layers_deleted += 1; // the counter that stats() prints as "files deleted" + self.bytes_deleted += layer.file_size(); + layer.mark_deleted(); // only flags the layer; get_layers() drops flagged layers from live_layers + + self.time += 1; + self.history.push(LayerTraceEvent { + time_rel: self.time, + op: LayerTraceOp::Delete, + file: LayerTraceFile { + filename: layer.short_id(), + key_range: layer.key_range().clone(), + lsn_range: layer.lsn_range().clone(), + }, + }); + + Ok(()) + } +} diff --git a/pageserver/compaction/src/simulator/draw.rs b/pageserver/compaction/src/simulator/draw.rs new file mode 100644 index 0000000000..997925067f --- /dev/null +++ b/pageserver/compaction/src/simulator/draw.rs @@ -0,0 +1,411 @@ +use super::Key; +use anyhow::Result; +use std::cmp::Ordering; +use std::{ + collections::{BTreeMap, BTreeSet, HashSet}, + fmt::Write, + ops::Range, +}; +use svg_fmt::{rgb, BeginSvg, EndSvg, Fill, Stroke, Style}; +use utils::lsn::Lsn; + +// Map values to their compressed coordinate - the index the value +// would have in a sorted and deduplicated list of all values. 
+struct CoordinateMap { + map: BTreeMap, + stretch: f32, +} + +impl CoordinateMap { + fn new(coords: Vec, stretch: f32) -> Self { + let set: BTreeSet = coords.into_iter().collect(); + + let mut map: BTreeMap = BTreeMap::new(); + for (i, e) in set.iter().enumerate() { + map.insert(*e, i); + } + + Self { map, stretch } + } + + // This assumes that the map contains an exact point for this. + // Use map_inexact for values inbetween + fn map(&self, val: T) -> f32 { + *self.map.get(&val).unwrap() as f32 * self.stretch + } + + // the value is still assumed to be within the min/max bounds + // (this is currently unused) + fn _map_inexact(&self, val: T) -> f32 { + let prev = *self.map.range(..=val).next().unwrap().1; + let next = *self.map.range(val..).next().unwrap().1; + + // interpolate + (prev as f32 + (next - prev) as f32) * self.stretch + } + + fn max(&self) -> f32 { + self.map.len() as f32 * self.stretch + } +} + +#[derive(PartialEq, Hash, Eq)] +pub enum LayerTraceOp { + Flush, + CreateDelta, + CreateImage, + Delete, +} + +impl std::fmt::Display for LayerTraceOp { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> Result<(), std::fmt::Error> { + let op_str = match self { + LayerTraceOp::Flush => "flush", + LayerTraceOp::CreateDelta => "create_delta", + LayerTraceOp::CreateImage => "create_image", + LayerTraceOp::Delete => "delete", + }; + f.write_str(op_str) + } +} + +#[derive(PartialEq, Hash, Eq, Clone)] +pub struct LayerTraceFile { + pub filename: String, + pub key_range: Range, + pub lsn_range: Range, +} + +impl LayerTraceFile { + fn is_image(&self) -> bool { + self.lsn_range.end == self.lsn_range.start + } +} + +pub struct LayerTraceEvent { + pub time_rel: u64, + pub op: LayerTraceOp, + pub file: LayerTraceFile, +} + +pub fn draw_history(history: &[LayerTraceEvent], mut output: W) -> Result<()> { + let mut files: Vec = Vec::new(); + + for event in history { + files.push(event.file.clone()); + } + let last_time_rel = history.last().unwrap().time_rel; + + // 
Collect all coordinates + let mut keys: Vec = vec![]; + let mut lsns: Vec = vec![]; + for f in files.iter() { + keys.push(f.key_range.start); + keys.push(f.key_range.end); + lsns.push(f.lsn_range.start); + lsns.push(f.lsn_range.end); + } + + // Analyze + let key_map = CoordinateMap::new(keys, 2.0); + // Stretch out vertically for better visibility + let lsn_map = CoordinateMap::new(lsns, 3.0); + + let mut svg = String::new(); + + // Draw + writeln!( + svg, + "{}", + BeginSvg { + w: key_map.max(), + h: lsn_map.max(), + } + )?; + let lsn_max = lsn_map.max(); + + // Sort the files by LSN, but so that image layers go after all delta layers + // The SVG is painted in the order the elements appear, and we want to draw + // image layers on top of the delta layers if they overlap + // + // (This could also be implemented via z coordinates: image layers get one z + // coord, delta layers get another z coord.) + let mut files_sorted: Vec = files.into_iter().collect(); + files_sorted.sort_by(|a, b| { + if a.is_image() && !b.is_image() { + Ordering::Greater + } else if !a.is_image() && b.is_image() { + Ordering::Less + } else { + a.lsn_range.end.cmp(&b.lsn_range.end) + } + }); + + writeln!(svg, "")?; + let mut files_seen = HashSet::new(); + for f in files_sorted { + if files_seen.contains(&f) { + continue; + } + let key_start = key_map.map(f.key_range.start); + let key_end = key_map.map(f.key_range.end); + let key_diff = key_end - key_start; + + if key_start >= key_end { + panic!("Invalid key range {}-{}", key_start, key_end); + } + + let lsn_start = lsn_map.map(f.lsn_range.start); + let lsn_end = lsn_map.map(f.lsn_range.end); + + // Fill in and thicken rectangle if it's an + // image layer so that we can see it. 
+ let mut style = Style::default(); + style.fill = Fill::Color(rgb(0x80, 0x80, 0x80)); + style.stroke = Stroke::Color(rgb(0, 0, 0), 0.5); + + let y_start = lsn_max - lsn_start; + let y_end = lsn_max - lsn_end; + + let x_margin = 0.25; + let y_margin = 0.5; + + match f.lsn_range.start.cmp(&f.lsn_range.end) { + Ordering::Less => { + write!( + svg, + r#" "#, + f.filename, + key_start + x_margin, + y_end + y_margin, + key_diff - x_margin * 2.0, + y_start - y_end - y_margin * 2.0, + 1.0, // border_radius, + style, + )?; + write!(svg, "{}", f.filename)?; + writeln!(svg, "")?; + } + Ordering::Equal => { + //lsn_diff = 0.3; + //lsn_offset = -lsn_diff / 2.0; + //margin = 0.05; + style.fill = Fill::Color(rgb(0x80, 0, 0x80)); + style.stroke = Stroke::Color(rgb(0x80, 0, 0x80), 3.0); + write!( + svg, + r#" "#, + f.filename, + key_start + x_margin, + y_end, + key_end - x_margin, + y_end, + style, + )?; + write!( + svg, + "{}<br>{} - {}", + f.filename, lsn_end, y_end + )?; + writeln!(svg, "")?; + } + Ordering::Greater => panic!("Invalid lsn range {}-{}", lsn_start, lsn_end), + } + files_seen.insert(f); + } + + let mut record_style = Style::default(); + record_style.fill = Fill::Color(rgb(0x80, 0x80, 0x80)); + record_style.stroke = Stroke::None; + + writeln!(svg, "{}", EndSvg)?; + + let mut layer_events_str = String::new(); + let mut first = true; + for e in history { + if !first { + writeln!(layer_events_str, ",")?; + } + write!( + layer_events_str, + r#" {{"time_rel": {}, "filename": "{}", "op": "{}"}}"#, + e.time_rel, e.file.filename, e.op + )?; + first = false; + } + writeln!(layer_events_str)?; + + writeln!( + output, + r#" + + + + + + + + +

+
+ : +
+ + pos:
+ event:
+ gc:
+
+ + + + + + + + + +
+ +
+{svg} +
+ + +"# + )?; + + Ok(()) +} diff --git a/pageserver/compaction/tests/tests.rs b/pageserver/compaction/tests/tests.rs new file mode 100644 index 0000000000..1cea2a20e1 --- /dev/null +++ b/pageserver/compaction/tests/tests.rs @@ -0,0 +1,35 @@ +use pageserver_compaction::interface::CompactionLayer; +use pageserver_compaction::simulator::MockTimeline; + +/// Test the extreme case that there are so many updates for a single key that +/// even if we produce an extremely narrow delta layer, spanning just that one +/// key, we still too many records to fit in the target file size. We need to +/// split in the LSN dimension too in that case. +/// +/// TODO: The code to avoid this problem has not been implemented yet! So the +/// assertion currently fails, but we need to make it not fail. +#[ignore] +#[tokio::test] +async fn test_many_updates_for_single_key() { + let mut executor = MockTimeline::new(); + executor.target_file_size = 10_000_000; // 10 MB + + // Ingest 100 MB of updates to a single key. 
+ for _ in 1..1000 { + executor.ingest_uniform(100, 10, &(0..100_000)).unwrap(); + executor.ingest_uniform(10_000, 10, &(0..1)).unwrap(); + executor.compact().await.unwrap(); + } + + // Check that all the layers are smaller than the target size (with some slop) + for l in executor.live_layers.iter() { + println!("layer {}: {}", l.short_id(), l.file_size()); + } + for l in executor.live_layers.iter() { + assert!(l.file_size() < executor.target_file_size * 2); + // sanity check that none of the delta layers are stupidly small either + if l.is_delta() { + assert!(l.file_size() > executor.target_file_size / 2); + } + } +} diff --git a/pageserver/ctl/src/layer_map_analyzer.rs b/pageserver/ctl/src/layer_map_analyzer.rs index eb5c3f15cf..c4c282f33d 100644 --- a/pageserver/ctl/src/layer_map_analyzer.rs +++ b/pageserver/ctl/src/layer_map_analyzer.rs @@ -12,7 +12,7 @@ use std::collections::BinaryHeap; use std::ops::Range; use std::{fs, str}; -use pageserver::page_cache::PAGE_SZ; +use pageserver::page_cache::{self, PAGE_SZ}; use pageserver::repository::{Key, KEY_SIZE}; use pageserver::tenant::block_io::FileBlockReader; use pageserver::tenant::disk_btree::{DiskBtreeReader, VisitDirection}; @@ -100,13 +100,15 @@ pub(crate) fn parse_filename(name: &str) -> Option { // Finds the max_holes largest holes, ignoring any that are smaller than MIN_HOLE_LENGTH" async fn get_holes(path: &Utf8Path, max_holes: usize, ctx: &RequestContext) -> Result> { - let file = FileBlockReader::new(VirtualFile::open(path).await?); - let summary_blk = file.read_blk(0, ctx).await?; + let file = VirtualFile::open(path).await?; + let file_id = page_cache::next_file_id(); + let block_reader = FileBlockReader::new(&file, file_id); + let summary_blk = block_reader.read_blk(0, ctx).await?; let actual_summary = Summary::des_prefix(summary_blk.as_ref())?; let tree_reader = DiskBtreeReader::<_, DELTA_KEY_SIZE>::new( actual_summary.index_start_blk, actual_summary.index_root_blk, - file, + block_reader, ); // 
min-heap (reserve space for one more element added before eviction) let mut heap: BinaryHeap = BinaryHeap::with_capacity(max_holes + 1); @@ -142,7 +144,7 @@ pub(crate) async fn main(cmd: &AnalyzeLayerMapCmd) -> Result<()> { let ctx = RequestContext::new(TaskKind::DebugTool, DownloadBehavior::Error); // Initialize virtual_file (file desriptor cache) and page cache which are needed to access layer persistent B-Tree. - pageserver::virtual_file::init(10, virtual_file::IoEngineKind::StdFs); + pageserver::virtual_file::init(10, virtual_file::api::IoEngineKind::StdFs); pageserver::page_cache::init(100); let mut total_delta_layers = 0usize; diff --git a/pageserver/ctl/src/layers.rs b/pageserver/ctl/src/layers.rs index dbbcfedac0..be8f91675d 100644 --- a/pageserver/ctl/src/layers.rs +++ b/pageserver/ctl/src/layers.rs @@ -59,15 +59,17 @@ pub(crate) enum LayerCmd { async fn read_delta_file(path: impl AsRef, ctx: &RequestContext) -> Result<()> { let path = Utf8Path::from_path(path.as_ref()).expect("non-Unicode path"); - virtual_file::init(10, virtual_file::IoEngineKind::StdFs); + virtual_file::init(10, virtual_file::api::IoEngineKind::StdFs); page_cache::init(100); - let file = FileBlockReader::new(VirtualFile::open(path).await?); - let summary_blk = file.read_blk(0, ctx).await?; + let file = VirtualFile::open(path).await?; + let file_id = page_cache::next_file_id(); + let block_reader = FileBlockReader::new(&file, file_id); + let summary_blk = block_reader.read_blk(0, ctx).await?; let actual_summary = Summary::des_prefix(summary_blk.as_ref())?; let tree_reader = DiskBtreeReader::<_, DELTA_KEY_SIZE>::new( actual_summary.index_start_blk, actual_summary.index_root_blk, - &file, + &block_reader, ); // TODO(chi): dedup w/ `delta_layer.rs` by exposing the API. 
let mut all = vec![]; @@ -83,7 +85,7 @@ async fn read_delta_file(path: impl AsRef, ctx: &RequestContext) -> Result ctx, ) .await?; - let cursor = BlockCursor::new_fileblockreader(&file); + let cursor = BlockCursor::new_fileblockreader(&block_reader); for (k, v) in all { let value = cursor.read_blob(v.pos(), ctx).await?; println!("key:{} value_len:{}", k, value.len()); @@ -187,7 +189,7 @@ pub(crate) async fn main(cmd: &LayerCmd) -> Result<()> { new_tenant_id, new_timeline_id, } => { - pageserver::virtual_file::init(10, virtual_file::IoEngineKind::StdFs); + pageserver::virtual_file::init(10, virtual_file::api::IoEngineKind::StdFs); pageserver::page_cache::init(100); let ctx = RequestContext::new(TaskKind::DebugTool, DownloadBehavior::Error); diff --git a/pageserver/ctl/src/main.rs b/pageserver/ctl/src/main.rs index 3c90933fe9..e73d961e36 100644 --- a/pageserver/ctl/src/main.rs +++ b/pageserver/ctl/src/main.rs @@ -123,7 +123,7 @@ fn read_pg_control_file(control_file_path: &Utf8Path) -> anyhow::Result<()> { async fn print_layerfile(path: &Utf8Path) -> anyhow::Result<()> { // Basic initialization of things that don't change after startup - virtual_file::init(10, virtual_file::IoEngineKind::StdFs); + virtual_file::init(10, virtual_file::api::IoEngineKind::StdFs); page_cache::init(100); let ctx = RequestContext::new(TaskKind::DebugTool, DownloadBehavior::Error); dump_layerfile_from_path(path, true, &ctx).await diff --git a/pageserver/pagebench/src/cmd/basebackup.rs b/pageserver/pagebench/src/cmd/basebackup.rs index 2d61b0e252..55844be041 100644 --- a/pageserver/pagebench/src/cmd/basebackup.rs +++ b/pageserver/pagebench/src/cmd/basebackup.rs @@ -8,7 +8,7 @@ use utils::lsn::Lsn; use rand::prelude::*; use tokio::sync::Barrier; use tokio::task::JoinSet; -use tracing::{debug, info, instrument}; +use tracing::{info, instrument}; use std::collections::HashMap; use std::num::NonZeroUsize; @@ -25,8 +25,8 @@ use crate::util::{request_stats, tokio_thread_local_stats}; pub(crate) 
struct Args { #[clap(long, default_value = "http://localhost:9898")] mgmt_api_endpoint: String, - #[clap(long, default_value = "localhost:64000")] - page_service_host_port: String, + #[clap(long, default_value = "postgres://postgres@localhost:64000")] + page_service_connstring: String, #[clap(long)] pageserver_jwt: Option, #[clap(long, default_value = "1")] @@ -230,12 +230,9 @@ async fn client( ) { start_work_barrier.wait().await; - let client = pageserver_client::page_service::Client::new(crate::util::connstring::connstring( - &args.page_service_host_port, - args.pageserver_jwt.as_deref(), - )) - .await - .unwrap(); + let client = pageserver_client::page_service::Client::new(args.page_service_connstring.clone()) + .await + .unwrap(); while let Some(Work { lsn, gzip }) = work.recv().await { let start = Instant::now(); @@ -263,7 +260,7 @@ async fn client( } }) .await; - debug!("basebackup size is {} bytes", size.load(Ordering::Relaxed)); + info!("basebackup size is {} bytes", size.load(Ordering::Relaxed)); let elapsed = start.elapsed(); live_stats.inc(); STATS.with(|stats| { diff --git a/pageserver/pagebench/src/cmd/getpage_latest_lsn.rs b/pageserver/pagebench/src/cmd/getpage_latest_lsn.rs index 400b5476b7..2838511a77 100644 --- a/pageserver/pagebench/src/cmd/getpage_latest_lsn.rs +++ b/pageserver/pagebench/src/cmd/getpage_latest_lsn.rs @@ -1,6 +1,5 @@ use anyhow::Context; use camino::Utf8PathBuf; -use futures::future::join_all; use pageserver_api::key::{is_rel_block_key, key_to_rel_block, Key}; use pageserver_api::keyspace::KeySpaceAccum; use pageserver_api::models::PagestreamGetPageRequest; @@ -10,11 +9,10 @@ use utils::id::TenantTimelineId; use utils::lsn::Lsn; use rand::prelude::*; -use tokio::sync::Barrier; use tokio::task::JoinSet; -use tracing::{info, instrument}; +use tracing::info; -use std::collections::{HashMap, HashSet}; +use std::collections::HashSet; use std::future::Future; use std::num::NonZeroUsize; use std::pin::Pin; @@ -38,8 +36,12 @@ pub(crate) 
struct Args { num_clients: NonZeroUsize, #[clap(long)] runtime: Option, + /// Each client sends requests at the given rate. + /// + /// If a request takes too long and we should be issuing a new request already, + /// we skip that request and account it as `MISSED`. #[clap(long)] - per_target_rate_limit: Option, + per_client_rate: Option, /// Probability for sending `latest=true` in the request (uniform distribution). #[clap(long, default_value = "1")] req_latest_probability: f64, @@ -51,18 +53,26 @@ pub(crate) struct Args { /// It doesn't get invalidated if the keyspace changes under the hood, e.g., due to new ingested data or compaction. #[clap(long)] keyspace_cache: Option, + /// Before starting the benchmark, live-reconfigure the pageserver to use the given + /// [`pageserver_api::models::virtual_file::IoEngineKind`]. + #[clap(long)] + set_io_engine: Option, targets: Option>, } #[derive(Debug, Default)] struct LiveStats { completed_requests: AtomicU64, + missed: AtomicU64, } impl LiveStats { - fn inc(&self) { + fn request_done(&self) { self.completed_requests.fetch_add(1, Ordering::Relaxed); } + fn missed(&self, n: u64) { + self.missed.fetch_add(n, Ordering::Relaxed); + } } #[derive(Clone, serde::Serialize, serde::Deserialize)] @@ -79,6 +89,12 @@ impl KeyRange { } } +#[derive(PartialEq, Eq, Hash, Copy, Clone)] +struct WorkerId { + timeline: TenantTimelineId, + num_client: usize, // from 0..args.num_clients +} + #[derive(serde::Serialize)] struct Output { total: request_stats::Output, @@ -103,6 +119,10 @@ async fn main_impl( args.pageserver_jwt.as_deref(), )); + if let Some(engine_str) = &args.set_io_engine { + mgmt_api_client.put_io_engine(engine_str).await?; + } + // discover targets let timelines: Vec = crate::util::cli::targets::discover( &mgmt_api_client, @@ -206,13 +226,12 @@ async fn main_impl( let live_stats = Arc::new(LiveStats::default()); - let num_client_tasks = timelines.len(); let num_live_stats_dump = 1; - let num_work_sender_tasks = 1; + let 
num_work_sender_tasks = args.num_clients.get() * timelines.len(); let num_main_impl = 1; let start_work_barrier = Arc::new(tokio::sync::Barrier::new( - num_client_tasks + num_live_stats_dump + num_work_sender_tasks + num_main_impl, + num_live_stats_dump + num_work_sender_tasks + num_main_impl, )); tokio::spawn({ @@ -224,10 +243,12 @@ async fn main_impl( let start = std::time::Instant::now(); tokio::time::sleep(std::time::Duration::from_secs(1)).await; let completed_requests = stats.completed_requests.swap(0, Ordering::Relaxed); + let missed = stats.missed.swap(0, Ordering::Relaxed); let elapsed = start.elapsed(); info!( - "RPS: {:.0}", - completed_requests as f64 / elapsed.as_secs_f64() + "RPS: {:.0} MISSED: {:.0}", + completed_requests as f64 / elapsed.as_secs_f64(), + missed as f64 / elapsed.as_secs_f64() ); } } @@ -235,119 +256,104 @@ async fn main_impl( let cancel = CancellationToken::new(); - let mut work_senders: HashMap = HashMap::new(); - let mut tasks = Vec::new(); - for tl in &timelines { - let (sender, receiver) = tokio::sync::mpsc::channel(10); // TODO: not sure what the implications of this are - work_senders.insert(*tl, sender); - tasks.push(tokio::spawn(client( - args, - *tl, - Arc::clone(&start_work_barrier), - receiver, - Arc::clone(&live_stats), - cancel.clone(), - ))); - } - - let work_sender: Pin>> = { + let rps_period = args + .per_client_rate + .map(|rps_limit| Duration::from_secs_f64(1.0 / (rps_limit as f64))); + let make_worker: &dyn Fn(WorkerId) -> Pin>> = &|worker_id| { + let live_stats = live_stats.clone(); let start_work_barrier = start_work_barrier.clone(); - let cancel = cancel.clone(); - match args.per_target_rate_limit { - None => Box::pin(async move { - let weights = rand::distributions::weighted::WeightedIndex::new( - all_ranges.iter().map(|v| v.len()), - ) + let ranges: Vec = all_ranges + .iter() + .filter(|r| r.timeline == worker_id.timeline) + .cloned() + .collect(); + let weights = + 
rand::distributions::weighted::WeightedIndex::new(ranges.iter().map(|v| v.len())) .unwrap(); - start_work_barrier.wait().await; - - while !cancel.is_cancelled() { - let (timeline, req) = { - let mut rng = rand::thread_rng(); - let r = &all_ranges[weights.sample(&mut rng)]; - let key: i128 = rng.gen_range(r.start..r.end); - let key = Key::from_i128(key); - let (rel_tag, block_no) = - key_to_rel_block(key).expect("we filter non-rel-block keys out above"); - ( - r.timeline, - PagestreamGetPageRequest { - latest: rng.gen_bool(args.req_latest_probability), - lsn: r.timeline_lsn, - rel: rel_tag, - blkno: block_no, - }, - ) - }; - let sender = work_senders.get(&timeline).unwrap(); - // TODO: what if this blocks? - if sender.send(req).await.is_err() { - assert!(cancel.is_cancelled(), "client has gone away unexpectedly"); - } - } - }), - Some(rps_limit) => Box::pin(async move { - let period = Duration::from_secs_f64(1.0 / (rps_limit as f64)); - let make_timeline_task: &dyn Fn( - TenantTimelineId, - ) - -> Pin>> = &|timeline| { - let sender = work_senders.get(&timeline).unwrap(); - let ranges: Vec = all_ranges - .iter() - .filter(|r| r.timeline == timeline) - .cloned() - .collect(); - let weights = rand::distributions::weighted::WeightedIndex::new( - ranges.iter().map(|v| v.len()), - ) + let cancel = cancel.clone(); + Box::pin(async move { + let client = + pageserver_client::page_service::Client::new(args.page_service_connstring.clone()) + .await .unwrap(); + let mut client = client + .pagestream(worker_id.timeline.tenant_id, worker_id.timeline.timeline_id) + .await + .unwrap(); - let cancel = cancel.clone(); - Box::pin(async move { - let mut ticker = tokio::time::interval(period); - ticker.set_missed_tick_behavior( - /* TODO review this choice */ - tokio::time::MissedTickBehavior::Burst, - ); - while !cancel.is_cancelled() { - ticker.tick().await; - let req = { - let mut rng = rand::thread_rng(); - let r = &ranges[weights.sample(&mut rng)]; - let key: i128 = 
rng.gen_range(r.start..r.end); - let key = Key::from_i128(key); - assert!(is_rel_block_key(&key)); - let (rel_tag, block_no) = key_to_rel_block(key) - .expect("we filter non-rel-block keys out above"); - PagestreamGetPageRequest { - latest: rng.gen_bool(args.req_latest_probability), - lsn: r.timeline_lsn, - rel: rel_tag, - blkno: block_no, - } - }; - if sender.send(req).await.is_err() { - assert!(cancel.is_cancelled(), "client has gone away unexpectedly"); - } - } - }) + start_work_barrier.wait().await; + let client_start = Instant::now(); + let mut ticks_processed = 0; + while !cancel.is_cancelled() { + // Detect if a request took longer than the RPS rate + if let Some(period) = &rps_period { + let periods_passed_until_now = + usize::try_from(client_start.elapsed().as_micros() / period.as_micros()) + .unwrap(); + + if periods_passed_until_now > ticks_processed { + live_stats.missed((periods_passed_until_now - ticks_processed) as u64); + } + ticks_processed = periods_passed_until_now; + } + + let start = Instant::now(); + let req = { + let mut rng = rand::thread_rng(); + let r = &ranges[weights.sample(&mut rng)]; + let key: i128 = rng.gen_range(r.start..r.end); + let key = Key::from_i128(key); + assert!(is_rel_block_key(&key)); + let (rel_tag, block_no) = + key_to_rel_block(key).expect("we filter non-rel-block keys out above"); + PagestreamGetPageRequest { + latest: rng.gen_bool(args.req_latest_probability), + lsn: r.timeline_lsn, + rel: rel_tag, + blkno: block_no, + } }; + client.getpage(req).await.unwrap(); + let end = Instant::now(); + live_stats.request_done(); + ticks_processed += 1; + STATS.with(|stats| { + stats + .borrow() + .lock() + .unwrap() + .observe(end.duration_since(start)) + .unwrap(); + }); - let tasks: Vec<_> = work_senders - .keys() - .map(|tl| make_timeline_task(*tl)) - .collect(); - - start_work_barrier.wait().await; - - join_all(tasks).await; - }), - } + if let Some(period) = &rps_period { + let next_at = client_start + + 
Duration::from_micros( + (ticks_processed) as u64 * u64::try_from(period.as_micros()).unwrap(), + ); + tokio::time::sleep_until(next_at.into()).await; + } + } + }) }; - let work_sender_task = tokio::spawn(work_sender); + info!("spawning workers"); + let mut workers = JoinSet::new(); + for timeline in timelines.iter().cloned() { + for num_client in 0..args.num_clients.get() { + let worker_id = WorkerId { + timeline, + num_client, + }; + workers.spawn(make_worker(worker_id)); + } + } + let workers = async move { + while let Some(res) = workers.join_next().await { + res.unwrap(); + } + }; info!("waiting for everything to become ready"); start_work_barrier.wait().await; @@ -356,20 +362,13 @@ async fn main_impl( tokio::time::sleep(runtime.into()).await; info!("runtime over, signalling cancellation"); cancel.cancel(); - work_sender_task.await.unwrap(); + workers.await; info!("work sender exited"); } else { - work_sender_task.await.unwrap(); + workers.await; unreachable!("work sender never terminates"); } - info!("joining clients"); - for t in tasks { - t.await.unwrap(); - } - - info!("all clients stopped"); - let output = Output { total: { let mut agg_stats = request_stats::Stats::new(); @@ -386,45 +385,3 @@ async fn main_impl( anyhow::Ok(()) } - -#[instrument(skip_all)] -async fn client( - args: &'static Args, - timeline: TenantTimelineId, - start_work_barrier: Arc, - mut work: tokio::sync::mpsc::Receiver, - live_stats: Arc, - cancel: CancellationToken, -) { - let client = pageserver_client::page_service::Client::new(args.page_service_connstring.clone()) - .await - .unwrap(); - let mut client = client - .pagestream(timeline.tenant_id, timeline.timeline_id) - .await - .unwrap(); - - let do_requests = async { - start_work_barrier.wait().await; - while let Some(req) = work.recv().await { - let start = Instant::now(); - client - .getpage(req) - .await - .with_context(|| format!("getpage for {timeline}")) - .unwrap(); - let elapsed = start.elapsed(); - live_stats.inc(); - 
STATS.with(|stats| { - stats.borrow().lock().unwrap().observe(elapsed).unwrap(); - }); - } - }; - tokio::select! { - res = do_requests => { res }, - _ = cancel.cancelled() => { - // fallthrough to shutdown - } - } - client.shutdown().await; -} diff --git a/pageserver/pagebench/src/main.rs b/pageserver/pagebench/src/main.rs index 9fa77f0671..5d688ed2d1 100644 --- a/pageserver/pagebench/src/main.rs +++ b/pageserver/pagebench/src/main.rs @@ -3,7 +3,6 @@ use utils::logging; /// Re-usable pieces of code that aren't CLI-specific. mod util { - pub(crate) mod connstring; pub(crate) mod request_stats; #[macro_use] pub(crate) mod tokio_thread_local_stats; diff --git a/pageserver/pagebench/src/util/connstring.rs b/pageserver/pagebench/src/util/connstring.rs deleted file mode 100644 index 07a0ff042d..0000000000 --- a/pageserver/pagebench/src/util/connstring.rs +++ /dev/null @@ -1,8 +0,0 @@ -pub(crate) fn connstring(host_port: &str, jwt: Option<&str>) -> String { - let colon_and_jwt = if let Some(jwt) = jwt { - format!(":{jwt}") // TODO: urlescape - } else { - String::new() - }; - format!("postgres://postgres{colon_and_jwt}@{host_port}") -} diff --git a/pageserver/pagebench/src/util/request_stats.rs b/pageserver/pagebench/src/util/request_stats.rs index 5ecf1cbf24..4aa6950782 100644 --- a/pageserver/pagebench/src/util/request_stats.rs +++ b/pageserver/pagebench/src/util/request_stats.rs @@ -66,13 +66,10 @@ impl serde::Serialize for LatencyPercentiles { { use serde::ser::SerializeMap; let mut ser = serializer.serialize_map(Some(LATENCY_PERCENTILES.len()))?; - for p in LATENCY_PERCENTILES { + for (p, v) in LATENCY_PERCENTILES.iter().zip(&self.latency_percentiles) { ser.serialize_entry( &format!("p{p}"), - &format!( - "{}", - &humantime::format_duration(self.latency_percentiles[0]) - ), + &format!("{}", humantime::format_duration(*v)), )?; } ser.end() diff --git a/pageserver/src/auth.rs b/pageserver/src/auth.rs index 2cb661863d..4785c8c4c5 100644 --- a/pageserver/src/auth.rs +++ 
b/pageserver/src/auth.rs @@ -14,8 +14,12 @@ pub fn check_permission(claims: &Claims, tenant_id: Option) -> Result< } (Scope::PageServerApi, None) => Ok(()), // access to management api for PageServerApi scope (Scope::PageServerApi, Some(_)) => Ok(()), // access to tenant api using PageServerApi scope - (Scope::SafekeeperData, _) => Err(AuthError( - "SafekeeperData scope makes no sense for Pageserver".into(), + (Scope::Admin | Scope::SafekeeperData | Scope::GenerationsApi, _) => Err(AuthError( + format!( + "JWT scope '{:?}' is ineligible for Pageserver auth", + claims.scope + ) + .into(), )), } } diff --git a/pageserver/src/basebackup.rs b/pageserver/src/basebackup.rs index 009deff0aa..0479d05f8f 100644 --- a/pageserver/src/basebackup.rs +++ b/pageserver/src/basebackup.rs @@ -143,6 +143,7 @@ where ar: &'a mut Builder<&'b mut W>, buf: Vec, current_segment: Option<(SlruKind, u32)>, + total_blocks: usize, } impl<'a, 'b, W> SlruSegmentsBuilder<'a, 'b, W> @@ -154,6 +155,7 @@ where ar, buf: Vec::new(), current_segment: None, + total_blocks: 0, } } @@ -199,7 +201,8 @@ where let header = new_tar_header(&segname, self.buf.len() as u64)?; self.ar.append(&header, self.buf.as_slice()).await?; - trace!("Added to basebackup slru {} relsize {}", segname, nblocks); + self.total_blocks += nblocks; + debug!("Added to basebackup slru {} relsize {}", segname, nblocks); self.buf.clear(); @@ -207,11 +210,15 @@ where } async fn finish(mut self) -> anyhow::Result<()> { - if self.current_segment.is_none() || self.buf.is_empty() { - return Ok(()); - } + let res = if self.current_segment.is_none() || self.buf.is_empty() { + Ok(()) + } else { + self.flush().await + }; - self.flush().await + info!("Collected {} SLRU blocks", self.total_blocks); + + res } } @@ -222,6 +229,8 @@ where async fn send_tarball(mut self) -> anyhow::Result<()> { // TODO include checksum + let lazy_slru_download = self.timeline.get_lazy_slru_download() && !self.full_backup; + // Create pgdata subdirs structure for dir in 
PGDATA_SUBDIRS.iter() { let header = new_tar_header_dir(dir)?; @@ -248,29 +257,26 @@ where .context("could not add config file to basebackup tarball")?; } } - - // Gather non-relational files from object storage pages. - let slru_partitions = self - .timeline - .get_slru_keyspace(Version::Lsn(self.lsn), self.ctx) - .await? - .partition(Timeline::MAX_GET_VECTORED_KEYS * BLCKSZ as u64); - - let mut slru_builder = SlruSegmentsBuilder::new(&mut self.ar); - - for part in slru_partitions.parts { - let blocks = self + if !lazy_slru_download { + // Gather non-relational files from object storage pages. + let slru_partitions = self .timeline - .get_vectored(&part.ranges, self.lsn, self.ctx) - .await?; + .get_slru_keyspace(Version::Lsn(self.lsn), self.ctx) + .await? + .partition(Timeline::MAX_GET_VECTORED_KEYS * BLCKSZ as u64); - for (key, block) in blocks { - slru_builder.add_block(&key, block?).await?; + let mut slru_builder = SlruSegmentsBuilder::new(&mut self.ar); + + for part in slru_partitions.parts { + let blocks = self.timeline.get_vectored(part, self.lsn, self.ctx).await?; + + for (key, block) in blocks { + slru_builder.add_block(&key, block?).await?; + } } + slru_builder.finish().await?; } - slru_builder.finish().await?; - let mut min_restart_lsn: Lsn = Lsn::MAX; // Create tablespace directories for ((spcnode, dbnode), has_relmap_file) in diff --git a/pageserver/src/bin/pageserver.rs b/pageserver/src/bin/pageserver.rs index 84de76e55e..2f172bd384 100644 --- a/pageserver/src/bin/pageserver.rs +++ b/pageserver/src/bin/pageserver.rs @@ -33,12 +33,10 @@ use pageserver::{ use postgres_backend::AuthType; use utils::failpoint_support; use utils::logging::TracingErrorLayerEnablement; -use utils::signals::ShutdownSignals; use utils::{ auth::{JwtAuth, SwappableJwtAuth}, logging, project_build_tag, project_git_version, sentry_init::init_sentry, - signals::Signal, tcp_listener, }; @@ -274,6 +272,12 @@ fn start_pageserver( ); set_build_info_metric(GIT_VERSION, BUILD_TAG); 
set_launch_timestamp_metric(launch_ts); + #[cfg(target_os = "linux")] + metrics::register_internal(Box::new(metrics::more_process_metrics::Collector::new())).unwrap(); + metrics::register_internal(Box::new( + pageserver::metrics::tokio_epoll_uring::Collector::new(), + )) + .unwrap(); pageserver::preinitialize_metrics(); // If any failpoints were set from FAILPOINTS environment variable, @@ -656,34 +660,42 @@ fn start_pageserver( let mut shutdown_pageserver = Some(shutdown_pageserver.drop_guard()); // All started up! Now just sit and wait for shutdown signal. - ShutdownSignals::handle(|signal| match signal { - Signal::Quit => { - info!( - "Got {}. Terminating in immediate shutdown mode", - signal.name() - ); - std::process::exit(111); - } + { + use signal_hook::consts::*; + let signal_handler = BACKGROUND_RUNTIME.spawn_blocking(move || { + let mut signals = + signal_hook::iterator::Signals::new([SIGINT, SIGTERM, SIGQUIT]).unwrap(); + return signals + .forever() + .next() + .expect("forever() never returns None unless explicitly closed"); + }); + let signal = BACKGROUND_RUNTIME + .block_on(signal_handler) + .expect("join error"); + match signal { + SIGQUIT => { + info!("Got signal {signal}. Terminating in immediate shutdown mode",); + std::process::exit(111); + } + SIGINT | SIGTERM => { + info!("Got signal {signal}. Terminating gracefully in fast shutdown mode",); - Signal::Interrupt | Signal::Terminate => { - info!( - "Got {}. Terminating gracefully in fast shutdown mode", - signal.name() - ); - - // This cancels the `shutdown_pageserver` cancellation tree. - // Right now that tree doesn't reach very far, and `task_mgr` is used instead. - // The plan is to change that over time. 
- shutdown_pageserver.take(); - let bg_remote_storage = remote_storage.clone(); - let bg_deletion_queue = deletion_queue.clone(); - BACKGROUND_RUNTIME.block_on(pageserver::shutdown_pageserver( - bg_remote_storage.map(|_| bg_deletion_queue), - 0, - )); - unreachable!() + // This cancels the `shutdown_pageserver` cancellation tree. + // Right now that tree doesn't reach very far, and `task_mgr` is used instead. + // The plan is to change that over time. + shutdown_pageserver.take(); + let bg_remote_storage = remote_storage.clone(); + let bg_deletion_queue = deletion_queue.clone(); + BACKGROUND_RUNTIME.block_on(pageserver::shutdown_pageserver( + bg_remote_storage.map(|_| bg_deletion_queue), + 0, + )); + unreachable!() + } + _ => unreachable!(), } - }) + } } fn create_remote_storage_client( diff --git a/pageserver/src/config.rs b/pageserver/src/config.rs index a5dac55af1..70aa30d24e 100644 --- a/pageserver/src/config.rs +++ b/pageserver/src/config.rs @@ -7,8 +7,9 @@ use anyhow::{anyhow, bail, ensure, Context, Result}; use pageserver_api::shard::TenantShardId; use remote_storage::{RemotePath, RemoteStorageConfig}; +use serde; use serde::de::IntoDeserializer; -use std::env; +use std::{collections::HashMap, env}; use storage_broker::Uri; use utils::crashsafe::path_with_suffix_extension; use utils::id::ConnectionId; @@ -20,7 +21,6 @@ use std::num::NonZeroUsize; use std::str::FromStr; use std::sync::Arc; use std::time::Duration; -use toml_edit; use toml_edit::{Document, Item}; use camino::{Utf8Path, Utf8PathBuf}; @@ -33,12 +33,14 @@ use utils::{ use crate::disk_usage_eviction_task::DiskUsageEvictionTaskConfig; use crate::tenant::config::TenantConf; use crate::tenant::config::TenantConfOpt; +use crate::tenant::timeline::GetVectoredImpl; +use crate::tenant::vectored_blob_io::MaxVectoredReadBytes; use crate::tenant::{ TENANTS_SEGMENT_NAME, TENANT_DELETED_MARKER_FILE_NAME, TIMELINES_SEGMENT_NAME, }; use crate::virtual_file; use crate::{ - IGNORED_TENANT_FILE_NAME, 
METADATA_FILE_NAME, TENANT_CONFIG_NAME, TENANT_HEATMAP_BASENAME, + IGNORED_TENANT_FILE_NAME, TENANT_CONFIG_NAME, TENANT_HEATMAP_BASENAME, TENANT_LOCATION_CONFIG_NAME, TIMELINE_DELETE_MARK_SUFFIX, TIMELINE_UNINIT_MARK_SUFFIX, }; @@ -81,8 +83,18 @@ pub mod defaults { pub const DEFAULT_INGEST_BATCH_SIZE: u64 = 100; + #[cfg(target_os = "linux")] + pub const DEFAULT_VIRTUAL_FILE_IO_ENGINE: &str = "tokio-epoll-uring"; + + #[cfg(not(target_os = "linux"))] pub const DEFAULT_VIRTUAL_FILE_IO_ENGINE: &str = "std-fs"; + pub const DEFAULT_GET_VECTORED_IMPL: &str = "sequential"; + + pub const DEFAULT_MAX_VECTORED_READ_BYTES: usize = 128 * 1024; // 128 KiB + + pub const DEFAULT_VALIDATE_VECTORED_GET: bool = true; + /// /// Default built-in configuration file. /// @@ -119,6 +131,12 @@ pub mod defaults { #virtual_file_io_engine = '{DEFAULT_VIRTUAL_FILE_IO_ENGINE}' +#get_vectored_impl = '{DEFAULT_GET_VECTORED_IMPL}' + +#max_vectored_read_bytes = '{DEFAULT_MAX_VECTORED_READ_BYTES}' + +#validate_vectored_get = '{DEFAULT_VALIDATE_VECTORED_GET}' + [tenant_config] #checkpoint_distance = {DEFAULT_CHECKPOINT_DISTANCE} # in bytes #checkpoint_timeout = {DEFAULT_CHECKPOINT_TIMEOUT} @@ -133,7 +151,6 @@ pub mod defaults { #min_resident_size_override = .. # in bytes #evictions_low_residence_duration_metric_threshold = '{DEFAULT_EVICTIONS_LOW_RESIDENCE_DURATION_METRIC_THRESHOLD}' -#gc_feedback = false #heatmap_upload_concurrency = {DEFAULT_HEATMAP_UPLOAD_CONCURRENCY} #secondary_download_concurrency = {DEFAULT_SECONDARY_DOWNLOAD_CONCURRENCY} @@ -195,9 +212,9 @@ pub struct PageServerConf { pub log_format: LogFormat, - /// Number of tenants which will be concurrently loaded from remote storage proactively on startup, - /// does not limit tenants loaded in response to client I/O. A lower value implicitly deprioritizes - /// loading such tenants, vs. other work in the system. + /// Number of tenants which will be concurrently loaded from remote storage proactively on startup or attach. 
+ /// + /// A lower value implicitly deprioritizes loading such tenants, vs. other work in the system. pub concurrent_tenant_warmup: ConfigurableSemaphore, /// Number of concurrent [`Tenant::gather_size_inputs`](crate::tenant::Tenant::gather_size_inputs) allowed. @@ -252,6 +269,12 @@ pub struct PageServerConf { pub ingest_batch_size: u64, pub virtual_file_io_engine: virtual_file::IoEngineKind, + + pub get_vectored_impl: GetVectoredImpl, + + pub max_vectored_read_bytes: MaxVectoredReadBytes, + + pub validate_vectored_get: bool, } /// We do not want to store this in a PageServerConf because the latter may be logged @@ -278,6 +301,26 @@ impl BuilderValue { } } +// Certain metadata (e.g. externally-addressable name, AZ) is delivered +// as a separate structure. This information is not neeed by the pageserver +// itself, it is only used for registering the pageserver with the control +// plane and/or storage controller. +// +#[derive(serde::Deserialize)] +pub(crate) struct NodeMetadata { + #[serde(rename = "host")] + pub(crate) postgres_host: String, + #[serde(rename = "port")] + pub(crate) postgres_port: u16, + pub(crate) http_host: String, + pub(crate) http_port: u16, + + // Deployment tools may write fields to the metadata file beyond what we + // use in this type: this type intentionally only names fields that require. 
+ #[serde(flatten)] + pub(crate) other: HashMap, +} + // needed to simplify config construction struct PageServerConfigBuilder { listen_pg_addr: BuilderValue, @@ -337,6 +380,12 @@ struct PageServerConfigBuilder { ingest_batch_size: BuilderValue, virtual_file_io_engine: BuilderValue, + + get_vectored_impl: BuilderValue, + + max_vectored_read_bytes: BuilderValue, + + validate_vectored_get: BuilderValue, } impl Default for PageServerConfigBuilder { @@ -412,6 +461,12 @@ impl Default for PageServerConfigBuilder { ingest_batch_size: Set(DEFAULT_INGEST_BATCH_SIZE), virtual_file_io_engine: Set(DEFAULT_VIRTUAL_FILE_IO_ENGINE.parse().unwrap()), + + get_vectored_impl: Set(DEFAULT_GET_VECTORED_IMPL.parse().unwrap()), + max_vectored_read_bytes: Set(MaxVectoredReadBytes( + NonZeroUsize::new(DEFAULT_MAX_VECTORED_READ_BYTES).unwrap(), + )), + validate_vectored_get: Set(DEFAULT_VALIDATE_VECTORED_GET), } } } @@ -568,6 +623,18 @@ impl PageServerConfigBuilder { self.virtual_file_io_engine = BuilderValue::Set(value); } + pub fn get_vectored_impl(&mut self, value: GetVectoredImpl) { + self.get_vectored_impl = BuilderValue::Set(value); + } + + pub fn get_max_vectored_read_bytes(&mut self, value: MaxVectoredReadBytes) { + self.max_vectored_read_bytes = BuilderValue::Set(value); + } + + pub fn get_validate_vectored_get(&mut self, value: bool) { + self.validate_vectored_get = BuilderValue::Set(value); + } + pub fn build(self) -> anyhow::Result { let concurrent_tenant_warmup = self .concurrent_tenant_warmup @@ -675,6 +742,15 @@ impl PageServerConfigBuilder { virtual_file_io_engine: self .virtual_file_io_engine .ok_or(anyhow!("missing virtual_file_io_engine"))?, + get_vectored_impl: self + .get_vectored_impl + .ok_or(anyhow!("missing get_vectored_impl"))?, + max_vectored_read_bytes: self + .max_vectored_read_bytes + .ok_or(anyhow!("missing max_vectored_read_bytes"))?, + validate_vectored_get: self + .validate_vectored_get + .ok_or(anyhow!("missing validate_vectored_get"))?, }) } } @@ -692,6 
+768,10 @@ impl PageServerConf { self.workdir.join("deletion") } + pub fn metadata_path(&self) -> Utf8PathBuf { + self.workdir.join("metadata.json") + } + pub fn deletion_list_path(&self, sequence: u64) -> Utf8PathBuf { // Encode a version in the filename, so that if we ever switch away from JSON we can // increment this. @@ -794,17 +874,6 @@ impl PageServerConf { .join(connection_id.to_string()) } - /// Points to a place in pageserver's local directory, - /// where certain timeline's metadata file should be located. - pub fn metadata_path( - &self, - tenant_shard_id: &TenantShardId, - timeline_id: &TimelineId, - ) -> Utf8PathBuf { - self.timeline_path(tenant_shard_id, timeline_id) - .join(METADATA_FILE_NAME) - } - /// Turns storage remote path of a file into its local path. pub fn local_path(&self, remote_path: &RemotePath) -> Utf8PathBuf { remote_path.with_base(&self.workdir) @@ -928,6 +997,18 @@ impl PageServerConf { "virtual_file_io_engine" => { builder.virtual_file_io_engine(parse_toml_from_str("virtual_file_io_engine", item)?) } + "get_vectored_impl" => { + builder.get_vectored_impl(parse_toml_from_str("get_vectored_impl", item)?) + } + "max_vectored_read_bytes" => { + let bytes = parse_toml_u64("max_vectored_read_bytes", item)? as usize; + builder.get_max_vectored_read_bytes( + MaxVectoredReadBytes( + NonZeroUsize::new(bytes).expect("Max byte size of vectored read must be greater than 0"))) + } + "validate_vectored_get" => { + builder.get_validate_vectored_get(parse_toml_bool("validate_vectored_get", item)?) 
+ } _ => bail!("unrecognized pageserver option '{key}'"), } } @@ -1001,6 +1082,12 @@ impl PageServerConf { secondary_download_concurrency: defaults::DEFAULT_SECONDARY_DOWNLOAD_CONCURRENCY, ingest_batch_size: defaults::DEFAULT_INGEST_BATCH_SIZE, virtual_file_io_engine: DEFAULT_VIRTUAL_FILE_IO_ENGINE.parse().unwrap(), + get_vectored_impl: defaults::DEFAULT_GET_VECTORED_IMPL.parse().unwrap(), + max_vectored_read_bytes: MaxVectoredReadBytes( + NonZeroUsize::new(defaults::DEFAULT_MAX_VECTORED_READ_BYTES) + .expect("Invalid default constant"), + ), + validate_vectored_get: defaults::DEFAULT_VALIDATE_VECTORED_GET, } } } @@ -1128,10 +1215,7 @@ impl ConfigurableSemaphore { #[cfg(test)] mod tests { - use std::{ - fs, - num::{NonZeroU32, NonZeroUsize}, - }; + use std::{fs, num::NonZeroU32}; use camino_tempfile::{tempdir, Utf8TempDir}; use pageserver_api::models::EvictionPolicy; @@ -1232,6 +1316,12 @@ background_task_maximum_delay = '334 s' secondary_download_concurrency: defaults::DEFAULT_SECONDARY_DOWNLOAD_CONCURRENCY, ingest_batch_size: defaults::DEFAULT_INGEST_BATCH_SIZE, virtual_file_io_engine: DEFAULT_VIRTUAL_FILE_IO_ENGINE.parse().unwrap(), + get_vectored_impl: defaults::DEFAULT_GET_VECTORED_IMPL.parse().unwrap(), + max_vectored_read_bytes: MaxVectoredReadBytes( + NonZeroUsize::new(defaults::DEFAULT_MAX_VECTORED_READ_BYTES) + .expect("Invalid default constant") + ), + validate_vectored_get: defaults::DEFAULT_VALIDATE_VECTORED_GET, }, "Correct defaults should be used when no config values are provided" ); @@ -1295,6 +1385,12 @@ background_task_maximum_delay = '334 s' secondary_download_concurrency: defaults::DEFAULT_SECONDARY_DOWNLOAD_CONCURRENCY, ingest_batch_size: 100, virtual_file_io_engine: DEFAULT_VIRTUAL_FILE_IO_ENGINE.parse().unwrap(), + get_vectored_impl: defaults::DEFAULT_GET_VECTORED_IMPL.parse().unwrap(), + max_vectored_read_bytes: MaxVectoredReadBytes( + NonZeroUsize::new(defaults::DEFAULT_MAX_VECTORED_READ_BYTES) + .expect("Invalid default constant") + ), + 
validate_vectored_get: defaults::DEFAULT_VALIDATE_VECTORED_GET, }, "Should be able to parse all basic config values correctly" ); @@ -1340,6 +1436,7 @@ broker_endpoint = '{broker_endpoint}' parsed_remote_storage_config, RemoteStorageConfig { storage: RemoteStorageKind::LocalFs(local_storage_path.clone()), + timeout: RemoteStorageConfig::DEFAULT_TIMEOUT, }, "Remote storage config should correctly parse the local FS config and fill other storage defaults" ); @@ -1407,6 +1504,7 @@ broker_endpoint = '{broker_endpoint}' concurrency_limit: s3_concurrency_limit, max_keys_per_list_response: None, }), + timeout: RemoteStorageConfig::DEFAULT_TIMEOUT, }, "Remote storage config should correctly parse the S3 config" ); @@ -1527,17 +1625,50 @@ threshold = "20m" eviction_order: crate::disk_usage_eviction_task::EvictionOrder::AbsoluteAccessed, }) ); + match &conf.default_tenant_conf.eviction_policy { - EvictionPolicy::NoEviction => panic!("Unexpected eviction opolicy tenant settings"), - EvictionPolicy::LayerAccessThreshold(eviction_thresold) => { - assert_eq!(eviction_thresold.period, Duration::from_secs(20 * 60)); - assert_eq!(eviction_thresold.threshold, Duration::from_secs(20 * 60)); + EvictionPolicy::LayerAccessThreshold(eviction_threshold) => { + assert_eq!(eviction_threshold.period, Duration::from_secs(20 * 60)); + assert_eq!(eviction_threshold.threshold, Duration::from_secs(20 * 60)); } + other => unreachable!("Unexpected eviction policy tenant settings: {other:?}"), } Ok(()) } + #[test] + fn parse_imitation_only_pageserver_config() { + let tempdir = tempdir().unwrap(); + let (workdir, pg_distrib_dir) = prepare_fs(&tempdir).unwrap(); + + let pageserver_conf_toml = format!( + r#"pg_distrib_dir = "{pg_distrib_dir}" +metric_collection_endpoint = "http://sample.url" +metric_collection_interval = "10min" +id = 222 + +[tenant_config] +evictions_low_residence_duration_metric_threshold = "20m" + +[tenant_config.eviction_policy] +kind = "OnlyImitiate" +period = "20m" +threshold = 
"20m" +"#, + ); + let toml: Document = pageserver_conf_toml.parse().unwrap(); + let conf = PageServerConf::parse_and_validate(&toml, &workdir).unwrap(); + + match &conf.default_tenant_conf.eviction_policy { + EvictionPolicy::OnlyImitiate(t) => { + assert_eq!(t.period, Duration::from_secs(20 * 60)); + assert_eq!(t.threshold, Duration::from_secs(20 * 60)); + } + other => unreachable!("Unexpected eviction policy tenant settings: {other:?}"), + } + } + fn prepare_fs(tempdir: &Utf8TempDir) -> anyhow::Result<(Utf8PathBuf, Utf8PathBuf)> { let tempdir_path = tempdir.path(); diff --git a/pageserver/src/consumption_metrics.rs b/pageserver/src/consumption_metrics.rs index 012a950b60..c7f9d596c6 100644 --- a/pageserver/src/consumption_metrics.rs +++ b/pageserver/src/consumption_metrics.rs @@ -17,7 +17,7 @@ use tracing::*; use utils::id::NodeId; mod metrics; -use metrics::MetricsKey; +use crate::consumption_metrics::metrics::MetricsKey; mod disk_cache; mod upload; diff --git a/pageserver/src/consumption_metrics/metrics/tests.rs b/pageserver/src/consumption_metrics/metrics/tests.rs index 38a4c9eb5d..f9cbcea565 100644 --- a/pageserver/src/consumption_metrics/metrics/tests.rs +++ b/pageserver/src/consumption_metrics/metrics/tests.rs @@ -1,7 +1,5 @@ use super::*; use std::collections::HashMap; -use std::time::SystemTime; -use utils::lsn::Lsn; #[test] fn startup_collected_timeline_metrics_before_advancing() { diff --git a/pageserver/src/consumption_metrics/upload.rs b/pageserver/src/consumption_metrics/upload.rs index 322ed95cc8..6b840a3136 100644 --- a/pageserver/src/consumption_metrics/upload.rs +++ b/pageserver/src/consumption_metrics/upload.rs @@ -262,35 +262,33 @@ async fn upload( ) -> Result<(), UploadError> { let warn_after = 3; let max_attempts = 10; + + // this is used only with tests so far + let last_value = if is_last { "true" } else { "false" }; + let res = utils::backoff::retry( - move || { - let body = body.clone(); - async move { - let res = client - 
.post(metric_collection_endpoint.clone()) - .header(reqwest::header::CONTENT_TYPE, "application/json") - .header( - LAST_IN_BATCH.clone(), - if is_last { "true" } else { "false" }, - ) - .body(body) - .send() - .await; + || async { + let res = client + .post(metric_collection_endpoint.clone()) + .header(reqwest::header::CONTENT_TYPE, "application/json") + .header(LAST_IN_BATCH.clone(), last_value) + .body(body.clone()) + .send() + .await; - let res = res.and_then(|res| res.error_for_status()); + let res = res.and_then(|res| res.error_for_status()); - // 10 redirects are normally allowed, so we don't need worry about 3xx - match res { - Ok(_response) => Ok(()), - Err(e) => { - let status = e.status().filter(|s| s.is_client_error()); - if let Some(status) = status { - // rejection used to be a thing when the server could reject a - // whole batch of metrics if one metric was bad. - Err(UploadError::Rejected(status)) - } else { - Err(UploadError::Reqwest(e)) - } + // 10 redirects are normally allowed, so we don't need worry about 3xx + match res { + Ok(_response) => Ok(()), + Err(e) => { + let status = e.status().filter(|s| s.is_client_error()); + if let Some(status) = status { + // rejection used to be a thing when the server could reject a + // whole batch of metrics if one metric was bad. + Err(UploadError::Rejected(status)) + } else { + Err(UploadError::Reqwest(e)) } } } @@ -299,9 +297,11 @@ async fn upload( warn_after, max_attempts, "upload consumption_metrics", - utils::backoff::Cancel::new(cancel.clone(), || UploadError::Cancelled), + cancel, ) - .await; + .await + .ok_or_else(|| UploadError::Cancelled) + .and_then(|x| x); match &res { Ok(_) => {} diff --git a/pageserver/src/context.rs b/pageserver/src/context.rs index ee331ea154..86d0390c30 100644 --- a/pageserver/src/context.rs +++ b/pageserver/src/context.rs @@ -88,13 +88,16 @@ use crate::task_mgr::TaskKind; +pub(crate) mod optional_counter; + // The main structure of this module, see module-level comment. 
-#[derive(Clone, Debug)] +#[derive(Debug)] pub struct RequestContext { task_kind: TaskKind, download_behavior: DownloadBehavior, access_stats_behavior: AccessStatsBehavior, page_content_kind: PageContentKind, + pub micros_spent_throttled: optional_counter::MicroSecondsCounterU32, } /// The kind of access to the page cache. @@ -150,6 +153,7 @@ impl RequestContextBuilder { download_behavior: DownloadBehavior::Download, access_stats_behavior: AccessStatsBehavior::Update, page_content_kind: PageContentKind::Unknown, + micros_spent_throttled: Default::default(), }, } } @@ -163,6 +167,7 @@ impl RequestContextBuilder { download_behavior: original.download_behavior, access_stats_behavior: original.access_stats_behavior, page_content_kind: original.page_content_kind, + micros_spent_throttled: Default::default(), }, } } diff --git a/pageserver/src/context/optional_counter.rs b/pageserver/src/context/optional_counter.rs new file mode 100644 index 0000000000..100c649f18 --- /dev/null +++ b/pageserver/src/context/optional_counter.rs @@ -0,0 +1,101 @@ +use std::{ + sync::atomic::{AtomicU32, Ordering}, + time::Duration, +}; + +#[derive(Debug)] +pub struct CounterU32 { + inner: AtomicU32, +} +impl Default for CounterU32 { + fn default() -> Self { + Self { + inner: AtomicU32::new(u32::MAX), + } + } +} +impl CounterU32 { + pub fn open(&self) -> Result<(), &'static str> { + match self + .inner + .compare_exchange(u32::MAX, 0, Ordering::Relaxed, Ordering::Relaxed) + { + Ok(_) => Ok(()), + Err(_) => Err("open() called on clsoed state"), + } + } + pub fn close(&self) -> Result { + match self.inner.swap(u32::MAX, Ordering::Relaxed) { + u32::MAX => Err("close() called on closed state"), + x => Ok(x), + } + } + + pub fn add(&self, count: u32) -> Result<(), &'static str> { + if count == 0 { + return Ok(()); + } + let mut had_err = None; + self.inner + .fetch_update(Ordering::Relaxed, Ordering::Relaxed, |cur| match cur { + u32::MAX => { + had_err = Some("add() called on closed state"); + 
None + } + x => { + let (new, overflowed) = x.overflowing_add(count); + if new == u32::MAX || overflowed { + had_err = Some("add() overflowed the counter"); + None + } else { + Some(new) + } + } + }) + .map_err(|_| had_err.expect("we set it whenever the function returns None")) + .map(|_| ()) + } +} + +#[derive(Default, Debug)] +pub struct MicroSecondsCounterU32 { + inner: CounterU32, +} + +impl MicroSecondsCounterU32 { + pub fn open(&self) -> Result<(), &'static str> { + self.inner.open() + } + pub fn add(&self, duration: Duration) -> Result<(), &'static str> { + match duration.as_micros().try_into() { + Ok(x) => self.inner.add(x), + Err(_) => Err("add(): duration conversion error"), + } + } + pub fn close_and_checked_sub_from(&self, from: Duration) -> Result { + let val = self.inner.close()?; + let val = Duration::from_micros(val as u64); + let subbed = match from.checked_sub(val) { + Some(v) => v, + None => return Err("Duration::checked_sub"), + }; + Ok(subbed) + } +} + +#[cfg(test)] +mod tests { + + use super::*; + + #[test] + fn test_basic() { + let counter = MicroSecondsCounterU32::default(); + counter.open().unwrap(); + counter.add(Duration::from_micros(23)).unwrap(); + let res = counter + .close_and_checked_sub_from(Duration::from_micros(42)) + .unwrap(); + assert_eq!(res, Duration::from_micros(42 - 23)); + } +} diff --git a/pageserver/src/control_plane_client.rs b/pageserver/src/control_plane_client.rs index 950791ea48..1b3d76335d 100644 --- a/pageserver/src/control_plane_client.rs +++ b/pageserver/src/control_plane_client.rs @@ -2,17 +2,21 @@ use std::collections::HashMap; use futures::Future; use pageserver_api::{ - control_api::{ + controller_api::NodeRegisterRequest, + shard::TenantShardId, + upcall_api::{ ReAttachRequest, ReAttachResponse, ValidateRequest, ValidateRequestTenant, ValidateResponse, }, - shard::TenantShardId, }; use serde::{de::DeserializeOwned, Serialize}; use tokio_util::sync::CancellationToken; use url::Url; use utils::{backoff, 
generation::Generation, id::NodeId}; -use crate::config::PageServerConf; +use crate::{ + config::{NodeMetadata, PageServerConf}, + virtual_file::on_fatal_io_error, +}; /// The Pageserver's client for using the control plane API: this is a small subset /// of the overall control plane API, for dealing with generations (see docs/rfcs/025-generation-numbers.md) @@ -32,6 +36,7 @@ pub enum RetryForeverError { pub trait ControlPlaneGenerationsApi { fn re_attach( &self, + conf: &PageServerConf, ) -> impl Future, RetryForeverError>> + Send; fn validate( &self, @@ -82,58 +87,87 @@ impl ControlPlaneClient { R: Serialize, T: DeserializeOwned, { - #[derive(thiserror::Error, Debug)] - enum RemoteAttemptError { - #[error("shutdown")] - Shutdown, - #[error("remote: {0}")] - Remote(reqwest::Error), - } - - match backoff::retry( + let res = backoff::retry( || async { let response = self .http_client .post(url.clone()) .json(&request) .send() - .await - .map_err(RemoteAttemptError::Remote)?; + .await?; - response - .error_for_status_ref() - .map_err(RemoteAttemptError::Remote)?; - response - .json::() - .await - .map_err(RemoteAttemptError::Remote) + response.error_for_status_ref()?; + response.json::().await }, |_| false, 3, u32::MAX, "calling control plane generation validation API", - backoff::Cancel::new(self.cancel.clone(), || RemoteAttemptError::Shutdown), + &self.cancel, ) .await - { - Err(RemoteAttemptError::Shutdown) => Err(RetryForeverError::ShuttingDown), - Err(RemoteAttemptError::Remote(_)) => { - panic!("We retry forever, this should never be reached"); - } - Ok(r) => Ok(r), - } + .ok_or(RetryForeverError::ShuttingDown)? 
+ .expect("We retry forever, this should never be reached"); + + Ok(res) } } impl ControlPlaneGenerationsApi for ControlPlaneClient { /// Block until we get a successful response, or error out if we are shut down - async fn re_attach(&self) -> Result, RetryForeverError> { + async fn re_attach( + &self, + conf: &PageServerConf, + ) -> Result, RetryForeverError> { let re_attach_path = self .base_url .join("re-attach") .expect("Failed to build re-attach path"); + + // Include registration content in the re-attach request if a metadata file is readable + let metadata_path = conf.metadata_path(); + let register = match tokio::fs::read_to_string(&metadata_path).await { + Ok(metadata_str) => match serde_json::from_str::(&metadata_str) { + Ok(m) => { + // Since we run one time at startup, be generous in our logging and + // dump all metadata. + tracing::info!( + "Loaded node metadata: postgres {}:{}, http {}:{}, other fields: {:?}", + m.postgres_host, + m.postgres_port, + m.http_host, + m.http_port, + m.other + ); + + Some(NodeRegisterRequest { + node_id: conf.id, + listen_pg_addr: m.postgres_host, + listen_pg_port: m.postgres_port, + listen_http_addr: m.http_host, + listen_http_port: m.http_port, + }) + } + Err(e) => { + tracing::error!("Unreadable metadata in {metadata_path}: {e}"); + None + } + }, + Err(e) => { + if e.kind() == std::io::ErrorKind::NotFound { + // This is legal: we may have been deployed with some external script + // doing registration for us. 
+ tracing::info!("Metadata file not found at {metadata_path}"); + } else { + on_fatal_io_error(&e, &format!("Loading metadata at {metadata_path}")) + } + None + } + }; + let request = ReAttachRequest { node_id: self.node_id, + register, }; fail::fail_point!("control-plane-client-re-attach"); diff --git a/pageserver/src/deletion_queue.rs b/pageserver/src/deletion_queue.rs index 6a820e1bdc..b6aea8fae8 100644 --- a/pageserver/src/deletion_queue.rs +++ b/pageserver/src/deletion_queue.rs @@ -20,10 +20,9 @@ use remote_storage::{GenericRemoteStorage, RemotePath}; use serde::Deserialize; use serde::Serialize; use thiserror::Error; -use tokio; use tokio_util::sync::CancellationToken; use tracing::Instrument; -use tracing::{self, debug, error}; +use tracing::{debug, error}; use utils::crashsafe::path_with_suffix_extension; use utils::generation::Generation; use utils::id::TimelineId; @@ -234,7 +233,7 @@ impl DeletionHeader { let header_bytes = serde_json::to_vec(self).context("serialize deletion header")?; let header_path = conf.deletion_header_path(); let temp_path = path_with_suffix_extension(&header_path, TEMP_SUFFIX); - VirtualFile::crashsafe_overwrite(&header_path, &temp_path, &header_bytes) + VirtualFile::crashsafe_overwrite(header_path, temp_path, header_bytes) .await .maybe_fatal_err("save deletion header")?; @@ -325,7 +324,8 @@ impl DeletionList { let temp_path = path_with_suffix_extension(&path, TEMP_SUFFIX); let bytes = serde_json::to_vec(self).expect("Failed to serialize deletion list"); - VirtualFile::crashsafe_overwrite(&path, &temp_path, &bytes) + + VirtualFile::crashsafe_overwrite(path, temp_path, bytes) .await .maybe_fatal_err("save deletion list") .map_err(Into::into) @@ -700,8 +700,6 @@ impl DeletionQueue { } pub async fn shutdown(&mut self, timeout: Duration) { - self.cancel.cancel(); - match tokio::time::timeout(timeout, self.client.flush()).await { Ok(Ok(())) => { tracing::info!("Deletion queue flushed successfully on shutdown") @@ -715,6 +713,10 @@ 
impl DeletionQueue { tracing::warn!("Timed out flushing deletion queue on shutdown") } } + + // We only cancel _after_ flushing: otherwise we would be shutting down the + // components that do the flush. + self.cancel.cancel(); } } @@ -723,7 +725,7 @@ mod test { use camino::Utf8Path; use hex_literal::hex; use pageserver_api::shard::ShardIndex; - use std::{io::ErrorKind, time::Duration}; + use std::io::ErrorKind; use tracing::info; use remote_storage::{RemoteStorageConfig, RemoteStorageKind}; @@ -732,10 +734,7 @@ mod test { use crate::{ control_plane_client::RetryForeverError, repository::Key, - tenant::{ - harness::TenantHarness, remote_timeline_client::remote_timeline_path, - storage_layer::DeltaFileName, - }, + tenant::{harness::TenantHarness, storage_layer::DeltaFileName}, }; use super::*; @@ -832,8 +831,10 @@ mod test { } impl ControlPlaneGenerationsApi for MockControlPlane { - #[allow(clippy::diverging_sub_expression)] // False positive via async_trait - async fn re_attach(&self) -> Result, RetryForeverError> { + async fn re_attach( + &self, + _conf: &PageServerConf, + ) -> Result, RetryForeverError> { unimplemented!() } async fn validate( @@ -866,6 +867,7 @@ mod test { let remote_fs_dir = harness.conf.workdir.join("remote_fs").canonicalize_utf8()?; let storage_config = RemoteStorageConfig { storage: RemoteStorageKind::LocalFs(remote_fs_dir.clone()), + timeout: RemoteStorageConfig::DEFAULT_TIMEOUT, }; let storage = GenericRemoteStorage::from_config(&storage_config).unwrap(); @@ -1158,17 +1160,13 @@ mod test { pub(crate) mod mock { use tracing::info; - use crate::tenant::remote_timeline_client::remote_layer_path; - use super::*; - use std::sync::{ - atomic::{AtomicUsize, Ordering}, - Arc, - }; + use std::sync::atomic::{AtomicUsize, Ordering}; pub struct ConsumerState { rx: tokio::sync::mpsc::UnboundedReceiver, executor_rx: tokio::sync::mpsc::Receiver, + cancel: CancellationToken, } impl ConsumerState { @@ -1182,7 +1180,7 @@ pub(crate) mod mock { match msg { 
DeleterMessage::Delete(objects) => { for path in objects { - match remote_storage.delete(&path).await { + match remote_storage.delete(&path, &self.cancel).await { Ok(_) => { debug!("Deleted {path}"); } @@ -1215,7 +1213,7 @@ pub(crate) mod mock { for path in objects { info!("Executing deletion {path}"); - match remote_storage.delete(&path).await { + match remote_storage.delete(&path, &self.cancel).await { Ok(_) => { debug!("Deleted {path}"); } @@ -1265,7 +1263,11 @@ pub(crate) mod mock { executor_tx, executed, remote_storage, - consumer: std::sync::Mutex::new(ConsumerState { rx, executor_rx }), + consumer: std::sync::Mutex::new(ConsumerState { + rx, + executor_rx, + cancel: CancellationToken::new(), + }), lsn_table: Arc::new(std::sync::RwLock::new(VisibleLsnUpdates::new())), } } diff --git a/pageserver/src/deletion_queue/deleter.rs b/pageserver/src/deletion_queue/deleter.rs index 57421b1547..1f04bc0410 100644 --- a/pageserver/src/deletion_queue/deleter.rs +++ b/pageserver/src/deletion_queue/deleter.rs @@ -8,6 +8,7 @@ use remote_storage::GenericRemoteStorage; use remote_storage::RemotePath; +use remote_storage::TimeoutOrCancel; use remote_storage::MAX_KEYS_PER_DELETE; use std::time::Duration; use tokio_util::sync::CancellationToken; @@ -71,15 +72,19 @@ impl Deleter { Err(anyhow::anyhow!("failpoint: deletion-queue-before-execute")) }); - self.remote_storage.delete_objects(&self.accumulator).await + self.remote_storage + .delete_objects(&self.accumulator, &self.cancel) + .await }, - |_| false, + TimeoutOrCancel::caused_by_cancel, 3, 10, "executing deletion batch", - backoff::Cancel::new(self.cancel.clone(), || anyhow::anyhow!("Shutting down")), + &self.cancel, ) .await + .ok_or_else(|| anyhow::anyhow!("Shutting down")) + .and_then(|x| x) } /// Block until everything in accumulator has been executed diff --git a/pageserver/src/disk_usage_eviction_task.rs b/pageserver/src/disk_usage_eviction_task.rs index 800e52bb51..92c1475aef 100644 --- 
a/pageserver/src/disk_usage_eviction_task.rs +++ b/pageserver/src/disk_usage_eviction_task.rs @@ -58,6 +58,7 @@ use utils::{completion, id::TimelineId}; use crate::{ config::PageServerConf, + metrics::disk_usage_based_eviction::METRICS, task_mgr::{self, TaskKind, BACKGROUND_RUNTIME}, tenant::{ self, @@ -65,7 +66,6 @@ use crate::{ remote_timeline_client::LayerFileMetadata, secondary::SecondaryTenant, storage_layer::{AsLayerDesc, EvictionError, Layer, LayerFileName}, - Timeline, }, }; @@ -97,23 +97,86 @@ pub enum EvictionOrder { /// Order the layers to be evicted by how recently they have been accessed relatively within /// the set of resident layers of a tenant. - /// - /// This strategy will evict layers more fairly but is untested. RelativeAccessed { - #[serde(default)] + /// Determines if the tenant with most layers should lose first. + /// + /// Having this enabled is currently the only reasonable option, because the order in which + /// we read tenants is deterministic. If we find the need to use this as `false`, we need + /// to ensure nondeterminism by adding in a random number to break the + /// `relative_last_activity==0.0` ties. + #[serde(default = "default_highest_layer_count_loses_first")] highest_layer_count_loses_first: bool, }, } +fn default_highest_layer_count_loses_first() -> bool { + true +} + impl EvictionOrder { - /// Return true, if with [`Self::RelativeAccessed`] order the tenants with the highest layer - /// counts should be the first ones to have their layers evicted. - fn highest_layer_count_loses_first(&self) -> bool { + fn sort(&self, candidates: &mut [(MinResidentSizePartition, EvictionCandidate)]) { + use EvictionOrder::*; + match self { - EvictionOrder::AbsoluteAccessed => false, - EvictionOrder::RelativeAccessed { + AbsoluteAccessed => { + candidates.sort_unstable_by_key(|(partition, candidate)| { + (*partition, candidate.last_activity_ts) + }); + } + RelativeAccessed { .. 
} => candidates.sort_unstable_by_key(|(partition, candidate)| { + (*partition, candidate.relative_last_activity) + }), + } + } + + /// Called to fill in the [`EvictionCandidate::relative_last_activity`] while iterating tenants + /// layers in **most** recently used order. + fn relative_last_activity(&self, total: usize, index: usize) -> finite_f32::FiniteF32 { + use EvictionOrder::*; + + match self { + AbsoluteAccessed => finite_f32::FiniteF32::ZERO, + RelativeAccessed { highest_layer_count_loses_first, - } => *highest_layer_count_loses_first, + } => { + // keeping the -1 or not decides if every tenant should lose their least recently accessed + // layer OR if this should happen in the order of having highest layer count: + let fudge = if *highest_layer_count_loses_first { + // relative_last_activity vs. tenant layer count: + // - 0.1..=1.0 (10 layers) + // - 0.01..=1.0 (100 layers) + // - 0.001..=1.0 (1000 layers) + // + // leading to evicting less of the smallest tenants. + 0 + } else { + // use full 0.0..=1.0 range, which means even the smallest tenants could always lose a + // layer. the actual ordering is unspecified: for 10k tenants on a pageserver it could + // be that less than 10k layer evictions is enough, so we would not need to evict from + // all tenants. + // + // as the tenant ordering is now deterministic this could hit the same tenants + // disproportionetly on multiple invocations. alternative could be to remember how many + // layers did we evict last time from this tenant, and inject that as an additional + // fudge here. 
+ 1 + }; + + let total = total.checked_sub(fudge).filter(|&x| x > 1).unwrap_or(1); + let divider = total as f32; + + // most recently used is always (total - 0) / divider == 1.0 + // least recently used depends on the fudge: + // - (total - 1) - (total - 1) / total => 0 / total + // - total - (total - 1) / total => 1 / total + let distance = (total - index) as f32; + + finite_f32::FiniteF32::try_from_normalized(distance / divider) + .unwrap_or_else(|val| { + tracing::warn!(%fudge, "calculated invalid relative_last_activity for i={index}, total={total}: {val}"); + finite_f32::FiniteF32::ZERO + }) + } } } } @@ -288,7 +351,6 @@ pub enum IterationOutcome { Finished(IterationOutcomeFinished), } -#[allow(dead_code)] #[derive(Debug, Serialize)] pub struct IterationOutcomeFinished { /// The actual usage observed before we started the iteration. @@ -303,7 +365,6 @@ pub struct IterationOutcomeFinished { } #[derive(Debug, Serialize)] -#[allow(dead_code)] struct AssumedUsage { /// The expected value for `after`, after phase 2. projected_after: U, @@ -311,14 +372,12 @@ struct AssumedUsage { failed: LayerCount, } -#[allow(dead_code)] #[derive(Debug, Serialize)] struct PlannedUsage { respecting_tenant_min_resident_size: U, fallback_to_global_lru: Option, } -#[allow(dead_code)] #[derive(Debug, Default, Serialize)] struct LayerCount { file_sizes: u64, @@ -350,13 +409,23 @@ pub(crate) async fn disk_usage_eviction_task_iteration_impl( "running disk usage based eviction due to pressure" ); - let candidates = + let (candidates, collection_time) = { + let started_at = std::time::Instant::now(); match collect_eviction_candidates(tenant_manager, eviction_order, cancel).await? 
{ EvictionCandidates::Cancelled => { return Ok(IterationOutcome::Cancelled); } - EvictionCandidates::Finished(partitioned) => partitioned, - }; + EvictionCandidates::Finished(partitioned) => (partitioned, started_at.elapsed()), + } + }; + + METRICS.layers_collected.inc_by(candidates.len() as u64); + + tracing::info!( + elapsed_ms = collection_time.as_millis(), + total_layers = candidates.len(), + "collection completed" + ); // Debug-log the list of candidates let now = SystemTime::now(); @@ -387,55 +456,10 @@ pub(crate) async fn disk_usage_eviction_task_iteration_impl( // the tenant's min-resident-size threshold, print a warning, and memorize the disk // usage at that point, in 'usage_planned_min_resident_size_respecting'. - let selection = select_victims(&candidates, usage_pre); + let (evicted_amount, usage_planned) = + select_victims(&candidates, usage_pre).into_amount_and_planned(); - let mut candidates = candidates; - - let selection = if matches!(eviction_order, EvictionOrder::RelativeAccessed { .. }) { - // we currently have the layers ordered by AbsoluteAccessed so that we can get the summary - // for comparison here. this is a temporary measure to develop alternatives. 
- use std::fmt::Write; - - let mut summary_buf = String::with_capacity(256); - - { - let absolute_summary = candidates - .iter() - .take(selection.amount) - .map(|(_, candidate)| candidate) - .collect::(); - - write!(summary_buf, "{absolute_summary}").expect("string grows"); - - info!("absolute accessed selection summary: {summary_buf}"); - } - - candidates.sort_unstable_by_key(|(partition, candidate)| { - (*partition, candidate.relative_last_activity) - }); - - let selection = select_victims(&candidates, usage_pre); - - { - summary_buf.clear(); - - let relative_summary = candidates - .iter() - .take(selection.amount) - .map(|(_, candidate)| candidate) - .collect::(); - - write!(summary_buf, "{relative_summary}").expect("string grows"); - - info!("relative accessed selection summary: {summary_buf}"); - } - - selection - } else { - selection - }; - - let (evicted_amount, usage_planned) = selection.into_amount_and_planned(); + METRICS.layers_selected.inc_by(evicted_amount as u64); // phase2: evict layers @@ -464,9 +488,15 @@ pub(crate) async fn disk_usage_eviction_task_iteration_impl( if let Some(next) = next { match next { Ok(Ok(file_size)) => { + METRICS.layers_evicted.inc(); usage_assumed.add_available_bytes(file_size); } - Ok(Err((file_size, EvictionError::NotFound | EvictionError::Downloaded))) => { + Ok(Err(( + file_size, + EvictionError::NotFound + | EvictionError::Downloaded + | EvictionError::Timeout, + ))) => { evictions_failed.file_sizes += file_size; evictions_failed.count += 1; } @@ -482,7 +512,10 @@ pub(crate) async fn disk_usage_eviction_task_iteration_impl( // calling again when consumed_all is fine as evicted is fused. 
let Some((_partition, candidate)) = evicted.next() else { - consumed_all = true; + if !consumed_all { + tracing::info!("all evictions started, waiting"); + consumed_all = true; + } continue; }; @@ -490,11 +523,15 @@ pub(crate) async fn disk_usage_eviction_task_iteration_impl( EvictionLayer::Attached(layer) => { let file_size = layer.layer_desc().file_size; js.spawn(async move { - layer - .evict_and_wait() - .await - .map(|()| file_size) - .map_err(|e| (file_size, e)) + // have a low eviction waiting timeout because our LRU calculations go stale fast; + // also individual layer evictions could hang because of bugs and we do not want to + // pause disk_usage_based_eviction for such. + let timeout = std::time::Duration::from_secs(5); + + match layer.evict_and_wait(timeout).await { + Ok(()) => Ok(file_size), + Err(e) => Err((file_size, e)), + } }); } EvictionLayer::Secondary(layer) => { @@ -516,6 +553,30 @@ pub(crate) async fn disk_usage_eviction_task_iteration_impl( (usage_assumed, evictions_failed) }; + let started_at = std::time::Instant::now(); + + let evict_layers = async move { + let mut evict_layers = std::pin::pin!(evict_layers); + + let maximum_expected = std::time::Duration::from_secs(10); + + let res = tokio::time::timeout(maximum_expected, &mut evict_layers).await; + let tuple = if let Ok(tuple) = res { + tuple + } else { + let elapsed = started_at.elapsed(); + tracing::info!(elapsed_ms = elapsed.as_millis(), "still ongoing"); + evict_layers.await + }; + + let elapsed = started_at.elapsed(); + tracing::info!(elapsed_ms = elapsed.as_millis(), "completed"); + tuple + }; + + let evict_layers = + evict_layers.instrument(tracing::info_span!("evict_layers", layers=%evicted_amount)); + let (usage_assumed, evictions_failed) = tokio::select! 
{ tuple = evict_layers => { tuple }, _ = cancel.cancelled() => { @@ -548,7 +609,6 @@ pub(crate) struct EvictionSecondaryLayer { #[derive(Clone)] pub(crate) enum EvictionLayer { Attached(Layer), - #[allow(dead_code)] Secondary(EvictionSecondaryLayer), } @@ -606,6 +666,7 @@ impl std::fmt::Display for EvictionLayer { } } +#[derive(Default)] pub(crate) struct DiskUsageEvictionInfo { /// Timeline's largest layer (remote or resident) pub max_layer_size: Option, @@ -750,6 +811,8 @@ async fn collect_eviction_candidates( eviction_order: EvictionOrder, cancel: &CancellationToken, ) -> anyhow::Result { + const LOG_DURATION_THRESHOLD: std::time::Duration = std::time::Duration::from_secs(10); + // get a snapshot of the list of tenants let tenants = tenant::mgr::list_tenants() .await @@ -778,6 +841,8 @@ async fn collect_eviction_candidates( continue; } + let started_at = std::time::Instant::now(); + // collect layers from all timelines in this tenant // // If one of the timelines becomes `!is_active()` during the iteration, @@ -792,6 +857,7 @@ async fn collect_eviction_candidates( } let info = tl.get_local_layers_for_disk_usage_eviction().await; debug!(tenant_id=%tl.tenant_shard_id.tenant_id, shard_id=%tl.tenant_shard_id.shard_slug(), timeline_id=%tl.timeline_id, "timeline resident layers count: {}", info.resident_layers.len()); + tenant_candidates.extend(info.resident_layers.into_iter()); max_layer_size = max_layer_size.max(info.max_layer_size.unwrap_or(0)); @@ -835,62 +901,46 @@ async fn collect_eviction_candidates( .sort_unstable_by_key(|layer_info| std::cmp::Reverse(layer_info.last_activity_ts)); let mut cumsum: i128 = 0; - // keeping the -1 or not decides if every tenant should lose their least recently accessed - // layer OR if this should happen in the order of having highest layer count: - let fudge = if eviction_order.highest_layer_count_loses_first() { - // relative_age vs. 
tenant layer count: - // - 0.1..=1.0 (10 layers) - // - 0.01..=1.0 (100 layers) - // - 0.001..=1.0 (1000 layers) - // - // leading to evicting less of the smallest tenants. - 0 - } else { - // use full 0.0..=1.0 range, which means even the smallest tenants could always lose a - // layer. the actual ordering is unspecified: for 10k tenants on a pageserver it could - // be that less than 10k layer evictions is enough, so we would not need to evict from - // all tenants. - // - // as the tenant ordering is now deterministic this could hit the same tenants - // disproportionetly on multiple invocations. alternative could be to remember how many - // layers did we evict last time from this tenant, and inject that as an additional - // fudge here. - 1 - }; + let total = tenant_candidates.len(); - let total = tenant_candidates - .len() - .checked_sub(fudge) - .filter(|&x| x > 0) - // support 0 or 1 resident layer tenants as well - .unwrap_or(1); - let divider = total as f32; + let tenant_candidates = + tenant_candidates + .into_iter() + .enumerate() + .map(|(i, mut candidate)| { + // as we iterate this reverse sorted list, the most recently accessed layer will always + // be 1.0; this is for us to evict it last. + candidate.relative_last_activity = + eviction_order.relative_last_activity(total, i); - for (i, mut candidate) in tenant_candidates.into_iter().enumerate() { - // as we iterate this reverse sorted list, the most recently accessed layer will always - // be 1.0; this is for us to evict it last. - candidate.relative_last_activity = if matches!( - eviction_order, - EvictionOrder::RelativeAccessed { .. } - ) { - // another possibility: use buckets, like (256.0 * relative_last_activity) as u8 or - // similarly for u16. unsure how it would help. 
- finite_f32::FiniteF32::try_from_normalized((total - i) as f32 / divider) - .unwrap_or_else(|val| { - tracing::warn!(%fudge, "calculated invalid relative_last_activity for i={i}, total={total}: {val}"); - finite_f32::FiniteF32::ZERO - }) - } else { - finite_f32::FiniteF32::ZERO - }; + let partition = if cumsum > min_resident_size as i128 { + MinResidentSizePartition::Above + } else { + MinResidentSizePartition::Below + }; + cumsum += i128::from(candidate.layer.get_file_size()); - let partition = if cumsum > min_resident_size as i128 { - MinResidentSizePartition::Above - } else { - MinResidentSizePartition::Below - }; - cumsum += i128::from(candidate.layer.get_file_size()); - candidates.push((partition, candidate)); + (partition, candidate) + }); + + METRICS + .tenant_layer_count + .observe(tenant_candidates.len() as f64); + + candidates.extend(tenant_candidates); + + let elapsed = started_at.elapsed(); + METRICS + .tenant_collection_time + .observe(elapsed.as_secs_f64()); + + if elapsed > LOG_DURATION_THRESHOLD { + tracing::info!( + tenant_id=%tenant.tenant_shard_id().tenant_id, + shard_id=%tenant.tenant_shard_id().shard_slug(), + elapsed_ms = elapsed.as_millis(), + "collection took longer than threshold" + ); } } @@ -906,31 +956,68 @@ async fn collect_eviction_candidates( }, ); - for secondary_tenant in secondary_tenants { - let mut layer_info = secondary_tenant.get_layers_for_eviction(); + for tenant in secondary_tenants { + // for secondary tenants we use a sum of on_disk layers and already evicted layers. this is + // to prevent repeated disk usage based evictions from completely draining less often + // updating secondaries. 
+ let (mut layer_info, total_layers) = tenant.get_layers_for_eviction(); + + debug_assert!( + total_layers >= layer_info.resident_layers.len(), + "total_layers ({total_layers}) must be at least the resident_layers.len() ({})", + layer_info.resident_layers.len() + ); + + let started_at = std::time::Instant::now(); layer_info .resident_layers .sort_unstable_by_key(|layer_info| std::cmp::Reverse(layer_info.last_activity_ts)); - candidates.extend(layer_info.resident_layers.into_iter().map(|candidate| { - ( - // Secondary locations' layers are always considered above the min resident size, - // i.e. secondary locations are permitted to be trimmed to zero layers if all - // the layers have sufficiently old access times. - MinResidentSizePartition::Above, - candidate, - ) - })); + let tenant_candidates = + layer_info + .resident_layers + .into_iter() + .enumerate() + .map(|(i, mut candidate)| { + candidate.relative_last_activity = + eviction_order.relative_last_activity(total_layers, i); + ( + // Secondary locations' layers are always considered above the min resident size, + // i.e. secondary locations are permitted to be trimmed to zero layers if all + // the layers have sufficiently old access times. 
+ MinResidentSizePartition::Above, + candidate, + ) + }); + + METRICS + .tenant_layer_count + .observe(tenant_candidates.len() as f64); + candidates.extend(tenant_candidates); + + tokio::task::yield_now().await; + + let elapsed = started_at.elapsed(); + + METRICS + .tenant_collection_time + .observe(elapsed.as_secs_f64()); + + if elapsed > LOG_DURATION_THRESHOLD { + tracing::info!( + tenant_id=%tenant.tenant_shard_id().tenant_id, + shard_id=%tenant.tenant_shard_id().shard_slug(), + elapsed_ms = elapsed.as_millis(), + "collection took longer than threshold" + ); + } } debug_assert!(MinResidentSizePartition::Above < MinResidentSizePartition::Below, "as explained in the function's doc comment, layers that aren't in the tenant's min_resident_size are evicted first"); - // always behave as if AbsoluteAccessed was selected. if RelativeAccessed is in use, we - // will sort later by candidate.relative_last_activity to get compare evictions. - candidates - .sort_unstable_by_key(|(partition, candidate)| (*partition, candidate.last_activity_ts)); + eviction_order.sort(&mut candidates); Ok(EvictionCandidates::Finished(candidates)) } @@ -1001,30 +1088,6 @@ impl VictimSelection { } } -struct TimelineKey(Arc); - -impl PartialEq for TimelineKey { - fn eq(&self, other: &Self) -> bool { - Arc::ptr_eq(&self.0, &other.0) - } -} - -impl Eq for TimelineKey {} - -impl std::hash::Hash for TimelineKey { - fn hash(&self, state: &mut H) { - Arc::as_ptr(&self.0).hash(state); - } -} - -impl std::ops::Deref for TimelineKey { - type Target = Timeline; - - fn deref(&self) -> &Self::Target { - self.0.as_ref() - } -} - /// A totally ordered f32 subset we can use with sorting functions. 
pub(crate) mod finite_f32 { @@ -1070,6 +1133,12 @@ pub(crate) mod finite_f32 { } } + impl From for f32 { + fn from(value: FiniteF32) -> f32 { + value.0 + } + } + impl FiniteF32 { pub const ZERO: FiniteF32 = FiniteF32(0.0); @@ -1082,136 +1151,9 @@ pub(crate) mod finite_f32 { Err(value) } } - } -} -mod summary { - use super::finite_f32::FiniteF32; - use super::{EvictionCandidate, LayerCount}; - use pageserver_api::shard::TenantShardId; - use std::collections::{BTreeMap, HashMap}; - use std::time::SystemTime; - - #[derive(Debug, Default)] - pub(super) struct EvictionSummary { - evicted_per_tenant: HashMap, - total: LayerCount, - - last_absolute: Option, - last_relative: Option, - } - - impl<'a> FromIterator<&'a EvictionCandidate> for EvictionSummary { - fn from_iter>(iter: T) -> Self { - let mut summary = EvictionSummary::default(); - for item in iter { - let counts = summary - .evicted_per_tenant - .entry(*item.layer.get_tenant_shard_id()) - .or_default(); - - let sz = item.layer.get_file_size(); - - counts.file_sizes += sz; - counts.count += 1; - - summary.total.file_sizes += sz; - summary.total.count += 1; - - summary.last_absolute = Some(item.last_activity_ts); - summary.last_relative = Some(item.relative_last_activity); - } - - summary - } - } - - struct SiBytesAmount(u64); - - impl std::fmt::Display for SiBytesAmount { - fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { - if self.0 < 1024 { - return write!(f, "{}B", self.0); - } - - let mut tmp = self.0; - let mut ch = 0; - let suffixes = b"KMGTPE"; - - while tmp > 1024 * 1024 && ch < suffixes.len() - 1 { - tmp /= 1024; - ch += 1; - } - - let ch = suffixes[ch] as char; - - write!(f, "{:.1}{ch}iB", tmp as f64 / 1024.0) - } - } - - impl std::fmt::Display for EvictionSummary { - fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { - // wasteful, but it's for testing - - let mut sorted: BTreeMap> = BTreeMap::new(); - - for (tenant_shard_id, count) in &self.evicted_per_tenant { - 
sorted - .entry(count.count) - .or_default() - .push((*tenant_shard_id, count.file_sizes)); - } - - let total_file_sizes = SiBytesAmount(self.total.file_sizes); - - writeln!( - f, - "selected {} layers of {total_file_sizes} up to ({:?}, {:.2?}):", - self.total.count, self.last_absolute, self.last_relative, - )?; - - for (count, per_tenant) in sorted.iter().rev().take(10) { - write!(f, "- {count} layers: ")?; - - if per_tenant.len() < 3 { - for (i, (tenant_shard_id, bytes)) in per_tenant.iter().enumerate() { - if i > 0 { - write!(f, ", ")?; - } - let bytes = SiBytesAmount(*bytes); - write!(f, "{tenant_shard_id} ({bytes})")?; - } - } else { - let num_tenants = per_tenant.len(); - let total_bytes = per_tenant.iter().map(|(_id, bytes)| bytes).sum::(); - let total_bytes = SiBytesAmount(total_bytes); - let layers = num_tenants * count; - - write!( - f, - "{num_tenants} tenants {total_bytes} in total {layers} layers", - )?; - } - - writeln!(f)?; - } - - if sorted.len() > 10 { - let (rem_count, rem_bytes) = sorted - .iter() - .rev() - .map(|(count, per_tenant)| { - ( - count, - per_tenant.iter().map(|(_id, bytes)| bytes).sum::(), - ) - }) - .fold((0, 0), |acc, next| (acc.0 + next.0, acc.1 + next.1)); - let rem_bytes = SiBytesAmount(rem_bytes); - writeln!(f, "- rest of tenants ({}) not shown ({rem_count} layers or {:.1}%, {rem_bytes} or {:.1}% bytes)", sorted.len() - 10, 100.0 * rem_count as f64 / self.total.count as f64, 100.0 * rem_bytes.0 as f64 / self.total.file_sizes as f64)?; - } - - Ok(()) + pub fn into_inner(self) -> f32 { + self.into() } } } @@ -1225,7 +1167,6 @@ mod filesystem_level_usage { use super::DiskUsageEvictionTaskConfig; #[derive(Debug, Clone, Copy)] - #[allow(dead_code)] pub struct Usage<'a> { config: &'a DiskUsageEvictionTaskConfig, @@ -1336,3 +1277,40 @@ mod filesystem_level_usage { assert!(!usage.has_pressure()); } } + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn relative_equal_bounds() { + let order = EvictionOrder::RelativeAccessed { 
+ highest_layer_count_loses_first: false, + }; + + let len = 10; + let v = (0..len) + .map(|i| order.relative_last_activity(len, i).into_inner()) + .collect::>(); + + assert_eq!(v.first(), Some(&1.0)); + assert_eq!(v.last(), Some(&0.0)); + assert!(v.windows(2).all(|slice| slice[0] > slice[1])); + } + + #[test] + fn relative_spare_bounds() { + let order = EvictionOrder::RelativeAccessed { + highest_layer_count_loses_first: true, + }; + + let len = 10; + let v = (0..len) + .map(|i| order.relative_last_activity(len, i).into_inner()) + .collect::>(); + + assert_eq!(v.first(), Some(&1.0)); + assert_eq!(v.last(), Some(&0.1)); + assert!(v.windows(2).all(|slice| slice[0] > slice[1])); + } +} diff --git a/pageserver/src/http/openapi_spec.yml b/pageserver/src/http/openapi_spec.yml index a49eef8bb9..6a070e2135 100644 --- a/pageserver/src/http/openapi_spec.yml +++ b/pageserver/src/http/openapi_spec.yml @@ -178,6 +178,64 @@ paths: schema: $ref: "#/components/schemas/ServiceUnavailableError" + /v1/tenant/{tenant_id}/time_travel_remote_storage: + parameters: + - name: tenant_id + in: path + required: true + schema: + type: string + - name: travel_to + in: query + required: true + schema: + type: string + format: date-time + - name: done_if_after + in: query + required: true + schema: + type: string + format: date-time + put: + description: Time travel the tenant's remote storage + responses: + "200": + description: OK + content: + application/json: + schema: + type: string + "400": + description: Error when no tenant id found in path or invalid timestamp + content: + application/json: + schema: + $ref: "#/components/schemas/Error" + "401": + description: Unauthorized Error + content: + application/json: + schema: + $ref: "#/components/schemas/UnauthorizedError" + "403": + description: Forbidden Error + content: + application/json: + schema: + $ref: "#/components/schemas/ForbiddenError" + "500": + description: Generic operation error + content: + application/json: + schema: + 
$ref: "#/components/schemas/Error" + "503": + description: Temporarily unavailable, please retry. + content: + application/json: + schema: + $ref: "#/components/schemas/ServiceUnavailableError" /v1/tenant/{tenant_id}/timeline: parameters: @@ -419,12 +477,6 @@ paths: type: string format: date-time description: A timestamp to get the LSN - - name: version - in: query - required: false - schema: - type: integer - description: The version of the endpoint to use responses: "200": description: OK @@ -515,114 +567,6 @@ paths: application/json: schema: $ref: "#/components/schemas/ServiceUnavailableError" - - /v1/tenant/{tenant_id}/attach: - parameters: - - name: tenant_id - in: path - required: true - schema: - type: string - post: - description: | - Schedules attach operation to happen in the background for the given tenant. - As soon as the caller sends this request, it must assume the pageserver - starts writing to the tenant's S3 state unless it receives one of the - distinguished errors below that state otherwise. - - If a client receives a not-distinguished response, e.g., a network timeout, - it MUST retry the /attach request and poll again for the tenant's - attachment status. - - After the client has received a 202, it MUST poll the tenant's - attachment status (field `attachment_status`) to reach state `attached`. - If the `attachment_status` is missing, the client MUST retry the `/attach` - request (goto previous paragraph). This is a robustness measure in case the tenant - status endpoint is buggy, but the attach operation is ongoing. - - There is no way to cancel an in-flight request. - - In any case, the client - * MUST NOT ASSUME that the /attach request has been lost in the network, - * MUST NOT ASSUME that the request has been lost, based on the observation - that a subsequent tenant status request returns 404. The request may - still be in flight. It must be retried. - - The client SHOULD supply a `TenantConfig` for the tenant in the request body. 
- Settings specified in the config override the pageserver's defaults. - It is guaranteed that the config settings are applied before the pageserver - starts operating on the tenant. E.g., if the config specifies a specific - PITR interval for a tenant, then that setting will be in effect before the - pageserver starts the garbage collection loop. This enables a client to - guarantee a specific PITR setting across detach/attach cycles. - The pageserver will reject the request if it cannot parse the config, or - if there are any unknown fields in it. - - If the client does not supply a config, the pageserver will use its defaults. - This behavior is deprecated: https://github.com/neondatabase/neon/issues/4282 - requestBody: - required: false - content: - application/json: - schema: - $ref: "#/components/schemas/TenantAttachRequest" - responses: - "202": - description: Tenant attaching scheduled - "400": - description: Bad Request - content: - application/json: - schema: - $ref: "#/components/schemas/Error" - "401": - description: Unauthorized Error - content: - application/json: - schema: - $ref: "#/components/schemas/UnauthorizedError" - "403": - description: Forbidden Error - content: - application/json: - schema: - $ref: "#/components/schemas/ForbiddenError" - "404": - description: Timeline not found - content: - application/json: - schema: - $ref: "#/components/schemas/NotFoundError" - "409": - description: | - The tenant is already known to Pageserver in some way, - and hence this `/attach` call has been rejected. - - Some examples of how this can happen: - - tenant was created on this pageserver - - tenant attachment was started by an earlier call to `/attach`. - - Callers should poll the tenant status's `attachment_status` field, - like for status 202. See the longer description for `POST /attach` - for details. 
- content: - application/json: - schema: - $ref: "#/components/schemas/ConflictError" - "500": - description: Generic operation error - content: - application/json: - schema: - $ref: "#/components/schemas/Error" - "503": - description: Temporarily unavailable, please retry. - content: - application/json: - schema: - $ref: "#/components/schemas/ServiceUnavailableError" - - /v1/tenant/{tenant_id}/location_config: parameters: - name: tenant_id @@ -635,6 +579,12 @@ paths: required: false schema: type: integer + - name: lazy + in: query + required: false + schema: + type: boolean + description: Set to true for attaches to queue up until activated by compute. Eager (false) is the default. put: description: | Configures a _tenant location_, that is how a particular pageserver handles @@ -674,6 +624,10 @@ paths: responses: "200": description: Tenant is now in requested state + content: + application/json: + schema: + $ref: "#/components/schemas/TenantLocationConfigResponse" "503": description: Tenant's state cannot be changed right now. Wait a few seconds and retry. content: @@ -714,66 +668,6 @@ paths: application/json: schema: $ref: "#/components/schemas/Error" - - /v1/tenant/{tenant_id}/detach: - parameters: - - name: tenant_id - in: path - required: true - schema: - type: string - - name: detach_ignored - in: query - required: false - schema: - type: boolean - description: | - When true, allow to detach a tenant which state is ignored. - post: - description: | - Remove tenant data (including all corresponding timelines) from pageserver's memory and file system. - Files on the remote storage are not affected. 
- responses: - "200": - description: Tenant detached - "400": - description: Error when no tenant id found in path parameters - content: - application/json: - schema: - $ref: "#/components/schemas/Error" - "401": - description: Unauthorized Error - content: - application/json: - schema: - $ref: "#/components/schemas/UnauthorizedError" - "403": - description: Forbidden Error - content: - application/json: - schema: - $ref: "#/components/schemas/ForbiddenError" - "404": - description: Tenant not found - content: - application/json: - schema: - $ref: "#/components/schemas/NotFoundError" - "500": - description: Generic operation error - content: - application/json: - schema: - $ref: "#/components/schemas/Error" - "503": - description: Temporarily unavailable, please retry. - content: - application/json: - schema: - $ref: "#/components/schemas/ServiceUnavailableError" - - /v1/tenant/{tenant_id}/ignore: parameters: - name: tenant_id @@ -1038,6 +932,59 @@ paths: schema: $ref: "#/components/schemas/ServiceUnavailableError" + /v1/tenant/{tenant_shard_id}/heatmap_upload: + parameters: + - name: tenant_shard_id + in: path + required: true + schema: + type: string + post: + description: | + If the location is in an attached mode, upload the current state to the remote heatmap + responses: + "200": + description: Success + "500": + description: Generic operation error + content: + application/json: + schema: + $ref: "#/components/schemas/Error" + "503": + description: Temporarily unavailable, please retry. 
+ content: + application/json: + schema: + $ref: "#/components/schemas/ServiceUnavailableError" + + /v1/tenant/{tenant_shard_id}/secondary/download: + parameters: + - name: tenant_shard_id + in: path + required: true + schema: + type: string + post: + description: | + If the location is in secondary mode, download latest heatmap and layers + responses: + "200": + description: Success + "500": + description: Generic operation error + content: + application/json: + schema: + $ref: "#/components/schemas/Error" + "503": + description: Temporarily unavailable, please retry. + content: + application/json: + schema: + $ref: "#/components/schemas/ServiceUnavailableError" + + /v1/tenant/{tenant_id}/timeline/: parameters: @@ -1323,6 +1270,25 @@ paths: schema: $ref: "#/components/schemas/ServiceUnavailableError" + /v1/utilization: + get: + description: | + Returns the pageservers current utilization and fitness score for new tenants. + + responses: + "200": + description: Pageserver utilization and fitness score + content: + application/json: + schema: + $ref: "#/components/schemas/PageserverUtilization" + "500": + description: Generic operation error + content: + application/json: + schema: + $ref: "#/components/schemas/Error" + components: securitySchemes: JWT: @@ -1389,16 +1355,6 @@ components: generation: type: integer description: Attachment generation number. - TenantAttachRequest: - type: object - required: - - config - properties: - config: - $ref: '#/components/schemas/TenantConfig' - generation: - type: integer - description: Attachment generation number. TenantConfigRequest: allOf: - $ref: '#/components/schemas/TenantConfig' @@ -1426,6 +1382,32 @@ components: $ref: '#/components/schemas/SecondaryConfig' tenant_conf: $ref: '#/components/schemas/TenantConfig' + TenantLocationConfigResponse: + type: object + required: + - shards + properties: + shards: + description: Pageservers where this tenant's shards are attached. Not populated for secondary locations. 
+ type: array + items: + $ref: "#/components/schemas/TenantShardLocation" + stripe_size: + description: If multiple shards are present, this field contains the sharding stripe size, else it is null. + type: integer + nullable: true + TenantShardLocation: + type: object + required: + - node_id + - shard_id + properties: + node_id: + description: Pageserver node ID where this shard is attached + type: integer + shard_id: + description: Tenant shard ID of the shard + type: string SecondaryConfig: type: object properties: @@ -1462,7 +1444,7 @@ components: trace_read_requests: type: boolean heatmap_period: - type: integer + type: string TenantConfigResponse: type: object properties: @@ -1613,6 +1595,33 @@ components: type: string enum: [past, present, future, nodata] + PageserverUtilization: + type: object + required: + - disk_usage_bytes + - free_space_bytes + - utilization_score + properties: + disk_usage_bytes: + type: integer + format: int64 + minimum: 0 + description: The amount of disk space currently utilized by layer files. + free_space_bytes: + type: integer + format: int64 + minimum: 0 + description: The amount of usable disk space left. + utilization_score: + type: integer + format: int64 + minimum: 0 + maximum: 9223372036854775807 + default: 9223372036854775807 + description: | + Lower is better score for how good this pageserver would be for the next tenant. + The default or maximum value can be returned in situations when a proper score cannot (yet) be calculated. 
+ Error: type: object required: diff --git a/pageserver/src/http/routes.rs b/pageserver/src/http/routes.rs index aa56806246..bb8b1bb7e5 100644 --- a/pageserver/src/http/routes.rs +++ b/pageserver/src/http/routes.rs @@ -14,16 +14,23 @@ use hyper::header; use hyper::StatusCode; use hyper::{Body, Request, Response, Uri}; use metrics::launch_timestamp::LaunchTimestamp; +use pageserver_api::models::LocationConfig; use pageserver_api::models::LocationConfigListResponse; use pageserver_api::models::ShardParameters; use pageserver_api::models::TenantDetails; +use pageserver_api::models::TenantLocationConfigResponse; +use pageserver_api::models::TenantShardLocation; +use pageserver_api::models::TenantShardSplitRequest; +use pageserver_api::models::TenantShardSplitResponse; use pageserver_api::models::TenantState; use pageserver_api::models::{ DownloadRemoteLayersTaskSpawnRequest, LocationConfigMode, TenantAttachRequest, TenantLoadRequest, TenantLocationConfigRequest, }; +use pageserver_api::shard::ShardCount; use pageserver_api::shard::TenantShardId; use remote_storage::GenericRemoteStorage; +use remote_storage::TimeTravelError; use tenant_size_model::{SizeResult, StorageModel}; use tokio_util::sync::CancellationToken; use tracing::*; @@ -45,6 +52,7 @@ use crate::tenant::mgr::{ TenantSlotError, TenantSlotUpsertError, TenantStateError, }; use crate::tenant::mgr::{TenantSlot, UpsertLocationError}; +use crate::tenant::remote_timeline_client; use crate::tenant::secondary::SecondaryController; use crate::tenant::size::ModelInputs; use crate::tenant::storage_layer::LayerAccessStatsReset; @@ -75,7 +83,13 @@ use utils::{ // For APIs that require an Active tenant, how long should we block waiting for that state? // This is not functionally necessary (clients will retry), but avoids generating a lot of // failed API calls while tenants are activating. 
-const ACTIVE_TENANT_TIMEOUT: Duration = Duration::from_millis(5000); +#[cfg(not(feature = "testing"))] +pub(crate) const ACTIVE_TENANT_TIMEOUT: Duration = Duration::from_millis(5000); + +// Tests run on slow/oversubscribed nodes, and may need to wait much longer for tenants to +// finish attaching, if calls to remote storage are slow. +#[cfg(feature = "testing")] +pub(crate) const ACTIVE_TENANT_TIMEOUT: Duration = Duration::from_millis(30000); pub struct State { conf: &'static PageServerConf, @@ -87,6 +101,7 @@ pub struct State { disk_usage_eviction_state: Arc, deletion_queue_client: DeletionQueueClient, secondary_controller: SecondaryController, + latest_utilization: tokio::sync::Mutex>, } impl State { @@ -115,6 +130,7 @@ impl State { disk_usage_eviction_state, deletion_queue_client, secondary_controller, + latest_utilization: Default::default(), }) } } @@ -409,6 +425,7 @@ async fn build_timeline_info_common( tenant::timeline::logical_size::Accuracy::Approximate => false, tenant::timeline::logical_size::Accuracy::Exact => true, }, + directory_entries_counts: timeline.get_directory_metrics().to_vec(), current_physical_size, current_logical_size_non_incremental: None, timeline_dir_layer_file_size_sum: None, @@ -475,52 +492,74 @@ async fn timeline_create_handler( let state = get_state(&request); async { - let tenant = state.tenant_manager.get_attached_tenant_shard(tenant_shard_id, false)?; + let tenant = state + .tenant_manager + .get_attached_tenant_shard(tenant_shard_id, false)?; tenant.wait_to_become_active(ACTIVE_TENANT_TIMEOUT).await?; - match tenant.create_timeline( - new_timeline_id, - request_data.ancestor_timeline_id.map(TimelineId::from), - request_data.ancestor_start_lsn, - request_data.pg_version.unwrap_or(crate::DEFAULT_PG_VERSION), - request_data.existing_initdb_timeline_id, - state.broker_client.clone(), - &ctx, - ) - .await { + if let Some(ancestor_id) = request_data.ancestor_timeline_id.as_ref() { + tracing::info!(%ancestor_id, "starting to 
branch"); + } else { + tracing::info!("bootstrapping"); + } + + match tenant + .create_timeline( + new_timeline_id, + request_data.ancestor_timeline_id, + request_data.ancestor_start_lsn, + request_data.pg_version.unwrap_or(crate::DEFAULT_PG_VERSION), + request_data.existing_initdb_timeline_id, + state.broker_client.clone(), + &ctx, + ) + .await + { Ok(new_timeline) => { // Created. Construct a TimelineInfo for it. - let timeline_info = build_timeline_info_common(&new_timeline, &ctx, tenant::timeline::GetLogicalSizePriority::User) - .await - .map_err(ApiError::InternalServerError)?; + let timeline_info = build_timeline_info_common( + &new_timeline, + &ctx, + tenant::timeline::GetLogicalSizePriority::User, + ) + .await + .map_err(ApiError::InternalServerError)?; json_response(StatusCode::CREATED, timeline_info) } Err(_) if tenant.cancel.is_cancelled() => { // In case we get some ugly error type during shutdown, cast it into a clean 503. - json_response(StatusCode::SERVICE_UNAVAILABLE, HttpErrorBody::from_msg("Tenant shutting down".to_string())) - } - Err(tenant::CreateTimelineError::Conflict | tenant::CreateTimelineError::AlreadyCreating) => { - json_response(StatusCode::CONFLICT, ()) - } - Err(tenant::CreateTimelineError::AncestorLsn(err)) => { - json_response(StatusCode::NOT_ACCEPTABLE, HttpErrorBody::from_msg( - format!("{err:#}") - )) - } - Err(e @ tenant::CreateTimelineError::AncestorNotActive) => { - json_response(StatusCode::SERVICE_UNAVAILABLE, HttpErrorBody::from_msg(e.to_string())) - } - Err(tenant::CreateTimelineError::ShuttingDown) => { - json_response(StatusCode::SERVICE_UNAVAILABLE,HttpErrorBody::from_msg("tenant shutting down".to_string())) + json_response( + StatusCode::SERVICE_UNAVAILABLE, + HttpErrorBody::from_msg("Tenant shutting down".to_string()), + ) } + Err( + tenant::CreateTimelineError::Conflict + | tenant::CreateTimelineError::AlreadyCreating, + ) => json_response(StatusCode::CONFLICT, ()), + 
Err(tenant::CreateTimelineError::AncestorLsn(err)) => json_response( + StatusCode::NOT_ACCEPTABLE, + HttpErrorBody::from_msg(format!("{err:#}")), + ), + Err(e @ tenant::CreateTimelineError::AncestorNotActive) => json_response( + StatusCode::SERVICE_UNAVAILABLE, + HttpErrorBody::from_msg(e.to_string()), + ), + Err(tenant::CreateTimelineError::ShuttingDown) => json_response( + StatusCode::SERVICE_UNAVAILABLE, + HttpErrorBody::from_msg("tenant shutting down".to_string()), + ), Err(tenant::CreateTimelineError::Other(err)) => Err(ApiError::InternalServerError(err)), } } .instrument(info_span!("timeline_create", tenant_id = %tenant_shard_id.tenant_id, - shard = %tenant_shard_id.shard_slug(), - timeline_id = %new_timeline_id, lsn=?request_data.ancestor_start_lsn, pg_version=?request_data.pg_version)) + shard_id = %tenant_shard_id.shard_slug(), + timeline_id = %new_timeline_id, + lsn=?request_data.ancestor_start_lsn, + pg_version=?request_data.pg_version + )) .await } @@ -535,10 +574,16 @@ async fn timeline_list_handler( parse_query_param(&request, "force-await-initial-logical-size")?; check_permission(&request, Some(tenant_shard_id.tenant_id))?; + let state = get_state(&request); let ctx = RequestContext::new(TaskKind::MgmtRequest, DownloadBehavior::Download); let response_data = async { - let tenant = mgr::get_tenant(tenant_shard_id, true)?; + let tenant = state + .tenant_manager + .get_attached_tenant_shard(tenant_shard_id, false)?; + + tenant.wait_to_become_active(ACTIVE_TENANT_TIMEOUT).await?; + let timelines = tenant.list_timelines(); let mut response_data = Vec::with_capacity(timelines.len()); @@ -580,7 +625,7 @@ async fn timeline_preserve_initdb_handler( // location where timeline recreation cand find it. 
async { - let tenant = mgr::get_tenant(tenant_shard_id, true)?; + let tenant = mgr::get_tenant(tenant_shard_id, false)?; let timeline = tenant .get_timeline(timeline_id, false) @@ -617,9 +662,14 @@ async fn timeline_detail_handler( // Logical size calculation needs downloading. let ctx = RequestContext::new(TaskKind::MgmtRequest, DownloadBehavior::Download); + let state = get_state(&request); let timeline_info = async { - let tenant = mgr::get_tenant(tenant_shard_id, true)?; + let tenant = state + .tenant_manager + .get_attached_tenant_shard(tenant_shard_id, false)?; + + tenant.wait_to_become_active(ACTIVE_TENANT_TIMEOUT).await?; let timeline = tenant .get_timeline(timeline_id, false) @@ -652,6 +702,7 @@ async fn get_lsn_by_timestamp_handler( ) -> Result, ApiError> { let tenant_shard_id: TenantShardId = parse_request_param(&request, "tenant_shard_id")?; check_permission(&request, Some(tenant_shard_id.tenant_id))?; + let state = get_state(&request); if !tenant_shard_id.is_zero() { // Requires SLRU contents, which are only stored on shard zero @@ -668,11 +719,14 @@ async fn get_lsn_by_timestamp_handler( let timestamp_pg = postgres_ffi::to_pg_timestamp(timestamp); let ctx = RequestContext::new(TaskKind::MgmtRequest, DownloadBehavior::Download); - let timeline = active_timeline_of_active_tenant(tenant_shard_id, timeline_id).await?; + + let timeline = + active_timeline_of_active_tenant(&state.tenant_manager, tenant_shard_id, timeline_id) + .await?; let result = timeline .find_lsn_for_timestamp(timestamp_pg, &cancel, &ctx) .await?; - #[derive(serde::Serialize)] + #[derive(serde::Serialize, Debug)] struct Result { lsn: Lsn, kind: &'static str, @@ -683,7 +737,14 @@ async fn get_lsn_by_timestamp_handler( LsnForTimestamp::Past(lsn) => (lsn, "past"), LsnForTimestamp::NoData(lsn) => (lsn, "nodata"), }; - json_response(StatusCode::OK, Result { lsn, kind }) + let result = Result { lsn, kind }; + tracing::info!( + lsn=?result.lsn, + kind=%result.kind, + timestamp=%timestamp_raw, 
+ "lsn_by_timestamp finished" + ); + json_response(StatusCode::OK, result) } async fn get_timestamp_of_lsn_handler( @@ -692,6 +753,7 @@ async fn get_timestamp_of_lsn_handler( ) -> Result, ApiError> { let tenant_shard_id: TenantShardId = parse_request_param(&request, "tenant_shard_id")?; check_permission(&request, Some(tenant_shard_id.tenant_id))?; + let state = get_state(&request); if !tenant_shard_id.is_zero() { // Requires SLRU contents, which are only stored on shard zero @@ -708,7 +770,9 @@ async fn get_timestamp_of_lsn_handler( .map_err(ApiError::BadRequest)?; let ctx = RequestContext::new(TaskKind::MgmtRequest, DownloadBehavior::Download); - let timeline = active_timeline_of_active_tenant(tenant_shard_id, timeline_id).await?; + let timeline = + active_timeline_of_active_tenant(&state.tenant_manager, tenant_shard_id, timeline_id) + .await?; let result = timeline.get_timestamp_for_lsn(lsn, &ctx).await?; match result { @@ -753,13 +817,7 @@ async fn tenant_attach_handler( let tenant = state .tenant_manager - .upsert_location( - tenant_shard_id, - location_conf, - None, - SpawnMode::Normal, - &ctx, - ) + .upsert_location(tenant_shard_id, location_conf, None, SpawnMode::Eager, &ctx) .await?; let Some(tenant) = tenant else { @@ -808,7 +866,7 @@ async fn timeline_delete_handler( } })?; tenant.wait_to_become_active(ACTIVE_TENANT_TIMEOUT).await?; - tenant.delete_timeline(timeline_id).instrument(info_span!("timeline_delete", tenant_id=%tenant_shard_id.tenant_id, shard=%tenant_shard_id.shard_slug(), %timeline_id)) + tenant.delete_timeline(timeline_id).instrument(info_span!("timeline_delete", tenant_id=%tenant_shard_id.tenant_id, shard_id=%tenant_shard_id.shard_slug(), %timeline_id)) .await?; json_response(StatusCode::ACCEPTED, ()) @@ -833,7 +891,7 @@ async fn tenant_detach_handler( detach_ignored.unwrap_or(false), &state.deletion_queue_client, ) - .instrument(info_span!("tenant_detach", %tenant_id)) + .instrument(info_span!("tenant_detach", %tenant_id, 
shard_id=%tenant_shard_id.shard_slug())) .await?; json_response(StatusCode::OK, ()) @@ -852,7 +910,7 @@ async fn tenant_reset_handler( let state = get_state(&request); state .tenant_manager - .reset_tenant(tenant_shard_id, drop_cache.unwrap_or(false), ctx) + .reset_tenant(tenant_shard_id, drop_cache.unwrap_or(false), &ctx) .await .map_err(ApiError::InternalServerError)?; @@ -957,6 +1015,7 @@ async fn tenant_status( attachment_status: state.attachment_status(), generation: tenant.generation().into(), }, + walredo: tenant.wal_redo_manager_status(), timelines: tenant.list_timeline_ids(), }) } @@ -983,7 +1042,7 @@ async fn tenant_delete_handler( .delete_tenant(tenant_shard_id, ACTIVE_TENANT_TIMEOUT) .instrument(info_span!("tenant_delete_handler", tenant_id = %tenant_shard_id.tenant_id, - shard = %tenant_shard_id.shard_slug() + shard_id = %tenant_shard_id.shard_slug() )) .await?; @@ -1080,6 +1139,30 @@ async fn tenant_size_handler( ) } +async fn tenant_shard_split_handler( + mut request: Request, + _cancel: CancellationToken, +) -> Result, ApiError> { + let req: TenantShardSplitRequest = json_request(&mut request).await?; + + let tenant_shard_id: TenantShardId = parse_request_param(&request, "tenant_shard_id")?; + let state = get_state(&request); + let ctx = RequestContext::new(TaskKind::MgmtRequest, DownloadBehavior::Warn); + + let new_shards = state + .tenant_manager + .shard_split( + tenant_shard_id, + ShardCount::new(req.new_shard_count), + req.new_stripe_size, + &ctx, + ) + .await + .map_err(ApiError::InternalServerError)?; + + json_response(StatusCode::OK, TenantShardSplitResponse { new_shards }) +} + async fn layer_map_info_handler( request: Request, _cancel: CancellationToken, @@ -1088,10 +1171,13 @@ async fn layer_map_info_handler( let timeline_id: TimelineId = parse_request_param(&request, "timeline_id")?; let reset: LayerAccessStatsReset = parse_query_param(&request, "reset")?.unwrap_or(LayerAccessStatsReset::NoReset); + let state = get_state(&request); 
check_permission(&request, Some(tenant_shard_id.tenant_id))?; - let timeline = active_timeline_of_active_tenant(tenant_shard_id, timeline_id).await?; + let timeline = + active_timeline_of_active_tenant(&state.tenant_manager, tenant_shard_id, timeline_id) + .await?; let layer_map_info = timeline.layer_map_info(reset).await; json_response(StatusCode::OK, layer_map_info) @@ -1105,8 +1191,11 @@ async fn layer_download_handler( let timeline_id: TimelineId = parse_request_param(&request, "timeline_id")?; let layer_file_name = get_request_param(&request, "layer_file_name")?; check_permission(&request, Some(tenant_shard_id.tenant_id))?; + let state = get_state(&request); - let timeline = active_timeline_of_active_tenant(tenant_shard_id, timeline_id).await?; + let timeline = + active_timeline_of_active_tenant(&state.tenant_manager, tenant_shard_id, timeline_id) + .await?; let downloaded = timeline .download_layer(layer_file_name) .await @@ -1130,8 +1219,11 @@ async fn evict_timeline_layer_handler( check_permission(&request, Some(tenant_shard_id.tenant_id))?; let timeline_id: TimelineId = parse_request_param(&request, "timeline_id")?; let layer_file_name = get_request_param(&request, "layer_file_name")?; + let state = get_state(&request); - let timeline = active_timeline_of_active_tenant(tenant_shard_id, timeline_id).await?; + let timeline = + active_timeline_of_active_tenant(&state.tenant_manager, tenant_shard_id, timeline_id) + .await?; let evicted = timeline .evict_layer(layer_file_name) .await @@ -1326,6 +1418,7 @@ async fn put_tenant_location_config_handler( let request_data: TenantLocationConfigRequest = json_request(&mut request).await?; let flush = parse_query_param(&request, "flush_ms")?.map(Duration::from_millis); + let lazy = parse_query_param(&request, "lazy")?.unwrap_or(false); check_permission(&request, Some(tenant_shard_id.tenant_id))?; let ctx = RequestContext::new(TaskKind::MgmtRequest, DownloadBehavior::Warn); @@ -1339,7 +1432,7 @@ async fn 
put_tenant_location_config_handler( mgr::detach_tenant(conf, tenant_shard_id, true, &state.deletion_queue_client) .instrument(info_span!("tenant_detach", tenant_id = %tenant_shard_id.tenant_id, - shard = %tenant_shard_id.shard_slug() + shard_id = %tenant_shard_id.shard_slug() )) .await { @@ -1356,16 +1449,20 @@ async fn put_tenant_location_config_handler( let location_conf = LocationConf::try_from(&request_data.config).map_err(ApiError::BadRequest)?; - state + // lazy==true queues up for activation or jumps the queue like normal when a compute connects, + // similar to at startup ordering. + let spawn_mode = if lazy { + tenant::SpawnMode::Lazy + } else { + tenant::SpawnMode::Eager + }; + + let tenant = state .tenant_manager - .upsert_location( - tenant_shard_id, - location_conf, - flush, - tenant::SpawnMode::Normal, - &ctx, - ) + .upsert_location(tenant_shard_id, location_conf, flush, spawn_mode, &ctx) .await?; + let stripe_size = tenant.as_ref().map(|t| t.get_shard_stripe_size()); + let attached = tenant.is_some(); if let Some(_flush_ms) = flush { match state @@ -1384,7 +1481,26 @@ async fn put_tenant_location_config_handler( tracing::info!("No flush requested when configuring"); } - json_response(StatusCode::OK, ()) + // This API returns a vector of pageservers where the tenant is attached: this is + // primarily for use in the sharding service. For compatibilty, we also return this + // when called directly on a pageserver, but the payload is always zero or one shards. 
+ let mut response = TenantLocationConfigResponse { + shards: Vec::new(), + stripe_size: None, + }; + if attached { + response.shards.push(TenantShardLocation { + shard_id: tenant_shard_id, + node_id: state.conf.id, + }); + if tenant_shard_id.shard_count.count() > 1 { + // Stripe size should be set if we are attached + debug_assert!(stripe_size.is_some()); + response.stripe_size = stripe_size; + } + } + + json_response(StatusCode::OK, response) } async fn list_location_config_handler( @@ -1409,6 +1525,102 @@ async fn list_location_config_handler( json_response(StatusCode::OK, result) } +async fn get_location_config_handler( + request: Request, + _cancel: CancellationToken, +) -> Result, ApiError> { + let state = get_state(&request); + let tenant_shard_id: TenantShardId = parse_request_param(&request, "tenant_shard_id")?; + let slot = state.tenant_manager.get(tenant_shard_id); + + let Some(slot) = slot else { + return Err(ApiError::NotFound( + anyhow::anyhow!("Tenant shard not found").into(), + )); + }; + + let result: Option = match slot { + TenantSlot::Attached(t) => Some(t.get_location_conf()), + TenantSlot::Secondary(s) => Some(s.get_location_conf()), + TenantSlot::InProgress(_) => None, + }; + + json_response(StatusCode::OK, result) +} + +// Do a time travel recovery on the given tenant/tenant shard. Tenant needs to be detached +// (from all pageservers) as it invalidates consistency assumptions. 
+async fn tenant_time_travel_remote_storage_handler( + request: Request, + cancel: CancellationToken, +) -> Result, ApiError> { + let tenant_shard_id: TenantShardId = parse_request_param(&request, "tenant_shard_id")?; + + check_permission(&request, Some(tenant_shard_id.tenant_id))?; + + let timestamp_raw = must_get_query_param(&request, "travel_to")?; + let timestamp = humantime::parse_rfc3339(×tamp_raw) + .with_context(|| format!("Invalid time for travel_to: {timestamp_raw:?}")) + .map_err(ApiError::BadRequest)?; + + let done_if_after_raw = must_get_query_param(&request, "done_if_after")?; + let done_if_after = humantime::parse_rfc3339(&done_if_after_raw) + .with_context(|| format!("Invalid time for done_if_after: {done_if_after_raw:?}")) + .map_err(ApiError::BadRequest)?; + + // This is just a sanity check to fend off naive wrong usages of the API: + // the tenant needs to be detached *everywhere* + let state = get_state(&request); + let we_manage_tenant = state.tenant_manager.manages_tenant_shard(tenant_shard_id); + if we_manage_tenant { + return Err(ApiError::BadRequest(anyhow!( + "Tenant {tenant_shard_id} is already attached at this pageserver" + ))); + } + + let Some(storage) = state.remote_storage.as_ref() else { + return Err(ApiError::InternalServerError(anyhow::anyhow!( + "remote storage not configured, cannot run time travel" + ))); + }; + + if timestamp > done_if_after { + return Err(ApiError::BadRequest(anyhow!( + "The done_if_after timestamp comes before the timestamp to recover to" + ))); + } + + tracing::info!("Issuing time travel request internally. 
timestamp={timestamp_raw}, done_if_after={done_if_after_raw}"); + + remote_timeline_client::upload::time_travel_recover_tenant( + storage, + &tenant_shard_id, + timestamp, + done_if_after, + &cancel, + ) + .await + .map_err(|e| match e { + TimeTravelError::BadInput(e) => { + warn!("bad input error: {e}"); + ApiError::BadRequest(anyhow!("bad input error")) + } + TimeTravelError::Unimplemented => { + ApiError::BadRequest(anyhow!("unimplemented for the configured remote storage")) + } + TimeTravelError::Cancelled => ApiError::InternalServerError(anyhow!("cancelled")), + TimeTravelError::TooManyVersions => { + ApiError::InternalServerError(anyhow!("too many versions in remote storage")) + } + TimeTravelError::Other(e) => { + warn!("internal error: {e}"); + ApiError::InternalServerError(anyhow!("internal error")) + } + })?; + + json_response(StatusCode::OK, ()) +} + /// Testing helper to transition a tenant to [`crate::tenant::TenantState::Broken`]. async fn handle_tenant_break( r: Request, @@ -1456,13 +1668,19 @@ async fn timeline_compact_handler( let timeline_id: TimelineId = parse_request_param(&request, "timeline_id")?; check_permission(&request, Some(tenant_shard_id.tenant_id))?; + let state = get_state(&request); + let mut flags = EnumSet::empty(); if Some(true) == parse_query_param::<_, bool>(&request, "force_repartition")? { flags |= CompactFlags::ForceRepartition; } + if Some(true) == parse_query_param::<_, bool>(&request, "force_image_layer_creation")? 
{ + flags |= CompactFlags::ForceImageLayerCreation; + } + async { let ctx = RequestContext::new(TaskKind::MgmtRequest, DownloadBehavior::Download); - let timeline = active_timeline_of_active_tenant(tenant_shard_id, timeline_id).await?; + let timeline = active_timeline_of_active_tenant(&state.tenant_manager, tenant_shard_id, timeline_id).await?; timeline .compact(&cancel, flags, &ctx) .await @@ -1482,13 +1700,19 @@ async fn timeline_checkpoint_handler( let timeline_id: TimelineId = parse_request_param(&request, "timeline_id")?; check_permission(&request, Some(tenant_shard_id.tenant_id))?; + let state = get_state(&request); + let mut flags = EnumSet::empty(); if Some(true) == parse_query_param::<_, bool>(&request, "force_repartition")? { flags |= CompactFlags::ForceRepartition; } + if Some(true) == parse_query_param::<_, bool>(&request, "force_image_layer_creation")? { + flags |= CompactFlags::ForceImageLayerCreation; + } + async { let ctx = RequestContext::new(TaskKind::MgmtRequest, DownloadBehavior::Download); - let timeline = active_timeline_of_active_tenant(tenant_shard_id, timeline_id).await?; + let timeline = active_timeline_of_active_tenant(&state.tenant_manager, tenant_shard_id, timeline_id).await?; timeline .freeze_and_flush() .await @@ -1513,7 +1737,11 @@ async fn timeline_download_remote_layers_handler_post( let body: DownloadRemoteLayersTaskSpawnRequest = json_request(&mut request).await?; check_permission(&request, Some(tenant_shard_id.tenant_id))?; - let timeline = active_timeline_of_active_tenant(tenant_shard_id, timeline_id).await?; + let state = get_state(&request); + + let timeline = + active_timeline_of_active_tenant(&state.tenant_manager, tenant_shard_id, timeline_id) + .await?; match timeline.spawn_download_all_remote_layers(body).await { Ok(st) => json_response(StatusCode::ACCEPTED, st), Err(st) => json_response(StatusCode::CONFLICT, st), @@ -1527,8 +1755,11 @@ async fn timeline_download_remote_layers_handler_get( let tenant_shard_id: 
TenantShardId = parse_request_param(&request, "tenant_shard_id")?; check_permission(&request, Some(tenant_shard_id.tenant_id))?; let timeline_id: TimelineId = parse_request_param(&request, "timeline_id")?; + let state = get_state(&request); - let timeline = active_timeline_of_active_tenant(tenant_shard_id, timeline_id).await?; + let timeline = + active_timeline_of_active_tenant(&state.tenant_manager, tenant_shard_id, timeline_id) + .await?; let info = timeline .get_download_all_remote_layers_task_info() .context("task never started since last pageserver process start") @@ -1577,6 +1808,7 @@ async fn getpage_at_lsn_handler( let tenant_shard_id: TenantShardId = parse_request_param(&request, "tenant_shard_id")?; let timeline_id: TimelineId = parse_request_param(&request, "timeline_id")?; check_permission(&request, Some(tenant_shard_id.tenant_id))?; + let state = get_state(&request); struct Key(crate::repository::Key); @@ -1595,7 +1827,7 @@ async fn getpage_at_lsn_handler( async { let ctx = RequestContext::new(TaskKind::MgmtRequest, DownloadBehavior::Download); - let timeline = active_timeline_of_active_tenant(tenant_shard_id, timeline_id).await?; + let timeline = active_timeline_of_active_tenant(&state.tenant_manager, tenant_shard_id, timeline_id).await?; let page = timeline.get(key.0, lsn, &ctx).await?; @@ -1618,12 +1850,13 @@ async fn timeline_collect_keyspace( let tenant_shard_id: TenantShardId = parse_request_param(&request, "tenant_shard_id")?; let timeline_id: TimelineId = parse_request_param(&request, "timeline_id")?; check_permission(&request, Some(tenant_shard_id.tenant_id))?; + let state = get_state(&request); let at_lsn: Option = parse_query_param(&request, "at_lsn")?; async { let ctx = RequestContext::new(TaskKind::MgmtRequest, DownloadBehavior::Download); - let timeline = active_timeline_of_active_tenant(tenant_shard_id, timeline_id).await?; + let timeline = active_timeline_of_active_tenant(&state.tenant_manager, tenant_shard_id, timeline_id).await?; let 
at_lsn = at_lsn.unwrap_or_else(|| timeline.get_last_record_lsn()); let keys = timeline .collect_keyspace(at_lsn, &ctx) @@ -1639,10 +1872,14 @@ async fn timeline_collect_keyspace( } async fn active_timeline_of_active_tenant( + tenant_manager: &TenantManager, tenant_shard_id: TenantShardId, timeline_id: TimelineId, ) -> Result, ApiError> { - let tenant = mgr::get_tenant(tenant_shard_id, true)?; + let tenant = tenant_manager.get_attached_tenant_shard(tenant_shard_id, false)?; + + tenant.wait_to_become_active(ACTIVE_TENANT_TIMEOUT).await?; + tenant .get_timeline(timeline_id, true) .map_err(|e| ApiError::NotFound(e.into())) @@ -1799,6 +2036,64 @@ async fn post_tracing_event_handler( json_response(StatusCode::OK, ()) } +async fn put_io_engine_handler( + mut r: Request, + _cancel: CancellationToken, +) -> Result, ApiError> { + check_permission(&r, None)?; + let kind: crate::virtual_file::IoEngineKind = json_request(&mut r).await?; + crate::virtual_file::io_engine::set(kind); + json_response(StatusCode::OK, ()) +} + +/// Polled by control plane. +/// +/// See [`crate::utilization`]. +async fn get_utilization( + r: Request, + _cancel: CancellationToken, +) -> Result, ApiError> { + // this probably could be completely public, but lets make that change later. + check_permission(&r, None)?; + + let state = get_state(&r); + let mut g = state.latest_utilization.lock().await; + + let regenerate_every = Duration::from_secs(1); + let still_valid = g + .as_ref() + .is_some_and(|(captured_at, _)| captured_at.elapsed() < regenerate_every); + + // avoid needless statvfs calls even though those should be non-blocking fast. + // regenerate at most 1Hz to allow polling at any rate. 
+ if !still_valid { + let path = state.conf.tenants_path(); + let doc = crate::utilization::regenerate(path.as_std_path()) + .map_err(ApiError::InternalServerError)?; + + let mut buf = Vec::new(); + serde_json::to_writer(&mut buf, &doc) + .context("serialize") + .map_err(ApiError::InternalServerError)?; + + let body = bytes::Bytes::from(buf); + + *g = Some((std::time::Instant::now(), body)); + } + + // hyper 0.14 doesn't yet have Response::clone so this is a bit of extra legwork + let cached = g.as_ref().expect("just set").1.clone(); + + Response::builder() + .header(hyper::http::header::CONTENT_TYPE, "application/json") + // thought of using http date header, but that is second precision which does not give any + // debugging aid + .status(StatusCode::OK) + .body(hyper::Body::from(cached)) + .context("build response") + .map_err(ApiError::InternalServerError) +} + /// Common functionality of all the HTTP API handlers. /// /// - Adds a tracing span to each request (by `request_span`) @@ -1945,6 +2240,9 @@ pub fn make_router( .put("/v1/tenant/config", |r| { api_handler(r, update_tenant_config_handler) }) + .put("/v1/tenant/:tenant_shard_id/shard_split", |r| { + api_handler(r, tenant_shard_split_handler) + }) .get("/v1/tenant/:tenant_shard_id/config", |r| { api_handler(r, get_tenant_config_handler) }) @@ -1954,6 +2252,13 @@ pub fn make_router( .get("/v1/location_config", |r| { api_handler(r, list_location_config_handler) }) + .get("/v1/location_config/:tenant_shard_id", |r| { + api_handler(r, get_location_config_handler) + }) + .put( + "/v1/tenant/:tenant_shard_id/time_travel_remote_storage", + |r| api_handler(r, tenant_time_travel_remote_storage_handler), + ) .get("/v1/tenant/:tenant_shard_id/timeline", |r| { api_handler(r, timeline_list_handler) }) @@ -2050,7 +2355,9 @@ pub fn make_router( ) .get( "/v1/tenant/:tenant_shard_id/timeline/:timeline_id/keyspace", - |r| testing_api_handler("read out the keyspace", r, timeline_collect_keyspace), + |r| api_handler(r, 
timeline_collect_keyspace), ) + .put("/v1/io_engine", |r| api_handler(r, put_io_engine_handler)) + .get("/v1/utilization", |r| api_handler(r, get_utilization)) .any(handler_404)) } diff --git a/pageserver/src/lib.rs b/pageserver/src/lib.rs index bcde1166b7..02a690d4e1 100644 --- a/pageserver/src/lib.rs +++ b/pageserver/src/lib.rs @@ -17,10 +17,12 @@ pub mod page_cache; pub mod page_service; pub mod pgdatadir_mapping; pub mod repository; +pub mod span; pub(crate) mod statvfs; pub mod task_mgr; pub mod tenant; pub mod trace; +pub mod utilization; pub mod virtual_file; pub mod walingest; pub mod walrecord; @@ -167,15 +169,6 @@ pub fn is_delete_mark(path: &Utf8Path) -> bool { ends_with_suffix(path, TIMELINE_DELETE_MARK_SUFFIX) } -fn is_walkdir_io_not_found(e: &walkdir::Error) -> bool { - if let Some(e) = e.io_error() { - if e.kind() == std::io::ErrorKind::NotFound { - return true; - } - } - false -} - /// During pageserver startup, we need to order operations not to exhaust tokio worker threads by /// blocking. 
/// diff --git a/pageserver/src/metrics.rs b/pageserver/src/metrics.rs index 9b3679e3c2..814b3e1f96 100644 --- a/pageserver/src/metrics.rs +++ b/pageserver/src/metrics.rs @@ -1,16 +1,16 @@ use enum_map::EnumMap; -use metrics::metric_vec_duration::DurationResultObserver; use metrics::{ register_counter_vec, register_gauge_vec, register_histogram, register_histogram_vec, register_int_counter, register_int_counter_pair_vec, register_int_counter_vec, register_int_gauge, register_int_gauge_vec, register_uint_gauge, register_uint_gauge_vec, - Counter, CounterVec, GaugeVec, Histogram, HistogramVec, IntCounter, IntCounterPairVec, - IntCounterVec, IntGauge, IntGaugeVec, UIntGauge, UIntGaugeVec, + Counter, CounterVec, GaugeVec, Histogram, HistogramVec, IntCounter, IntCounterPair, + IntCounterPairVec, IntCounterVec, IntGauge, IntGaugeVec, UIntGauge, UIntGaugeVec, }; use once_cell::sync::Lazy; use pageserver_api::shard::TenantShardId; use strum::{EnumCount, IntoEnumIterator, VariantNames}; use strum_macros::{EnumVariantNames, IntoStaticStr}; +use tracing::warn; use utils::id::TimelineId; /// Prometheus histogram buckets (in seconds) for operations in the critical @@ -602,6 +602,15 @@ pub(crate) mod initial_logical_size { }); } +static DIRECTORY_ENTRIES_COUNT: Lazy = Lazy::new(|| { + register_uint_gauge_vec!( + "pageserver_directory_entries_count", + "Sum of the entries in pageserver-stored directory listings", + &["tenant_id", "shard_id", "timeline_id"] + ) + .expect("failed to define a metric") +}); + pub(crate) static TENANT_STATE_METRIC: Lazy = Lazy::new(|| { register_uint_gauge_vec!( "pageserver_tenant_states_count", @@ -633,26 +642,6 @@ pub(crate) static TENANT_SYNTHETIC_SIZE_METRIC: Lazy = Lazy::new(| .expect("Failed to register pageserver_tenant_synthetic_cached_size_bytes metric") }); -// Metrics for cloud upload. These metrics reflect data uploaded to cloud storage, -// or in testing they estimate how much we would upload if we did. 
-static NUM_PERSISTENT_FILES_CREATED: Lazy = Lazy::new(|| { - register_int_counter_vec!( - "pageserver_created_persistent_files_total", - "Number of files created that are meant to be uploaded to cloud storage", - &["tenant_id", "shard_id", "timeline_id"] - ) - .expect("failed to define a metric") -}); - -static PERSISTENT_BYTES_WRITTEN: Lazy = Lazy::new(|| { - register_int_counter_vec!( - "pageserver_written_persistent_bytes_total", - "Total bytes written that are meant to be uploaded to cloud storage", - &["tenant_id", "shard_id", "timeline_id"] - ) - .expect("failed to define a metric") -}); - pub(crate) static EVICTION_ITERATION_DURATION: Lazy = Lazy::new(|| { register_histogram_vec!( "pageserver_eviction_iteration_duration_seconds_global", @@ -1016,15 +1005,39 @@ impl GlobalAndPerTimelineHistogram { } } -struct GlobalAndPerTimelineHistogramTimer<'a> { +struct GlobalAndPerTimelineHistogramTimer<'a, 'c> { h: &'a GlobalAndPerTimelineHistogram, + ctx: &'c RequestContext, start: std::time::Instant, + op: SmgrQueryType, } -impl<'a> Drop for GlobalAndPerTimelineHistogramTimer<'a> { +impl<'a, 'c> Drop for GlobalAndPerTimelineHistogramTimer<'a, 'c> { fn drop(&mut self) { let elapsed = self.start.elapsed(); - self.h.observe(elapsed.as_secs_f64()); + let ex_throttled = self + .ctx + .micros_spent_throttled + .close_and_checked_sub_from(elapsed); + let ex_throttled = match ex_throttled { + Ok(res) => res, + Err(error) => { + use utils::rate_limit::RateLimit; + static LOGGED: Lazy>> = + Lazy::new(|| { + Mutex::new(enum_map::EnumMap::from_array(std::array::from_fn(|_| { + RateLimit::new(Duration::from_secs(10)) + }))) + }); + let mut guard = LOGGED.lock().unwrap(); + let rate_limit = &mut guard[self.op]; + rate_limit.call(|| { + warn!(op=?self.op, error, "error deducting time spent throttled; this message is logged at a global rate limit"); + }); + elapsed + } + }; + self.h.observe(ex_throttled.as_secs_f64()); } } @@ -1036,6 +1049,7 @@ impl<'a> Drop for 
GlobalAndPerTimelineHistogramTimer<'a> { strum_macros::EnumCount, strum_macros::EnumIter, strum_macros::FromRepr, + enum_map::Enum, )] #[strum(serialize_all = "snake_case")] pub enum SmgrQueryType { @@ -1043,6 +1057,7 @@ pub enum SmgrQueryType { GetRelSize, GetPageAtLsn, GetDbSize, + GetSlruSegment, } #[derive(Debug)] @@ -1140,11 +1155,35 @@ impl SmgrQueryTimePerTimeline { }); Self { metrics } } - pub(crate) fn start_timer(&self, op: SmgrQueryType) -> impl Drop + '_ { + pub(crate) fn start_timer<'c: 'a, 'a>( + &'a self, + op: SmgrQueryType, + ctx: &'c RequestContext, + ) -> impl Drop + '_ { let metric = &self.metrics[op as usize]; + let start = Instant::now(); + match ctx.micros_spent_throttled.open() { + Ok(()) => (), + Err(error) => { + use utils::rate_limit::RateLimit; + static LOGGED: Lazy>> = + Lazy::new(|| { + Mutex::new(enum_map::EnumMap::from_array(std::array::from_fn(|_| { + RateLimit::new(Duration::from_secs(10)) + }))) + }); + let mut guard = LOGGED.lock().unwrap(); + let rate_limit = &mut guard[op]; + rate_limit.call(|| { + warn!(?op, error, "error opening micros_spent_throttled; this message is logged at a global rate limit"); + }); + } + } GlobalAndPerTimelineHistogramTimer { h: metric, - start: std::time::Instant::now(), + ctx, + start, + op, } } } @@ -1155,15 +1194,21 @@ mod smgr_query_time_tests { use strum::IntoEnumIterator; use utils::id::{TenantId, TimelineId}; + use crate::{ + context::{DownloadBehavior, RequestContext}, + task_mgr::TaskKind, + }; + // Regression test, we used hard-coded string constants before using an enum. 
#[test] fn op_label_name() { use super::SmgrQueryType::*; - let expect: [(super::SmgrQueryType, &'static str); 4] = [ + let expect: [(super::SmgrQueryType, &'static str); 5] = [ (GetRelExists, "get_rel_exists"), (GetRelSize, "get_rel_size"), (GetPageAtLsn, "get_page_at_lsn"), (GetDbSize, "get_db_size"), + (GetSlruSegment, "get_slru_segment"), ]; for (op, expect) in expect { let actual: &'static str = op.into(); @@ -1202,7 +1247,8 @@ mod smgr_query_time_tests { let (pre_global, pre_per_tenant_timeline) = get_counts(); assert_eq!(pre_per_tenant_timeline, 0); - let timer = metrics.start_timer(*op); + let ctx = RequestContext::new(TaskKind::UnitTest, DownloadBehavior::Download); + let timer = metrics.start_timer(*op, &ctx); drop(timer); let (post_global, post_per_tenant_timeline) = get_counts(); @@ -1236,11 +1282,65 @@ pub(crate) static BASEBACKUP_QUERY_TIME: Lazy = Lazy::new(| }) }); -impl DurationResultObserver for BasebackupQueryTime { - fn observe_result(&self, res: &Result, duration: std::time::Duration) { +pub(crate) struct BasebackupQueryTimeOngoingRecording<'a, 'c> { + parent: &'a BasebackupQueryTime, + ctx: &'c RequestContext, + start: std::time::Instant, +} + +impl BasebackupQueryTime { + pub(crate) fn start_recording<'c: 'a, 'a>( + &'a self, + ctx: &'c RequestContext, + ) -> BasebackupQueryTimeOngoingRecording<'_, '_> { + let start = Instant::now(); + match ctx.micros_spent_throttled.open() { + Ok(()) => (), + Err(error) => { + use utils::rate_limit::RateLimit; + static LOGGED: Lazy> = + Lazy::new(|| Mutex::new(RateLimit::new(Duration::from_secs(10)))); + let mut rate_limit = LOGGED.lock().unwrap(); + rate_limit.call(|| { + warn!(error, "error opening micros_spent_throttled; this message is logged at a global rate limit"); + }); + } + } + BasebackupQueryTimeOngoingRecording { + parent: self, + ctx, + start, + } + } +} + +impl<'a, 'c> BasebackupQueryTimeOngoingRecording<'a, 'c> { + pub(crate) fn observe(self, res: &Result) { + let elapsed = 
self.start.elapsed(); + let ex_throttled = self + .ctx + .micros_spent_throttled + .close_and_checked_sub_from(elapsed); + let ex_throttled = match ex_throttled { + Ok(ex_throttled) => ex_throttled, + Err(error) => { + use utils::rate_limit::RateLimit; + static LOGGED: Lazy> = + Lazy::new(|| Mutex::new(RateLimit::new(Duration::from_secs(10)))); + let mut rate_limit = LOGGED.lock().unwrap(); + rate_limit.call(|| { + warn!(error, "error deducting time spent throttled; this message is logged at a global rate limit"); + }); + elapsed + } + }; let label_value = if res.is_ok() { "ok" } else { "error" }; - let metric = self.0.get_metric_with_label_values(&[label_value]).unwrap(); - metric.observe(duration.as_secs_f64()); + let metric = self + .parent + .0 + .get_metric_with_label_values(&[label_value]) + .unwrap(); + metric.observe(ex_throttled.as_secs_f64()); } } @@ -1255,13 +1355,12 @@ pub(crate) static LIVE_CONNECTIONS_COUNT: Lazy = Lazy::new(|| { // remote storage metrics -/// NB: increment _after_ recording the current value into [`REMOTE_TIMELINE_CLIENT_CALLS_STARTED_HIST`]. -static REMOTE_TIMELINE_CLIENT_CALLS_UNFINISHED_GAUGE: Lazy = Lazy::new(|| { - register_int_gauge_vec!( - "pageserver_remote_timeline_client_calls_unfinished", - "Number of ongoing calls to remote timeline client. \ - Used to populate pageserver_remote_timeline_client_calls_started. 
\ - This metric is not useful for sampling from Prometheus, but useful in tests.", +static REMOTE_TIMELINE_CLIENT_CALLS: Lazy = Lazy::new(|| { + register_int_counter_pair_vec!( + "pageserver_remote_timeline_client_calls_started", + "Number of started calls to remote timeline client.", + "pageserver_remote_timeline_client_calls_finished", + "Number of finshed calls to remote timeline client.", &[ "tenant_id", "shard_id", @@ -1270,23 +1369,7 @@ static REMOTE_TIMELINE_CLIENT_CALLS_UNFINISHED_GAUGE: Lazy = Lazy:: "op_kind" ], ) - .expect("failed to define a metric") -}); - -static REMOTE_TIMELINE_CLIENT_CALLS_STARTED_HIST: Lazy = Lazy::new(|| { - register_histogram_vec!( - "pageserver_remote_timeline_client_calls_started", - "When calling a remote timeline client method, we record the current value \ - of the calls_unfinished gauge in this histogram. Plot the histogram \ - over time in a heatmap to visualize how many operations were ongoing \ - at a given instant. It gives you a better idea of the queue depth \ - than plotting the gauge directly, since operations may complete faster \ - than the sampling interval.", - &["file_kind", "op_kind"], - // The calls_unfinished gauge is an integer gauge, hence we have integer buckets. 
- vec![0.0, 1.0, 2.0, 4.0, 6.0, 8.0, 10.0, 15.0, 20.0, 40.0, 60.0, 80.0, 100.0, 500.0], - ) - .expect("failed to define a metric") + .unwrap() }); static REMOTE_TIMELINE_CLIENT_BYTES_STARTED_COUNTER: Lazy = @@ -1649,11 +1732,18 @@ pub(crate) static WAL_REDO_RECORD_COUNTER: Lazy = Lazy::new(|| { .unwrap() }); +#[rustfmt::skip] pub(crate) static WAL_REDO_PROCESS_LAUNCH_DURATION_HISTOGRAM: Lazy = Lazy::new(|| { register_histogram!( "pageserver_wal_redo_process_launch_duration", "Histogram of the duration of successful WalRedoProcess::launch calls", - redo_histogram_time_buckets!(), + vec![ + 0.0002, 0.0004, 0.0006, 0.0008, 0.0010, + 0.0020, 0.0040, 0.0060, 0.0080, 0.0100, + 0.0200, 0.0400, 0.0600, 0.0800, 0.1000, + 0.2000, 0.4000, 0.6000, 0.8000, 1.0000, + 1.5000, 2.0000, 2.5000, 3.0000, 4.0000, 10.0000 + ], ) .expect("failed to define a metric") }); @@ -1800,8 +1890,7 @@ pub(crate) struct TimelineMetrics { resident_physical_size_gauge: UIntGauge, /// copy of LayeredTimeline.current_logical_size pub current_logical_size_gauge: UIntGauge, - pub num_persistent_files_created: IntCounter, - pub persistent_bytes_written: IntCounter, + pub directory_entries_count_gauge: Lazy UIntGauge>>, pub evictions: IntCounter, pub evictions_with_low_residence_duration: std::sync::RwLock, } @@ -1809,12 +1898,12 @@ pub(crate) struct TimelineMetrics { impl TimelineMetrics { pub fn new( tenant_shard_id: &TenantShardId, - timeline_id: &TimelineId, + timeline_id_raw: &TimelineId, evictions_with_low_residence_duration_builder: EvictionsWithLowResidenceDurationBuilder, ) -> Self { let tenant_id = tenant_shard_id.tenant_id.to_string(); let shard_id = format!("{}", tenant_shard_id.shard_slug()); - let timeline_id = timeline_id.to_string(); + let timeline_id = timeline_id_raw.to_string(); let flush_time_histo = StorageTimeMetrics::new( StorageTimeOperation::LayerFlush, &tenant_id, @@ -1867,12 +1956,22 @@ impl TimelineMetrics { let current_logical_size_gauge = CURRENT_LOGICAL_SIZE 
.get_metric_with_label_values(&[&tenant_id, &shard_id, &timeline_id]) .unwrap(); - let num_persistent_files_created = NUM_PERSISTENT_FILES_CREATED - .get_metric_with_label_values(&[&tenant_id, &shard_id, &timeline_id]) - .unwrap(); - let persistent_bytes_written = PERSISTENT_BYTES_WRITTEN - .get_metric_with_label_values(&[&tenant_id, &shard_id, &timeline_id]) - .unwrap(); + // TODO use impl Trait syntax here once we have ability to use it: https://github.com/rust-lang/rust/issues/63065 + let directory_entries_count_gauge_closure = { + let tenant_shard_id = *tenant_shard_id; + let timeline_id_raw = *timeline_id_raw; + move || { + let tenant_id = tenant_shard_id.tenant_id.to_string(); + let shard_id = format!("{}", tenant_shard_id.shard_slug()); + let timeline_id = timeline_id_raw.to_string(); + let gauge: UIntGauge = DIRECTORY_ENTRIES_COUNT + .get_metric_with_label_values(&[&tenant_id, &shard_id, &timeline_id]) + .unwrap(); + gauge + } + }; + let directory_entries_count_gauge: Lazy UIntGauge>> = + Lazy::new(Box::new(directory_entries_count_gauge_closure)); let evictions = EVICTIONS .get_metric_with_label_values(&[&tenant_id, &shard_id, &timeline_id]) .unwrap(); @@ -1893,8 +1992,7 @@ impl TimelineMetrics { last_record_gauge, resident_physical_size_gauge, current_logical_size_gauge, - num_persistent_files_created, - persistent_bytes_written, + directory_entries_count_gauge, evictions, evictions_with_low_residence_duration: std::sync::RwLock::new( evictions_with_low_residence_duration, @@ -1904,8 +2002,6 @@ impl TimelineMetrics { pub(crate) fn record_new_file_metrics(&self, sz: u64) { self.resident_physical_size_add(sz); - self.num_persistent_files_created.inc_by(1); - self.persistent_bytes_written.inc_by(sz); } pub(crate) fn resident_physical_size_sub(&self, sz: u64) { @@ -1921,24 +2017,21 @@ impl TimelineMetrics { pub(crate) fn resident_physical_size_get(&self) -> u64 { self.resident_physical_size_gauge.get() } -} -impl Drop for TimelineMetrics { - fn drop(&mut self) 
{ + pub(crate) fn shutdown(&self) { let tenant_id = &self.tenant_id; let timeline_id = &self.timeline_id; let shard_id = &self.shard_id; - let _ = LAST_RECORD_LSN.remove_label_values(&[tenant_id, &shard_id, timeline_id]); + let _ = LAST_RECORD_LSN.remove_label_values(&[tenant_id, shard_id, timeline_id]); { RESIDENT_PHYSICAL_SIZE_GLOBAL.sub(self.resident_physical_size_get()); - let _ = - RESIDENT_PHYSICAL_SIZE.remove_label_values(&[tenant_id, &shard_id, timeline_id]); + let _ = RESIDENT_PHYSICAL_SIZE.remove_label_values(&[tenant_id, shard_id, timeline_id]); } - let _ = CURRENT_LOGICAL_SIZE.remove_label_values(&[tenant_id, &shard_id, timeline_id]); - let _ = - NUM_PERSISTENT_FILES_CREATED.remove_label_values(&[tenant_id, &shard_id, timeline_id]); - let _ = PERSISTENT_BYTES_WRITTEN.remove_label_values(&[tenant_id, &shard_id, timeline_id]); - let _ = EVICTIONS.remove_label_values(&[tenant_id, &shard_id, timeline_id]); + let _ = CURRENT_LOGICAL_SIZE.remove_label_values(&[tenant_id, shard_id, timeline_id]); + if let Some(metric) = Lazy::get(&DIRECTORY_ENTRIES_COUNT) { + let _ = metric.remove_label_values(&[tenant_id, shard_id, timeline_id]); + } + let _ = EVICTIONS.remove_label_values(&[tenant_id, shard_id, timeline_id]); self.evictions_with_low_residence_duration .write() @@ -2039,7 +2132,7 @@ pub(crate) struct RemoteTimelineClientMetrics { shard_id: String, timeline_id: String, remote_physical_size_gauge: Mutex>, - calls_unfinished_gauge: Mutex>, + calls: Mutex>, bytes_started_counter: Mutex>, bytes_finished_counter: Mutex>, } @@ -2050,7 +2143,7 @@ impl RemoteTimelineClientMetrics { tenant_id: tenant_shard_id.tenant_id.to_string(), shard_id: format!("{}", tenant_shard_id.shard_slug()), timeline_id: timeline_id.to_string(), - calls_unfinished_gauge: Mutex::new(HashMap::default()), + calls: Mutex::new(HashMap::default()), bytes_started_counter: Mutex::new(HashMap::default()), bytes_finished_counter: Mutex::new(HashMap::default()), remote_physical_size_gauge: 
Mutex::new(None), @@ -2090,15 +2183,15 @@ impl RemoteTimelineClientMetrics { .unwrap() } - fn calls_unfinished_gauge( + fn calls_counter_pair( &self, file_kind: &RemoteOpFileKind, op_kind: &RemoteOpKind, - ) -> IntGauge { - let mut guard = self.calls_unfinished_gauge.lock().unwrap(); + ) -> IntCounterPair { + let mut guard = self.calls.lock().unwrap(); let key = (file_kind.as_str(), op_kind.as_str()); let metric = guard.entry(key).or_insert_with(move || { - REMOTE_TIMELINE_CLIENT_CALLS_UNFINISHED_GAUGE + REMOTE_TIMELINE_CLIENT_CALLS .get_metric_with_label_values(&[ &self.tenant_id, &self.shard_id, @@ -2111,17 +2204,6 @@ impl RemoteTimelineClientMetrics { metric.clone() } - fn calls_started_hist( - &self, - file_kind: &RemoteOpFileKind, - op_kind: &RemoteOpKind, - ) -> Histogram { - let key = (file_kind.as_str(), op_kind.as_str()); - REMOTE_TIMELINE_CLIENT_CALLS_STARTED_HIST - .get_metric_with_label_values(&[key.0, key.1]) - .unwrap() - } - fn bytes_started_counter( &self, file_kind: &RemoteOpFileKind, @@ -2192,7 +2274,7 @@ impl RemoteTimelineClientMetrics { #[must_use] pub(crate) struct RemoteTimelineClientCallMetricGuard { /// Decremented on drop. - calls_unfinished_metric: Option, + calls_counter_pair: Option, /// If Some(), this references the bytes_finished metric, and we increment it by the given `u64` on drop. bytes_finished: Option<(IntCounter, u64)>, } @@ -2202,10 +2284,10 @@ impl RemoteTimelineClientCallMetricGuard { /// The caller vouches to do the metric updates manually. 
pub fn will_decrement_manually(mut self) { let RemoteTimelineClientCallMetricGuard { - calls_unfinished_metric, + calls_counter_pair, bytes_finished, } = &mut self; - calls_unfinished_metric.take(); + calls_counter_pair.take(); bytes_finished.take(); } } @@ -2213,10 +2295,10 @@ impl RemoteTimelineClientCallMetricGuard { impl Drop for RemoteTimelineClientCallMetricGuard { fn drop(&mut self) { let RemoteTimelineClientCallMetricGuard { - calls_unfinished_metric, + calls_counter_pair, bytes_finished, } = self; - if let Some(guard) = calls_unfinished_metric.take() { + if let Some(guard) = calls_counter_pair.take() { guard.dec(); } if let Some((bytes_finished_metric, value)) = bytes_finished { @@ -2249,10 +2331,8 @@ impl RemoteTimelineClientMetrics { op_kind: &RemoteOpKind, size: RemoteTimelineClientMetricsCallTrackSize, ) -> RemoteTimelineClientCallMetricGuard { - let calls_unfinished_metric = self.calls_unfinished_gauge(file_kind, op_kind); - self.calls_started_hist(file_kind, op_kind) - .observe(calls_unfinished_metric.get() as f64); - calls_unfinished_metric.inc(); // NB: inc after the histogram, see comment on underlying metric + let calls_counter_pair = self.calls_counter_pair(file_kind, op_kind); + calls_counter_pair.inc(); let bytes_finished = match size { RemoteTimelineClientMetricsCallTrackSize::DontTrackSize { reason: _reason } => { @@ -2266,7 +2346,7 @@ impl RemoteTimelineClientMetrics { } }; RemoteTimelineClientCallMetricGuard { - calls_unfinished_metric: Some(calls_unfinished_metric), + calls_counter_pair: Some(calls_counter_pair), bytes_finished, } } @@ -2280,12 +2360,8 @@ impl RemoteTimelineClientMetrics { op_kind: &RemoteOpKind, size: RemoteTimelineClientMetricsCallTrackSize, ) { - let calls_unfinished_metric = self.calls_unfinished_gauge(file_kind, op_kind); - debug_assert!( - calls_unfinished_metric.get() > 0, - "begin and end should cancel out" - ); - calls_unfinished_metric.dec(); + let calls_counter_pair = self.calls_counter_pair(file_kind, 
op_kind); + calls_counter_pair.dec(); match size { RemoteTimelineClientMetricsCallTrackSize::DontTrackSize { reason: _reason } => {} RemoteTimelineClientMetricsCallTrackSize::Bytes(size) => { @@ -2302,18 +2378,15 @@ impl Drop for RemoteTimelineClientMetrics { shard_id, timeline_id, remote_physical_size_gauge, - calls_unfinished_gauge, + calls, bytes_started_counter, bytes_finished_counter, } = self; - for ((a, b), _) in calls_unfinished_gauge.get_mut().unwrap().drain() { - let _ = REMOTE_TIMELINE_CLIENT_CALLS_UNFINISHED_GAUGE.remove_label_values(&[ - tenant_id, - shard_id, - timeline_id, - a, - b, - ]); + for ((a, b), _) in calls.get_mut().unwrap().drain() { + let mut res = [Ok(()), Ok(())]; + REMOTE_TIMELINE_CLIENT_CALLS + .remove_label_values(&mut res, &[tenant_id, shard_id, timeline_id, a, b]); + // don't care about results } for ((a, b), _) in bytes_started_counter.get_mut().unwrap().drain() { let _ = REMOTE_TIMELINE_CLIENT_BYTES_STARTED_COUNTER.remove_label_values(&[ @@ -2391,6 +2464,180 @@ impl>, O, E> Future for MeasuredRemoteOp { } } +pub mod tokio_epoll_uring { + use metrics::UIntGauge; + + pub struct Collector { + descs: Vec, + systems_created: UIntGauge, + systems_destroyed: UIntGauge, + } + + const NMETRICS: usize = 2; + + impl metrics::core::Collector for Collector { + fn desc(&self) -> Vec<&metrics::core::Desc> { + self.descs.iter().collect() + } + + fn collect(&self) -> Vec { + let mut mfs = Vec::with_capacity(NMETRICS); + let tokio_epoll_uring::metrics::Metrics { + systems_created, + systems_destroyed, + } = tokio_epoll_uring::metrics::global(); + self.systems_created.set(systems_created); + mfs.extend(self.systems_created.collect()); + self.systems_destroyed.set(systems_destroyed); + mfs.extend(self.systems_destroyed.collect()); + mfs + } + } + + impl Collector { + #[allow(clippy::new_without_default)] + pub fn new() -> Self { + let mut descs = Vec::new(); + + let systems_created = UIntGauge::new( + "pageserver_tokio_epoll_uring_systems_created", + 
"counter of tokio-epoll-uring systems that were created", + ) + .unwrap(); + descs.extend( + metrics::core::Collector::desc(&systems_created) + .into_iter() + .cloned(), + ); + + let systems_destroyed = UIntGauge::new( + "pageserver_tokio_epoll_uring_systems_destroyed", + "counter of tokio-epoll-uring systems that were destroyed", + ) + .unwrap(); + descs.extend( + metrics::core::Collector::desc(&systems_destroyed) + .into_iter() + .cloned(), + ); + + Self { + descs, + systems_created, + systems_destroyed, + } + } + } +} + +pub(crate) mod tenant_throttling { + use metrics::{register_int_counter_vec, IntCounter}; + use once_cell::sync::Lazy; + + use crate::tenant::{self, throttle::Metric}; + + pub(crate) struct TimelineGet { + wait_time: IntCounter, + count: IntCounter, + } + + pub(crate) static TIMELINE_GET: Lazy = Lazy::new(|| { + static WAIT_USECS: Lazy = Lazy::new(|| { + register_int_counter_vec!( + "pageserver_tenant_throttling_wait_usecs_sum_global", + "Sum of microseconds that tenants spent waiting for a tenant throttle of a given kind.", + &["kind"] + ) + .unwrap() + }); + + static WAIT_COUNT: Lazy = Lazy::new(|| { + register_int_counter_vec!( + "pageserver_tenant_throttling_count_global", + "Count of tenant throttlings, by kind of throttle.", + &["kind"] + ) + .unwrap() + }); + + let kind = "timeline_get"; + TimelineGet { + wait_time: WAIT_USECS.with_label_values(&[kind]), + count: WAIT_COUNT.with_label_values(&[kind]), + } + }); + + impl Metric for &'static TimelineGet { + #[inline(always)] + fn observe_throttling( + &self, + tenant::throttle::Observation { wait_time }: &tenant::throttle::Observation, + ) { + let val = u64::try_from(wait_time.as_micros()).unwrap(); + self.wait_time.inc_by(val); + self.count.inc(); + } + } +} + +pub(crate) mod disk_usage_based_eviction { + use super::*; + + pub(crate) struct Metrics { + pub(crate) tenant_collection_time: Histogram, + pub(crate) tenant_layer_count: Histogram, + pub(crate) layers_collected: IntCounter, + 
pub(crate) layers_selected: IntCounter, + pub(crate) layers_evicted: IntCounter, + } + + impl Default for Metrics { + fn default() -> Self { + let tenant_collection_time = register_histogram!( + "pageserver_disk_usage_based_eviction_tenant_collection_seconds", + "Time spent collecting layers from a tenant -- not normalized by collected layer amount", + vec![0.001, 0.005, 0.01, 0.05, 0.1, 0.5, 1.0, 5.0, 10.0] + ) + .unwrap(); + + let tenant_layer_count = register_histogram!( + "pageserver_disk_usage_based_eviction_tenant_collected_layers", + "Amount of layers gathered from a tenant", + vec![5.0, 50.0, 500.0, 5000.0, 50000.0] + ) + .unwrap(); + + let layers_collected = register_int_counter!( + "pageserver_disk_usage_based_eviction_collected_layers_total", + "Amount of layers collected" + ) + .unwrap(); + + let layers_selected = register_int_counter!( + "pageserver_disk_usage_based_eviction_select_layers_total", + "Amount of layers selected" + ) + .unwrap(); + + let layers_evicted = register_int_counter!( + "pageserver_disk_usage_based_eviction_evicted_layers_total", + "Amount of layers successfully evicted" + ) + .unwrap(); + + Self { + tenant_collection_time, + tenant_layer_count, + layers_collected, + layers_selected, + layers_evicted, + } + } + } + + pub(crate) static METRICS: Lazy = Lazy::new(Metrics::default); +} + pub fn preinitialize_metrics() { // Python tests need these and on some we do alerting. // @@ -2425,6 +2672,13 @@ pub fn preinitialize_metrics() { Lazy::force(&TENANT_MANAGER); Lazy::force(&crate::tenant::storage_layer::layer::LAYER_IMPL_METRICS); + Lazy::force(&disk_usage_based_eviction::METRICS); + + for state_name in pageserver_api::models::TenantState::VARIANTS { + // initialize the metric for all gauges, otherwise the time series might seemingly show + // values from last restart. 
+ TENANT_STATE_METRIC.with_label_values(&[state_name]).set(0); + } // countervecs [&BACKGROUND_LOOP_PERIOD_OVERRUN_COUNT] @@ -2452,4 +2706,5 @@ pub fn preinitialize_metrics() { // Custom Lazy::force(&RECONSTRUCT_TIME); + Lazy::force(&tenant_throttling::TIMELINE_GET); } diff --git a/pageserver/src/page_cache.rs b/pageserver/src/page_cache.rs index 28d2584bf4..529fb9bb07 100644 --- a/pageserver/src/page_cache.rs +++ b/pageserver/src/page_cache.rs @@ -73,7 +73,6 @@ use std::{ collections::{hash_map::Entry, HashMap}, - convert::TryInto, sync::{ atomic::{AtomicU64, AtomicU8, AtomicUsize, Ordering}, Arc, Weak, @@ -262,7 +261,9 @@ pub struct PageCache { size_metrics: &'static PageCacheSizeMetrics, } -struct PinnedSlotsPermit(tokio::sync::OwnedSemaphorePermit); +struct PinnedSlotsPermit { + _permit: tokio::sync::OwnedSemaphorePermit, +} /// /// PageReadGuard is a "lease" on a buffer, for reading. The page is kept locked @@ -558,9 +559,9 @@ impl PageCache { ) .await { - Ok(res) => Ok(PinnedSlotsPermit( - res.expect("this semaphore is never closed"), - )), + Ok(res) => Ok(PinnedSlotsPermit { + _permit: res.expect("this semaphore is never closed"), + }), Err(_timeout) => { crate::metrics::page_cache_errors_inc( crate::metrics::PageCacheErrorKind::AcquirePinnedSlotTimeout, diff --git a/pageserver/src/page_service.rs b/pageserver/src/page_service.rs index a8a3487b4e..f3ceb7d3e6 100644 --- a/pageserver/src/page_service.rs +++ b/pageserver/src/page_service.rs @@ -22,11 +22,12 @@ use pageserver_api::models::{ PagestreamBeMessage, PagestreamDbSizeRequest, PagestreamDbSizeResponse, PagestreamErrorResponse, PagestreamExistsRequest, PagestreamExistsResponse, PagestreamFeMessage, PagestreamGetPageRequest, PagestreamGetPageResponse, - PagestreamNblocksRequest, PagestreamNblocksResponse, + PagestreamGetSlruSegmentRequest, PagestreamGetSlruSegmentResponse, PagestreamNblocksRequest, + PagestreamNblocksResponse, }; use pageserver_api::shard::ShardIndex; -use 
pageserver_api::shard::{ShardCount, ShardNumber}; -use postgres_backend::{self, is_expected_io_error, AuthType, PostgresBackend, QueryError}; +use pageserver_api::shard::ShardNumber; +use postgres_backend::{is_expected_io_error, AuthType, PostgresBackend, QueryError}; use pq_proto::framed::ConnectionError; use pq_proto::FeStartupPacket; use pq_proto::{BeMessage, FeMessage, RowDescriptor}; @@ -43,7 +44,6 @@ use tokio::io::AsyncWriteExt; use tokio::io::{AsyncRead, AsyncWrite}; use tokio_util::io::StreamReader; use tokio_util::sync::CancellationToken; -use tracing::field; use tracing::*; use utils::id::ConnectionId; use utils::sync::gate::GateGuard; @@ -62,9 +62,10 @@ use crate::import_datadir::import_wal_from_tar; use crate::metrics; use crate::metrics::LIVE_CONNECTIONS_COUNT; use crate::pgdatadir_mapping::Version; +use crate::span::debug_assert_current_span_has_tenant_and_timeline_id; +use crate::span::debug_assert_current_span_has_tenant_and_timeline_id_no_shard_id; use crate::task_mgr; use crate::task_mgr::TaskKind; -use crate::tenant::debug_assert_current_span_has_tenant_and_timeline_id; use crate::tenant::mgr; use crate::tenant::mgr::get_active_tenant_with_timeout; use crate::tenant::mgr::GetActiveTenantError; @@ -74,8 +75,8 @@ use crate::tenant::GetTimelineError; use crate::tenant::PageReconstructError; use crate::tenant::Timeline; use crate::trace::Tracer; - use pageserver_api::key::rel_block_to_key; +use pageserver_api::reltag::SlruKind; use postgres_ffi::pg_constants::DEFAULTTABLESPACE_OID; use postgres_ffi::BLCKSZ; @@ -89,8 +90,8 @@ const ACTIVE_TENANT_TIMEOUT: Duration = Duration::from_millis(30000); /// `tokio_tar` already read the first such block. Read the second all-zeros block, /// and check that there is no more data after the EOF marker. /// -/// XXX: Currently, any trailing data after the EOF marker prints a warning. -/// Perhaps it should be a hard error? 
+/// 'tar' command can also write extra blocks of zeros, up to a record +/// size, controlled by the --record-size argument. Ignore them too. async fn read_tar_eof(mut reader: (impl AsyncRead + Unpin)) -> anyhow::Result<()> { use tokio::io::AsyncReadExt; let mut buf = [0u8; 512]; @@ -111,17 +112,24 @@ async fn read_tar_eof(mut reader: (impl AsyncRead + Unpin)) -> anyhow::Result<() anyhow::bail!("invalid tar EOF marker"); } - // Drain any data after the EOF marker + // Drain any extra zero-blocks after the EOF marker let mut trailing_bytes = 0; + let mut seen_nonzero_bytes = false; loop { let nbytes = reader.read(&mut buf).await?; trailing_bytes += nbytes; + if !buf.iter().all(|&x| x == 0) { + seen_nonzero_bytes = true; + } if nbytes == 0 { break; } } - if trailing_bytes > 0 { - warn!("ignored {trailing_bytes} unexpected bytes after the tar archive"); + if seen_nonzero_bytes { + anyhow::bail!("unexpected non-zero bytes after the tar archive"); + } + if trailing_bytes % 512 != 0 { + anyhow::bail!("unexpected number of zeros ({trailing_bytes}), not divisible by tar block size (512 bytes), after the tar archive"); } Ok(()) } @@ -368,6 +376,16 @@ impl From for PageStreamError { } } +impl From for QueryError { + fn from(value: WaitLsnError) -> Self { + match value { + e @ WaitLsnError::Timeout(_) => Self::Other(anyhow::Error::new(e)), + WaitLsnError::Shutdown => Self::Shutdown, + WaitLsnError::BadState => Self::Reconnect, + } + } +} + impl PageServerHandler { pub fn new( conf: &'static PageServerConf, @@ -538,7 +556,7 @@ impl PageServerHandler { where IO: AsyncRead + AsyncWrite + Send + Sync + Unpin, { - debug_assert_current_span_has_tenant_and_timeline_id(); + debug_assert_current_span_has_tenant_and_timeline_id_no_shard_id(); let tenant = mgr::get_active_tenant_with_timeout( tenant_id, @@ -620,6 +638,7 @@ impl PageServerHandler { ) } PagestreamFeMessage::GetPage(req) => { + // shard_id is filled in by the handler let span = 
tracing::info_span!("handle_get_page_at_lsn_request", rel = %req.rel, blkno = %req.blkno, req_lsn = %req.lsn); ( self.handle_get_page_at_lsn_request(tenant_id, timeline_id, &req, &ctx) @@ -637,6 +656,15 @@ impl PageServerHandler { span, ) } + PagestreamFeMessage::GetSlruSegment(req) => { + let span = tracing::info_span!("handle_get_slru_segment_request", kind = %req.kind, segno = %req.segno, req_lsn = %req.lsn); + ( + self.handle_get_slru_segment_request(tenant_id, timeline_id, &req, &ctx) + .instrument(span.clone()) + .await, + span, + ) + } }; match response { @@ -699,7 +727,7 @@ impl PageServerHandler { where IO: AsyncRead + AsyncWrite + Send + Sync + Unpin, { - debug_assert_current_span_has_tenant_and_timeline_id(); + debug_assert_current_span_has_tenant_and_timeline_id_no_shard_id(); // Create empty timeline info!("creating new timeline"); @@ -752,7 +780,7 @@ impl PageServerHandler { Ok(()) } - #[instrument(skip_all, fields(%start_lsn, %end_lsn))] + #[instrument(skip_all, fields(shard_id, %start_lsn, %end_lsn))] async fn handle_import_wal( &self, pgb: &mut PostgresBackend, @@ -765,8 +793,6 @@ impl PageServerHandler { where IO: AsyncRead + AsyncWrite + Send + Sync + Unpin, { - debug_assert_current_span_has_tenant_and_timeline_id(); - let timeline = self .get_active_tenant_timeline(tenant_id, timeline_id, ShardSelector::Zero) .await?; @@ -873,6 +899,7 @@ impl PageServerHandler { Ok(lsn) } + #[instrument(skip_all, fields(shard_id))] async fn handle_get_rel_exists_request( &mut self, tenant_id: TenantId, @@ -883,7 +910,7 @@ impl PageServerHandler { let timeline = self.get_timeline_shard_zero(tenant_id, timeline_id).await?; let _timer = timeline .query_metrics - .start_timer(metrics::SmgrQueryType::GetRelExists); + .start_timer(metrics::SmgrQueryType::GetRelExists, ctx); let latest_gc_cutoff_lsn = timeline.get_latest_gc_cutoff_lsn(); let lsn = @@ -899,6 +926,7 @@ impl PageServerHandler { })) } + #[instrument(skip_all, fields(shard_id))] async fn 
handle_get_nblocks_request( &mut self, tenant_id: TenantId, @@ -910,7 +938,7 @@ impl PageServerHandler { let _timer = timeline .query_metrics - .start_timer(metrics::SmgrQueryType::GetRelSize); + .start_timer(metrics::SmgrQueryType::GetRelSize, ctx); let latest_gc_cutoff_lsn = timeline.get_latest_gc_cutoff_lsn(); let lsn = @@ -926,6 +954,7 @@ impl PageServerHandler { })) } + #[instrument(skip_all, fields(shard_id))] async fn handle_db_size_request( &mut self, tenant_id: TenantId, @@ -937,7 +966,7 @@ impl PageServerHandler { let _timer = timeline .query_metrics - .start_timer(metrics::SmgrQueryType::GetDbSize); + .start_timer(metrics::SmgrQueryType::GetDbSize, ctx); let latest_gc_cutoff_lsn = timeline.get_latest_gc_cutoff_lsn(); let lsn = @@ -968,7 +997,7 @@ impl PageServerHandler { ) -> Result<&Arc, Key> { let key = if let Some((first_idx, first_timeline)) = self.shard_timelines.iter().next() { // Fastest path: single sharded case - if first_idx.shard_count < ShardCount(2) { + if first_idx.shard_count.count() == 1 { return Ok(&first_timeline.timeline); } @@ -1076,6 +1105,7 @@ impl PageServerHandler { } } + #[instrument(skip_all, fields(shard_id))] async fn handle_get_page_at_lsn_request( &mut self, tenant_id: TenantId, @@ -1084,7 +1114,10 @@ impl PageServerHandler { ctx: &RequestContext, ) -> Result { let timeline = match self.get_cached_timeline_for_page(req) { - Ok(tl) => tl, + Ok(tl) => { + set_tracing_field_shard_id(tl); + tl + } Err(key) => { match self .load_timeline_for_page(tenant_id, timeline_id, key) @@ -1111,7 +1144,7 @@ impl PageServerHandler { let _timer = timeline .query_metrics - .start_timer(metrics::SmgrQueryType::GetPageAtLsn); + .start_timer(metrics::SmgrQueryType::GetPageAtLsn, ctx); let latest_gc_cutoff_lsn = timeline.get_latest_gc_cutoff_lsn(); let lsn = @@ -1127,8 +1160,36 @@ impl PageServerHandler { })) } + #[instrument(skip_all, fields(shard_id))] + async fn handle_get_slru_segment_request( + &mut self, + tenant_id: TenantId, + timeline_id: 
TimelineId, + req: &PagestreamGetSlruSegmentRequest, + ctx: &RequestContext, + ) -> Result { + let timeline = self.get_timeline_shard_zero(tenant_id, timeline_id).await?; + + let _timer = timeline + .query_metrics + .start_timer(metrics::SmgrQueryType::GetSlruSegment, ctx); + + let latest_gc_cutoff_lsn = timeline.get_latest_gc_cutoff_lsn(); + let lsn = + Self::wait_or_get_last_lsn(timeline, req.lsn, req.latest, &latest_gc_cutoff_lsn, ctx) + .await?; + + let kind = SlruKind::from_repr(req.kind) + .ok_or(PageStreamError::BadRequest("invalid SLRU kind".into()))?; + let segment = timeline.get_slru_segment(kind, req.segno, lsn, ctx).await?; + + Ok(PagestreamBeMessage::GetSlruSegment( + PagestreamGetSlruSegmentResponse { segment }, + )) + } + #[allow(clippy::too_many_arguments)] - #[instrument(skip_all, fields(?lsn, ?prev_lsn, %full_backup))] + #[instrument(skip_all, fields(shard_id, ?lsn, ?prev_lsn, %full_backup))] async fn handle_basebackup_request( &mut self, pgb: &mut PostgresBackend, @@ -1138,13 +1199,11 @@ impl PageServerHandler { prev_lsn: Option, full_backup: bool, gzip: bool, - ctx: RequestContext, - ) -> anyhow::Result<()> + ctx: &RequestContext, + ) -> Result<(), QueryError> where IO: AsyncRead + AsyncWrite + Send + Sync + Unpin, { - debug_assert_current_span_has_tenant_and_timeline_id(); - let started = std::time::Instant::now(); // check that the timeline exists @@ -1155,7 +1214,7 @@ impl PageServerHandler { if let Some(lsn) = lsn { // Backup was requested at a particular LSN. Wait for it to arrive. 
info!("waiting for {}", lsn); - timeline.wait_lsn(lsn, &ctx).await?; + timeline.wait_lsn(lsn, ctx).await?; timeline .check_lsn_is_in_scope(lsn, &latest_gc_cutoff_lsn) .context("invalid basebackup lsn")?; @@ -1177,7 +1236,7 @@ impl PageServerHandler { lsn, prev_lsn, full_backup, - &ctx, + ctx, ) .await?; } else { @@ -1198,7 +1257,7 @@ impl PageServerHandler { lsn, prev_lsn, full_backup, - &ctx, + ctx, ) .await?; // shutdown the encoder to ensure the gzip footer is written @@ -1210,7 +1269,7 @@ impl PageServerHandler { lsn, prev_lsn, full_backup, - &ctx, + ctx, ) .await?; } @@ -1266,6 +1325,7 @@ impl PageServerHandler { .await .map_err(GetActiveTimelineError::Tenant)?; let timeline = tenant.get_timeline(timeline_id, true)?; + set_tracing_field_shard_id(&timeline); Ok(timeline) } } @@ -1389,25 +1449,25 @@ where false }; - ::metrics::metric_vec_duration::observe_async_block_duration_by_result( - &*metrics::BASEBACKUP_QUERY_TIME, - async move { - self.handle_basebackup_request( - pgb, - tenant_id, - timeline_id, - lsn, - None, - false, - gzip, - ctx, - ) - .await?; - pgb.write_message_noflush(&BeMessage::CommandComplete(b"SELECT 1"))?; - anyhow::Ok(()) - }, - ) - .await?; + let metric_recording = metrics::BASEBACKUP_QUERY_TIME.start_recording(&ctx); + let res = async { + self.handle_basebackup_request( + pgb, + tenant_id, + timeline_id, + lsn, + None, + false, + gzip, + &ctx, + ) + .await?; + pgb.write_message_noflush(&BeMessage::CommandComplete(b"SELECT 1"))?; + Result::<(), QueryError>::Ok(()) + } + .await; + metric_recording.observe(&res); + res?; } // return pair of prev_lsn and last_lsn else if query_string.starts_with("get_last_record_rlsn ") { @@ -1430,21 +1490,29 @@ where .record("timeline_id", field::display(timeline_id)); self.check_permission(Some(tenant_id))?; - let timeline = self - .get_active_tenant_timeline(tenant_id, timeline_id, ShardSelector::Zero) - .await?; + async { + let timeline = self + .get_active_tenant_timeline(tenant_id, timeline_id, 
ShardSelector::Zero) + .await?; - let end_of_timeline = timeline.get_last_record_rlsn(); + let end_of_timeline = timeline.get_last_record_rlsn(); - pgb.write_message_noflush(&BeMessage::RowDescription(&[ - RowDescriptor::text_col(b"prev_lsn"), - RowDescriptor::text_col(b"last_lsn"), - ]))? - .write_message_noflush(&BeMessage::DataRow(&[ - Some(end_of_timeline.prev.to_string().as_bytes()), - Some(end_of_timeline.last.to_string().as_bytes()), - ]))? - .write_message_noflush(&BeMessage::CommandComplete(b"SELECT 1"))?; + pgb.write_message_noflush(&BeMessage::RowDescription(&[ + RowDescriptor::text_col(b"prev_lsn"), + RowDescriptor::text_col(b"last_lsn"), + ]))? + .write_message_noflush(&BeMessage::DataRow(&[ + Some(end_of_timeline.prev.to_string().as_bytes()), + Some(end_of_timeline.last.to_string().as_bytes()), + ]))? + .write_message_noflush(&BeMessage::CommandComplete(b"SELECT 1"))?; + anyhow::Ok(()) + } + .instrument(info_span!( + "handle_get_last_record_lsn", + shard_id = tracing::field::Empty + )) + .await?; } // same as basebackup, but result includes relational data as well else if query_string.starts_with("fullbackup ") { @@ -1495,7 +1563,7 @@ where prev_lsn, true, false, - ctx, + &ctx, ) .await?; pgb.write_message_noflush(&BeMessage::CommandComplete(b"SELECT 1"))?; @@ -1678,6 +1746,7 @@ impl From for QueryError { | GetActiveTenantError::WillNotBecomeActive(TenantState::Stopping { .. 
}) => { QueryError::Shutdown } + e @ GetActiveTenantError::NotFound(_) => QueryError::NotFound(format!("{e}").into()), e => QueryError::Other(anyhow::anyhow!(e)), } } @@ -1700,3 +1769,12 @@ impl From for QueryError { } } } + +fn set_tracing_field_shard_id(timeline: &Timeline) { + debug_assert_current_span_has_tenant_and_timeline_id_no_shard_id(); + tracing::Span::current().record( + "shard_id", + tracing::field::display(timeline.tenant_shard_id.shard_slug()), + ); + debug_assert_current_span_has_tenant_and_timeline_id(); +} diff --git a/pageserver/src/pgdatadir_mapping.rs b/pageserver/src/pgdatadir_mapping.rs index b65fe1eddd..727650a5a5 100644 --- a/pageserver/src/pgdatadir_mapping.rs +++ b/pageserver/src/pgdatadir_mapping.rs @@ -10,9 +10,12 @@ use super::tenant::{PageReconstructError, Timeline}; use crate::context::RequestContext; use crate::keyspace::{KeySpace, KeySpaceAccum}; use crate::repository::*; +use crate::span::debug_assert_current_span_has_tenant_and_timeline_id_no_shard_id; use crate::walrecord::NeonWalRecord; use anyhow::{ensure, Context}; -use bytes::{Buf, Bytes}; +use bytes::{Buf, Bytes, BytesMut}; +use enum_map::Enum; +use itertools::Itertools; use pageserver_api::key::{ dbdir_key_range, is_rel_block_key, is_slru_block_key, rel_block_to_key, rel_dir_to_key, rel_key_range, rel_size_to_key, relmap_file_key, slru_block_to_key, slru_dir_to_key, @@ -33,6 +36,8 @@ use tracing::{debug, trace, warn}; use utils::bin_ser::DeserializeError; use utils::{bin_ser::BeSer, lsn::Lsn}; +const MAX_AUX_FILE_DELTAS: usize = 1024; + #[derive(Debug)] pub enum LsnForTimestamp { /// Found commits both before and after the given timestamp @@ -154,6 +159,7 @@ impl Timeline { pending_updates: HashMap::new(), pending_deletions: Vec::new(), pending_nblocks: 0, + pending_directory_entries: Vec::new(), lsn, } } @@ -321,6 +327,27 @@ impl Timeline { } } + /// Get the whole SLRU segment + pub(crate) async fn get_slru_segment( + &self, + kind: SlruKind, + segno: u32, + lsn: Lsn, + 
ctx: &RequestContext, + ) -> Result { + let n_blocks = self + .get_slru_segment_size(kind, segno, Version::Lsn(lsn), ctx) + .await?; + let mut segment = BytesMut::with_capacity(n_blocks as usize * BLCKSZ as usize); + for blkno in 0..n_blocks { + let block = self + .get_slru_page_at_lsn(kind, segno, blkno, lsn, ctx) + .await?; + segment.extend_from_slice(&block[..BLCKSZ as usize]); + } + Ok(segment.freeze()) + } + /// Look up given SLRU page version. pub(crate) async fn get_slru_page_at_lsn( &self, @@ -678,7 +705,7 @@ impl Timeline { lsn: Lsn, ctx: &RequestContext, ) -> Result { - crate::tenant::debug_assert_current_span_has_tenant_and_timeline_id(); + debug_assert_current_span_has_tenant_and_timeline_id_no_shard_id(); // Fetch list of database dirs and iterate them let buf = self.get(DBDIR_KEY, lsn, ctx).await?; @@ -846,6 +873,10 @@ pub struct DatadirModification<'a> { pending_updates: HashMap>, pending_deletions: Vec<(Range, Lsn)>, pending_nblocks: i64, + + /// For special "directory" keys that store key-value maps, track the size of the map + /// if it was updated in this modification. 
+ pending_directory_entries: Vec<(DirectoryKind, usize)>, } impl<'a> DatadirModification<'a> { @@ -877,6 +908,7 @@ impl<'a> DatadirModification<'a> { let buf = DbDirectory::ser(&DbDirectory { dbdirs: HashMap::new(), })?; + self.pending_directory_entries.push((DirectoryKind::Db, 0)); self.put(DBDIR_KEY, Value::Image(buf.into())); // Create AuxFilesDirectory @@ -885,16 +917,24 @@ impl<'a> DatadirModification<'a> { let buf = TwoPhaseDirectory::ser(&TwoPhaseDirectory { xids: HashSet::new(), })?; + self.pending_directory_entries + .push((DirectoryKind::TwoPhase, 0)); self.put(TWOPHASEDIR_KEY, Value::Image(buf.into())); let buf: Bytes = SlruSegmentDirectory::ser(&SlruSegmentDirectory::default())?.into(); let empty_dir = Value::Image(buf); self.put(slru_dir_to_key(SlruKind::Clog), empty_dir.clone()); + self.pending_directory_entries + .push((DirectoryKind::SlruSegment(SlruKind::Clog), 0)); self.put( slru_dir_to_key(SlruKind::MultiXactMembers), empty_dir.clone(), ); + self.pending_directory_entries + .push((DirectoryKind::SlruSegment(SlruKind::MultiXactMembers), 0)); self.put(slru_dir_to_key(SlruKind::MultiXactOffsets), empty_dir); + self.pending_directory_entries + .push((DirectoryKind::SlruSegment(SlruKind::MultiXactOffsets), 0)); Ok(()) } @@ -995,6 +1035,7 @@ impl<'a> DatadirModification<'a> { let buf = RelDirectory::ser(&RelDirectory { rels: HashSet::new(), })?; + self.pending_directory_entries.push((DirectoryKind::Rel, 0)); self.put( rel_dir_to_key(spcnode, dbnode), Value::Image(Bytes::from(buf)), @@ -1017,6 +1058,8 @@ impl<'a> DatadirModification<'a> { if !dir.xids.insert(xid) { anyhow::bail!("twophase file for xid {} already exists", xid); } + self.pending_directory_entries + .push((DirectoryKind::TwoPhase, dir.xids.len())); self.put( TWOPHASEDIR_KEY, Value::Image(Bytes::from(TwoPhaseDirectory::ser(&dir)?)), @@ -1052,6 +1095,8 @@ impl<'a> DatadirModification<'a> { let mut dir = DbDirectory::des(&buf)?; if dir.dbdirs.remove(&(spcnode, dbnode)).is_some() { let buf =
DbDirectory::ser(&dir)?; + self.pending_directory_entries + .push((DirectoryKind::Db, dir.dbdirs.len())); self.put(DBDIR_KEY, Value::Image(buf.into())); } else { warn!( @@ -1089,6 +1134,8 @@ impl<'a> DatadirModification<'a> { // Didn't exist. Update dbdir dbdir.dbdirs.insert((rel.spcnode, rel.dbnode), false); let buf = DbDirectory::ser(&dbdir).context("serialize db")?; + self.pending_directory_entries + .push((DirectoryKind::Db, dbdir.dbdirs.len())); self.put(DBDIR_KEY, Value::Image(buf.into())); // and create the RelDirectory @@ -1103,6 +1150,10 @@ impl<'a> DatadirModification<'a> { if !rel_dir.rels.insert((rel.relnode, rel.forknum)) { return Err(RelationError::AlreadyExists); } + + self.pending_directory_entries + .push((DirectoryKind::Rel, rel_dir.rels.len())); + self.put( rel_dir_key, Value::Image(Bytes::from( @@ -1194,6 +1245,9 @@ impl<'a> DatadirModification<'a> { let buf = self.get(dir_key, ctx).await?; let mut dir = RelDirectory::des(&buf)?; if dir.rels.remove(&(rel.relnode, rel.forknum)) { + // Record the entry count only after the relation has actually been removed. + self.pending_directory_entries + .push((DirectoryKind::Rel, dir.rels.len())); self.put(dir_key, Value::Image(Bytes::from(RelDirectory::ser(&dir)?))); } else { @@ -1229,6 +1283,8 @@ impl<'a> DatadirModification<'a> { if !dir.segments.insert(segno) { anyhow::bail!("slru segment {kind:?}/{segno} already exists"); } + self.pending_directory_entries + .push((DirectoryKind::SlruSegment(kind), dir.segments.len())); self.put( dir_key, Value::Image(Bytes::from(SlruSegmentDirectory::ser(&dir)?)), @@ -1273,6 +1329,8 @@ impl<'a> DatadirModification<'a> { if !dir.segments.remove(&segno) { warn!("slru segment {:?}/{} does not exist", kind, segno); } + self.pending_directory_entries + .push((DirectoryKind::SlruSegment(kind), dir.segments.len())); self.put( dir_key, Value::Image(Bytes::from(SlruSegmentDirectory::ser(&dir)?)), @@ -1303,6 +1361,8 @@ impl<'a> DatadirModification<'a> { if !dir.xids.remove(&xid) { warn!("twophase file for xid {} does not exist", xid); } +
self.pending_directory_entries + .push((DirectoryKind::TwoPhase, dir.xids.len())); self.put( TWOPHASEDIR_KEY, Value::Image(Bytes::from(TwoPhaseDirectory::ser(&dir)?)), @@ -1318,6 +1378,8 @@ impl<'a> DatadirModification<'a> { let buf = AuxFilesDirectory::ser(&AuxFilesDirectory { files: HashMap::new(), })?; + self.pending_directory_entries + .push((DirectoryKind::AuxFiles, 0)); self.put(AUX_FILES_KEY, Value::Image(Bytes::from(buf))); Ok(()) } @@ -1328,28 +1390,86 @@ impl<'a> DatadirModification<'a> { content: &[u8], ctx: &RequestContext, ) -> anyhow::Result<()> { - let mut dir = match self.get(AUX_FILES_KEY, ctx).await { - Ok(buf) => AuxFilesDirectory::des(&buf)?, - Err(e) => { - // This is expected: historical databases do not have the key. - debug!("Failed to get info about AUX files: {}", e); - AuxFilesDirectory { - files: HashMap::new(), + let file_path = path.to_string(); + let content = if content.is_empty() { + None + } else { + Some(Bytes::copy_from_slice(content)) + }; + + let n_files; + let mut aux_files = self.tline.aux_files.lock().await; + if let Some(mut dir) = aux_files.dir.take() { + // We already updated aux files in `self`: emit a delta and update our latest value + dir.upsert(file_path.clone(), content.clone()); + n_files = dir.files.len(); + if aux_files.n_deltas == MAX_AUX_FILE_DELTAS { + self.put( + AUX_FILES_KEY, + Value::Image(Bytes::from( + AuxFilesDirectory::ser(&dir).context("serialize")?, + )), + ); + aux_files.n_deltas = 0; + } else { + self.put( + AUX_FILES_KEY, + Value::WalRecord(NeonWalRecord::AuxFile { file_path, content }), + ); + aux_files.n_deltas += 1; + } + aux_files.dir = Some(dir); + } else { + // Check if the AUX_FILES_KEY is initialized + match self.get(AUX_FILES_KEY, ctx).await { + Ok(dir_bytes) => { + let mut dir = AuxFilesDirectory::des(&dir_bytes)?; + // Key is already set, we may append a delta + self.put( + AUX_FILES_KEY, + Value::WalRecord(NeonWalRecord::AuxFile { + file_path: file_path.clone(), + content: 
content.clone(), + }), + ); + dir.upsert(file_path, content); + n_files = dir.files.len(); + aux_files.dir = Some(dir); + } + Err( + e @ (PageReconstructError::AncestorStopping(_) + | PageReconstructError::Cancelled + | PageReconstructError::AncestorLsnTimeout(_)), + ) => { + // Important that we do not interpret a shutdown error as "not found" and thereby + // reset the map. + return Err(e.into()); + } + // FIXME: PageReconstructError doesn't have an explicit variant for key-not-found, so + // we are assuming that all _other_ possible errors represents a missing key. If some + // other error occurs, we may incorrectly reset the map of aux files. + Err(PageReconstructError::Other(_) | PageReconstructError::WalRedo(_)) => { + // Key is missing, we must insert an image as the basis for subsequent deltas. + + let mut dir = AuxFilesDirectory { + files: HashMap::new(), + }; + dir.upsert(file_path, content); + self.put( + AUX_FILES_KEY, + Value::Image(Bytes::from( + AuxFilesDirectory::ser(&dir).context("serialize")?, + )), + ); + n_files = 1; + aux_files.dir = Some(dir); } } - }; - let path = path.to_string(); - if content.is_empty() { - dir.files.remove(&path); - } else { - dir.files.insert(path, Bytes::copy_from_slice(content)); } - self.put( - AUX_FILES_KEY, - Value::Image(Bytes::from( - AuxFilesDirectory::ser(&dir).context("serialize")?, - )), - ); + + self.pending_directory_entries + .push((DirectoryKind::AuxFiles, n_files)); + Ok(()) } @@ -1379,7 +1499,7 @@ impl<'a> DatadirModification<'a> { return Ok(()); } - let writer = self.tline.writer().await; + let mut writer = self.tline.writer().await; // Flush relation and SLRU data blocks, keep metadata. 
let mut retained_pending_updates = HashMap::<_, Vec<_>>::new(); @@ -1405,6 +1525,10 @@ impl<'a> DatadirModification<'a> { self.pending_nblocks = 0; } + for (kind, count) in std::mem::take(&mut self.pending_directory_entries) { + writer.update_directory_entries_count(kind, count as u64); + } + Ok(()) } @@ -1414,14 +1538,22 @@ impl<'a> DatadirModification<'a> { /// All the modifications in this atomic update are stamped by the specified LSN. /// pub async fn commit(&mut self, ctx: &RequestContext) -> anyhow::Result<()> { - let writer = self.tline.writer().await; + let mut writer = self.tline.writer().await; let pending_nblocks = self.pending_nblocks; self.pending_nblocks = 0; if !self.pending_updates.is_empty() { - writer.put_batch(&self.pending_updates, ctx).await?; - self.pending_updates.clear(); + // The put_batch call below expects the inputs to be sorted by Lsn, + // so we do that first. + let lsn_ordered_batch: Vec<(Key, Lsn, Value)> = self + .pending_updates + .drain() + .map(|(key, vals)| vals.into_iter().map(move |(lsn, val)| (key, lsn, val))) + .kmerge_by(|lhs, rhs| lhs.1 .0 < rhs.1 .0) + .collect(); + + writer.put_batch(lsn_ordered_batch, ctx).await?; } if !self.pending_deletions.is_empty() { @@ -1442,6 +1574,10 @@ impl<'a> DatadirModification<'a> { writer.update_current_logical_size(pending_nblocks * i64::from(BLCKSZ)); } + for (kind, count) in std::mem::take(&mut self.pending_directory_entries) { + writer.update_directory_entries_count(kind, count as u64); + } + Ok(()) } @@ -1550,9 +1686,19 @@ struct RelDirectory { rels: HashSet<(Oid, u8)>, } -#[derive(Debug, Serialize, Deserialize, Default)] -struct AuxFilesDirectory { - files: HashMap, +#[derive(Debug, Serialize, Deserialize, Default, PartialEq)] +pub(crate) struct AuxFilesDirectory { + pub(crate) files: HashMap, +} + +impl AuxFilesDirectory { + pub(crate) fn upsert(&mut self, key: String, value: Option) { + if let Some(value) = value { + self.files.insert(key, value); + } else { +
self.files.remove(&key); + } + } } #[derive(Debug, Serialize, Deserialize)] @@ -1566,13 +1712,82 @@ struct SlruSegmentDirectory { segments: HashSet, } +#[derive(Copy, Clone, PartialEq, Eq, Debug, enum_map::Enum)] +#[repr(u8)] +pub(crate) enum DirectoryKind { + Db, + TwoPhase, + Rel, + AuxFiles, + SlruSegment(SlruKind), +} + +impl DirectoryKind { + pub(crate) const KINDS_NUM: usize = ::LENGTH; + pub(crate) fn offset(&self) -> usize { + self.into_usize() + } +} + static ZERO_PAGE: Bytes = Bytes::from_static(&[0u8; BLCKSZ as usize]); #[allow(clippy::bool_assert_comparison)] #[cfg(test)] mod tests { - //use super::repo_harness::*; - //use super::*; + use hex_literal::hex; + use utils::id::TimelineId; + + use super::*; + + use crate::{tenant::harness::TenantHarness, DEFAULT_PG_VERSION}; + + /// Test a round trip of aux file updates, from DatadirModification to reading back from the Timeline + #[tokio::test] + async fn aux_files_round_trip() -> anyhow::Result<()> { + let name = "aux_files_round_trip"; + let harness = TenantHarness::create(name)?; + + pub const TIMELINE_ID: TimelineId = + TimelineId::from_array(hex!("11223344556677881122334455667788")); + + let (tenant, ctx) = harness.load().await; + let tline = tenant + .create_empty_timeline(TIMELINE_ID, Lsn(0), DEFAULT_PG_VERSION, &ctx) + .await?; + let tline = tline.raw_timeline().unwrap(); + + // First modification: insert two keys + let mut modification = tline.begin_modification(Lsn(0x1000)); + modification.put_file("foo/bar1", b"content1", &ctx).await?; + modification.set_lsn(Lsn(0x1008))?; + modification.put_file("foo/bar2", b"content2", &ctx).await?; + modification.commit(&ctx).await?; + let expect_1008 = HashMap::from([ + ("foo/bar1".to_string(), Bytes::from_static(b"content1")), + ("foo/bar2".to_string(), Bytes::from_static(b"content2")), + ]); + + let readback = tline.list_aux_files(Lsn(0x1008), &ctx).await?; + assert_eq!(readback, expect_1008); + + // Second modification: update one key, remove the other + 
let mut modification = tline.begin_modification(Lsn(0x2000)); + modification.put_file("foo/bar1", b"content3", &ctx).await?; + modification.set_lsn(Lsn(0x2008))?; + modification.put_file("foo/bar2", b"", &ctx).await?; + modification.commit(&ctx).await?; + let expect_2008 = + HashMap::from([("foo/bar1".to_string(), Bytes::from_static(b"content3"))]); + + let readback = tline.list_aux_files(Lsn(0x2008), &ctx).await?; + assert_eq!(readback, expect_2008); + + // Reading back in time works + let readback = tline.list_aux_files(Lsn(0x1008), &ctx).await?; + assert_eq!(readback, expect_1008); + + Ok(()) + } /* fn assert_current_logical_size(timeline: &DatadirTimeline, lsn: Lsn) { diff --git a/pageserver/src/repository.rs b/pageserver/src/repository.rs index c726139524..9959d105eb 100644 --- a/pageserver/src/repository.rs +++ b/pageserver/src/repository.rs @@ -37,7 +37,6 @@ impl Value { mod test { use super::*; - use bytes::Bytes; use utils::bin_ser::BeSer; macro_rules! roundtrip { diff --git a/pageserver/src/span.rs b/pageserver/src/span.rs new file mode 100644 index 0000000000..91fee50514 --- /dev/null +++ b/pageserver/src/span.rs @@ -0,0 +1,43 @@ +use utils::tracing_span_assert::check_fields_present; + +mod extractors { + use utils::tracing_span_assert::ConstExtractor; + + pub(super) const TENANT_ID: ConstExtractor = ConstExtractor::new("tenant_id"); + pub(super) const SHARD_ID: ConstExtractor = ConstExtractor::new("shard_id"); + pub(super) const TIMELINE_ID: ConstExtractor = ConstExtractor::new("timeline_id"); +} + +#[track_caller] +pub(crate) fn debug_assert_current_span_has_tenant_id() { + if cfg!(debug_assertions) { + if let Err(missing) = check_fields_present!([&extractors::TENANT_ID, &extractors::SHARD_ID]) + { + panic!("missing extractors: {missing:?}") + } + } +} + +#[track_caller] +pub(crate) fn debug_assert_current_span_has_tenant_and_timeline_id() { + if cfg!(debug_assertions) { + if let Err(missing) = check_fields_present!([ + &extractors::TENANT_ID, + 
&extractors::SHARD_ID, + &extractors::TIMELINE_ID, + ]) { + panic!("missing extractors: {missing:?}") + } + } +} + +#[track_caller] +pub(crate) fn debug_assert_current_span_has_tenant_and_timeline_id_no_shard_id() { + if cfg!(debug_assertions) { + if let Err(missing) = + check_fields_present!([&extractors::TENANT_ID, &extractors::TIMELINE_ID,]) + { + panic!("missing extractors: {missing:?}") + } + } +} diff --git a/pageserver/src/task_mgr.rs b/pageserver/src/task_mgr.rs index 5a06a97525..275a72c0b0 100644 --- a/pageserver/src/task_mgr.rs +++ b/pageserver/src/task_mgr.rs @@ -30,10 +30,6 @@ //! only a single tenant or timeline. //! -// Clippy 1.60 incorrectly complains about the tokio::task_local!() macro. -// Silence it. See https://github.com/rust-lang/rust-clippy/issues/9224. -#![allow(clippy::declare_interior_mutable_const)] - use std::collections::HashMap; use std::fmt; use std::future::Future; @@ -192,6 +188,7 @@ task_local! { serde::Serialize, serde::Deserialize, strum_macros::IntoStaticStr, + strum_macros::EnumString, )] pub enum TaskKind { // Pageserver startup, i.e., `main` @@ -275,9 +272,6 @@ pub enum TaskKind { // Task that uploads a file to remote storage RemoteUploadTask, - // Task that downloads a file from remote storage - RemoteDownloadTask, - // task that handles the initial downloading of all tenants InitialLoad, @@ -312,7 +306,6 @@ struct MutableTaskState { } struct PageServerTask { - #[allow(dead_code)] // unused currently task_id: PageserverTaskId, kind: TaskKind, @@ -576,8 +569,8 @@ pub fn shutdown_token() -> CancellationToken { /// Has the current task been requested to shut down? 
pub fn is_shutdown_requested() -> bool { - if let Ok(cancel) = SHUTDOWN_TOKEN.try_with(|t| t.clone()) { - cancel.is_cancelled() + if let Ok(true_or_false) = SHUTDOWN_TOKEN.try_with(|t| t.is_cancelled()) { + true_or_false } else { if !cfg!(test) { warn!("is_shutdown_requested() called in an unexpected task or thread"); diff --git a/pageserver/src/tenant.rs b/pageserver/src/tenant.rs index 7bb5881aab..f0996328c0 100644 --- a/pageserver/src/tenant.rs +++ b/pageserver/src/tenant.rs @@ -20,14 +20,16 @@ use futures::FutureExt; use futures::StreamExt; use pageserver_api::models; use pageserver_api::models::TimelineState; +use pageserver_api::models::WalRedoManagerStatus; use pageserver_api::shard::ShardIdentity; +use pageserver_api::shard::ShardStripeSize; use pageserver_api::shard::TenantShardId; use remote_storage::DownloadError; use remote_storage::GenericRemoteStorage; +use remote_storage::TimeoutOrCancel; use std::fmt; use storage_broker::BrokerClientChannel; use tokio::io::BufReader; -use tokio::runtime::Handle; use tokio::sync::watch; use tokio::task::JoinSet; use tokio_util::sync::CancellationToken; @@ -47,11 +49,11 @@ use self::config::AttachmentMode; use self::config::LocationConf; use self::config::TenantConf; use self::delete::DeleteTenantFlow; -use self::metadata::LoadMetadataError; use self::metadata::TimelineMetadata; use self::mgr::GetActiveTenantError; use self::mgr::GetTenantError; use self::mgr::TenantsMap; +use self::remote_timeline_client::upload::upload_index_part; use self::remote_timeline_client::RemoteTimelineClient; use self::timeline::uninit::TimelineExclusionError; use self::timeline::uninit::TimelineUninitMark; @@ -66,13 +68,14 @@ use crate::deletion_queue::DeletionQueueError; use crate::import_datadir; use crate::is_uninit_mark; use crate::metrics::TENANT; -use crate::metrics::{remove_tenant_metrics, TENANT_STATE_METRIC, TENANT_SYNTHETIC_SIZE_METRIC}; +use crate::metrics::{ + remove_tenant_metrics, BROKEN_TENANTS_SET, TENANT_STATE_METRIC, 
TENANT_SYNTHETIC_SIZE_METRIC, +}; use crate::repository::GcResult; use crate::task_mgr; use crate::task_mgr::TaskKind; use crate::tenant::config::LocationMode; use crate::tenant::config::TenantConfOpt; -use crate::tenant::metadata::load_metadata; pub use crate::tenant::remote_timeline_client::index::IndexPart; use crate::tenant::remote_timeline_client::remote_initdb_archive_path; use crate::tenant::remote_timeline_client::MaybeDeletedIndexPart; @@ -89,7 +92,6 @@ use std::fmt::Debug; use std::fmt::Display; use std::fs; use std::fs::File; -use std::io; use std::ops::Bound::Included; use std::sync::atomic::AtomicU64; use std::sync::atomic::Ordering; @@ -97,6 +99,7 @@ use std::sync::Arc; use std::sync::{Mutex, RwLock}; use std::time::{Duration, Instant}; +use crate::span; use crate::tenant::timeline::delete::DeleteTimelineFlow; use crate::tenant::timeline::uninit::cleanup_timeline_directory; use crate::virtual_file::VirtualFile; @@ -107,7 +110,6 @@ pub use pageserver_api::models::TenantState; use tokio::sync::Semaphore; static INIT_DB_SEMAPHORE: Lazy = Lazy::new(|| Semaphore::new(8)); -use toml_edit; use utils::{ crashsafe, generation::Generation, @@ -143,14 +145,13 @@ macro_rules! 
pausable_failpoint { pub mod blob_io; pub mod block_io; +pub mod vectored_blob_io; pub mod disk_btree; pub(crate) mod ephemeral_file; pub mod layer_map; -mod span; pub mod metadata; -mod par_fsync; pub mod remote_timeline_client; pub mod storage_layer; @@ -165,11 +166,10 @@ pub(crate) mod timeline; pub mod size; -pub(crate) use timeline::span::debug_assert_current_span_has_tenant_and_timeline_id; -pub(crate) use timeline::{LogicalSizeCalculationCause, PageReconstructError, Timeline}; +pub(crate) mod throttle; -// re-export for use in remote_timeline_client.rs -pub use crate::tenant::metadata::save_metadata; +pub(crate) use crate::span::debug_assert_current_span_has_tenant_and_timeline_id; +pub(crate) use timeline::{LogicalSizeCalculationCause, PageReconstructError, Timeline}; // re-export for use in walreceiver pub use crate::tenant::timeline::WalReceiverInfo; @@ -204,7 +204,7 @@ impl AttachedTenantConf { match &location_conf.mode { LocationMode::Attached(attach_conf) => Ok(Self { tenant_conf: location_conf.tenant_conf, - location: attach_conf.clone(), + location: *attach_conf, }), LocationMode::Secondary(_) => { anyhow::bail!("Attempted to construct AttachedTenantConf from a LocationConf in secondary mode") @@ -226,7 +226,11 @@ pub(crate) struct TenantPreload { /// When we spawn a tenant, there is a special mode for tenant creation that /// avoids trying to read anything from remote storage. pub(crate) enum SpawnMode { - Normal, + /// Activate as soon as possible + Eager, + /// Lazy activation in the background, with the option to skip the queue if the need comes up + Lazy, + /// Tenant has been created during the lifetime of this process Create, } @@ -275,7 +279,7 @@ pub struct Tenant { // with timelines, which in turn may cause dropping replication connection, expiration of wait_for_lsn // timeout... 
gc_cs: tokio::sync::Mutex<()>, - walredo_mgr: Arc, + walredo_mgr: Option>, // provides access to timeline data sitting in the remote storage pub(crate) remote_storage: Option, @@ -303,6 +307,11 @@ pub struct Tenant { // Users of the Tenant such as the page service must take this Gate to avoid // trying to use a Tenant which is shutting down. pub(crate) gate: Gate, + + /// Throttle applied at the top of [`Timeline::get`]. + /// All [`Tenant::timelines`] of a given [`Tenant`] instance share the same [`throttle::Throttle`] instance. + pub(crate) timeline_get_throttle: + Arc>, } impl std::fmt::Debug for Tenant { @@ -364,6 +373,14 @@ impl WalRedoManager { } } } + + pub(crate) fn status(&self) -> Option { + match self { + WalRedoManager::Prod(m) => m.status(), + #[cfg(test)] + WalRedoManager::Test(_) => None, + } + } } #[derive(Debug, thiserror::Error, PartialEq, Eq)] @@ -475,11 +492,6 @@ impl From for InitdbError { } } -struct TenantDirectoryScan { - sorted_timelines_to_load: Vec<(TimelineId, TimelineMetadata)>, - timelines_to_resume_deletion: Vec<(TimelineId, Option)>, -} - enum CreateTimelineCause { Load, Delete, @@ -616,12 +628,15 @@ impl Tenant { deletion_queue_client, } = resources; + let attach_mode = attached_conf.location.attach_mode; + let generation = attached_conf.location.generation; + let tenant = Arc::new(Tenant::new( TenantState::Attaching, conf, attached_conf, shard_identity, - wal_redo_manager, + Some(wal_redo_manager), tenant_shard_id, remote_storage.clone(), deletion_queue_client, @@ -629,10 +644,10 @@ impl Tenant { // The attach task will carry a GateGuard, so that shutdown() reliably waits for it to drop out if // we shut down while attaching. 
- let Ok(attach_gate_guard) = tenant.gate.enter() else { - // We just created the Tenant: nothing else can have shut it down yet - unreachable!(); - }; + let attach_gate_guard = tenant + .gate + .enter() + .expect("We just created the Tenant: nothing else can have shut it down yet"); // Do all the hard work in the background let tenant_clone = Arc::clone(&tenant); @@ -645,6 +660,12 @@ impl Tenant { "attach tenant", false, async move { + + info!( + ?attach_mode, + "Attaching tenant" + ); + let _gate_guard = attach_gate_guard; // Is this tenant being spawned as part of process startup? @@ -682,41 +703,37 @@ impl Tenant { .and_then(|x| x.initial_tenant_load_remote.take()); enum AttachType<'a> { - // During pageserver startup, we are attaching this tenant lazily in the background - Warmup(tokio::sync::SemaphorePermit<'a>), - // During pageserver startup, we are attaching this tenant as soon as we can, - // because a client tried to access it. + /// We are attaching this tenant lazily in the background. + Warmup { + _permit: tokio::sync::SemaphorePermit<'a>, + during_startup: bool + }, + /// We are attaching this tenant as soon as we can, because for example an + /// endpoint tried to access it. OnDemand, - // During normal operations after startup, we are attaching a tenant. + /// During normal operations after startup, we are attaching a tenant, and + /// eager attach was requested. Normal, } - // Before doing any I/O, wait for either or: - // - A client to attempt to access to this tenant (on-demand loading) - // - A permit to become available in the warmup semaphore (background warmup) - // - // Some-ness of init_order is how we know if we're attaching during startup or later - // in process lifetime. 
- let attach_type = if init_order.is_some() { + let attach_type = if matches!(mode, SpawnMode::Lazy) { + // Before doing any I/O, wait for at least one of: + // - A client attempting to access to this tenant (on-demand loading) + // - A permit becoming available in the warmup semaphore (background warmup) + tokio::select!( - _ = tenant_clone.activate_now_sem.acquire() => { + permit = tenant_clone.activate_now_sem.acquire() => { + let _ = permit.expect("activate_now_sem is never closed"); tracing::info!("Activating tenant (on-demand)"); AttachType::OnDemand }, - permit_result = conf.concurrent_tenant_warmup.inner().acquire() => { - match permit_result { - Ok(p) => { - tracing::info!("Activating tenant (warmup)"); - AttachType::Warmup(p) - } - Err(_) => { - // This is unexpected: the warmup semaphore should stay alive - // for the lifetime of init_order. Log a warning and proceed. - tracing::warn!("warmup_limit semaphore unexpectedly closed"); - AttachType::Normal - } + permit = conf.concurrent_tenant_warmup.inner().acquire() => { + let _permit = permit.expect("concurrent_tenant_warmup semaphore is never closed"); + tracing::info!("Activating tenant (warmup)"); + AttachType::Warmup { + _permit, + during_startup: init_order.is_some() } - } _ = tenant_clone.cancel.cancelled() => { // This is safe, but should be pretty rare: it is interesting if a tenant @@ -731,39 +748,32 @@ impl Tenant { }, ) } else { + // SpawnMode::{Create,Eager} always cause jumping ahead of the + // concurrent_tenant_warmup queue AttachType::Normal }; - let preload_timer = TENANT.preload.start_timer(); - let preload = match mode { - SpawnMode::Create => { - // Don't count the skipped preload into the histogram of preload durations - preload_timer.stop_and_discard(); + let preload = match (&mode, &remote_storage) { + (SpawnMode::Create, _) => { None }, - SpawnMode::Normal => { - match &remote_storage { - Some(remote_storage) => Some( - match tenant_clone - .preload(remote_storage, 
task_mgr::shutdown_token()) - .instrument( - tracing::info_span!(parent: None, "attach_preload", tenant_id=%tenant_shard_id.tenant_id, shard_id=%tenant_shard_id.shard_slug()), - ) - .await { - Ok(p) => { - preload_timer.observe_duration(); - p - } - , - Err(e) => { - make_broken(&tenant_clone, anyhow::anyhow!(e)); - return Ok(()); - } - }, - ), - None => None, + (SpawnMode::Eager | SpawnMode::Lazy, Some(remote_storage)) => { + let _preload_timer = TENANT.preload.start_timer(); + let res = tenant_clone + .preload(remote_storage, task_mgr::shutdown_token()) + .await; + match res { + Ok(p) => Some(p), + Err(e) => { + make_broken(&tenant_clone, anyhow::anyhow!(e)); + return Ok(()); + } } } + (_, None) => { + let _preload_timer = TENANT.preload.start_timer(); + None + } }; // Remote preload is complete. @@ -799,36 +809,37 @@ impl Tenant { info!("ready for backgound jobs barrier"); } - match DeleteTenantFlow::resume_from_attach( + let deleted = DeleteTenantFlow::resume_from_attach( deletion, &tenant_clone, preload, tenants, &ctx, ) - .await - { - Err(err) => { - make_broken(&tenant_clone, anyhow::anyhow!(err)); - return Ok(()); - } - Ok(()) => return Ok(()), + .await; + + if let Err(e) = deleted { + make_broken(&tenant_clone, anyhow::anyhow!(e)); } + + return Ok(()); } // We will time the duration of the attach phase unless this is a creation (attach will do no work) - let attach_timer = match mode { - SpawnMode::Create => None, - SpawnMode::Normal => {Some(TENANT.attach.start_timer())} + let attached = { + let _attach_timer = match mode { + SpawnMode::Create => None, + SpawnMode::Eager | SpawnMode::Lazy => Some(TENANT.attach.start_timer()), + }; + tenant_clone.attach(preload, mode, &ctx).await }; - match tenant_clone.attach(preload, mode, &ctx).await { + + match attached { Ok(()) => { info!("attach finished, activating"); - if let Some(t)= attach_timer {t.observe_duration();} tenant_clone.activate(broker_client, None, &ctx); } Err(e) => { - if let Some(t)= attach_timer 
{t.observe_duration();} make_broken(&tenant_clone, anyhow::anyhow!(e)); } } @@ -840,35 +851,27 @@ impl Tenant { // It also prevents the warmup proccess competing with the concurrency limit on // logical size calculations: if logical size calculation semaphore is saturated, // then warmup will wait for that before proceeding to the next tenant. - if let AttachType::Warmup(_permit) = attach_type { - let mut futs = FuturesUnordered::new(); - let timelines: Vec<_> = tenant_clone.timelines.lock().unwrap().values().cloned().collect(); - for t in timelines { - futs.push(t.await_initial_logical_size()) - } + if matches!(attach_type, AttachType::Warmup { during_startup: true, .. }) { + let mut futs: FuturesUnordered<_> = tenant_clone.timelines.lock().unwrap().values().cloned().map(|t| t.await_initial_logical_size()).collect(); tracing::info!("Waiting for initial logical sizes while warming up..."); - while futs.next().await.is_some() { - - } + while futs.next().await.is_some() {} tracing::info!("Warm-up complete"); } Ok(()) } - .instrument({ - let span = tracing::info_span!(parent: None, "attach", tenant_id=%tenant_shard_id.tenant_id, shard_id=%tenant_shard_id.shard_slug()); - span.follows_from(Span::current()); - span - }), + .instrument(tracing::info_span!(parent: None, "attach", tenant_id=%tenant_shard_id.tenant_id, shard_id=%tenant_shard_id.shard_slug(), gen=?generation)), ); Ok(tenant) } + #[instrument(skip_all)] pub(crate) async fn preload( self: &Arc, remote_storage: &GenericRemoteStorage, cancel: CancellationToken, ) -> anyhow::Result { + span::debug_assert_current_span_has_tenant_id(); // Get list of remote timelines // download index files for every tenant timeline info!("listing remote timelines"); @@ -921,10 +924,8 @@ impl Tenant { deleting: false, timelines: HashMap::new(), }, - (None, SpawnMode::Normal) => { - // Deprecated dev mode: load from local disk state instead of remote storage - // https://github.com/neondatabase/neon/issues/5624 - return 
self.load_local(ctx).await; + (None, _) => { + anyhow::bail!("local-only deployment is no longer supported, https://github.com/neondatabase/neon/issues/5624"); } }; @@ -994,6 +995,7 @@ impl Tenant { TimelineResources { remote_client: Some(remote_client), deletion_queue_client: self.deletion_queue_client.clone(), + timeline_get_throttle: self.timeline_get_throttle.clone(), }, ctx, ) @@ -1020,6 +1022,7 @@ impl Tenant { Some(remote_timeline_client), self.deletion_queue_client.clone(), ) + .instrument(tracing::info_span!("timeline_delete", %timeline_id)) .await .context("resume_deletion") .map_err(LoadLocalTimelineError::ResumeDeletion)?; @@ -1146,17 +1149,6 @@ impl Tenant { None }; - // timeline loading after attach expects to find metadata file for each metadata - save_metadata( - self.conf, - &self.tenant_shard_id, - &timeline_id, - &remote_metadata, - ) - .await - .context("save_metadata") - .map_err(LoadLocalTimelineError::Load)?; - self.timeline_init_and_sync( timeline_id, resources, @@ -1174,10 +1166,6 @@ impl Tenant { tenant_shard_id: TenantShardId, reason: String, ) -> Arc { - let wal_redo_manager = Arc::new(WalRedoManager::from(PostgresRedoManager::new( - conf, - tenant_shard_id, - ))); Arc::new(Tenant::new( TenantState::Broken { reason, @@ -1188,156 +1176,13 @@ impl Tenant { // Shard identity isn't meaningful for a broken tenant: it's just a placeholder // to occupy the slot for this TenantShardId. ShardIdentity::broken(tenant_shard_id.shard_number, tenant_shard_id.shard_count), - wal_redo_manager, + None, tenant_shard_id, None, DeletionQueueClient::broken(), )) } - fn scan_and_sort_timelines_dir(self: Arc) -> anyhow::Result { - let mut timelines_to_load: HashMap = HashMap::new(); - // Note timelines_to_resume_deletion needs to be separate because it can be not sortable - // from the point of `tree_sort_timelines`. 
I e some parents can be missing because deletion - // completed in non topological order (for example because parent has smaller number of layer files in it) - let mut timelines_to_resume_deletion: Vec<(TimelineId, Option)> = vec![]; - - let timelines_dir = self.conf.timelines_path(&self.tenant_shard_id); - - for entry in timelines_dir - .read_dir_utf8() - .context("list timelines directory for tenant")? - { - let entry = entry.context("read timeline dir entry")?; - let timeline_dir = entry.path(); - - if crate::is_temporary(timeline_dir) { - info!("Found temporary timeline directory, removing: {timeline_dir}"); - if let Err(e) = std::fs::remove_dir_all(timeline_dir) { - error!("Failed to remove temporary directory '{timeline_dir}': {e:?}"); - } - } else if is_uninit_mark(timeline_dir) { - if !timeline_dir.exists() { - warn!("Timeline dir entry become invalid: {timeline_dir}"); - continue; - } - - let timeline_uninit_mark_file = &timeline_dir; - info!( - "Found an uninit mark file {timeline_uninit_mark_file}, removing the timeline and its uninit mark", - ); - let timeline_id = - TimelineId::try_from(timeline_uninit_mark_file.file_stem()) - .with_context(|| { - format!( - "Could not parse timeline id out of the timeline uninit mark name {timeline_uninit_mark_file}", - ) - })?; - let timeline_dir = self.conf.timeline_path(&self.tenant_shard_id, &timeline_id); - if let Err(e) = - remove_timeline_and_uninit_mark(&timeline_dir, timeline_uninit_mark_file) - { - error!("Failed to clean up uninit marked timeline: {e:?}"); - } - } else if crate::is_delete_mark(timeline_dir) { - // If metadata exists, load as usual, continue deletion - let timeline_id = TimelineId::try_from(timeline_dir.file_stem()) - .with_context(|| { - format!( - "Could not parse timeline id out of the timeline uninit mark name {timeline_dir}", - ) - })?; - - info!("Found deletion mark for timeline {}", timeline_id); - - match load_metadata(self.conf, &self.tenant_shard_id, &timeline_id) { - Ok(metadata) 
=> { - timelines_to_resume_deletion.push((timeline_id, Some(metadata))) - } - Err(e) => match &e { - LoadMetadataError::Read(r) => { - if r.kind() != io::ErrorKind::NotFound { - return Err(anyhow::anyhow!(e)).with_context(|| { - format!("Failed to load metadata for timeline_id {timeline_id}") - }); - } - - // If metadata doesnt exist it means that we've crashed without - // completing cleanup_remaining_timeline_fs_traces in DeleteTimelineFlow. - // So save timeline_id for later call to `DeleteTimelineFlow::cleanup_remaining_timeline_fs_traces`. - // We cant do it here because the method is async so we'd need block_on - // and here we're in spawn_blocking. cleanup_remaining_timeline_fs_traces uses fs operations - // so that basically results in a cycle: - // spawn_blocking - // - block_on - // - spawn_blocking - // which can lead to running out of threads in blocing pool. - timelines_to_resume_deletion.push((timeline_id, None)); - } - _ => { - return Err(anyhow::anyhow!(e)).with_context(|| { - format!("Failed to load metadata for timeline_id {timeline_id}") - }) - } - }, - } - } else { - if !timeline_dir.exists() { - warn!("Timeline dir entry become invalid: {timeline_dir}"); - continue; - } - let timeline_id = TimelineId::try_from(timeline_dir.file_name()) - .with_context(|| { - format!( - "Could not parse timeline id out of the timeline dir name {timeline_dir}", - ) - })?; - let timeline_uninit_mark_file = self - .conf - .timeline_uninit_mark_file_path(self.tenant_shard_id, timeline_id); - if timeline_uninit_mark_file.exists() { - info!( - %timeline_id, - "Found an uninit mark file, removing the timeline and its uninit mark", - ); - if let Err(e) = - remove_timeline_and_uninit_mark(timeline_dir, &timeline_uninit_mark_file) - { - error!("Failed to clean up uninit marked timeline: {e:?}"); - } - continue; - } - - let timeline_delete_mark_file = self - .conf - .timeline_delete_mark_file_path(self.tenant_shard_id, timeline_id); - if timeline_delete_mark_file.exists() 
{ - // Cleanup should be done in `is_delete_mark` branch above - continue; - } - - let file_name = entry.file_name(); - if let Ok(timeline_id) = file_name.parse::() { - let metadata = load_metadata(self.conf, &self.tenant_shard_id, &timeline_id) - .context("failed to load metadata")?; - timelines_to_load.insert(timeline_id, metadata); - } else { - // A file or directory that doesn't look like a timeline ID - warn!("unexpected file or directory in timelines directory: {file_name}"); - } - } - } - - // Sort the array of timeline IDs into tree-order, so that parent comes before - // all its children. - tree_sort_timelines(timelines_to_load, |m| m.ancestor_timeline()).map(|sorted_timelines| { - TenantDirectoryScan { - sorted_timelines_to_load: sorted_timelines, - timelines_to_resume_deletion, - } - }) - } - async fn load_timeline_metadata( self: &Arc, timeline_ids: HashSet, @@ -1359,7 +1204,7 @@ impl Tenant { async move { debug!("starting index part download"); - let index_part = client.download_index_file(cancel_clone).await; + let index_part = client.download_index_file(&cancel_clone).await; debug!("finished index part download"); @@ -1401,141 +1246,6 @@ impl Tenant { Ok(timeline_preloads) } - /// - /// Background task to load in-memory data structures for this tenant, from - /// files on disk. Used at pageserver startup. - /// - /// No background tasks are started as part of this routine. - async fn load_local(self: &Arc, ctx: &RequestContext) -> anyhow::Result<()> { - span::debug_assert_current_span_has_tenant_id(); - - debug!("loading tenant task"); - - // Load in-memory state to reflect the local files on disk - // - // Scan the directory, peek into the metadata file of each timeline, and - // collect a list of timelines and their ancestors. 
- let span = info_span!("blocking"); - let cloned = Arc::clone(self); - - let scan = tokio::task::spawn_blocking(move || { - let _g = span.entered(); - cloned.scan_and_sort_timelines_dir() - }) - .await - .context("load spawn_blocking") - .and_then(|res| res)?; - - // FIXME original collect_timeline_files contained one more check: - // 1. "Timeline has no ancestor and no layer files" - - // Process loadable timelines first - for (timeline_id, local_metadata) in scan.sorted_timelines_to_load { - if let Err(e) = self - .load_local_timeline(timeline_id, local_metadata, ctx, false) - .await - { - match e { - LoadLocalTimelineError::Load(source) => { - return Err(anyhow::anyhow!(source)).with_context(|| { - format!("Failed to load local timeline: {timeline_id}") - }) - } - LoadLocalTimelineError::ResumeDeletion(source) => { - // Make sure resumed deletion wont fail loading for entire tenant. - error!("Failed to resume timeline deletion: {source:#}") - } - } - } - } - - // Resume deletion ones with deleted_mark - for (timeline_id, maybe_local_metadata) in scan.timelines_to_resume_deletion { - match maybe_local_metadata { - None => { - // See comment in `scan_and_sort_timelines_dir`. - if let Err(e) = - DeleteTimelineFlow::cleanup_remaining_timeline_fs_traces(self, timeline_id) - .await - { - warn!( - "cannot clean up deleted timeline dir timeline_id: {} error: {:#}", - timeline_id, e - ); - } - } - Some(local_metadata) => { - if let Err(e) = self - .load_local_timeline(timeline_id, local_metadata, ctx, true) - .await - { - match e { - LoadLocalTimelineError::Load(source) => { - // We tried to load deleted timeline, this is a bug. - return Err(anyhow::anyhow!(source).context( - format!("This is a bug. We tried to load deleted timeline which is wrong and loading failed. Timeline: {timeline_id}") - )); - } - LoadLocalTimelineError::ResumeDeletion(source) => { - // Make sure resumed deletion wont fail loading for entire tenant. 
- error!("Failed to resume timeline deletion: {source:#}") - } - } - } - } - } - } - - trace!("Done"); - - Ok(()) - } - - /// Subroutine of `load_tenant`, to load an individual timeline - /// - /// NB: The parent is assumed to be already loaded! - #[instrument(skip(self, local_metadata, ctx))] - async fn load_local_timeline( - self: &Arc, - timeline_id: TimelineId, - local_metadata: TimelineMetadata, - ctx: &RequestContext, - found_delete_mark: bool, - ) -> Result<(), LoadLocalTimelineError> { - span::debug_assert_current_span_has_tenant_id(); - - let resources = self.build_timeline_resources(timeline_id); - - if found_delete_mark { - // There is no remote client, we found local metadata. - // Continue cleaning up local disk. - DeleteTimelineFlow::resume_deletion( - Arc::clone(self), - timeline_id, - &local_metadata, - None, - self.deletion_queue_client.clone(), - ) - .await - .context("resume deletion") - .map_err(LoadLocalTimelineError::ResumeDeletion)?; - return Ok(()); - } - - let ancestor = if let Some(ancestor_timeline_id) = local_metadata.ancestor_timeline() { - let ancestor_timeline = self.get_timeline(ancestor_timeline_id, false) - .with_context(|| anyhow::anyhow!("cannot find ancestor timeline {ancestor_timeline_id} for timeline {timeline_id}")) - .map_err(LoadLocalTimelineError::Load)?; - Some(ancestor_timeline) - } else { - None - }; - - self.timeline_init_and_sync(timeline_id, resources, None, local_metadata, ancestor, ctx) - .await - .map_err(LoadLocalTimelineError::Load) - } - pub(crate) fn tenant_shard_id(&self) -> TenantShardId { self.tenant_shard_id } @@ -1956,6 +1666,10 @@ impl Tenant { self.generation } + pub(crate) fn wal_redo_manager_status(&self) -> Option { + self.walredo_mgr.as_ref().and_then(|mgr| mgr.status()) + } + /// Changes tenant status to active, unless shutdown was already requested. 
/// /// `background_jobs_can_start` is an optional barrier set to a value during pageserver startup @@ -2093,7 +1807,10 @@ impl Tenant { let timelines = self.timelines.lock().unwrap(); timelines.values().for_each(|timeline| { let timeline = Arc::clone(timeline); - let span = Span::current(); + let timeline_id = timeline.timeline_id; + + let span = + tracing::info_span!("timeline_shutdown", %timeline_id, ?freeze_and_flush); js.spawn(async move { if freeze_and_flush { timeline.flush_and_shutdown().instrument(span).await @@ -2129,6 +1846,8 @@ impl Tenant { // Wait for any in-flight operations to complete self.gate.close().await; + remove_tenant_metrics(&self.tenant_shard_id); + Ok(()) } @@ -2337,12 +2056,7 @@ impl Tenant { } pub(crate) fn get_attach_mode(&self) -> AttachmentMode { - self.tenant_conf - .read() - .unwrap() - .location - .attach_mode - .clone() + self.tenant_conf.read().unwrap().location.attach_mode } /// For API access: generate a LocationConfig equivalent to the one that would be used to @@ -2358,14 +2072,14 @@ impl Tenant { }; // We have a pageserver TenantConf, we need the API-facing TenantConfig. 
- let tenant_config: models::TenantConfig = conf.tenant_conf.into(); + let tenant_config: models::TenantConfig = conf.tenant_conf.clone().into(); models::LocationConfig { mode: location_config_mode, generation: self.generation.into(), secondary_conf: None, shard_number: self.shard_identity.number.0, - shard_count: self.shard_identity.count.0, + shard_count: self.shard_identity.count.literal(), shard_stripe_size: self.shard_identity.stripe_size.0, tenant_conf: tenant_config, } @@ -2375,9 +2089,74 @@ impl Tenant { &self.tenant_shard_id } + pub(crate) fn get_shard_stripe_size(&self) -> ShardStripeSize { + self.shard_identity.stripe_size + } + pub(crate) fn get_generation(&self) -> Generation { self.generation } + + /// This function partially shuts down the tenant (it shuts down the Timelines) and is fallible, + /// and can leave the tenant in a bad state if it fails. The caller is responsible for + /// resetting this tenant to a valid state if we fail. + pub(crate) async fn split_prepare( + &self, + child_shards: &Vec, + ) -> anyhow::Result<()> { + let timelines = self.timelines.lock().unwrap().clone(); + for timeline in timelines.values() { + let Some(tl_client) = &timeline.remote_client else { + anyhow::bail!("Remote storage is mandatory"); + }; + + let Some(remote_storage) = &self.remote_storage else { + anyhow::bail!("Remote storage is mandatory"); + }; + + // We do not block timeline creation/deletion during splits inside the pageserver: it is up to higher levels + // to ensure that they do not start a split if currently in the process of doing these. + + // Upload an index from the parent: this is partly to provide freshness for the + // child tenants that will copy it, and partly for general ease-of-debugging: there will + // always be a parent shard index in the same generation as we wrote the child shard index. 
+ tl_client.schedule_index_upload_for_file_changes()?; + tl_client.wait_completion().await?; + + // Shut down the timeline's remote client: this means that the indices we write + // for child shards will not be invalidated by the parent shard deleting layers. + tl_client.shutdown().await?; + + // Download methods can still be used after shutdown, as they don't flow through the remote client's + // queue. In principal the RemoteTimelineClient could provide this without downloading it, but this + // operation is rare, so it's simpler to just download it (and robustly guarantees that the index + // we use here really is the remotely persistent one). + let result = tl_client + .download_index_file(&self.cancel) + .instrument(info_span!("download_index_file", tenant_id=%self.tenant_shard_id.tenant_id, shard_id=%self.tenant_shard_id.shard_slug(), timeline_id=%timeline.timeline_id)) + .await?; + let index_part = match result { + MaybeDeletedIndexPart::Deleted(_) => { + anyhow::bail!("Timeline deletion happened concurrently with split") + } + MaybeDeletedIndexPart::IndexPart(p) => p, + }; + + for child_shard in child_shards { + upload_index_part( + remote_storage, + child_shard, + &timeline.timeline_id, + self.generation, + &index_part, + &self.cancel, + ) + .await?; + } + } + + Ok(()) + } } /// Given a Vec of timelines and their ancestors (timeline_id, ancestor_id), @@ -2431,93 +2210,93 @@ where impl Tenant { pub fn tenant_specific_overrides(&self) -> TenantConfOpt { - self.tenant_conf.read().unwrap().tenant_conf + self.tenant_conf.read().unwrap().tenant_conf.clone() } pub fn effective_config(&self) -> TenantConf { self.tenant_specific_overrides() - .merge(self.conf.default_tenant_conf) + .merge(self.conf.default_tenant_conf.clone()) } pub fn get_checkpoint_distance(&self) -> u64 { - let tenant_conf = self.tenant_conf.read().unwrap().tenant_conf; + let tenant_conf = self.tenant_conf.read().unwrap().tenant_conf.clone(); tenant_conf .checkpoint_distance 
.unwrap_or(self.conf.default_tenant_conf.checkpoint_distance) } pub fn get_checkpoint_timeout(&self) -> Duration { - let tenant_conf = self.tenant_conf.read().unwrap().tenant_conf; + let tenant_conf = self.tenant_conf.read().unwrap().tenant_conf.clone(); tenant_conf .checkpoint_timeout .unwrap_or(self.conf.default_tenant_conf.checkpoint_timeout) } pub fn get_compaction_target_size(&self) -> u64 { - let tenant_conf = self.tenant_conf.read().unwrap().tenant_conf; + let tenant_conf = self.tenant_conf.read().unwrap().tenant_conf.clone(); tenant_conf .compaction_target_size .unwrap_or(self.conf.default_tenant_conf.compaction_target_size) } pub fn get_compaction_period(&self) -> Duration { - let tenant_conf = self.tenant_conf.read().unwrap().tenant_conf; + let tenant_conf = self.tenant_conf.read().unwrap().tenant_conf.clone(); tenant_conf .compaction_period .unwrap_or(self.conf.default_tenant_conf.compaction_period) } pub fn get_compaction_threshold(&self) -> usize { - let tenant_conf = self.tenant_conf.read().unwrap().tenant_conf; + let tenant_conf = self.tenant_conf.read().unwrap().tenant_conf.clone(); tenant_conf .compaction_threshold .unwrap_or(self.conf.default_tenant_conf.compaction_threshold) } pub fn get_gc_horizon(&self) -> u64 { - let tenant_conf = self.tenant_conf.read().unwrap().tenant_conf; + let tenant_conf = self.tenant_conf.read().unwrap().tenant_conf.clone(); tenant_conf .gc_horizon .unwrap_or(self.conf.default_tenant_conf.gc_horizon) } pub fn get_gc_period(&self) -> Duration { - let tenant_conf = self.tenant_conf.read().unwrap().tenant_conf; + let tenant_conf = self.tenant_conf.read().unwrap().tenant_conf.clone(); tenant_conf .gc_period .unwrap_or(self.conf.default_tenant_conf.gc_period) } pub fn get_image_creation_threshold(&self) -> usize { - let tenant_conf = self.tenant_conf.read().unwrap().tenant_conf; + let tenant_conf = self.tenant_conf.read().unwrap().tenant_conf.clone(); tenant_conf .image_creation_threshold 
.unwrap_or(self.conf.default_tenant_conf.image_creation_threshold) } pub fn get_pitr_interval(&self) -> Duration { - let tenant_conf = self.tenant_conf.read().unwrap().tenant_conf; + let tenant_conf = self.tenant_conf.read().unwrap().tenant_conf.clone(); tenant_conf .pitr_interval .unwrap_or(self.conf.default_tenant_conf.pitr_interval) } pub fn get_trace_read_requests(&self) -> bool { - let tenant_conf = self.tenant_conf.read().unwrap().tenant_conf; + let tenant_conf = self.tenant_conf.read().unwrap().tenant_conf.clone(); tenant_conf .trace_read_requests .unwrap_or(self.conf.default_tenant_conf.trace_read_requests) } pub fn get_min_resident_size_override(&self) -> Option { - let tenant_conf = self.tenant_conf.read().unwrap().tenant_conf; + let tenant_conf = self.tenant_conf.read().unwrap().tenant_conf.clone(); tenant_conf .min_resident_size_override .or(self.conf.default_tenant_conf.min_resident_size_override) } pub fn get_heatmap_period(&self) -> Option { - let tenant_conf = self.tenant_conf.read().unwrap().tenant_conf; + let tenant_conf = self.tenant_conf.read().unwrap().tenant_conf.clone(); let heatmap_period = tenant_conf .heatmap_period .unwrap_or(self.conf.default_tenant_conf.heatmap_period); @@ -2530,6 +2309,7 @@ impl Tenant { pub fn set_new_tenant_config(&self, new_tenant_conf: TenantConfOpt) { self.tenant_conf.write().unwrap().tenant_conf = new_tenant_conf; + self.tenant_conf_updated(); // Don't hold self.timelines.lock() during the notifies. // There's no risk of deadlock right now, but there could be if we consolidate // mutexes in struct Timeline in the future. @@ -2541,6 +2321,7 @@ impl Tenant { pub(crate) fn set_new_location_config(&self, new_conf: AttachedTenantConf) { *self.tenant_conf.write().unwrap() = new_conf; + self.tenant_conf_updated(); // Don't hold self.timelines.lock() during the notifies. // There's no risk of deadlock right now, but there could be if we consolidate // mutexes in struct Timeline in the future. 
@@ -2550,6 +2331,24 @@ impl Tenant { } } + fn get_timeline_get_throttle_config( + psconf: &'static PageServerConf, + overrides: &TenantConfOpt, + ) -> throttle::Config { + overrides + .timeline_get_throttle + .clone() + .unwrap_or(psconf.default_tenant_conf.timeline_get_throttle.clone()) + } + + pub(crate) fn tenant_conf_updated(&self) { + let conf = { + let guard = self.tenant_conf.read().unwrap(); + Self::get_timeline_get_throttle_config(self.conf, &guard.tenant_conf) + }; + self.timeline_get_throttle.reconfigure(conf) + } + /// Helper function to create a new Timeline struct. /// /// The returned Timeline is in Loading state. The caller is responsible for @@ -2590,7 +2389,7 @@ impl Tenant { self.tenant_shard_id, self.generation, self.shard_identity, - Arc::clone(&self.walredo_mgr), + self.walredo_mgr.clone(), resources, pg_version, state, @@ -2608,7 +2407,7 @@ impl Tenant { conf: &'static PageServerConf, attached_conf: AttachedTenantConf, shard_identity: ShardIdentity, - walredo_mgr: Arc, + walredo_mgr: Option>, tenant_shard_id: TenantShardId, remote_storage: Option, deletion_queue_client: DeletionQueueClient, @@ -2616,9 +2415,16 @@ impl Tenant { let (state, mut rx) = watch::channel(state); tokio::spawn(async move { - // Strings for metric labels + // reflect tenant state in metrics: + // - global per tenant state: TENANT_STATE_METRIC + // - "set" of broken tenants: BROKEN_TENANTS_SET + // + // set of broken tenants should not have zero counts so that it remains accessible for + // alerting. + let tid = tenant_shard_id.to_string(); - let shard_id_str = format!("{}", tenant_shard_id.shard_slug()); + let shard_id = tenant_shard_id.shard_slug().to_string(); + let set_key = &[tid.as_str(), shard_id.as_str()][..]; fn inspect_state(state: &TenantState) -> ([&'static str; 1], bool) { ([state.into()], matches!(state, TenantState::Broken { .. 
})) @@ -2627,21 +2433,13 @@ impl Tenant { let mut tuple = inspect_state(&rx.borrow_and_update()); let is_broken = tuple.1; - let mut counted_broken = if !is_broken { - // the tenant might be ignored and reloaded, so first remove any previous set - // element. it most likely has already been scraped, as these are manual operations - // right now. most likely we will add it back very soon. - drop( - crate::metrics::BROKEN_TENANTS_SET.remove_label_values(&[&tid, &shard_id_str]), - ); - false - } else { + let mut counted_broken = if is_broken { // add the id to the set right away, there should not be any updates on the channel - // after - crate::metrics::BROKEN_TENANTS_SET - .with_label_values(&[&tid, &shard_id_str]) - .set(1); + // after before tenant is removed, if ever + BROKEN_TENANTS_SET.with_label_values(set_key).set(1); true + } else { + false }; loop { @@ -2650,10 +2448,9 @@ impl Tenant { current.inc(); if rx.changed().await.is_err() { - // tenant has been dropped; decrement the counter because a tenant with that - // state is no longer in tenant map, but allow any broken set item to exist - // still. + // tenant has been dropped current.dec(); + drop(BROKEN_TENANTS_SET.remove_label_values(set_key)); break; } @@ -2663,10 +2460,9 @@ impl Tenant { let is_broken = tuple.1; if is_broken && !counted_broken { counted_broken = true; - // insert the tenant_id (back) into the set - crate::metrics::BROKEN_TENANTS_SET - .with_label_values(&[&tid, &shard_id_str]) - .inc(); + // insert the tenant_id (back) into the set while avoiding needless counter + // access + BROKEN_TENANTS_SET.with_label_values(set_key).set(1); } } }); @@ -2679,7 +2475,6 @@ impl Tenant { // using now here is good enough approximation to catch tenants with really long // activation times. 
constructed_at: Instant::now(), - tenant_conf: Arc::new(RwLock::new(attached_conf)), timelines: Mutex::new(HashMap::new()), timelines_creating: Mutex::new(HashSet::new()), gc_cs: tokio::sync::Mutex::new(()), @@ -2693,7 +2488,12 @@ impl Tenant { activate_now_sem: tokio::sync::Semaphore::new(0), delete_progress: Arc::new(tokio::sync::Mutex::new(DeleteTenantFlow::default())), cancel: CancellationToken::default(), - gate: Gate::new(format!("Tenant<{tenant_shard_id}>")), + gate: Gate::default(), + timeline_get_throttle: Arc::new(throttle::Throttle::new( + Tenant::get_timeline_get_throttle_config(conf, &attached_conf.tenant_conf), + &crate::metrics::tenant_throttling::TIMELINE_GET, + )), + tenant_conf: Arc::new(RwLock::new(attached_conf)), } } @@ -2781,19 +2581,24 @@ impl Tenant { legacy_config_path: &Utf8Path, location_conf: &LocationConf, ) -> anyhow::Result<()> { - // Forward compat: write out an old-style configuration that old versions can read, in case we roll back - Self::persist_tenant_config_legacy( - tenant_shard_id, - legacy_config_path, - &location_conf.tenant_conf, - ) - .await?; - if let LocationMode::Attached(attach_conf) = &location_conf.mode { - // Once we use LocationMode, generations are mandatory. If we aren't using generations, - // then drop out after writing legacy-style config. + // The modern-style LocationConf config file requires a generation to be set. In case someone + // is running a pageserver without the infrastructure to set generations, write out the legacy-style + // config file that only contains TenantConf. 
+ // + // This will eventually be removed in https://github.com/neondatabase/neon/issues/5388 + if attach_conf.generation.is_none() { - tracing::debug!("Running without generations, not writing new-style LocationConf"); + tracing::info!( + "Running without generations, writing legacy-style tenant config file" + ); + Self::persist_tenant_config_legacy( + tenant_shard_id, + legacy_config_path, + &location_conf.tenant_conf, + ) + .await?; + return Ok(()); } } @@ -2816,17 +2621,10 @@ impl Tenant { let tenant_shard_id = *tenant_shard_id; let config_path = config_path.to_owned(); - tokio::task::spawn_blocking(move || { - Handle::current().block_on(async move { - let conf_content = conf_content.as_bytes(); - VirtualFile::crashsafe_overwrite(&config_path, &temp_path, conf_content) - .await - .with_context(|| { - format!("write tenant {tenant_shard_id} config to {config_path}") - }) - }) - }) - .await??; + let conf_content = conf_content.into_bytes(); + VirtualFile::crashsafe_overwrite(config_path.clone(), temp_path, conf_content) + .await + .with_context(|| format!("write tenant {tenant_shard_id} config to {config_path}"))?; Ok(()) } @@ -2853,17 +2651,12 @@ impl Tenant { let tenant_shard_id = *tenant_shard_id; let target_config_path = target_config_path.to_owned(); - tokio::task::spawn_blocking(move || { - Handle::current().block_on(async move { - let conf_content = conf_content.as_bytes(); - VirtualFile::crashsafe_overwrite(&target_config_path, &temp_path, conf_content) - .await - .with_context(|| { - format!("write tenant {tenant_shard_id} config to {target_config_path}") - }) - }) - }) - .await??; + let conf_content = conf_content.into_bytes(); + VirtualFile::crashsafe_overwrite(target_config_path.clone(), temp_path, conf_content) + .await + .with_context(|| { + format!("write tenant {tenant_shard_id} config to {target_config_path}") + })?; Ok(()) } @@ -3208,14 +3001,12 @@ impl Tenant { .context("branch initial metadata upload")?; } - info!("branched timeline {dst_id} 
from {src_id} at {start_lsn}"); - Ok(new_timeline) } /// For unit tests, make this visible so that other modules can directly create timelines #[cfg(test)] - #[tracing::instrument(fields(tenant_id=%self.tenant_shard_id.tenant_id, shard_id=%self.tenant_shard_id.shard_slug(), %timeline_id))] + #[tracing::instrument(skip_all, fields(tenant_id=%self.tenant_shard_id.tenant_id, shard_id=%self.tenant_shard_id.shard_slug(), %timeline_id))] pub(crate) async fn bootstrap_timeline_test( &self, timeline_id: TimelineId, @@ -3276,11 +3067,11 @@ impl Tenant { 3, u32::MAX, "persist_initdb_tar_zst", - backoff::Cancel::new(self.cancel.clone(), || anyhow::anyhow!("Cancelled")), + &self.cancel, ) - .await?; - - Ok(()) + .await + .ok_or_else(|| anyhow::Error::new(TimeoutOrCancel::Cancel)) + .and_then(|x| x) } /// - run initdb to init temporary instance and get bootstrap data @@ -3329,8 +3120,10 @@ impl Tenant { ); let dest_path = &remote_initdb_archive_path(&self.tenant_shard_id.tenant_id, &timeline_id); + + // if this fails, it will get retried by retried control plane requests storage - .copy_object(source_path, dest_path) + .copy_object(source_path, dest_path, &self.cancel) .await .context("copy initdb tar")?; } @@ -3427,12 +3220,6 @@ impl Tenant { // All done! 
let timeline = raw_timeline.finish_creation()?; - info!( - "created root timeline {} timeline.lsn {}", - timeline_id, - timeline.get_last_record_lsn() - ); - Ok(timeline) } @@ -3455,6 +3242,7 @@ impl Tenant { TimelineResources { remote_client, deletion_queue_client: self.deletion_queue_client.clone(), + timeline_get_throttle: self.timeline_get_throttle.clone(), } } @@ -3491,10 +3279,7 @@ impl Tenant { timeline_struct.init_empty_layer_map(start_lsn); - if let Err(e) = self - .create_timeline_files(&uninit_mark.timeline_path, &new_timeline_id, new_metadata) - .await - { + if let Err(e) = self.create_timeline_files(&uninit_mark.timeline_path).await { error!("Failed to create initial files for timeline {tenant_shard_id}/{new_timeline_id}, cleaning up: {e:?}"); cleanup_timeline_directory(uninit_mark); return Err(e); @@ -3511,26 +3296,13 @@ impl Tenant { )) } - async fn create_timeline_files( - &self, - timeline_path: &Utf8Path, - new_timeline_id: &TimelineId, - new_metadata: &TimelineMetadata, - ) -> anyhow::Result<()> { + async fn create_timeline_files(&self, timeline_path: &Utf8Path) -> anyhow::Result<()> { crashsafe::create_dir(timeline_path).context("Failed to create timeline directory")?; fail::fail_point!("after-timeline-uninit-mark-creation", |_| { anyhow::bail!("failpoint after-timeline-uninit-mark-creation"); }); - save_metadata( - self.conf, - &self.tenant_shard_id, - new_timeline_id, - new_metadata, - ) - .await - .context("Failed to create timeline metadata")?; Ok(()) } @@ -3697,9 +3469,8 @@ impl Tenant { // Run each timeline's flush in a task holding the timeline's gate: this // means that if this function's future is cancelled, the Timeline shutdown // will still wait for any I/O in here to complete. 
- let gate = match timeline.gate.enter() { - Ok(g) => g, - Err(_) => continue, + let Ok(gate) = timeline.gate.enter() else { + continue; }; let jh = tokio::task::spawn(async move { flush_timeline(gate, timeline).await }); results.push(jh); @@ -3724,29 +3495,10 @@ impl Tenant { Ok(()) } -} -fn remove_timeline_and_uninit_mark( - timeline_dir: &Utf8Path, - uninit_mark: &Utf8Path, -) -> anyhow::Result<()> { - fs::remove_dir_all(timeline_dir) - .or_else(|e| { - if e.kind() == std::io::ErrorKind::NotFound { - // we can leave the uninit mark without a timeline dir, - // just remove the mark then - Ok(()) - } else { - Err(e) - } - }) - .with_context(|| { - format!("Failed to remove unit marked timeline directory {timeline_dir}") - })?; - fs::remove_file(uninit_mark) - .with_context(|| format!("Failed to remove timeline uninit mark file {uninit_mark}"))?; - - Ok(()) + pub(crate) fn get_tenant_conf(&self) -> TenantConfOpt { + self.tenant_conf.read().unwrap().tenant_conf.clone() + } } /// Create the cluster temporarily in 'initdbpath' directory inside the repository @@ -3778,6 +3530,11 @@ async fn run_initdb( .env_clear() .env("LD_LIBRARY_PATH", &initdb_lib_dir) .env("DYLD_LIBRARY_PATH", &initdb_lib_dir) + .stdin(std::process::Stdio::null()) + // stdout invocation produces the same output every time, we don't need it + .stdout(std::process::Stdio::null()) + // we would be interested in the stderr output, if there was any + .stderr(std::process::Stdio::piped()) .spawn()?; // Ideally we'd select here with the cancellation token, but the problem is that @@ -3802,11 +3559,6 @@ async fn run_initdb( Ok(()) } -impl Drop for Tenant { - fn drop(&mut self) { - remove_tenant_metrics(&self.tenant_shard_id); - } -} /// Dump contents of a layer file to stdout. 
pub async fn dump_layerfile_from_path( path: &Utf8Path, @@ -3841,24 +3593,18 @@ pub async fn dump_layerfile_from_path( #[cfg(test)] pub(crate) mod harness { use bytes::{Bytes, BytesMut}; - use camino::Utf8PathBuf; use once_cell::sync::OnceCell; use pageserver_api::models::ShardParameters; use pageserver_api::shard::ShardIndex; - use std::fs; - use std::sync::Arc; use utils::logging; - use utils::lsn::Lsn; use crate::deletion_queue::mock::MockDeletionQueue; - use crate::{ - config::PageServerConf, repository::Key, tenant::Tenant, walrecord::NeonWalRecord, - }; + use crate::walredo::apply_neon; + use crate::{repository::Key, walrecord::NeonWalRecord}; use super::*; - use crate::tenant::config::{TenantConf, TenantConfOpt}; use hex_literal::hex; - use utils::id::{TenantId, TimelineId}; + use utils::id::TenantId; pub const TIMELINE_ID: TimelineId = TimelineId::from_array(hex!("11223344556677881122334455667788")); @@ -3866,8 +3612,7 @@ pub(crate) mod harness { TimelineId::from_array(hex!("AA223344556677881122334455667788")); /// Convenience function to create a page image with given string as the only content - #[allow(non_snake_case)] - pub fn TEST_IMG(s: &str) -> Bytes { + pub fn test_img(s: &str) -> Bytes { let mut buf = BytesMut::new(); buf.extend_from_slice(s.as_bytes()); buf.resize(64, 0); @@ -3883,6 +3628,7 @@ pub(crate) mod harness { compaction_target_size: Some(tenant_conf.compaction_target_size), compaction_period: Some(tenant_conf.compaction_period), compaction_threshold: Some(tenant_conf.compaction_threshold), + compaction_algorithm: Some(tenant_conf.compaction_algorithm), gc_horizon: Some(tenant_conf.gc_horizon), gc_period: Some(tenant_conf.gc_period), image_creation_threshold: Some(tenant_conf.image_creation_threshold), @@ -3896,17 +3642,13 @@ pub(crate) mod harness { evictions_low_residence_duration_metric_threshold: Some( tenant_conf.evictions_low_residence_duration_metric_threshold, ), - gc_feedback: Some(tenant_conf.gc_feedback), heatmap_period: 
Some(tenant_conf.heatmap_period), + lazy_slru_download: Some(tenant_conf.lazy_slru_download), + timeline_get_throttle: Some(tenant_conf.timeline_get_throttle), } } } - enum LoadMode { - Local, - Remote, - } - pub struct TenantHarness { pub conf: &'static PageServerConf, pub tenant_conf: TenantConf, @@ -3934,7 +3676,10 @@ pub(crate) mod harness { } impl TenantHarness { - pub fn create(test_name: &'static str) -> anyhow::Result { + pub fn create_custom( + test_name: &'static str, + tenant_conf: TenantConf, + ) -> anyhow::Result { setup_logging(); let repo_dir = PageServerConf::test_repo_dir(test_name); @@ -3946,14 +3691,6 @@ pub(crate) mod harness { // OK in a test. let conf: &'static PageServerConf = Box::leak(Box::new(conf)); - // Disable automatic GC and compaction to make the unit tests more deterministic. - // The tests perform them manually if needed. - let tenant_conf = TenantConf { - gc_period: Duration::ZERO, - compaction_period: Duration::ZERO, - ..TenantConf::default() - }; - let tenant_id = TenantId::generate(); let tenant_shard_id = TenantShardId::unsharded(tenant_id); fs::create_dir_all(conf.tenant_path(&tenant_shard_id))?; @@ -3964,6 +3701,7 @@ pub(crate) mod harness { std::fs::create_dir_all(&remote_fs_dir).unwrap(); let config = RemoteStorageConfig { storage: RemoteStorageKind::LocalFs(remote_fs_dir.clone()), + timeout: RemoteStorageConfig::DEFAULT_TIMEOUT, }; let remote_storage = GenericRemoteStorage::from_config(&config).unwrap(); let deletion_queue = MockDeletionQueue::new(Some(remote_storage.clone())); @@ -3980,45 +3718,36 @@ pub(crate) mod harness { }) } - pub async fn load(&self) -> (Arc, RequestContext) { + pub fn create(test_name: &'static str) -> anyhow::Result { + // Disable automatic GC and compaction to make the unit tests more deterministic. + // The tests perform them manually if needed. 
+ let tenant_conf = TenantConf { + gc_period: Duration::ZERO, + compaction_period: Duration::ZERO, + ..TenantConf::default() + }; + + Self::create_custom(test_name, tenant_conf) + } + + pub fn span(&self) -> tracing::Span { + info_span!("TenantHarness", tenant_id=%self.tenant_shard_id.tenant_id, shard_id=%self.tenant_shard_id.shard_slug()) + } + + pub(crate) async fn load(&self) -> (Arc, RequestContext) { let ctx = RequestContext::new(TaskKind::UnitTest, DownloadBehavior::Error); ( - self.try_load(&ctx) + self.do_try_load(&ctx) .await .expect("failed to load test tenant"), ctx, ) } - fn remote_empty(&self) -> bool { - let tenant_path = self.conf.tenant_path(&self.tenant_shard_id); - let remote_tenant_dir = self - .remote_fs_dir - .join(tenant_path.strip_prefix(&self.conf.workdir).unwrap()); - if std::fs::metadata(&remote_tenant_dir).is_err() { - return true; - } - - match std::fs::read_dir(remote_tenant_dir) - .unwrap() - .flatten() - .next() - { - Some(entry) => { - tracing::debug!( - "remote_empty: not empty, found file {}", - entry.file_name().to_string_lossy(), - ); - false - } - None => true, - } - } - - async fn do_try_load( + #[instrument(skip_all, fields(tenant_id=%self.tenant_shard_id.tenant_id, shard_id=%self.tenant_shard_id.shard_slug()))] + pub(crate) async fn do_try_load( &self, ctx: &RequestContext, - mode: LoadMode, ) -> anyhow::Result> { let walredo_mgr = Arc::new(WalRedoManager::from(TestRedoManager)); @@ -4026,37 +3755,23 @@ pub(crate) mod harness { TenantState::Loading, self.conf, AttachedTenantConf::try_from(LocationConf::attached_single( - TenantConfOpt::from(self.tenant_conf), + TenantConfOpt::from(self.tenant_conf.clone()), self.generation, &ShardParameters::default(), )) .unwrap(), // This is a legacy/test code path: sharding isn't supported here. 
ShardIdentity::unsharded(), - walredo_mgr, + Some(walredo_mgr), self.tenant_shard_id, Some(self.remote_storage.clone()), self.deletion_queue.new_client(), )); - match mode { - LoadMode::Local => { - tenant - .load_local(ctx) - .instrument(info_span!("try_load", tenant_id=%self.tenant_shard_id.tenant_id, shard_id=%self.tenant_shard_id.shard_slug())) - .await?; - } - LoadMode::Remote => { - let preload = tenant - .preload(&self.remote_storage, CancellationToken::new()) - .instrument(info_span!("try_load_preload", tenant_id=%self.tenant_shard_id.tenant_id, shard_id=%self.tenant_shard_id.shard_slug())) - .await?; - tenant - .attach(Some(preload), SpawnMode::Normal, ctx) - .instrument(info_span!("try_load", tenant_id=%self.tenant_shard_id.tenant_id, shard_id=%self.tenant_shard_id.shard_slug())) - .await?; - } - } + let preload = tenant + .preload(&self.remote_storage, CancellationToken::new()) + .await?; + tenant.attach(Some(preload), SpawnMode::Eager, ctx).await?; tenant.state.send_replace(TenantState::Active); for timeline in tenant.timelines.lock().unwrap().values() { @@ -4065,27 +3780,6 @@ pub(crate) mod harness { Ok(tenant) } - /// For tests that specifically want to exercise the local load path, which does - /// not use remote storage. - pub async fn try_load_local(&self, ctx: &RequestContext) -> anyhow::Result> { - self.do_try_load(ctx, LoadMode::Local).await - } - - /// The 'load' in this function is either a local load or a normal attachment, - pub async fn try_load(&self, ctx: &RequestContext) -> anyhow::Result> { - // If we have nothing in remote storage, must use load_local instead of attach: attach - // will error out if there are no timelines. - // - // See https://github.com/neondatabase/neon/issues/5456 for how we will eliminate - // this weird state of a Tenant which exists but doesn't have any timelines. 
- let mode = match self.remote_empty() { - true => LoadMode::Local, - false => LoadMode::Remote, - }; - - self.do_try_load(ctx, mode).await - } - pub fn timeline_path(&self, timeline_id: &TimelineId) -> Utf8PathBuf { self.conf.timeline_path(&self.tenant_shard_id, timeline_id) } @@ -4106,20 +3800,33 @@ pub(crate) mod harness { records: Vec<(Lsn, NeonWalRecord)>, _pg_version: u32, ) -> anyhow::Result { - let s = format!( - "redo for {} to get to {}, with {} and {} records", - key, - lsn, - if base_img.is_some() { - "base image" - } else { - "no base image" - }, - records.len() - ); - println!("{s}"); + let records_neon = records.iter().all(|r| apply_neon::can_apply_in_neon(&r.1)); + if records_neon { + // For Neon wal records, we can decode without spawning postgres, so do so. + let base_img = base_img.expect("Neon WAL redo requires base image").1; + let mut page = BytesMut::new(); + page.extend_from_slice(&base_img); + for (_record_lsn, record) in records { + apply_neon::apply_in_neon(&record, key, &mut page)?; + } + Ok(page.freeze()) + } else { + // We never spawn a postgres walredo process in unit tests: just log what we might have done. 
+ let s = format!( + "redo for {} to get to {}, with {} and {} records", + key, + lsn, + if base_img.is_some() { + "base image" + } else { + "no base image" + }, + records.len() + ); + println!("{s}"); - Ok(TEST_IMG(&s)) + Ok(test_img(&s)) + } } } } @@ -4130,13 +3837,12 @@ mod tests { use crate::keyspace::KeySpaceAccum; use crate::repository::{Key, Value}; use crate::tenant::harness::*; + use crate::tenant::timeline::CompactFlags; use crate::DEFAULT_PG_VERSION; - use crate::METADATA_FILE_NAME; use bytes::BytesMut; use hex_literal::hex; - use once_cell::sync::Lazy; + use pageserver_api::keyspace::KeySpace; use rand::{thread_rng, Rng}; - use tokio_util::sync::CancellationToken; static TEST_KEY: Lazy = Lazy::new(|| Key::from_slice(&hex!("010000000033333333444444445500000001"))); @@ -4148,24 +3854,24 @@ mod tests { .create_test_timeline(TIMELINE_ID, Lsn(0x08), DEFAULT_PG_VERSION, &ctx) .await?; - let writer = tline.writer().await; + let mut writer = tline.writer().await; writer .put( *TEST_KEY, Lsn(0x10), - &Value::Image(TEST_IMG("foo at 0x10")), + &Value::Image(test_img("foo at 0x10")), &ctx, ) .await?; writer.finish_write(Lsn(0x10)); drop(writer); - let writer = tline.writer().await; + let mut writer = tline.writer().await; writer .put( *TEST_KEY, Lsn(0x20), - &Value::Image(TEST_IMG("foo at 0x20")), + &Value::Image(test_img("foo at 0x20")), &ctx, ) .await?; @@ -4174,15 +3880,15 @@ mod tests { assert_eq!( tline.get(*TEST_KEY, Lsn(0x10), &ctx).await?, - TEST_IMG("foo at 0x10") + test_img("foo at 0x10") ); assert_eq!( tline.get(*TEST_KEY, Lsn(0x1f), &ctx).await?, - TEST_IMG("foo at 0x10") + test_img("foo at 0x10") ); assert_eq!( tline.get(*TEST_KEY, Lsn(0x20), &ctx).await?, - TEST_IMG("foo at 0x20") + test_img("foo at 0x20") ); Ok(()) @@ -4226,7 +3932,7 @@ mod tests { let tline = tenant .create_test_timeline(TIMELINE_ID, Lsn(0x10), DEFAULT_PG_VERSION, &ctx) .await?; - let writer = tline.writer().await; + let mut writer = tline.writer().await; #[allow(non_snake_case)] 
let TEST_KEY_A: Key = Key::from_hex("110000000033333333444444445500000001").unwrap(); @@ -4260,7 +3966,7 @@ mod tests { let newtline = tenant .get_timeline(NEW_TIMELINE_ID, true) .expect("Should have a local timeline"); - let new_writer = newtline.writer().await; + let mut new_writer = newtline.writer().await; new_writer .put(TEST_KEY_A, Lsn(0x40), &test_value("bar at 0x40"), &ctx) .await?; @@ -4291,15 +3997,14 @@ mod tests { ctx: &RequestContext, ) -> anyhow::Result<()> { let mut lsn = start_lsn; - #[allow(non_snake_case)] { - let writer = tline.writer().await; + let mut writer = tline.writer().await; // Create a relation on the timeline writer .put( *TEST_KEY, lsn, - &Value::Image(TEST_IMG(&format!("foo at {}", lsn))), + &Value::Image(test_img(&format!("foo at {}", lsn))), ctx, ) .await?; @@ -4309,7 +4014,7 @@ mod tests { .put( *TEST_KEY, lsn, - &Value::Image(TEST_IMG(&format!("foo at {}", lsn))), + &Value::Image(test_img(&format!("foo at {}", lsn))), ctx, ) .await?; @@ -4318,12 +4023,12 @@ mod tests { } tline.freeze_and_flush().await?; { - let writer = tline.writer().await; + let mut writer = tline.writer().await; writer .put( *TEST_KEY, lsn, - &Value::Image(TEST_IMG(&format!("foo at {}", lsn))), + &Value::Image(test_img(&format!("foo at {}", lsn))), ctx, ) .await?; @@ -4333,7 +4038,7 @@ mod tests { .put( *TEST_KEY, lsn, - &Value::Image(TEST_IMG(&format!("foo at {}", lsn))), + &Value::Image(test_img(&format!("foo at {}", lsn))), ctx, ) .await?; @@ -4488,7 +4193,7 @@ mod tests { // Broken, as long as you don't need to access data from the parent. assert_eq!( newtline.get(*TEST_KEY, Lsn(0x70), &ctx).await?, - TEST_IMG(&format!("foo at {}", Lsn(0x70))) + test_img(&format!("foo at {}", Lsn(0x70))) ); // This needs to traverse to the parent, and fails. @@ -4565,7 +4270,7 @@ mod tests { // Check that the data is still accessible on the branch. 
assert_eq!( newtline.get(*TEST_KEY, Lsn(0x50), &ctx).await?, - TEST_IMG(&format!("foo at {}", Lsn(0x40))) + test_img(&format!("foo at {}", Lsn(0x40))) ); Ok(()) @@ -4584,7 +4289,7 @@ mod tests { // so that all uploads finish & we can call harness.load() below again tenant .shutdown(Default::default(), true) - .instrument(info_span!("test_shutdown", tenant_id=%tenant.tenant_shard_id)) + .instrument(harness.span()) .await .ok() .unwrap(); @@ -4625,7 +4330,7 @@ mod tests { // so that all uploads finish & we can call harness.load() below again tenant .shutdown(Default::default(), true) - .instrument(info_span!("test_shutdown", tenant_id=%tenant.tenant_shard_id)) + .instrument(harness.span()) .await .ok() .unwrap(); @@ -4674,60 +4379,6 @@ mod tests { Ok(()) } - #[tokio::test] - async fn corrupt_local_metadata() -> anyhow::Result<()> { - const TEST_NAME: &str = "corrupt_metadata"; - let harness = TenantHarness::create(TEST_NAME)?; - let (tenant, ctx) = harness.load().await; - - let tline = tenant - .create_test_timeline(TIMELINE_ID, Lsn(0x10), DEFAULT_PG_VERSION, &ctx) - .await?; - drop(tline); - // so that all uploads finish & we can call harness.try_load() below again - tenant - .shutdown(Default::default(), true) - .instrument(info_span!("test_shutdown", tenant_id=%tenant.tenant_shard_id)) - .await - .ok() - .unwrap(); - drop(tenant); - - // Corrupt local metadata - let metadata_path = harness.timeline_path(&TIMELINE_ID).join(METADATA_FILE_NAME); - assert!(metadata_path.is_file()); - let mut metadata_bytes = std::fs::read(&metadata_path)?; - assert_eq!(metadata_bytes.len(), 512); - metadata_bytes[8] ^= 1; - std::fs::write(metadata_path, metadata_bytes)?; - - let err = harness.try_load_local(&ctx).await.expect_err("should fail"); - // get all the stack with all .context, not only the last one - let message = format!("{err:#}"); - let expected = "failed to load metadata"; - assert!( - message.contains(expected), - "message '{message}' expected to contain {expected}" - 
); - - let mut found_error_message = false; - let mut err_source = err.source(); - while let Some(source) = err_source { - if source.to_string().contains("metadata checksum mismatch") { - found_error_message = true; - break; - } - err_source = source.source(); - } - assert!( - found_error_message, - "didn't find the corrupted metadata error in {}", - message - ); - - Ok(()) - } - #[tokio::test] async fn test_images() -> anyhow::Result<()> { let (tenant, ctx) = TenantHarness::create("test_images")?.load().await; @@ -4735,12 +4386,12 @@ mod tests { .create_test_timeline(TIMELINE_ID, Lsn(0x08), DEFAULT_PG_VERSION, &ctx) .await?; - let writer = tline.writer().await; + let mut writer = tline.writer().await; writer .put( *TEST_KEY, Lsn(0x10), - &Value::Image(TEST_IMG("foo at 0x10")), + &Value::Image(test_img("foo at 0x10")), &ctx, ) .await?; @@ -4752,12 +4403,12 @@ mod tests { .compact(&CancellationToken::new(), EnumSet::empty(), &ctx) .await?; - let writer = tline.writer().await; + let mut writer = tline.writer().await; writer .put( *TEST_KEY, Lsn(0x20), - &Value::Image(TEST_IMG("foo at 0x20")), + &Value::Image(test_img("foo at 0x20")), &ctx, ) .await?; @@ -4769,12 +4420,12 @@ mod tests { .compact(&CancellationToken::new(), EnumSet::empty(), &ctx) .await?; - let writer = tline.writer().await; + let mut writer = tline.writer().await; writer .put( *TEST_KEY, Lsn(0x30), - &Value::Image(TEST_IMG("foo at 0x30")), + &Value::Image(test_img("foo at 0x30")), &ctx, ) .await?; @@ -4786,12 +4437,12 @@ mod tests { .compact(&CancellationToken::new(), EnumSet::empty(), &ctx) .await?; - let writer = tline.writer().await; + let mut writer = tline.writer().await; writer .put( *TEST_KEY, Lsn(0x40), - &Value::Image(TEST_IMG("foo at 0x40")), + &Value::Image(test_img("foo at 0x40")), &ctx, ) .await?; @@ -4805,28 +4456,83 @@ mod tests { assert_eq!( tline.get(*TEST_KEY, Lsn(0x10), &ctx).await?, - TEST_IMG("foo at 0x10") + test_img("foo at 0x10") ); assert_eq!( tline.get(*TEST_KEY, Lsn(0x1f), 
&ctx).await?, - TEST_IMG("foo at 0x10") + test_img("foo at 0x10") ); assert_eq!( tline.get(*TEST_KEY, Lsn(0x20), &ctx).await?, - TEST_IMG("foo at 0x20") + test_img("foo at 0x20") ); assert_eq!( tline.get(*TEST_KEY, Lsn(0x30), &ctx).await?, - TEST_IMG("foo at 0x30") + test_img("foo at 0x30") ); assert_eq!( tline.get(*TEST_KEY, Lsn(0x40), &ctx).await?, - TEST_IMG("foo at 0x40") + test_img("foo at 0x40") ); Ok(()) } + async fn bulk_insert_compact_gc( + timeline: Arc, + ctx: &RequestContext, + mut lsn: Lsn, + repeat: usize, + key_count: usize, + ) -> anyhow::Result<()> { + let mut test_key = Key::from_hex("010000000033333333444444445500000000").unwrap(); + let mut blknum = 0; + + // Enforce that key range is monotonously increasing + let mut keyspace = KeySpaceAccum::new(); + + for _ in 0..repeat { + for _ in 0..key_count { + test_key.field6 = blknum; + let mut writer = timeline.writer().await; + writer + .put( + test_key, + lsn, + &Value::Image(test_img(&format!("{} at {}", blknum, lsn))), + ctx, + ) + .await?; + writer.finish_write(lsn); + drop(writer); + + keyspace.add_key(test_key); + + lsn = Lsn(lsn.0 + 0x10); + blknum += 1; + } + + let cutoff = timeline.get_last_record_lsn(); + + timeline + .update_gc_info( + Vec::new(), + cutoff, + Duration::ZERO, + &CancellationToken::new(), + ctx, + ) + .await?; + timeline.freeze_and_flush().await?; + timeline + .compact(&CancellationToken::new(), EnumSet::empty(), ctx) + .await?; + timeline.gc().await?; + } + + Ok(()) + } + // // Insert 1000 key-value pairs with increasing keys, flush, compact, GC. // Repeat 50 times. 
@@ -4839,49 +4545,234 @@ mod tests { .create_test_timeline(TIMELINE_ID, Lsn(0x08), DEFAULT_PG_VERSION, &ctx) .await?; - let mut lsn = Lsn(0x10); + let lsn = Lsn(0x10); + bulk_insert_compact_gc(tline.clone(), &ctx, lsn, 50, 10000).await?; - let mut keyspace = KeySpaceAccum::new(); + Ok(()) + } - let mut test_key = Key::from_hex("010000000033333333444444445500000000").unwrap(); - let mut blknum = 0; - for _ in 0..50 { - for _ in 0..10000 { - test_key.field6 = blknum; - let writer = tline.writer().await; - writer - .put( - test_key, - lsn, - &Value::Image(TEST_IMG(&format!("{} at {}", blknum, lsn))), - &ctx, - ) - .await?; - writer.finish_write(lsn); - drop(writer); + // Test the vectored get real implementation against a simple sequential implementation. + // + // The test generates a keyspace by repeatedly flushing the in-memory layer and compacting. + // Projected to 2D the key space looks like below. Lsn grows upwards on the Y axis and keys + // grow to the right on the X axis. + // [Delta] + // [Delta] + // [Delta] + // [Delta] + // ------------ Image --------------- + // + // After layer generation we pick the ranges to query as follows: + // 1. The beginning of each delta layer + // 2. At the seam between two adjacent delta layers + // + // There's one major downside to this test: delta layers only contains images, + // so the search can stop at the first delta layer and doesn't traverse any deeper. 
+ #[tokio::test] + async fn test_get_vectored() -> anyhow::Result<()> { + let harness = TenantHarness::create("test_get_vectored")?; + let (tenant, ctx) = harness.load().await; + let tline = tenant + .create_test_timeline(TIMELINE_ID, Lsn(0x08), DEFAULT_PG_VERSION, &ctx) + .await?; - keyspace.add_key(test_key); + let lsn = Lsn(0x10); + bulk_insert_compact_gc(tline.clone(), &ctx, lsn, 50, 10000).await?; - lsn = Lsn(lsn.0 + 0x10); - blknum += 1; + let guard = tline.layers.read().await; + guard.layer_map().dump(true, &ctx).await?; + + let mut reads = Vec::new(); + let mut prev = None; + guard.layer_map().iter_historic_layers().for_each(|desc| { + if !desc.is_delta() { + prev = Some(desc.clone()); + return; } - let cutoff = tline.get_last_record_lsn(); + let start = desc.key_range.start; + let end = desc + .key_range + .start + .add(Timeline::MAX_GET_VECTORED_KEYS.try_into().unwrap()); + reads.push(KeySpace { + ranges: vec![start..end], + }); + if let Some(prev) = &prev { + if !prev.is_delta() { + return; + } + + let first_range = Key { + field6: prev.key_range.end.field6 - 4, + ..prev.key_range.end + }..prev.key_range.end; + + let second_range = desc.key_range.start..Key { + field6: desc.key_range.start.field6 + 4, + ..desc.key_range.start + }; + + reads.push(KeySpace { + ranges: vec![first_range, second_range], + }); + }; + + prev = Some(desc.clone()); + }); + + drop(guard); + + // Pick a big LSN such that we query over all the changes. + let reads_lsn = Lsn(u64::MAX - 1); + + for read in reads { + info!("Doing vectored read on {:?}", read); + + let vectored_res = tline.get_vectored_impl(read.clone(), reads_lsn, &ctx).await; tline - .update_gc_info( - Vec::new(), - cutoff, - Duration::ZERO, - &CancellationToken::new(), + .validate_get_vectored_impl(&vectored_res, read, reads_lsn, &ctx) + .await; + } + + Ok(()) + } + + // Test that vectored get handles layer gaps correctly + // by advancing into the next ancestor timeline if required. 
+ // + // The test generates timelines that look like the diagram below. + // We leave a gap in one of the L1 layers at `gap_at_key` (`/` in the diagram). + // The reconstruct data for that key lies in the ancestor timeline (`X` in the diagram). + // + // ``` + //-------------------------------+ + // ... | + // [ L1 ] | + // [ / L1 ] | Child Timeline + // ... | + // ------------------------------+ + // [ X L1 ] | Parent Timeline + // ------------------------------+ + // ``` + #[tokio::test] + async fn test_get_vectored_key_gap() -> anyhow::Result<()> { + let tenant_conf = TenantConf { + // Make compaction deterministic + gc_period: Duration::ZERO, + compaction_period: Duration::ZERO, + // Encourage creation of L1 layers + checkpoint_distance: 16 * 1024, + compaction_target_size: 8 * 1024, + ..TenantConf::default() + }; + + let harness = TenantHarness::create_custom("test_get_vectored_key_gap", tenant_conf)?; + let (tenant, ctx) = harness.load().await; + + let mut current_key = Key::from_hex("010000000033333333444444445500000000").unwrap(); + let gap_at_key = current_key.add(100); + let mut current_lsn = Lsn(0x10); + + const KEY_COUNT: usize = 10_000; + + let timeline_id = TimelineId::generate(); + let current_timeline = tenant + .create_test_timeline(timeline_id, current_lsn, DEFAULT_PG_VERSION, &ctx) + .await?; + + current_lsn += 0x100; + + let mut writer = current_timeline.writer().await; + writer + .put( + gap_at_key, + current_lsn, + &Value::Image(test_img(&format!("{} at {}", gap_at_key, current_lsn))), + &ctx, + ) + .await?; + writer.finish_write(current_lsn); + drop(writer); + + let mut latest_lsns = HashMap::new(); + latest_lsns.insert(gap_at_key, current_lsn); + + current_timeline.freeze_and_flush().await?; + + let child_timeline_id = TimelineId::generate(); + + tenant + .branch_timeline_test( + ¤t_timeline, + child_timeline_id, + Some(current_lsn), + &ctx, + ) + .await?; + let child_timeline = tenant + .get_timeline(child_timeline_id, true) + 
.expect("Should have the branched timeline"); + + for i in 0..KEY_COUNT { + if current_key == gap_at_key { + current_key = current_key.next(); + continue; + } + + current_lsn += 0x10; + + let mut writer = child_timeline.writer().await; + writer + .put( + current_key, + current_lsn, + &Value::Image(test_img(&format!("{} at {}", current_key, current_lsn))), &ctx, ) .await?; - tline.freeze_and_flush().await?; - tline - .compact(&CancellationToken::new(), EnumSet::empty(), &ctx) - .await?; - tline.gc().await?; + writer.finish_write(current_lsn); + drop(writer); + + latest_lsns.insert(current_key, current_lsn); + current_key = current_key.next(); + + // Flush every now and then to encourage layer file creation. + if i % 500 == 0 { + child_timeline.freeze_and_flush().await?; + } + } + + child_timeline.freeze_and_flush().await?; + let mut flags = EnumSet::new(); + flags.insert(CompactFlags::ForceRepartition); + child_timeline + .compact(&CancellationToken::new(), flags, &ctx) + .await?; + + let key_near_end = { + let mut tmp = current_key; + tmp.field6 -= 10; + tmp + }; + + let key_near_gap = { + let mut tmp = gap_at_key; + tmp.field6 -= 10; + tmp + }; + + let read = KeySpace { + ranges: vec![key_near_gap..gap_at_key.next(), key_near_end..current_key], + }; + let results = child_timeline + .get_vectored_impl(read.clone(), current_lsn, &ctx) + .await?; + + for (key, img_res) in results { + let expected = test_img(&format!("{} at {}", key, latest_lsns[&key])); + assert_eq!(img_res?, expected); } Ok(()) @@ -4910,12 +4801,12 @@ mod tests { for blknum in 0..NUM_KEYS { lsn = Lsn(lsn.0 + 0x10); test_key.field6 = blknum as u32; - let writer = tline.writer().await; + let mut writer = tline.writer().await; writer .put( test_key, lsn, - &Value::Image(TEST_IMG(&format!("{} at {}", blknum, lsn))), + &Value::Image(test_img(&format!("{} at {}", blknum, lsn))), &ctx, ) .await?; @@ -4931,12 +4822,12 @@ mod tests { lsn = Lsn(lsn.0 + 0x10); let blknum = thread_rng().gen_range(0..NUM_KEYS); 
test_key.field6 = blknum as u32; - let writer = tline.writer().await; + let mut writer = tline.writer().await; writer .put( test_key, lsn, - &Value::Image(TEST_IMG(&format!("{} at {}", blknum, lsn))), + &Value::Image(test_img(&format!("{} at {}", blknum, lsn))), &ctx, ) .await?; @@ -4950,7 +4841,7 @@ mod tests { test_key.field6 = blknum as u32; assert_eq!( tline.get(test_key, lsn, &ctx).await?, - TEST_IMG(&format!("{} at {}", blknum, last_lsn)) + test_img(&format!("{} at {}", blknum, last_lsn)) ); } @@ -4999,12 +4890,12 @@ mod tests { for blknum in 0..NUM_KEYS { lsn = Lsn(lsn.0 + 0x10); test_key.field6 = blknum as u32; - let writer = tline.writer().await; + let mut writer = tline.writer().await; writer .put( test_key, lsn, - &Value::Image(TEST_IMG(&format!("{} at {}", blknum, lsn))), + &Value::Image(test_img(&format!("{} at {}", blknum, lsn))), &ctx, ) .await?; @@ -5028,12 +4919,12 @@ mod tests { lsn = Lsn(lsn.0 + 0x10); let blknum = thread_rng().gen_range(0..NUM_KEYS); test_key.field6 = blknum as u32; - let writer = tline.writer().await; + let mut writer = tline.writer().await; writer .put( test_key, lsn, - &Value::Image(TEST_IMG(&format!("{} at {}", blknum, lsn))), + &Value::Image(test_img(&format!("{} at {}", blknum, lsn))), &ctx, ) .await?; @@ -5048,7 +4939,7 @@ mod tests { test_key.field6 = blknum as u32; assert_eq!( tline.get(test_key, lsn, &ctx).await?, - TEST_IMG(&format!("{} at {}", blknum, last_lsn)) + test_img(&format!("{} at {}", blknum, last_lsn)) ); } @@ -5105,12 +4996,12 @@ mod tests { lsn = Lsn(lsn.0 + 0x10); let blknum = thread_rng().gen_range(0..NUM_KEYS); test_key.field6 = blknum as u32; - let writer = tline.writer().await; + let mut writer = tline.writer().await; writer .put( test_key, lsn, - &Value::Image(TEST_IMG(&format!("{} {} at {}", idx, blknum, lsn))), + &Value::Image(test_img(&format!("{} {} at {}", idx, blknum, lsn))), &ctx, ) .await?; @@ -5132,7 +5023,7 @@ mod tests { test_key.field6 = blknum as u32; assert_eq!( tline.get(test_key, 
*lsn, &ctx).await?, - TEST_IMG(&format!("{idx} {blknum} at {lsn}")) + test_img(&format!("{idx} {blknum} at {lsn}")) ); } } @@ -5220,7 +5111,7 @@ mod tests { let raw_tline = tline.raw_timeline().unwrap(); raw_tline .shutdown() - .instrument(info_span!("test_shutdown", tenant_id=%raw_tline.tenant_shard_id)) + .instrument(info_span!("test_shutdown", tenant_id=%raw_tline.tenant_shard_id, shard_id=%raw_tline.tenant_shard_id.shard_slug(), timeline_id=%TIMELINE_ID)) .await; std::mem::forget(tline); } @@ -5251,4 +5142,23 @@ mod tests { Ok(()) } + + #[tokio::test] + async fn test_read_at_max_lsn() -> anyhow::Result<()> { + let harness = TenantHarness::create("test_read_at_max_lsn")?; + let (tenant, ctx) = harness.load().await; + let tline = tenant + .create_test_timeline(TIMELINE_ID, Lsn(0x08), DEFAULT_PG_VERSION, &ctx) + .await?; + + let lsn = Lsn(0x10); + bulk_insert_compact_gc(tline.clone(), &ctx, lsn, 50, 10000).await?; + + let test_key = Key::from_hex("010000000033333333444444445500000000").unwrap(); + let read_lsn = Lsn(u64::MAX - 1); + + assert!(tline.get(test_key, read_lsn, &ctx).await.is_ok()); + + Ok(()) + } } diff --git a/pageserver/src/tenant/blob_io.rs b/pageserver/src/tenant/blob_io.rs index 6de2e95055..0d33100ead 100644 --- a/pageserver/src/tenant/blob_io.rs +++ b/pageserver/src/tenant/blob_io.rs @@ -11,6 +11,9 @@ //! len < 128: 0XXXXXXX //! len >= 128: 1XXXXXXX XXXXXXXX XXXXXXXX XXXXXXXX //! 
+use bytes::{BufMut, BytesMut}; +use tokio_epoll_uring::{BoundedBuf, IoBuf, Slice}; + use crate::context::RequestContext; use crate::page_cache::PAGE_SZ; use crate::tenant::block_io::BlockCursor; @@ -100,6 +103,8 @@ pub struct BlobWriter { offset: u64, /// A buffer to save on write calls, only used if BUFFERED=true buf: Vec, + /// We do tiny writes for the length headers; they need to be in an owned buffer; + io_buf: Option, } impl BlobWriter { @@ -108,6 +113,7 @@ impl BlobWriter { inner, offset: start_offset, buf: Vec::with_capacity(Self::CAPACITY), + io_buf: Some(BytesMut::new()), } } @@ -117,21 +123,31 @@ impl BlobWriter { const CAPACITY: usize = if BUFFERED { PAGE_SZ } else { 0 }; - #[inline(always)] /// Writes the given buffer directly to the underlying `VirtualFile`. /// You need to make sure that the internal buffer is empty, otherwise /// data will be written in wrong order. - async fn write_all_unbuffered(&mut self, src_buf: &[u8]) -> Result<(), Error> { - self.inner.write_all(src_buf).await?; - self.offset += src_buf.len() as u64; - Ok(()) + #[inline(always)] + async fn write_all_unbuffered, Buf: IoBuf + Send>( + &mut self, + src_buf: B, + ) -> (B::Buf, Result<(), Error>) { + let (src_buf, res) = self.inner.write_all(src_buf).await; + let nbytes = match res { + Ok(nbytes) => nbytes, + Err(e) => return (src_buf, Err(e)), + }; + self.offset += nbytes as u64; + (src_buf, Ok(())) } #[inline(always)] /// Flushes the internal buffer to the underlying `VirtualFile`. 
pub async fn flush_buffer(&mut self) -> Result<(), Error> { - self.inner.write_all(&self.buf).await?; - self.buf.clear(); + let buf = std::mem::take(&mut self.buf); + let (mut buf, res) = self.inner.write_all(buf).await; + res?; + buf.clear(); + self.buf = buf; Ok(()) } @@ -146,62 +162,97 @@ impl BlobWriter { } /// Internal, possibly buffered, write function - async fn write_all(&mut self, mut src_buf: &[u8]) -> Result<(), Error> { + async fn write_all, Buf: IoBuf + Send>( + &mut self, + src_buf: B, + ) -> (B::Buf, Result<(), Error>) { if !BUFFERED { assert!(self.buf.is_empty()); - self.write_all_unbuffered(src_buf).await?; - return Ok(()); + return self.write_all_unbuffered(src_buf).await; } let remaining = Self::CAPACITY - self.buf.len(); + let src_buf_len = src_buf.bytes_init(); + if src_buf_len == 0 { + return (Slice::into_inner(src_buf.slice_full()), Ok(())); + } + let mut src_buf = src_buf.slice(0..src_buf_len); // First try to copy as much as we can into the buffer if remaining > 0 { - let copied = self.write_into_buffer(src_buf); - src_buf = &src_buf[copied..]; + let copied = self.write_into_buffer(&src_buf); + src_buf = src_buf.slice(copied..); } // Then, if the buffer is full, flush it out if self.buf.len() == Self::CAPACITY { - self.flush_buffer().await?; + if let Err(e) = self.flush_buffer().await { + return (Slice::into_inner(src_buf), Err(e)); + } } // Finally, write the tail of src_buf: // If it wholly fits into the buffer without // completely filling it, then put it there. // If not, write it out directly. - if !src_buf.is_empty() { + let src_buf = if !src_buf.is_empty() { assert_eq!(self.buf.len(), 0); if src_buf.len() < Self::CAPACITY { - let copied = self.write_into_buffer(src_buf); + let copied = self.write_into_buffer(&src_buf); // We just verified above that src_buf fits into our internal buffer. 
assert_eq!(copied, src_buf.len()); + Slice::into_inner(src_buf) } else { - self.write_all_unbuffered(src_buf).await?; + let (src_buf, res) = self.write_all_unbuffered(src_buf).await; + if let Err(e) = res { + return (src_buf, Err(e)); + } + src_buf } - } - Ok(()) + } else { + Slice::into_inner(src_buf) + }; + (src_buf, Ok(())) } /// Write a blob of data. Returns the offset that it was written to, /// which can be used to retrieve the data later. - pub async fn write_blob(&mut self, srcbuf: &[u8]) -> Result { + pub async fn write_blob, Buf: IoBuf + Send>( + &mut self, + srcbuf: B, + ) -> (B::Buf, Result) { let offset = self.offset; - if srcbuf.len() < 128 { - // Short blob. Write a 1-byte length header - let len_buf = srcbuf.len() as u8; - self.write_all(&[len_buf]).await?; - } else { - // Write a 4-byte length header - if srcbuf.len() > 0x7fff_ffff { - return Err(Error::new( - ErrorKind::Other, - format!("blob too large ({} bytes)", srcbuf.len()), - )); + let len = srcbuf.bytes_init(); + + let mut io_buf = self.io_buf.take().expect("we always put it back below"); + io_buf.clear(); + let (io_buf, hdr_res) = async { + if len < 128 { + // Short blob. 
Write a 1-byte length header + io_buf.put_u8(len as u8); + self.write_all(io_buf).await + } else { + // Write a 4-byte length header + if len > 0x7fff_ffff { + return ( + io_buf, + Err(Error::new( + ErrorKind::Other, + format!("blob too large ({} bytes)", len), + )), + ); + } + let mut len_buf = (len as u32).to_be_bytes(); + len_buf[0] |= 0x80; + io_buf.extend_from_slice(&len_buf[..]); + self.write_all(io_buf).await } - let mut len_buf = ((srcbuf.len()) as u32).to_be_bytes(); - len_buf[0] |= 0x80; - self.write_all(&len_buf).await?; } - self.write_all(srcbuf).await?; - Ok(offset) + .await; + self.io_buf = Some(io_buf); + match hdr_res { + Ok(_) => (), + Err(e) => return (Slice::into_inner(srcbuf.slice(..)), Err(e)), + } + let (srcbuf, res) = self.write_all(srcbuf).await; + (srcbuf, res.map(|_| offset)) } } @@ -248,12 +299,14 @@ mod tests { let file = VirtualFile::create(pathbuf.as_path()).await?; let mut wtr = BlobWriter::::new(file, 0); for blob in blobs.iter() { - let offs = wtr.write_blob(blob).await?; + let (_, res) = wtr.write_blob(blob.clone()).await; + let offs = res?; offsets.push(offs); } // Write out one page worth of zeros so that we can // read again with read_blk - let offs = wtr.write_blob(&vec![0; PAGE_SZ]).await?; + let (_, res) = wtr.write_blob(vec![0; PAGE_SZ]).await; + let offs = res?; println!("Writing final blob at offs={offs}"); wtr.flush_buffer().await?; } diff --git a/pageserver/src/tenant/block_io.rs b/pageserver/src/tenant/block_io.rs index 1b6bccc120..37c84be342 100644 --- a/pageserver/src/tenant/block_io.rs +++ b/pageserver/src/tenant/block_io.rs @@ -5,7 +5,7 @@ use super::ephemeral_file::EphemeralFile; use super::storage_layer::delta_layer::{Adapter, DeltaLayerInner}; use crate::context::RequestContext; -use crate::page_cache::{self, PageReadGuard, PageWriteGuard, ReadBufResult, PAGE_SZ}; +use crate::page_cache::{self, FileId, PageReadGuard, PageWriteGuard, ReadBufResult, PAGE_SZ}; use crate::virtual_file::VirtualFile; use bytes::Bytes; 
use std::ops::Deref; @@ -78,7 +78,7 @@ impl<'a> Deref for BlockLease<'a> { /// /// Unlike traits, we also support the read function to be async though. pub(crate) enum BlockReaderRef<'a> { - FileBlockReader(&'a FileBlockReader), + FileBlockReader(&'a FileBlockReader<'a>), EphemeralFile(&'a EphemeralFile), Adapter(Adapter<&'a DeltaLayerInner>), #[cfg(test)] @@ -160,17 +160,15 @@ impl<'a> BlockCursor<'a> { /// /// The file is assumed to be immutable. This doesn't provide any functions /// for modifying the file, nor for invalidating the cache if it is modified. -pub struct FileBlockReader { - pub file: VirtualFile, +pub struct FileBlockReader<'a> { + pub file: &'a VirtualFile, /// Unique ID of this file, used as key in the page cache. file_id: page_cache::FileId, } -impl FileBlockReader { - pub fn new(file: VirtualFile) -> Self { - let file_id = page_cache::next_file_id(); - +impl<'a> FileBlockReader<'a> { + pub fn new(file: &'a VirtualFile, file_id: FileId) -> Self { FileBlockReader { file_id, file } } @@ -190,11 +188,11 @@ impl FileBlockReader { /// Returns a "lease" object that can be used to /// access to the contents of the page. (For the page cache, the /// lease object represents a lock on the buffer.) - pub async fn read_blk( + pub async fn read_blk<'b>( &self, blknum: u32, ctx: &RequestContext, - ) -> Result { + ) -> Result, std::io::Error> { let cache = page_cache::get(); match cache .read_immutable_buf(self.file_id, blknum, ctx) @@ -215,7 +213,7 @@ impl FileBlockReader { } } -impl BlockReader for FileBlockReader { +impl BlockReader for FileBlockReader<'_> { fn block_cursor(&self) -> BlockCursor<'_> { BlockCursor::new(BlockReaderRef::FileBlockReader(self)) } diff --git a/pageserver/src/tenant/config.rs b/pageserver/src/tenant/config.rs index c44164c12d..57fc444cdd 100644 --- a/pageserver/src/tenant/config.rs +++ b/pageserver/src/tenant/config.rs @@ -9,8 +9,9 @@ //! may lead to a data loss. //! 
use anyhow::bail; -use pageserver_api::models; +use pageserver_api::models::CompactionAlgorithm; use pageserver_api::models::EvictionPolicy; +use pageserver_api::models::{self, ThrottleConfig}; use pageserver_api::shard::{ShardCount, ShardIdentity, ShardNumber, ShardStripeSize}; use serde::de::IntoDeserializer; use serde::{Deserialize, Serialize}; @@ -20,6 +21,7 @@ use std::time::Duration; use utils::generation::Generation; pub mod defaults { + // FIXME: This current value is very low. I would imagine something like 1 GB or 10 GB // would be more appropriate. But a low value forces the code to be exercised more, // which is good for now to trigger bugs. @@ -27,12 +29,17 @@ pub mod defaults { pub const DEFAULT_CHECKPOINT_DISTANCE: u64 = 256 * 1024 * 1024; pub const DEFAULT_CHECKPOINT_TIMEOUT: &str = "10 m"; + // FIXME the below configs are only used by legacy algorithm. The new algorithm + // has different parameters. + // Target file size, when creating image and delta layers. // This parameter determines L1 layer file size. pub const DEFAULT_COMPACTION_TARGET_SIZE: u64 = 128 * 1024 * 1024; pub const DEFAULT_COMPACTION_PERIOD: &str = "20 s"; pub const DEFAULT_COMPACTION_THRESHOLD: usize = 10; + pub const DEFAULT_COMPACTION_ALGORITHM: super::CompactionAlgorithm = + super::CompactionAlgorithm::Legacy; pub const DEFAULT_GC_HORIZON: u64 = 64 * 1024 * 1024; @@ -45,13 +52,16 @@ pub mod defaults { pub const DEFAULT_PITR_INTERVAL: &str = "7 days"; pub const DEFAULT_WALRECEIVER_CONNECT_TIMEOUT: &str = "10 seconds"; pub const DEFAULT_WALRECEIVER_LAGGING_WAL_TIMEOUT: &str = "10 seconds"; - pub const DEFAULT_MAX_WALRECEIVER_LSN_WAL_LAG: u64 = 10 * 1024 * 1024; + // The default limit on WAL lag should be set to avoid causing disconnects under high throughput + // scenarios: since the broker stats are updated ~1/s, a value of 1GiB should be sufficient for + // throughputs up to 1GiB/s per timeline. 
+ pub const DEFAULT_MAX_WALRECEIVER_LSN_WAL_LAG: u64 = 1024 * 1024 * 1024; pub const DEFAULT_EVICTIONS_LOW_RESIDENCE_DURATION_METRIC_THRESHOLD: &str = "24 hour"; pub const DEFAULT_INGEST_BATCH_SIZE: u64 = 100; } -#[derive(Debug, Clone, Serialize, Deserialize, PartialEq, Eq)] +#[derive(Debug, Copy, Clone, Serialize, Deserialize, PartialEq, Eq)] pub(crate) enum AttachmentMode { /// Our generation is current as far as we know, and as far as we know we are the only attached /// pageserver. This is the "normal" attachment mode. @@ -66,7 +76,7 @@ pub(crate) enum AttachmentMode { Stale, } -#[derive(Debug, Clone, Serialize, Deserialize, PartialEq, Eq)] +#[derive(Debug, Copy, Clone, Serialize, Deserialize, PartialEq, Eq)] pub(crate) struct AttachedLocationConfig { pub(crate) generation: Generation, pub(crate) attach_mode: AttachmentMode, @@ -251,7 +261,7 @@ impl LocationConf { } else { ShardIdentity::new( ShardNumber(conf.shard_number), - ShardCount(conf.shard_count), + ShardCount::new(conf.shard_count), ShardStripeSize(conf.shard_stripe_size), )? }; @@ -285,7 +295,7 @@ impl Default for LocationConf { /// /// For storing and transmitting individual tenant's configuration, see /// TenantConfOpt. -#[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize)] +#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)] pub struct TenantConf { // Flush out an inmemory layer, if it's holding WAL older than this // This puts a backstop on how much WAL needs to be re-digested if the @@ -305,6 +315,7 @@ pub struct TenantConf { pub compaction_period: Duration, // Level0 delta layer threshold for compaction. pub compaction_threshold: usize, + pub compaction_algorithm: CompactionAlgorithm, // Determines how much history is retained, to allow // branching and read replicas at an older point in time. // The unit is #of bytes of WAL. @@ -339,17 +350,22 @@ pub struct TenantConf { // See the corresponding metric's help string. 
#[serde(with = "humantime_serde")] pub evictions_low_residence_duration_metric_threshold: Duration, - pub gc_feedback: bool, /// If non-zero, the period between uploads of a heatmap from attached tenants. This /// may be disabled if a Tenant will not have secondary locations: only secondary /// locations will use the heatmap uploaded by attached locations. + #[serde(with = "humantime_serde")] pub heatmap_period: Duration, + + /// If true then SLRU segments are dowloaded on demand, if false SLRU segments are included in basebackup + pub lazy_slru_download: bool, + + pub timeline_get_throttle: pageserver_api::models::ThrottleConfig, } /// Same as TenantConf, but this struct preserves the information about /// which parameters are set and which are not. -#[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize, Default)] +#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize, Default)] pub struct TenantConfOpt { #[serde(skip_serializing_if = "Option::is_none")] #[serde(default)] @@ -373,6 +389,10 @@ pub struct TenantConfOpt { #[serde(default)] pub compaction_threshold: Option, + #[serde(skip_serializing_if = "Option::is_none")] + #[serde(default)] + pub compaction_algorithm: Option, + #[serde(skip_serializing_if = "Option::is_none")] #[serde(default)] pub gc_horizon: Option, @@ -422,14 +442,17 @@ pub struct TenantConfOpt { #[serde(default)] pub evictions_low_residence_duration_metric_threshold: Option, - #[serde(skip_serializing_if = "Option::is_none")] - #[serde(default)] - pub gc_feedback: Option, - #[serde(skip_serializing_if = "Option::is_none")] #[serde(with = "humantime_serde")] #[serde(default)] pub heatmap_period: Option, + + #[serde(skip_serializing_if = "Option::is_none")] + #[serde(default)] + pub lazy_slru_download: Option, + + #[serde(skip_serializing_if = "Option::is_none")] + pub timeline_get_throttle: Option, } impl TenantConfOpt { @@ -450,6 +473,9 @@ impl TenantConfOpt { compaction_threshold: self .compaction_threshold 
.unwrap_or(global_conf.compaction_threshold), + compaction_algorithm: self + .compaction_algorithm + .unwrap_or(global_conf.compaction_algorithm), gc_horizon: self.gc_horizon.unwrap_or(global_conf.gc_horizon), gc_period: self.gc_period.unwrap_or(global_conf.gc_period), image_creation_threshold: self @@ -473,8 +499,14 @@ impl TenantConfOpt { evictions_low_residence_duration_metric_threshold: self .evictions_low_residence_duration_metric_threshold .unwrap_or(global_conf.evictions_low_residence_duration_metric_threshold), - gc_feedback: self.gc_feedback.unwrap_or(global_conf.gc_feedback), heatmap_period: self.heatmap_period.unwrap_or(global_conf.heatmap_period), + lazy_slru_download: self + .lazy_slru_download + .unwrap_or(global_conf.lazy_slru_download), + timeline_get_throttle: self + .timeline_get_throttle + .clone() + .unwrap_or(global_conf.timeline_get_throttle), } } } @@ -490,6 +522,7 @@ impl Default for TenantConf { compaction_period: humantime::parse_duration(DEFAULT_COMPACTION_PERIOD) .expect("cannot parse default compaction period"), compaction_threshold: DEFAULT_COMPACTION_THRESHOLD, + compaction_algorithm: DEFAULT_COMPACTION_ALGORITHM, gc_horizon: DEFAULT_GC_HORIZON, gc_period: humantime::parse_duration(DEFAULT_GC_PERIOD) .expect("cannot parse default gc period"), @@ -511,8 +544,9 @@ impl Default for TenantConf { DEFAULT_EVICTIONS_LOW_RESIDENCE_DURATION_METRIC_THRESHOLD, ) .expect("cannot parse default evictions_low_residence_duration_metric_threshold"), - gc_feedback: false, heatmap_period: Duration::ZERO, + lazy_slru_download: false, + timeline_get_throttle: crate::tenant::throttle::Config::disabled(), } } } @@ -566,6 +600,7 @@ impl From for models::TenantConfig { Self { checkpoint_distance: value.checkpoint_distance, checkpoint_timeout: value.checkpoint_timeout.map(humantime), + compaction_algorithm: value.compaction_algorithm, compaction_target_size: value.compaction_target_size, compaction_period: value.compaction_period.map(humantime), 
compaction_threshold: value.compaction_threshold, @@ -582,8 +617,9 @@ impl From for models::TenantConfig { evictions_low_residence_duration_metric_threshold: value .evictions_low_residence_duration_metric_threshold .map(humantime), - gc_feedback: value.gc_feedback, heatmap_period: value.heatmap_period.map(humantime), + lazy_slru_download: value.lazy_slru_download, + timeline_get_throttle: value.timeline_get_throttle.map(ThrottleConfig::from), } } } diff --git a/pageserver/src/tenant/delete.rs b/pageserver/src/tenant/delete.rs index 97de0cdcf9..ffb7206b1e 100644 --- a/pageserver/src/tenant/delete.rs +++ b/pageserver/src/tenant/delete.rs @@ -3,10 +3,10 @@ use std::sync::Arc; use anyhow::Context; use camino::{Utf8Path, Utf8PathBuf}; use pageserver_api::{models::TenantState, shard::TenantShardId}; -use remote_storage::{GenericRemoteStorage, RemotePath}; +use remote_storage::{GenericRemoteStorage, RemotePath, TimeoutOrCancel}; use tokio::sync::OwnedMutexGuard; use tokio_util::sync::CancellationToken; -use tracing::{error, instrument, Instrument, Span}; +use tracing::{error, instrument, Instrument}; use utils::{backoff, completion, crashsafe, fs_ext, id::TimelineId}; @@ -84,16 +84,18 @@ async fn create_remote_delete_mark( let data = bytes::Bytes::from_static(data); let stream = futures::stream::once(futures::future::ready(Ok(data))); remote_storage - .upload(stream, 0, &remote_mark_path, None) + .upload(stream, 0, &remote_mark_path, None, cancel) .await }, - |_e| false, + TimeoutOrCancel::caused_by_cancel, FAILED_UPLOAD_WARN_THRESHOLD, FAILED_REMOTE_OP_RETRIES, "mark_upload", - backoff::Cancel::new(cancel.clone(), || anyhow::anyhow!("Cancelled")), + cancel, ) .await + .ok_or_else(|| anyhow::Error::new(TimeoutOrCancel::Cancel)) + .and_then(|x| x) .context("mark_upload")?; Ok(()) @@ -136,7 +138,11 @@ async fn schedule_ordered_timeline_deletions( let mut already_running_deletions = vec![]; for (timeline_id, _) in sorted.into_iter().rev() { - if let Err(e) = 
DeleteTimelineFlow::run(tenant, timeline_id, true).await { + let span = tracing::info_span!("timeline_delete", %timeline_id); + let res = DeleteTimelineFlow::run(tenant, timeline_id, true) + .instrument(span) + .await; + if let Err(e) = res { match e { DeleteTimelineError::NotFound => { // Timeline deletion finished after call to clone above but before call @@ -178,14 +184,16 @@ async fn remove_tenant_remote_delete_mark( if let Some(remote_storage) = remote_storage { let path = remote_tenant_delete_mark_path(conf, tenant_shard_id)?; backoff::retry( - || async { remote_storage.delete(&path).await }, - |_e| false, + || async { remote_storage.delete(&path, cancel).await }, + TimeoutOrCancel::caused_by_cancel, FAILED_UPLOAD_WARN_THRESHOLD, FAILED_REMOTE_OP_RETRIES, "remove_tenant_remote_delete_mark", - backoff::Cancel::new(cancel.clone(), || anyhow::anyhow!("Cancelled")), + cancel, ) .await + .ok_or_else(|| anyhow::Error::new(TimeoutOrCancel::Cancel)) + .and_then(|x| x) .context("remove_tenant_remote_delete_mark")?; } Ok(()) @@ -238,6 +246,8 @@ async fn cleanup_remaining_fs_traces( rm(conf.tenant_deleted_mark_file_path(tenant_shard_id), false).await?; + rm(conf.tenant_heatmap_path(tenant_shard_id), false).await?; + fail::fail_point!("tenant-delete-before-remove-tenant-dir", |_| { Err(anyhow::anyhow!( "failpoint: tenant-delete-before-remove-tenant-dir" @@ -410,7 +420,7 @@ impl DeleteTenantFlow { .expect("cant be stopping or broken"); tenant - .attach(preload, super::SpawnMode::Normal, ctx) + .attach(preload, super::SpawnMode::Eager, ctx) .await .context("attach")?; @@ -488,11 +498,7 @@ impl DeleteTenantFlow { }; Ok(()) } - .instrument({ - let span = tracing::info_span!(parent: None, "delete_tenant", tenant_id=%tenant_shard_id.tenant_id, shard_id=%tenant_shard_id.shard_slug()); - span.follows_from(Span::current()); - span - }), + .instrument(tracing::info_span!(parent: None, "delete_tenant", tenant_id=%tenant_shard_id.tenant_id, shard_id=%tenant_shard_id.shard_slug())), 
); } diff --git a/pageserver/src/tenant/disk_btree.rs b/pageserver/src/tenant/disk_btree.rs index 06a04bf536..6d85d1e60e 100644 --- a/pageserver/src/tenant/disk_btree.rs +++ b/pageserver/src/tenant/disk_btree.rs @@ -18,11 +18,19 @@ //! - An Iterator interface would be more convenient for the callers than the //! 'visit' function //! +use async_stream::try_stream; use byteorder::{ReadBytesExt, BE}; use bytes::{BufMut, Bytes, BytesMut}; use either::Either; +use futures::Stream; use hex; -use std::{cmp::Ordering, io, result}; +use std::{ + cmp::Ordering, + io, + iter::Rev, + ops::{Range, RangeInclusive}, + result, +}; use thiserror::Error; use tracing::error; @@ -36,7 +44,6 @@ use crate::{ pub const VALUE_SZ: usize = 5; pub const MAX_VALUE: u64 = 0x007f_ffff_ffff; -#[allow(dead_code)] pub const PAGE_SZ: usize = 8192; #[derive(Clone, Copy, Debug)] @@ -252,6 +259,90 @@ where Ok(result) } + /// Return a stream which yields all key, value pairs from the index + /// starting from the first key greater or equal to `start_key`. + /// + /// Note that this is a copy of [`Self::visit`]. + /// TODO: Once the sequential read path is removed this will become + /// the only index traversal method. + pub fn get_stream_from<'a>( + &'a self, + start_key: &'a [u8; L], + ctx: &'a RequestContext, + ) -> impl Stream, u64), DiskBtreeError>> + 'a { + try_stream! { + let mut stack = Vec::new(); + stack.push((self.root_blk, None)); + let block_cursor = self.reader.block_cursor(); + while let Some((node_blknum, opt_iter)) = stack.pop() { + // Locate the node. 
+ let node_buf = block_cursor + .read_blk(self.start_blk + node_blknum, ctx) + .await?; + + let node = OnDiskNode::deparse(node_buf.as_ref())?; + let prefix_len = node.prefix_len as usize; + let suffix_len = node.suffix_len as usize; + + assert!(node.num_children > 0); + + let mut keybuf = Vec::new(); + keybuf.extend(node.prefix); + keybuf.resize(prefix_len + suffix_len, 0); + + let mut iter: Either, Rev>> = if let Some(iter) = opt_iter { + iter + } else { + // Locate the first match + let idx = match node.binary_search(start_key, keybuf.as_mut_slice()) { + Ok(idx) => idx, + Err(idx) => { + if node.level == 0 { + // Imagine that the node contains the following keys: + // + // 1 + // 3 <-- idx + // 5 + // + // If the search key is '2' and there is exact match, + // the binary search would return the index of key + // '3'. That's cool, '3' is the first key to return. + idx + } else { + // This is an internal page, so each key represents a lower + // bound for what's in the child page. If there is no exact + // match, we have to return the *previous* entry. + // + // 1 <-- return this + // 3 <-- idx + // 5 + idx.saturating_sub(1) + } + } + }; + Either::Left(idx..node.num_children.into()) + }; + + // idx points to the first match now. Keep going from there + while let Some(idx) = iter.next() { + let key_off = idx * suffix_len; + let suffix = &node.keys[key_off..key_off + suffix_len]; + keybuf[prefix_len..].copy_from_slice(suffix); + let value = node.value(idx); + #[allow(clippy::collapsible_if)] + if node.level == 0 { + // leaf + yield (keybuf.clone(), value.to_u64()); + } else { + stack.push((node_blknum, Some(iter))); + stack.push((value.to_blknum(), None)); + break; + } + } + } + } + } + /// /// Scan the tree, starting from 'search_key', in the given direction. 
'visitor' /// will be called for every key >= 'search_key' (or <= 'search_key', if scanning @@ -701,8 +792,6 @@ impl BuildNode { #[cfg(test)] pub(crate) mod tests { use super::*; - use crate::context::DownloadBehavior; - use crate::task_mgr::TaskKind; use crate::tenant::block_io::{BlockCursor, BlockLease, BlockReaderRef}; use rand::Rng; use std::collections::BTreeMap; diff --git a/pageserver/src/tenant/ephemeral_file.rs b/pageserver/src/tenant/ephemeral_file.rs index 6b8cd77d78..e48b9e83bd 100644 --- a/pageserver/src/tenant/ephemeral_file.rs +++ b/pageserver/src/tenant/ephemeral_file.rs @@ -6,6 +6,7 @@ use crate::context::RequestContext; use crate::page_cache::{self, PAGE_SZ}; use crate::tenant::block_io::{BlockCursor, BlockLease, BlockReader}; use crate::virtual_file::{self, VirtualFile}; +use bytes::BytesMut; use camino::Utf8PathBuf; use pageserver_api::shard::TenantShardId; use std::cmp::min; @@ -26,7 +27,10 @@ pub struct EphemeralFile { /// An ephemeral file is append-only. /// We keep the last page, which can still be modified, in [`Self::mutable_tail`]. /// The other pages, which can no longer be modified, are accessed through the page cache. - mutable_tail: [u8; PAGE_SZ], + /// + /// None <=> IO is ongoing. + /// Size is fixed to PAGE_SZ at creation time and must not be changed. 
+ mutable_tail: Option, } impl EphemeralFile { @@ -60,7 +64,7 @@ impl EphemeralFile { _timeline_id: timeline_id, file, len: 0, - mutable_tail: [0u8; PAGE_SZ], + mutable_tail: Some(BytesMut::zeroed(PAGE_SZ)), }) } @@ -103,7 +107,13 @@ impl EphemeralFile { }; } else { debug_assert_eq!(blknum as u64, self.len / PAGE_SZ as u64); - Ok(BlockLease::EphemeralFileMutableTail(&self.mutable_tail)) + Ok(BlockLease::EphemeralFileMutableTail( + self.mutable_tail + .as_deref() + .expect("we're not doing IO, it must be Some()") + .try_into() + .expect("we ensure that it's always PAGE_SZ"), + )) } } @@ -135,21 +145,27 @@ impl EphemeralFile { ) -> Result<(), io::Error> { let mut src_remaining = src; while !src_remaining.is_empty() { - let dst_remaining = &mut self.ephemeral_file.mutable_tail[self.off..]; + let dst_remaining = &mut self + .ephemeral_file + .mutable_tail + .as_deref_mut() + .expect("IO is not yet ongoing")[self.off..]; let n = min(dst_remaining.len(), src_remaining.len()); dst_remaining[..n].copy_from_slice(&src_remaining[..n]); self.off += n; src_remaining = &src_remaining[n..]; if self.off == PAGE_SZ { - match self + let mutable_tail = std::mem::take(&mut self.ephemeral_file.mutable_tail) + .expect("IO is not yet ongoing"); + let (mutable_tail, res) = self .ephemeral_file .file - .write_all_at( - &self.ephemeral_file.mutable_tail, - self.blknum as u64 * PAGE_SZ as u64, - ) - .await - { + .write_all_at(mutable_tail, self.blknum as u64 * PAGE_SZ as u64) + .await; + // TODO: If we panic before we can put the mutable_tail back, subsequent calls will fail. + // I.e., the IO isn't retryable if we panic. + self.ephemeral_file.mutable_tail = Some(mutable_tail); + match res { Ok(_) => { // Pre-warm the page cache with what we just wrote. // This isn't necessary for coherency/correctness, but it's how we've always done it. 
@@ -169,7 +185,12 @@ impl EphemeralFile { Ok(page_cache::ReadBufResult::NotFound(mut write_guard)) => { let buf: &mut [u8] = write_guard.deref_mut(); debug_assert_eq!(buf.len(), PAGE_SZ); - buf.copy_from_slice(&self.ephemeral_file.mutable_tail); + buf.copy_from_slice( + self.ephemeral_file + .mutable_tail + .as_deref() + .expect("IO is not ongoing"), + ); let _ = write_guard.mark_valid(); // pre-warm successful } @@ -181,7 +202,11 @@ impl EphemeralFile { // Zero the buffer for re-use. // Zeroing is critical for correcntess because the write_blob code below // and similarly read_blk expect zeroed pages. - self.ephemeral_file.mutable_tail.fill(0); + self.ephemeral_file + .mutable_tail + .as_deref_mut() + .expect("IO is not ongoing") + .fill(0); // This block is done, move to next one. self.blknum += 1; self.off = 0; @@ -275,7 +300,7 @@ mod tests { use super::*; use crate::context::DownloadBehavior; use crate::task_mgr::TaskKind; - use crate::tenant::block_io::{BlockCursor, BlockReaderRef}; + use crate::tenant::block_io::BlockReaderRef; use rand::{thread_rng, RngCore}; use std::fs; use std::str::FromStr; diff --git a/pageserver/src/tenant/layer_map.rs b/pageserver/src/tenant/layer_map.rs index c31d401e84..b8ed69052f 100644 --- a/pageserver/src/tenant/layer_map.rs +++ b/pageserver/src/tenant/layer_map.rs @@ -51,7 +51,9 @@ use crate::keyspace::KeyPartitioning; use crate::repository::Key; use crate::tenant::storage_layer::InMemoryLayer; use anyhow::Result; -use std::collections::VecDeque; +use pageserver_api::keyspace::KeySpaceAccum; +use std::collections::{HashMap, VecDeque}; +use std::iter::Peekable; use std::ops::Range; use std::sync::Arc; use utils::lsn::Lsn; @@ -144,11 +146,235 @@ impl Drop for BatchedUpdates<'_> { } /// Return value of LayerMap::search +#[derive(Eq, PartialEq, Debug, Hash)] pub struct SearchResult { pub layer: Arc, pub lsn_floor: Lsn, } +/// Return value of [`LayerMap::range_search`] +/// +/// Contains a mapping from a layer description to a 
keyspace +/// accumulator that contains all the keys which intersect the layer +/// from the original search space. Keys that were not found are accumulated +/// in a separate key space accumulator. +#[derive(Debug)] +pub struct RangeSearchResult { + pub found: HashMap, + pub not_found: KeySpaceAccum, +} + +impl RangeSearchResult { + fn new() -> Self { + Self { + found: HashMap::new(), + not_found: KeySpaceAccum::new(), + } + } +} + +/// Collector for results of range search queries on the LayerMap. +/// It should be provided with two iterators for the delta and image coverage +/// that contain all the changes for layers which intersect the range. +struct RangeSearchCollector +where + Iter: Iterator>)>, +{ + delta_coverage: Peekable, + image_coverage: Peekable, + key_range: Range, + end_lsn: Lsn, + + current_delta: Option>, + current_image: Option>, + + result: RangeSearchResult, +} + +#[derive(Debug)] +enum NextLayerType { + Delta(i128), + Image(i128), + Both(i128), +} + +impl NextLayerType { + fn next_change_at_key(&self) -> Key { + match self { + NextLayerType::Delta(at) => Key::from_i128(*at), + NextLayerType::Image(at) => Key::from_i128(*at), + NextLayerType::Both(at) => Key::from_i128(*at), + } + } +} + +impl RangeSearchCollector +where + Iter: Iterator>)>, +{ + fn new( + key_range: Range, + end_lsn: Lsn, + delta_coverage: Iter, + image_coverage: Iter, + ) -> Self { + Self { + delta_coverage: delta_coverage.peekable(), + image_coverage: image_coverage.peekable(), + key_range, + end_lsn, + current_delta: None, + current_image: None, + result: RangeSearchResult::new(), + } + } + + /// Run the collector. Collection is implemented via a two pointer algorithm. + /// One pointer tracks the start of the current range and the other tracks + /// the beginning of the next range which will overlap with the next change + /// in coverage across both image and delta. 
+ fn collect(mut self) -> RangeSearchResult { + let next_layer_type = self.choose_next_layer_type(); + let mut current_range_start = match next_layer_type { + None => { + // No changes for the range + self.pad_range(self.key_range.clone()); + return self.result; + } + Some(layer_type) if self.key_range.end <= layer_type.next_change_at_key() => { + // Changes only after the end of the range + self.pad_range(self.key_range.clone()); + return self.result; + } + Some(layer_type) => { + // Changes for the range exist. Record anything before the first + // coverage change as not found. + let coverage_start = layer_type.next_change_at_key(); + let range_before = self.key_range.start..coverage_start; + self.pad_range(range_before); + + self.advance(&layer_type); + coverage_start + } + }; + + while current_range_start < self.key_range.end { + let next_layer_type = self.choose_next_layer_type(); + match next_layer_type { + Some(t) => { + let current_range_end = t.next_change_at_key(); + self.add_range(current_range_start..current_range_end); + current_range_start = current_range_end; + + self.advance(&t); + } + None => { + self.add_range(current_range_start..self.key_range.end); + current_range_start = self.key_range.end; + } + } + } + + self.result + } + + /// Mark a range as not found (i.e. no layers intersect it) + fn pad_range(&mut self, key_range: Range) { + if !key_range.is_empty() { + self.result.not_found.add_range(key_range); + } + } + + /// Select the appropiate layer for the given range and update + /// the collector. + fn add_range(&mut self, covered_range: Range) { + let selected = LayerMap::select_layer( + self.current_delta.clone(), + self.current_image.clone(), + self.end_lsn, + ); + + match selected { + Some(search_result) => self + .result + .found + .entry(search_result) + .or_default() + .add_range(covered_range), + None => self.pad_range(covered_range), + } + } + + /// Move to the next coverage change. 
+ fn advance(&mut self, layer_type: &NextLayerType) { + match layer_type { + NextLayerType::Delta(_) => { + let (_, layer) = self.delta_coverage.next().unwrap(); + self.current_delta = layer; + } + NextLayerType::Image(_) => { + let (_, layer) = self.image_coverage.next().unwrap(); + self.current_image = layer; + } + NextLayerType::Both(_) => { + let (_, image_layer) = self.image_coverage.next().unwrap(); + let (_, delta_layer) = self.delta_coverage.next().unwrap(); + + self.current_image = image_layer; + self.current_delta = delta_layer; + } + } + } + + /// Pick the next coverage change: the one at the lesser key or both if they're alligned. + fn choose_next_layer_type(&mut self) -> Option { + let next_delta_at = self.delta_coverage.peek().map(|(key, _)| key); + let next_image_at = self.image_coverage.peek().map(|(key, _)| key); + + match (next_delta_at, next_image_at) { + (None, None) => None, + (Some(next_delta_at), None) => Some(NextLayerType::Delta(*next_delta_at)), + (None, Some(next_image_at)) => Some(NextLayerType::Image(*next_image_at)), + (Some(next_delta_at), Some(next_image_at)) if next_image_at < next_delta_at => { + Some(NextLayerType::Image(*next_image_at)) + } + (Some(next_delta_at), Some(next_image_at)) if next_delta_at < next_image_at => { + Some(NextLayerType::Delta(*next_delta_at)) + } + (Some(next_delta_at), Some(_)) => Some(NextLayerType::Both(*next_delta_at)), + } + } +} + +#[derive(PartialEq, Eq, Hash, Debug, Clone)] +pub enum InMemoryLayerHandle { + Open { + lsn_floor: Lsn, + end_lsn: Lsn, + }, + Frozen { + idx: usize, + lsn_floor: Lsn, + end_lsn: Lsn, + }, +} + +impl InMemoryLayerHandle { + pub fn get_lsn_floor(&self) -> Lsn { + match self { + InMemoryLayerHandle::Open { lsn_floor, .. } => *lsn_floor, + InMemoryLayerHandle::Frozen { lsn_floor, .. } => *lsn_floor, + } + } + + pub fn get_end_lsn(&self) -> Lsn { + match self { + InMemoryLayerHandle::Open { end_lsn, .. } => *end_lsn, + InMemoryLayerHandle::Frozen { end_lsn, .. 
} => *end_lsn, + } + } +} + impl LayerMap { /// /// Find the latest layer (by lsn.end) that covers the given @@ -186,7 +412,18 @@ impl LayerMap { let latest_delta = version.delta_coverage.query(key.to_i128()); let latest_image = version.image_coverage.query(key.to_i128()); - match (latest_delta, latest_image) { + Self::select_layer(latest_delta, latest_image, end_lsn) + } + + fn select_layer( + delta_layer: Option>, + image_layer: Option>, + end_lsn: Lsn, + ) -> Option { + assert!(delta_layer.as_ref().map_or(true, |l| l.is_delta())); + assert!(image_layer.as_ref().map_or(true, |l| !l.is_delta())); + + match (delta_layer, image_layer) { (None, None) => None, (None, Some(image)) => { let lsn_floor = image.get_lsn_range().start; @@ -223,6 +460,24 @@ impl LayerMap { } } + pub fn range_search(&self, key_range: Range, end_lsn: Lsn) -> RangeSearchResult { + let version = match self.historic.get().unwrap().get_version(end_lsn.0 - 1) { + Some(version) => version, + None => { + let mut result = RangeSearchResult::new(); + result.not_found.add_range(key_range); + return result; + } + }; + + let raw_range = key_range.start.to_i128()..key_range.end.to_i128(); + let delta_changes = version.delta_coverage.range_overlaps(&raw_range); + let image_changes = version.image_coverage.range_overlaps(&raw_range); + + let collector = RangeSearchCollector::new(key_range, end_lsn, delta_changes, image_changes); + collector.collect() + } + /// Start a batch of updates, applied on drop pub fn batch_update(&mut self) -> BatchedUpdates<'_> { BatchedUpdates { layer_map: self } @@ -321,6 +576,43 @@ impl LayerMap { self.historic.iter() } + /// Get a handle for the first in memory layer that matches the provided predicate. + /// The handle should be used with [`Self::get_in_memory_layer`] to retrieve the actual layer. 
+ /// + /// Note: [`Self::find_in_memory_layer`] and [`Self::get_in_memory_layer`] should be called during + /// the same exclusive region established by holding the layer manager lock. + pub fn find_in_memory_layer(&self, mut pred: Pred) -> Option + where + Pred: FnMut(&Arc) -> bool, + { + if let Some(open) = &self.open_layer { + if pred(open) { + return Some(InMemoryLayerHandle::Open { + lsn_floor: open.get_lsn_range().start, + end_lsn: open.get_lsn_range().end, + }); + } + } + + let pos = self.frozen_layers.iter().rev().position(pred); + pos.map(|rev_idx| { + let idx = self.frozen_layers.len() - 1 - rev_idx; + InMemoryLayerHandle::Frozen { + idx, + lsn_floor: self.frozen_layers[idx].get_lsn_range().start, + end_lsn: self.frozen_layers[idx].get_lsn_range().end, + } + }) + } + + /// Get the layer pointed to by the provided handle. + pub fn get_in_memory_layer(&self, handle: &InMemoryLayerHandle) -> Option> { + match handle { + InMemoryLayerHandle::Open { .. } => self.open_layer.clone(), + InMemoryLayerHandle::Frozen { idx, .. } => self.frozen_layers.get(*idx).cloned(), + } + } + /// /// Divide the whole given range of keys into sub-ranges based on the latest /// image layer that covers each range at the specified lsn (inclusive). 
@@ -631,3 +923,133 @@ impl LayerMap { Ok(()) } } + +#[cfg(test)] +mod tests { + use pageserver_api::keyspace::KeySpace; + + use super::*; + + #[derive(Clone)] + struct LayerDesc { + key_range: Range, + lsn_range: Range, + is_delta: bool, + } + + fn create_layer_map(layers: Vec) -> LayerMap { + let mut layer_map = LayerMap::default(); + + for layer in layers { + layer_map.insert_historic_noflush(PersistentLayerDesc::new_test( + layer.key_range, + layer.lsn_range, + layer.is_delta, + )); + } + + layer_map.flush_updates(); + layer_map + } + + fn assert_range_search_result_eq(lhs: RangeSearchResult, rhs: RangeSearchResult) { + assert_eq!(lhs.not_found.to_keyspace(), rhs.not_found.to_keyspace()); + let lhs: HashMap = lhs + .found + .into_iter() + .map(|(search_result, accum)| (search_result, accum.to_keyspace())) + .collect(); + let rhs: HashMap = rhs + .found + .into_iter() + .map(|(search_result, accum)| (search_result, accum.to_keyspace())) + .collect(); + + assert_eq!(lhs, rhs); + } + + fn brute_force_range_search( + layer_map: &LayerMap, + key_range: Range, + end_lsn: Lsn, + ) -> RangeSearchResult { + let mut range_search_result = RangeSearchResult::new(); + + let mut key = key_range.start; + while key != key_range.end { + let res = layer_map.search(key, end_lsn); + match res { + Some(res) => { + range_search_result + .found + .entry(res) + .or_default() + .add_key(key); + } + None => { + range_search_result.not_found.add_key(key); + } + } + + key = key.next(); + } + + range_search_result + } + + #[test] + fn ranged_search_on_empty_layer_map() { + let layer_map = LayerMap::default(); + let range = Key::from_i128(100)..Key::from_i128(200); + + let res = layer_map.range_search(range.clone(), Lsn(100)); + assert_eq!( + res.not_found.to_keyspace(), + KeySpace { + ranges: vec![range] + } + ); + } + + #[test] + fn ranged_search() { + let layers = vec![ + LayerDesc { + key_range: Key::from_i128(15)..Key::from_i128(50), + lsn_range: Lsn(0)..Lsn(5), + is_delta: false, + }, 
+ LayerDesc { + key_range: Key::from_i128(10)..Key::from_i128(20), + lsn_range: Lsn(5)..Lsn(20), + is_delta: true, + }, + LayerDesc { + key_range: Key::from_i128(15)..Key::from_i128(25), + lsn_range: Lsn(20)..Lsn(30), + is_delta: true, + }, + LayerDesc { + key_range: Key::from_i128(35)..Key::from_i128(40), + lsn_range: Lsn(25)..Lsn(35), + is_delta: true, + }, + LayerDesc { + key_range: Key::from_i128(35)..Key::from_i128(40), + lsn_range: Lsn(35)..Lsn(40), + is_delta: false, + }, + ]; + + let layer_map = create_layer_map(layers.clone()); + for start in 0..60 { + for end in (start + 1)..60 { + let range = Key::from_i128(start)..Key::from_i128(end); + let result = layer_map.range_search(range.clone(), Lsn(100)); + let expected = brute_force_range_search(&layer_map, range, Lsn(100)); + + assert_range_search_result_eq(result, expected); + } + } + } +} diff --git a/pageserver/src/tenant/layer_map/layer_coverage.rs b/pageserver/src/tenant/layer_map/layer_coverage.rs index 1d9101d3d1..cf0085c071 100644 --- a/pageserver/src/tenant/layer_map/layer_coverage.rs +++ b/pageserver/src/tenant/layer_map/layer_coverage.rs @@ -129,6 +129,42 @@ impl LayerCoverage { .map(|(k, v)| (*k, v.as_ref().map(|x| x.1.clone()))) } + /// Returns an iterator which includes all coverage changes for layers that intersect + /// with the provided range. + pub fn range_overlaps( + &self, + key_range: &Range, + ) -> impl Iterator)> + '_ + where + Value: Eq, + { + let first_change = self.query(key_range.start); + match first_change { + Some(change) => { + // If the start of the range is covered, we have to deal with two cases: + // 1. Start of the range is aligned with the start of a layer. + // In this case the return of `self.range` will contain the layer which aligns with the start of the key range. + // We advance said iterator to avoid duplicating the first change. + // 2. Start of the range is not aligned with the start of a layer. 
+ let range = key_range.start..key_range.end; + let mut range_coverage = self.range(range).peekable(); + if range_coverage + .peek() + .is_some_and(|c| c.1.as_ref() == Some(&change)) + { + range_coverage.next(); + } + itertools::Either::Left( + std::iter::once((key_range.start, Some(change))).chain(range_coverage), + ) + } + None => { + let range = key_range.start..key_range.end; + let coverage = self.range(range); + itertools::Either::Right(coverage) + } + } + } /// O(1) clone pub fn clone(&self) -> Self { Self { diff --git a/pageserver/src/tenant/metadata.rs b/pageserver/src/tenant/metadata.rs index 6fb86c65e2..1736950d1f 100644 --- a/pageserver/src/tenant/metadata.rs +++ b/pageserver/src/tenant/metadata.rs @@ -8,20 +8,11 @@ //! //! [`remote_timeline_client`]: super::remote_timeline_client -use std::io::{self}; - -use anyhow::{ensure, Context}; -use pageserver_api::shard::TenantShardId; +use anyhow::ensure; use serde::{de::Error, Deserialize, Serialize, Serializer}; -use thiserror::Error; use utils::bin_ser::SerializeError; -use utils::crashsafe::path_with_suffix_extension; use utils::{bin_ser::BeSer, id::TimelineId, lsn::Lsn}; -use crate::config::PageServerConf; -use crate::virtual_file::VirtualFile; -use crate::TEMP_FILE_SUFFIX; - /// Use special format number to enable backward compatibility. 
const METADATA_FORMAT_VERSION: u16 = 4; @@ -268,43 +259,6 @@ impl Serialize for TimelineMetadata { } } -/// Save timeline metadata to file -#[tracing::instrument(skip_all, fields(%tenant_id=tenant_shard_id.tenant_id, %shard_id=tenant_shard_id.shard_slug(), %timeline_id))] -pub async fn save_metadata( - conf: &'static PageServerConf, - tenant_shard_id: &TenantShardId, - timeline_id: &TimelineId, - data: &TimelineMetadata, -) -> anyhow::Result<()> { - let path = conf.metadata_path(tenant_shard_id, timeline_id); - let temp_path = path_with_suffix_extension(&path, TEMP_FILE_SUFFIX); - let metadata_bytes = data.to_bytes().context("serialize metadata")?; - VirtualFile::crashsafe_overwrite(&path, &temp_path, &metadata_bytes) - .await - .context("write metadata")?; - Ok(()) -} - -#[derive(Error, Debug)] -pub enum LoadMetadataError { - #[error(transparent)] - Read(#[from] io::Error), - - #[error(transparent)] - Decode(#[from] anyhow::Error), -} - -pub fn load_metadata( - conf: &'static PageServerConf, - tenant_shard_id: &TenantShardId, - timeline_id: &TimelineId, -) -> Result { - let metadata_path = conf.metadata_path(tenant_shard_id, timeline_id); - let metadata_bytes = std::fs::read(metadata_path)?; - - Ok(TimelineMetadata::from_bytes(&metadata_bytes)?) -} - #[cfg(test)] mod tests { use super::*; diff --git a/pageserver/src/tenant/mgr.rs b/pageserver/src/tenant/mgr.rs index 32535e0134..26fcce1f38 100644 --- a/pageserver/src/tenant/mgr.rs +++ b/pageserver/src/tenant/mgr.rs @@ -2,9 +2,13 @@ //! page server. 
use camino::{Utf8DirEntry, Utf8Path, Utf8PathBuf}; +use futures::stream::StreamExt; +use itertools::Itertools; use pageserver_api::key::Key; use pageserver_api::models::ShardParameters; -use pageserver_api::shard::{ShardCount, ShardIdentity, ShardNumber, TenantShardId}; +use pageserver_api::shard::{ + ShardCount, ShardIdentity, ShardNumber, ShardStripeSize, TenantShardId, +}; use rand::{distributions::Alphanumeric, Rng}; use std::borrow::Cow; use std::cmp::Ordering; @@ -22,7 +26,7 @@ use tokio_util::sync::CancellationToken; use tracing::*; use remote_storage::GenericRemoteStorage; -use utils::crashsafe; +use utils::{completion, crashsafe}; use crate::config::PageServerConf; use crate::context::{DownloadBehavior, RequestContext}; @@ -30,6 +34,7 @@ use crate::control_plane_client::{ ControlPlaneClient, ControlPlaneGenerationsApi, RetryForeverError, }; use crate::deletion_queue::DeletionQueueClient; +use crate::http::routes::ACTIVE_TENANT_TIMEOUT; use crate::metrics::{TENANT, TENANT_MANAGER as METRICS}; use crate::task_mgr::{self, TaskKind}; use crate::tenant::config::{ @@ -39,7 +44,7 @@ use crate::tenant::config::{ use crate::tenant::delete::DeleteTenantFlow; use crate::tenant::span::debug_assert_current_span_has_tenant_id; use crate::tenant::{AttachedTenantConf, SpawnMode, Tenant, TenantState}; -use crate::{InitializationOrder, IGNORED_TENANT_FILE_NAME, TEMP_FILE_SUFFIX}; +use crate::{InitializationOrder, IGNORED_TENANT_FILE_NAME, METADATA_FILE_NAME, TEMP_FILE_SUFFIX}; use utils::crashsafe::path_with_suffix_extension; use utils::fs_ext::PathExt; @@ -292,7 +297,7 @@ async fn init_load_generations( } else if let Some(client) = ControlPlaneClient::new(conf, cancel) { info!("Calling control plane API to re-attach tenants"); // If we are configured to use the control plane API, then it is the source of truth for what tenants to load. 
- match client.re_attach().await { + match client.re_attach(conf).await { Ok(tenants) => tenants, Err(RetryForeverError::ShuttingDown) => { anyhow::bail!("Shut down while waiting for control plane re-attach response") @@ -356,12 +361,6 @@ fn load_tenant_config( return Ok(None); } - let tenant_ignore_mark_file = tenant_dir_path.join(IGNORED_TENANT_FILE_NAME); - if tenant_ignore_mark_file.exists() { - info!("Found an ignore mark file {tenant_ignore_mark_file:?}, skipping the tenant"); - return Ok(None); - } - let tenant_shard_id = match tenant_dir_path .file_name() .unwrap_or_default() @@ -374,6 +373,59 @@ fn load_tenant_config( } }; + // Clean up legacy `metadata` files. + // Doing it here because every single tenant directory is visited here. + // In any later code, there's different treatment of tenant dirs + // ... depending on whether the tenant is in re-attach response or not + // ... epending on whether the tenant is ignored or not + assert_eq!( + &conf.tenant_path(&tenant_shard_id), + &tenant_dir_path, + "later use of conf....path() methods would be dubious" + ); + let timelines: Vec = match conf.timelines_path(&tenant_shard_id).read_dir_utf8() { + Ok(iter) => { + let mut timelines = Vec::new(); + for res in iter { + let p = res?; + let Some(timeline_id) = p.file_name().parse::().ok() else { + // skip any entries that aren't TimelineId, such as + // - *.___temp dirs + // - unfinished initdb uploads (test_non_uploaded_root_timeline_is_deleted_after_restart) + continue; + }; + timelines.push(timeline_id); + } + timelines + } + Err(e) if e.kind() == std::io::ErrorKind::NotFound => vec![], + Err(e) => return Err(anyhow::anyhow!(e)), + }; + for timeline_id in timelines { + let timeline_path = &conf.timeline_path(&tenant_shard_id, &timeline_id); + let metadata_path = timeline_path.join(METADATA_FILE_NAME); + match std::fs::remove_file(&metadata_path) { + Ok(()) => { + crashsafe::fsync(timeline_path) + .context("fsync timeline dir after removing legacy metadata 
file")?; + info!("removed legacy metadata file at {metadata_path}"); + } + Err(e) if e.kind() == std::io::ErrorKind::NotFound => { + // something removed the file earlier, or it was never there + // We don't care, this software version doesn't write it again, so, we're good. + } + Err(e) => { + anyhow::bail!("remove legacy metadata file: {e}: {metadata_path}"); + } + } + } + + let tenant_ignore_mark_file = tenant_dir_path.join(IGNORED_TENANT_FILE_NAME); + if tenant_ignore_mark_file.exists() { + info!("Found an ignore mark file {tenant_ignore_mark_file:?}, skipping the tenant"); + return Ok(None); + } + Ok(Some(( tenant_shard_id, Tenant::load_tenant_config(conf, &tenant_shard_id), @@ -482,7 +534,7 @@ pub async fn init_tenant_mgr( TenantSlot::Secondary(SecondaryTenant::new( tenant_shard_id, location_conf.shard, - location_conf.tenant_conf, + location_conf.tenant_conf.clone(), &SecondaryLocationConfig { warm: false }, )), ); @@ -545,7 +597,7 @@ pub async fn init_tenant_mgr( shard_identity, Some(init_order.clone()), &TENANTS, - SpawnMode::Normal, + SpawnMode::Lazy, &ctx, ) { Ok(tenant) => { @@ -607,13 +659,6 @@ pub(crate) fn tenant_spawn( "Cannot load tenant, ignore mark found at {tenant_ignore_mark:?}" ); - info!( - tenant_id = %tenant_shard_id.tenant_id, - shard_id = %tenant_shard_id.shard_slug(), - generation = ?location_conf.location.generation, - attach_mode = ?location_conf.location.attach_mode, - "Attaching tenant" - ); let tenant = match Tenant::spawn( conf, tenant_shard_id, @@ -651,8 +696,6 @@ pub(crate) async fn shutdown_all_tenants() { } async fn shutdown_all_tenants0(tenants: &std::sync::RwLock) { - use utils::completion; - let mut join_set = JoinSet::new(); // Atomically, 1. create the shutdown tasks and 2. prevent creation of new tenants. 
@@ -691,7 +734,7 @@ async fn shutdown_all_tenants0(tenants: &std::sync::RwLock) { // going to log too many lines debug!("tenant successfully stopped"); } - .instrument(info_span!("shutdown", tenant_id=%tenant_shard_id.tenant_id, shard=%tenant_shard_id.shard_slug())), + .instrument(info_span!("shutdown", tenant_id=%tenant_shard_id.tenant_id, shard_id=%tenant_shard_id.shard_slug())), ); total_attached += 1; @@ -801,7 +844,7 @@ pub(crate) async fn set_new_tenant_config( info!("configuring tenant {tenant_id}"); let tenant = get_tenant(tenant_shard_id, true)?; - if tenant.tenant_shard_id().shard_count > ShardCount(0) { + if !tenant.tenant_shard_id().shard_count.is_unsharded() { // Note that we use ShardParameters::default below. return Err(SetNewTenantConfigError::Other(anyhow::anyhow!( "This API may only be used on single-sharded tenants, use the /location_config API for sharded tenants" @@ -812,7 +855,7 @@ pub(crate) async fn set_new_tenant_config( // API to use is the location_config/ endpoint, which lets the caller provide // the full LocationConf. let location_conf = LocationConf::attached_single( - new_tenant_conf, + new_tenant_conf.clone(), tenant.generation, &ShardParameters::default(), ); @@ -898,6 +941,17 @@ impl TenantManager { } } + /// Whether the `TenantManager` is responsible for the tenant shard + pub(crate) fn manages_tenant_shard(&self, tenant_shard_id: TenantShardId) -> bool { + let locked = self.tenants.read().unwrap(); + + let peek_slot = tenant_map_peek_slot(&locked, &tenant_shard_id, TenantSlotPeekMode::Read) + .ok() + .flatten(); + + peek_slot.is_some() + } + #[instrument(skip_all, fields(tenant_id=%tenant_shard_id.tenant_id, shard_id=%tenant_shard_id.shard_slug()))] pub(crate) async fn upsert_location( &self, @@ -1054,9 +1108,9 @@ impl TenantManager { // Edge case: if we were called with SpawnMode::Create, but a Tenant already existed, then // the caller thinks they're creating but the tenant already existed. 
We must switch to - // Normal mode so that when starting this Tenant we properly probe remote storage for timelines, + // Eager mode so that when starting this Tenant we properly probe remote storage for timelines, // rather than assuming it to be empty. - spawn_mode = SpawnMode::Normal; + spawn_mode = SpawnMode::Eager; } Some(TenantSlot::Secondary(state)) => { info!("Shutting down secondary tenant"); @@ -1196,7 +1250,7 @@ impl TenantManager { &self, tenant_shard_id: TenantShardId, drop_cache: bool, - ctx: RequestContext, + ctx: &RequestContext, ) -> anyhow::Result<()> { let mut slot_guard = tenant_map_acquire_slot(&tenant_shard_id, TenantSlotAcquireMode::Any)?; let Some(old_slot) = slot_guard.get_old_value() else { @@ -1248,8 +1302,8 @@ impl TenantManager { shard_identity, None, self.tenants, - SpawnMode::Normal, - &ctx, + SpawnMode::Eager, + ctx, )?; slot_guard.upsert(TenantSlot::Attached(tenant))?; @@ -1306,11 +1360,22 @@ impl TenantManager { } } + pub(crate) fn get(&self, tenant_shard_id: TenantShardId) -> Option { + let locked = self.tenants.read().unwrap(); + match &*locked { + TenantsMap::Initializing => None, + TenantsMap::Open(map) | TenantsMap::ShuttingDown(map) => { + map.get(&tenant_shard_id).cloned() + } + } + } + pub(crate) async fn delete_tenant( &self, tenant_shard_id: TenantShardId, activation_timeout: Duration, ) -> Result<(), DeleteTenantError> { + super::span::debug_assert_current_span_has_tenant_id(); // We acquire a SlotGuard during this function to protect against concurrent // changes while the ::prepare phase of DeleteTenantFlow executes, but then // have to return the Tenant to the map while the background deletion runs. 
@@ -1370,6 +1435,334 @@ impl TenantManager { slot_guard.revert(); result } + + #[instrument(skip_all, fields(tenant_id=%tenant_shard_id.tenant_id, shard_id=%tenant_shard_id.shard_slug(), new_shard_count=%new_shard_count.literal()))] + pub(crate) async fn shard_split( + &self, + tenant_shard_id: TenantShardId, + new_shard_count: ShardCount, + new_stripe_size: Option, + ctx: &RequestContext, + ) -> anyhow::Result> { + let tenant = get_tenant(tenant_shard_id, true)?; + + // Validate the incoming request + if new_shard_count.count() <= tenant_shard_id.shard_count.count() { + anyhow::bail!("Requested shard count is not an increase"); + } + let expansion_factor = new_shard_count.count() / tenant_shard_id.shard_count.count(); + if !expansion_factor.is_power_of_two() { + anyhow::bail!("Requested split is not a power of two"); + } + + if let Some(new_stripe_size) = new_stripe_size { + if tenant.get_shard_stripe_size() != new_stripe_size + && tenant_shard_id.shard_count.count() > 1 + { + // This tenant already has multiple shards, it is illegal to try and change its stripe size + anyhow::bail!( + "Shard stripe size may not be modified once tenant has multiple shards" + ); + } + } + + // Plan: identify what the new child shards will be + let child_shards = tenant_shard_id.split(new_shard_count); + tracing::info!( + "Shard {} splits into: {}", + tenant_shard_id.to_index(), + child_shards + .iter() + .map(|id| format!("{}", id.to_index())) + .join(",") + ); + + let parent_shard_identity = tenant.shard_identity; + let parent_tenant_conf = tenant.get_tenant_conf(); + let parent_generation = tenant.generation; + + // Phase 1: Write out child shards' remote index files, in the parent tenant's current generation + if let Err(e) = tenant.split_prepare(&child_shards).await { + // If [`Tenant::split_prepare`] fails, we must reload the tenant, because it might + // have been left in a partially-shut-down state. 
+ tracing::warn!("Failed to prepare for split: {e}, reloading Tenant before returning"); + self.reset_tenant(tenant_shard_id, false, ctx).await?; + return Err(e); + } + + self.resources.deletion_queue_client.flush_advisory(); + + // Phase 2: Put the parent shard to InProgress and grab a reference to the parent Tenant + drop(tenant); + let mut parent_slot_guard = + tenant_map_acquire_slot(&tenant_shard_id, TenantSlotAcquireMode::Any)?; + let parent = match parent_slot_guard.get_old_value() { + Some(TenantSlot::Attached(t)) => t, + Some(TenantSlot::Secondary(_)) => anyhow::bail!("Tenant location in secondary mode"), + Some(TenantSlot::InProgress(_)) => { + // tenant_map_acquire_slot never returns InProgress, if a slot was InProgress + // it would return an error. + unreachable!() + } + None => { + // We don't actually need the parent shard to still be attached to do our work, but it's + // a weird enough situation that the caller probably didn't want us to continue working + // if they had detached the tenant they requested the split on. + anyhow::bail!("Detached parent shard in the middle of split!") + } + }; + + // Optimization: hardlink layers from the parent into the children, so that they don't have to + // re-download & duplicate the data referenced in their initial IndexPart + self.shard_split_hardlink(parent, child_shards.clone()) + .await?; + + // Take a snapshot of where the parent's WAL ingest had got to: we will wait for + // child shards to reach this point. + let mut target_lsns = HashMap::new(); + for timeline in parent.timelines.lock().unwrap().clone().values() { + target_lsns.insert(timeline.timeline_id, timeline.get_last_record_lsn()); + } + + // TODO: we should have the parent shard stop its WAL ingest here, it's a waste of resources + // and could slow down the children trying to catch up. 
+ + // Phase 3: Spawn the child shards + for child_shard in &child_shards { + let mut child_shard_identity = parent_shard_identity; + if let Some(new_stripe_size) = new_stripe_size { + child_shard_identity.stripe_size = new_stripe_size; + } + child_shard_identity.count = child_shard.shard_count; + child_shard_identity.number = child_shard.shard_number; + + let child_location_conf = LocationConf { + mode: LocationMode::Attached(AttachedLocationConfig { + generation: parent_generation, + attach_mode: AttachmentMode::Single, + }), + shard: child_shard_identity, + tenant_conf: parent_tenant_conf.clone(), + }; + + self.upsert_location( + *child_shard, + child_location_conf, + None, + SpawnMode::Eager, + ctx, + ) + .await?; + } + + // Phase 4: wait for child chards WAL ingest to catch up to target LSN + for child_shard_id in &child_shards { + let child_shard_id = *child_shard_id; + let child_shard = { + let locked = TENANTS.read().unwrap(); + let peek_slot = + tenant_map_peek_slot(&locked, &child_shard_id, TenantSlotPeekMode::Read)?; + peek_slot.and_then(|s| s.get_attached()).cloned() + }; + if let Some(t) = child_shard { + // Wait for the child shard to become active: this should be very quick because it only + // has to download the index_part that we just uploaded when creating it. + if let Err(e) = t.wait_to_become_active(ACTIVE_TENANT_TIMEOUT).await { + // This is not fatal: we have durably created the child shard. It just makes the + // split operation less seamless for clients, as we will may detach the parent + // shard before the child shards are fully ready to serve requests. 
+ tracing::warn!("Failed to wait for shard {child_shard_id} to activate: {e}"); + continue; + } + + let timelines = t.timelines.lock().unwrap().clone(); + for timeline in timelines.values() { + let Some(target_lsn) = target_lsns.get(&timeline.timeline_id) else { + continue; + }; + + tracing::info!( + "Waiting for child shard {}/{} to reach target lsn {}...", + child_shard_id, + timeline.timeline_id, + target_lsn + ); + if let Err(e) = timeline.wait_lsn(*target_lsn, ctx).await { + // Failure here might mean shutdown, in any case this part is an optimization + // and we shouldn't hold up the split operation. + tracing::warn!( + "Failed to wait for timeline {} to reach lsn {target_lsn}: {e}", + timeline.timeline_id + ); + } else { + tracing::info!( + "Child shard {}/{} reached target lsn {}", + child_shard_id, + timeline.timeline_id, + target_lsn + ); + } + } + } + } + + // Phase 5: Shut down the parent shard, and erase it from disk + let (_guard, progress) = completion::channel(); + match parent.shutdown(progress, false).await { + Ok(()) => {} + Err(other) => { + other.wait().await; + } + } + let local_tenant_directory = self.conf.tenant_path(&tenant_shard_id); + let tmp_path = safe_rename_tenant_dir(&local_tenant_directory) + .await + .with_context(|| format!("local tenant directory {local_tenant_directory:?} rename"))?; + task_mgr::spawn( + task_mgr::BACKGROUND_RUNTIME.handle(), + TaskKind::MgmtRequest, + None, + None, + "tenant_files_delete", + false, + async move { + fs::remove_dir_all(tmp_path.as_path()) + .await + .with_context(|| format!("tenant directory {:?} deletion", tmp_path)) + }, + ); + + parent_slot_guard.drop_old_value()?; + + // Phase 6: Release the InProgress on the parent shard + drop(parent_slot_guard); + + Ok(child_shards) + } + + /// Part of [`Self::shard_split`]: hard link parent shard layers into child shards, as an optimization + /// to avoid the children downloading them again. 
+ /// + /// For each resident layer in the parent shard, we will hard link it into all of the child shards. + async fn shard_split_hardlink( + &self, + parent_shard: &Tenant, + child_shards: Vec, + ) -> anyhow::Result<()> { + debug_assert_current_span_has_tenant_id(); + + let parent_path = self.conf.tenant_path(parent_shard.get_tenant_shard_id()); + let (parent_timelines, parent_layers) = { + let mut parent_layers = Vec::new(); + let timelines = parent_shard.timelines.lock().unwrap().clone(); + let parent_timelines = timelines.keys().cloned().collect::>(); + for timeline in timelines.values() { + let timeline_layers = timeline + .layers + .read() + .await + .resident_layers() + .collect::>() + .await; + for layer in timeline_layers { + let relative_path = layer + .local_path() + .strip_prefix(&parent_path) + .context("Removing prefix from parent layer path")?; + parent_layers.push(relative_path.to_owned()); + } + } + debug_assert!( + !parent_layers.is_empty(), + "shutdown cannot empty the layermap" + ); + (parent_timelines, parent_layers) + }; + + let mut child_prefixes = Vec::new(); + let mut create_dirs = Vec::new(); + + for child in child_shards { + let child_prefix = self.conf.tenant_path(&child); + create_dirs.push(child_prefix.clone()); + create_dirs.extend( + parent_timelines + .iter() + .map(|t| self.conf.timeline_path(&child, t)), + ); + + child_prefixes.push(child_prefix); + } + + // Since we will do a large number of small filesystem metadata operations, batch them into + // spawn_blocking calls rather than doing each one as a tokio::fs round-trip. 
+ let jh = tokio::task::spawn_blocking(move || -> anyhow::Result { + for dir in &create_dirs { + if let Err(e) = std::fs::create_dir_all(dir) { + // Ignore AlreadyExists errors, drop out on all other errors + match e.kind() { + std::io::ErrorKind::AlreadyExists => {} + _ => { + return Err(anyhow::anyhow!(e).context(format!("Creating {dir}"))); + } + } + } + } + + for child_prefix in child_prefixes { + for relative_layer in &parent_layers { + let parent_path = parent_path.join(relative_layer); + let child_path = child_prefix.join(relative_layer); + if let Err(e) = std::fs::hard_link(&parent_path, &child_path) { + match e.kind() { + std::io::ErrorKind::AlreadyExists => {} + std::io::ErrorKind::NotFound => { + tracing::info!( + "Layer {} not found during hard-linking, evicted during split?", + relative_layer + ); + } + _ => { + return Err(anyhow::anyhow!(e).context(format!( + "Hard linking {relative_layer} into {child_prefix}" + ))) + } + } + } + } + } + + // Durability is not required for correctness, but if we crashed during split and + // then came restarted with empty timeline dirs, it would be very inefficient to + // re-populate from remote storage. + for dir in create_dirs { + if let Err(e) = crashsafe::fsync(&dir) { + // Something removed a newly created timeline dir out from underneath us? Extremely + // unexpected, but not worth panic'ing over as this whole function is just an + // optimization. + tracing::warn!("Failed to fsync directory {dir}: {e}") + } + } + + Ok(parent_layers.len()) + }); + + match jh.await { + Ok(Ok(layer_count)) => { + tracing::info!(count = layer_count, "Hard linked layers into child shards"); + } + Ok(Err(e)) => { + // This is an optimization, so we tolerate failure. + tracing::warn!("Error hard-linking layers, proceeding anyway: {e}") + } + Err(e) => { + // This is something totally unexpected like a panic, so bail out. 
+ anyhow::bail!("Error joining hard linking task: {e}"); + } + } + + Ok(()) + } } #[derive(Debug, thiserror::Error)] @@ -1699,7 +2092,7 @@ pub(crate) async fn load_tenant( shard_identity, None, &TENANTS, - SpawnMode::Normal, + SpawnMode::Eager, ctx, ) .with_context(|| format!("Failed to schedule tenant processing in path {tenant_path:?}"))?; @@ -1715,6 +2108,7 @@ pub(crate) async fn ignore_tenant( ignore_tenant0(conf, &TENANTS, tenant_id).await } +#[instrument(skip_all, fields(shard_id))] async fn ignore_tenant0( conf: &'static PageServerConf, tenants: &std::sync::RwLock, @@ -1722,6 +2116,10 @@ async fn ignore_tenant0( ) -> Result<(), TenantStateError> { // This is a legacy API (replaced by `/location_conf`). It does not support sharding let tenant_shard_id = TenantShardId::unsharded(tenant_id); + tracing::Span::current().record( + "shard_id", + tracing::field::display(tenant_shard_id.shard_slug()), + ); remove_tenant_from_memory(tenants, tenant_shard_id, async { let ignore_mark_file = conf.tenant_ignore_mark_file_path(&tenant_shard_id); @@ -2117,7 +2515,7 @@ fn tenant_map_acquire_slot_impl( METRICS.tenant_slot_writes.inc(); let mut locked = tenants.write().unwrap(); - let span = tracing::info_span!("acquire_slot", tenant_id=%tenant_shard_id.tenant_id, shard = %tenant_shard_id.shard_slug()); + let span = tracing::info_span!("acquire_slot", tenant_id=%tenant_shard_id.tenant_id, shard_id = %tenant_shard_id.shard_slug()); let _guard = span.enter(); let m = match &mut *locked { @@ -2199,8 +2597,6 @@ async fn remove_tenant_from_memory( where F: std::future::Future>, { - use utils::completion; - let mut slot_guard = tenant_map_acquire_slot_impl(&tenant_shard_id, tenants, TenantSlotAcquireMode::MustExist)?; @@ -2280,7 +2676,7 @@ pub(crate) async fn immediate_gc( let tenant = guard .get(&tenant_shard_id) - .map(Arc::clone) + .cloned() .with_context(|| format!("tenant {tenant_shard_id}")) .map_err(|e| ApiError::NotFound(e.into()))?; @@ -2353,7 +2749,7 @@ pub(crate) async fn 
immediate_gc( mod tests { use std::collections::BTreeMap; use std::sync::Arc; - use tracing::{info_span, Instrument}; + use tracing::Instrument; use crate::tenant::mgr::TenantSlot; @@ -2364,17 +2760,16 @@ mod tests { // Test that if an InProgress tenant is in the map during shutdown, the shutdown will gracefully // wait for it to complete before proceeding. - let (t, _ctx) = TenantHarness::create("shutdown_awaits_in_progress_tenant") - .unwrap() - .load() - .await; + let h = TenantHarness::create("shutdown_awaits_in_progress_tenant").unwrap(); + let (t, _ctx) = h.load().await; // harness loads it to active, which is forced and nothing is running on the tenant let id = t.tenant_shard_id(); // tenant harness configures the logging and we cannot escape it - let _e = info_span!("testing", tenant_id = %id).entered(); + let span = h.span(); + let _e = span.enter(); let tenants = BTreeMap::from([(id, TenantSlot::Attached(t.clone()))]); let tenants = Arc::new(std::sync::RwLock::new(TenantsMap::Open(tenants))); @@ -2395,7 +2790,7 @@ mod tests { }; super::remove_tenant_from_memory(&tenants, id, cleanup).await } - .instrument(info_span!("foobar", tenant_id = %id)) + .instrument(h.span()) }); // now the long cleanup should be in place, with the stopping state diff --git a/pageserver/src/tenant/par_fsync.rs b/pageserver/src/tenant/par_fsync.rs deleted file mode 100644 index 3acb0fb431..0000000000 --- a/pageserver/src/tenant/par_fsync.rs +++ /dev/null @@ -1,84 +0,0 @@ -use std::{ - io, - sync::atomic::{AtomicUsize, Ordering}, -}; - -use camino::{Utf8Path, Utf8PathBuf}; - -fn fsync_path(path: &Utf8Path) -> io::Result<()> { - // TODO use VirtualFile::fsync_all once we fully go async. 
- let file = std::fs::File::open(path)?; - file.sync_all() -} - -fn parallel_worker(paths: &[Utf8PathBuf], next_path_idx: &AtomicUsize) -> io::Result<()> { - while let Some(path) = paths.get(next_path_idx.fetch_add(1, Ordering::Relaxed)) { - fsync_path(path)?; - } - - Ok(()) -} - -fn fsync_in_thread_pool(paths: &[Utf8PathBuf]) -> io::Result<()> { - // TODO: remove this function in favor of `par_fsync_async` once we asyncify everything. - - /// Use at most this number of threads. - /// Increasing this limit will - /// - use more memory - /// - increase the cost of spawn/join latency - const MAX_NUM_THREADS: usize = 64; - let num_threads = paths.len().min(MAX_NUM_THREADS); - let next_path_idx = AtomicUsize::new(0); - - std::thread::scope(|s| -> io::Result<()> { - let mut handles = vec![]; - // Spawn `num_threads - 1`, as the current thread is also a worker. - for _ in 1..num_threads { - handles.push(s.spawn(|| parallel_worker(paths, &next_path_idx))); - } - - parallel_worker(paths, &next_path_idx)?; - - for handle in handles { - handle.join().unwrap()?; - } - - Ok(()) - }) -} - -/// Parallel fsync all files. Can be used in non-async context as it is using rayon thread pool. -pub fn par_fsync(paths: &[Utf8PathBuf]) -> io::Result<()> { - if paths.len() == 1 { - fsync_path(&paths[0])?; - return Ok(()); - } - - fsync_in_thread_pool(paths) -} - -/// Parallel fsync asynchronously. 
-pub async fn par_fsync_async(paths: &[Utf8PathBuf]) -> io::Result<()> { - const MAX_CONCURRENT_FSYNC: usize = 64; - let mut next = paths.iter().peekable(); - let mut js = tokio::task::JoinSet::new(); - loop { - while js.len() < MAX_CONCURRENT_FSYNC && next.peek().is_some() { - let next = next.next().expect("just peeked"); - let next = next.to_owned(); - js.spawn_blocking(move || fsync_path(&next)); - } - - // now the joinset has been filled up, wait for next to complete - if let Some(res) = js.join_next().await { - res??; - } else { - // last item had already completed - assert!( - next.peek().is_none(), - "joinset emptied, we shouldn't have more work" - ); - return Ok(()); - } - } -} diff --git a/pageserver/src/tenant/remote_timeline_client.rs b/pageserver/src/tenant/remote_timeline_client.rs index 80ff5c9a2d..40be2ca8f3 100644 --- a/pageserver/src/tenant/remote_timeline_client.rs +++ b/pageserver/src/tenant/remote_timeline_client.rs @@ -196,14 +196,12 @@ pub(crate) use upload::upload_initdb_dir; use utils::backoff::{ self, exponential_backoff, DEFAULT_BASE_BACKOFF_SECONDS, DEFAULT_MAX_BACKOFF_SECONDS, }; -use utils::timeout::{timeout_cancellable, TimeoutCancellableError}; use std::collections::{HashMap, VecDeque}; use std::sync::atomic::{AtomicU32, Ordering}; use std::sync::{Arc, Mutex}; -use std::time::Duration; -use remote_storage::{DownloadError, GenericRemoteStorage, RemotePath}; +use remote_storage::{DownloadError, GenericRemoteStorage, RemotePath, TimeoutOrCancel}; use std::ops::DerefMut; use tracing::{debug, error, info, instrument, warn}; use tracing::{info_span, Instrument}; @@ -217,6 +215,7 @@ use crate::metrics::{ }; use crate::task_mgr::shutdown_token; use crate::tenant::debug_assert_current_span_has_tenant_and_timeline_id; +use crate::tenant::remote_timeline_client::download::download_retry; use crate::tenant::storage_layer::AsLayerDesc; use crate::tenant::upload_queue::Delete; use crate::tenant::TIMELINES_SEGMENT_NAME; @@ -325,45 +324,6 @@ pub 
struct RemoteTimelineClient { cancel: CancellationToken, } -/// This timeout is intended to deal with hangs in lower layers, e.g. stuck TCP flows. It is not -/// intended to be snappy enough for prompt shutdown, as we have a CancellationToken for that. -const UPLOAD_TIMEOUT: Duration = Duration::from_secs(120); -const DOWNLOAD_TIMEOUT: Duration = Duration::from_secs(120); - -/// Wrapper for timeout_cancellable that flattens result and converts TimeoutCancellableError to anyhow. -/// -/// This is a convenience for the various upload functions. In future -/// the anyhow::Error result should be replaced with a more structured type that -/// enables callers to avoid handling shutdown as an error. -async fn upload_cancellable(cancel: &CancellationToken, future: F) -> anyhow::Result<()> -where - F: std::future::Future>, -{ - match timeout_cancellable(UPLOAD_TIMEOUT, cancel, future).await { - Ok(Ok(())) => Ok(()), - Ok(Err(e)) => Err(e), - Err(TimeoutCancellableError::Timeout) => Err(anyhow::anyhow!("Timeout")), - Err(TimeoutCancellableError::Cancelled) => Err(anyhow::anyhow!("Shutting down")), - } -} -/// Wrapper for timeout_cancellable that flattens result and converts TimeoutCancellableError to DownloaDError. 
-async fn download_cancellable( - cancel: &CancellationToken, - future: F, -) -> Result -where - F: std::future::Future>, -{ - match timeout_cancellable(DOWNLOAD_TIMEOUT, cancel, future).await { - Ok(Ok(r)) => Ok(r), - Ok(Err(e)) => Err(e), - Err(TimeoutCancellableError::Timeout) => { - Err(DownloadError::Other(anyhow::anyhow!("Timed out"))) - } - Err(TimeoutCancellableError::Cancelled) => Err(DownloadError::Cancelled), - } -} - impl RemoteTimelineClient { /// /// Create a remote storage client for given timeline @@ -506,7 +466,7 @@ impl RemoteTimelineClient { /// Download index file pub async fn download_index_file( &self, - cancel: CancellationToken, + cancel: &CancellationToken, ) -> Result { let _unfinished_gauge_guard = self.metrics.call_begin( &RemoteOpFileKind::Index, @@ -654,7 +614,7 @@ impl RemoteTimelineClient { metadata, ); let op = UploadOp::UploadMetadata(index_part, disk_consistent_lsn); - self.calls_unfinished_metric_begin(&op); + self.metric_begin(&op); upload_queue.queued_operations.push_back(op); upload_queue.latest_files_changes_since_metadata_upload_scheduled = 0; @@ -694,7 +654,7 @@ impl RemoteTimelineClient { metadata.generation, metadata.shard ); let op = UploadOp::UploadLayer(layer, metadata); - self.calls_unfinished_metric_begin(&op); + self.metric_begin(&op); upload_queue.queued_operations.push_back(op); } @@ -863,10 +823,14 @@ impl RemoteTimelineClient { } // schedule the actual deletions + if with_metadata.is_empty() { + // avoid scheduling the op & bumping the metric + return; + } let op = UploadOp::Delete(Delete { layers: with_metadata, }); - self.calls_unfinished_metric_begin(&op); + self.metric_begin(&op); upload_queue.queued_operations.push_back(op); } @@ -1046,9 +1010,11 @@ impl RemoteTimelineClient { // when executed as part of tenant deletion this happens in the background 2, "persist_index_part_with_deleted_flag", - backoff::Cancel::new(self.cancel.clone(), || anyhow::anyhow!("Cancelled")), + &self.cancel, ) - .await?; + .await 
+ .ok_or_else(|| anyhow::Error::new(TimeoutOrCancel::Cancel)) + .and_then(|x| x)?; // all good, disarm the guard and mark as success ScopeGuard::into_inner(undo_deleted_at); @@ -1079,13 +1045,15 @@ impl RemoteTimelineClient { upload::preserve_initdb_archive(&self.storage_impl, tenant_id, timeline_id, cancel) .await }, - |_e| false, + TimeoutOrCancel::caused_by_cancel, FAILED_DOWNLOAD_WARN_THRESHOLD, FAILED_REMOTE_OP_RETRIES, "preserve_initdb_tar_zst", - backoff::Cancel::new(cancel.clone(), || anyhow::anyhow!("Cancelled!")), + &cancel.clone(), ) .await + .ok_or_else(|| anyhow::Error::new(TimeoutOrCancel::Cancel)) + .and_then(|x| x) .context("backing up initdb archive")?; Ok(()) } @@ -1141,20 +1109,19 @@ impl RemoteTimelineClient { // taking the burden of listing all the layers that we already know we should delete. self.deletion_queue_client.flush_immediate().await?; - let remaining = backoff::retry( + let cancel = shutdown_token(); + + let remaining = download_retry( || async { self.storage_impl - .list_files(Some(&timeline_storage_path)) + .list_files(Some(&timeline_storage_path), None, &cancel) .await }, - |_e| false, - FAILED_DOWNLOAD_WARN_THRESHOLD, - FAILED_REMOTE_OP_RETRIES, - "list_prefixes", - backoff::Cancel::new(shutdown_token(), || anyhow::anyhow!("Cancelled!")), + "list remaining files", + &cancel, ) .await - .context("list prefixes")?; + .context("list files remaining files")?; // We will delete the current index_part object last, since it acts as a deletion // marker via its deleted_at attribute @@ -1343,6 +1310,7 @@ impl RemoteTimelineClient { /// queue. /// async fn perform_upload_task(self: &Arc, task: Arc) { + let cancel = shutdown_token(); // Loop to retry until it completes. loop { // If we're requested to shut down, close up shop and exit. @@ -1354,7 +1322,7 @@ impl RemoteTimelineClient { // the Future, but we're not 100% sure if the remote storage library // is cancellation safe, so we don't dare to do that. 
Hopefully, the // upload finishes or times out soon enough. - if task_mgr::is_shutdown_requested() { + if cancel.is_cancelled() { info!("upload task cancelled by shutdown request"); match self.stop() { Ok(()) => {} @@ -1440,6 +1408,10 @@ impl RemoteTimelineClient { Ok(()) => { break; } + Err(e) if TimeoutOrCancel::caused_by_cancel(&e) => { + // loop around to do the proper stopping + continue; + } Err(e) => { let retries = task.retries.fetch_add(1, Ordering::SeqCst); @@ -1465,7 +1437,7 @@ impl RemoteTimelineClient { retries, DEFAULT_BASE_BACKOFF_SECONDS, DEFAULT_MAX_BACKOFF_SECONDS, - &shutdown_token(), + &cancel, ) .await; } @@ -1548,10 +1520,10 @@ impl RemoteTimelineClient { .await; } - self.calls_unfinished_metric_end(&task.op); + self.metric_end(&task.op); } - fn calls_unfinished_metric_impl( + fn metric_impl( &self, op: &UploadOp, ) -> Option<( @@ -1588,17 +1560,17 @@ impl RemoteTimelineClient { Some(res) } - fn calls_unfinished_metric_begin(&self, op: &UploadOp) { - let (file_kind, op_kind, track_bytes) = match self.calls_unfinished_metric_impl(op) { + fn metric_begin(&self, op: &UploadOp) { + let (file_kind, op_kind, track_bytes) = match self.metric_impl(op) { Some(x) => x, None => return, }; let guard = self.metrics.call_begin(&file_kind, &op_kind, track_bytes); - guard.will_decrement_manually(); // in unfinished_ops_metric_end() + guard.will_decrement_manually(); // in metric_end(), see right below } - fn calls_unfinished_metric_end(&self, op: &UploadOp) { - let (file_kind, op_kind, track_bytes) = match self.calls_unfinished_metric_impl(op) { + fn metric_end(&self, op: &UploadOp) { + let (file_kind, op_kind, track_bytes) = match self.metric_impl(op) { Some(x) => x, None => return, }; @@ -1683,7 +1655,7 @@ impl RemoteTimelineClient { // Tear down queued ops for op in qi.queued_operations.into_iter() { - self.calls_unfinished_metric_end(&op); + self.metric_end(&op); // Dropping UploadOp::Barrier() here will make wait_completion() return with an Err() // 
which is exactly what we want to happen. drop(op); @@ -1695,23 +1667,6 @@ impl RemoteTimelineClient { } } } - - pub(crate) fn get_layers_metadata( - &self, - layers: Vec, - ) -> anyhow::Result>> { - let q = self.upload_queue.lock().unwrap(); - let q = match &*q { - UploadQueue::Stopped(_) | UploadQueue::Uninitialized => { - anyhow::bail!("queue is in state {}", q.as_str()) - } - UploadQueue::Initialized(inner) => inner, - }; - - let decorated = layers.into_iter().map(|l| q.latest_files.get(&l).cloned()); - - Ok(decorated.collect()) - } } pub fn remote_timelines_path(tenant_shard_id: &TenantShardId) -> RemotePath { @@ -1719,6 +1674,11 @@ pub fn remote_timelines_path(tenant_shard_id: &TenantShardId) -> RemotePath { RemotePath::from_string(&path).expect("Failed to construct path") } +fn remote_timelines_path_unsharded(tenant_id: &TenantId) -> RemotePath { + let path = format!("tenants/{tenant_id}/{TIMELINES_SEGMENT_NAME}"); + RemotePath::from_string(&path).expect("Failed to construct path") +} + pub fn remote_timeline_path( tenant_shard_id: &TenantShardId, timeline_id: &TimelineId, @@ -1831,14 +1791,12 @@ mod tests { context::RequestContext, tenant::{ harness::{TenantHarness, TIMELINE_ID}, - storage_layer::Layer, - Generation, Tenant, Timeline, + Tenant, Timeline, }, DEFAULT_PG_VERSION, }; use std::collections::HashSet; - use utils::lsn::Lsn; pub(super) fn dummy_contents(name: &str) -> Vec { format!("contents for {name}").into() @@ -1939,6 +1897,7 @@ mod tests { tracing::info_span!( "test", tenant_id = %self.harness.tenant_shard_id.tenant_id, + shard_id = %self.harness.tenant_shard_id.shard_slug(), timeline_id = %TIMELINE_ID ) } @@ -1976,7 +1935,7 @@ mod tests { // Download back the index.json, and check that the list of files is correct let initial_index_part = match client - .download_index_file(CancellationToken::new()) + .download_index_file(&CancellationToken::new()) .await .unwrap() { @@ -2070,7 +2029,7 @@ mod tests { // Download back the index.json, and check 
that the list of files is correct let index_part = match client - .download_index_file(CancellationToken::new()) + .download_index_file(&CancellationToken::new()) .await .unwrap() { @@ -2272,7 +2231,7 @@ mod tests { let client = test_state.build_client(get_generation); let download_r = client - .download_index_file(CancellationToken::new()) + .download_index_file(&CancellationToken::new()) .await .expect("download should always succeed"); assert!(matches!(download_r, MaybeDeletedIndexPart::IndexPart(_))); diff --git a/pageserver/src/tenant/remote_timeline_client/download.rs b/pageserver/src/tenant/remote_timeline_client/download.rs index 4309c683e2..6fff6e78e2 100644 --- a/pageserver/src/tenant/remote_timeline_client/download.rs +++ b/pageserver/src/tenant/remote_timeline_client/download.rs @@ -11,19 +11,17 @@ use camino::{Utf8Path, Utf8PathBuf}; use pageserver_api::shard::TenantShardId; use tokio::fs::{self, File, OpenOptions}; use tokio::io::{AsyncSeekExt, AsyncWriteExt}; +use tokio_util::io::StreamReader; use tokio_util::sync::CancellationToken; use tracing::warn; -use utils::timeout::timeout_cancellable; -use utils::{backoff, crashsafe}; +use utils::backoff; use crate::config::PageServerConf; -use crate::tenant::remote_timeline_client::{ - download_cancellable, remote_layer_path, remote_timelines_path, DOWNLOAD_TIMEOUT, -}; +use crate::span::debug_assert_current_span_has_tenant_and_timeline_id; +use crate::tenant::remote_timeline_client::{remote_layer_path, remote_timelines_path}; use crate::tenant::storage_layer::LayerFileName; -use crate::tenant::timeline::span::debug_assert_current_span_has_tenant_and_timeline_id; use crate::tenant::Generation; -use crate::virtual_file::on_fatal_io_error; +use crate::virtual_file::{on_fatal_io_error, MaybeFatalIo, VirtualFile}; use crate::TEMP_FILE_SUFFIX; use remote_storage::{DownloadError, GenericRemoteStorage, ListingMode}; use utils::crashsafe::path_with_suffix_extension; @@ -52,9 +50,8 @@ pub async fn 
download_layer_file<'a>( ) -> Result { debug_assert_current_span_has_tenant_and_timeline_id(); - let local_path = conf - .timeline_path(&tenant_shard_id, &timeline_id) - .join(layer_file_name.file_name()); + let timeline_path = conf.timeline_path(&tenant_shard_id, &timeline_id); + let local_path = timeline_path.join(layer_file_name.file_name()); let remote_path = remote_layer_path( &tenant_shard_id.tenant_id, @@ -76,7 +73,6 @@ pub async fn download_layer_file<'a>( // If pageserver crashes the temp file will be deleted on startup and re-downloaded. let temp_file_path = path_with_suffix_extension(&local_path, TEMP_DOWNLOAD_EXTENSION); - let cancel_inner = cancel.clone(); let (mut destination_file, bytes_amount) = download_retry( || async { let destination_file = tokio::fs::File::create(&temp_file_path) @@ -84,60 +80,28 @@ pub async fn download_layer_file<'a>( .with_context(|| format!("create a destination file for layer '{temp_file_path}'")) .map_err(DownloadError::Other)?; - // Cancellation safety: it is safe to cancel this future, because it isn't writing to a local - // file: the write to local file doesn't start until after the request header is returned - // and we start draining the body stream below - let download = download_cancellable(&cancel_inner, storage.download(&remote_path)) - .await - .with_context(|| { - format!( - "open a download stream for layer with remote storage path '{remote_path:?}'" - ) - }) - .map_err(DownloadError::Other)?; + let download = storage.download(&remote_path, cancel).await?; let mut destination_file = tokio::io::BufWriter::with_capacity(super::BUFFER_SIZE, destination_file); let mut reader = tokio_util::io::StreamReader::new(download.download_stream); - // Cancellation safety: it is safe to cancel this future because it is writing into a temporary file, - // and we will unlink the temporary file if there is an error. 
This unlink is important because we - // are in a retry loop, and we wouldn't want to leave behind a rogue write I/O to a file that - // we will imminiently try and write to again. - let bytes_amount: u64 = match timeout_cancellable( - DOWNLOAD_TIMEOUT, - &cancel_inner, - tokio::io::copy_buf(&mut reader, &mut destination_file), - ) - .await - .with_context(|| { - format!( - "download layer at remote path '{remote_path:?}' into file {temp_file_path:?}" - ) - }) - .map_err(DownloadError::Other)? - { - Ok(b) => Ok(b), + let bytes_amount = tokio::io::copy_buf(&mut reader, &mut destination_file).await; + + match bytes_amount { + Ok(bytes_amount) => { + let destination_file = destination_file.into_inner(); + Ok((destination_file, bytes_amount)) + } Err(e) => { - // Remove incomplete files: on restart Timeline would do this anyway, but we must - // do it here for the retry case. if let Err(e) = tokio::fs::remove_file(&temp_file_path).await { on_fatal_io_error(&e, &format!("Removing temporary file {temp_file_path}")); } - Err(e) + + Err(e.into()) } } - .with_context(|| { - format!( - "download layer at remote path '{remote_path:?}' into file {temp_file_path:?}" - ) - }) - .map_err(DownloadError::Other)?; - - let destination_file = destination_file.into_inner(); - - Ok((destination_file, bytes_amount)) }, &format!("download {remote_path:?}"), cancel, @@ -184,10 +148,21 @@ pub async fn download_layer_file<'a>( .with_context(|| format!("rename download layer file to {local_path}")) .map_err(DownloadError::Other)?; - crashsafe::fsync_async(&local_path) - .await - .with_context(|| format!("fsync layer file {local_path}")) - .map_err(DownloadError::Other)?; + // We use fatal_err() below because the after the rename above, + // the in-memory state of the filesystem already has the layer file in its final place, + // and subsequent pageserver code could think it's durable while it really isn't. 
+ let work = async move { + let timeline_dir = VirtualFile::open(&timeline_path) + .await + .fatal_err("VirtualFile::open for timeline dir fsync"); + timeline_dir + .sync_all() + .await + .fatal_err("VirtualFile::sync_all timeline dir"); + }; + crate::virtual_file::io_engine::get() + .spawn_blocking_and_block_on_if_std(work) + .await; tracing::debug!("download complete: {local_path}"); @@ -196,7 +171,7 @@ pub async fn download_layer_file<'a>( const TEMP_DOWNLOAD_EXTENSION: &str = "temp_download"; -pub fn is_temp_download_file(path: &Utf8Path) -> bool { +pub(crate) fn is_temp_download_file(path: &Utf8Path) -> bool { let extension = path.extension(); match extension { Some(TEMP_DOWNLOAD_EXTENSION) => true, @@ -217,16 +192,17 @@ pub async fn list_remote_timelines( anyhow::bail!("storage-sync-list-remote-timelines"); }); - let cancel_inner = cancel.clone(); let listing = download_retry_forever( || { - download_cancellable( - &cancel_inner, - storage.list(Some(&remote_path), ListingMode::WithDelimiter), + storage.list( + Some(&remote_path), + ListingMode::WithDelimiter, + None, + &cancel, ) }, &format!("list timelines for {tenant_shard_id}"), - cancel, + &cancel, ) .await?; @@ -259,29 +235,22 @@ async fn do_download_index_part( tenant_shard_id: &TenantShardId, timeline_id: &TimelineId, index_generation: Generation, - cancel: CancellationToken, + cancel: &CancellationToken, ) -> Result { - use futures::stream::StreamExt; - let remote_path = remote_index_path(tenant_shard_id, timeline_id, index_generation); - let cancel_inner = cancel.clone(); let index_part_bytes = download_retry_forever( || async { - // Cancellation: if is safe to cancel this future because we're just downloading into - // a memory buffer, not touching local disk. 
- let index_part_download = - download_cancellable(&cancel_inner, storage.download(&remote_path)).await?; + let download = storage.download(&remote_path, cancel).await?; - let mut index_part_bytes = Vec::new(); - let mut stream = std::pin::pin!(index_part_download.download_stream); - while let Some(chunk) = stream.next().await { - let chunk = chunk - .with_context(|| format!("download index part at {remote_path:?}")) - .map_err(DownloadError::Other)?; - index_part_bytes.extend_from_slice(&chunk[..]); - } - Ok(index_part_bytes) + let mut bytes = Vec::new(); + + let stream = download.download_stream; + let mut stream = StreamReader::new(stream); + + tokio::io::copy_buf(&mut stream, &mut bytes).await?; + + Ok(bytes) }, &format!("download {remote_path:?}"), cancel, @@ -289,7 +258,7 @@ async fn do_download_index_part( .await?; let index_part: IndexPart = serde_json::from_slice(&index_part_bytes) - .with_context(|| format!("download index part file at {remote_path:?}")) + .with_context(|| format!("deserialize index part file at {remote_path:?}")) .map_err(DownloadError::Other)?; Ok(index_part) @@ -306,7 +275,7 @@ pub(super) async fn download_index_part( tenant_shard_id: &TenantShardId, timeline_id: &TimelineId, my_generation: Generation, - cancel: CancellationToken, + cancel: &CancellationToken, ) -> Result { debug_assert_current_span_has_tenant_and_timeline_id(); @@ -326,14 +295,8 @@ pub(super) async fn download_index_part( // index in our generation. // // This is an optimization to avoid doing the listing for the general case below. 
- let res = do_download_index_part( - storage, - tenant_shard_id, - timeline_id, - my_generation, - cancel.clone(), - ) - .await; + let res = + do_download_index_part(storage, tenant_shard_id, timeline_id, my_generation, cancel).await; match res { Ok(index_part) => { tracing::debug!( @@ -358,7 +321,7 @@ pub(super) async fn download_index_part( tenant_shard_id, timeline_id, my_generation.previous(), - cancel.clone(), + cancel, ) .await; match res { @@ -380,16 +343,13 @@ pub(super) async fn download_index_part( // objects, and select the highest one with a generation <= my_generation. Constructing the prefix is equivalent // to constructing a full index path with no generation, because the generation is a suffix. let index_prefix = remote_index_path(tenant_shard_id, timeline_id, Generation::none()); - let indices = backoff::retry( - || async { storage.list_files(Some(&index_prefix)).await }, - |_| false, - FAILED_DOWNLOAD_WARN_THRESHOLD, - FAILED_REMOTE_OP_RETRIES, - "listing index_part files", - backoff::Cancel::new(cancel.clone(), || anyhow::anyhow!("Cancelled")), + + let indices = download_retry( + || async { storage.list_files(Some(&index_prefix), None, cancel).await }, + "list index_part files", + cancel, ) - .await - .map_err(DownloadError::Other)?; + .await?; // General case logic for which index to use: the latest index whose generation // is <= our own. 
See "Finding the remote indices for timelines" in docs/rfcs/025-generation-numbers.md @@ -446,8 +406,6 @@ pub(crate) async fn download_initdb_tar_zst( "{INITDB_PATH}.download-{timeline_id}.{TEMP_FILE_SUFFIX}" )); - let cancel_inner = cancel.clone(); - let file = download_retry( || async { let file = OpenOptions::new() @@ -460,26 +418,17 @@ pub(crate) async fn download_initdb_tar_zst( .with_context(|| format!("tempfile creation {temp_path}")) .map_err(DownloadError::Other)?; - let download = match download_cancellable(&cancel_inner, storage.download(&remote_path)) - .await - { + let download = match storage.download(&remote_path, cancel).await { Ok(dl) => dl, Err(DownloadError::NotFound) => { - download_cancellable(&cancel_inner, storage.download(&remote_preserved_path)) - .await? + storage.download(&remote_preserved_path, cancel).await? } Err(other) => Err(other)?, }; let mut download = tokio_util::io::StreamReader::new(download.download_stream); - let mut writer = tokio::io::BufWriter::with_capacity(8 * 1024, file); + let mut writer = tokio::io::BufWriter::with_capacity(super::BUFFER_SIZE, file); - // TODO: this consumption of the response body should be subject to timeout + cancellation, but - // not without thinking carefully about how to recover safely from cancelling a write to - // local storage (e.g. by writing into a temp file as we do in download_layer) - tokio::io::copy_buf(&mut download, &mut writer) - .await - .with_context(|| format!("download initdb.tar.zst at {remote_path:?}")) - .map_err(DownloadError::Other)?; + tokio::io::copy_buf(&mut download, &mut writer).await?; let mut file = writer.into_inner(); @@ -510,12 +459,12 @@ pub(crate) async fn download_initdb_tar_zst( /// Helper function to handle retries for a download operation. /// -/// Remote operations can fail due to rate limits (IAM, S3), spurious network +/// Remote operations can fail due to rate limits (S3), spurious network /// problems, or other external reasons. 
Retry FAILED_DOWNLOAD_RETRIES times, /// with backoff. /// /// (See similar logic for uploads in `perform_upload_task`) -async fn download_retry( +pub(super) async fn download_retry( op: O, description: &str, cancel: &CancellationToken, @@ -526,19 +475,21 @@ where { backoff::retry( op, - |e| matches!(e, DownloadError::BadInput(_) | DownloadError::NotFound), + DownloadError::is_permanent, FAILED_DOWNLOAD_WARN_THRESHOLD, FAILED_REMOTE_OP_RETRIES, description, - backoff::Cancel::new(cancel.clone(), || DownloadError::Cancelled), + cancel, ) .await + .ok_or_else(|| DownloadError::Cancelled) + .and_then(|x| x) } async fn download_retry_forever( op: O, description: &str, - cancel: CancellationToken, + cancel: &CancellationToken, ) -> Result where O: FnMut() -> F, @@ -546,11 +497,13 @@ where { backoff::retry( op, - |e| matches!(e, DownloadError::BadInput(_) | DownloadError::NotFound), + DownloadError::is_permanent, FAILED_DOWNLOAD_WARN_THRESHOLD, u32::MAX, description, - backoff::Cancel::new(cancel, || DownloadError::Cancelled), + cancel, ) .await + .ok_or_else(|| DownloadError::Cancelled) + .and_then(|x| x) } diff --git a/pageserver/src/tenant/remote_timeline_client/upload.rs b/pageserver/src/tenant/remote_timeline_client/upload.rs index 58d95f75c2..137fe48b73 100644 --- a/pageserver/src/tenant/remote_timeline_client/upload.rs +++ b/pageserver/src/tenant/remote_timeline_client/upload.rs @@ -5,19 +5,21 @@ use camino::Utf8Path; use fail::fail_point; use pageserver_api::shard::TenantShardId; use std::io::{ErrorKind, SeekFrom}; +use std::time::SystemTime; use tokio::fs::{self, File}; use tokio::io::AsyncSeekExt; use tokio_util::sync::CancellationToken; +use utils::backoff; use super::Generation; use crate::{ config::PageServerConf, tenant::remote_timeline_client::{ index::IndexPart, remote_index_path, remote_initdb_archive_path, - remote_initdb_preserved_archive_path, remote_path, upload_cancellable, + remote_initdb_preserved_archive_path, remote_path, }, }; -use 
remote_storage::GenericRemoteStorage; +use remote_storage::{GenericRemoteStorage, TimeTravelError}; use utils::id::{TenantId, TimelineId}; use super::index::LayerFileMetadata; @@ -25,7 +27,7 @@ use super::index::LayerFileMetadata; use tracing::info; /// Serializes and uploads the given index part data to the remote storage. -pub(super) async fn upload_index_part<'a>( +pub(crate) async fn upload_index_part<'a>( storage: &'a GenericRemoteStorage, tenant_shard_id: &TenantShardId, timeline_id: &TimelineId, @@ -47,16 +49,15 @@ pub(super) async fn upload_index_part<'a>( let index_part_bytes = bytes::Bytes::from(index_part_bytes); let remote_path = remote_index_path(tenant_shard_id, timeline_id, generation); - upload_cancellable( - cancel, - storage.upload_storage_object( + storage + .upload_storage_object( futures::stream::once(futures::future::ready(Ok(index_part_bytes))), index_part_size, &remote_path, - ), - ) - .await - .with_context(|| format!("upload index part for '{tenant_shard_id} / {timeline_id}'")) + cancel, + ) + .await + .with_context(|| format!("upload index part for '{tenant_shard_id} / {timeline_id}'")) } /// Attempts to upload given layer files. @@ -113,11 +114,10 @@ pub(super) async fn upload_timeline_layer<'a>( let reader = tokio_util::io::ReaderStream::with_capacity(source_file, super::BUFFER_SIZE); - upload_cancellable(cancel, storage.upload(reader, fs_size, &storage_path, None)) + storage + .upload(reader, fs_size, &storage_path, None, cancel) .await - .with_context(|| format!("upload layer from local path '{source_path}'"))?; - - Ok(()) + .with_context(|| format!("upload layer from local path '{source_path}'")) } /// Uploads the given `initdb` data to the remote storage. 
@@ -137,12 +137,10 @@ pub(crate) async fn upload_initdb_dir( let file = tokio_util::io::ReaderStream::with_capacity(initdb_tar_zst, super::BUFFER_SIZE); let remote_path = remote_initdb_archive_path(tenant_id, timeline_id); - upload_cancellable( - cancel, - storage.upload_storage_object(file, size as usize, &remote_path), - ) - .await - .with_context(|| format!("upload initdb dir for '{tenant_id} / {timeline_id}'")) + storage + .upload_storage_object(file, size as usize, &remote_path, cancel) + .await + .with_context(|| format!("upload initdb dir for '{tenant_id} / {timeline_id}'")) } pub(crate) async fn preserve_initdb_archive( @@ -153,7 +151,52 @@ pub(crate) async fn preserve_initdb_archive( ) -> anyhow::Result<()> { let source_path = remote_initdb_archive_path(tenant_id, timeline_id); let dest_path = remote_initdb_preserved_archive_path(tenant_id, timeline_id); - upload_cancellable(cancel, storage.copy_object(&source_path, &dest_path)) + storage + .copy_object(&source_path, &dest_path, cancel) .await .with_context(|| format!("backing up initdb archive for '{tenant_id} / {timeline_id}'")) } + +pub(crate) async fn time_travel_recover_tenant( + storage: &GenericRemoteStorage, + tenant_shard_id: &TenantShardId, + timestamp: SystemTime, + done_if_after: SystemTime, + cancel: &CancellationToken, +) -> Result<(), TimeTravelError> { + let warn_after = 3; + let max_attempts = 10; + let mut prefixes = Vec::with_capacity(2); + if tenant_shard_id.is_zero() { + // Also recover the unsharded prefix for a shard of zero: + // - if the tenant is totally unsharded, the unsharded prefix contains all the data + // - if the tenant is sharded, we still want to recover the initdb data, but we only + // want to do it once, so let's do it on the 0 shard + let timelines_path_unsharded = + super::remote_timelines_path_unsharded(&tenant_shard_id.tenant_id); + prefixes.push(timelines_path_unsharded); + } + if !tenant_shard_id.is_unsharded() { + // If the tenant is sharded, we need to recover 
the sharded prefix + let timelines_path = super::remote_timelines_path(tenant_shard_id); + prefixes.push(timelines_path); + } + for prefix in &prefixes { + backoff::retry( + || async { + storage + .time_travel_recover(Some(prefix), timestamp, done_if_after, cancel) + .await + }, + |e| !matches!(e, TimeTravelError::Other(_)), + warn_after, + max_attempts, + "time travel recovery of tenant prefix", + cancel, + ) + .await + .ok_or_else(|| TimeTravelError::Cancelled) + .and_then(|x| x)?; + } + Ok(()) +} diff --git a/pageserver/src/tenant/secondary.rs b/pageserver/src/tenant/secondary.rs index d00d901be6..14e88b836e 100644 --- a/pageserver/src/tenant/secondary.rs +++ b/pageserver/src/tenant/secondary.rs @@ -32,7 +32,7 @@ use remote_storage::GenericRemoteStorage; use tokio_util::sync::CancellationToken; use tracing::instrument; -use utils::{completion::Barrier, fs_ext, id::TimelineId, sync::gate::Gate}; +use utils::{completion::Barrier, id::TimelineId, sync::gate::Gate}; enum DownloadCommand { Download(TenantShardId), @@ -112,7 +112,7 @@ impl SecondaryTenant { // on shutdown we walk the tenants and fire their // individual cancellations? 
cancel: CancellationToken::new(), - gate: Gate::new(format!("SecondaryTenant {tenant_shard_id}")), + gate: Gate::default(), shard_identity, tenant_conf: std::sync::Mutex::new(tenant_conf), @@ -121,6 +121,10 @@ impl SecondaryTenant { }) } + pub(crate) fn tenant_shard_id(&self) -> TenantShardId { + self.tenant_shard_id + } + pub(crate) async fn shutdown(&self) { self.cancel.cancel(); @@ -133,7 +137,7 @@ impl SecondaryTenant { } pub(crate) fn set_tenant_conf(&self, config: &TenantConfOpt) { - *(self.tenant_conf.lock().unwrap()) = *config; + *(self.tenant_conf.lock().unwrap()) = config.clone(); } /// For API access: generate a LocationConfig equivalent to the one that would be used to @@ -144,13 +148,13 @@ impl SecondaryTenant { let conf = models::LocationConfigSecondary { warm: conf.warm }; - let tenant_conf = *self.tenant_conf.lock().unwrap(); + let tenant_conf = self.tenant_conf.lock().unwrap().clone(); models::LocationConfig { mode: models::LocationConfigMode::Secondary, generation: None, secondary_conf: Some(conf), shard_number: self.tenant_shard_id.shard_number.0, - shard_count: self.tenant_shard_id.shard_count.0, + shard_count: self.tenant_shard_id.shard_count.literal(), shard_stripe_size: self.shard_identity.stripe_size.0, tenant_conf: tenant_conf.into(), } @@ -160,20 +164,21 @@ impl SecondaryTenant { &self.tenant_shard_id } - pub(crate) fn get_layers_for_eviction(self: &Arc) -> DiskUsageEvictionInfo { + pub(crate) fn get_layers_for_eviction(self: &Arc) -> (DiskUsageEvictionInfo, usize) { self.detail.lock().unwrap().get_layers_for_eviction(self) } + /// Cancellation safe, but on cancellation the eviction will go through #[instrument(skip_all, fields(tenant_id=%self.tenant_shard_id.tenant_id, shard_id=%self.tenant_shard_id.shard_slug(), timeline_id=%timeline_id, name=%name))] pub(crate) async fn evict_layer( - &self, + self: &Arc, conf: &PageServerConf, timeline_id: TimelineId, name: LayerFileName, ) { debug_assert_current_span_has_tenant_id(); - let _guard = 
match self.gate.enter() { + let guard = match self.gate.enter() { Ok(g) => g, Err(_) => { tracing::debug!("Dropping layer evictions, secondary tenant shutting down",); @@ -187,35 +192,57 @@ impl SecondaryTenant { .timeline_path(&self.tenant_shard_id, &timeline_id) .join(name.file_name()); - // We tolerate ENOENT, because between planning eviction and executing - // it, the secondary downloader could have seen an updated heatmap that - // resulted in a layer being deleted. - // Other local I/O errors are process-fatal: these should never happen. - tokio::fs::remove_file(path) - .await - .or_else(fs_ext::ignore_not_found) - .fatal_err("Deleting layer during eviction"); + let this = self.clone(); - // Update the timeline's state. This does not have to be synchronized with - // the download process, because: - // - If downloader is racing with us to remove a file (e.g. because it is - // removed from heatmap), then our mutual .remove() operations will both - // succeed. - // - If downloader is racing with us to download the object (this would require - // multiple eviction iterations to race with multiple download iterations), then - // if we remove it from the state, the worst that happens is the downloader - // downloads it again before re-inserting, or we delete the file but it remains - // in the state map (in which case it will be downloaded if this secondary - // tenant transitions to attached and tries to access it) - // - // The important assumption here is that the secondary timeline state does not - // have to 100% match what is on disk, because it's a best-effort warming - // of the cache. 
- let mut detail = self.detail.lock().unwrap(); - if let Some(timeline_detail) = detail.timelines.get_mut(&timeline_id) { - timeline_detail.on_disk_layers.remove(&name); - timeline_detail.evicted_at.insert(name, now); - } + // spawn it to be cancellation safe + tokio::task::spawn_blocking(move || { + let _guard = guard; + // We tolerate ENOENT, because between planning eviction and executing + // it, the secondary downloader could have seen an updated heatmap that + // resulted in a layer being deleted. + // Other local I/O errors are process-fatal: these should never happen. + let deleted = std::fs::remove_file(path); + + let not_found = deleted + .as_ref() + .is_err_and(|x| x.kind() == std::io::ErrorKind::NotFound); + + let deleted = if not_found { + false + } else { + deleted + .map(|()| true) + .fatal_err("Deleting layer during eviction") + }; + + if !deleted { + // skip updating accounting and putting perhaps later timestamp + return; + } + + // Update the timeline's state. This does not have to be synchronized with + // the download process, because: + // - If downloader is racing with us to remove a file (e.g. because it is + // removed from heatmap), then our mutual .remove() operations will both + // succeed. + // - If downloader is racing with us to download the object (this would require + // multiple eviction iterations to race with multiple download iterations), then + // if we remove it from the state, the worst that happens is the downloader + // downloads it again before re-inserting, or we delete the file but it remains + // in the state map (in which case it will be downloaded if this secondary + // tenant transitions to attached and tries to access it) + // + // The important assumption here is that the secondary timeline state does not + // have to 100% match what is on disk, because it's a best-effort warming + // of the cache. 
+ let mut detail = this.detail.lock().unwrap(); + if let Some(timeline_detail) = detail.timelines.get_mut(&timeline_id) { + timeline_detail.on_disk_layers.remove(&name); + timeline_detail.evicted_at.insert(name, now); + } + }) + .await + .expect("secondary eviction should not have panicked"); } } diff --git a/pageserver/src/tenant/secondary/downloader.rs b/pageserver/src/tenant/secondary/downloader.rs index 702c0b1ec1..b679077358 100644 --- a/pageserver/src/tenant/secondary/downloader.rs +++ b/pageserver/src/tenant/secondary/downloader.rs @@ -16,7 +16,8 @@ use crate::{ config::SecondaryLocationConfig, debug_assert_current_span_has_tenant_and_timeline_id, remote_timeline_client::{ - index::LayerFileMetadata, FAILED_DOWNLOAD_WARN_THRESHOLD, FAILED_REMOTE_OP_RETRIES, + index::LayerFileMetadata, is_temp_download_file, FAILED_DOWNLOAD_WARN_THRESHOLD, + FAILED_REMOTE_OP_RETRIES, }, span::debug_assert_current_span_has_tenant_id, storage_layer::LayerFileName, @@ -37,6 +38,7 @@ use crate::tenant::{ remote_timeline_client::{download::download_layer_file, remote_heatmap_path}, }; +use camino::Utf8PathBuf; use chrono::format::{DelayedFormat, StrftimeItems}; use futures::Future; use pageserver_api::shard::TenantShardId; @@ -44,7 +46,7 @@ use rand::Rng; use remote_storage::{DownloadError, GenericRemoteStorage}; use tokio_util::sync::CancellationToken; -use tracing::{info_span, instrument, Instrument}; +use tracing::{info_span, instrument, warn, Instrument}; use utils::{ backoff, completion::Barrier, crashsafe::path_with_suffix_extension, fs_ext, id::TimelineId, }; @@ -146,14 +148,15 @@ impl SecondaryDetail { } } + /// Additionally returns the total number of layers, used for more stable relative access time + /// based eviction. 
pub(super) fn get_layers_for_eviction( &self, parent: &Arc, - ) -> DiskUsageEvictionInfo { - let mut result = DiskUsageEvictionInfo { - max_layer_size: None, - resident_layers: Vec::new(), - }; + ) -> (DiskUsageEvictionInfo, usize) { + let mut result = DiskUsageEvictionInfo::default(); + let mut total_layers = 0; + for (timeline_id, timeline_detail) in &self.timelines { result .resident_layers @@ -169,6 +172,10 @@ impl SecondaryDetail { relative_last_activity: finite_f32::FiniteF32::ZERO, } })); + + // total might be missing currently downloading layers, but as a lower than actual + // value it is good enough approximation. + total_layers += timeline_detail.on_disk_layers.len() + timeline_detail.evicted_at.len(); } result.max_layer_size = result .resident_layers @@ -183,7 +190,7 @@ impl SecondaryDetail { result.resident_layers.len() ); - result + (result, total_layers) } } @@ -312,9 +319,7 @@ impl JobGenerator for UpdateError { fn from(value: std::io::Error) -> Self { if let Some(nix::errno::Errno::ENOSPC) = value.raw_os_error().map(nix::errno::from_i32) { UpdateError::NoSpace + } else if value + .get_ref() + .and_then(|x| x.downcast_ref::()) + .is_some() + { + UpdateError::from(DownloadError::from(value)) } else { - // An I/O error from e.g. tokio::io::copy is most likely a remote storage issue + // An I/O error from e.g. 
tokio::io::copy_buf is most likely a remote storage issue UpdateError::Other(anyhow::anyhow!(value)) } } @@ -481,14 +492,9 @@ impl<'a> TenantDownloader<'a> { let temp_path = path_with_suffix_extension(&heatmap_path, TEMP_FILE_SUFFIX); let context_msg = format!("write tenant {tenant_shard_id} heatmap to {heatmap_path}"); let heatmap_path_bg = heatmap_path.clone(); - tokio::task::spawn_blocking(move || { - tokio::runtime::Handle::current().block_on(async move { - VirtualFile::crashsafe_overwrite(&heatmap_path_bg, &temp_path, &heatmap_bytes).await - }) - }) - .await - .expect("Blocking task is never aborted") - .maybe_fatal_err(&context_msg)?; + VirtualFile::crashsafe_overwrite(heatmap_path_bg, temp_path, heatmap_bytes) + .await + .maybe_fatal_err(&context_msg)?; tracing::debug!("Wrote local heatmap to {}", heatmap_path); @@ -520,28 +526,29 @@ impl<'a> TenantDownloader<'a> { tracing::debug!("Downloading heatmap for secondary tenant",); let heatmap_path = remote_heatmap_path(tenant_shard_id); + let cancel = &self.secondary_state.cancel; let heatmap_bytes = backoff::retry( || async { let download = self .remote_storage - .download(&heatmap_path) + .download(&heatmap_path, cancel) .await .map_err(UpdateError::from)?; let mut heatmap_bytes = Vec::new(); let mut body = tokio_util::io::StreamReader::new(download.download_stream); - let _size = tokio::io::copy(&mut body, &mut heatmap_bytes).await?; + let _size = tokio::io::copy_buf(&mut body, &mut heatmap_bytes).await?; Ok(heatmap_bytes) }, |e| matches!(e, UpdateError::NoData | UpdateError::Cancelled), FAILED_DOWNLOAD_WARN_THRESHOLD, FAILED_REMOTE_OP_RETRIES, "download heatmap", - backoff::Cancel::new(self.secondary_state.cancel.clone(), || { - UpdateError::Cancelled - }), + cancel, ) - .await?; + .await + .ok_or_else(|| UpdateError::Cancelled) + .and_then(|x| x)?; SECONDARY_MODE.download_heatmap.inc(); @@ -668,20 +675,17 @@ impl<'a> TenantDownloader<'a> { .await { Ok(bytes) => bytes, - Err(e) => { - if let 
DownloadError::NotFound = e { - // A heatmap might be out of date and refer to a layer that doesn't exist any more. - // This is harmless: continue to download the next layer. It is expected during compaction - // GC. - tracing::debug!( - "Skipped downloading missing layer {}, raced with compaction/gc?", - layer.name - ); - continue; - } else { - return Err(e.into()); - } + Err(DownloadError::NotFound) => { + // A heatmap might be out of date and refer to a layer that doesn't exist any more. + // This is harmless: continue to download the next layer. It is expected during compaction + // GC. + tracing::debug!( + "Skipped downloading missing layer {}, raced with compaction/gc?", + layer.name + ); + continue; } + Err(e) => return Err(e.into()), }; if downloaded_bytes != layer.metadata.file_size { @@ -771,19 +775,33 @@ async fn init_timeline_state( .await .fatal_err(&format!("Listing {timeline_path}")) { - let dentry_file_name = dentry.file_name(); - let file_name = dentry_file_name.to_string_lossy(); - let local_meta = dentry.metadata().await.fatal_err(&format!( - "Read metadata on {}", - dentry.path().to_string_lossy() - )); + let Ok(file_path) = Utf8PathBuf::from_path_buf(dentry.path()) else { + tracing::warn!("Malformed filename at {}", dentry.path().to_string_lossy()); + continue; + }; + let local_meta = dentry + .metadata() + .await + .fatal_err(&format!("Read metadata on {}", file_path)); - // Secondary mode doesn't use local metadata files, but they might have been left behind by an attached tenant. + let file_name = file_path.file_name().expect("created it from the dentry"); if file_name == METADATA_FILE_NAME { + // Secondary mode doesn't use local metadata files, but they might have been left behind by an attached tenant. 
+ warn!(path=?dentry.path(), "found legacy metadata file, these should have been removed in load_tenant_config"); + continue; + } else if crate::is_temporary(&file_path) || is_temp_download_file(&file_path) { + // Temporary files are frequently left behind from restarting during downloads + tracing::info!("Cleaning up temporary file {file_path}"); + if let Err(e) = tokio::fs::remove_file(&file_path) + .await + .or_else(fs_ext::ignore_not_found) + { + tracing::error!("Failed to remove temporary file {file_path}: {e}"); + } continue; } - match LayerFileName::from_str(&file_name) { + match LayerFileName::from_str(file_name) { Ok(name) => { let remote_meta = heatmap_metadata.get(&name); match remote_meta { diff --git a/pageserver/src/tenant/secondary/heatmap_uploader.rs b/pageserver/src/tenant/secondary/heatmap_uploader.rs index df865658a4..a8b05f4c0e 100644 --- a/pageserver/src/tenant/secondary/heatmap_uploader.rs +++ b/pageserver/src/tenant/secondary/heatmap_uploader.rs @@ -18,21 +18,19 @@ use crate::{ }; use futures::Future; -use md5; use pageserver_api::shard::TenantShardId; use rand::Rng; -use remote_storage::GenericRemoteStorage; +use remote_storage::{GenericRemoteStorage, TimeoutOrCancel}; use super::{ + heatmap::HeatMapTenant, scheduler::{self, JobGenerator, RunningJob, SchedulingResult, TenantBackgroundJobs}, - CommandRequest, + CommandRequest, UploadCommand, }; use tokio_util::sync::CancellationToken; use tracing::{info_span, instrument, Instrument}; use utils::{backoff, completion::Barrier, yielding_loop::yielding_loop}; -use super::{heatmap::HeatMapTenant, UploadCommand}; - pub(super) async fn heatmap_uploader_task( tenant_manager: Arc, remote_storage: GenericRemoteStorage, @@ -371,17 +369,12 @@ async fn upload_tenant_heatmap( }; let timelines = tenant.timelines.lock().unwrap().clone(); - let tenant_cancel = tenant.cancel.clone(); - // Ensure that Tenant::shutdown waits for any upload in flight: this is needed because otherwise // when we delete a tenant, 
we might race with an upload in flight and end up leaving a heatmap behind // in remote storage. - let _guard = match tenant.gate.enter() { - Ok(g) => g, - Err(_) => { - tracing::info!("Skipping heatmap upload for tenant which is shutting down"); - return Err(UploadHeatmapError::Cancelled); - } + let Ok(_guard) = tenant.gate.enter() else { + tracing::info!("Skipping heatmap upload for tenant which is shutting down"); + return Err(UploadHeatmapError::Cancelled); }; for (timeline_id, timeline) in timelines { @@ -401,6 +394,7 @@ async fn upload_tenant_heatmap( // Serialize the heatmap let bytes = serde_json::to_vec(&heatmap).map_err(|e| anyhow::anyhow!(e))?; + let bytes = bytes::Bytes::from(bytes); let size = bytes.len(); // Drop out early if nothing changed since our last upload @@ -411,26 +405,27 @@ async fn upload_tenant_heatmap( let path = remote_heatmap_path(tenant.get_tenant_shard_id()); - // Write the heatmap. + let cancel = &tenant.cancel; + tracing::debug!("Uploading {size} byte heatmap to {path}"); if let Err(e) = backoff::retry( || async { - let bytes = futures::stream::once(futures::future::ready(Ok(bytes::Bytes::from( - bytes.clone(), - )))); + let bytes = futures::stream::once(futures::future::ready(Ok(bytes.clone()))); remote_storage - .upload_storage_object(bytes, size, &path) + .upload_storage_object(bytes, size, &path, cancel) .await }, - |_| false, + TimeoutOrCancel::caused_by_cancel, 3, u32::MAX, "Uploading heatmap", - backoff::Cancel::new(tenant_cancel.clone(), || anyhow::anyhow!("Shutting down")), + cancel, ) .await + .ok_or_else(|| anyhow::anyhow!("Shutting down")) + .and_then(|x| x) { - if tenant_cancel.is_cancelled() { + if cancel.is_cancelled() { return Err(UploadHeatmapError::Cancelled); } else { return Err(e.into()); diff --git a/pageserver/src/tenant/span.rs b/pageserver/src/tenant/span.rs deleted file mode 100644 index 04e92f4096..0000000000 --- a/pageserver/src/tenant/span.rs +++ /dev/null @@ -1,17 +0,0 @@ -#[cfg(debug_assertions)] -use 
utils::tracing_span_assert::{check_fields_present, MultiNameExtractor}; - -#[cfg(not(debug_assertions))] -pub(crate) fn debug_assert_current_span_has_tenant_id() {} - -#[cfg(debug_assertions)] -pub(crate) static TENANT_ID_EXTRACTOR: once_cell::sync::Lazy> = - once_cell::sync::Lazy::new(|| MultiNameExtractor::new("TenantId", ["tenant_id"])); - -#[cfg(debug_assertions)] -#[track_caller] -pub(crate) fn debug_assert_current_span_has_tenant_id() { - if let Err(missing) = check_fields_present!([&*TENANT_ID_EXTRACTOR]) { - panic!("missing extractors: {missing:?}") - } -} diff --git a/pageserver/src/tenant/storage_layer.rs b/pageserver/src/tenant/storage_layer.rs index 6e9a4932d8..299950cc21 100644 --- a/pageserver/src/tenant/storage_layer.rs +++ b/pageserver/src/tenant/storage_layer.rs @@ -8,15 +8,21 @@ pub(crate) mod layer; mod layer_desc; use crate::context::{AccessStatsBehavior, RequestContext}; +use crate::repository::Value; use crate::task_mgr::TaskKind; use crate::walrecord::NeonWalRecord; use bytes::Bytes; use enum_map::EnumMap; use enumset::EnumSet; use once_cell::sync::Lazy; +use pageserver_api::key::Key; +use pageserver_api::keyspace::{KeySpace, KeySpaceRandomAccum}; use pageserver_api::models::{ LayerAccessKind, LayerResidenceEvent, LayerResidenceEventReason, LayerResidenceStatus, }; +use std::cmp::{Ordering, Reverse}; +use std::collections::hash_map::Entry; +use std::collections::{BinaryHeap, HashMap}; use std::ops::Range; use std::sync::Mutex; use std::time::{Duration, SystemTime, UNIX_EPOCH}; @@ -34,6 +40,11 @@ pub use layer_desc::{PersistentLayerDesc, PersistentLayerKey}; pub(crate) use layer::{EvictionError, Layer, ResidentLayer}; +use super::layer_map::InMemoryLayerHandle; +use super::timeline::layer_manager::LayerManager; +use super::timeline::GetVectoredError; +use super::PageReconstructError; + pub fn range_overlaps(a: &Range, b: &Range) -> bool where T: PartialOrd, @@ -61,12 +72,287 @@ where /// the same ValueReconstructState struct in the next 
'get_value_reconstruct_data' /// call, to collect more records. /// -#[derive(Debug)] +#[derive(Debug, Default)] pub struct ValueReconstructState { pub records: Vec<(Lsn, NeonWalRecord)>, pub img: Option<(Lsn, Bytes)>, } +#[derive(Clone, Copy, Debug, Default, Eq, PartialEq)] +pub(crate) enum ValueReconstructSituation { + Complete, + #[default] + Continue, +} + +/// Reconstruct data accumulated for a single key during a vectored get +#[derive(Debug, Default, Clone)] +pub(crate) struct VectoredValueReconstructState { + pub(crate) records: Vec<(Lsn, NeonWalRecord)>, + pub(crate) img: Option<(Lsn, Bytes)>, + + situation: ValueReconstructSituation, +} + +impl VectoredValueReconstructState { + fn get_cached_lsn(&self) -> Option { + self.img.as_ref().map(|img| img.0) + } +} + +impl From for ValueReconstructState { + fn from(mut state: VectoredValueReconstructState) -> Self { + // walredo expects the records to be descending in terms of Lsn + state.records.sort_by_key(|(lsn, _)| Reverse(*lsn)); + + ValueReconstructState { + records: state.records, + img: state.img, + } + } +} + +/// Bag of data accumulated during a vectored get +pub(crate) struct ValuesReconstructState { + pub(crate) keys: HashMap>, + + keys_done: KeySpaceRandomAccum, +} + +impl ValuesReconstructState { + pub(crate) fn new() -> Self { + Self { + keys: HashMap::new(), + keys_done: KeySpaceRandomAccum::new(), + } + } + + /// Associate a key with the error which it encountered and mark it as done + pub(crate) fn on_key_error(&mut self, key: Key, err: PageReconstructError) { + let previous = self.keys.insert(key, Err(err)); + if let Some(Ok(state)) = previous { + if state.situation == ValueReconstructSituation::Continue { + self.keys_done.add_key(key); + } + } + } + + /// Update the state collected for a given key. + /// Returns true if this was the last value needed for the key and false otherwise. + /// + /// If the key is done after the update, mark it as such. 
+ pub(crate) fn update_key( + &mut self, + key: &Key, + lsn: Lsn, + value: Value, + ) -> ValueReconstructSituation { + let state = self + .keys + .entry(*key) + .or_insert(Ok(VectoredValueReconstructState::default())); + + if let Ok(state) = state { + let key_done = match state.situation { + ValueReconstructSituation::Complete => unreachable!(), + ValueReconstructSituation::Continue => match value { + Value::Image(img) => { + state.img = Some((lsn, img)); + true + } + Value::WalRecord(rec) => { + let reached_cache = + state.get_cached_lsn().map(|clsn| clsn + 1) == Some(lsn); + let will_init = rec.will_init(); + state.records.push((lsn, rec)); + will_init || reached_cache + } + }, + }; + + if key_done && state.situation == ValueReconstructSituation::Continue { + state.situation = ValueReconstructSituation::Complete; + self.keys_done.add_key(*key); + } + + state.situation + } else { + ValueReconstructSituation::Complete + } + } + + /// Returns the Lsn at which this key is cached if one exists. + /// The read path should go no further than this Lsn for the given key. + pub(crate) fn get_cached_lsn(&self, key: &Key) -> Option { + self.keys + .get(key) + .and_then(|k| k.as_ref().ok()) + .and_then(|state| state.get_cached_lsn()) + } + + /// Returns the key space describing the keys that have + /// been marked as completed since the last call to this function. + pub(crate) fn consume_done_keys(&mut self) -> KeySpace { + self.keys_done.consume_keyspace() + } +} + +impl Default for ValuesReconstructState { + fn default() -> Self { + Self::new() + } +} + +/// Description of layer to be read - the layer map can turn +/// this description into the actual layer. 
+#[derive(PartialEq, Eq, Hash, Debug, Clone)] +pub(crate) enum ReadableLayerDesc { + Persistent { + desc: PersistentLayerDesc, + lsn_range: Range, + }, + InMemory { + handle: InMemoryLayerHandle, + lsn_ceil: Lsn, + }, +} + +/// Wrapper for 'ReadableLayerDesc' sorted by Lsn +#[derive(Debug)] +struct ReadableLayerDescOrdered(ReadableLayerDesc); + +/// Data structure which maintains a fringe of layers for the +/// read path. The fringe is the set of layers which intersects +/// the current keyspace that the search is descending on. +/// Each layer tracks the keyspace that intersects it. +/// +/// The fringe must appear sorted by Lsn. Hence, it uses +/// a two layer indexing scheme. +#[derive(Debug)] +pub(crate) struct LayerFringe { + layers_by_lsn: BinaryHeap, + layers: HashMap, +} + +impl LayerFringe { + pub(crate) fn new() -> Self { + LayerFringe { + layers_by_lsn: BinaryHeap::new(), + layers: HashMap::new(), + } + } + + pub(crate) fn next_layer(&mut self) -> Option<(ReadableLayerDesc, KeySpace)> { + let handle = match self.layers_by_lsn.pop() { + Some(h) => h, + None => return None, + }; + + let removed = self.layers.remove_entry(&handle.0); + match removed { + Some((layer, keyspace)) => Some((layer, keyspace)), + None => unreachable!("fringe internals are always consistent"), + } + } + + pub(crate) fn update(&mut self, layer: ReadableLayerDesc, keyspace: KeySpace) { + let entry = self.layers.entry(layer.clone()); + match entry { + Entry::Occupied(mut entry) => { + entry.get_mut().merge(&keyspace); + } + Entry::Vacant(entry) => { + self.layers_by_lsn + .push(ReadableLayerDescOrdered(entry.key().clone())); + entry.insert(keyspace); + } + } + } +} + +impl Default for LayerFringe { + fn default() -> Self { + Self::new() + } +} + +impl Ord for ReadableLayerDescOrdered { + fn cmp(&self, other: &Self) -> Ordering { + let ord = self.0.get_lsn_ceil().cmp(&other.0.get_lsn_ceil()); + if ord == std::cmp::Ordering::Equal { + self.0 + .get_lsn_floor() +
.cmp(&other.0.get_lsn_floor()) + .reverse() + } else { + ord + } + } +} + +impl PartialOrd for ReadableLayerDescOrdered { + fn partial_cmp(&self, other: &Self) -> Option { + Some(self.cmp(other)) + } +} + +impl PartialEq for ReadableLayerDescOrdered { + fn eq(&self, other: &Self) -> bool { + self.0.get_lsn_floor() == other.0.get_lsn_floor() + && self.0.get_lsn_ceil() == other.0.get_lsn_ceil() + } +} + +impl Eq for ReadableLayerDescOrdered {} + +impl ReadableLayerDesc { + pub(crate) fn get_lsn_floor(&self) -> Lsn { + match self { + ReadableLayerDesc::Persistent { lsn_range, .. } => lsn_range.start, + ReadableLayerDesc::InMemory { handle, .. } => handle.get_lsn_floor(), + } + } + + pub(crate) fn get_lsn_ceil(&self) -> Lsn { + match self { + ReadableLayerDesc::Persistent { lsn_range, .. } => lsn_range.end, + ReadableLayerDesc::InMemory { lsn_ceil, .. } => *lsn_ceil, + } + } + + pub(crate) async fn get_values_reconstruct_data( + &self, + layer_manager: &LayerManager, + keyspace: KeySpace, + reconstruct_state: &mut ValuesReconstructState, + ctx: &RequestContext, + ) -> Result<(), GetVectoredError> { + match self { + ReadableLayerDesc::Persistent { desc, lsn_range } => { + let layer = layer_manager.get_from_desc(desc); + layer + .get_values_reconstruct_data( + keyspace, + lsn_range.clone(), + reconstruct_state, + ctx, + ) + .await + } + ReadableLayerDesc::InMemory { handle, lsn_ceil } => { + let layer = layer_manager + .layer_map() + .get_in_memory_layer(handle) + .unwrap(); + + layer + .get_values_reconstruct_data(keyspace, *lsn_ceil, reconstruct_state, ctx) + .await + } + } + } +} + /// Return value from [`Layer::get_value_reconstruct_data`] #[derive(Clone, Copy, Debug)] pub enum ValueReconstructResult { @@ -257,6 +543,12 @@ impl LayerAccessStats { ret } + /// Get the latest access timestamp, falling back to latest residence event, further falling + /// back to `SystemTime::now` for a usable timestamp for eviction. 
+ pub(crate) fn latest_activity_or_now(&self) -> SystemTime { + self.latest_activity().unwrap_or_else(SystemTime::now) + } + /// Get the latest access timestamp, falling back to latest residence event. /// /// This function can only return `None` if there has not yet been a call to the @@ -271,7 +563,7 @@ impl LayerAccessStats { /// that that type can only be produced by inserting into the layer map. /// /// [`record_residence_event`]: Self::record_residence_event - pub(crate) fn latest_activity(&self) -> Option { + fn latest_activity(&self) -> Option { let locked = self.0.lock().unwrap(); let inner = &locked.for_eviction_policy; match inner.last_accesses.recent() { diff --git a/pageserver/src/tenant/storage_layer/delta_layer.rs b/pageserver/src/tenant/storage_layer/delta_layer.rs index 3a445ef71e..b7132ee3bf 100644 --- a/pageserver/src/tenant/storage_layer/delta_layer.rs +++ b/pageserver/src/tenant/storage_layer/delta_layer.rs @@ -29,18 +29,25 @@ //! use crate::config::PageServerConf; use crate::context::{PageContentKind, RequestContext, RequestContextBuilder}; -use crate::page_cache::PAGE_SZ; +use crate::page_cache::{self, FileId, PAGE_SZ}; use crate::repository::{Key, Value, KEY_SIZE}; use crate::tenant::blob_io::BlobWriter; use crate::tenant::block_io::{BlockBuf, BlockCursor, BlockLease, BlockReader, FileBlockReader}; use crate::tenant::disk_btree::{DiskBtreeBuilder, DiskBtreeReader, VisitDirection}; use crate::tenant::storage_layer::{Layer, ValueReconstructResult, ValueReconstructState}; -use crate::tenant::Timeline; +use crate::tenant::timeline::GetVectoredError; +use crate::tenant::vectored_blob_io::{ + BlobFlag, MaxVectoredReadBytes, VectoredBlobReader, VectoredRead, VectoredReadPlanner, +}; +use crate::tenant::{PageReconstructError, Timeline}; use crate::virtual_file::{self, VirtualFile}; use crate::{walrecord, TEMP_FILE_SUFFIX}; use crate::{DELTA_FILE_MAGIC, STORAGE_FORMAT_VERSION}; -use anyhow::{bail, ensure, Context, Result}; +use anyhow::{anyhow, bail, 
ensure, Context, Result}; +use bytes::BytesMut; use camino::{Utf8Path, Utf8PathBuf}; +use futures::StreamExt; +use pageserver_api::keyspace::KeySpace; use pageserver_api::models::LayerAccessKind; use pageserver_api::shard::TenantShardId; use rand::{distributions::Alphanumeric, Rng}; @@ -59,7 +66,9 @@ use utils::{ lsn::Lsn, }; -use super::{AsLayerDesc, LayerAccessStats, PersistentLayerDesc, ResidentLayer}; +use super::{ + AsLayerDesc, LayerAccessStats, PersistentLayerDesc, ResidentLayer, ValuesReconstructState, +}; /// /// Header stored in the beginning of the file @@ -208,8 +217,10 @@ pub struct DeltaLayerInner { index_start_blk: u32, index_root_blk: u32, - /// Reader object for reading blocks from the file. - file: FileBlockReader, + file: VirtualFile, + file_id: FileId, + + max_vectored_read_bytes: Option, } impl std::fmt::Debug for DeltaLayerInner { @@ -291,7 +302,7 @@ impl DeltaLayer { async fn load_inner(&self, ctx: &RequestContext) -> Result> { let path = self.path(); - let loaded = DeltaLayerInner::load(&path, None, ctx) + let loaded = DeltaLayerInner::load(&path, None, None, ctx) .await .and_then(|res| res)?; @@ -416,27 +427,31 @@ impl DeltaLayerWriterInner { /// The values must be appended in key, lsn order. 
/// async fn put_value(&mut self, key: Key, lsn: Lsn, val: Value) -> anyhow::Result<()> { - self.put_value_bytes(key, lsn, &Value::ser(&val)?, val.will_init()) - .await + let (_, res) = self + .put_value_bytes(key, lsn, Value::ser(&val)?, val.will_init()) + .await; + res } async fn put_value_bytes( &mut self, key: Key, lsn: Lsn, - val: &[u8], + val: Vec, will_init: bool, - ) -> anyhow::Result<()> { + ) -> (Vec, anyhow::Result<()>) { assert!(self.lsn_range.start <= lsn); - - let off = self.blob_writer.write_blob(val).await?; + let (val, res) = self.blob_writer.write_blob(val).await; + let off = match res { + Ok(off) => off, + Err(e) => return (val, Err(anyhow::anyhow!(e))), + }; let blob_ref = BlobRef::new(off, will_init); let delta_key = DeltaKey::from_key_lsn(&key, lsn); - self.tree.append(&delta_key.0, blob_ref.0)?; - - Ok(()) + let res = self.tree.append(&delta_key.0, blob_ref.0); + (val, res.map_err(|e| anyhow::anyhow!(e))) } fn size(&self) -> u64 { @@ -457,7 +472,8 @@ impl DeltaLayerWriterInner { file.seek(SeekFrom::Start(index_start_blk as u64 * PAGE_SZ as u64)) .await?; for buf in block_buf.blocks { - file.write_all(buf.as_ref()).await?; + let (_buf, res) = file.write_all(buf).await; + res?; } assert!(self.lsn_range.start < self.lsn_range.end); // Fill in the summary on blk 0 @@ -472,17 +488,12 @@ impl DeltaLayerWriterInner { index_root_blk, }; - let mut buf = smallvec::SmallVec::<[u8; PAGE_SZ]>::new(); + let mut buf = Vec::with_capacity(PAGE_SZ); + // TODO: could use smallvec here but it's a pain with Slice Summary::ser_into(&summary, &mut buf)?; - if buf.spilled() { - // This is bad as we only have one free block for the summary - warn!( - "Used more than one page size for summary buffer: {}", - buf.len() - ); - } file.seek(SeekFrom::Start(0)).await?; - file.write_all(&buf).await?; + let (_buf, res) = file.write_all(buf).await; + res?; let metadata = file .metadata() @@ -587,9 +598,9 @@ impl DeltaLayerWriter { &mut self, key: Key, lsn: Lsn, - val: &[u8], + 
val: Vec, will_init: bool, - ) -> anyhow::Result<()> { + ) -> (Vec, anyhow::Result<()>) { self.inner .as_mut() .unwrap() @@ -609,7 +620,19 @@ impl DeltaLayerWriter { key_end: Key, timeline: &Arc, ) -> anyhow::Result { - self.inner.take().unwrap().finish(key_end, timeline).await + let inner = self.inner.take().unwrap(); + let temp_path = inner.path.clone(); + let result = inner.finish(key_end, timeline).await; + // The delta layer files can sometimes be really large. Clean them up. + if result.is_err() { + tracing::warn!( + "Cleaning up temporary delta file {temp_path} after error during writing" + ); + if let Err(e) = std::fs::remove_file(&temp_path) { + tracing::warn!("Error cleaning up temporary delta layer file {temp_path}: {e:?}") + } + } + result } } @@ -647,34 +670,28 @@ impl DeltaLayer { where F: Fn(Summary) -> Summary, { - let file = VirtualFile::open_with_options( + let mut file = VirtualFile::open_with_options( path, virtual_file::OpenOptions::new().read(true).write(true), ) .await .with_context(|| format!("Failed to open file '{}'", path))?; - let file = FileBlockReader::new(file); - let summary_blk = file.read_blk(0, ctx).await?; + let file_id = page_cache::next_file_id(); + let block_reader = FileBlockReader::new(&file, file_id); + let summary_blk = block_reader.read_blk(0, ctx).await?; let actual_summary = Summary::des_prefix(summary_blk.as_ref()).context("deserialize")?; - let mut file = file.file; if actual_summary.magic != DELTA_FILE_MAGIC { return Err(RewriteSummaryError::MagicMismatch); } let new_summary = rewrite(actual_summary); - let mut buf = smallvec::SmallVec::<[u8; PAGE_SZ]>::new(); + let mut buf = Vec::with_capacity(PAGE_SZ); + // TODO: could use smallvec here, but it's a pain with Slice Summary::ser_into(&new_summary, &mut buf).context("serialize")?; - if buf.spilled() { - // The code in DeltaLayerWriterInner just warn!()s for this. - // It should probably error out as well. 
- return Err(RewriteSummaryError::Other(anyhow::anyhow!( - "Used more than one page size for summary buffer: {}", - buf.len() - ))); - } file.seek(SeekFrom::Start(0)).await?; - file.write_all(&buf).await?; + let (_buf, res) = file.write_all(buf).await; + res?; Ok(()) } } @@ -686,15 +703,18 @@ impl DeltaLayerInner { pub(super) async fn load( path: &Utf8Path, summary: Option, + max_vectored_read_bytes: Option, ctx: &RequestContext, ) -> Result, anyhow::Error> { let file = match VirtualFile::open(path).await { Ok(file) => file, Err(e) => return Ok(Err(anyhow::Error::new(e).context("open layer file"))), }; - let file = FileBlockReader::new(file); + let file_id = page_cache::next_file_id(); - let summary_blk = match file.read_blk(0, ctx).await { + let block_reader = FileBlockReader::new(&file, file_id); + + let summary_blk = match block_reader.read_blk(0, ctx).await { Ok(blk) => blk, Err(e) => return Ok(Err(anyhow::Error::new(e).context("read first block"))), }; @@ -718,8 +738,10 @@ impl DeltaLayerInner { Ok(Ok(DeltaLayerInner { file, + file_id, index_start_blk: actual_summary.index_start_blk, index_root_blk: actual_summary.index_root_blk, + max_vectored_read_bytes, })) } @@ -732,11 +754,11 @@ impl DeltaLayerInner { ) -> anyhow::Result { let mut need_image = true; // Scan the page versions backwards, starting from `lsn`. 
- let file = &self.file; + let block_reader = FileBlockReader::new(&self.file, self.file_id); let tree_reader = DiskBtreeReader::<_, DELTA_KEY_SIZE>::new( self.index_start_blk, self.index_root_blk, - file, + &block_reader, ); let search_key = DeltaKey::from_key_lsn(&key, Lsn(lsn_range.end.0 - 1)); @@ -770,19 +792,19 @@ impl DeltaLayerInner { .build(); // Ok, 'offsets' now contains the offsets of all the entries we need to read - let cursor = file.block_cursor(); + let cursor = block_reader.block_cursor(); let mut buf = Vec::new(); for (entry_lsn, pos) in offsets { cursor .read_blob_into_buf(pos, &mut buf, ctx) .await .with_context(|| { - format!("Failed to read blob from virtual file {}", file.file.path) + format!("Failed to read blob from virtual file {}", self.file.path) })?; let val = Value::des(&buf).with_context(|| { format!( "Failed to deserialize file blob from virtual file {}", - file.file.path + self.file.path ) })?; match val { @@ -812,16 +834,205 @@ impl DeltaLayerInner { } } + // Look up the keys in the provided keyspace and update + // the reconstruct state with whatever is found. + // + // If the key is cached, go no further than the cached Lsn. + // + // Currently, the index is visited for each range, but this + // can be further optimised to visit the index only once. 
+ pub(super) async fn get_values_reconstruct_data( + &self, + keyspace: KeySpace, + lsn_range: Range, + reconstruct_state: &mut ValuesReconstructState, + ctx: &RequestContext, + ) -> Result<(), GetVectoredError> { + let block_reader = FileBlockReader::new(&self.file, self.file_id); + let index_reader = DiskBtreeReader::<_, DELTA_KEY_SIZE>::new( + self.index_start_blk, + self.index_root_blk, + block_reader, + ); + + let planner = VectoredReadPlanner::new( + self.max_vectored_read_bytes + .expect("Layer is loaded with max vectored bytes config") + .0 + .into(), + ); + + let data_end_offset = self.index_start_blk as u64 * PAGE_SZ as u64; + + let reads = Self::plan_reads( + keyspace, + lsn_range, + data_end_offset, + index_reader, + planner, + reconstruct_state, + ctx, + ) + .await + .map_err(GetVectoredError::Other)?; + + self.do_reads_and_update_state(reads, reconstruct_state) + .await; + + Ok(()) + } + + async fn plan_reads( + keyspace: KeySpace, + lsn_range: Range, + data_end_offset: u64, + index_reader: DiskBtreeReader, + mut planner: VectoredReadPlanner, + reconstruct_state: &mut ValuesReconstructState, + ctx: &RequestContext, + ) -> anyhow::Result> + where + Reader: BlockReader, + { + let ctx = RequestContextBuilder::extend(ctx) + .page_content_kind(PageContentKind::DeltaLayerBtreeNode) + .build(); + + for range in keyspace.ranges.iter() { + let mut range_end_handled = false; + + let start_key = DeltaKey::from_key_lsn(&range.start, lsn_range.start); + let index_stream = index_reader.get_stream_from(&start_key.0, &ctx); + let mut index_stream = std::pin::pin!(index_stream); + + while let Some(index_entry) = index_stream.next().await { + let (raw_key, value) = index_entry?; + let key = Key::from_slice(&raw_key[..KEY_SIZE]); + let lsn = DeltaKey::extract_lsn_from_buf(&raw_key); + let blob_ref = BlobRef(value); + + // Lsns are not monotonically increasing across keys, so we don't assert on them. 
+ assert!(key >= range.start); + + let outside_lsn_range = !lsn_range.contains(&lsn); + let below_cached_lsn = reconstruct_state.get_cached_lsn(&key) >= Some(lsn); + + let flag = { + if outside_lsn_range || below_cached_lsn { + BlobFlag::Ignore + } else if blob_ref.will_init() { + BlobFlag::ReplaceAll + } else { + // Usual path: add blob to the read + BlobFlag::None + } + }; + + if key >= range.end || (key.next() == range.end && lsn >= lsn_range.end) { + planner.handle_range_end(blob_ref.pos()); + range_end_handled = true; + break; + } else { + planner.handle(key, lsn, blob_ref.pos(), flag); + } + } + + if !range_end_handled { + tracing::info!("Handling range end fallback at {}", data_end_offset); + planner.handle_range_end(data_end_offset); + } + } + + Ok(planner.finish()) + } + + async fn do_reads_and_update_state( + &self, + reads: Vec, + reconstruct_state: &mut ValuesReconstructState, + ) { + let vectored_blob_reader = VectoredBlobReader::new(&self.file); + let mut ignore_key_with_err = None; + + let max_vectored_read_bytes = self + .max_vectored_read_bytes + .expect("Layer is loaded with max vectored bytes config") + .0 + .into(); + let mut buf = Some(BytesMut::with_capacity(max_vectored_read_bytes)); + + // Note that reads are processed in reverse order (from highest key+lsn). + // This is the order that `ReconstructState` requires such that it can + // track when a key is done. + for read in reads.into_iter().rev() { + let res = vectored_blob_reader + .read_blobs(&read, buf.take().expect("Should have a buffer")) + .await; + + let blobs_buf = match res { + Ok(blobs_buf) => blobs_buf, + Err(err) => { + let kind = err.kind(); + for (_, blob_meta) in read.blobs_at.as_slice() { + reconstruct_state.on_key_error( + blob_meta.key, + PageReconstructError::from(anyhow!( + "Failed to read blobs from virtual file {}: {}", + self.file.path, + kind + )), + ); + } + + // We have "lost" the buffer since the lower level IO api + // doesn't return the buffer on error. 
Allocate a new one. + buf = Some(BytesMut::with_capacity(max_vectored_read_bytes)); + + continue; + } + }; + + for meta in blobs_buf.blobs.iter().rev() { + if Some(meta.meta.key) == ignore_key_with_err { + continue; + } + + let value = Value::des(&blobs_buf.buf[meta.start..meta.end]); + let value = match value { + Ok(v) => v, + Err(e) => { + reconstruct_state.on_key_error( + meta.meta.key, + PageReconstructError::from(anyhow!(e).context(format!( + "Failed to deserialize blob from virtual file {}", + self.file.path, + ))), + ); + + ignore_key_with_err = Some(meta.meta.key); + continue; + } + }; + + // Invariant: once a key reaches [`ValueReconstructSituation::Complete`] + // state, no further updates shall be made to it. The call below will + // panic if the invariant is violated. + reconstruct_state.update_key(&meta.meta.key, meta.meta.lsn, value); + } + + buf = Some(blobs_buf.buf); + } + } + pub(super) async fn load_keys<'a>( &'a self, ctx: &RequestContext, ) -> Result>> { - let file = &self.file; - + let block_reader = FileBlockReader::new(&self.file, self.file_id); let tree_reader = DiskBtreeReader::<_, DELTA_KEY_SIZE>::new( self.index_start_blk, self.index_root_blk, - file, + block_reader, ); let mut all_keys: Vec> = Vec::new(); @@ -873,18 +1084,18 @@ impl DeltaLayerInner { self.index_start_blk, self.index_root_blk ); - let file = &self.file; + let block_reader = FileBlockReader::new(&self.file, self.file_id); let tree_reader = DiskBtreeReader::<_, DELTA_KEY_SIZE>::new( self.index_start_blk, self.index_root_blk, - file, + block_reader, ); tree_reader.dump().await?; let keys = self.load_keys(ctx).await?; - async fn dump_blob(val: ValueRef<'_>, ctx: &RequestContext) -> anyhow::Result { + async fn dump_blob(val: &ValueRef<'_>, ctx: &RequestContext) -> anyhow::Result { let buf = val.reader.read_blob(val.blob_ref.pos(), ctx).await?; let val = Value::des(&buf)?; let desc = match val { @@ -906,13 +1117,32 @@ impl DeltaLayerInner { for entry in keys { let DeltaEntry { 
key, lsn, val, .. } = entry; - let desc = match dump_blob(val, ctx).await { + let desc = match dump_blob(&val, ctx).await { Ok(desc) => desc, Err(err) => { format!("ERROR: {err}") } }; println!(" key {key} at {lsn}: {desc}"); + + // Print more details about CHECKPOINT records. Would be nice to print details + // of many other record types too, but these are particularly interesting, as + // we have a lot of special processing for them in walingest.rs. + use pageserver_api::key::CHECKPOINT_KEY; + use postgres_ffi::CheckPoint; + if key == CHECKPOINT_KEY { + let buf = val.reader.read_blob(val.blob_ref.pos(), ctx).await?; + let val = Value::des(&buf)?; + match val { + Value::Image(img) => { + let checkpoint = CheckPoint::decode(&img)?; + println!(" CHECKPOINT: {:?}", checkpoint); + } + Value::WalRecord(_rec) => { + println!(" unexpected walrecord value for checkpoint key"); + } + } + } } Ok(()) @@ -953,7 +1183,8 @@ impl> Adapter { blknum: u32, ctx: &RequestContext, ) -> Result { - self.0.as_ref().file.read_blk(blknum, ctx).await + let block_reader = FileBlockReader::new(&self.0.as_ref().file, self.0.as_ref().file_id); + block_reader.read_blk(blknum, ctx).await } } @@ -962,3 +1193,143 @@ impl AsRef for DeltaLayerInner { self } } + +impl<'a> pageserver_compaction::interface::CompactionDeltaEntry<'a, Key> for DeltaEntry<'a> { + fn key(&self) -> Key { + self.key + } + fn lsn(&self) -> Lsn { + self.lsn + } + fn size(&self) -> u64 { + self.size + } +} + +#[cfg(test)] +mod test { + use std::collections::BTreeMap; + + use super::*; + use crate::{ + context::DownloadBehavior, task_mgr::TaskKind, tenant::disk_btree::tests::TestDisk, + }; + + /// Construct an index for a fictional delta layer and then + /// traverse in order to plan vectored reads for a query. Finally, + /// verify that the traversal fed the right index key and value + /// pairs into the planner.
+ #[tokio::test] + async fn test_delta_layer_index_traversal() { + let base_key = Key { + field1: 0, + field2: 1663, + field3: 12972, + field4: 16396, + field5: 0, + field6: 246080, + }; + + // Populate the index with some entries + let entries: BTreeMap> = BTreeMap::from([ + (base_key, vec![Lsn(1), Lsn(5), Lsn(25), Lsn(26), Lsn(28)]), + (base_key.add(1), vec![Lsn(2), Lsn(5), Lsn(10), Lsn(50)]), + (base_key.add(2), vec![Lsn(2), Lsn(5), Lsn(10), Lsn(50)]), + (base_key.add(5), vec![Lsn(10), Lsn(15), Lsn(16), Lsn(20)]), + ]); + + let mut disk = TestDisk::default(); + let mut writer = DiskBtreeBuilder::<_, DELTA_KEY_SIZE>::new(&mut disk); + + let mut disk_offset = 0; + for (key, lsns) in &entries { + for lsn in lsns { + let index_key = DeltaKey::from_key_lsn(key, *lsn); + let blob_ref = BlobRef::new(disk_offset, false); + writer + .append(&index_key.0, blob_ref.0) + .expect("In memory disk append should never fail"); + + disk_offset += 1; + } + } + + // Prepare all the arguments for the call into `plan_reads` below + let (root_offset, _writer) = writer + .finish() + .expect("In memory disk finish should never fail"); + let reader = DiskBtreeReader::<_, DELTA_KEY_SIZE>::new(0, root_offset, disk); + let planner = VectoredReadPlanner::new(100); + let mut reconstruct_state = ValuesReconstructState::new(); + let ctx = RequestContext::new(TaskKind::UnitTest, DownloadBehavior::Error); + + let keyspace = KeySpace { + ranges: vec![ + base_key..base_key.add(3), + base_key.add(3)..base_key.add(100), + ], + }; + let lsn_range = Lsn(2)..Lsn(40); + + // Plan and validate + let vectored_reads = DeltaLayerInner::plan_reads( + keyspace.clone(), + lsn_range.clone(), + disk_offset, + reader, + planner, + &mut reconstruct_state, + &ctx, + ) + .await + .expect("Read planning should not fail"); + + validate(keyspace, lsn_range, vectored_reads, entries); + } + + fn validate( + keyspace: KeySpace, + lsn_range: Range, + vectored_reads: Vec, + index_entries: BTreeMap>, + ) { + #[derive(Debug, 
PartialEq, Eq)] + struct BlobSpec { + key: Key, + lsn: Lsn, + at: u64, + } + + let mut planned_blobs = Vec::new(); + for read in vectored_reads { + for (at, meta) in read.blobs_at.as_slice() { + planned_blobs.push(BlobSpec { + key: meta.key, + lsn: meta.lsn, + at: *at, + }); + } + } + + let mut expected_blobs = Vec::new(); + let mut disk_offset = 0; + for (key, lsns) in index_entries { + for lsn in lsns { + let key_included = keyspace.ranges.iter().any(|range| range.contains(&key)); + let lsn_included = lsn_range.contains(&lsn); + + if key_included && lsn_included { + expected_blobs.push(BlobSpec { + key, + lsn, + at: disk_offset, + }); + } + + disk_offset += 1; + } + } + + assert_eq!(planned_blobs, expected_blobs); + } +} diff --git a/pageserver/src/tenant/storage_layer/image_layer.rs b/pageserver/src/tenant/storage_layer/image_layer.rs index c62e6aed51..14c79e413c 100644 --- a/pageserver/src/tenant/storage_layer/image_layer.rs +++ b/pageserver/src/tenant/storage_layer/image_layer.rs @@ -25,21 +25,26 @@ //! actual page images are stored in the "values" part. 
use crate::config::PageServerConf; use crate::context::{PageContentKind, RequestContext, RequestContextBuilder}; -use crate::page_cache::PAGE_SZ; -use crate::repository::{Key, KEY_SIZE}; +use crate::page_cache::{self, FileId, PAGE_SZ}; +use crate::repository::{Key, Value, KEY_SIZE}; use crate::tenant::blob_io::BlobWriter; use crate::tenant::block_io::{BlockBuf, BlockReader, FileBlockReader}; use crate::tenant::disk_btree::{DiskBtreeBuilder, DiskBtreeReader, VisitDirection}; use crate::tenant::storage_layer::{ LayerAccessStats, ValueReconstructResult, ValueReconstructState, }; -use crate::tenant::Timeline; +use crate::tenant::timeline::GetVectoredError; +use crate::tenant::vectored_blob_io::{ + BlobFlag, MaxVectoredReadBytes, VectoredBlobReader, VectoredRead, VectoredReadPlanner, +}; +use crate::tenant::{PageReconstructError, Timeline}; use crate::virtual_file::{self, VirtualFile}; use crate::{IMAGE_FILE_MAGIC, STORAGE_FORMAT_VERSION, TEMP_FILE_SUFFIX}; -use anyhow::{bail, ensure, Context, Result}; -use bytes::Bytes; +use anyhow::{anyhow, bail, ensure, Context, Result}; +use bytes::{Bytes, BytesMut}; use camino::{Utf8Path, Utf8PathBuf}; use hex; +use pageserver_api::keyspace::KeySpace; use pageserver_api::models::LayerAccessKind; use pageserver_api::shard::TenantShardId; use rand::{distributions::Alphanumeric, Rng}; @@ -50,6 +55,7 @@ use std::ops::Range; use std::os::unix::prelude::FileExt; use std::sync::Arc; use tokio::sync::OnceCell; +use tokio_stream::StreamExt; use tracing::*; use utils::{ @@ -59,7 +65,7 @@ use utils::{ }; use super::filename::ImageFileName; -use super::{AsLayerDesc, Layer, PersistentLayerDesc, ResidentLayer}; +use super::{AsLayerDesc, Layer, PersistentLayerDesc, ResidentLayer, ValuesReconstructState}; /// /// Header stored in the beginning of the file @@ -150,8 +156,10 @@ pub struct ImageLayerInner { lsn: Lsn, - /// Reader object for reading blocks from the file. 
- file: FileBlockReader, + file: VirtualFile, + file_id: FileId, + + max_vectored_read_bytes: Option, } impl std::fmt::Debug for ImageLayerInner { @@ -165,9 +173,12 @@ impl std::fmt::Debug for ImageLayerInner { impl ImageLayerInner { pub(super) async fn dump(&self, ctx: &RequestContext) -> anyhow::Result<()> { - let file = &self.file; - let tree_reader = - DiskBtreeReader::<_, KEY_SIZE>::new(self.index_start_blk, self.index_root_blk, file); + let block_reader = FileBlockReader::new(&self.file, self.file_id); + let tree_reader = DiskBtreeReader::<_, KEY_SIZE>::new( + self.index_start_blk, + self.index_root_blk, + block_reader, + ); tree_reader.dump().await?; @@ -250,7 +261,7 @@ impl ImageLayer { async fn load_inner(&self, ctx: &RequestContext) -> Result { let path = self.path(); - let loaded = ImageLayerInner::load(&path, self.desc.image_layer_lsn(), None, ctx) + let loaded = ImageLayerInner::load(&path, self.desc.image_layer_lsn(), None, None, ctx) .await .and_then(|res| res)?; @@ -325,34 +336,28 @@ impl ImageLayer { where F: Fn(Summary) -> Summary, { - let file = VirtualFile::open_with_options( + let mut file = VirtualFile::open_with_options( path, virtual_file::OpenOptions::new().read(true).write(true), ) .await .with_context(|| format!("Failed to open file '{}'", path))?; - let file = FileBlockReader::new(file); - let summary_blk = file.read_blk(0, ctx).await?; + let file_id = page_cache::next_file_id(); + let block_reader = FileBlockReader::new(&file, file_id); + let summary_blk = block_reader.read_blk(0, ctx).await?; let actual_summary = Summary::des_prefix(summary_blk.as_ref()).context("deserialize")?; - let mut file = file.file; if actual_summary.magic != IMAGE_FILE_MAGIC { return Err(RewriteSummaryError::MagicMismatch); } let new_summary = rewrite(actual_summary); - let mut buf = smallvec::SmallVec::<[u8; PAGE_SZ]>::new(); + let mut buf = Vec::with_capacity(PAGE_SZ); + // TODO: could use smallvec here but it's a pain with Slice 
Summary::ser_into(&new_summary, &mut buf).context("serialize")?; - if buf.spilled() { - // The code in ImageLayerWriterInner just warn!()s for this. - // It should probably error out as well. - return Err(RewriteSummaryError::Other(anyhow::anyhow!( - "Used more than one page size for summary buffer: {}", - buf.len() - ))); - } file.seek(SeekFrom::Start(0)).await?; - file.write_all(&buf).await?; + let (_buf, res) = file.write_all(buf).await; + res?; Ok(()) } } @@ -365,14 +370,16 @@ impl ImageLayerInner { path: &Utf8Path, lsn: Lsn, summary: Option, + max_vectored_read_bytes: Option, ctx: &RequestContext, ) -> Result, anyhow::Error> { let file = match VirtualFile::open(path).await { Ok(file) => file, Err(e) => return Ok(Err(anyhow::Error::new(e).context("open layer file"))), }; - let file = FileBlockReader::new(file); - let summary_blk = match file.read_blk(0, ctx).await { + let file_id = page_cache::next_file_id(); + let block_reader = FileBlockReader::new(&file, file_id); + let summary_blk = match block_reader.read_blk(0, ctx).await { Ok(blk) => blk, Err(e) => return Ok(Err(anyhow::Error::new(e).context("read first block"))), }; @@ -403,6 +410,8 @@ impl ImageLayerInner { index_root_blk: actual_summary.index_root_blk, lsn, file, + file_id, + max_vectored_read_bytes, })) } @@ -412,8 +421,9 @@ impl ImageLayerInner { reconstruct_state: &mut ValueReconstructState, ctx: &RequestContext, ) -> anyhow::Result { - let file = &self.file; - let tree_reader = DiskBtreeReader::new(self.index_start_blk, self.index_root_blk, file); + let block_reader = FileBlockReader::new(&self.file, self.file_id); + let tree_reader = + DiskBtreeReader::new(self.index_start_blk, self.index_root_blk, &block_reader); let mut keybuf: [u8; KEY_SIZE] = [0u8; KEY_SIZE]; key.write_to_byte_slice(&mut keybuf); @@ -426,7 +436,7 @@ impl ImageLayerInner { ) .await? 
{ - let blob = file + let blob = block_reader .block_cursor() .read_blob( offset, @@ -444,6 +454,124 @@ impl ImageLayerInner { Ok(ValueReconstructResult::Missing) } } + + // Look up the keys in the provided keyspace and update + // the reconstruct state with whatever is found. + pub(super) async fn get_values_reconstruct_data( + &self, + keyspace: KeySpace, + reconstruct_state: &mut ValuesReconstructState, + ctx: &RequestContext, + ) -> Result<(), GetVectoredError> { + let reads = self + .plan_reads(keyspace, ctx) + .await + .map_err(GetVectoredError::Other)?; + + self.do_reads_and_update_state(reads, reconstruct_state) + .await; + + Ok(()) + } + + async fn plan_reads( + &self, + keyspace: KeySpace, + ctx: &RequestContext, + ) -> anyhow::Result> { + let mut planner = VectoredReadPlanner::new( + self.max_vectored_read_bytes + .expect("Layer is loaded with max vectored bytes config") + .0 + .into(), + ); + + let block_reader = FileBlockReader::new(&self.file, self.file_id); + let tree_reader = + DiskBtreeReader::new(self.index_start_blk, self.index_root_blk, block_reader); + + let ctx = RequestContextBuilder::extend(ctx) + .page_content_kind(PageContentKind::ImageLayerBtreeNode) + .build(); + + for range in keyspace.ranges.iter() { + let mut range_end_handled = false; + + let mut search_key: [u8; KEY_SIZE] = [0u8; KEY_SIZE]; + range.start.write_to_byte_slice(&mut search_key); + + let index_stream = tree_reader.get_stream_from(&search_key, &ctx); + let mut index_stream = std::pin::pin!(index_stream); + + while let Some(index_entry) = index_stream.next().await { + let (raw_key, offset) = index_entry?; + + let key = Key::from_slice(&raw_key[..KEY_SIZE]); + assert!(key >= range.start); + + if key >= range.end { + planner.handle_range_end(offset); + range_end_handled = true; + break; + } else { + planner.handle(key, self.lsn, offset, BlobFlag::None); + } + } + + if !range_end_handled { + let payload_end = self.index_start_blk as u64 * PAGE_SZ as u64; + 
planner.handle_range_end(payload_end); + } + } + + Ok(planner.finish()) + } + + async fn do_reads_and_update_state( + &self, + reads: Vec, + reconstruct_state: &mut ValuesReconstructState, + ) { + let max_vectored_read_bytes = self + .max_vectored_read_bytes + .expect("Layer is loaded with max vectored bytes config") + .0 + .into(); + + let vectored_blob_reader = VectoredBlobReader::new(&self.file); + for read in reads.into_iter() { + let buf = BytesMut::with_capacity(max_vectored_read_bytes); + let res = vectored_blob_reader.read_blobs(&read, buf).await; + + match res { + Ok(blobs_buf) => { + let frozen_buf = blobs_buf.buf.freeze(); + + for meta in blobs_buf.blobs.iter() { + let img_buf = frozen_buf.slice(meta.start..meta.end); + reconstruct_state.update_key( + &meta.meta.key, + self.lsn, + Value::Image(img_buf), + ); + } + } + Err(err) => { + let kind = err.kind(); + for (_, blob_meta) in read.blobs_at.as_slice() { + reconstruct_state.on_key_error( + blob_meta.key, + PageReconstructError::from(anyhow!( + "Failed to read blobs from virtual file {}: {}", + self.file.path, + kind + )), + ); + } + } + }; + } + } } /// A builder object for constructing a new image layer. @@ -528,9 +656,11 @@ impl ImageLayerWriterInner { /// /// The page versions must be appended in blknum order. 
/// - async fn put_image(&mut self, key: Key, img: &[u8]) -> anyhow::Result<()> { + async fn put_image(&mut self, key: Key, img: Bytes) -> anyhow::Result<()> { ensure!(self.key_range.contains(&key)); - let off = self.blob_writer.write_blob(img).await?; + let (_img, res) = self.blob_writer.write_blob(img).await; + // TODO: re-use the buffer for `img` further upstack + let off = res?; let mut keybuf: [u8; KEY_SIZE] = [0u8; KEY_SIZE]; key.write_to_byte_slice(&mut keybuf); @@ -553,7 +683,8 @@ impl ImageLayerWriterInner { .await?; let (index_root_blk, block_buf) = self.tree.finish()?; for buf in block_buf.blocks { - file.write_all(buf.as_ref()).await?; + let (_buf, res) = file.write_all(buf).await; + res?; } // Fill in the summary on blk 0 @@ -568,17 +699,12 @@ impl ImageLayerWriterInner { index_root_blk, }; - let mut buf = smallvec::SmallVec::<[u8; PAGE_SZ]>::new(); + let mut buf = Vec::with_capacity(PAGE_SZ); + // TODO: could use smallvec here but it's a pain with Slice Summary::ser_into(&summary, &mut buf)?; - if buf.spilled() { - // This is bad as we only have one free block for the summary - warn!( - "Used more than one page size for summary buffer: {}", - buf.len() - ); - } file.seek(SeekFrom::Start(0)).await?; - file.write_all(&buf).await?; + let (_buf, res) = file.write_all(buf).await; + res?; let metadata = file .metadata() @@ -659,7 +785,7 @@ impl ImageLayerWriter { /// /// The page versions must be appended in blknum order. 
/// - pub async fn put_image(&mut self, key: Key, img: &[u8]) -> anyhow::Result<()> { + pub async fn put_image(&mut self, key: Key, img: Bytes) -> anyhow::Result<()> { self.inner.as_mut().unwrap().put_image(key, img).await } diff --git a/pageserver/src/tenant/storage_layer/inmemory_layer.rs b/pageserver/src/tenant/storage_layer/inmemory_layer.rs index 7c9103eea8..5f1db21d49 100644 --- a/pageserver/src/tenant/storage_layer/inmemory_layer.rs +++ b/pageserver/src/tenant/storage_layer/inmemory_layer.rs @@ -9,13 +9,15 @@ use crate::context::{PageContentKind, RequestContext, RequestContextBuilder}; use crate::repository::{Key, Value}; use crate::tenant::block_io::BlockReader; use crate::tenant::ephemeral_file::EphemeralFile; -use crate::tenant::storage_layer::{ValueReconstructResult, ValueReconstructState}; -use crate::tenant::Timeline; +use crate::tenant::storage_layer::ValueReconstructResult; +use crate::tenant::timeline::GetVectoredError; +use crate::tenant::{PageReconstructError, Timeline}; use crate::walrecord; -use anyhow::{ensure, Result}; +use anyhow::{anyhow, ensure, Result}; +use pageserver_api::keyspace::KeySpace; use pageserver_api::models::InMemoryLayerInfo; use pageserver_api::shard::TenantShardId; -use std::collections::HashMap; +use std::collections::{BinaryHeap, HashMap, HashSet}; use std::sync::{Arc, OnceLock}; use tracing::*; use utils::{bin_ser::BeSer, id::TimelineId, lsn::Lsn, vec_map::VecMap}; @@ -25,7 +27,10 @@ use std::fmt::Write as _; use std::ops::Range; use tokio::sync::{RwLock, RwLockWriteGuard}; -use super::{DeltaLayerWriter, ResidentLayer}; +use super::{ + DeltaLayerWriter, ResidentLayer, ValueReconstructSituation, ValueReconstructState, + ValuesReconstructState, +}; pub struct InMemoryLayer { conf: &'static PageServerConf, @@ -202,6 +207,91 @@ impl InMemoryLayer { Ok(ValueReconstructResult::Complete) } } + + // Look up the keys in the provided keyspace and update + // the reconstruct state with whatever is found. 
+ // + // If the key is cached, go no further than the cached Lsn. + pub(crate) async fn get_values_reconstruct_data( + &self, + keyspace: KeySpace, + end_lsn: Lsn, + reconstruct_state: &mut ValuesReconstructState, + ctx: &RequestContext, + ) -> Result<(), GetVectoredError> { + let ctx = RequestContextBuilder::extend(ctx) + .page_content_kind(PageContentKind::InMemoryLayer) + .build(); + + let inner = self.inner.read().await; + let reader = inner.file.block_cursor(); + + #[derive(Eq, PartialEq, Ord, PartialOrd)] + struct BlockRead { + key: Key, + lsn: Lsn, + block_offset: u64, + } + + let mut planned_block_reads = BinaryHeap::new(); + + for range in keyspace.ranges.iter() { + let mut key = range.start; + while key < range.end { + if let Some(vec_map) = inner.index.get(&key) { + let lsn_range = match reconstruct_state.get_cached_lsn(&key) { + Some(cached_lsn) => (cached_lsn + 1)..end_lsn, + None => self.start_lsn..end_lsn, + }; + + let slice = vec_map.slice_range(lsn_range); + for (entry_lsn, pos) in slice.iter().rev() { + planned_block_reads.push(BlockRead { + key, + lsn: *entry_lsn, + block_offset: *pos, + }); + } + } + + key = key.next(); + } + } + + let keyspace_size = keyspace.total_size(); + + let mut completed_keys = HashSet::new(); + while completed_keys.len() < keyspace_size && !planned_block_reads.is_empty() { + let block_read = planned_block_reads.pop().unwrap(); + if completed_keys.contains(&block_read.key) { + continue; + } + + let buf = reader.read_blob(block_read.block_offset, &ctx).await; + if let Err(e) = buf { + reconstruct_state + .on_key_error(block_read.key, PageReconstructError::from(anyhow!(e))); + completed_keys.insert(block_read.key); + continue; + } + + let value = Value::des(&buf.unwrap()); + if let Err(e) = value { + reconstruct_state + .on_key_error(block_read.key, PageReconstructError::from(anyhow!(e))); + completed_keys.insert(block_read.key); + continue; + } + + let key_situation = + reconstruct_state.update_key(&block_read.key, 
block_read.lsn, value.unwrap()); + if key_situation == ValueReconstructSituation::Complete { + completed_keys.insert(block_read.key); + } + } + + Ok(()) + } } impl std::fmt::Display for InMemoryLayer { @@ -246,32 +336,17 @@ impl InMemoryLayer { /// Common subroutine of the public put_wal_record() and put_page_image() functions. /// Adds the page version to the in-memory tree + pub(crate) async fn put_value( &self, key: Key, lsn: Lsn, - val: &Value, + buf: &[u8], ctx: &RequestContext, ) -> Result<()> { let mut inner = self.inner.write().await; self.assert_writable(); - self.put_value_locked(&mut inner, key, lsn, val, ctx).await - } - - pub(crate) async fn put_values( - &self, - values: &HashMap>, - ctx: &RequestContext, - ) -> Result<()> { - let mut inner = self.inner.write().await; - self.assert_writable(); - for (key, vals) in values { - for (lsn, val) in vals { - self.put_value_locked(&mut inner, *key, *lsn, val, ctx) - .await?; - } - } - Ok(()) + self.put_value_locked(&mut inner, key, lsn, buf, ctx).await } async fn put_value_locked( @@ -279,22 +354,16 @@ impl InMemoryLayer { locked_inner: &mut RwLockWriteGuard<'_, InMemoryLayerInner>, key: Key, lsn: Lsn, - val: &Value, + buf: &[u8], ctx: &RequestContext, ) -> Result<()> { trace!("put_value key {} at {}/{}", key, self.timeline_id, lsn); let off = { - // Avoid doing allocations for "small" values. 
- // In the regression test suite, the limit of 256 avoided allocations in 95% of cases: - // https://github.com/neondatabase/neon/pull/5056#discussion_r1301975061 - let mut buf = smallvec::SmallVec::<[u8; 256]>::new(); - buf.clear(); - val.ser_into(&mut buf)?; locked_inner .file .write_blob( - &buf, + buf, &RequestContextBuilder::extend(ctx) .page_content_kind(PageContentKind::InMemoryLayer) .build(), @@ -322,7 +391,12 @@ impl InMemoryLayer { pub async fn freeze(&self, end_lsn: Lsn) { let inner = self.inner.write().await; - assert!(self.start_lsn < end_lsn); + assert!( + self.start_lsn < end_lsn, + "{} >= {}", + self.start_lsn, + end_lsn + ); self.end_lsn.set(end_lsn).expect("end_lsn set only once"); for vec_map in inner.index.values() { @@ -383,9 +457,11 @@ impl InMemoryLayer { for (lsn, pos) in vec_map.as_slice() { cursor.read_blob_into_buf(*pos, &mut buf, &ctx).await?; let will_init = Value::des(&buf)?.will_init(); - delta_layer_writer - .put_value_bytes(key, *lsn, &buf, will_init) - .await?; + let res; + (buf, res) = delta_layer_writer + .put_value_bytes(key, *lsn, buf, will_init) + .await; + res?; } } diff --git a/pageserver/src/tenant/storage_layer/layer.rs b/pageserver/src/tenant/storage_layer/layer.rs index 12af866810..959065bc4c 100644 --- a/pageserver/src/tenant/storage_layer/layer.rs +++ b/pageserver/src/tenant/storage_layer/layer.rs @@ -1,5 +1,6 @@ use anyhow::Context; use camino::{Utf8Path, Utf8PathBuf}; +use pageserver_api::keyspace::KeySpace; use pageserver_api::models::{ HistoricLayerInfo, LayerAccessKind, LayerResidenceEventReason, LayerResidenceStatus, }; @@ -7,7 +8,7 @@ use pageserver_api::shard::ShardIndex; use std::ops::Range; use std::sync::atomic::{AtomicBool, AtomicUsize, Ordering}; use std::sync::{Arc, Weak}; -use std::time::SystemTime; +use std::time::{Duration, SystemTime}; use tracing::Instrument; use utils::lsn::Lsn; use utils::sync::heavier_once_cell; @@ -15,17 +16,22 @@ use utils::sync::heavier_once_cell; use 
crate::config::PageServerConf; use crate::context::RequestContext; use crate::repository::Key; +use crate::span::debug_assert_current_span_has_tenant_and_timeline_id; +use crate::tenant::timeline::GetVectoredError; use crate::tenant::{remote_timeline_client::LayerFileMetadata, Timeline}; use super::delta_layer::{self, DeltaEntry}; use super::image_layer; use super::{ AsLayerDesc, LayerAccessStats, LayerAccessStatsReset, LayerFileName, PersistentLayerDesc, - ValueReconstructResult, ValueReconstructState, + ValueReconstructResult, ValueReconstructState, ValuesReconstructState, }; use utils::generation::Generation; +#[cfg(test)] +mod tests; + /// A Layer contains all data in a "rectangle" consisting of a range of keys and /// range of LSNs. /// @@ -189,6 +195,7 @@ impl Layer { let downloaded = resident.expect("just initialized"); // if the rename works, the path is as expected + // TODO: sync system call std::fs::rename(temp_path, owner.local_path()) .with_context(|| format!("rename temporary file as correct path for {owner}"))?; @@ -202,10 +209,15 @@ impl Layer { /// If for a bad luck or blocking of the executor, we miss the actual eviction and the layer is /// re-downloaded, [`EvictionError::Downloaded`] is returned. /// + /// Timeout is mandatory, because waiting for eviction is only needed for our tests; eviction + /// will happen regardless the future returned by this method completing unless there is a + /// read access (currently including [`Layer::keep_resident`]) before eviction gets to + /// complete. + /// /// Technically cancellation safe, but cancelling might shift the viewpoint of what generation /// of download-evict cycle on retry. 
- pub(crate) async fn evict_and_wait(&self) -> Result<(), EvictionError> { - self.0.evict_and_wait().await + pub(crate) async fn evict_and_wait(&self, timeout: Duration) -> Result<(), EvictionError> { + self.0.evict_and_wait(timeout).await } /// Delete the layer file when the `self` gets dropped, also try to schedule a remote index upload @@ -261,6 +273,29 @@ impl Layer { .with_context(|| format!("get_value_reconstruct_data for layer {self}")) } + pub(crate) async fn get_values_reconstruct_data( + &self, + keyspace: KeySpace, + lsn_range: Range, + reconstruct_data: &mut ValuesReconstructState, + ctx: &RequestContext, + ) -> Result<(), GetVectoredError> { + let layer = self + .0 + .get_or_maybe_download(true, Some(ctx)) + .await + .map_err(|err| GetVectoredError::Other(anyhow::anyhow!(err)))?; + + self.0 + .access_stats + .record_access(LayerAccessKind::GetValueReconstructData, ctx); + + layer + .get_values_reconstruct_data(keyspace, lsn_range, reconstruct_data, &self.0, ctx) + .instrument(tracing::debug_span!("get_values_reconstruct_data", layer=%self)) + .await + } + /// Download the layer if evicted. /// /// Will not error when the layer is already downloaded. @@ -334,7 +369,7 @@ impl Layer { /// /// Does not start local deletion, use [`Self::delete_on_drop`] for that /// separatedly. - #[cfg(feature = "testing")] + #[cfg(any(feature = "testing", test))] pub(crate) fn wait_drop(&self) -> impl std::future::Future + 'static { let mut rx = self.0.status.subscribe(); @@ -501,6 +536,18 @@ impl Drop for LayerInner { // carry this until we are finished for [`Layer::wait_drop`] support let _status = status; + let Some(timeline) = timeline.upgrade() else { + // no need to nag that timeline is gone: under normal situation on + // task_mgr::remove_tenant_from_memory the timeline is gone before we get dropped. 
+ LAYER_IMPL_METRICS.inc_deletes_failed(DeleteFailed::TimelineGone); + return; + }; + + let Ok(_guard) = timeline.gate.enter() else { + LAYER_IMPL_METRICS.inc_deletes_failed(DeleteFailed::TimelineGone); + return; + }; + let removed = match std::fs::remove_file(path) { Ok(()) => true, Err(e) if e.kind() == std::io::ErrorKind::NotFound => { @@ -519,32 +566,26 @@ impl Drop for LayerInner { } }; - if let Some(timeline) = timeline.upgrade() { - if removed { - timeline.metrics.resident_physical_size_sub(file_size); - } - if let Some(remote_client) = timeline.remote_client.as_ref() { - let res = remote_client.schedule_deletion_of_unlinked(vec![(file_name, meta)]); + if removed { + timeline.metrics.resident_physical_size_sub(file_size); + } + if let Some(remote_client) = timeline.remote_client.as_ref() { + let res = remote_client.schedule_deletion_of_unlinked(vec![(file_name, meta)]); - if let Err(e) = res { - // test_timeline_deletion_with_files_stuck_in_upload_queue is good at - // demonstrating this deadlock (without spawn_blocking): stop will drop - // queued items, which will have ResidentLayer's, and those drops would try - // to re-entrantly lock the RemoteTimelineClient inner state. - if !timeline.is_active() { - tracing::info!("scheduling deletion on drop failed: {e:#}"); - } else { - tracing::warn!("scheduling deletion on drop failed: {e:#}"); - } - LAYER_IMPL_METRICS.inc_deletes_failed(DeleteFailed::DeleteSchedulingFailed); + if let Err(e) = res { + // test_timeline_deletion_with_files_stuck_in_upload_queue is good at + // demonstrating this deadlock (without spawn_blocking): stop will drop + // queued items, which will have ResidentLayer's, and those drops would try + // to re-entrantly lock the RemoteTimelineClient inner state. 
+ if !timeline.is_active() { + tracing::info!("scheduling deletion on drop failed: {e:#}"); } else { - LAYER_IMPL_METRICS.inc_completed_deletes(); + tracing::warn!("scheduling deletion on drop failed: {e:#}"); } + LAYER_IMPL_METRICS.inc_deletes_failed(DeleteFailed::DeleteSchedulingFailed); + } else { + LAYER_IMPL_METRICS.inc_completed_deletes(); } - } else { - // no need to nag that timeline is gone: under normal situation on - // task_mgr::remove_tenant_from_memory the timeline is gone before we get dropped. - LAYER_IMPL_METRICS.inc_deletes_failed(DeleteFailed::TimelineGone); } }); } @@ -603,7 +644,7 @@ impl LayerInner { /// Cancellation safe, however dropping the future and calling this method again might result /// in a new attempt to evict OR join the previously started attempt. - pub(crate) async fn evict_and_wait(&self) -> Result<(), EvictionError> { + pub(crate) async fn evict_and_wait(&self, timeout: Duration) -> Result<(), EvictionError> { use tokio::sync::broadcast::error::RecvError; assert!(self.have_remote_client); @@ -623,16 +664,22 @@ impl LayerInner { if strong.is_some() { // drop the DownloadedLayer outside of the holding the guard drop(strong); + + // idea here is that only one evicter should ever get to witness a strong reference, + // which means whenever get_or_maybe_download upgrades a weak, it must mark up a + // cancelled eviction and signal us, like it currently does. + // + // a second concurrent evict_and_wait will not see a strong reference. 
LAYER_IMPL_METRICS.inc_started_evictions(); } - match rx.recv().await { - Ok(Status::Evicted) => Ok(()), - Ok(Status::Downloaded) => Err(EvictionError::Downloaded), - Err(RecvError::Closed) => { + match tokio::time::timeout(timeout, rx.recv()).await { + Ok(Ok(Status::Evicted)) => Ok(()), + Ok(Ok(Status::Downloaded)) => Err(EvictionError::Downloaded), + Ok(Err(RecvError::Closed)) => { unreachable!("sender cannot be dropped while we are in &self method") } - Err(RecvError::Lagged(_)) => { + Ok(Err(RecvError::Lagged(_))) => { // this is quite unlikely, but we are blocking a lot in the async context, so // we might be missing this because we are stuck on a LIFO slot on a thread // which is busy blocking for a 1TB database create_image_layers. @@ -645,6 +692,7 @@ impl LayerInner { None => Ok(()), } } + Err(_timeout) => Err(EvictionError::Timeout), } } @@ -836,23 +884,20 @@ impl LayerInner { timeline: Arc, permit: heavier_once_cell::InitPermit, ) -> Result { - let task_name = format!("download layer {}", self); + debug_assert_current_span_has_tenant_and_timeline_id(); let (tx, rx) = tokio::sync::oneshot::channel(); - // this is sadly needed because of task_mgr::shutdown_tasks, otherwise we cannot - // block tenant::mgr::remove_tenant_from_memory. - let this: Arc = self.clone(); - crate::task_mgr::spawn( - &tokio::runtime::Handle::current(), - crate::task_mgr::TaskKind::RemoteDownloadTask, - Some(self.desc.tenant_shard_id), - Some(self.desc.timeline_id), - &task_name, - false, - async move { + let guard = timeline + .gate + .enter() + .map_err(|_| DownloadError::DownloadCancelled)?; + + tokio::task::spawn(async move { + + let _guard = guard; let client = timeline .remote_client @@ -862,7 +907,7 @@ impl LayerInner { let result = client.download_layer_file( &this.desc.filename(), &this.metadata(), - &crate::task_mgr::shutdown_token() + &timeline.cancel ) .await; @@ -885,7 +930,6 @@ impl LayerInner { tokio::select! 
{ _ = tokio::time::sleep(backoff) => {}, - _ = crate::task_mgr::shutdown_token().cancelled_owned() => {}, _ = timeline.cancel.cancelled() => {}, }; @@ -915,11 +959,10 @@ impl LayerInner { } } } - - Ok(()) } .in_current_span(), ); + match rx.await { Ok((Ok(()), permit)) => { if let Some(reason) = self @@ -932,7 +975,7 @@ impl LayerInner { } self.consecutive_failures.store(0, Ordering::Relaxed); - tracing::info!("on-demand download successful"); + tracing::info!(size=%self.desc.file_size, "on-demand download successful"); Ok(permit) } @@ -1021,16 +1064,10 @@ impl LayerInner { /// `DownloadedLayer` is being dropped, so it calls this method. fn on_downloaded_layer_drop(self: Arc, version: usize) { - let delete = self.wanted_deleted.load(Ordering::Acquire); let evict = self.wanted_evicted.load(Ordering::Acquire); let can_evict = self.have_remote_client; - if delete { - // do nothing now, only in LayerInner::drop -- this was originally implemented because - // we could had already scheduled the deletion at the time. - // - // FIXME: this is not true anymore, we can safely evict wanted deleted files. - } else if can_evict && evict { + if can_evict && evict { let span = tracing::info_span!(parent: None, "layer_evict", tenant_id = %self.desc.tenant_shard_id.tenant_id, shard_id = %self.desc.tenant_shard_id.shard_slug(), timeline_id = %self.desc.timeline_id, layer=%self, %version); // downgrade for queueing, in case there's a tear down already ongoing we should not @@ -1064,6 +1101,10 @@ impl LayerInner { return Err(EvictionCancelled::TimelineGone); }; + let Ok(_gate) = timeline.gate.enter() else { + return Err(EvictionCancelled::TimelineGone); + }; + // to avoid starting a new download while we evict, keep holding on to the // permit. let _permit = { @@ -1170,11 +1211,14 @@ pub(crate) enum EvictionError { /// Evictions must always lose to downloads in races, and this time it happened. 
#[error("layer was downloaded instead")] Downloaded, + + #[error("eviction did not happen within timeout")] + Timeout, } /// Error internal to the [`LayerInner::get_or_maybe_download`] #[derive(Debug, thiserror::Error)] -enum DownloadError { +pub(crate) enum DownloadError { #[error("timeline has already shutdown")] TimelineShutdown, #[error("no remote storage configured")] @@ -1271,9 +1315,14 @@ impl DownloadedLayer { owner.desc.key_range.clone(), owner.desc.lsn_range.clone(), )); - delta_layer::DeltaLayerInner::load(&owner.path, summary, ctx) - .await - .map(|res| res.map(LayerKind::Delta)) + delta_layer::DeltaLayerInner::load( + &owner.path, + summary, + Some(owner.conf.max_vectored_read_bytes), + ctx, + ) + .await + .map(|res| res.map(LayerKind::Delta)) } else { let lsn = owner.desc.image_layer_lsn(); let summary = Some(image_layer::Summary::expected( @@ -1282,9 +1331,15 @@ impl DownloadedLayer { owner.desc.key_range.clone(), lsn, )); - image_layer::ImageLayerInner::load(&owner.path, lsn, summary, ctx) - .await - .map(|res| res.map(LayerKind::Image)) + image_layer::ImageLayerInner::load( + &owner.path, + lsn, + summary, + Some(owner.conf.max_vectored_read_bytes), + ctx, + ) + .await + .map(|res| res.map(LayerKind::Image)) }; match res { @@ -1334,6 +1389,28 @@ impl DownloadedLayer { } } + async fn get_values_reconstruct_data( + &self, + keyspace: KeySpace, + lsn_range: Range, + reconstruct_data: &mut ValuesReconstructState, + owner: &Arc, + ctx: &RequestContext, + ) -> Result<(), GetVectoredError> { + use LayerKind::*; + + match self.get(owner, ctx).await.map_err(GetVectoredError::from)? { + Delta(d) => { + d.get_values_reconstruct_data(keyspace, lsn_range, reconstruct_data, ctx) + .await + } + Image(i) => { + i.get_values_reconstruct_data(keyspace, reconstruct_data, ctx) + .await + } + } + } + async fn dump(&self, owner: &Arc, ctx: &RequestContext) -> anyhow::Result<()> { use LayerKind::*; match self.get(owner, ctx).await? 
{ @@ -1410,10 +1487,6 @@ impl ResidentLayer { &self.owner.0.path } - pub(crate) fn access_stats(&self) -> &LayerAccessStats { - self.owner.access_stats() - } - pub(crate) fn metadata(&self) -> LayerFileMetadata { self.owner.metadata() } diff --git a/pageserver/src/tenant/storage_layer/layer/tests.rs b/pageserver/src/tenant/storage_layer/layer/tests.rs new file mode 100644 index 0000000000..b43534efd4 --- /dev/null +++ b/pageserver/src/tenant/storage_layer/layer/tests.rs @@ -0,0 +1,475 @@ +use futures::StreamExt; +use pageserver_api::key::CONTROLFILE_KEY; +use tokio::task::JoinSet; +use tracing::Instrument; +use utils::{ + completion::{self, Completion}, + id::TimelineId, +}; + +use super::*; +use crate::{context::DownloadBehavior, task_mgr::BACKGROUND_RUNTIME}; +use crate::{task_mgr::TaskKind, tenant::harness::TenantHarness}; + +/// Used in tests to advance a future to wanted await point, and not further. +const ADVANCE: std::time::Duration = std::time::Duration::from_secs(3600); + +/// Used in tests to indicate forever long timeout; has to be longer than the amount of ADVANCE +/// timeout uses to advance futures. +const FOREVER: std::time::Duration = std::time::Duration::from_secs(ADVANCE.as_secs() * 24 * 7); + +/// Demonstrate the API and resident -> evicted -> resident -> deleted transitions.
+#[tokio::test] +async fn smoke_test() { + let handle = BACKGROUND_RUNTIME.handle(); + + let h = TenantHarness::create("smoke_test").unwrap(); + let span = h.span(); + let download_span = span.in_scope(|| tracing::info_span!("downloading", timeline_id = 1)); + let (tenant, _) = h.load().await; + + let ctx = RequestContext::new(TaskKind::UnitTest, DownloadBehavior::Download); + + let timeline = tenant + .create_test_timeline(TimelineId::generate(), Lsn(0x10), 14, &ctx) + .await + .unwrap(); + + let layer = { + let mut layers = { + let layers = timeline.layers.read().await; + layers.resident_layers().collect::>().await + }; + + assert_eq!(layers.len(), 1); + + layers.swap_remove(0) + }; + + // all layers created at pageserver are like `layer`, initialized with strong + // Arc. + + let img_before = { + let mut data = ValueReconstructState::default(); + layer + .get_value_reconstruct_data(CONTROLFILE_KEY, Lsn(0x10)..Lsn(0x11), &mut data, &ctx) + .await + .unwrap(); + data.img + .take() + .expect("tenant harness writes the control file") + }; + + // important part is evicting the layer, which can be done when there are no more ResidentLayer + // instances -- there currently are none, only two `Layer` values, one in the layermap and on + // in scope. + layer.evict_and_wait(FOREVER).await.unwrap(); + + // double-evict returns an error, which is valid if both eviction_task and disk usage based + // eviction would both evict the same layer at the same time. + + let e = layer.evict_and_wait(FOREVER).await.unwrap_err(); + assert!(matches!(e, EvictionError::NotFound)); + + // on accesses when the layer is evicted, it will automatically be downloaded. 
+ let img_after = { + let mut data = ValueReconstructState::default(); + layer + .get_value_reconstruct_data(CONTROLFILE_KEY, Lsn(0x10)..Lsn(0x11), &mut data, &ctx) + .instrument(download_span.clone()) + .await + .unwrap(); + data.img.take().unwrap() + }; + + assert_eq!(img_before, img_after); + + // evict_and_wait can time out, but it doesn't cancel the evicting itself + // + // ZERO for timeout does not work reliably, so first take up all spawn_blocking slots to + // artificially slow it down. + let helper = SpawnBlockingPoolHelper::consume_all_spawn_blocking_threads(handle).await; + + match layer + .evict_and_wait(std::time::Duration::ZERO) + .await + .unwrap_err() + { + EvictionError::Timeout => { + // expected, but note that the eviction is "still ongoing" + helper.release().await; + // exhaust spawn_blocking pool to ensure it is now complete + SpawnBlockingPoolHelper::consume_and_release_all_of_spawn_blocking_threads(handle) + .await; + } + other => unreachable!("{other:?}"), + } + + // only way to query if a layer is resident is to acquire a ResidentLayer instance. + // Layer::keep_resident never downloads, but it might initialize if the layer file is found + // downloaded locally. + let none = layer.keep_resident().await.unwrap(); + assert!( + none.is_none(), + "Expected none, because eviction removed the local file, found: {none:?}" + ); + + // plain downloading is rarely needed + layer + .download_and_keep_resident() + .instrument(download_span) + .await + .unwrap(); + + // last important part is deletion on drop: gc and compaction use it for compacted L0 layers + // or fully garbage collected layers. deletion means deleting the local file, and scheduling a + // deletion of the already unlinked from index_part.json remote file. + // + // marking a layer to be deleted on drop is irreversible; there is no technical reason against + // reversibility, but currently it is not needed so it is not provided.
+ layer.delete_on_drop(); + + let path = layer.local_path().to_owned(); + + // wait_drop produces a future not connected to the Layer, which will resolve when the + // LayerInner::drop has completed. + let mut wait_drop = std::pin::pin!(layer.wait_drop()); + + // paused time doesn't really work well with timeouts and evict_and_wait, so delay pausing + // until here + tokio::time::pause(); + tokio::time::timeout(ADVANCE, &mut wait_drop) + .await + .expect_err("should had timed out because two strong references exist"); + + tokio::fs::metadata(&path) + .await + .expect("the local layer file still exists"); + + let rtc = timeline.remote_client.as_ref().unwrap(); + + { + let layers = &[layer]; + let mut g = timeline.layers.write().await; + g.finish_gc_timeline(layers); + // this just updates the remote_physical_size for demonstration purposes + rtc.schedule_gc_update(layers).unwrap(); + } + + // when strong references are dropped, the file is deleted and remote deletion is scheduled + wait_drop.await; + + let e = tokio::fs::metadata(&path) + .await + .expect_err("the local file is deleted"); + assert_eq!(e.kind(), std::io::ErrorKind::NotFound); + + rtc.wait_completion().await.unwrap(); + + assert_eq!(rtc.get_remote_physical_size(), 0); +} + +/// This test demonstrates a previous hang when an eviction and deletion were requested at the same +/// time. Now both of them complete per Arc drop semantics.
+#[tokio::test(start_paused = true)] +async fn evict_and_wait_on_wanted_deleted() { + // this is the runtime on which Layer spawns the blocking tasks on + let handle = BACKGROUND_RUNTIME.handle(); + + let h = TenantHarness::create("evict_and_wait_on_wanted_deleted").unwrap(); + utils::logging::replace_panic_hook_with_tracing_panic_hook().forget(); + let (tenant, ctx) = h.load().await; + + let timeline = tenant + .create_test_timeline(TimelineId::generate(), Lsn(0x10), 14, &ctx) + .await + .unwrap(); + + let layer = { + let mut layers = { + let layers = timeline.layers.read().await; + layers.resident_layers().collect::>().await + }; + + assert_eq!(layers.len(), 1); + + layers.swap_remove(0) + }; + + // setup done + + let resident = layer.keep_resident().await.unwrap(); + + { + let mut evict_and_wait = std::pin::pin!(layer.evict_and_wait(FOREVER)); + + // drive the future to await on the status channel + tokio::time::timeout(ADVANCE, &mut evict_and_wait) + .await + .expect_err("should had been a timeout since we are holding the layer resident"); + + layer.delete_on_drop(); + + drop(resident); + + // make sure the eviction task gets to run + SpawnBlockingPoolHelper::consume_and_release_all_of_spawn_blocking_threads(handle).await; + + let resident = layer.keep_resident().await; + assert!( + matches!(resident, Ok(None)), + "keep_resident should not have re-initialized: {resident:?}" + ); + + evict_and_wait + .await + .expect("evict_and_wait should had succeeded"); + + // works as intended + } + + // assert that once we remove the `layer` from the layer map and drop our reference, + // the deletion of the layer in remote_storage happens. 
+ { + let mut layers = timeline.layers.write().await; + layers.finish_gc_timeline(&[layer]); + } + + SpawnBlockingPoolHelper::consume_and_release_all_of_spawn_blocking_threads(handle).await; + + assert_eq!(1, LAYER_IMPL_METRICS.started_deletes.get()); + assert_eq!(1, LAYER_IMPL_METRICS.completed_deletes.get()); + assert_eq!(1, LAYER_IMPL_METRICS.started_evictions.get()); + assert_eq!(1, LAYER_IMPL_METRICS.completed_evictions.get()); +} + +/// This test ensures we are able to read the layer while the layer eviction has been +/// started but not completed due to spawn_blocking pool being blocked. +/// +/// Here `Layer::keep_resident` is used to "simulate" reads, because it cannot download. +#[tokio::test(start_paused = true)] +async fn residency_check_while_evict_and_wait_on_clogged_spawn_blocking() { + // this is the runtime on which Layer spawns the blocking tasks + let handle = BACKGROUND_RUNTIME.handle(); + let h = TenantHarness::create("residency_check_while_evict_and_wait_on_clogged_spawn_blocking") + .unwrap(); + let (tenant, ctx) = h.load().await; + + let timeline = tenant + .create_test_timeline(TimelineId::generate(), Lsn(0x10), 14, &ctx) + .await + .unwrap(); + + let layer = { + let mut layers = { + let layers = timeline.layers.read().await; + layers.resident_layers().collect::>().await + }; + + assert_eq!(layers.len(), 1); + + layers.swap_remove(0) + }; + + // setup done + + let resident = layer.keep_resident().await.unwrap(); + + let mut evict_and_wait = std::pin::pin!(layer.evict_and_wait(FOREVER)); + + // drive the future to await on the status channel + tokio::time::timeout(ADVANCE, &mut evict_and_wait) + .await + .expect_err("should had been a timeout since we are holding the layer resident"); + assert_eq!(1, LAYER_IMPL_METRICS.started_evictions.get()); + + // clog up BACKGROUND_RUNTIME spawn_blocking + let helper = SpawnBlockingPoolHelper::consume_all_spawn_blocking_threads(handle).await; + + // now the eviction cannot proceed because
the threads are consumed while completion exists + drop(resident); + + // because no actual eviction happened, we get to just reinitialize the DownloadedLayer + layer + .keep_resident() + .await + .expect("keep_resident should had reinitialized without downloading") + .expect("ResidentLayer"); + + // because the keep_resident check alters wanted evicted without sending a message, we will + // never get completed + let e = tokio::time::timeout(ADVANCE, &mut evict_and_wait) + .await + .expect("no timeout, because keep_resident re-initialized") + .expect_err("eviction should not have succeeded because re-initialized"); + + // works as intended: evictions lose to "downloads" + assert!(matches!(e, EvictionError::Downloaded), "{e:?}"); + assert_eq!(0, LAYER_IMPL_METRICS.completed_evictions.get()); + + // this is not wrong: the eviction is technically still "on the way" as it's still queued + // because spawn_blocking is clogged up + assert_eq!( + 0, + LAYER_IMPL_METRICS + .cancelled_evictions + .values() + .map(|ctr| ctr.get()) + .sum::() + ); + + let mut second_eviction = std::pin::pin!(layer.evict_and_wait(FOREVER)); + + // advance to the wait on the queue + tokio::time::timeout(ADVANCE, &mut second_eviction) + .await + .expect_err("timeout because spawn_blocking is clogged"); + + // in this case we don't leak started evictions, but I think there is still a chance of that + // happening, because we could have upgrades race multiple evictions while only one of them + // happens? 
+ assert_eq!(2, LAYER_IMPL_METRICS.started_evictions.get()); + + helper.release().await; + + // the second_eviction gets to run here + // + // synchronize to be *strictly* after the second_eviction spawn_blocking run + SpawnBlockingPoolHelper::consume_and_release_all_of_spawn_blocking_threads(handle).await; + + tokio::time::timeout(ADVANCE, &mut second_eviction) + .await + .expect("eviction goes through now that spawn_blocking is unclogged") + .expect("eviction should succeed, because version matches"); + + assert_eq!(1, LAYER_IMPL_METRICS.completed_evictions.get()); + + // now we finally can observe the original spawn_blocking failing + // it would had been possible to observe it earlier, but here it is guaranteed to have + // happened. + assert_eq!( + 1, + LAYER_IMPL_METRICS + .cancelled_evictions + .values() + .map(|ctr| ctr.get()) + .sum::() + ); +} + +struct SpawnBlockingPoolHelper { + awaited_by_spawn_blocking_tasks: Completion, + blocking_tasks: JoinSet<()>, +} + +impl SpawnBlockingPoolHelper { + /// All `crate::task_mgr::BACKGROUND_RUNTIME` spawn_blocking threads will be consumed until + /// release is called. + /// + /// In the tests this can be used to ensure something cannot be started on the target runtimes + /// spawn_blocking pool. + /// + /// This should be no issue nowdays, because nextest runs each test in it's own process. 
+ async fn consume_all_spawn_blocking_threads(handle: &tokio::runtime::Handle) -> Self { + let (completion, barrier) = completion::channel(); + let (tx, mut rx) = tokio::sync::mpsc::channel(8); + + let assumed_max_blocking_threads = 512; + + let mut blocking_tasks = JoinSet::new(); + + for _ in 0..assumed_max_blocking_threads { + let barrier = barrier.clone(); + let tx = tx.clone(); + blocking_tasks.spawn_blocking_on( + move || { + tx.blocking_send(()).unwrap(); + drop(tx); + tokio::runtime::Handle::current().block_on(barrier.wait()); + }, + handle, + ); + } + + drop(barrier); + + for _ in 0..assumed_max_blocking_threads { + rx.recv().await.unwrap(); + } + + SpawnBlockingPoolHelper { + awaited_by_spawn_blocking_tasks: completion, + blocking_tasks, + } + } + + /// Release all previously blocked spawn_blocking threads + async fn release(self) { + let SpawnBlockingPoolHelper { + awaited_by_spawn_blocking_tasks, + mut blocking_tasks, + } = self; + + drop(awaited_by_spawn_blocking_tasks); + + while let Some(res) = blocking_tasks.join_next().await { + res.expect("none of the tasks should had panicked"); + } + } + + /// In the tests it is used as an easy way of making sure something scheduled on the target + /// runtimes `spawn_blocking` has completed, because it must've been scheduled and completed + /// before our tasks have a chance to schedule and complete. + async fn consume_and_release_all_of_spawn_blocking_threads(handle: &tokio::runtime::Handle) { + Self::consume_all_spawn_blocking_threads(handle) + .await + .release() + .await + } +} + +#[test] +fn spawn_blocking_pool_helper_actually_works() { + // create a custom runtime for which we know and control how many blocking threads it has + // + // because the amount is not configurable for our helper, expect the same amount as + // BACKGROUND_RUNTIME using the tokio defaults would have. 
+ let rt = tokio::runtime::Builder::new_current_thread() + .max_blocking_threads(512) + .enable_all() + .build() + .unwrap(); + + let handle = rt.handle(); + + rt.block_on(async move { + // this will not return until all threads are spun up and actually executing the code + // waiting on `consumed` to be `SpawnBlockingPoolHelper::release`'d. + let consumed = SpawnBlockingPoolHelper::consume_all_spawn_blocking_threads(handle).await; + + println!("consumed"); + + let mut jh = std::pin::pin!(tokio::task::spawn_blocking(move || { + // this will not get to run before we release + })); + + println!("spawned"); + + tokio::time::timeout(std::time::Duration::from_secs(1), &mut jh) + .await + .expect_err("the task should not have gotten to run yet"); + + println!("tried to join"); + + consumed.release().await; + + println!("released"); + + tokio::time::timeout(std::time::Duration::from_secs(1), jh) + .await + .expect("no timeout") + .expect("no join error"); + + println!("joined"); + }); +} diff --git a/pageserver/src/tenant/storage_layer/layer_desc.rs b/pageserver/src/tenant/storage_layer/layer_desc.rs index bf24407fc5..c375923e81 100644 --- a/pageserver/src/tenant/storage_layer/layer_desc.rs +++ b/pageserver/src/tenant/storage_layer/layer_desc.rs @@ -15,7 +15,7 @@ use utils::id::TenantId; /// A unique identifier of a persistent layer. This is different from `LayerDescriptor`, which is only used in the /// benchmarks. This struct contains all necessary information to find the image / delta layer. It also provides /// a unified way to generate layer information like file name. 
-#[derive(Debug, PartialEq, Eq, Clone, Serialize, Deserialize)] +#[derive(Debug, PartialEq, Eq, Clone, Serialize, Deserialize, Hash)] pub struct PersistentLayerDesc { pub tenant_shard_id: TenantShardId, pub timeline_id: TimelineId, @@ -55,13 +55,13 @@ impl PersistentLayerDesc { } #[cfg(test)] - pub fn new_test(key_range: Range) -> Self { + pub fn new_test(key_range: Range, lsn_range: Range, is_delta: bool) -> Self { Self { tenant_shard_id: TenantShardId::unsharded(TenantId::generate()), timeline_id: TimelineId::generate(), key_range, - lsn_range: Lsn(0)..Lsn(1), - is_delta: false, + lsn_range, + is_delta, file_size: 0, } } diff --git a/pageserver/src/tenant/tasks.rs b/pageserver/src/tenant/tasks.rs index 5f39c46a84..e4f5f75132 100644 --- a/pageserver/src/tenant/tasks.rs +++ b/pageserver/src/tenant/tasks.rs @@ -9,6 +9,7 @@ use crate::context::{DownloadBehavior, RequestContext}; use crate::metrics::TENANT_TASK_EVENTS; use crate::task_mgr; use crate::task_mgr::{TaskKind, BACKGROUND_RUNTIME}; +use crate::tenant::throttle::Stats; use crate::tenant::timeline::CompactionError; use crate::tenant::{Tenant, TenantState}; use tokio_util::sync::CancellationToken; @@ -100,6 +101,7 @@ pub fn start_background_loops( _ = completion::Barrier::maybe_wait(background_jobs_can_start) => {} }; compaction_loop(tenant, cancel) + // If you rename this span, change the RUST_LOG env variable in test_runner/performance/test_branch_creation.py .instrument(info_span!("compaction_loop", tenant_id = %tenant_shard_id.tenant_id, shard_id = %tenant_shard_id.shard_slug())) .await; Ok(()) @@ -139,6 +141,8 @@ async fn compaction_loop(tenant: Arc, cancel: CancellationToken) { // How many errors we have seen consequtively let mut error_run_count = 0; + let mut last_throttle_flag_reset_at = Instant::now(); + TENANT_TASK_EVENTS.with_label_values(&["start"]).inc(); async { let ctx = RequestContext::todo_child(TaskKind::Compaction, DownloadBehavior::Download); @@ -195,11 +199,38 @@ async fn 
compaction_loop(tenant: Arc, cancel: CancellationToken) { } }; - warn_when_period_overrun(started_at.elapsed(), period, BackgroundLoopKind::Compaction); + let elapsed = started_at.elapsed(); + warn_when_period_overrun(elapsed, period, BackgroundLoopKind::Compaction); + + // the duration is recorded by performance tests by enabling debug in this function + tracing::debug!(elapsed_ms=elapsed.as_millis(), "compaction iteration complete"); // Perhaps we did no work and the walredo process has been idle for some time: // give it a chance to shut down to avoid leaving walredo process running indefinitely. - tenant.walredo_mgr.maybe_quiesce(period * 10); + if let Some(walredo_mgr) = &tenant.walredo_mgr { + walredo_mgr.maybe_quiesce(period * 10); + } + + // TODO: move this (and walredo quiesce) to a separate task that isn't affected by the back-off, + // so we get some upper bound guarantee on when walredo quiesce / this throttling reporting here happens. + info_span!(parent: None, "timeline_get_throttle", tenant_id=%tenant.tenant_shard_id, shard_id=%tenant.tenant_shard_id.shard_slug()).in_scope(|| { + let now = Instant::now(); + let prev = std::mem::replace(&mut last_throttle_flag_reset_at, now); + let Stats { count_accounted, count_throttled, sum_throttled_usecs } = tenant.timeline_get_throttle.reset_stats(); + if count_throttled == 0 { + return; + } + let allowed_rps = tenant.timeline_get_throttle.steady_rps(); + let delta = now - prev; + info!( + n_seconds=%format_args!("{:.3}", + delta.as_secs_f64()), + count_accounted, + count_throttled, + sum_throttled_usecs, + allowed_rps=%format_args!("{allowed_rps:.0}"), + "shard was throttled in the last n_seconds") + }); // Sleep if tokio::time::timeout(sleep_duration, cancel.cancelled()) diff --git a/pageserver/src/tenant/throttle.rs b/pageserver/src/tenant/throttle.rs new file mode 100644 index 0000000000..280773e9c3 --- /dev/null +++ b/pageserver/src/tenant/throttle.rs @@ -0,0 +1,175 @@ +use std::{ + str::FromStr, + sync::{ 
+ atomic::{AtomicU64, Ordering}, + Arc, Mutex, + }, + time::{Duration, Instant}, +}; + +use arc_swap::ArcSwap; +use enumset::EnumSet; +use tracing::{error, warn}; + +use crate::{context::RequestContext, task_mgr::TaskKind}; + +/// Throttle for `async` functions. +/// +/// Runtime reconfigurable. +/// +/// To share a throttle among multiple entities, wrap it in an [`Arc`]. +/// +/// The intial use case for this is tenant-wide throttling of getpage@lsn requests. +pub struct Throttle { + inner: ArcSwap, + metric: M, + /// will be turned into [`Stats::count_accounted`] + count_accounted: AtomicU64, + /// will be turned into [`Stats::count_throttled`] + count_throttled: AtomicU64, + /// will be turned into [`Stats::sum_throttled_usecs`] + sum_throttled_usecs: AtomicU64, +} + +pub struct Inner { + task_kinds: EnumSet, + rate_limiter: Arc, + config: Config, +} + +pub type Config = pageserver_api::models::ThrottleConfig; + +pub struct Observation { + pub wait_time: Duration, +} +pub trait Metric { + fn observe_throttling(&self, observation: &Observation); +} + +/// See [`Throttle::reset_stats`]. +pub struct Stats { + // Number of requests that were subject to throttling, i.e., requests of the configured [`Config::task_kinds`]. + pub count_accounted: u64, + // Subset of the `accounted` requests that were actually throttled. + // Note that the numbers are stored as two independent atomics, so, there might be a slight drift. + pub count_throttled: u64, + // Sum of microseconds that throttled requests spent waiting for throttling. 
+ pub sum_throttled_usecs: u64, +} + +impl Throttle +where + M: Metric, +{ + pub fn new(config: Config, metric: M) -> Self { + Self { + inner: ArcSwap::new(Arc::new(Self::new_inner(config))), + metric, + count_accounted: AtomicU64::new(0), + count_throttled: AtomicU64::new(0), + sum_throttled_usecs: AtomicU64::new(0), + } + } + fn new_inner(config: Config) -> Inner { + let Config { + task_kinds, + initial, + refill_interval, + refill_amount, + max, + fair, + } = &config; + let task_kinds: EnumSet = task_kinds + .iter() + .filter_map(|s| match TaskKind::from_str(s) { + Ok(v) => Some(v), + Err(e) => { + // TODO: avoid this failure mode + error!( + "cannot parse task kind, ignoring for rate limiting {}", + utils::error::report_compact_sources(&e) + ); + None + } + }) + .collect(); + Inner { + task_kinds, + rate_limiter: Arc::new( + leaky_bucket::RateLimiter::builder() + .initial(*initial) + .interval(*refill_interval) + .refill(refill_amount.get()) + .max(*max) + .fair(*fair) + .build(), + ), + config, + } + } + pub fn reconfigure(&self, config: Config) { + self.inner.store(Arc::new(Self::new_inner(config))); + } + + /// The [`Throttle`] keeps an internal flag that is true if there was ever any actual throttling. + /// This method allows retrieving & resetting that flag. + /// Useful for periodic reporting. + pub fn reset_stats(&self) -> Stats { + let count_accounted = self.count_accounted.swap(0, Ordering::Relaxed); + let count_throttled = self.count_throttled.swap(0, Ordering::Relaxed); + let sum_throttled_usecs = self.sum_throttled_usecs.swap(0, Ordering::Relaxed); + Stats { + count_accounted, + count_throttled, + sum_throttled_usecs, + } + } + + /// See [`Config::steady_rps`]. 
+ pub fn steady_rps(&self) -> f64 { + self.inner.load().config.steady_rps() + } + + pub async fn throttle(&self, ctx: &RequestContext, key_count: usize) { + let inner = self.inner.load_full(); // clones the `Inner` Arc + if !inner.task_kinds.contains(ctx.task_kind()) { + return; + }; + let start = std::time::Instant::now(); + let mut did_throttle = false; + let acquire = inner.rate_limiter.acquire(key_count); + // turn off runtime-induced preemption (aka coop) so our `did_throttle` is accurate + let acquire = tokio::task::unconstrained(acquire); + let mut acquire = std::pin::pin!(acquire); + std::future::poll_fn(|cx| { + use std::future::Future; + let poll = acquire.as_mut().poll(cx); + did_throttle = did_throttle || poll.is_pending(); + poll + }) + .await; + self.count_accounted.fetch_add(1, Ordering::Relaxed); + if did_throttle { + self.count_throttled.fetch_add(1, Ordering::Relaxed); + let now = Instant::now(); + let wait_time = now - start; + self.sum_throttled_usecs + .fetch_add(wait_time.as_micros() as u64, Ordering::Relaxed); + let observation = Observation { wait_time }; + self.metric.observe_throttling(&observation); + match ctx.micros_spent_throttled.add(wait_time) { + Ok(res) => res, + Err(error) => { + use once_cell::sync::Lazy; + use utils::rate_limit::RateLimit; + static WARN_RATE_LIMIT: Lazy> = + Lazy::new(|| Mutex::new(RateLimit::new(Duration::from_secs(10)))); + let mut guard = WARN_RATE_LIMIT.lock().unwrap(); + guard.call(move || { + warn!(error, "error adding time spent throttled; this message is logged at a global rate limit"); + }); + } + } + } + } +} diff --git a/pageserver/src/tenant/timeline.rs b/pageserver/src/tenant/timeline.rs index 70c6ee2042..a733a3b1a7 100644 --- a/pageserver/src/tenant/timeline.rs +++ b/pageserver/src/tenant/timeline.rs @@ -1,3 +1,4 @@ +mod compaction; pub mod delete; mod eviction_task; mod init; @@ -9,15 +10,17 @@ mod walreceiver; use anyhow::{anyhow, bail, ensure, Context, Result}; use bytes::Bytes; -use 
camino::{Utf8Path, Utf8PathBuf}; +use camino::Utf8Path; use enumset::EnumSet; use fail::fail_point; -use itertools::Itertools; +use futures::stream::StreamExt; +use once_cell::sync::Lazy; use pageserver_api::{ - keyspace::{key_range_size, KeySpaceAccum}, + key::AUX_FILES_KEY, + keyspace::KeySpaceAccum, models::{ - DownloadRemoteLayersTaskInfo, DownloadRemoteLayersTaskSpawnRequest, EvictionPolicy, - LayerMapInfo, TimelineState, + CompactionAlgorithm, DownloadRemoteLayersTaskInfo, DownloadRemoteLayersTaskSpawnRequest, + EvictionPolicy, LayerMapInfo, TimelineState, }, reltag::BlockNumber, shard::{ShardIdentity, TenantShardId}, @@ -31,14 +34,21 @@ use tokio::{ }; use tokio_util::sync::CancellationToken; use tracing::*; -use utils::sync::gate::Gate; +use utils::{ + bin_ser::BeSer, + sync::gate::{Gate, GateGuard}, +}; -use std::collections::{BTreeMap, BinaryHeap, HashMap, HashSet}; use std::ops::{Deref, Range}; use std::pin::pin; use std::sync::atomic::Ordering as AtomicOrdering; use std::sync::{Arc, Mutex, RwLock, Weak}; use std::time::{Duration, Instant, SystemTime}; +use std::{ + array, + collections::{BTreeMap, HashMap, HashSet}, + sync::atomic::AtomicU64, +}; use std::{ cmp::{max, min, Ordering}, ops::ControlFlow, @@ -47,12 +57,12 @@ use std::{ use crate::tenant::timeline::logical_size::CurrentLogicalSize; use crate::tenant::{ layer_map::{LayerMap, SearchResult}, - metadata::{save_metadata, TimelineMetadata}, - par_fsync, + metadata::TimelineMetadata, }; use crate::{ - context::{AccessStatsBehavior, DownloadBehavior, RequestContext, RequestContextBuilder}, + context::{DownloadBehavior, RequestContext}, disk_usage_eviction_task::DiskUsageEvictionInfo, + pgdatadir_mapping::CollectKeySpaceError, }; use crate::{deletion_queue::DeletionQueueClient, tenant::remote_timeline_client::StopError}; use crate::{ @@ -60,16 +70,20 @@ use crate::{ tenant::storage_layer::{ AsLayerDesc, DeltaLayerWriter, EvictionError, ImageLayerWriter, InMemoryLayer, Layer, LayerAccessStatsReset, 
LayerFileName, ResidentLayer, ValueReconstructResult, - ValueReconstructState, + ValueReconstructState, ValuesReconstructState, }, }; use crate::{ disk_usage_eviction_task::EvictionCandidate, tenant::storage_layer::delta_layer::DeltaEntry, }; use crate::{pgdatadir_mapping::LsnForTimestamp, tenant::tasks::BackgroundLoopKind}; +use crate::{ + pgdatadir_mapping::{AuxFilesDirectory, DirectoryKind}, + virtual_file::{MaybeFatalIo, VirtualFile}, +}; use crate::config::PageServerConf; -use crate::keyspace::{KeyPartitioning, KeySpace, KeySpaceRandomAccum}; +use crate::keyspace::{KeyPartitioning, KeySpace}; use crate::metrics::{ TimelineMetrics, MATERIALIZED_PAGE_CACHE_HIT, MATERIALIZED_PAGE_CACHE_HIT_DIRECT, }; @@ -104,11 +118,11 @@ use self::layer_manager::LayerManager; use self::logical_size::LogicalSize; use self::walreceiver::{WalReceiver, WalReceiverConf}; -use super::config::TenantConf; -use super::remote_timeline_client::index::{IndexLayerMetadata, IndexPart}; use super::remote_timeline_client::RemoteTimelineClient; use super::secondary::heatmap::{HeatMapLayer, HeatMapTimeline}; +use super::{config::TenantConf, storage_layer::ReadableLayerDesc}; use super::{debug_assert_current_span_has_tenant_and_timeline_id, AttachedTenantConf}; +use super::{remote_timeline_client::index::IndexPart, storage_layer::LayerFringe}; #[derive(Debug, PartialEq, Eq, Clone, Copy)] pub(super) enum FlushLoopState { @@ -124,7 +138,7 @@ pub(super) enum FlushLoopState { /// Wrapper for key range to provide reverse ordering by range length for BinaryHeap #[derive(Debug, Clone, PartialEq, Eq)] -pub struct Hole { +pub(crate) struct Hole { key_range: Range, coverage_size: usize, } @@ -157,6 +171,14 @@ fn drop_wlock(rlock: tokio::sync::RwLockWriteGuard<'_, T>) { pub struct TimelineResources { pub remote_client: Option, pub deletion_queue_client: DeletionQueueClient, + pub timeline_get_throttle: Arc< + crate::tenant::throttle::Throttle<&'static crate::metrics::tenant_throttling::TimelineGet>, + >, +} 
+ +pub(crate) struct AuxFilesState { + pub(crate) dir: Option, + pub(crate) n_deltas: usize, } pub struct Timeline { @@ -200,23 +222,12 @@ pub struct Timeline { /// so that e.g. on-demand-download/eviction, and layer spreading, can operate just on `LayerFileManager`. pub(crate) layers: Arc>, - /// Set of key ranges which should be covered by image layers to - /// allow GC to remove old layers. This set is created by GC and its cutoff LSN is also stored. - /// It is used by compaction task when it checks if new image layer should be created. - /// Newly created image layer doesn't help to remove the delta layer, until the - /// newly created image layer falls off the PITR horizon. So on next GC cycle, - /// gc_timeline may still want the new image layer to be created. To avoid redundant - /// image layers creation we should check if image layer exists but beyond PITR horizon. - /// This is why we need remember GC cutoff LSN. - /// - wanted_image_layers: Mutex>, - last_freeze_at: AtomicLsn, // Atomic would be more appropriate here. last_freeze_ts: RwLock, - // WAL redo manager - walredo_mgr: Arc, + // WAL redo manager. `None` only for broken tenants. + walredo_mgr: Option>, /// Remote storage client. /// See [`remote_timeline_client`](super::remote_timeline_client) module comment for details. @@ -257,12 +268,14 @@ pub struct Timeline { // in `crate::page_service` writes these metrics. pub(crate) query_metrics: crate::metrics::SmgrQueryTimePerTimeline, + directory_metrics: [AtomicU64; DirectoryKind::KINDS_NUM], + /// Ensures layers aren't frozen by checkpointer between /// [`Timeline::get_layer_for_write`] and layer reads. /// Locked automatically by [`TimelineWriter`] and checkpointer. /// Must always be acquired before the layer map/individual layer lock /// to avoid deadlock. 
- write_lock: tokio::sync::Mutex<()>, + write_lock: tokio::sync::Mutex>, /// Used to avoid multiple `flush_loop` tasks running pub(super) flush_loop_state: Mutex, @@ -291,7 +304,7 @@ pub struct Timeline { pub initdb_lsn: Lsn, /// When did we last calculate the partitioning? - partitioning: Mutex<(KeyPartitioning, Lsn)>, + partitioning: tokio::sync::Mutex<(KeyPartitioning, Lsn)>, /// Configuration: how often should the partitioning be recalculated. repartition_threshold: u64, @@ -333,7 +346,7 @@ pub struct Timeline { /// /// Must only be taken in two places: /// - [`Timeline::compact`] (this file) - /// - [`delete::delete_local_layer_files`] + /// - [`delete::delete_local_timeline_directory`] /// /// Timeline deletion will acquire both compaction and gc locks in whatever order. compaction_lock: tokio::sync::Mutex<()>, @@ -342,10 +355,18 @@ pub struct Timeline { /// /// Must only be taken in two places: /// - [`Timeline::gc`] (this file) - /// - [`delete::delete_local_layer_files`] + /// - [`delete::delete_local_timeline_directory`] /// /// Timeline deletion will acquire both compaction and gc locks in whatever order. gc_lock: tokio::sync::Mutex<()>, + + /// Cloned from [`super::Tenant::timeline_get_throttle`] on construction. 
+ timeline_get_throttle: Arc< + crate::tenant::throttle::Throttle<&'static crate::metrics::tenant_throttling::TimelineGet>, + >, + + /// Keep aux directory cache to avoid it's reconstruction on each update + pub(crate) aux_files: tokio::sync::Mutex, } pub struct WalReceiverInfo { @@ -455,6 +476,30 @@ pub(crate) enum GetVectoredError { #[error("Requested at invalid LSN: {0}")] InvalidLsn(Lsn), + + #[error("Requested key {0} not found")] + MissingKey(Key), + + #[error(transparent)] + GetReadyAncestorError(GetReadyAncestorError), + + #[error(transparent)] + Other(#[from] anyhow::Error), +} + +#[derive(thiserror::Error, Debug)] +pub(crate) enum GetReadyAncestorError { + #[error("ancestor timeline {0} is being stopped")] + AncestorStopping(TimelineId), + + #[error("Ancestor LSN wait error: {0}")] + AncestorLsnTimeout(#[from] WaitLsnError), + + #[error("Cancelled")] + Cancelled, + + #[error(transparent)] + Other(#[from] anyhow::Error), } #[derive(Clone, Copy)] @@ -473,6 +518,7 @@ pub enum GetLogicalSizePriority { #[derive(enumset::EnumSetType)] pub(crate) enum CompactFlags { ForceRepartition, + ForceImageLayerCreation, } impl std::fmt::Debug for Timeline { @@ -535,22 +581,51 @@ impl From for CreateImageLayersError { } } +impl From for PageReconstructError { + fn from(e: GetReadyAncestorError) -> Self { + use GetReadyAncestorError::*; + match e { + AncestorStopping(tid) => PageReconstructError::AncestorStopping(tid), + AncestorLsnTimeout(wait_err) => PageReconstructError::AncestorLsnTimeout(wait_err), + Cancelled => PageReconstructError::Cancelled, + Other(other) => PageReconstructError::Other(other), + } + } +} + +#[derive( + Eq, + PartialEq, + Debug, + Copy, + Clone, + strum_macros::EnumString, + strum_macros::Display, + serde_with::DeserializeFromStr, + serde_with::SerializeDisplay, +)] +#[strum(serialize_all = "kebab-case")] +pub enum GetVectoredImpl { + Sequential, + Vectored, +} + /// Public interface functions impl Timeline { /// Get the LSN where this branch was 
created - pub fn get_ancestor_lsn(&self) -> Lsn { + pub(crate) fn get_ancestor_lsn(&self) -> Lsn { self.ancestor_lsn } /// Get the ancestor's timeline id - pub fn get_ancestor_timeline_id(&self) -> Option { + pub(crate) fn get_ancestor_timeline_id(&self) -> Option { self.ancestor_timeline .as_ref() .map(|ancestor| ancestor.timeline_id) } /// Lock and get timeline's GC cutoff - pub fn get_latest_gc_cutoff_lsn(&self) -> RcuReadGuard { + pub(crate) fn get_latest_gc_cutoff_lsn(&self) -> RcuReadGuard { self.latest_gc_cutoff_lsn.read() } @@ -579,6 +654,8 @@ impl Timeline { return Err(PageReconstructError::Other(anyhow::anyhow!("Invalid LSN"))); } + self.timeline_get_throttle.throttle(ctx, 1).await; + // This check is debug-only because of the cost of hashing, and because it's a double-check: we // already checked the key against the shard_identity when looking up the Timeline from // page_service. @@ -662,7 +739,7 @@ impl Timeline { /// which actually vectorizes the read path. pub(crate) async fn get_vectored( &self, - key_ranges: &[Range], + keyspace: KeySpace, lsn: Lsn, ctx: &RequestContext, ) -> Result>, GetVectoredError> { @@ -670,63 +747,230 @@ impl Timeline { return Err(GetVectoredError::InvalidLsn(lsn)); } - let key_count = key_ranges - .iter() - .map(|range| key_range_size(range) as u64) - .sum(); + let key_count = keyspace.total_size().try_into().unwrap(); if key_count > Timeline::MAX_GET_VECTORED_KEYS { return Err(GetVectoredError::Oversized(key_count)); } + self.timeline_get_throttle + .throttle(ctx, key_count as usize) + .await; + + for range in &keyspace.ranges { + let mut key = range.start; + while key != range.end { + assert!(!self.shard_identity.is_key_disposable(&key)); + key = key.next(); + } + } + + trace!( + "get vectored request for {:?}@{} from task kind {:?} will use {} implementation", + keyspace, + lsn, + ctx.task_kind(), + self.conf.get_vectored_impl + ); + let _timer = crate::metrics::GET_VECTORED_LATENCY .for_task_kind(ctx.task_kind()) 
.map(|t| t.start_timer()); - let mut values = BTreeMap::new(); - for range in key_ranges { - let mut key = range.start; - while key != range.end { - assert!(!self.shard_identity.is_key_disposable(&key)); + match self.conf.get_vectored_impl { + GetVectoredImpl::Sequential => { + self.get_vectored_sequential_impl(keyspace, lsn, ctx).await + } + GetVectoredImpl::Vectored => { + let vectored_res = self.get_vectored_impl(keyspace.clone(), lsn, ctx).await; - let block = self.get(key, lsn, ctx).await; - - if matches!( - block, - Err(PageReconstructError::Cancelled | PageReconstructError::AncestorStopping(_)) - ) { - return Err(GetVectoredError::Cancelled); + if self.conf.validate_vectored_get { + self.validate_get_vectored_impl(&vectored_res, keyspace, lsn, ctx) + .await; } - values.insert(key, block); - key = key.next(); + vectored_res + } + } + } + + pub(super) async fn get_vectored_sequential_impl( + &self, + keyspace: KeySpace, + lsn: Lsn, + ctx: &RequestContext, + ) -> Result>, GetVectoredError> { + let mut values = BTreeMap::new(); + for range in keyspace.ranges { + let mut key = range.start; + while key != range.end { + let block = self.get(key, lsn, ctx).await; + + use PageReconstructError::*; + match block { + Err(Cancelled | AncestorStopping(_)) => { + return Err(GetVectoredError::Cancelled) + } + Err(Other(err)) if err.to_string().contains("could not find data for key") => { + return Err(GetVectoredError::MissingKey(key)) + } + _ => { + values.insert(key, block); + key = key.next(); + } + } } } Ok(values) } + pub(super) async fn get_vectored_impl( + &self, + keyspace: KeySpace, + lsn: Lsn, + ctx: &RequestContext, + ) -> Result>, GetVectoredError> { + let mut reconstruct_state = ValuesReconstructState::new(); + + self.get_vectored_reconstruct_data(keyspace, lsn, &mut reconstruct_state, ctx) + .await?; + + let mut results: BTreeMap> = BTreeMap::new(); + for (key, res) in reconstruct_state.keys { + match res { + Err(err) => { + results.insert(key, Err(err)); + } + 
Ok(state) => { + let state = ValueReconstructState::from(state); + + let reconstruct_res = self.reconstruct_value(key, lsn, state).await; + results.insert(key, reconstruct_res); + } + } + } + + Ok(results) + } + + pub(super) async fn validate_get_vectored_impl( + &self, + vectored_res: &Result>, GetVectoredError>, + keyspace: KeySpace, + lsn: Lsn, + ctx: &RequestContext, + ) { + let sequential_res = self + .get_vectored_sequential_impl(keyspace.clone(), lsn, ctx) + .await; + + fn errors_match(lhs: &GetVectoredError, rhs: &GetVectoredError) -> bool { + use GetVectoredError::*; + match (lhs, rhs) { + (Oversized(l), Oversized(r)) => l == r, + (InvalidLsn(l), InvalidLsn(r)) => l == r, + (MissingKey(l), MissingKey(r)) => l == r, + (GetReadyAncestorError(_), GetReadyAncestorError(_)) => true, + (Other(_), Other(_)) => true, + _ => false, + } + } + + match (&sequential_res, vectored_res) { + (Err(GetVectoredError::Cancelled), _) => {}, + (_, Err(GetVectoredError::Cancelled)) => {}, + (Err(seq_err), Ok(_)) => { + panic!(concat!("Sequential get failed with {}, but vectored get did not", + " - keyspace={:?} lsn={}"), + seq_err, keyspace, lsn) }, + (Ok(_), Err(vec_err)) => { + panic!(concat!("Vectored get failed with {}, but sequential get did not", + " - keyspace={:?} lsn={}"), + vec_err, keyspace, lsn) }, + (Err(seq_err), Err(vec_err)) => { + assert!(errors_match(seq_err, vec_err), + "Mismatched errors: {seq_err} != {vec_err} - keyspace={keyspace:?} lsn={lsn}")}, + (Ok(seq_values), Ok(vec_values)) => { + seq_values.iter().zip(vec_values.iter()).for_each(|((seq_key, seq_res), (vec_key, vec_res))| { + assert_eq!(seq_key, vec_key); + match (seq_res, vec_res) { + (Ok(seq_blob), Ok(vec_blob)) => { + Self::validate_key_equivalence(seq_key, &keyspace, lsn, seq_blob, vec_blob); + }, + (Err(err), Ok(_)) => { + panic!( + concat!("Sequential get failed with {} for key {}, but vectored get did not", + " - keyspace={:?} lsn={}"), + err, seq_key, keyspace, lsn) }, + (Ok(_), Err(err)) => 
{ + panic!( + concat!("Vectored get failed with {} for key {}, but sequential get did not", + " - keyspace={:?} lsn={}"), + err, seq_key, keyspace, lsn) }, + (Err(_), Err(_)) => {} + } + }) + } + } + } + + fn validate_key_equivalence( + key: &Key, + keyspace: &KeySpace, + lsn: Lsn, + seq: &Bytes, + vec: &Bytes, + ) { + if *key == AUX_FILES_KEY { + // The value reconstruct of AUX_FILES_KEY from records is not deterministic + // since it uses a hash map under the hood. Hence, deserialise both results + // before comparing. + let seq_aux_dir_res = AuxFilesDirectory::des(seq); + let vec_aux_dir_res = AuxFilesDirectory::des(vec); + match (&seq_aux_dir_res, &vec_aux_dir_res) { + (Ok(seq_aux_dir), Ok(vec_aux_dir)) => { + assert_eq!( + seq_aux_dir, vec_aux_dir, + "Mismatch for key {} - keyspace={:?} lsn={}", + key, keyspace, lsn + ); + } + (Err(_), Err(_)) => {} + _ => { + panic!("Mismatch for {key}: {seq_aux_dir_res:?} != {vec_aux_dir_res:?}"); + } + } + } else { + // All other keys should reconstruct deterministically, so we simply compare the blobs. + assert_eq!( + seq, vec, + "Image mismatch for key {key} - keyspace={keyspace:?} lsn={lsn}" + ); + } + } + /// Get last or prev record separately. Same as get_last_record_rlsn().last/prev. - pub fn get_last_record_lsn(&self) -> Lsn { + pub(crate) fn get_last_record_lsn(&self) -> Lsn { self.last_record_lsn.load().last } - pub fn get_prev_record_lsn(&self) -> Lsn { + pub(crate) fn get_prev_record_lsn(&self) -> Lsn { self.last_record_lsn.load().prev } /// Atomically get both last and prev. - pub fn get_last_record_rlsn(&self) -> RecordLsn { + pub(crate) fn get_last_record_rlsn(&self) -> RecordLsn { self.last_record_lsn.load() } - pub fn get_disk_consistent_lsn(&self) -> Lsn { + pub(crate) fn get_disk_consistent_lsn(&self) -> Lsn { self.disk_consistent_lsn.load() } /// remote_consistent_lsn from the perspective of the tenant's current generation, /// not validated with control plane yet. 
/// See [`Self::get_remote_consistent_lsn_visible`]. - pub fn get_remote_consistent_lsn_projected(&self) -> Option { + pub(crate) fn get_remote_consistent_lsn_projected(&self) -> Option { if let Some(remote_client) = &self.remote_client { remote_client.remote_consistent_lsn_projected() } else { @@ -737,7 +981,7 @@ impl Timeline { /// remote_consistent_lsn which the tenant is guaranteed not to go backward from, /// i.e. a value of remote_consistent_lsn_projected which has undergone /// generation validation in the deletion queue. - pub fn get_remote_consistent_lsn_visible(&self) -> Option { + pub(crate) fn get_remote_consistent_lsn_visible(&self) -> Option { if let Some(remote_client) = &self.remote_client { remote_client.remote_consistent_lsn_visible() } else { @@ -748,7 +992,7 @@ impl Timeline { /// The sum of the file size of all historic layers in the layer map. /// This method makes no distinction between local and remote layers. /// Hence, the result **does not represent local filesystem usage**. - pub async fn layer_size_sum(&self) -> u64 { + pub(crate) async fn layer_size_sum(&self) -> u64 { let guard = self.layers.read().await; let layer_map = guard.layer_map(); let mut size = 0; @@ -758,10 +1002,14 @@ impl Timeline { size } - pub fn resident_physical_size(&self) -> u64 { + pub(crate) fn resident_physical_size(&self) -> u64 { self.metrics.resident_physical_size_get() } + pub(crate) fn get_directory_metrics(&self) -> [u64; DirectoryKind::KINDS_NUM] { + array::from_fn(|idx| self.directory_metrics[idx].load(AtomicOrdering::Relaxed)) + } + /// /// Wait until WAL has been received and processed up to this LSN. /// @@ -834,7 +1082,7 @@ impl Timeline { } /// Check that it is valid to request operations with that lsn. 
- pub fn check_lsn_is_in_scope( + pub(crate) fn check_lsn_is_in_scope( &self, lsn: Lsn, latest_gc_cutoff_lsn: &RcuReadGuard, @@ -850,7 +1098,7 @@ impl Timeline { /// Flush to disk all data that was written with the put_* functions #[instrument(skip(self), fields(tenant_id=%self.tenant_shard_id.tenant_id, shard_id=%self.tenant_shard_id.shard_slug(), timeline_id=%self.timeline_id))] - pub async fn freeze_and_flush(&self) -> anyhow::Result<()> { + pub(crate) async fn freeze_and_flush(&self) -> anyhow::Result<()> { self.freeze_inmem_layer(false).await; self.flush_frozen_layers_and_wait().await } @@ -893,164 +1141,30 @@ impl Timeline { return Ok(()); } - // High level strategy for compaction / image creation: - // - // 1. First, calculate the desired "partitioning" of the - // currently in-use key space. The goal is to partition the - // key space into roughly fixed-size chunks, but also take into - // account any existing image layers, and try to align the - // chunk boundaries with the existing image layers to avoid - // too much churn. Also try to align chunk boundaries with - // relation boundaries. In principle, we don't know about - // relation boundaries here, we just deal with key-value - // pairs, and the code in pgdatadir_mapping.rs knows how to - // map relations into key-value pairs. But in practice we know - // that 'field6' is the block number, and the fields 1-5 - // identify a relation. This is just an optimization, - // though. - // - // 2. Once we know the partitioning, for each partition, - // decide if it's time to create a new image layer. The - // criteria is: there has been too much "churn" since the last - // image layer? The "churn" is fuzzy concept, it's a - // combination of too many delta files, or too much WAL in - // total in the delta file. Or perhaps: if creating an image - // file would allow to delete some older files. - // - // 3. After that, we compact all level0 delta files if there - // are too many of them. 
While compacting, we also garbage - // collect any page versions that are no longer needed because - // of the new image layers we created in step 2. - // - // TODO: This high level strategy hasn't been implemented yet. - // Below are functions compact_level0() and create_image_layers() - // but they are a bit ad hoc and don't quite work like it's explained - // above. Rewrite it. - - // Is the timeline being deleted? - if self.is_stopping() { - trace!("Dropping out of compaction on timeline shutdown"); - return Err(CompactionError::ShuttingDown); + match self.get_compaction_algorithm() { + CompactionAlgorithm::Tiered => self.compact_tiered(cancel, ctx).await, + CompactionAlgorithm::Legacy => self.compact_legacy(cancel, flags, ctx).await, } - - let target_file_size = self.get_checkpoint_distance(); - - // Define partitioning schema if needed - - // FIXME: the match should only cover repartitioning, not the next steps - match self - .repartition( - self.get_last_record_lsn(), - self.get_compaction_target_size(), - flags, - ctx, - ) - .await - { - Ok((partitioning, lsn)) => { - // Disables access_stats updates, so that the files we read remain candidates for eviction after we're done with them - let image_ctx = RequestContextBuilder::extend(ctx) - .access_stats_behavior(AccessStatsBehavior::Skip) - .build(); - - // 2. Compact - let timer = self.metrics.compact_time_histo.start_timer(); - self.compact_level0(target_file_size, ctx).await?; - timer.stop_and_record(); - - // 3. Create new image layers for partitions that have been modified - // "enough". 
- let layers = self - .create_image_layers(&partitioning, lsn, false, &image_ctx) - .await - .map_err(anyhow::Error::from)?; - if let Some(remote_client) = &self.remote_client { - for layer in layers { - remote_client.schedule_layer_file_upload(layer)?; - } - } - - if let Some(remote_client) = &self.remote_client { - // should any new image layer been created, not uploading index_part will - // result in a mismatch between remote_physical_size and layermap calculated - // size, which will fail some tests, but should not be an issue otherwise. - remote_client.schedule_index_upload_for_file_changes()?; - } - } - Err(err) => { - // no partitioning? This is normal, if the timeline was just created - // as an empty timeline. Also in unit tests, when we use the timeline - // as a simple key-value store, ignoring the datadir layout. Log the - // error but continue. - // - // Suppress error when it's due to cancellation - if !self.cancel.is_cancelled() { - error!("could not compact, repartitioning keyspace failed: {err:?}"); - } - } - }; - - Ok(()) } /// Mutate the timeline with a [`TimelineWriter`]. - pub async fn writer(&self) -> TimelineWriter<'_> { + pub(crate) async fn writer(&self) -> TimelineWriter<'_> { TimelineWriter { tl: self, - _write_guard: self.write_lock.lock().await, + write_guard: self.write_lock.lock().await, } } - /// Check if more than 'checkpoint_distance' of WAL has been accumulated in - /// the in-memory layer, and initiate flushing it if so. - /// - /// Also flush after a period of time without new data -- it helps - /// safekeepers to regard pageserver as caught up and suspend activity. - pub async fn check_checkpoint_distance(self: &Arc) -> anyhow::Result<()> { - let last_lsn = self.get_last_record_lsn(); - let open_layer_size = { - let guard = self.layers.read().await; - let layers = guard.layer_map(); - let Some(open_layer) = layers.open_layer.as_ref() else { - return Ok(()); - }; - open_layer.size().await? 
- }; - let last_freeze_at = self.last_freeze_at.load(); - let last_freeze_ts = *(self.last_freeze_ts.read().unwrap()); - let distance = last_lsn.widening_sub(last_freeze_at); - // Checkpointing the open layer can be triggered by layer size or LSN range. - // S3 has a 5 GB limit on the size of one upload (without multi-part upload), and - // we want to stay below that with a big margin. The LSN distance determines how - // much WAL the safekeepers need to store. - if distance >= self.get_checkpoint_distance().into() - || open_layer_size > self.get_checkpoint_distance() - || (distance > 0 && last_freeze_ts.elapsed() >= self.get_checkpoint_timeout()) - { - info!( - "check_checkpoint_distance {}, layer size {}, elapsed since last flush {:?}", - distance, - open_layer_size, - last_freeze_ts.elapsed() - ); - - self.freeze_inmem_layer(true).await; - self.last_freeze_at.store(last_lsn); - *(self.last_freeze_ts.write().unwrap()) = Instant::now(); - - // Wake up the layer flusher - self.flush_frozen_layers(); - } - Ok(()) - } - - pub fn activate( + pub(crate) fn activate( self: &Arc, broker_client: BrokerClientChannel, background_jobs_can_start: Option<&completion::Barrier>, ctx: &RequestContext, ) { - self.spawn_initial_logical_size_computation_task(ctx); + if self.tenant_shard_id.is_zero() { + // Logical size is only maintained accurately on shard zero. + self.spawn_initial_logical_size_computation_task(ctx); + } self.launch_wal_receiver(ctx, broker_client); self.set_state(TimelineState::Active); self.launch_eviction_task(background_jobs_can_start); @@ -1060,7 +1174,6 @@ impl Timeline { /// also to remote storage. This method can easily take multiple seconds for a busy timeline. /// /// While we are flushing, we continue to accept read I/O. 
- #[instrument(skip_all, fields(timeline_id=%self.timeline_id))] pub(crate) async fn flush_and_shutdown(&self) { debug_assert_current_span_has_tenant_and_timeline_id(); @@ -1109,6 +1222,8 @@ impl Timeline { /// Shut down immediately, without waiting for any open layers to flush to disk. This is a subset of /// the graceful [`Timeline::flush_and_shutdown`] function. pub(crate) async fn shutdown(&self) { + debug_assert_current_span_has_tenant_and_timeline_id(); + // Signal any subscribers to our cancellation token to drop out tracing::debug!("Cancelling CancellationToken"); self.cancel.cancel(); @@ -1142,9 +1257,11 @@ impl Timeline { // Finally wait until any gate-holders are complete self.gate.close().await; + + self.metrics.shutdown(); } - pub fn set_state(&self, new_state: TimelineState) { + pub(crate) fn set_state(&self, new_state: TimelineState) { match (self.current_state(), new_state) { (equal_state_1, equal_state_2) if equal_state_1 == equal_state_2 => { info!("Ignoring new state, equal to the existing one: {equal_state_2:?}"); @@ -1164,7 +1281,7 @@ impl Timeline { } } - pub fn set_broken(&self, reason: String) { + pub(crate) fn set_broken(&self, reason: String) { let backtrace_str: String = format!("{}", std::backtrace::Backtrace::force_capture()); let broken_state = TimelineState::Broken { reason, @@ -1178,27 +1295,27 @@ impl Timeline { self.cancel.cancel(); } - pub fn current_state(&self) -> TimelineState { + pub(crate) fn current_state(&self) -> TimelineState { self.state.borrow().clone() } - pub fn is_broken(&self) -> bool { + pub(crate) fn is_broken(&self) -> bool { matches!(&*self.state.borrow(), TimelineState::Broken { .. 
}) } - pub fn is_active(&self) -> bool { + pub(crate) fn is_active(&self) -> bool { self.current_state() == TimelineState::Active } - pub fn is_stopping(&self) -> bool { + pub(crate) fn is_stopping(&self) -> bool { self.current_state() == TimelineState::Stopping } - pub fn subscribe_for_state_updates(&self) -> watch::Receiver { + pub(crate) fn subscribe_for_state_updates(&self) -> watch::Receiver { self.state.subscribe() } - pub async fn wait_to_become_active( + pub(crate) async fn wait_to_become_active( &self, _ctx: &RequestContext, // Prepare for use by cancellation ) -> Result<(), TimelineState> { @@ -1223,7 +1340,7 @@ impl Timeline { } } - pub async fn layer_map_info(&self, reset: LayerAccessStatsReset) -> LayerMapInfo { + pub(crate) async fn layer_map_info(&self, reset: LayerAccessStatsReset) -> LayerMapInfo { let guard = self.layers.read().await; let layer_map = guard.layer_map(); let mut in_memory_layers = Vec::with_capacity(layer_map.frozen_layers.len() + 1); @@ -1247,7 +1364,10 @@ impl Timeline { } #[instrument(skip_all, fields(tenant_id = %self.tenant_shard_id.tenant_id, shard_id = %self.tenant_shard_id.shard_slug(), timeline_id = %self.timeline_id))] - pub async fn download_layer(&self, layer_file_name: &str) -> anyhow::Result> { + pub(crate) async fn download_layer( + &self, + layer_file_name: &str, + ) -> anyhow::Result> { let Some(layer) = self.find_layer(layer_file_name).await else { return Ok(None); }; @@ -1264,7 +1384,7 @@ impl Timeline { /// Evict just one layer. /// /// Returns `Ok(None)` in the case where the layer could not be found by its `layer_file_name`. 
- pub async fn evict_layer(&self, layer_file_name: &str) -> anyhow::Result> { + pub(crate) async fn evict_layer(&self, layer_file_name: &str) -> anyhow::Result> { let _gate = self .gate .enter() @@ -1274,10 +1394,14 @@ impl Timeline { return Ok(None); }; - match local_layer.evict_and_wait().await { + // curl has this by default + let timeout = std::time::Duration::from_secs(120); + + match local_layer.evict_and_wait(timeout).await { Ok(()) => Ok(Some(true)), Err(EvictionError::NotFound) => Ok(Some(false)), Err(EvictionError::Downloaded) => Ok(Some(false)), + Err(EvictionError::Timeout) => Ok(Some(false)), } } } @@ -1287,43 +1411,57 @@ const REPARTITION_FREQ_IN_CHECKPOINT_DISTANCE: u64 = 10; // Private functions impl Timeline { + pub(crate) fn get_lazy_slru_download(&self) -> bool { + let tenant_conf = self.tenant_conf.read().unwrap().tenant_conf.clone(); + tenant_conf + .lazy_slru_download + .unwrap_or(self.conf.default_tenant_conf.lazy_slru_download) + } + fn get_checkpoint_distance(&self) -> u64 { - let tenant_conf = self.tenant_conf.read().unwrap().tenant_conf; + let tenant_conf = self.tenant_conf.read().unwrap().tenant_conf.clone(); tenant_conf .checkpoint_distance .unwrap_or(self.conf.default_tenant_conf.checkpoint_distance) } fn get_checkpoint_timeout(&self) -> Duration { - let tenant_conf = self.tenant_conf.read().unwrap().tenant_conf; + let tenant_conf = self.tenant_conf.read().unwrap().tenant_conf.clone(); tenant_conf .checkpoint_timeout .unwrap_or(self.conf.default_tenant_conf.checkpoint_timeout) } fn get_compaction_target_size(&self) -> u64 { - let tenant_conf = self.tenant_conf.read().unwrap().tenant_conf; + let tenant_conf = self.tenant_conf.read().unwrap().tenant_conf.clone(); tenant_conf .compaction_target_size .unwrap_or(self.conf.default_tenant_conf.compaction_target_size) } fn get_compaction_threshold(&self) -> usize { - let tenant_conf = self.tenant_conf.read().unwrap().tenant_conf; + let tenant_conf = 
self.tenant_conf.read().unwrap().tenant_conf.clone(); tenant_conf .compaction_threshold .unwrap_or(self.conf.default_tenant_conf.compaction_threshold) } fn get_image_creation_threshold(&self) -> usize { - let tenant_conf = self.tenant_conf.read().unwrap().tenant_conf; + let tenant_conf = self.tenant_conf.read().unwrap().tenant_conf.clone(); tenant_conf .image_creation_threshold .unwrap_or(self.conf.default_tenant_conf.image_creation_threshold) } + fn get_compaction_algorithm(&self) -> CompactionAlgorithm { + let tenant_conf = &self.tenant_conf.read().unwrap().tenant_conf; + tenant_conf + .compaction_algorithm + .unwrap_or(self.conf.default_tenant_conf.compaction_algorithm) + } + fn get_eviction_policy(&self) -> EvictionPolicy { - let tenant_conf = self.tenant_conf.read().unwrap().tenant_conf; + let tenant_conf = self.tenant_conf.read().unwrap().tenant_conf.clone(); tenant_conf .eviction_policy .unwrap_or(self.conf.default_tenant_conf.eviction_policy) @@ -1338,13 +1476,6 @@ impl Timeline { .unwrap_or(default_tenant_conf.evictions_low_residence_duration_metric_threshold) } - fn get_gc_feedback(&self) -> bool { - let tenant_conf = &self.tenant_conf.read().unwrap().tenant_conf; - tenant_conf - .gc_feedback - .unwrap_or(self.conf.default_tenant_conf.gc_feedback) - } - pub(super) fn tenant_conf_updated(&self) { // NB: Most tenant conf options are read by background loops, so, // changes will automatically be picked up. 
@@ -1386,7 +1517,7 @@ impl Timeline { tenant_shard_id: TenantShardId, generation: Generation, shard_identity: ShardIdentity, - walredo_mgr: Arc, + walredo_mgr: Option>, resources: TimelineResources, pg_version: u32, state: TimelineState, @@ -1417,8 +1548,7 @@ impl Timeline { generation, shard_identity, pg_version, - layers: Arc::new(tokio::sync::RwLock::new(LayerManager::create())), - wanted_image_layers: Mutex::new(None), + layers: Default::default(), walredo_mgr, walreceiver: Mutex::new(None), @@ -1454,12 +1584,14 @@ impl Timeline { &timeline_id, ), + directory_metrics: array::from_fn(|_| AtomicU64::new(0)), + flush_loop_state: Mutex::new(FlushLoopState::NotStarted), layer_flush_start_tx, layer_flush_done_tx, - write_lock: tokio::sync::Mutex::new(()), + write_lock: tokio::sync::Mutex::new(None), gc_info: std::sync::RwLock::new(GcInfo { retain_lsns: Vec::new(), @@ -1479,7 +1611,7 @@ impl Timeline { // initial logical size is 0. LogicalSize::empty_initial() }, - partitioning: Mutex::new((KeyPartitioning::new(), Lsn(0))), + partitioning: tokio::sync::Mutex::new((KeyPartitioning::new(), Lsn(0))), repartition_threshold: 0, last_received_wal: Mutex::new(None), @@ -1495,10 +1627,17 @@ impl Timeline { delete_progress: Arc::new(tokio::sync::Mutex::new(DeleteTimelineFlow::default())), cancel, - gate: Gate::new(format!("Timeline<{tenant_shard_id}/{timeline_id}>")), + gate: Gate::default(), compaction_lock: tokio::sync::Mutex::default(), gc_lock: tokio::sync::Mutex::default(), + + timeline_get_throttle: resources.timeline_get_throttle, + + aux_files: tokio::sync::Mutex::new(AuxFilesState { + dir: None, + n_deltas: 0, + }), }; result.repartition_threshold = result.get_checkpoint_distance() / REPARTITION_FREQ_IN_CHECKPOINT_DISTANCE; @@ -1663,7 +1802,11 @@ impl Timeline { discovered_layers.push((file_name, file_size)); continue; } - Discovered::Metadata | Discovered::IgnoredBackup => { + Discovered::Metadata => { + warn!("found legacy metadata file, these should have been 
removed in load_tenant_config"); + continue; + } + Discovered::IgnoredBackup => { continue; } Discovered::Unknown(file_name) => { @@ -1817,6 +1960,12 @@ impl Timeline { priority: GetLogicalSizePriority, ctx: &RequestContext, ) -> logical_size::CurrentLogicalSize { + if !self.tenant_shard_id.is_zero() { + // Logical size is only accurately maintained on shard zero: when called elsewhere, for example + // when HTTP API is serving a GET for timeline zero, return zero + return logical_size::CurrentLogicalSize::Approximate(logical_size::Approximate::zero()); + } + let current_size = self.current_logical_size.current_size(); debug!("Current size: {current_size:?}"); @@ -1917,7 +2066,7 @@ impl Timeline { .await; Ok(()) } - .instrument(info_span!(parent: None, "initial_size_calculation", tenant_id=%self.tenant_shard_id.tenant_id, timeline_id=%self.timeline_id)), + .instrument(info_span!(parent: None, "initial_size_calculation", tenant_id=%self.tenant_shard_id.tenant_id, shard_id=%self.tenant_shard_id.shard_slug(), timeline_id=%self.timeline_id)), ); } @@ -2059,7 +2208,7 @@ impl Timeline { .expect("only this task sets it"); } - pub fn spawn_ondemand_logical_size_calculation( + pub(crate) fn spawn_ondemand_logical_size_calculation( self: &Arc, lsn: Lsn, cause: LogicalSizeCalculationCause, @@ -2104,16 +2253,22 @@ impl Timeline { cause: LogicalSizeCalculationCause, ctx: &RequestContext, ) -> Result { - span::debug_assert_current_span_has_tenant_and_timeline_id(); + crate::span::debug_assert_current_span_has_tenant_and_timeline_id(); + // We should never be calculating logical sizes on shard !=0, because these shards do not have + // accurate relation sizes, and they do not emit consumption metrics. 
+ debug_assert!(self.tenant_shard_id.is_zero()); - let _guard = self.gate.enter(); + let guard = self + .gate + .enter() + .map_err(|_| CalculateLogicalSizeError::Cancelled)?; let self_calculation = Arc::clone(self); let mut calculation = pin!(async { let ctx = ctx.attached_child(); self_calculation - .calculate_logical_size(lsn, cause, &ctx) + .calculate_logical_size(lsn, cause, &guard, &ctx) .await }); @@ -2138,37 +2293,20 @@ impl Timeline { /// # Cancel-Safety /// /// This method is cancellation-safe. - pub async fn calculate_logical_size( + async fn calculate_logical_size( &self, up_to_lsn: Lsn, cause: LogicalSizeCalculationCause, + _guard: &GateGuard, ctx: &RequestContext, ) -> Result { info!( "Calculating logical size for timeline {} at {}", self.timeline_id, up_to_lsn ); - // These failpoints are used by python tests to ensure that we don't delete - // the timeline while the logical size computation is ongoing. - // The first failpoint is used to make this function pause. - // Then the python test initiates timeline delete operation in a thread. - // It waits for a few seconds, then arms the second failpoint and disables - // the first failpoint. The second failpoint prints an error if the timeline - // delete code has deleted the on-disk state while we're still running here. - // It shouldn't do that. If it does it anyway, the error will be caught - // by the test suite, highlighting the problem. - fail::fail_point!("timeline-calculate-logical-size-pause"); - fail::fail_point!("timeline-calculate-logical-size-check-dir-exists", |_| { - if !self - .conf - .metadata_path(&self.tenant_shard_id, &self.timeline_id) - .exists() - { - error!("timeline-calculate-logical-size-pre metadata file does not exist") - } - // need to return something - Ok(0) - }); + + pausable_failpoint!("timeline-calculate-logical-size-pause"); + // See if we've already done the work for initial size calculation. // This is a short-cut for timelines that are mostly unused. 
if let Some(size) = self.current_logical_size.initialized_size(up_to_lsn) { @@ -2213,6 +2351,29 @@ impl Timeline { } } + pub(crate) fn update_directory_entries_count(&self, kind: DirectoryKind, count: u64) { + self.directory_metrics[kind.offset()].store(count, AtomicOrdering::Relaxed); + let aux_metric = + self.directory_metrics[DirectoryKind::AuxFiles.offset()].load(AtomicOrdering::Relaxed); + + let sum_of_entries = self + .directory_metrics + .iter() + .map(|v| v.load(AtomicOrdering::Relaxed)) + .sum(); + // Set a high general threshold and a lower threshold for the auxiliary files, + // as we can have large numbers of relations in the db directory. + const SUM_THRESHOLD: u64 = 5000; + const AUX_THRESHOLD: u64 = 1000; + if sum_of_entries >= SUM_THRESHOLD || aux_metric >= AUX_THRESHOLD { + self.metrics + .directory_entries_count_gauge + .set(sum_of_entries); + } else if let Some(metric) = Lazy::get(&self.metrics.directory_entries_count_gauge) { + metric.set(sum_of_entries); + } + } + async fn find_layer(&self, layer_file_name: &str) -> Option { let guard = self.layers.read().await; for historic_layer in guard.layer_map().iter_historic_layers() { @@ -2233,45 +2394,28 @@ impl Timeline { /// should treat this as a cue to simply skip doing any heatmap uploading /// for this timeline. pub(crate) async fn generate_heatmap(&self) -> Option { - let eviction_info = self.get_local_layers_for_disk_usage_eviction().await; + // no point in heatmaps without remote client + let _remote_client = self.remote_client.as_ref()?; - let remote_client = match &self.remote_client { - Some(c) => c, - None => return None, - }; + if !self.is_active() { + return None; + } - let layer_file_names = eviction_info - .resident_layers - .iter() - .map(|l| l.layer.get_name()) - .collect::>(); + let guard = self.layers.read().await; - let decorated = match remote_client.get_layers_metadata(layer_file_names) { - Ok(d) => d, - Err(_) => { - // Getting metadata only fails on Timeline in bad state. 
- return None; - } - }; + let resident = guard.resident_layers().map(|layer| { + let last_activity_ts = layer.access_stats().latest_activity_or_now(); - let heatmap_layers = std::iter::zip( - eviction_info.resident_layers.into_iter(), - decorated.into_iter(), - ) - .filter_map(|(layer, remote_info)| { - remote_info.map(|remote_info| { - HeatMapLayer::new( - layer.layer.get_name(), - IndexLayerMetadata::from(remote_info), - layer.last_activity_ts, - ) - }) + HeatMapLayer::new( + layer.layer_desc().filename(), + layer.metadata().into(), + last_activity_ts, + ) }); - Some(HeatMapTimeline::new( - self.timeline_id, - heatmap_layers.collect(), - )) + let layers = resident.collect().await; + + Some(HeatMapTimeline::new(self.timeline_id, layers)) } } @@ -2334,7 +2478,7 @@ impl Timeline { // 'prev_lsn' tracks the last LSN that we were at in our search. It's used // to check that each iteration make some progress, to break infinite // looping if something goes wrong. - let mut prev_lsn = Lsn(u64::MAX); + let mut prev_lsn = None; let mut result = ValueReconstructResult::Continue; let mut cont_lsn = Lsn(request_lsn.0 + 1); @@ -2354,18 +2498,20 @@ impl Timeline { MATERIALIZED_PAGE_CACHE_HIT.inc_by(1); return Ok(traversal_path); } - if prev_lsn <= cont_lsn { - // Didn't make any progress in last iteration. Error out to avoid - // getting stuck in the loop. - return Err(layer_traversal_error(format!( - "could not find layer with more data for key {} at LSN {}, request LSN {}, ancestor {}", - key, - Lsn(cont_lsn.0 - 1), - request_lsn, - timeline.ancestor_lsn - ), traversal_path)); + if let Some(prev) = prev_lsn { + if prev <= cont_lsn { + // Didn't make any progress in last iteration. Error out to avoid + // getting stuck in the loop. 
+ return Err(layer_traversal_error(format!( + "could not find layer with more data for key {} at LSN {}, request LSN {}, ancestor {}", + key, + Lsn(cont_lsn.0 - 1), + request_lsn, + timeline.ancestor_lsn + ), traversal_path)); + } } - prev_lsn = cont_lsn; + prev_lsn = Some(cont_lsn); } ValueReconstructResult::Missing => { return Err(layer_traversal_error( @@ -2392,62 +2538,10 @@ impl Timeline { timeline.ancestor_lsn, cont_lsn ); - let ancestor = match timeline.get_ancestor_timeline() { - Ok(timeline) => timeline, - Err(e) => return Err(PageReconstructError::from(e)), - }; - // It's possible that the ancestor timeline isn't active yet, or - // is active but hasn't yet caught up to the branch point. Wait - // for it. - // - // This cannot happen while the pageserver is running normally, - // because you cannot create a branch from a point that isn't - // present in the pageserver yet. However, we don't wait for the - // branch point to be uploaded to cloud storage before creating - // a branch. I.e., the branch LSN need not be remote consistent - // for the branching operation to succeed. - // - // Hence, if we try to load a tenant in such a state where - // 1. the existence of the branch was persisted (in IndexPart and/or locally) - // 2. but the ancestor state is behind branch_lsn because it was not yet persisted - // then we will need to wait for the ancestor timeline to - // re-stream WAL up to branch_lsn before we access it. - // - // How can a tenant get in such a state? - // - ungraceful pageserver process exit - // - detach+attach => this is a bug, https://github.com/neondatabase/neon/issues/4219 - // - // NB: this could be avoided by requiring - // branch_lsn >= remote_consistent_lsn - // during branch creation. 
- match ancestor.wait_to_become_active(ctx).await { - Ok(()) => {} - Err(TimelineState::Stopping) => { - return Err(PageReconstructError::AncestorStopping(ancestor.timeline_id)); - } - Err(state) => { - return Err(PageReconstructError::Other(anyhow::anyhow!( - "Timeline {} will not become active. Current state: {:?}", - ancestor.timeline_id, - &state, - ))); - } - } - ancestor - .wait_lsn(timeline.ancestor_lsn, ctx) - .await - .map_err(|e| match e { - e @ WaitLsnError::Timeout(_) => PageReconstructError::AncestorLsnTimeout(e), - WaitLsnError::Shutdown => PageReconstructError::Cancelled, - e @ WaitLsnError::BadState => { - PageReconstructError::Other(anyhow::anyhow!(e)) - } - })?; - - timeline_owned = ancestor; + timeline_owned = timeline.get_ready_ancestor_timeline(ctx).await?; timeline = &*timeline_owned; - prev_lsn = Lsn(u64::MAX); + prev_lsn = None; continue 'outer; } @@ -2555,6 +2649,164 @@ impl Timeline { } } + /// Get the data needed to reconstruct all keys in the provided keyspace + /// + /// The algorithm is as follows: + /// 1. While some keys are still not done and there's a timeline to visit: + /// 2. Visit the timeline (see [`Timeline::get_vectored_reconstruct_data_timeline`]: + /// 2.1: Build the fringe for the current keyspace + /// 2.2 Visit the newest layer from the fringe to collect all values for the range it + /// intersects + /// 2.3. Pop the timeline from the fringe + /// 2.4. 
If the fringe is empty, go back to 1 + async fn get_vectored_reconstruct_data( + &self, + mut keyspace: KeySpace, + request_lsn: Lsn, + reconstruct_state: &mut ValuesReconstructState, + ctx: &RequestContext, + ) -> Result<(), GetVectoredError> { + let mut timeline_owned: Arc; + let mut timeline = self; + + let mut cont_lsn = Lsn(request_lsn.0 + 1); + + loop { + if self.cancel.is_cancelled() { + return Err(GetVectoredError::Cancelled); + } + + let completed = Self::get_vectored_reconstruct_data_timeline( + timeline, + keyspace.clone(), + cont_lsn, + reconstruct_state, + &self.cancel, + ctx, + ) + .await?; + + keyspace.remove_overlapping_with(&completed); + if keyspace.total_size() == 0 || timeline.ancestor_timeline.is_none() { + break; + } + + cont_lsn = Lsn(timeline.ancestor_lsn.0 + 1); + timeline_owned = timeline + .get_ready_ancestor_timeline(ctx) + .await + .map_err(GetVectoredError::GetReadyAncestorError)?; + timeline = &*timeline_owned; + } + + if keyspace.total_size() != 0 { + return Err(GetVectoredError::MissingKey(keyspace.start().unwrap())); + } + + Ok(()) + } + + /// Collect the reconstruct data for a ketspace from the specified timeline. + /// + /// Maintain a fringe [`LayerFringe`] which tracks all the layers that intersect + /// the current keyspace. The current keyspace of the search at any given timeline + /// is the original keyspace minus all the keys that have been completed minus + /// any keys for which we couldn't find an intersecting layer. It's not tracked explicitly, + /// but if you merge all the keyspaces in the fringe, you get the "current keyspace". + /// + /// This is basically a depth-first search visitor implementation where a vertex + /// is the (layer, lsn range, key space) tuple. The fringe acts as the stack. + /// + /// At each iteration pop the top of the fringe (the layer with the highest Lsn) + /// and get all the required reconstruct data from the layer in one go. 
+ async fn get_vectored_reconstruct_data_timeline( + timeline: &Timeline, + keyspace: KeySpace, + mut cont_lsn: Lsn, + reconstruct_state: &mut ValuesReconstructState, + cancel: &CancellationToken, + ctx: &RequestContext, + ) -> Result { + let mut unmapped_keyspace = keyspace.clone(); + let mut fringe = LayerFringe::new(); + + let mut completed_keyspace = KeySpace::default(); + + // Hold the layer map whilst visiting the timeline to prevent + // compaction, eviction and flushes from rendering the layers unreadable. + // + // TODO: Do we actually need to do this? In theory holding on + // to [`tenant::storage_layer::Layer`] should be enough. However, + // [`Timeline::get`] also holds the lock during IO, so more investigation + // is needed. + let guard = timeline.layers.read().await; + let layers = guard.layer_map(); + + loop { + if cancel.is_cancelled() { + return Err(GetVectoredError::Cancelled); + } + + let keys_done_last_step = reconstruct_state.consume_done_keys(); + unmapped_keyspace.remove_overlapping_with(&keys_done_last_step); + completed_keyspace.merge(&keys_done_last_step); + + let in_memory_layer = layers.find_in_memory_layer(|l| { + let start_lsn = l.get_lsn_range().start; + cont_lsn > start_lsn + }); + + match in_memory_layer { + Some(l) => { + fringe.update( + ReadableLayerDesc::InMemory { + handle: l, + lsn_ceil: cont_lsn, + }, + unmapped_keyspace.clone(), + ); + } + None => { + for range in unmapped_keyspace.ranges.iter() { + let results = layers.range_search(range.clone(), cont_lsn); + + results + .found + .into_iter() + .map(|(SearchResult { layer, lsn_floor }, keyspace_accum)| { + ( + ReadableLayerDesc::Persistent { + desc: (*layer).clone(), + lsn_range: lsn_floor..cont_lsn, + }, + keyspace_accum.to_keyspace(), + ) + }) + .for_each(|(layer, keyspace)| fringe.update(layer, keyspace)); + } + } + } + + if let Some((layer_to_read, keyspace_to_read)) = fringe.next_layer() { + layer_to_read + .get_values_reconstruct_data( + &guard, + 
keyspace_to_read.clone(), + reconstruct_state, + ctx, + ) + .await?; + + unmapped_keyspace = keyspace_to_read; + cont_lsn = layer_to_read.get_lsn_floor(); + } else { + break; + } + } + + Ok(completed_keyspace) + } + /// # Cancel-safety /// /// This method is cancellation-safe. @@ -2575,6 +2827,66 @@ impl Timeline { Some((lsn, img)) } + async fn get_ready_ancestor_timeline( + &self, + ctx: &RequestContext, + ) -> Result, GetReadyAncestorError> { + let ancestor = match self.get_ancestor_timeline() { + Ok(timeline) => timeline, + Err(e) => return Err(GetReadyAncestorError::from(e)), + }; + + // It's possible that the ancestor timeline isn't active yet, or + // is active but hasn't yet caught up to the branch point. Wait + // for it. + // + // This cannot happen while the pageserver is running normally, + // because you cannot create a branch from a point that isn't + // present in the pageserver yet. However, we don't wait for the + // branch point to be uploaded to cloud storage before creating + // a branch. I.e., the branch LSN need not be remote consistent + // for the branching operation to succeed. + // + // Hence, if we try to load a tenant in such a state where + // 1. the existence of the branch was persisted (in IndexPart and/or locally) + // 2. but the ancestor state is behind branch_lsn because it was not yet persisted + // then we will need to wait for the ancestor timeline to + // re-stream WAL up to branch_lsn before we access it. + // + // How can a tenant get in such a state? + // - ungraceful pageserver process exit + // - detach+attach => this is a bug, https://github.com/neondatabase/neon/issues/4219 + // + // NB: this could be avoided by requiring + // branch_lsn >= remote_consistent_lsn + // during branch creation. 
+ match ancestor.wait_to_become_active(ctx).await { + Ok(()) => {} + Err(TimelineState::Stopping) => { + return Err(GetReadyAncestorError::AncestorStopping( + ancestor.timeline_id, + )); + } + Err(state) => { + return Err(GetReadyAncestorError::Other(anyhow::anyhow!( + "Timeline {} will not become active. Current state: {:?}", + ancestor.timeline_id, + &state, + ))); + } + } + ancestor + .wait_lsn(self.ancestor_lsn, ctx) + .await + .map_err(|e| match e { + e @ WaitLsnError::Timeout(_) => GetReadyAncestorError::AncestorLsnTimeout(e), + WaitLsnError::Shutdown => GetReadyAncestorError::Cancelled, + e @ WaitLsnError::BadState => GetReadyAncestorError::Other(anyhow::anyhow!(e)), + })?; + + Ok(ancestor) + } + fn get_ancestor_timeline(&self) -> anyhow::Result> { let ancestor = self.ancestor_timeline.as_ref().with_context(|| { format!( @@ -2607,43 +2919,6 @@ impl Timeline { Ok(layer) } - async fn put_value( - &self, - key: Key, - lsn: Lsn, - val: &Value, - ctx: &RequestContext, - ) -> anyhow::Result<()> { - //info!("PUT: key {} at {}", key, lsn); - let layer = self.get_layer_for_write(lsn).await?; - layer.put_value(key, lsn, val, ctx).await?; - Ok(()) - } - - async fn put_values( - &self, - values: &HashMap>, - ctx: &RequestContext, - ) -> anyhow::Result<()> { - // Pick the first LSN in the batch to get the layer to write to. 
- for lsns in values.values() { - if let Some((lsn, _)) = lsns.first() { - let layer = self.get_layer_for_write(*lsn).await?; - layer.put_values(values, ctx).await?; - break; - } - } - Ok(()) - } - - async fn put_tombstones(&self, tombstones: &[(Range, Lsn)]) -> anyhow::Result<()> { - if let Some((_, lsn)) = tombstones.first() { - let layer = self.get_layer_for_write(*lsn).await?; - layer.put_tombstones(tombstones).await?; - } - Ok(()) - } - pub(crate) fn finish_write(&self, new_lsn: Lsn) { assert!(new_lsn.is_aligned()); @@ -2654,14 +2929,20 @@ impl Timeline { async fn freeze_inmem_layer(&self, write_lock_held: bool) { // Freeze the current open in-memory layer. It will be written to disk on next // iteration. + let _write_guard = if write_lock_held { None } else { Some(self.write_lock.lock().await) }; + + self.freeze_inmem_layer_at(self.get_last_record_lsn()).await; + } + + async fn freeze_inmem_layer_at(&self, at: Lsn) { let mut guard = self.layers.write().await; guard - .try_freeze_in_memory_layer(self.get_last_record_lsn(), &self.last_freeze_at) + .try_freeze_in_memory_layer(at, &self.last_freeze_at) .await; } @@ -2785,12 +3066,13 @@ impl Timeline { } /// Flush one frozen in-memory layer to disk, as a new delta layer. - #[instrument(skip_all, fields(tenant_id=%self.tenant_shard_id.tenant_id, shard_id = %self.tenant_shard_id.shard_slug(), timeline_id=%self.timeline_id, layer=%frozen_layer))] + #[instrument(skip_all, fields(layer=%frozen_layer))] async fn flush_frozen_layer( self: &Arc, frozen_layer: Arc, ctx: &RequestContext, ) -> Result<(), FlushLayerError> { + debug_assert_current_span_has_tenant_and_timeline_id(); // As a special case, when we have just imported an image into the repository, // instead of writing out a L0 delta layer, we directly write out image layer // files instead. This is possible as long as *all* the data imported into the @@ -2868,7 +3150,7 @@ impl Timeline { // The new on-disk layers are now in the layer map. 
We can remove the // in-memory layer from the map now. The flushed layer is stored in // the mapping in `create_delta_layer`. - let metadata = { + { let mut guard = self.layers.write().await; if self.cancel.is_cancelled() { @@ -2882,9 +3164,7 @@ impl Timeline { self.disk_consistent_lsn.store(disk_consistent_lsn); // Schedule remote uploads that will reflect our new disk_consistent_lsn - Some(self.schedule_uploads(disk_consistent_lsn, layers_to_upload)?) - } else { - None + self.schedule_uploads(disk_consistent_lsn, layers_to_upload)?; } // release lock on 'layers' }; @@ -2899,22 +3179,6 @@ impl Timeline { // This failpoint is used by another test case `test_pageserver_recovery`. fail_point!("flush-frozen-exit"); - // Update the metadata file, with new 'disk_consistent_lsn' - // - // TODO: This perhaps should be done in 'flush_frozen_layers', after flushing - // *all* the layers, to avoid fsyncing the file multiple times. - - // If we updated our disk_consistent_lsn, persist the updated metadata to local disk. 
- if let Some(metadata) = metadata { - save_metadata( - self.conf, - &self.tenant_shard_id, - &self.timeline_id, - &metadata, - ) - .await - .context("save_metadata")?; - } Ok(()) } @@ -2970,25 +3234,6 @@ impl Timeline { Ok(metadata) } - async fn update_metadata_file( - &self, - disk_consistent_lsn: Lsn, - layers_to_upload: impl IntoIterator, - ) -> anyhow::Result<()> { - let metadata = self.schedule_uploads(disk_consistent_lsn, layers_to_upload)?; - - save_metadata( - self.conf, - &self.tenant_shard_id, - &self.timeline_id, - &metadata, - ) - .await - .context("save_metadata")?; - - Ok(()) - } - pub(crate) async fn preserve_initdb_archive(&self) -> anyhow::Result<()> { if let Some(remote_client) = &self.remote_client { remote_client @@ -3011,53 +3256,48 @@ impl Timeline { frozen_layer: &Arc, ctx: &RequestContext, ) -> anyhow::Result { - let span = tracing::info_span!("blocking"); - let new_delta: ResidentLayer = tokio::task::spawn_blocking({ - let self_clone = Arc::clone(self); - let frozen_layer = Arc::clone(frozen_layer); - let ctx = ctx.attached_child(); - move || { - // Write it out - // Keep this inside `spawn_blocking` and `Handle::current` - // as long as the write path is still sync and the read impl - // is still not fully async. Otherwise executor threads would - // be blocked. - let _g = span.entered(); - let new_delta = - Handle::current().block_on(frozen_layer.write_to_disk(&self_clone, &ctx))?; - let new_delta_path = new_delta.local_path().to_owned(); - - // Sync it to disk. - // - // We must also fsync the timeline dir to ensure the directory entries for - // new layer files are durable. - // - // NB: timeline dir must be synced _after_ the file contents are durable. - // So, two separate fsyncs are required, they mustn't be batched. - // - // TODO: If we're running inside 'flush_frozen_layers' and there are multiple - // files to flush, the fsync overhead can be reduces as follows: - // 1. write them all to temporary file names - // 2. 
fsync them - // 3. rename to the final name - // 4. fsync the parent directory. - // Note that (1),(2),(3) today happen inside write_to_disk(). - // - // FIXME: the writer already fsyncs all data, only rename needs to be fsynced here - par_fsync::par_fsync(&[new_delta_path]).context("fsync of delta layer")?; - par_fsync::par_fsync(&[self_clone + let self_clone = Arc::clone(self); + let frozen_layer = Arc::clone(frozen_layer); + let ctx = ctx.attached_child(); + let work = async move { + let new_delta = frozen_layer.write_to_disk(&self_clone, &ctx).await?; + // The write_to_disk() above calls writer.finish() which already did the fsync of the inodes. + // We just need to fsync the directory in which these inodes are linked, + // which we know to be the timeline directory. + // + // We use fatal_err() below because after write_to_disk returns with success, + // the in-memory state of the filesystem already has the layer file in its final place, + // and subsequent pageserver code could think it's durable while it really isn't. + let timeline_dir = VirtualFile::open( + &self_clone .conf - .timeline_path(&self_clone.tenant_shard_id, &self_clone.timeline_id)]) - .context("fsync of timeline dir")?; - - anyhow::Ok(new_delta) + .timeline_path(&self_clone.tenant_shard_id, &self_clone.timeline_id), + ) + .await + .fatal_err("VirtualFile::open for timeline dir fsync"); + timeline_dir + .sync_all() + .await + .fatal_err("VirtualFile::sync_all timeline dir"); + anyhow::Ok(new_delta) + }; + // Before tokio-epoll-uring, we ran write_to_disk & the sync_all inside spawn_blocking. + // Preserve that behavior to maintain the same behavior for `virtual_file_io_engine=std-fs`. 
+ use crate::virtual_file::io_engine::IoEngine; + match crate::virtual_file::io_engine::get() { + IoEngine::NotSet => panic!("io engine not set"), + IoEngine::StdFs => { + let span = tracing::info_span!("blocking"); + tokio::task::spawn_blocking({ + move || Handle::current().block_on(work.instrument(span)) + }) + .await + .context("spawn_blocking") + .and_then(|x| x) } - }) - .await - .context("spawn_blocking") - .and_then(|x| x)?; - - Ok(new_delta) + #[cfg(target_os = "linux")] + IoEngine::TokioEpollUring => work.await, + } } async fn repartition( @@ -3067,30 +3307,34 @@ impl Timeline { flags: EnumSet, ctx: &RequestContext, ) -> anyhow::Result<(KeyPartitioning, Lsn)> { - { - let partitioning_guard = self.partitioning.lock().unwrap(); - let distance = lsn.0 - partitioning_guard.1 .0; - if partitioning_guard.1 != Lsn(0) - && distance <= self.repartition_threshold - && !flags.contains(CompactFlags::ForceRepartition) - { - debug!( - distance, - threshold = self.repartition_threshold, - "no repartitioning needed" - ); - return Ok((partitioning_guard.0.clone(), partitioning_guard.1)); - } + let Ok(mut partitioning_guard) = self.partitioning.try_lock() else { + // NB: there are two callers, one is the compaction task, of which there is only one per struct Tenant and hence Timeline. + // The other is the initdb optimization in flush_frozen_layer, used by `bootstrap_timeline`, which runs before `.activate()` + // and hence before the compaction task starts. 
+ anyhow::bail!("repartition() called concurrently, this should not happen"); + }; + if lsn < partitioning_guard.1 { + anyhow::bail!("repartition() called with LSN going backwards, this should not happen"); } + + let distance = lsn.0 - partitioning_guard.1 .0; + if partitioning_guard.1 != Lsn(0) + && distance <= self.repartition_threshold + && !flags.contains(CompactFlags::ForceRepartition) + { + debug!( + distance, + threshold = self.repartition_threshold, + "no repartitioning needed" + ); + return Ok((partitioning_guard.0.clone(), partitioning_guard.1)); + } + let keyspace = self.collect_keyspace(lsn, ctx).await?; let partitioning = keyspace.partition(partition_size); - let mut partitioning_guard = self.partitioning.lock().unwrap(); - if lsn > partitioning_guard.1 { - *partitioning_guard = (partitioning, lsn); - } else { - warn!("Concurrent repartitioning of keyspace. This unexpected, but probably harmless"); - } + *partitioning_guard = (partitioning, lsn); + Ok((partitioning_guard.0.clone(), partitioning_guard.1)) } @@ -3102,31 +3346,6 @@ impl Timeline { let layers = guard.layer_map(); let mut max_deltas = 0; - { - let wanted_image_layers = self.wanted_image_layers.lock().unwrap(); - if let Some((cutoff_lsn, wanted)) = &*wanted_image_layers { - let img_range = - partition.ranges.first().unwrap().start..partition.ranges.last().unwrap().end; - if wanted.overlaps(&img_range) { - // - // gc_timeline only pays attention to image layers that are older than the GC cutoff, - // but create_image_layers creates image layers at last-record-lsn. - // So it's possible that gc_timeline wants a new image layer to be created for a key range, - // but the range is already covered by image layers at more recent LSNs. Before we - // create a new image layer, check if the range is already covered at more recent LSNs. 
- if !layers - .image_layer_exists(&img_range, &(Lsn::min(lsn, *cutoff_lsn)..lsn + 1)) - { - debug!( - "Force generation of layer {}-{} wanted by GC, cutoff={}, lsn={})", - img_range.start, img_range.end, cutoff_lsn, lsn - ); - return true; - } - } - } - } - for part_range in &partition.ranges { let image_coverage = layers.image_coverage(part_range, lsn); for (img_range, last_img) in image_coverage { @@ -3194,123 +3413,128 @@ impl Timeline { for partition in partitioning.parts.iter() { let img_range = start..partition.ranges.last().unwrap().end; - start = img_range.end; - if force || self.time_for_new_image_layer(partition, lsn).await { - let mut image_layer_writer = ImageLayerWriter::new( - self.conf, - self.timeline_id, - self.tenant_shard_id, - &img_range, - lsn, - ) - .await?; + if !force && !self.time_for_new_image_layer(partition, lsn).await { + start = img_range.end; + continue; + } - fail_point!("image-layer-writer-fail-before-finish", |_| { - Err(CreateImageLayersError::Other(anyhow::anyhow!( - "failpoint image-layer-writer-fail-before-finish" - ))) - }); + let mut image_layer_writer = ImageLayerWriter::new( + self.conf, + self.timeline_id, + self.tenant_shard_id, + &img_range, + lsn, + ) + .await?; - let mut key_request_accum = KeySpaceAccum::new(); - for range in &partition.ranges { - let mut key = range.start; - while key < range.end { - if self.shard_identity.is_key_disposable(&key) { - debug!( - "Dropping key {} during compaction (it belongs on shard {:?})", - key, - self.shard_identity.get_shard_number(&key) - ); - key = key.next(); - continue; - } + fail_point!("image-layer-writer-fail-before-finish", |_| { + Err(CreateImageLayersError::Other(anyhow::anyhow!( + "failpoint image-layer-writer-fail-before-finish" + ))) + }); + let mut wrote_keys = false; + + let mut key_request_accum = KeySpaceAccum::new(); + for range in &partition.ranges { + let mut key = range.start; + while key < range.end { + // Decide whether to retain this key: usually we do, 
but sharded tenants may + // need to drop keys that don't belong to them. If we retain the key, add it + // to `key_request_accum` for later issuing a vectored get + if self.shard_identity.is_key_disposable(&key) { + debug!( + "Dropping key {} during compaction (it belongs on shard {:?})", + key, + self.shard_identity.get_shard_number(&key) + ); + } else { key_request_accum.add_key(key); - if key_request_accum.size() >= Timeline::MAX_GET_VECTORED_KEYS - || key.next() == range.end - { - let results = self - .get_vectored( - &key_request_accum.consume_keyspace().ranges, - lsn, - ctx, - ) - .await?; + } - for (img_key, img) in results { - let img = match img { - Ok(img) => img, - Err(err) => { - // If we fail to reconstruct a VM or FSM page, we can zero the - // page without losing any actual user data. That seems better - // than failing repeatedly and getting stuck. - // - // We had a bug at one point, where we truncated the FSM and VM - // in the pageserver, but the Postgres didn't know about that - // and continued to generate incremental WAL records for pages - // that didn't exist in the pageserver. Trying to replay those - // WAL records failed to find the previous image of the page. - // This special case allows us to recover from that situation. - // See https://github.com/neondatabase/neon/issues/2601. - // - // Unfortunately we cannot do this for the main fork, or for - // any metadata keys, keys, as that would lead to actual data - // loss. 
- if is_rel_fsm_block_key(img_key) - || is_rel_vm_block_key(img_key) - { - warn!("could not reconstruct FSM or VM key {img_key}, filling with zeros: {err:?}"); - ZERO_PAGE.clone() - } else { - return Err( - CreateImageLayersError::PageReconstructError(err), - ); - } + let last_key_in_range = key.next() == range.end; + key = key.next(); + + // Maybe flush `key_request_accum` + if key_request_accum.size() >= Timeline::MAX_GET_VECTORED_KEYS + || last_key_in_range + { + let results = self + .get_vectored(key_request_accum.consume_keyspace(), lsn, ctx) + .await?; + + for (img_key, img) in results { + let img = match img { + Ok(img) => img, + Err(err) => { + // If we fail to reconstruct a VM or FSM page, we can zero the + // page without losing any actual user data. That seems better + // than failing repeatedly and getting stuck. + // + // We had a bug at one point, where we truncated the FSM and VM + // in the pageserver, but the Postgres didn't know about that + // and continued to generate incremental WAL records for pages + // that didn't exist in the pageserver. Trying to replay those + // WAL records failed to find the previous image of the page. + // This special case allows us to recover from that situation. + // See https://github.com/neondatabase/neon/issues/2601. + // + // Unfortunately we cannot do this for the main fork, or for + // any metadata keys, as that would lead to actual data + // loss. + if is_rel_fsm_block_key(img_key) || is_rel_vm_block_key(img_key) + { + warn!("could not reconstruct FSM or VM key {img_key}, filling with zeros: {err:?}"); + ZERO_PAGE.clone() + } else { + return Err(CreateImageLayersError::PageReconstructError( + err, + )); } - }; + } + }; - image_layer_writer.put_image(img_key, &img).await?; - } + // Write all the keys we just read into our new image layer. 
+ image_layer_writer.put_image(img_key, img).await?; + wrote_keys = true; } - - key = key.next(); } } + } + + if wrote_keys { + // Normal path: we have written some data into the new image layer for this + // partition, so flush it to disk. + start = img_range.end; let image_layer = image_layer_writer.finish(self).await?; image_layers.push(image_layer); + } else { + // Special case: the image layer may be empty if this is a sharded tenant and the + // partition does not cover any keys owned by this shard. In this case, to ensure + // we don't leave gaps between image layers, leave `start` where it is, so that the next + // layer we write will cover the key range that we just scanned. + tracing::debug!("no data in range {}-{}", img_range.start, img_range.end); } } - // All layers that the GC wanted us to create have now been created. - // - // It's possible that another GC cycle happened while we were compacting, and added - // something new to wanted_image_layers, and we now clear that before processing it. - // That's OK, because the next GC iteration will put it back in. - *self.wanted_image_layers.lock().unwrap() = None; - // Sync the new layer to disk before adding it to the layer map, to make sure - // we don't garbage collect something based on the new layer, before it has - // reached the disk. - // - // We must also fsync the timeline dir to ensure the directory entries for - // new layer files are durable - // - // Compaction creates multiple image layers. It would be better to create them all - // and fsync them all in parallel. - let all_paths = image_layers - .iter() - .map(|layer| layer.local_path().to_owned()) - .collect::>(); - - par_fsync::par_fsync_async(&all_paths) + // The writer.finish() above already did the fsync of the inodes. + // We just need to fsync the directory in which these inodes are linked, + // which we know to be the timeline directory. 
+ if !image_layers.is_empty() { + // We use fatal_err() below because after writer.finish() returns with success, + // the in-memory state of the filesystem already has the layer file in its final place, + // and subsequent pageserver code could think it's durable while it really isn't. + let timeline_dir = VirtualFile::open( + &self + .conf + .timeline_path(&self.tenant_shard_id, &self.timeline_id), + ) .await - .context("fsync of newly created layer files")?; - - if !all_paths.is_empty() { - par_fsync::par_fsync_async(&[self - .conf - .timeline_path(&self.tenant_shard_id, &self.timeline_id)]) - .await - .context("fsync of timeline dir")?; + .fatal_err("VirtualFile::open for timeline dir fsync"); + timeline_dir + .sync_all() + .await + .fatal_err("VirtualFile::sync_all timeline dir"); } let mut guard = self.layers.write().await; @@ -3351,12 +3575,6 @@ impl Timeline { } } -#[derive(Default)] -struct CompactLevel0Phase1Result { - new_layers: Vec, - deltas_to_compact: Vec, -} - /// Top-level failure to compact. 
#[derive(Debug, thiserror::Error)] pub(crate) enum CompactionError { @@ -3367,6 +3585,18 @@ pub(crate) enum CompactionError { Other(#[from] anyhow::Error), } +impl From for CompactionError { + fn from(err: CollectKeySpaceError) -> Self { + match err { + CollectKeySpaceError::Cancelled + | CollectKeySpaceError::PageRead(PageReconstructError::Cancelled) => { + CompactionError::ShuttingDown + } + e => CompactionError::Other(e.into()), + } + } +} + #[serde_as] #[derive(serde::Serialize)] struct RecordedDuration(#[serde_as(as = "serde_with::DurationMicroSeconds")] Duration); @@ -3379,7 +3609,7 @@ enum DurationRecorder { } impl DurationRecorder { - pub fn till_now(&self) -> DurationRecorder { + fn till_now(&self) -> DurationRecorder { match self { DurationRecorder::NotStarted => { panic!("must only call on recorded measurements") @@ -3390,7 +3620,7 @@ impl DurationRecorder { } } } - pub fn into_recorded(self) -> Option { + fn into_recorded(self) -> Option { match self { DurationRecorder::NotStarted => None, DurationRecorder::Recorded(recorded, _) => Some(recorded), @@ -3398,580 +3628,20 @@ impl DurationRecorder { } } -#[derive(Default)] -struct CompactLevel0Phase1StatsBuilder { - version: Option, - tenant_id: Option, - timeline_id: Option, - read_lock_acquisition_micros: DurationRecorder, - read_lock_held_spawn_blocking_startup_micros: DurationRecorder, - read_lock_held_key_sort_micros: DurationRecorder, - read_lock_held_prerequisites_micros: DurationRecorder, - read_lock_held_compute_holes_micros: DurationRecorder, - read_lock_drop_micros: DurationRecorder, - write_layer_files_micros: DurationRecorder, - level0_deltas_count: Option, - new_deltas_count: Option, - new_deltas_size: Option, -} - -#[derive(serde::Serialize)] -struct CompactLevel0Phase1Stats { - version: u64, - tenant_id: TenantShardId, - timeline_id: TimelineId, - read_lock_acquisition_micros: RecordedDuration, - read_lock_held_spawn_blocking_startup_micros: RecordedDuration, - 
read_lock_held_key_sort_micros: RecordedDuration, - read_lock_held_prerequisites_micros: RecordedDuration, - read_lock_held_compute_holes_micros: RecordedDuration, - read_lock_drop_micros: RecordedDuration, - write_layer_files_micros: RecordedDuration, - level0_deltas_count: usize, - new_deltas_count: usize, - new_deltas_size: u64, -} - -impl TryFrom for CompactLevel0Phase1Stats { - type Error = anyhow::Error; - - fn try_from(value: CompactLevel0Phase1StatsBuilder) -> Result { - Ok(Self { - version: value.version.ok_or_else(|| anyhow!("version not set"))?, - tenant_id: value - .tenant_id - .ok_or_else(|| anyhow!("tenant_id not set"))?, - timeline_id: value - .timeline_id - .ok_or_else(|| anyhow!("timeline_id not set"))?, - read_lock_acquisition_micros: value - .read_lock_acquisition_micros - .into_recorded() - .ok_or_else(|| anyhow!("read_lock_acquisition_micros not set"))?, - read_lock_held_spawn_blocking_startup_micros: value - .read_lock_held_spawn_blocking_startup_micros - .into_recorded() - .ok_or_else(|| anyhow!("read_lock_held_spawn_blocking_startup_micros not set"))?, - read_lock_held_key_sort_micros: value - .read_lock_held_key_sort_micros - .into_recorded() - .ok_or_else(|| anyhow!("read_lock_held_key_sort_micros not set"))?, - read_lock_held_prerequisites_micros: value - .read_lock_held_prerequisites_micros - .into_recorded() - .ok_or_else(|| anyhow!("read_lock_held_prerequisites_micros not set"))?, - read_lock_held_compute_holes_micros: value - .read_lock_held_compute_holes_micros - .into_recorded() - .ok_or_else(|| anyhow!("read_lock_held_compute_holes_micros not set"))?, - read_lock_drop_micros: value - .read_lock_drop_micros - .into_recorded() - .ok_or_else(|| anyhow!("read_lock_drop_micros not set"))?, - write_layer_files_micros: value - .write_layer_files_micros - .into_recorded() - .ok_or_else(|| anyhow!("write_layer_files_micros not set"))?, - level0_deltas_count: value - .level0_deltas_count - .ok_or_else(|| anyhow!("level0_deltas_count not 
set"))?, - new_deltas_count: value - .new_deltas_count - .ok_or_else(|| anyhow!("new_deltas_count not set"))?, - new_deltas_size: value - .new_deltas_size - .ok_or_else(|| anyhow!("new_deltas_size not set"))?, - }) - } -} - impl Timeline { - /// Level0 files first phase of compaction, explained in the [`Self::compact`] comment. - async fn compact_level0_phase1( + async fn finish_compact_batch( self: &Arc, - guard: tokio::sync::OwnedRwLockReadGuard, - mut stats: CompactLevel0Phase1StatsBuilder, - target_file_size: u64, - ctx: &RequestContext, - ) -> Result { - stats.read_lock_held_spawn_blocking_startup_micros = - stats.read_lock_acquisition_micros.till_now(); // set by caller - let layers = guard.layer_map(); - let level0_deltas = layers.get_level0_deltas()?; - let mut level0_deltas = level0_deltas - .into_iter() - .map(|x| guard.get_from_desc(&x)) - .collect_vec(); - stats.level0_deltas_count = Some(level0_deltas.len()); - // Only compact if enough layers have accumulated. - let threshold = self.get_compaction_threshold(); - if level0_deltas.is_empty() || level0_deltas.len() < threshold { - debug!( - level0_deltas = level0_deltas.len(), - threshold, "too few deltas to compact" - ); - return Ok(CompactLevel0Phase1Result::default()); - } - - // This failpoint is used together with `test_duplicate_layers` integration test. - // It returns the compaction result exactly the same layers as input to compaction. - // We want to ensure that this will not cause any problem when updating the layer map - // after the compaction is finished. - // - // Currently, there are two rare edge cases that will cause duplicated layers being - // inserted. - // 1. The compaction job is inturrupted / did not finish successfully. Assume we have file 1, 2, 3, 4, which - // is compacted to 5, but the page server is shut down, next time we start page server we will get a layer - // map containing 1, 2, 3, 4, and 5, whereas 5 has the same content as 4. 
If we trigger L0 compation at this - // point again, it is likely that we will get a file 6 which has the same content and the key range as 5, - // and this causes an overwrite. This is acceptable because the content is the same, and we should do a - // layer replace instead of the normal remove / upload process. - // 2. The input workload pattern creates exactly n files that are sorted, non-overlapping and is of target file - // size length. Compaction will likely create the same set of n files afterwards. - // - // This failpoint is a superset of both of the cases. - if cfg!(feature = "testing") { - let active = (|| { - ::fail::fail_point!("compact-level0-phase1-return-same", |_| true); - false - })(); - - if active { - let mut new_layers = Vec::with_capacity(level0_deltas.len()); - for delta in &level0_deltas { - // we are just faking these layers as being produced again for this failpoint - new_layers.push( - delta - .download_and_keep_resident() - .await - .context("download layer for failpoint")?, - ); - } - tracing::info!("compact-level0-phase1-return-same"); // so that we can check if we hit the failpoint - return Ok(CompactLevel0Phase1Result { - new_layers, - deltas_to_compact: level0_deltas, - }); - } - } - - // Gather the files to compact in this iteration. - // - // Start with the oldest Level 0 delta file, and collect any other - // level 0 files that form a contiguous sequence, such that the end - // LSN of previous file matches the start LSN of the next file. - // - // Note that if the files don't form such a sequence, we might - // "compact" just a single file. That's a bit pointless, but it allows - // us to get rid of the level 0 file, and compact the other files on - // the next iteration. This could probably made smarter, but such - // "gaps" in the sequence of level 0 files should only happen in case - // of a crash, partial download from cloud storage, or something like - // that, so it's not a big deal in practice. 
- level0_deltas.sort_by_key(|l| l.layer_desc().lsn_range.start); - let mut level0_deltas_iter = level0_deltas.iter(); - - let first_level0_delta = level0_deltas_iter.next().unwrap(); - let mut prev_lsn_end = first_level0_delta.layer_desc().lsn_range.end; - let mut deltas_to_compact = Vec::with_capacity(level0_deltas.len()); - - deltas_to_compact.push(first_level0_delta.download_and_keep_resident().await?); - for l in level0_deltas_iter { - let lsn_range = &l.layer_desc().lsn_range; - - if lsn_range.start != prev_lsn_end { - break; - } - deltas_to_compact.push(l.download_and_keep_resident().await?); - prev_lsn_end = lsn_range.end; - } - let lsn_range = Range { - start: deltas_to_compact - .first() - .unwrap() - .layer_desc() - .lsn_range - .start, - end: deltas_to_compact.last().unwrap().layer_desc().lsn_range.end, - }; - - info!( - "Starting Level0 compaction in LSN range {}-{} for {} layers ({} deltas in total)", - lsn_range.start, - lsn_range.end, - deltas_to_compact.len(), - level0_deltas.len() - ); - - for l in deltas_to_compact.iter() { - info!("compact includes {l}"); - } - - // We don't need the original list of layers anymore. Drop it so that - // we don't accidentally use it later in the function. - drop(level0_deltas); - - stats.read_lock_held_prerequisites_micros = stats - .read_lock_held_spawn_blocking_startup_micros - .till_now(); - - // Determine N largest holes where N is number of compacted layers. - let max_holes = deltas_to_compact.len(); - let last_record_lsn = self.get_last_record_lsn(); - let min_hole_range = (target_file_size / page_cache::PAGE_SZ as u64) as i128; - let min_hole_coverage_size = 3; // TODO: something more flexible? 
- - // min-heap (reserve space for one more element added before eviction) - let mut heap: BinaryHeap = BinaryHeap::with_capacity(max_holes + 1); - let mut prev: Option = None; - - let mut all_keys = Vec::new(); - - for l in deltas_to_compact.iter() { - all_keys.extend(l.load_keys(ctx).await?); - } - - // FIXME: should spawn_blocking the rest of this function - - // The current stdlib sorting implementation is designed in a way where it is - // particularly fast where the slice is made up of sorted sub-ranges. - all_keys.sort_by_key(|DeltaEntry { key, lsn, .. }| (*key, *lsn)); - - stats.read_lock_held_key_sort_micros = stats.read_lock_held_prerequisites_micros.till_now(); - - for &DeltaEntry { key: next_key, .. } in all_keys.iter() { - if let Some(prev_key) = prev { - // just first fast filter - if next_key.to_i128() - prev_key.to_i128() >= min_hole_range { - let key_range = prev_key..next_key; - // Measuring hole by just subtraction of i128 representation of key range boundaries - // has not so much sense, because largest holes will corresponds field1/field2 changes. - // But we are mostly interested to eliminate holes which cause generation of excessive image layers. - // That is why it is better to measure size of hole as number of covering image layers. 
- let coverage_size = layers.image_coverage(&key_range, last_record_lsn).len(); - if coverage_size >= min_hole_coverage_size { - heap.push(Hole { - key_range, - coverage_size, - }); - if heap.len() > max_holes { - heap.pop(); // remove smallest hole - } - } - } - } - prev = Some(next_key.next()); - } - stats.read_lock_held_compute_holes_micros = stats.read_lock_held_key_sort_micros.till_now(); - drop_rlock(guard); - stats.read_lock_drop_micros = stats.read_lock_held_compute_holes_micros.till_now(); - let mut holes = heap.into_vec(); - holes.sort_unstable_by_key(|hole| hole.key_range.start); - let mut next_hole = 0; // index of next hole in holes vector - - // This iterator walks through all key-value pairs from all the layers - // we're compacting, in key, LSN order. - let all_values_iter = all_keys.iter(); - - // This iterator walks through all keys and is needed to calculate size used by each key - let mut all_keys_iter = all_keys - .iter() - .map(|DeltaEntry { key, lsn, size, .. }| (*key, *lsn, *size)) - .coalesce(|mut prev, cur| { - // Coalesce keys that belong to the same key pair. - // This ensures that compaction doesn't put them - // into different layer files. - // Still limit this by the target file size, - // so that we keep the size of the files in - // check. - if prev.0 == cur.0 && prev.2 < target_file_size { - prev.2 += cur.2; - Ok(prev) - } else { - Err((prev, cur)) - } - }); - - // Merge the contents of all the input delta layers into a new set - // of delta layers, based on the current partitioning. - // - // We split the new delta layers on the key dimension. We iterate through the key space, and for each key, check if including the next key to the current output layer we're building would cause the layer to become too large. If so, dump the current output layer and start new one. - // It's possible that there is a single key with so many page versions that storing all of them in a single layer file - // would be too large. 
In that case, we also split on the LSN dimension. - // - // LSN - // ^ - // | - // | +-----------+ +--+--+--+--+ - // | | | | | | | | - // | +-----------+ | | | | | - // | | | | | | | | - // | +-----------+ ==> | | | | | - // | | | | | | | | - // | +-----------+ | | | | | - // | | | | | | | | - // | +-----------+ +--+--+--+--+ - // | - // +--------------> key - // - // - // If one key (X) has a lot of page versions: - // - // LSN - // ^ - // | (X) - // | +-----------+ +--+--+--+--+ - // | | | | | | | | - // | +-----------+ | | +--+ | - // | | | | | | | | - // | +-----------+ ==> | | | | | - // | | | | | +--+ | - // | +-----------+ | | | | | - // | | | | | | | | - // | +-----------+ +--+--+--+--+ - // | - // +--------------> key - // TODO: this actually divides the layers into fixed-size chunks, not - // based on the partitioning. - // - // TODO: we should also opportunistically materialize and - // garbage collect what we can. - let mut new_layers = Vec::new(); - let mut prev_key: Option = None; - let mut writer: Option = None; - let mut key_values_total_size = 0u64; - let mut dup_start_lsn: Lsn = Lsn::INVALID; // start LSN of layer containing values of the single key - let mut dup_end_lsn: Lsn = Lsn::INVALID; // end LSN of layer containing values of the single key - - for &DeltaEntry { - key, lsn, ref val, .. - } in all_values_iter - { - let value = val.load(ctx).await?; - let same_key = prev_key.map_or(false, |prev_key| prev_key == key); - // We need to check key boundaries once we reach next key or end of layer with the same key - if !same_key || lsn == dup_end_lsn { - let mut next_key_size = 0u64; - let is_dup_layer = dup_end_lsn.is_valid(); - dup_start_lsn = Lsn::INVALID; - if !same_key { - dup_end_lsn = Lsn::INVALID; - } - // Determine size occupied by this key. 
We stop at next key or when size becomes larger than target_file_size - for (next_key, next_lsn, next_size) in all_keys_iter.by_ref() { - next_key_size = next_size; - if key != next_key { - if dup_end_lsn.is_valid() { - // We are writting segment with duplicates: - // place all remaining values of this key in separate segment - dup_start_lsn = dup_end_lsn; // new segments starts where old stops - dup_end_lsn = lsn_range.end; // there are no more values of this key till end of LSN range - } - break; - } - key_values_total_size += next_size; - // Check if it is time to split segment: if total keys size is larger than target file size. - // We need to avoid generation of empty segments if next_size > target_file_size. - if key_values_total_size > target_file_size && lsn != next_lsn { - // Split key between multiple layers: such layer can contain only single key - dup_start_lsn = if dup_end_lsn.is_valid() { - dup_end_lsn // new segment with duplicates starts where old one stops - } else { - lsn // start with the first LSN for this key - }; - dup_end_lsn = next_lsn; // upper LSN boundary is exclusive - break; - } - } - // handle case when loop reaches last key: in this case dup_end is non-zero but dup_start is not set. - if dup_end_lsn.is_valid() && !dup_start_lsn.is_valid() { - dup_start_lsn = dup_end_lsn; - dup_end_lsn = lsn_range.end; - } - if writer.is_some() { - let written_size = writer.as_mut().unwrap().size(); - let contains_hole = - next_hole < holes.len() && key >= holes[next_hole].key_range.end; - // check if key cause layer overflow or contains hole... - if is_dup_layer - || dup_end_lsn.is_valid() - || written_size + key_values_total_size > target_file_size - || contains_hole - { - // ... 
if so, flush previous layer and prepare to write new one - new_layers.push( - writer - .take() - .unwrap() - .finish(prev_key.unwrap().next(), self) - .await?, - ); - writer = None; - - if contains_hole { - // skip hole - next_hole += 1; - } - } - } - // Remember size of key value because at next iteration we will access next item - key_values_total_size = next_key_size; - } - if writer.is_none() { - // Create writer if not initiaized yet - writer = Some( - DeltaLayerWriter::new( - self.conf, - self.timeline_id, - self.tenant_shard_id, - key, - if dup_end_lsn.is_valid() { - // this is a layer containing slice of values of the same key - debug!("Create new dup layer {}..{}", dup_start_lsn, dup_end_lsn); - dup_start_lsn..dup_end_lsn - } else { - debug!("Create new layer {}..{}", lsn_range.start, lsn_range.end); - lsn_range.clone() - }, - ) - .await?, - ); - } - - fail_point!("delta-layer-writer-fail-before-finish", |_| { - Err(CompactionError::Other(anyhow::anyhow!( - "failpoint delta-layer-writer-fail-before-finish" - ))) - }); - - if !self.shard_identity.is_key_disposable(&key) { - writer.as_mut().unwrap().put_value(key, lsn, value).await?; - } else { - debug!( - "Dropping key {} during compaction (it belongs on shard {:?})", - key, - self.shard_identity.get_shard_number(&key) - ); - } - - if !new_layers.is_empty() { - fail_point!("after-timeline-compacted-first-L1"); - } - - prev_key = Some(key); - } - if let Some(writer) = writer { - new_layers.push(writer.finish(prev_key.unwrap().next(), self).await?); - } - - // Sync layers - if !new_layers.is_empty() { - // Print a warning if the created layer is larger than double the target size - // Add two pages for potential overhead. This should in theory be already - // accounted for in the target calculation, but for very small targets, - // we still might easily hit the limit otherwise. 
- let warn_limit = target_file_size * 2 + page_cache::PAGE_SZ as u64 * 2; - for layer in new_layers.iter() { - if layer.layer_desc().file_size > warn_limit { - warn!( - %layer, - "created delta file of size {} larger than double of target of {target_file_size}", layer.layer_desc().file_size - ); - } - } - - // FIXME: the writer already fsyncs all data, only rename needs to be fsynced here - let layer_paths: Vec = new_layers - .iter() - .map(|l| l.local_path().to_owned()) - .collect(); - - // Fsync all the layer files and directory using multiple threads to - // minimize latency. - par_fsync::par_fsync_async(&layer_paths) - .await - .context("fsync all new layers")?; - - let timeline_dir = self - .conf - .timeline_path(&self.tenant_shard_id, &self.timeline_id); - - par_fsync::par_fsync_async(&[timeline_dir]) - .await - .context("fsync of timeline dir")?; - } - - stats.write_layer_files_micros = stats.read_lock_drop_micros.till_now(); - stats.new_deltas_count = Some(new_layers.len()); - stats.new_deltas_size = Some(new_layers.iter().map(|l| l.layer_desc().file_size).sum()); - - match TryInto::::try_into(stats) - .and_then(|stats| serde_json::to_string(&stats).context("serde_json::to_string")) - { - Ok(stats_json) => { - info!( - stats_json = stats_json.as_str(), - "compact_level0_phase1 stats available" - ) - } - Err(e) => { - warn!("compact_level0_phase1 stats failed to serialize: {:#}", e); - } - } - - Ok(CompactLevel0Phase1Result { - new_layers, - deltas_to_compact: deltas_to_compact - .into_iter() - .map(|x| x.drop_eviction_guard()) - .collect::>(), - }) - } - - /// - /// Collect a bunch of Level 0 layer files, and compact and reshuffle them as - /// as Level 1 files. 
- /// - async fn compact_level0( - self: &Arc, - target_file_size: u64, - ctx: &RequestContext, - ) -> Result<(), CompactionError> { - let CompactLevel0Phase1Result { - new_layers, - deltas_to_compact, - } = { - let phase1_span = info_span!("compact_level0_phase1"); - let ctx = ctx.attached_child(); - let mut stats = CompactLevel0Phase1StatsBuilder { - version: Some(2), - tenant_id: Some(self.tenant_shard_id), - timeline_id: Some(self.timeline_id), - ..Default::default() - }; - - let begin = tokio::time::Instant::now(); - let phase1_layers_locked = Arc::clone(&self.layers).read_owned().await; - let now = tokio::time::Instant::now(); - stats.read_lock_acquisition_micros = - DurationRecorder::Recorded(RecordedDuration(now - begin), now); - self.compact_level0_phase1(phase1_layers_locked, stats, target_file_size, &ctx) - .instrument(phase1_span) - .await? - }; - - if new_layers.is_empty() && deltas_to_compact.is_empty() { - // nothing to do - return Ok(()); - } - + new_deltas: &[ResidentLayer], + new_images: &[ResidentLayer], + layers_to_remove: &[Layer], + ) -> anyhow::Result<()> { let mut guard = self.layers.write().await; let mut duplicated_layers = HashSet::new(); - let mut insert_layers = Vec::with_capacity(new_layers.len()); + let mut insert_layers = Vec::with_capacity(new_deltas.len()); - for l in &new_layers { + for l in new_deltas { if guard.contains(l.as_ref()) { // expected in tests tracing::error!(layer=%l, "duplicated L1 layer"); @@ -3982,24 +3652,28 @@ impl Timeline { // because we have not implemented L0 => L0 compaction. 
duplicated_layers.insert(l.layer_desc().key()); } else if LayerMap::is_l0(l.layer_desc()) { - return Err(CompactionError::Other(anyhow!("compaction generates a L0 layer file as output, which will cause infinite compaction."))); + bail!("compaction generates a L0 layer file as output, which will cause infinite compaction."); } else { insert_layers.push(l.clone()); } } - let remove_layers = { - let mut deltas_to_compact = deltas_to_compact; - // only remove those inputs which were not outputs - deltas_to_compact.retain(|l| !duplicated_layers.contains(&l.layer_desc().key())); - deltas_to_compact - }; + // only remove those inputs which were not outputs + let remove_layers: Vec = layers_to_remove + .iter() + .filter(|l| !duplicated_layers.contains(&l.layer_desc().key())) + .cloned() + .collect(); + + if !new_images.is_empty() { + guard.track_new_image_layers(new_images, &self.metrics); + } // deletion will happen later, the layer file manager calls garbage_collect_on_drop guard.finish_compact_l0(&remove_layers, &insert_layers, &self.metrics); if let Some(remote_client) = self.remote_client.as_ref() { - remote_client.schedule_compaction_update(&remove_layers, &new_layers)?; + remote_client.schedule_compaction_update(&remove_layers, new_deltas)?; } drop_wlock(guard); @@ -4195,7 +3869,6 @@ impl Timeline { debug!("retain_lsns: {:?}", retain_lsns); let mut layers_to_remove = Vec::new(); - let mut wanted_image_layers = KeySpaceRandomAccum::default(); // Scan all layers in the timeline (remote or on-disk). // @@ -4277,15 +3950,6 @@ impl Timeline { .image_layer_exists(&l.get_key_range(), &(l.get_lsn_range().end..new_gc_cutoff)) { debug!("keeping {} because it is the latest layer", l.filename()); - // Collect delta key ranges that need image layers to allow garbage - // collecting the layers. - // It is not so obvious whether we need to propagate information only about - // delta layers. Image layers can form "stairs" preventing old image from been deleted. 
- // But image layers are in any case less sparse than delta layers. Also we need some - // protection from replacing recent image layers with new one after each GC iteration. - if self.get_gc_feedback() && l.is_incremental() && !LayerMap::is_l0(&l) { - wanted_image_layers.add_range(l.get_key_range()); - } result.layers_not_updated += 1; continue 'outer; } @@ -4298,24 +3962,13 @@ impl Timeline { ); layers_to_remove.push(l); } - self.wanted_image_layers - .lock() - .unwrap() - .replace((new_gc_cutoff, wanted_image_layers.to_keyspace())); if !layers_to_remove.is_empty() { - // Persist the new GC cutoff value in the metadata file, before - // we actually remove anything. - // - // This does not in fact have any effect as we no longer consider local metadata unless - // running without remote storage. - // + // Persist the new GC cutoff value before we actually remove anything. // This unconditionally schedules also an index_part.json update, even though, we will // be doing one a bit later with the unlinked gc'd layers. - // - // TODO: remove when implementing . - self.update_metadata_file(self.disk_consistent_lsn.load(), None) - .await?; + let disk_consistent_lsn = self.disk_consistent_lsn.load(); + self.schedule_uploads(disk_consistent_lsn, None)?; let gc_layers = layers_to_remove .iter() @@ -4330,10 +3983,6 @@ impl Timeline { guard.finish_gc_timeline(&gc_layers); - if result.layers_removed != 0 { - fail_point!("after-timeline-gc-removed-layers"); - } - #[cfg(feature = "testing")] { result.doomed_layers = gc_layers; @@ -4402,6 +4051,9 @@ impl Timeline { let img = match self .walredo_mgr + .as_ref() + .context("timeline has no walredo manager") + .map_err(PageReconstructError::WalRedo)? 
.request_redo(key, request_lsn, data.img, data.records, self.pg_version) .await .context("reconstruct a page image") @@ -4590,7 +4242,9 @@ impl Timeline { } } - pub fn get_download_all_remote_layers_task_info(&self) -> Option { + pub(crate) fn get_download_all_remote_layers_task_info( + &self, + ) -> Option { self.download_all_remote_layers_task_info .read() .unwrap() @@ -4602,41 +4256,24 @@ impl Timeline { /// Returns non-remote layers for eviction. pub(crate) async fn get_local_layers_for_disk_usage_eviction(&self) -> DiskUsageEvictionInfo { let guard = self.layers.read().await; - let layers = guard.layer_map(); - let mut max_layer_size: Option = None; - let mut resident_layers = Vec::new(); - for l in layers.iter_historic_layers() { - let file_size = l.file_size(); - max_layer_size = max_layer_size.map_or(Some(file_size), |m| Some(m.max(file_size))); + let resident_layers = guard + .resident_layers() + .map(|layer| { + let file_size = layer.layer_desc().file_size; + max_layer_size = max_layer_size.map_or(Some(file_size), |m| Some(m.max(file_size))); - let l = guard.get_from_desc(&l); + let last_activity_ts = layer.access_stats().latest_activity_or_now(); - let l = match l.keep_resident().await { - Ok(Some(l)) => l, - Ok(None) => continue, - Err(e) => { - // these should not happen, but we cannot make them statically impossible right - // now. - tracing::warn!(layer=%l, "failed to keep the layer resident: {e:#}"); - continue; + EvictionCandidate { + layer: layer.into(), + last_activity_ts, + relative_last_activity: finite_f32::FiniteF32::ZERO, } - }; - - let last_activity_ts = l.access_stats().latest_activity().unwrap_or_else(|| { - // We only use this fallback if there's an implementation error. - // `latest_activity` already does rate-limited warn!() log. 
- debug!(layer=%l, "last_activity returns None, using SystemTime::now"); - SystemTime::now() - }); - - resident_layers.push(EvictionCandidate { - layer: l.drop_eviction_guard().into(), - last_activity_ts, - relative_last_activity: finite_f32::FiniteF32::ZERO, - }); - } + }) + .collect() + .await; DiskUsageEvictionInfo { max_layer_size, @@ -4682,13 +4319,43 @@ fn layer_traversal_error(msg: String, path: Vec) -> PageRecon PageReconstructError::from(msg) } +struct TimelineWriterState { + open_layer: Arc, + current_size: u64, + // Previous Lsn which passed through + prev_lsn: Option, + // Largest Lsn which passed through the current writer + max_lsn: Option, + // Cached details of the last freeze. Avoids going trough the atomic/lock on every put. + cached_last_freeze_at: Lsn, + cached_last_freeze_ts: Instant, +} + +impl TimelineWriterState { + fn new( + open_layer: Arc, + current_size: u64, + last_freeze_at: Lsn, + last_freeze_ts: Instant, + ) -> Self { + Self { + open_layer, + current_size, + prev_lsn: None, + max_lsn: None, + cached_last_freeze_at: last_freeze_at, + cached_last_freeze_ts: last_freeze_ts, + } + } +} + /// Various functions to mutate the timeline. // TODO Currently, Deref is used to allow easy access to read methods from this trait. // This is probably considered a bad practice in Rust and should be fixed eventually, // but will cause large code changes. 
-pub struct TimelineWriter<'a> { +pub(crate) struct TimelineWriter<'a> { tl: &'a Timeline, - _write_guard: tokio::sync::MutexGuard<'a, ()>, + write_guard: tokio::sync::MutexGuard<'a, Option>, } impl Deref for TimelineWriter<'_> { @@ -4699,31 +4366,239 @@ impl Deref for TimelineWriter<'_> { } } +impl Drop for TimelineWriter<'_> { + fn drop(&mut self) { + self.write_guard.take(); + } +} + +#[derive(PartialEq)] +enum OpenLayerAction { + Roll, + Open, + None, +} + impl<'a> TimelineWriter<'a> { /// Put a new page version that can be constructed from a WAL record /// /// This will implicitly extend the relation, if the page is beyond the /// current end-of-file. - pub async fn put( - &self, + pub(crate) async fn put( + &mut self, key: Key, lsn: Lsn, value: &Value, ctx: &RequestContext, ) -> anyhow::Result<()> { - self.tl.put_value(key, lsn, value, ctx).await + // Avoid doing allocations for "small" values. + // In the regression test suite, the limit of 256 avoided allocations in 95% of cases: + // https://github.com/neondatabase/neon/pull/5056#discussion_r1301975061 + let mut buf = smallvec::SmallVec::<[u8; 256]>::new(); + value.ser_into(&mut buf)?; + let buf_size: u64 = buf.len().try_into().expect("oversized value buf"); + + let action = self.get_open_layer_action(lsn, buf_size); + let layer = self.handle_open_layer_action(lsn, action).await?; + let res = layer.put_value(key, lsn, &buf, ctx).await; + + if res.is_ok() { + // Update the current size only when the entire write was ok. + // In case of failures, we may have had partial writes which + // render the size tracking out of sync. That's ok because + // the checkpoint distance should be significantly smaller + // than the S3 single shot upload limit of 5GiB. 
+ let state = self.write_guard.as_mut().unwrap(); + + state.current_size += buf_size; + state.prev_lsn = Some(lsn); + state.max_lsn = std::cmp::max(state.max_lsn, Some(lsn)); + } + + res } + /// "Tick" the timeline writer: it will roll the open layer if required + /// and do nothing else. + pub(crate) async fn tick(&mut self) -> anyhow::Result<()> { + self.open_layer_if_present().await?; + + let last_record_lsn = self.get_last_record_lsn(); + let action = self.get_open_layer_action(last_record_lsn, 0); + if action == OpenLayerAction::Roll { + self.roll_layer(last_record_lsn).await?; + } + + Ok(()) + } + + /// Populate the timeline writer state only if an in-memory layer + /// is already open. + async fn open_layer_if_present(&mut self) -> anyhow::Result<()> { + assert!(self.write_guard.is_none()); + + let open_layer = { + let guard = self.layers.read().await; + let layers = guard.layer_map(); + match layers.open_layer { + Some(ref open_layer) => open_layer.clone(), + None => { + return Ok(()); + } + } + }; + + let initial_size = open_layer.size().await?; + let last_freeze_at = self.last_freeze_at.load(); + let last_freeze_ts = *self.last_freeze_ts.read().unwrap(); + self.write_guard.replace(TimelineWriterState::new( + open_layer, + initial_size, + last_freeze_at, + last_freeze_ts, + )); + + Ok(()) + } + + async fn handle_open_layer_action( + &mut self, + at: Lsn, + action: OpenLayerAction, + ) -> anyhow::Result<&Arc> { + match action { + OpenLayerAction::Roll => { + let freeze_at = self.write_guard.as_ref().unwrap().max_lsn.unwrap(); + self.roll_layer(freeze_at).await?; + self.open_layer(at).await?; + } + OpenLayerAction::Open => self.open_layer(at).await?, + OpenLayerAction::None => { + assert!(self.write_guard.is_some()); + } + } + + Ok(&self.write_guard.as_ref().unwrap().open_layer) + } + + async fn open_layer(&mut self, at: Lsn) -> anyhow::Result<()> { + let layer = self.tl.get_layer_for_write(at).await?; + let initial_size = layer.size().await?; + + let 
last_freeze_at = self.last_freeze_at.load(); + let last_freeze_ts = *self.last_freeze_ts.read().unwrap(); + self.write_guard.replace(TimelineWriterState::new( + layer, + initial_size, + last_freeze_at, + last_freeze_ts, + )); + + Ok(()) + } + + async fn roll_layer(&mut self, freeze_at: Lsn) -> anyhow::Result<()> { + assert!(self.write_guard.is_some()); + + self.tl.freeze_inmem_layer_at(freeze_at).await; + + let now = Instant::now(); + *(self.last_freeze_ts.write().unwrap()) = now; + + self.tl.flush_frozen_layers(); + + let current_size = self.write_guard.as_ref().unwrap().current_size; + if current_size > self.get_checkpoint_distance() { + warn!("Flushed oversized open layer with size {}", current_size) + } + + Ok(()) + } + + fn get_open_layer_action(&self, lsn: Lsn, new_value_size: u64) -> OpenLayerAction { + let state = &*self.write_guard; + let Some(state) = &state else { + return OpenLayerAction::Open; + }; + + if state.prev_lsn == Some(lsn) { + // Rolling mid LSN is not supported by downstream code. + // Hence, only roll at LSN boundaries. + return OpenLayerAction::None; + } + + if state.current_size == 0 { + // Don't roll empty layers + return OpenLayerAction::None; + } + + let distance = lsn.widening_sub(state.cached_last_freeze_at); + let proposed_open_layer_size = state.current_size + new_value_size; + + // Rolling the open layer can be triggered by: + // 1. The distance from the last LSN we rolled at. This bounds the amount of WAL that + // the safekeepers need to store. For sharded tenants, we multiply by shard count to + // account for how writes are distributed across shards: we expect each node to consume + // 1/count of the LSN on average. + // 2. The size of the currently open layer. + // 3. The time since the last roll. It helps safekeepers to regard pageserver as caught + // up and suspend activity. 
+ if distance + >= self.get_checkpoint_distance() as i128 * self.shard_identity.count.count() as i128 + { + info!( + "Will roll layer at {} with layer size {} due to LSN distance ({})", + lsn, state.current_size, distance + ); + + OpenLayerAction::Roll + } else if proposed_open_layer_size >= self.get_checkpoint_distance() { + info!( + "Will roll layer at {} with layer size {} due to layer size ({})", + lsn, state.current_size, proposed_open_layer_size + ); + + OpenLayerAction::Roll + } else if distance > 0 + && state.cached_last_freeze_ts.elapsed() >= self.get_checkpoint_timeout() + { + info!( + "Will roll layer at {} with layer size {} due to time since last flush ({:?})", + lsn, + state.current_size, + state.cached_last_freeze_ts.elapsed() + ); + + OpenLayerAction::Roll + } else { + OpenLayerAction::None + } + } + + /// Put a batch keys at the specified Lsns. + /// + /// The batch should be sorted by Lsn such that it's safe + /// to roll the open layer mid batch. pub(crate) async fn put_batch( - &self, - batch: &HashMap>, + &mut self, + batch: Vec<(Key, Lsn, Value)>, ctx: &RequestContext, ) -> anyhow::Result<()> { - self.tl.put_values(batch, ctx).await + for (key, lsn, val) in batch { + self.put(key, lsn, &val, ctx).await? + } + + Ok(()) } - pub(crate) async fn delete_batch(&self, batch: &[(Range, Lsn)]) -> anyhow::Result<()> { - self.tl.put_tombstones(batch).await + pub(crate) async fn delete_batch(&mut self, batch: &[(Range, Lsn)]) -> anyhow::Result<()> { + if let Some((_, lsn)) = batch.first() { + let action = self.get_open_layer_action(*lsn, 0); + let layer = self.handle_open_layer_action(*lsn, action).await?; + layer.put_tombstones(batch).await?; + } + + Ok(()) } /// Track the end of the latest digested WAL record. 
@@ -4784,8 +4659,7 @@ mod tests { let harness = TenantHarness::create("two_layer_eviction_attempts_at_the_same_time").unwrap(); - let ctx = any_context(); - let tenant = harness.try_load(&ctx).await.unwrap(); + let (tenant, ctx) = harness.load().await; let timeline = tenant .create_test_timeline(TimelineId::generate(), Lsn(0x10), 14, &ctx) .await @@ -4799,8 +4673,10 @@ mod tests { .expect("should had been resident") .drop_eviction_guard(); - let first = async { layer.evict_and_wait().await }; - let second = async { layer.evict_and_wait().await }; + let forever = std::time::Duration::from_secs(120); + + let first = layer.evict_and_wait(forever); + let second = layer.evict_and_wait(forever); let (first, second) = tokio::join!(first, second); @@ -4819,12 +4695,6 @@ mod tests { } } - fn any_context() -> crate::context::RequestContext { - use crate::context::*; - use crate::task_mgr::*; - RequestContext::new(TaskKind::UnitTest, DownloadBehavior::Error) - } - async fn find_some_layer(timeline: &Timeline) -> Layer { let layers = timeline.layers.read().await; let desc = layers diff --git a/pageserver/src/tenant/timeline/compaction.rs b/pageserver/src/tenant/timeline/compaction.rs new file mode 100644 index 0000000000..74b75dabf0 --- /dev/null +++ b/pageserver/src/tenant/timeline/compaction.rs @@ -0,0 +1,1158 @@ +//! New compaction implementation. The algorithm itself is implemented in the +//! compaction crate. This file implements the callbacks and structs that allow +//! the algorithm to drive the process. +//! +//! The old legacy algorithm is implemented directly in `timeline.rs`. 
+ +use std::collections::BinaryHeap; +use std::ops::{Deref, Range}; +use std::sync::Arc; + +use super::layer_manager::LayerManager; +use super::{CompactFlags, DurationRecorder, RecordedDuration, Timeline}; + +use anyhow::{anyhow, Context}; +use async_trait::async_trait; +use enumset::EnumSet; +use fail::fail_point; +use itertools::Itertools; +use pageserver_api::shard::TenantShardId; +use tokio_util::sync::CancellationToken; +use tracing::{debug, info, info_span, trace, warn, Instrument}; +use utils::id::TimelineId; + +use crate::context::{AccessStatsBehavior, RequestContext, RequestContextBuilder}; +use crate::tenant::storage_layer::{AsLayerDesc, PersistentLayerDesc}; +use crate::tenant::timeline::{drop_rlock, is_rel_fsm_block_key, is_rel_vm_block_key, Hole}; +use crate::tenant::timeline::{DeltaLayerWriter, ImageLayerWriter}; +use crate::tenant::timeline::{Layer, ResidentLayer}; +use crate::tenant::DeltaLayer; +use crate::tenant::PageReconstructError; +use crate::virtual_file::{MaybeFatalIo, VirtualFile}; +use crate::{page_cache, ZERO_PAGE}; + +use crate::keyspace::KeySpace; +use crate::repository::Key; + +use utils::lsn::Lsn; + +use pageserver_compaction::helpers::overlaps_with; +use pageserver_compaction::interface::*; + +use super::CompactionError; + +impl Timeline { + /// TODO: cancellation + pub(crate) async fn compact_legacy( + self: &Arc, + _cancel: &CancellationToken, + flags: EnumSet, + ctx: &RequestContext, + ) -> Result<(), CompactionError> { + // High level strategy for compaction / image creation: + // + // 1. First, calculate the desired "partitioning" of the + // currently in-use key space. The goal is to partition the + // key space into roughly fixed-size chunks, but also take into + // account any existing image layers, and try to align the + // chunk boundaries with the existing image layers to avoid + // too much churn. Also try to align chunk boundaries with + // relation boundaries. 
In principle, we don't know about + // relation boundaries here, we just deal with key-value + // pairs, and the code in pgdatadir_mapping.rs knows how to + // map relations into key-value pairs. But in practice we know + // that 'field6' is the block number, and the fields 1-5 + // identify a relation. This is just an optimization, + // though. + // + // 2. Once we know the partitioning, for each partition, + // decide if it's time to create a new image layer. The + // criteria is: there has been too much "churn" since the last + // image layer? The "churn" is fuzzy concept, it's a + // combination of too many delta files, or too much WAL in + // total in the delta file. Or perhaps: if creating an image + // file would allow to delete some older files. + // + // 3. After that, we compact all level0 delta files if there + // are too many of them. While compacting, we also garbage + // collect any page versions that are no longer needed because + // of the new image layers we created in step 2. + // + // TODO: This high level strategy hasn't been implemented yet. + // Below are functions compact_level0() and create_image_layers() + // but they are a bit ad hoc and don't quite work like it's explained + // above. Rewrite it. + + // Is the timeline being deleted? 
+ if self.is_stopping() { + trace!("Dropping out of compaction on timeline shutdown"); + return Err(CompactionError::ShuttingDown); + } + + let target_file_size = self.get_checkpoint_distance(); + + // Define partitioning schema if needed + + // FIXME: the match should only cover repartitioning, not the next steps + match self + .repartition( + self.get_last_record_lsn(), + self.get_compaction_target_size(), + flags, + ctx, + ) + .await + { + Ok((partitioning, lsn)) => { + // Disables access_stats updates, so that the files we read remain candidates for eviction after we're done with them + let image_ctx = RequestContextBuilder::extend(ctx) + .access_stats_behavior(AccessStatsBehavior::Skip) + .build(); + + // 2. Compact + let timer = self.metrics.compact_time_histo.start_timer(); + self.compact_level0(target_file_size, ctx).await?; + timer.stop_and_record(); + + // 3. Create new image layers for partitions that have been modified + // "enough". + let layers = self + .create_image_layers( + &partitioning, + lsn, + flags.contains(CompactFlags::ForceImageLayerCreation), + &image_ctx, + ) + .await + .map_err(anyhow::Error::from)?; + if let Some(remote_client) = &self.remote_client { + for layer in layers { + remote_client.schedule_layer_file_upload(layer)?; + } + } + + if let Some(remote_client) = &self.remote_client { + // should any new image layer been created, not uploading index_part will + // result in a mismatch between remote_physical_size and layermap calculated + // size, which will fail some tests, but should not be an issue otherwise. + remote_client.schedule_index_upload_for_file_changes()?; + } + } + Err(err) => { + // no partitioning? This is normal, if the timeline was just created + // as an empty timeline. Also in unit tests, when we use the timeline + // as a simple key-value store, ignoring the datadir layout. Log the + // error but continue. 
+ // + // Suppress error when it's due to cancellation + if !self.cancel.is_cancelled() { + tracing::error!("could not compact, repartitioning keyspace failed: {err:?}"); + } + } + }; + + Ok(()) + } + + /// Collect a bunch of Level 0 layer files, and compact and reshuffle them as + /// as Level 1 files. + async fn compact_level0( + self: &Arc, + target_file_size: u64, + ctx: &RequestContext, + ) -> Result<(), CompactionError> { + let CompactLevel0Phase1Result { + new_layers, + deltas_to_compact, + } = { + let phase1_span = info_span!("compact_level0_phase1"); + let ctx = ctx.attached_child(); + let mut stats = CompactLevel0Phase1StatsBuilder { + version: Some(2), + tenant_id: Some(self.tenant_shard_id), + timeline_id: Some(self.timeline_id), + ..Default::default() + }; + + let begin = tokio::time::Instant::now(); + let phase1_layers_locked = Arc::clone(&self.layers).read_owned().await; + let now = tokio::time::Instant::now(); + stats.read_lock_acquisition_micros = + DurationRecorder::Recorded(RecordedDuration(now - begin), now); + self.compact_level0_phase1(phase1_layers_locked, stats, target_file_size, &ctx) + .instrument(phase1_span) + .await? + }; + + if new_layers.is_empty() && deltas_to_compact.is_empty() { + // nothing to do + return Ok(()); + } + + self.finish_compact_batch(&new_layers, &Vec::new(), &deltas_to_compact) + .await?; + Ok(()) + } + + /// Level0 files first phase of compaction, explained in the [`Self::compact_legacy`] comment. 
+ async fn compact_level0_phase1( + self: &Arc, + guard: tokio::sync::OwnedRwLockReadGuard, + mut stats: CompactLevel0Phase1StatsBuilder, + target_file_size: u64, + ctx: &RequestContext, + ) -> Result { + stats.read_lock_held_spawn_blocking_startup_micros = + stats.read_lock_acquisition_micros.till_now(); // set by caller + let layers = guard.layer_map(); + let level0_deltas = layers.get_level0_deltas()?; + let mut level0_deltas = level0_deltas + .into_iter() + .map(|x| guard.get_from_desc(&x)) + .collect_vec(); + stats.level0_deltas_count = Some(level0_deltas.len()); + // Only compact if enough layers have accumulated. + let threshold = self.get_compaction_threshold(); + if level0_deltas.is_empty() || level0_deltas.len() < threshold { + debug!( + level0_deltas = level0_deltas.len(), + threshold, "too few deltas to compact" + ); + return Ok(CompactLevel0Phase1Result::default()); + } + + // This failpoint is used together with `test_duplicate_layers` integration test. + // It returns the compaction result exactly the same layers as input to compaction. + // We want to ensure that this will not cause any problem when updating the layer map + // after the compaction is finished. + // + // Currently, there are two rare edge cases that will cause duplicated layers being + // inserted. + // 1. The compaction job is inturrupted / did not finish successfully. Assume we have file 1, 2, 3, 4, which + // is compacted to 5, but the page server is shut down, next time we start page server we will get a layer + // map containing 1, 2, 3, 4, and 5, whereas 5 has the same content as 4. If we trigger L0 compation at this + // point again, it is likely that we will get a file 6 which has the same content and the key range as 5, + // and this causes an overwrite. This is acceptable because the content is the same, and we should do a + // layer replace instead of the normal remove / upload process. + // 2. 
The input workload pattern creates exactly n files that are sorted, non-overlapping and is of target file + // size length. Compaction will likely create the same set of n files afterwards. + // + // This failpoint is a superset of both of the cases. + if cfg!(feature = "testing") { + let active = (|| { + ::fail::fail_point!("compact-level0-phase1-return-same", |_| true); + false + })(); + + if active { + let mut new_layers = Vec::with_capacity(level0_deltas.len()); + for delta in &level0_deltas { + // we are just faking these layers as being produced again for this failpoint + new_layers.push( + delta + .download_and_keep_resident() + .await + .context("download layer for failpoint")?, + ); + } + tracing::info!("compact-level0-phase1-return-same"); // so that we can check if we hit the failpoint + return Ok(CompactLevel0Phase1Result { + new_layers, + deltas_to_compact: level0_deltas, + }); + } + } + + // Gather the files to compact in this iteration. + // + // Start with the oldest Level 0 delta file, and collect any other + // level 0 files that form a contiguous sequence, such that the end + // LSN of previous file matches the start LSN of the next file. + // + // Note that if the files don't form such a sequence, we might + // "compact" just a single file. That's a bit pointless, but it allows + // us to get rid of the level 0 file, and compact the other files on + // the next iteration. This could probably made smarter, but such + // "gaps" in the sequence of level 0 files should only happen in case + // of a crash, partial download from cloud storage, or something like + // that, so it's not a big deal in practice. 
+ level0_deltas.sort_by_key(|l| l.layer_desc().lsn_range.start); + let mut level0_deltas_iter = level0_deltas.iter(); + + let first_level0_delta = level0_deltas_iter.next().unwrap(); + let mut prev_lsn_end = first_level0_delta.layer_desc().lsn_range.end; + let mut deltas_to_compact = Vec::with_capacity(level0_deltas.len()); + + deltas_to_compact.push(first_level0_delta.download_and_keep_resident().await?); + for l in level0_deltas_iter { + let lsn_range = &l.layer_desc().lsn_range; + + if lsn_range.start != prev_lsn_end { + break; + } + deltas_to_compact.push(l.download_and_keep_resident().await?); + prev_lsn_end = lsn_range.end; + } + let lsn_range = Range { + start: deltas_to_compact + .first() + .unwrap() + .layer_desc() + .lsn_range + .start, + end: deltas_to_compact.last().unwrap().layer_desc().lsn_range.end, + }; + + info!( + "Starting Level0 compaction in LSN range {}-{} for {} layers ({} deltas in total)", + lsn_range.start, + lsn_range.end, + deltas_to_compact.len(), + level0_deltas.len() + ); + + for l in deltas_to_compact.iter() { + info!("compact includes {l}"); + } + + // We don't need the original list of layers anymore. Drop it so that + // we don't accidentally use it later in the function. + drop(level0_deltas); + + stats.read_lock_held_prerequisites_micros = stats + .read_lock_held_spawn_blocking_startup_micros + .till_now(); + + // Determine N largest holes where N is number of compacted layers. + let max_holes = deltas_to_compact.len(); + let last_record_lsn = self.get_last_record_lsn(); + let min_hole_range = (target_file_size / page_cache::PAGE_SZ as u64) as i128; + let min_hole_coverage_size = 3; // TODO: something more flexible? 
+ + // min-heap (reserve space for one more element added before eviction) + let mut heap: BinaryHeap = BinaryHeap::with_capacity(max_holes + 1); + let mut prev: Option = None; + + let mut all_keys = Vec::new(); + + for l in deltas_to_compact.iter() { + all_keys.extend(l.load_keys(ctx).await?); + } + + // FIXME: should spawn_blocking the rest of this function + + // The current stdlib sorting implementation is designed in a way where it is + // particularly fast where the slice is made up of sorted sub-ranges. + all_keys.sort_by_key(|DeltaEntry { key, lsn, .. }| (*key, *lsn)); + + stats.read_lock_held_key_sort_micros = stats.read_lock_held_prerequisites_micros.till_now(); + + for &DeltaEntry { key: next_key, .. } in all_keys.iter() { + if let Some(prev_key) = prev { + // just first fast filter + if next_key.to_i128() - prev_key.to_i128() >= min_hole_range { + let key_range = prev_key..next_key; + // Measuring hole by just subtraction of i128 representation of key range boundaries + // has not so much sense, because largest holes will corresponds field1/field2 changes. + // But we are mostly interested to eliminate holes which cause generation of excessive image layers. + // That is why it is better to measure size of hole as number of covering image layers. 
+ let coverage_size = layers.image_coverage(&key_range, last_record_lsn).len(); + if coverage_size >= min_hole_coverage_size { + heap.push(Hole { + key_range, + coverage_size, + }); + if heap.len() > max_holes { + heap.pop(); // remove smallest hole + } + } + } + } + prev = Some(next_key.next()); + } + stats.read_lock_held_compute_holes_micros = stats.read_lock_held_key_sort_micros.till_now(); + drop_rlock(guard); + stats.read_lock_drop_micros = stats.read_lock_held_compute_holes_micros.till_now(); + let mut holes = heap.into_vec(); + holes.sort_unstable_by_key(|hole| hole.key_range.start); + let mut next_hole = 0; // index of next hole in holes vector + + // This iterator walks through all key-value pairs from all the layers + // we're compacting, in key, LSN order. + let all_values_iter = all_keys.iter(); + + // This iterator walks through all keys and is needed to calculate size used by each key + let mut all_keys_iter = all_keys + .iter() + .map(|DeltaEntry { key, lsn, size, .. }| (*key, *lsn, *size)) + .coalesce(|mut prev, cur| { + // Coalesce keys that belong to the same key pair. + // This ensures that compaction doesn't put them + // into different layer files. + // Still limit this by the target file size, + // so that we keep the size of the files in + // check. + if prev.0 == cur.0 && prev.2 < target_file_size { + prev.2 += cur.2; + Ok(prev) + } else { + Err((prev, cur)) + } + }); + + // Merge the contents of all the input delta layers into a new set + // of delta layers, based on the current partitioning. + // + // We split the new delta layers on the key dimension. We iterate through the key space, and for each key, check if including the next key to the current output layer we're building would cause the layer to become too large. If so, dump the current output layer and start new one. + // It's possible that there is a single key with so many page versions that storing all of them in a single layer file + // would be too large. 
In that case, we also split on the LSN dimension. + // + // LSN + // ^ + // | + // | +-----------+ +--+--+--+--+ + // | | | | | | | | + // | +-----------+ | | | | | + // | | | | | | | | + // | +-----------+ ==> | | | | | + // | | | | | | | | + // | +-----------+ | | | | | + // | | | | | | | | + // | +-----------+ +--+--+--+--+ + // | + // +--------------> key + // + // + // If one key (X) has a lot of page versions: + // + // LSN + // ^ + // | (X) + // | +-----------+ +--+--+--+--+ + // | | | | | | | | + // | +-----------+ | | +--+ | + // | | | | | | | | + // | +-----------+ ==> | | | | | + // | | | | | +--+ | + // | +-----------+ | | | | | + // | | | | | | | | + // | +-----------+ +--+--+--+--+ + // | + // +--------------> key + // TODO: this actually divides the layers into fixed-size chunks, not + // based on the partitioning. + // + // TODO: we should also opportunistically materialize and + // garbage collect what we can. + let mut new_layers = Vec::new(); + let mut prev_key: Option = None; + let mut writer: Option = None; + let mut key_values_total_size = 0u64; + let mut dup_start_lsn: Lsn = Lsn::INVALID; // start LSN of layer containing values of the single key + let mut dup_end_lsn: Lsn = Lsn::INVALID; // end LSN of layer containing values of the single key + + for &DeltaEntry { + key, lsn, ref val, .. + } in all_values_iter + { + let value = val.load(ctx).await?; + let same_key = prev_key.map_or(false, |prev_key| prev_key == key); + // We need to check key boundaries once we reach next key or end of layer with the same key + if !same_key || lsn == dup_end_lsn { + let mut next_key_size = 0u64; + let is_dup_layer = dup_end_lsn.is_valid(); + dup_start_lsn = Lsn::INVALID; + if !same_key { + dup_end_lsn = Lsn::INVALID; + } + // Determine size occupied by this key. 
We stop at next key or when size becomes larger than target_file_size + for (next_key, next_lsn, next_size) in all_keys_iter.by_ref() { + next_key_size = next_size; + if key != next_key { + if dup_end_lsn.is_valid() { + // We are writting segment with duplicates: + // place all remaining values of this key in separate segment + dup_start_lsn = dup_end_lsn; // new segments starts where old stops + dup_end_lsn = lsn_range.end; // there are no more values of this key till end of LSN range + } + break; + } + key_values_total_size += next_size; + // Check if it is time to split segment: if total keys size is larger than target file size. + // We need to avoid generation of empty segments if next_size > target_file_size. + if key_values_total_size > target_file_size && lsn != next_lsn { + // Split key between multiple layers: such layer can contain only single key + dup_start_lsn = if dup_end_lsn.is_valid() { + dup_end_lsn // new segment with duplicates starts where old one stops + } else { + lsn // start with the first LSN for this key + }; + dup_end_lsn = next_lsn; // upper LSN boundary is exclusive + break; + } + } + // handle case when loop reaches last key: in this case dup_end is non-zero but dup_start is not set. + if dup_end_lsn.is_valid() && !dup_start_lsn.is_valid() { + dup_start_lsn = dup_end_lsn; + dup_end_lsn = lsn_range.end; + } + if writer.is_some() { + let written_size = writer.as_mut().unwrap().size(); + let contains_hole = + next_hole < holes.len() && key >= holes[next_hole].key_range.end; + // check if key cause layer overflow or contains hole... + if is_dup_layer + || dup_end_lsn.is_valid() + || written_size + key_values_total_size > target_file_size + || contains_hole + { + // ... 
if so, flush previous layer and prepare to write new one + new_layers.push( + writer + .take() + .unwrap() + .finish(prev_key.unwrap().next(), self) + .await?, + ); + writer = None; + + if contains_hole { + // skip hole + next_hole += 1; + } + } + } + // Remember size of key value because at next iteration we will access next item + key_values_total_size = next_key_size; + } + fail_point!("delta-layer-writer-fail-before-finish", |_| { + Err(CompactionError::Other(anyhow::anyhow!( + "failpoint delta-layer-writer-fail-before-finish" + ))) + }); + + if !self.shard_identity.is_key_disposable(&key) { + if writer.is_none() { + // Create writer if not initiaized yet + writer = Some( + DeltaLayerWriter::new( + self.conf, + self.timeline_id, + self.tenant_shard_id, + key, + if dup_end_lsn.is_valid() { + // this is a layer containing slice of values of the same key + debug!("Create new dup layer {}..{}", dup_start_lsn, dup_end_lsn); + dup_start_lsn..dup_end_lsn + } else { + debug!("Create new layer {}..{}", lsn_range.start, lsn_range.end); + lsn_range.clone() + }, + ) + .await?, + ); + } + + writer.as_mut().unwrap().put_value(key, lsn, value).await?; + } else { + debug!( + "Dropping key {} during compaction (it belongs on shard {:?})", + key, + self.shard_identity.get_shard_number(&key) + ); + } + + if !new_layers.is_empty() { + fail_point!("after-timeline-compacted-first-L1"); + } + + prev_key = Some(key); + } + if let Some(writer) = writer { + new_layers.push(writer.finish(prev_key.unwrap().next(), self).await?); + } + + // Sync layers + if !new_layers.is_empty() { + // Print a warning if the created layer is larger than double the target size + // Add two pages for potential overhead. This should in theory be already + // accounted for in the target calculation, but for very small targets, + // we still might easily hit the limit otherwise. 
+ let warn_limit = target_file_size * 2 + page_cache::PAGE_SZ as u64 * 2; + for layer in new_layers.iter() { + if layer.layer_desc().file_size > warn_limit { + warn!( + %layer, + "created delta file of size {} larger than double of target of {target_file_size}", layer.layer_desc().file_size + ); + } + } + + // The writer.finish() above already did the fsync of the inodes. + // We just need to fsync the directory in which these inodes are linked, + // which we know to be the timeline directory. + // + // We use fatal_err() below because the after writer.finish() returns with success, + // the in-memory state of the filesystem already has the layer file in its final place, + // and subsequent pageserver code could think it's durable while it really isn't. + let timeline_dir = VirtualFile::open( + &self + .conf + .timeline_path(&self.tenant_shard_id, &self.timeline_id), + ) + .await + .fatal_err("VirtualFile::open for timeline dir fsync"); + timeline_dir + .sync_all() + .await + .fatal_err("VirtualFile::sync_all timeline dir"); + } + + stats.write_layer_files_micros = stats.read_lock_drop_micros.till_now(); + stats.new_deltas_count = Some(new_layers.len()); + stats.new_deltas_size = Some(new_layers.iter().map(|l| l.layer_desc().file_size).sum()); + + match TryInto::::try_into(stats) + .and_then(|stats| serde_json::to_string(&stats).context("serde_json::to_string")) + { + Ok(stats_json) => { + info!( + stats_json = stats_json.as_str(), + "compact_level0_phase1 stats available" + ) + } + Err(e) => { + warn!("compact_level0_phase1 stats failed to serialize: {:#}", e); + } + } + + Ok(CompactLevel0Phase1Result { + new_layers, + deltas_to_compact: deltas_to_compact + .into_iter() + .map(|x| x.drop_eviction_guard()) + .collect::>(), + }) + } +} + +#[derive(Default)] +struct CompactLevel0Phase1Result { + new_layers: Vec, + deltas_to_compact: Vec, +} + +#[derive(Default)] +struct CompactLevel0Phase1StatsBuilder { + version: Option, + tenant_id: Option, + timeline_id: Option, + 
read_lock_acquisition_micros: DurationRecorder, + read_lock_held_spawn_blocking_startup_micros: DurationRecorder, + read_lock_held_key_sort_micros: DurationRecorder, + read_lock_held_prerequisites_micros: DurationRecorder, + read_lock_held_compute_holes_micros: DurationRecorder, + read_lock_drop_micros: DurationRecorder, + write_layer_files_micros: DurationRecorder, + level0_deltas_count: Option, + new_deltas_count: Option, + new_deltas_size: Option, +} + +#[derive(serde::Serialize)] +struct CompactLevel0Phase1Stats { + version: u64, + tenant_id: TenantShardId, + timeline_id: TimelineId, + read_lock_acquisition_micros: RecordedDuration, + read_lock_held_spawn_blocking_startup_micros: RecordedDuration, + read_lock_held_key_sort_micros: RecordedDuration, + read_lock_held_prerequisites_micros: RecordedDuration, + read_lock_held_compute_holes_micros: RecordedDuration, + read_lock_drop_micros: RecordedDuration, + write_layer_files_micros: RecordedDuration, + level0_deltas_count: usize, + new_deltas_count: usize, + new_deltas_size: u64, +} + +impl TryFrom for CompactLevel0Phase1Stats { + type Error = anyhow::Error; + + fn try_from(value: CompactLevel0Phase1StatsBuilder) -> Result { + Ok(Self { + version: value.version.ok_or_else(|| anyhow!("version not set"))?, + tenant_id: value + .tenant_id + .ok_or_else(|| anyhow!("tenant_id not set"))?, + timeline_id: value + .timeline_id + .ok_or_else(|| anyhow!("timeline_id not set"))?, + read_lock_acquisition_micros: value + .read_lock_acquisition_micros + .into_recorded() + .ok_or_else(|| anyhow!("read_lock_acquisition_micros not set"))?, + read_lock_held_spawn_blocking_startup_micros: value + .read_lock_held_spawn_blocking_startup_micros + .into_recorded() + .ok_or_else(|| anyhow!("read_lock_held_spawn_blocking_startup_micros not set"))?, + read_lock_held_key_sort_micros: value + .read_lock_held_key_sort_micros + .into_recorded() + .ok_or_else(|| anyhow!("read_lock_held_key_sort_micros not set"))?, + 
read_lock_held_prerequisites_micros: value + .read_lock_held_prerequisites_micros + .into_recorded() + .ok_or_else(|| anyhow!("read_lock_held_prerequisites_micros not set"))?, + read_lock_held_compute_holes_micros: value + .read_lock_held_compute_holes_micros + .into_recorded() + .ok_or_else(|| anyhow!("read_lock_held_compute_holes_micros not set"))?, + read_lock_drop_micros: value + .read_lock_drop_micros + .into_recorded() + .ok_or_else(|| anyhow!("read_lock_drop_micros not set"))?, + write_layer_files_micros: value + .write_layer_files_micros + .into_recorded() + .ok_or_else(|| anyhow!("write_layer_files_micros not set"))?, + level0_deltas_count: value + .level0_deltas_count + .ok_or_else(|| anyhow!("level0_deltas_count not set"))?, + new_deltas_count: value + .new_deltas_count + .ok_or_else(|| anyhow!("new_deltas_count not set"))?, + new_deltas_size: value + .new_deltas_size + .ok_or_else(|| anyhow!("new_deltas_size not set"))?, + }) + } +} + +impl Timeline { + /// Entry point for new tiered compaction algorithm. + /// + /// All the real work is in the implementation in the pageserver_compaction + /// crate. The code here would apply to any algorithm implemented by the + /// same interface, but tiered is the only one at the moment. + /// + /// TODO: cancellation + pub(crate) async fn compact_tiered( + self: &Arc, + _cancel: &CancellationToken, + ctx: &RequestContext, + ) -> Result<(), CompactionError> { + let fanout = self.get_compaction_threshold() as u64; + let target_file_size = self.get_checkpoint_distance(); + + // Find the top of the historical layers + let end_lsn = { + let guard = self.layers.read().await; + let layers = guard.layer_map(); + + let l0_deltas = layers.get_level0_deltas()?; + drop(guard); + + // As an optimization, if we find that there are too few L0 layers, + // bail out early. We know that the compaction algorithm would do + // nothing in that case. 
+ if l0_deltas.len() < fanout as usize { + // doesn't need compacting + return Ok(()); + } + l0_deltas.iter().map(|l| l.lsn_range.end).max().unwrap() + }; + + // Is the timeline being deleted? + if self.is_stopping() { + trace!("Dropping out of compaction on timeline shutdown"); + return Err(CompactionError::ShuttingDown); + } + + let keyspace = self.collect_keyspace(end_lsn, ctx).await?; + let mut adaptor = TimelineAdaptor::new(self, (end_lsn, keyspace)); + + pageserver_compaction::compact_tiered::compact_tiered( + &mut adaptor, + end_lsn, + target_file_size, + fanout, + ctx, + ) + .await?; + + adaptor.flush_updates().await?; + Ok(()) + } +} + +struct TimelineAdaptor { + timeline: Arc, + + keyspace: (Lsn, KeySpace), + + new_deltas: Vec, + new_images: Vec, + layers_to_delete: Vec>, +} + +impl TimelineAdaptor { + pub fn new(timeline: &Arc, keyspace: (Lsn, KeySpace)) -> Self { + Self { + timeline: timeline.clone(), + keyspace, + new_images: Vec::new(), + new_deltas: Vec::new(), + layers_to_delete: Vec::new(), + } + } + + pub async fn flush_updates(&mut self) -> anyhow::Result<()> { + let layers_to_delete = { + let guard = self.timeline.layers.read().await; + self.layers_to_delete + .iter() + .map(|x| guard.get_from_desc(x)) + .collect::>() + }; + self.timeline + .finish_compact_batch(&self.new_deltas, &self.new_images, &layers_to_delete) + .await?; + self.new_images.clear(); + self.new_deltas.clear(); + self.layers_to_delete.clear(); + Ok(()) + } +} + +#[derive(Clone)] +struct ResidentDeltaLayer(ResidentLayer); +#[derive(Clone)] +struct ResidentImageLayer(ResidentLayer); + +impl CompactionJobExecutor for TimelineAdaptor { + type Key = crate::repository::Key; + + type Layer = OwnArc; + type DeltaLayer = ResidentDeltaLayer; + type ImageLayer = ResidentImageLayer; + + type RequestContext = crate::context::RequestContext; + + async fn get_layers( + &mut self, + key_range: &Range, + lsn_range: &Range, + _ctx: &RequestContext, + ) -> anyhow::Result>> { + 
self.flush_updates().await?; + + let guard = self.timeline.layers.read().await; + let layer_map = guard.layer_map(); + + let result = layer_map + .iter_historic_layers() + .filter(|l| { + overlaps_with(&l.lsn_range, lsn_range) && overlaps_with(&l.key_range, key_range) + }) + .map(OwnArc) + .collect(); + Ok(result) + } + + async fn get_keyspace( + &mut self, + key_range: &Range, + lsn: Lsn, + _ctx: &RequestContext, + ) -> anyhow::Result>> { + if lsn == self.keyspace.0 { + Ok(pageserver_compaction::helpers::intersect_keyspace( + &self.keyspace.1.ranges, + key_range, + )) + } else { + // The current compaction implementatin only ever requests the key space + // at the compaction end LSN. + anyhow::bail!("keyspace not available for requested lsn"); + } + } + + async fn downcast_delta_layer( + &self, + layer: &OwnArc, + ) -> anyhow::Result> { + // this is a lot more complex than a simple downcast... + if layer.is_delta() { + let l = { + let guard = self.timeline.layers.read().await; + guard.get_from_desc(layer) + }; + let result = l.download_and_keep_resident().await?; + + Ok(Some(ResidentDeltaLayer(result))) + } else { + Ok(None) + } + } + + async fn create_image( + &mut self, + lsn: Lsn, + key_range: &Range, + ctx: &RequestContext, + ) -> anyhow::Result<()> { + Ok(self.create_image_impl(lsn, key_range, ctx).await?) + } + + async fn create_delta( + &mut self, + lsn_range: &Range, + key_range: &Range, + input_layers: &[ResidentDeltaLayer], + ctx: &RequestContext, + ) -> anyhow::Result<()> { + debug!("Create new layer {}..{}", lsn_range.start, lsn_range.end); + + let mut all_entries = Vec::new(); + for dl in input_layers.iter() { + all_entries.extend(dl.load_keys(ctx).await?); + } + + // The current stdlib sorting implementation is designed in a way where it is + // particularly fast where the slice is made up of sorted sub-ranges. + all_entries.sort_by_key(|DeltaEntry { key, lsn, .. 
}| (*key, *lsn)); + + let mut writer = DeltaLayerWriter::new( + self.timeline.conf, + self.timeline.timeline_id, + self.timeline.tenant_shard_id, + key_range.start, + lsn_range.clone(), + ) + .await?; + + let mut dup_values = 0; + + // This iterator walks through all key-value pairs from all the layers + // we're compacting, in key, LSN order. + let mut prev: Option<(Key, Lsn)> = None; + for &DeltaEntry { + key, lsn, ref val, .. + } in all_entries.iter() + { + if prev == Some((key, lsn)) { + // This is a duplicate. Skip it. + // + // It can happen if compaction is interrupted after writing some + // layers but not all, and we are compacting the range again. + // The calculations in the algorithm assume that there are no + // duplicates, so the math on targeted file size is likely off, + // and we will create smaller files than expected. + dup_values += 1; + continue; + } + + let value = val.load(ctx).await?; + + writer.put_value(key, lsn, value).await?; + + prev = Some((key, lsn)); + } + + if dup_values > 0 { + warn!("delta layer created with {} duplicate values", dup_values); + } + + fail_point!("delta-layer-writer-fail-before-finish", |_| { + Err(anyhow::anyhow!( + "failpoint delta-layer-writer-fail-before-finish" + )) + }); + + let new_delta_layer = writer + .finish(prev.unwrap().0.next(), &self.timeline) + .await?; + + self.new_deltas.push(new_delta_layer); + Ok(()) + } + + async fn delete_layer( + &mut self, + layer: &OwnArc, + _ctx: &RequestContext, + ) -> anyhow::Result<()> { + self.layers_to_delete.push(layer.clone().0); + Ok(()) + } +} + +impl TimelineAdaptor { + async fn create_image_impl( + &mut self, + lsn: Lsn, + key_range: &Range, + ctx: &RequestContext, + ) -> Result<(), PageReconstructError> { + let timer = self.timeline.metrics.create_images_time_histo.start_timer(); + + let mut image_layer_writer = ImageLayerWriter::new( + self.timeline.conf, + self.timeline.timeline_id, + self.timeline.tenant_shard_id, + key_range, + lsn, + ) + .await?; + + 
fail_point!("image-layer-writer-fail-before-finish", |_| { + Err(PageReconstructError::Other(anyhow::anyhow!( + "failpoint image-layer-writer-fail-before-finish" + ))) + }); + let keyspace_ranges = self.get_keyspace(key_range, lsn, ctx).await?; + for range in &keyspace_ranges { + let mut key = range.start; + while key < range.end { + let img = match self.timeline.get(key, lsn, ctx).await { + Ok(img) => img, + Err(err) => { + // If we fail to reconstruct a VM or FSM page, we can zero the + // page without losing any actual user data. That seems better + // than failing repeatedly and getting stuck. + // + // We had a bug at one point, where we truncated the FSM and VM + // in the pageserver, but the Postgres didn't know about that + // and continued to generate incremental WAL records for pages + // that didn't exist in the pageserver. Trying to replay those + // WAL records failed to find the previous image of the page. + // This special case allows us to recover from that situation. + // See https://github.com/neondatabase/neon/issues/2601. + // + // Unfortunately we cannot do this for the main fork, or for + // any metadata keys, keys, as that would lead to actual data + // loss. 
+ if is_rel_fsm_block_key(key) || is_rel_vm_block_key(key) { + warn!("could not reconstruct FSM or VM key {key}, filling with zeros: {err:?}"); + ZERO_PAGE.clone() + } else { + return Err(err); + } + } + }; + image_layer_writer.put_image(key, img).await?; + key = key.next(); + } + } + let image_layer = image_layer_writer.finish(&self.timeline).await?; + + self.new_images.push(image_layer); + + timer.stop_and_record(); + + Ok(()) + } +} + +impl CompactionRequestContext for crate::context::RequestContext {} + +#[derive(Debug, Clone)] +pub struct OwnArc(pub Arc); + +impl Deref for OwnArc { + type Target = as Deref>::Target; + fn deref(&self) -> &Self::Target { + &self.0 + } +} + +impl AsRef for OwnArc { + fn as_ref(&self) -> &T { + self.0.as_ref() + } +} + +impl CompactionLayer for OwnArc { + fn key_range(&self) -> &Range { + &self.key_range + } + fn lsn_range(&self) -> &Range { + &self.lsn_range + } + fn file_size(&self) -> u64 { + self.file_size + } + fn short_id(&self) -> std::string::String { + self.as_ref().short_id().to_string() + } + fn is_delta(&self) -> bool { + self.as_ref().is_delta() + } +} + +impl CompactionLayer for OwnArc { + fn key_range(&self) -> &Range { + &self.layer_desc().key_range + } + fn lsn_range(&self) -> &Range { + &self.layer_desc().lsn_range + } + fn file_size(&self) -> u64 { + self.layer_desc().file_size + } + fn short_id(&self) -> std::string::String { + self.layer_desc().short_id().to_string() + } + fn is_delta(&self) -> bool { + true + } +} + +use crate::tenant::timeline::DeltaEntry; + +impl CompactionLayer for ResidentDeltaLayer { + fn key_range(&self) -> &Range { + &self.0.layer_desc().key_range + } + fn lsn_range(&self) -> &Range { + &self.0.layer_desc().lsn_range + } + fn file_size(&self) -> u64 { + self.0.layer_desc().file_size + } + fn short_id(&self) -> std::string::String { + self.0.layer_desc().short_id().to_string() + } + fn is_delta(&self) -> bool { + true + } +} + +#[async_trait] +impl CompactionDeltaLayer for 
ResidentDeltaLayer { + type DeltaEntry<'a> = DeltaEntry<'a>; + + async fn load_keys<'a>(&self, ctx: &RequestContext) -> anyhow::Result>> { + self.0.load_keys(ctx).await + } +} + +impl CompactionLayer for ResidentImageLayer { + fn key_range(&self) -> &Range { + &self.0.layer_desc().key_range + } + fn lsn_range(&self) -> &Range { + &self.0.layer_desc().lsn_range + } + fn file_size(&self) -> u64 { + self.0.layer_desc().file_size + } + fn short_id(&self) -> std::string::String { + self.0.layer_desc().short_id().to_string() + } + fn is_delta(&self) -> bool { + false + } +} +impl CompactionImageLayer for ResidentImageLayer {} diff --git a/pageserver/src/tenant/timeline/delete.rs b/pageserver/src/tenant/timeline/delete.rs index be873181d9..a0c9d99196 100644 --- a/pageserver/src/tenant/timeline/delete.rs +++ b/pageserver/src/tenant/timeline/delete.rs @@ -6,7 +6,7 @@ use std::{ use anyhow::Context; use pageserver_api::{models::TimelineState, shard::TenantShardId}; use tokio::sync::OwnedMutexGuard; -use tracing::{debug, error, info, instrument, warn, Instrument, Span}; +use tracing::{debug, error, info, instrument, Instrument}; use utils::{crashsafe, fs_ext, id::TimelineId}; use crate::{ @@ -124,7 +124,7 @@ async fn set_deleted_in_remote_index(timeline: &Timeline) -> Result<(), DeleteTi /// No timeout here, GC & Compaction should be responsive to the /// `TimelineState::Stopping` change. // pub(super): documentation link -pub(super) async fn delete_local_layer_files( +pub(super) async fn delete_local_timeline_directory( conf: &PageServerConf, tenant_shard_id: TenantShardId, timeline: &Timeline, @@ -149,8 +149,6 @@ pub(super) async fn delete_local_layer_files( // NB: This need not be atomic because the deleted flag in the IndexPart // will be observed during tenant/timeline load. The deletion will be resumed there. // - // For configurations without remote storage, we guarantee crash-safety by persising delete mark file. 
- // // Note that here we do not bail out on std::io::ErrorKind::NotFound. // This can happen if we're called a second time, e.g., // because of a previous failure/cancellation at/after @@ -158,72 +156,21 @@ pub(super) async fn delete_local_layer_files( // // ErrorKind::NotFound can also happen if we race with tenant detach, because, // no locks are shared. - // - // For now, log and continue. - // warn! level is technically not appropriate for the - // first case because we should expect retries to happen. - // But the error is so rare, it seems better to get attention if it happens. - // - // Note that metadata removal is skipped, this is not technically needed, - // but allows to reuse timeline loading code during resumed deletion. - // (we always expect that metadata is in place when timeline is being loaded) + tokio::fs::remove_dir_all(local_timeline_directory) + .await + .or_else(fs_ext::ignore_not_found) + .context("remove local timeline directory")?; - #[cfg(feature = "testing")] - let mut counter = 0; - - // Timeline directory may not exist if we failed to delete mark file and request was retried. - if !local_timeline_directory.exists() { - return Ok(()); - } - - let metadata_path = conf.metadata_path(&tenant_shard_id, &timeline.timeline_id); - - for entry in walkdir::WalkDir::new(&local_timeline_directory).contents_first(true) { - #[cfg(feature = "testing")] - { - counter += 1; - if counter == 2 { - fail::fail_point!("timeline-delete-during-rm", |_| { - Err(anyhow::anyhow!("failpoint: timeline-delete-during-rm"))? 
- }); - } - } - - let entry = entry?; - if entry.path() == metadata_path { - debug!("found metadata, skipping"); - continue; - } - - if entry.path() == local_timeline_directory { - // Keeping directory because metedata file is still there - debug!("found timeline dir itself, skipping"); - continue; - } - - let metadata = match entry.metadata() { - Ok(metadata) => metadata, - Err(e) => { - if crate::is_walkdir_io_not_found(&e) { - warn!( - timeline_dir=?local_timeline_directory, - path=?entry.path().display(), - "got not found err while removing timeline dir, proceeding anyway" - ); - continue; - } - anyhow::bail!(e); - } - }; - - if metadata.is_dir() { - warn!(path=%entry.path().display(), "unexpected directory under timeline dir"); - tokio::fs::remove_dir(entry.path()).await - } else { - tokio::fs::remove_file(entry.path()).await - } - .with_context(|| format!("Failed to remove: {}", entry.path().display()))?; - } + // Make sure previous deletions are ordered before mark removal. + // Otherwise there is no guarantee that they reach the disk before mark deletion. + // So its possible for mark to reach disk first and for other deletions + // to be reordered later and thus missed if a crash occurs. + // Note that we dont need to sync after mark file is removed + // because we can tolerate the case when mark file reappears on startup. 
+ let timeline_path = conf.timelines_path(&tenant_shard_id); + crashsafe::fsync_async(timeline_path) + .await + .context("fsync_pre_mark_remove")?; info!("finished deleting layer files, releasing locks"); drop(guards); @@ -254,39 +201,6 @@ async fn cleanup_remaining_timeline_fs_traces( tenant_shard_id: TenantShardId, timeline_id: TimelineId, ) -> anyhow::Result<()> { - // Remove local metadata - tokio::fs::remove_file(conf.metadata_path(&tenant_shard_id, &timeline_id)) - .await - .or_else(fs_ext::ignore_not_found) - .context("remove metadata")?; - - fail::fail_point!("timeline-delete-after-rm-metadata", |_| { - Err(anyhow::anyhow!( - "failpoint: timeline-delete-after-rm-metadata" - ))? - }); - - // Remove timeline dir - tokio::fs::remove_dir(conf.timeline_path(&tenant_shard_id, &timeline_id)) - .await - .or_else(fs_ext::ignore_not_found) - .context("timeline dir")?; - - fail::fail_point!("timeline-delete-after-rm-dir", |_| { - Err(anyhow::anyhow!("failpoint: timeline-delete-after-rm-dir"))? - }); - - // Make sure previous deletions are ordered before mark removal. - // Otherwise there is no guarantee that they reach the disk before mark deletion. - // So its possible for mark to reach disk first and for other deletions - // to be reordered later and thus missed if a crash occurs. - // Note that we dont need to sync after mark file is removed - // because we can tolerate the case when mark file reappears on startup. - let timeline_path = conf.timelines_path(&tenant_shard_id); - crashsafe::fsync_async(timeline_path) - .await - .context("fsync_pre_mark_remove")?; - // Remove delete mark // TODO: once we are confident that no more exist in the field, remove this // line. It cleans up a legacy marker file that might in rare cases be present. @@ -356,12 +270,14 @@ impl DeleteTimelineFlow { // NB: If this fails half-way through, and is retried, the retry will go through // all the same steps again. 
Make sure the code here is idempotent, and don't // error out if some of the shutdown tasks have already been completed! - #[instrument(skip(tenant), fields(tenant_id=%tenant.tenant_shard_id.tenant_id, shard_id=%tenant.tenant_shard_id.shard_slug()))] + #[instrument(skip_all, fields(%inplace))] pub async fn run( tenant: &Arc, timeline_id: TimelineId, inplace: bool, ) -> Result<(), DeleteTimelineError> { + super::debug_assert_current_span_has_tenant_and_timeline_id(); + let (timeline, mut guard) = Self::prepare(tenant, timeline_id)?; guard.mark_in_progress()?; @@ -417,6 +333,7 @@ impl DeleteTimelineFlow { TimelineResources { remote_client, deletion_queue_client, + timeline_get_throttle: tenant.timeline_get_throttle.clone(), }, // Important. We dont pass ancestor above because it can be missing. // Thus we need to skip the validation here. @@ -539,12 +456,7 @@ impl DeleteTimelineFlow { }; Ok(()) } - .instrument({ - let span = - tracing::info_span!(parent: None, "delete_timeline", tenant_id=%tenant_shard_id.tenant_id, shard_id=%tenant_shard_id.shard_slug(),timeline_id=%timeline_id); - span.follows_from(Span::current()); - span - }), + .instrument(tracing::info_span!(parent: None, "delete_timeline", tenant_id=%tenant_shard_id.tenant_id, shard_id=%tenant_shard_id.shard_slug(),timeline_id=%timeline_id)), ); } @@ -554,15 +466,12 @@ impl DeleteTimelineFlow { tenant: &Tenant, timeline: &Timeline, ) -> Result<(), DeleteTimelineError> { - delete_local_layer_files(conf, tenant.tenant_shard_id, timeline).await?; + delete_local_timeline_directory(conf, tenant.tenant_shard_id, timeline).await?; delete_remote_layers_and_index(timeline).await?; pausable_failpoint!("in_progress_delete"); - cleanup_remaining_timeline_fs_traces(conf, tenant.tenant_shard_id, timeline.timeline_id) - .await?; - remove_timeline_from_tenant(tenant, timeline.timeline_id, &guard).await?; *guard = Self::Finished; diff --git a/pageserver/src/tenant/timeline/eviction_task.rs 
b/pageserver/src/tenant/timeline/eviction_task.rs index 01a5bfc32b..dd603135d2 100644 --- a/pageserver/src/tenant/timeline/eviction_task.rs +++ b/pageserver/src/tenant/timeline/eviction_task.rs @@ -34,7 +34,7 @@ use crate::{ }, }; -use utils::completion; +use utils::{completion, sync::gate::GateGuard}; use super::Timeline; @@ -81,10 +81,17 @@ impl Timeline { #[instrument(skip_all, fields(tenant_id = %self.tenant_shard_id.tenant_id, shard_id = %self.tenant_shard_id.shard_slug(), timeline_id = %self.timeline_id))] async fn eviction_task(self: Arc, cancel: CancellationToken) { use crate::tenant::tasks::random_init_delay; + + // acquire the gate guard only once within a useful span + let Ok(guard) = self.gate.enter() else { + return; + }; + { let policy = self.get_eviction_policy(); let period = match policy { EvictionPolicy::LayerAccessThreshold(lat) => lat.period, + EvictionPolicy::OnlyImitiate(lat) => lat.period, EvictionPolicy::NoEviction => Duration::from_secs(10), }; if random_init_delay(period, &cancel).await.is_err() { @@ -95,7 +102,9 @@ impl Timeline { let ctx = RequestContext::new(TaskKind::Eviction, DownloadBehavior::Warn); loop { let policy = self.get_eviction_policy(); - let cf = self.eviction_iteration(&policy, &cancel, &ctx).await; + let cf = self + .eviction_iteration(&policy, &cancel, &guard, &ctx) + .await; match cf { ControlFlow::Break(()) => break, @@ -116,42 +125,59 @@ impl Timeline { self: &Arc, policy: &EvictionPolicy, cancel: &CancellationToken, + gate: &GateGuard, ctx: &RequestContext, ) -> ControlFlow<(), Instant> { debug!("eviction iteration: {policy:?}"); - match policy { + let start = Instant::now(); + let (period, threshold) = match policy { EvictionPolicy::NoEviction => { // check again in 10 seconds; XXX config watch mechanism - ControlFlow::Continue(Instant::now() + Duration::from_secs(10)) + return ControlFlow::Continue(Instant::now() + Duration::from_secs(10)); } EvictionPolicy::LayerAccessThreshold(p) => { - let start = 
Instant::now(); - match self.eviction_iteration_threshold(p, cancel, ctx).await { + match self + .eviction_iteration_threshold(p, cancel, gate, ctx) + .await + { ControlFlow::Break(()) => return ControlFlow::Break(()), ControlFlow::Continue(()) => (), } - let elapsed = start.elapsed(); - crate::tenant::tasks::warn_when_period_overrun( - elapsed, - p.period, - BackgroundLoopKind::Eviction, - ); - crate::metrics::EVICTION_ITERATION_DURATION - .get_metric_with_label_values(&[ - &format!("{}", p.period.as_secs()), - &format!("{}", p.threshold.as_secs()), - ]) - .unwrap() - .observe(elapsed.as_secs_f64()); - ControlFlow::Continue(start + p.period) + (p.period, p.threshold) } - } + EvictionPolicy::OnlyImitiate(p) => { + if self.imitiate_only(p, cancel, gate, ctx).await.is_break() { + return ControlFlow::Break(()); + } + (p.period, p.threshold) + } + }; + + let elapsed = start.elapsed(); + crate::tenant::tasks::warn_when_period_overrun( + elapsed, + period, + BackgroundLoopKind::Eviction, + ); + // FIXME: if we were to mix policies on a pageserver, we would have no way to sense this. I + // don't think that is a relevant fear however, and regardless the imitation should be the + // most costly part. + crate::metrics::EVICTION_ITERATION_DURATION + .get_metric_with_label_values(&[ + &format!("{}", period.as_secs()), + &format!("{}", threshold.as_secs()), + ]) + .unwrap() + .observe(elapsed.as_secs_f64()); + + ControlFlow::Continue(start + period) } async fn eviction_iteration_threshold( self: &Arc, p: &EvictionPolicyLayerAccessThreshold, cancel: &CancellationToken, + gate: &GateGuard, ctx: &RequestContext, ) -> ControlFlow<()> { let now = SystemTime::now(); @@ -167,42 +193,19 @@ impl Timeline { _ = self.cancel.cancelled() => return ControlFlow::Break(()), }; - // If we evict layers but keep cached values derived from those layers, then - // we face a storm of on-demand downloads after pageserver restart. 
- // The reason is that the restart empties the caches, and so, the values - // need to be re-computed by accessing layers, which we evicted while the - // caches were filled. - // - // Solutions here would be one of the following: - // 1. Have a persistent cache. - // 2. Count every access to a cached value to the access stats of all layers - // that were accessed to compute the value in the first place. - // 3. Invalidate the caches at a period of < p.threshold/2, so that the values - // get re-computed from layers, thereby counting towards layer access stats. - // 4. Make the eviction task imitate the layer accesses that typically hit caches. - // - // We follow approach (4) here because in Neon prod deployment: - // - page cache is quite small => high churn => low hit rate - // => eviction gets correct access stats - // - value-level caches such as logical size & repatition have a high hit rate, - // especially for inactive tenants - // => eviction sees zero accesses for these - // => they cause the on-demand download storm on pageserver restart - // - // We should probably move to persistent caches in the future, or avoid - // having inactive tenants attached to pageserver in the first place. - match self.imitate_layer_accesses(p, cancel, ctx).await { + match self.imitate_layer_accesses(p, cancel, gate, ctx).await { ControlFlow::Break(()) => return ControlFlow::Break(()), ControlFlow::Continue(()) => (), } - #[allow(dead_code)] #[derive(Debug, Default)] struct EvictionStats { candidates: usize, evicted: usize, errors: usize, not_evictable: usize, + timeouts: usize, + #[allow(dead_code)] skipped_for_shutdown: usize, } @@ -239,12 +242,7 @@ impl Timeline { } }; - let last_activity_ts = hist_layer.access_stats().latest_activity().unwrap_or_else(|| { - // We only use this fallback if there's an implementation error. - // `latest_activity` already does rate-limited warn!() log. 
- debug!(layer=%hist_layer, "last_activity returns None, using SystemTime::now"); - SystemTime::now() - }); + let last_activity_ts = hist_layer.access_stats().latest_activity_or_now(); let no_activity_for = match now.duration_since(last_activity_ts) { Ok(d) => d, @@ -270,7 +268,11 @@ impl Timeline { let layer = guard.drop_eviction_guard(); if no_activity_for > p.threshold { // this could cause a lot of allocations in some cases - js.spawn(async move { layer.evict_and_wait().await }); + js.spawn(async move { + layer + .evict_and_wait(std::time::Duration::from_secs(5)) + .await + }); stats.candidates += 1; } } @@ -283,6 +285,9 @@ impl Timeline { Ok(Err(EvictionError::NotFound | EvictionError::Downloaded)) => { stats.not_evictable += 1; } + Ok(Err(EvictionError::Timeout)) => { + stats.timeouts += 1; + } Err(je) if je.is_cancelled() => unreachable!("not used"), Err(je) if je.is_panic() => { /* already logged */ @@ -298,7 +303,8 @@ impl Timeline { stats = join_all => { if stats.candidates == stats.not_evictable { debug!(stats=?stats, "eviction iteration complete"); - } else if stats.errors > 0 || stats.not_evictable > 0 { + } else if stats.errors > 0 || stats.not_evictable > 0 || stats.timeouts > 0 { + // reminder: timeouts are not eviction cancellations warn!(stats=?stats, "eviction iteration complete"); } else { info!(stats=?stats, "eviction iteration complete"); @@ -312,13 +318,68 @@ impl Timeline { ControlFlow::Continue(()) } + /// Like `eviction_iteration_threshold`, but without any eviction. Eviction will be done by + /// disk usage based eviction task. + async fn imitiate_only( + self: &Arc, + p: &EvictionPolicyLayerAccessThreshold, + cancel: &CancellationToken, + gate: &GateGuard, + ctx: &RequestContext, + ) -> ControlFlow<()> { + let acquire_permit = crate::tenant::tasks::concurrent_background_tasks_rate_limit_permit( + BackgroundLoopKind::Eviction, + ctx, + ); + + let _permit = tokio::select! 
{ + permit = acquire_permit => permit, + _ = cancel.cancelled() => return ControlFlow::Break(()), + _ = self.cancel.cancelled() => return ControlFlow::Break(()), + }; + + self.imitate_layer_accesses(p, cancel, gate, ctx).await + } + + /// If we evict layers but keep cached values derived from those layers, then + /// we face a storm of on-demand downloads after pageserver restart. + /// The reason is that the restart empties the caches, and so, the values + /// need to be re-computed by accessing layers, which we evicted while the + /// caches were filled. + /// + /// Solutions here would be one of the following: + /// 1. Have a persistent cache. + /// 2. Count every access to a cached value to the access stats of all layers + /// that were accessed to compute the value in the first place. + /// 3. Invalidate the caches at a period of < p.threshold/2, so that the values + /// get re-computed from layers, thereby counting towards layer access stats. + /// 4. Make the eviction task imitate the layer accesses that typically hit caches. + /// + /// We follow approach (4) here because in Neon prod deployment: + /// - page cache is quite small => high churn => low hit rate + /// => eviction gets correct access stats + /// - value-level caches such as logical size & repatition have a high hit rate, + /// especially for inactive tenants + /// => eviction sees zero accesses for these + /// => they cause the on-demand download storm on pageserver restart + /// + /// We should probably move to persistent caches in the future, or avoid + /// having inactive tenants attached to pageserver in the first place. 
#[instrument(skip_all)] async fn imitate_layer_accesses( &self, p: &EvictionPolicyLayerAccessThreshold, cancel: &CancellationToken, + gate: &GateGuard, ctx: &RequestContext, ) -> ControlFlow<()> { + if !self.tenant_shard_id.is_zero() { + // Shards !=0 do not maintain accurate relation sizes, and do not need to calculate logical size + // for consumption metrics (consumption metrics are only sent from shard 0). We may therefore + // skip imitating logical size accesses for eviction purposes. + return ControlFlow::Continue(()); + } + let mut state = self.eviction_task_timeline_state.lock().await; // Only do the imitate_layer accesses approximately as often as the threshold. A little @@ -328,7 +389,7 @@ impl Timeline { match state.last_layer_access_imitation { Some(ts) if ts.elapsed() < inter_imitate_period => { /* no need to run */ } _ => { - self.imitate_timeline_cached_layer_accesses(ctx).await; + self.imitate_timeline_cached_layer_accesses(gate, ctx).await; state.last_layer_access_imitation = Some(tokio::time::Instant::now()) } } @@ -368,12 +429,21 @@ impl Timeline { /// Recompute the values which would cause on-demand downloads during restart. 
#[instrument(skip_all)] - async fn imitate_timeline_cached_layer_accesses(&self, ctx: &RequestContext) { + async fn imitate_timeline_cached_layer_accesses( + &self, + guard: &GateGuard, + ctx: &RequestContext, + ) { let lsn = self.get_last_record_lsn(); // imitiate on-restart initial logical size let size = self - .calculate_logical_size(lsn, LogicalSizeCalculationCause::EvictionTaskImitation, ctx) + .calculate_logical_size( + lsn, + LogicalSizeCalculationCause::EvictionTaskImitation, + guard, + ctx, + ) .instrument(info_span!("calculate_logical_size")) .await; diff --git a/pageserver/src/tenant/timeline/layer_manager.rs b/pageserver/src/tenant/timeline/layer_manager.rs index e38f5be209..ebcdcfdb4d 100644 --- a/pageserver/src/tenant/timeline/layer_manager.rs +++ b/pageserver/src/tenant/timeline/layer_manager.rs @@ -1,4 +1,5 @@ use anyhow::{bail, ensure, Context, Result}; +use futures::StreamExt; use pageserver_api::shard::TenantShardId; use std::{collections::HashMap, sync::Arc}; use tracing::trace; @@ -20,19 +21,13 @@ use crate::{ }; /// Provides semantic APIs to manipulate the layer map. +#[derive(Default)] pub(crate) struct LayerManager { layer_map: LayerMap, layer_fmgr: LayerFileManager, } impl LayerManager { - pub(crate) fn create() -> Self { - Self { - layer_map: LayerMap::default(), - layer_fmgr: LayerFileManager::new(), - } - } - pub(crate) fn get_from_desc(&self, desc: &PersistentLayerDesc) -> Layer { self.layer_fmgr.get_from_desc(desc) } @@ -246,6 +241,32 @@ impl LayerManager { layer.delete_on_drop(); } + pub(crate) fn resident_layers(&self) -> impl futures::stream::Stream + '_ { + // for small layer maps, we most likely have all resident, but for larger more are likely + // to be evicted assuming lots of layers correlated with longer lifespan. 
+ + let layers = self + .layer_map() + .iter_historic_layers() + .map(|desc| self.get_from_desc(&desc)); + + let layers = futures::stream::iter(layers); + + layers.filter_map(|layer| async move { + // TODO(#6028): this query does not really need to see the ResidentLayer + match layer.keep_resident().await { + Ok(Some(layer)) => Some(layer.drop_eviction_guard()), + Ok(None) => None, + Err(e) => { + // these should not happen, but we cannot make them statically impossible right + // now. + tracing::warn!(%layer, "failed to keep the layer resident: {e:#}"); + None + } + } + }) + } + pub(crate) fn contains(&self, layer: &Layer) -> bool { self.layer_fmgr.contains(layer) } @@ -253,6 +274,12 @@ impl LayerManager { pub(crate) struct LayerFileManager(HashMap); +impl Default for LayerFileManager { + fn default() -> Self { + Self(HashMap::default()) + } +} + impl LayerFileManager { fn get_from_desc(&self, desc: &PersistentLayerDesc) -> T { // The assumption for the `expect()` is that all code maintains the following invariant: @@ -275,10 +302,6 @@ impl LayerFileManager { self.0.contains_key(&layer.layer_desc().key()) } - pub(crate) fn new() -> Self { - Self(HashMap::new()) - } - pub(crate) fn remove(&mut self, layer: &T) { let present = self.0.remove(&layer.layer_desc().key()); if present.is_none() && cfg!(debug_assertions) { diff --git a/pageserver/src/tenant/timeline/logical_size.rs b/pageserver/src/tenant/timeline/logical_size.rs index 03bc59ea38..8f9ca0e29f 100644 --- a/pageserver/src/tenant/timeline/logical_size.rs +++ b/pageserver/src/tenant/timeline/logical_size.rs @@ -101,6 +101,14 @@ impl From<&Exact> for u64 { } } +impl Approximate { + /// For use in situations where we don't have a sane logical size value but need + /// to return something, e.g. in HTTP API on shard >0 of a sharded tenant. 
+ pub(crate) fn zero() -> Self { + Self(0) + } +} + impl CurrentLogicalSize { pub(crate) fn size_dont_care_about_accuracy(&self) -> u64 { match self { diff --git a/pageserver/src/tenant/timeline/span.rs b/pageserver/src/tenant/timeline/span.rs index 3b580c9d1b..8b13789179 100644 --- a/pageserver/src/tenant/timeline/span.rs +++ b/pageserver/src/tenant/timeline/span.rs @@ -1,20 +1 @@ -#[cfg(debug_assertions)] -use utils::tracing_span_assert::{check_fields_present, Extractor, MultiNameExtractor}; -#[cfg(not(debug_assertions))] -pub(crate) fn debug_assert_current_span_has_tenant_and_timeline_id() {} - -#[cfg(debug_assertions)] -#[track_caller] -pub(crate) fn debug_assert_current_span_has_tenant_and_timeline_id() { - static TIMELINE_ID_EXTRACTOR: once_cell::sync::Lazy> = - once_cell::sync::Lazy::new(|| MultiNameExtractor::new("TimelineId", ["timeline_id"])); - - let fields: [&dyn Extractor; 2] = [ - &*crate::tenant::span::TENANT_ID_EXTRACTOR, - &*TIMELINE_ID_EXTRACTOR, - ]; - if let Err(missing) = check_fields_present!(fields) { - panic!("missing extractors: {missing:?}") - } -} diff --git a/pageserver/src/tenant/timeline/walreceiver/walreceiver_connection.rs b/pageserver/src/tenant/timeline/walreceiver/walreceiver_connection.rs index e398d683e5..8297ca6563 100644 --- a/pageserver/src/tenant/timeline/walreceiver/walreceiver_connection.rs +++ b/pageserver/src/tenant/timeline/walreceiver/walreceiver_connection.rs @@ -389,15 +389,16 @@ pub(super) async fn handle_walreceiver_connection( } } - timeline - .check_checkpoint_distance() - .await - .with_context(|| { - format!( - "Failed to check checkpoint distance for timeline {}", - timeline.timeline_id - ) - })?; + { + // This is a hack. It piggybacks on the keepalive messages sent by the + // safekeeper in order to enforce `checkpoint_timeout` on the currently + // open layer. This hack doesn't provide a bound on the total size of + // in-memory layers on a pageserver. See https://github.com/neondatabase/neon/issues/6916. 
+ let mut writer = timeline.writer().await; + if let Err(err) = writer.tick().await { + warn!("Timeline writer tick failed: {err}"); + } + } if let Some(last_lsn) = status_update { let timeline_remote_consistent_lsn = timeline @@ -426,13 +427,21 @@ pub(super) async fn handle_walreceiver_connection( // Send the replication feedback message. // Regular standby_status_update fields are put into this message. - let current_timeline_size = timeline - .get_current_logical_size( - crate::tenant::timeline::GetLogicalSizePriority::User, - &ctx, - ) - // FIXME: https://github.com/neondatabase/neon/issues/5963 - .size_dont_care_about_accuracy(); + let current_timeline_size = if timeline.tenant_shard_id.is_zero() { + timeline + .get_current_logical_size( + crate::tenant::timeline::GetLogicalSizePriority::User, + &ctx, + ) + // FIXME: https://github.com/neondatabase/neon/issues/5963 + .size_dont_care_about_accuracy() + } else { + // Non-zero shards send zero for logical size. The safekeeper will ignore + // this number. This is because in a sharded tenant, only shard zero maintains + // accurate logical size. + 0 + }; + let status_update = PageserverFeedback { current_timeline_size, last_received_lsn, diff --git a/pageserver/src/tenant/upload_queue.rs b/pageserver/src/tenant/upload_queue.rs index 0b61bc0a10..a5516bb9a9 100644 --- a/pageserver/src/tenant/upload_queue.rs +++ b/pageserver/src/tenant/upload_queue.rs @@ -130,7 +130,7 @@ pub(super) struct UploadQueueStopped { pub(crate) enum NotInitialized { #[error("queue is in state Uninitialized")] Uninitialized, - #[error("queue is in state Stopping")] + #[error("queue is in state Stopped")] Stopped, #[error("queue is shutting down")] ShuttingDown, diff --git a/pageserver/src/tenant/vectored_blob_io.rs b/pageserver/src/tenant/vectored_blob_io.rs new file mode 100644 index 0000000000..805f70b23b --- /dev/null +++ b/pageserver/src/tenant/vectored_blob_io.rs @@ -0,0 +1,436 @@ +//! +//! 
Utilities for vectored reading of variable-sized "blobs". +//! +//! The "blob" api is an abstraction on top of the "block" api, +//! with the main difference being that blobs do not have a fixed +//! size (each blob is prefixed with 1 or 4 byte length field) +//! +//! The vectored apis provided in this module allow for planning +//! and executing disk IO which covers multiple blobs. +//! +//! Reads are planned with [`VectoredReadPlanner`] which will coalesce +//! adjacent blocks into a single disk IO request and exectuted by +//! [`VectoredBlobReader`] which does all the required offset juggling +//! and returns a buffer housing all the blobs and a list of offsets. +//! +//! Note that the vectored blob api does *not* go through the page cache. + +use std::collections::BTreeMap; +use std::num::NonZeroUsize; + +use bytes::BytesMut; +use pageserver_api::key::Key; +use utils::lsn::Lsn; +use utils::vec_map::VecMap; + +use crate::virtual_file::VirtualFile; + +#[derive(Copy, Clone, Debug, PartialEq, Eq)] +pub struct MaxVectoredReadBytes(pub NonZeroUsize); + +/// Metadata bundled with the start and end offset of a blob. +#[derive(Copy, Clone, Debug)] +pub struct BlobMeta { + pub key: Key, + pub lsn: Lsn, +} + +/// Blob offsets into [`VectoredBlobsBuf::buf`] +pub struct VectoredBlob { + pub start: usize, + pub end: usize, + pub meta: BlobMeta, +} + +/// Return type of [`VectoredBlobReader::read_blobs`] +pub struct VectoredBlobsBuf { + /// Buffer for all blobs in this read + pub buf: BytesMut, + /// Offsets into the buffer and metadata for all blobs in this read + pub blobs: Vec, +} + +/// Description of one disk read for multiple blobs. 
+/// Used as the argument form [`VectoredBlobReader::read_blobs`] +#[derive(Debug)] +pub struct VectoredRead { + pub start: u64, + pub end: u64, + /// Starting offsets and metadata for each blob in this read + pub blobs_at: VecMap, +} + +impl VectoredRead { + fn size(&self) -> usize { + (self.end - self.start) as usize + } +} + +#[derive(Eq, PartialEq)] +enum VectoredReadExtended { + Yes, + No, +} + +struct VectoredReadBuilder { + start: u64, + end: u64, + blobs_at: VecMap, + max_read_size: usize, +} + +impl VectoredReadBuilder { + fn new(start_offset: u64, end_offset: u64, meta: BlobMeta, max_read_size: usize) -> Self { + let mut blobs_at = VecMap::default(); + blobs_at + .append(start_offset, meta) + .expect("First insertion always succeeds"); + + Self { + start: start_offset, + end: end_offset, + blobs_at, + max_read_size, + } + } + + /// Attempt to extend the current read with a new blob if the start + /// offset matches with the current end of the vectored read + /// and the resuting size is below the max read size + fn extend(&mut self, start: u64, end: u64, meta: BlobMeta) -> VectoredReadExtended { + let size = (end - start) as usize; + if self.end == start && self.size() + size <= self.max_read_size { + self.end = end; + self.blobs_at + .append(start, meta) + .expect("LSNs are ordered within vectored reads"); + + return VectoredReadExtended::Yes; + } + + VectoredReadExtended::No + } + + fn size(&self) -> usize { + (self.end - self.start) as usize + } + + fn build(self) -> VectoredRead { + VectoredRead { + start: self.start, + end: self.end, + blobs_at: self.blobs_at, + } + } +} + +#[derive(Copy, Clone, Debug)] +pub enum BlobFlag { + None, + Ignore, + ReplaceAll, +} + +/// Planner for vectored blob reads. +/// +/// Blob offsets are received via [`VectoredReadPlanner::handle`] +/// and coalesced into disk reads. 
+/// +/// The implementation is very simple: +/// * Collect all blob offsets in an ordered structure +/// * Iterate over the collected blobs and coalesce them into reads at the end +pub struct VectoredReadPlanner { + // Track all the blob offsets. Start offsets must be ordered. + blobs: BTreeMap>, + // Arguments for previous blob passed into [`VectoredReadPlanner::handle`] + prev: Option<(Key, Lsn, u64, BlobFlag)>, + + max_read_size: usize, +} + +impl VectoredReadPlanner { + pub fn new(max_read_size: usize) -> Self { + Self { + blobs: BTreeMap::new(), + prev: None, + max_read_size, + } + } + + /// Include a new blob in the read plan. + /// + /// This function is called from a B-Tree index visitor (see `DeltaLayerInner::plan_reads` + /// and `ImageLayerInner::plan_reads`). Said visitor wants to collect blob offsets for all + /// keys in a given keyspace. This function must be called for each key in the desired + /// keyspace (monotonically continuous). [`Self::handle_range_end`] must + /// be called after every range in the offset. + /// + /// In the event that keys are skipped, the behaviour is undefined and can lead to an + /// incorrect read plan. We can end up asserting, erroring in wal redo or returning + /// incorrect data to the user. + /// + /// The `flag` argument has two interesting values: + /// * [`BlobFlag::ReplaceAll`]: The blob for this key should replace all existing blobs. + /// This is used for WAL records that `will_init`. + /// * [`BlobFlag::Ignore`]: This blob should not be included in the read. This happens + /// if the blob is cached. 
+ pub fn handle(&mut self, key: Key, lsn: Lsn, offset: u64, flag: BlobFlag) { + // Implementation note: internally lag behind by one blob such that + // we have a start and end offset when initialising [`VectoredRead`] + let (prev_key, prev_lsn, prev_offset, prev_flag) = match self.prev { + None => { + self.prev = Some((key, lsn, offset, flag)); + return; + } + Some(prev) => prev, + }; + + self.add_blob(prev_key, prev_lsn, prev_offset, offset, prev_flag); + + self.prev = Some((key, lsn, offset, flag)); + } + + pub fn handle_range_end(&mut self, offset: u64) { + if let Some((prev_key, prev_lsn, prev_offset, prev_flag)) = self.prev { + self.add_blob(prev_key, prev_lsn, prev_offset, offset, prev_flag); + } + + self.prev = None; + } + + fn add_blob(&mut self, key: Key, lsn: Lsn, start_offset: u64, end_offset: u64, flag: BlobFlag) { + match flag { + BlobFlag::None => { + let blobs_for_key = self.blobs.entry(key).or_default(); + blobs_for_key.push((lsn, start_offset, end_offset)); + } + BlobFlag::ReplaceAll => { + let blobs_for_key = self.blobs.entry(key).or_default(); + blobs_for_key.clear(); + blobs_for_key.push((lsn, start_offset, end_offset)); + } + BlobFlag::Ignore => {} + } + } + + pub fn finish(self) -> Vec { + let mut current_read_builder: Option = None; + let mut reads = Vec::new(); + + for (key, blobs_for_key) in self.blobs { + for (lsn, start_offset, end_offset) in blobs_for_key { + let extended = match &mut current_read_builder { + Some(read_builder) => { + read_builder.extend(start_offset, end_offset, BlobMeta { key, lsn }) + } + None => VectoredReadExtended::No, + }; + + if extended == VectoredReadExtended::No { + let next_read_builder = VectoredReadBuilder::new( + start_offset, + end_offset, + BlobMeta { key, lsn }, + self.max_read_size, + ); + + let prev_read_builder = current_read_builder.replace(next_read_builder); + + // `current_read_builder` is None in the first iteration of the outer loop + if let Some(read_builder) = prev_read_builder { + 
reads.push(read_builder.build()); + } + } + } + } + + if let Some(read_builder) = current_read_builder { + reads.push(read_builder.build()); + } + + reads + } +} + +/// Disk reader for vectored blob spans (does not go through the page cache) +pub struct VectoredBlobReader<'a> { + file: &'a VirtualFile, +} + +impl<'a> VectoredBlobReader<'a> { + pub fn new(file: &'a VirtualFile) -> Self { + Self { file } + } + + /// Read the requested blobs into the buffer. + /// + /// We have to deal with the fact that blobs are not fixed size. + /// Each blob is prefixed by a size header. + /// + /// The success return value is a struct which contains the buffer + /// filled from disk and a list of offsets at which each blob lies + /// in the buffer. + pub async fn read_blobs( + &self, + read: &VectoredRead, + buf: BytesMut, + ) -> Result { + assert!(read.size() > 0); + assert!( + read.size() <= buf.capacity(), + "{} > {}", + read.size(), + buf.capacity() + ); + let buf = self + .file + .read_exact_at_n(buf, read.start, read.size()) + .await?; + + let blobs_at = read.blobs_at.as_slice(); + let start_offset = blobs_at.first().expect("VectoredRead is never empty").0; + + let mut metas = Vec::with_capacity(blobs_at.len()); + + // Blobs in `read` only provide their starting offset. The end offset + // of a blob is implicit: the start of the next blob if one exists + // or the end of the read. + let pairs = blobs_at.iter().zip( + blobs_at + .iter() + .map(Some) + .skip(1) + .chain(std::iter::once(None)), + ); + + for ((offset, meta), next) in pairs { + let offset_in_buf = offset - start_offset; + let first_len_byte = buf[offset_in_buf as usize]; + + // Each blob is prefixed by a header containing it's size. + // Extract the size and skip that header to find the start of the data. + // The size can be 1 or 4 bytes. The most significant bit is 0 in the + // 1 byte case and 1 in the 4 byte case. 
+ let (size_length, blob_size) = if first_len_byte < 0x80 { + (1, first_len_byte as u64) + } else { + let mut blob_size_buf = [0u8; 4]; + let offset_in_buf = offset_in_buf as usize; + + blob_size_buf.copy_from_slice(&buf[offset_in_buf..offset_in_buf + 4]); + blob_size_buf[0] &= 0x7f; + (4, u32::from_be_bytes(blob_size_buf) as u64) + }; + + let start = offset_in_buf + size_length; + let end = match next { + Some((next_blob_start_offset, _)) => next_blob_start_offset - start_offset, + None => start + blob_size, + }; + + assert_eq!(end - start, blob_size); + + metas.push(VectoredBlob { + start: start as usize, + end: end as usize, + meta: *meta, + }) + } + + Ok(VectoredBlobsBuf { buf, blobs: metas }) + } +} + +#[cfg(test)] +mod tests { + use super::*; + + fn validate_read(read: &VectoredRead, offset_range: &[(Key, Lsn, u64, BlobFlag)]) { + assert_eq!(read.start, offset_range.first().unwrap().2); + + let expected_offsets_in_read: Vec<_> = offset_range.iter().map(|o| o.2).collect(); + + let offsets_in_read: Vec<_> = read + .blobs_at + .as_slice() + .iter() + .map(|(offset, _)| *offset) + .collect(); + + assert_eq!(expected_offsets_in_read, offsets_in_read); + } + + #[test] + fn planner_max_read_size_test() { + let max_read_size = 128 * 1024; + let key = Key::MIN; + let lsn = Lsn(0); + + let blob_descriptions = vec![ + (key, lsn, 0, BlobFlag::None), + (key, lsn, 32 * 1024, BlobFlag::None), + (key, lsn, 96 * 1024, BlobFlag::None), // Last in read 1 + (key, lsn, 128 * 1024, BlobFlag::None), // Last in read 2 + (key, lsn, 198 * 1024, BlobFlag::None), // Last in read 3 + (key, lsn, 268 * 1024, BlobFlag::None), // Last in read 4 + (key, lsn, 396 * 1024, BlobFlag::None), // Last in read 5 + (key, lsn, 652 * 1024, BlobFlag::None), // Last in read 6 + ]; + + let ranges = [ + &blob_descriptions[0..3], + &blob_descriptions[3..4], + &blob_descriptions[4..5], + &blob_descriptions[5..6], + &blob_descriptions[6..7], + &blob_descriptions[7..], + ]; + + let mut planner = 
VectoredReadPlanner::new(max_read_size); + for (key, lsn, offset, flag) in blob_descriptions.clone() { + planner.handle(key, lsn, offset, flag); + } + + planner.handle_range_end(652 * 1024); + + let reads = planner.finish(); + assert_eq!(reads.len(), 6); + + for (idx, read) in reads.iter().enumerate() { + validate_read(read, ranges[idx]); + } + } + + #[test] + fn planner_replacement_test() { + let max_read_size = 128 * 1024; + let first_key = Key::MIN; + let second_key = first_key.next(); + let lsn = Lsn(0); + + let blob_descriptions = vec![ + (first_key, lsn, 0, BlobFlag::None), // First in read 1 + (first_key, lsn, 1024, BlobFlag::None), // Last in read 1 + (second_key, lsn, 2 * 1024, BlobFlag::ReplaceAll), + (second_key, lsn, 3 * 1024, BlobFlag::None), + (second_key, lsn, 4 * 1024, BlobFlag::ReplaceAll), // First in read 2 + (second_key, lsn, 5 * 1024, BlobFlag::None), // Last in read 2 + ]; + + let ranges = [&blob_descriptions[0..2], &blob_descriptions[4..]]; + + let mut planner = VectoredReadPlanner::new(max_read_size); + for (key, lsn, offset, flag) in blob_descriptions.clone() { + planner.handle(key, lsn, offset, flag); + } + + planner.handle_range_end(6 * 1024); + + let reads = planner.finish(); + assert_eq!(reads.len(), 2); + + for (idx, read) in reads.iter().enumerate() { + validate_read(read, ranges[idx]); + } + } +} diff --git a/pageserver/src/utilization.rs b/pageserver/src/utilization.rs new file mode 100644 index 0000000000..830c9897ca --- /dev/null +++ b/pageserver/src/utilization.rs @@ -0,0 +1,38 @@ +//! An utilization metric which is used to decide on which pageserver to put next tenant. +//! +//! The metric is exposed via `GET /v1/utilization`. Refer and maintain it's openapi spec as the +//! truth. 
+ +use anyhow::Context; +use std::path::Path; + +use pageserver_api::models::PageserverUtilization; + +pub(crate) fn regenerate(tenants_path: &Path) -> anyhow::Result { + // TODO: currently the http api ratelimits this to 1Hz at most, which is probably good enough + + let statvfs = nix::sys::statvfs::statvfs(tenants_path) + .map_err(std::io::Error::from) + .context("statvfs tenants directory")?; + + let blocksz = statvfs.block_size(); + + #[cfg_attr(not(target_os = "macos"), allow(clippy::unnecessary_cast))] + let free = statvfs.blocks_available() as u64 * blocksz; + let used = crate::metrics::RESIDENT_PHYSICAL_SIZE_GLOBAL.get(); + let captured_at = std::time::SystemTime::now(); + + let doc = PageserverUtilization { + disk_usage_bytes: used, + free_space_bytes: free, + // lower is better; start with a constant + // + // note that u64::MAX will be output as i64::MAX as u64, but that should not matter + utilization_score: u64::MAX, + captured_at, + }; + + // TODO: make utilization_score into a metric + + Ok(doc) +} diff --git a/pageserver/src/virtual_file.rs b/pageserver/src/virtual_file.rs index d200a4ba5e..6d4774cf75 100644 --- a/pageserver/src/virtual_file.rs +++ b/pageserver/src/virtual_file.rs @@ -17,20 +17,21 @@ use crate::tenant::TENANTS_SEGMENT_NAME; use camino::{Utf8Path, Utf8PathBuf}; use once_cell::sync::OnceCell; use pageserver_api::shard::TenantShardId; -use std::fs::{self, File}; +use std::fs::File; use std::io::{Error, ErrorKind, Seek, SeekFrom}; -use tokio_epoll_uring::IoBufMut; +use tokio_epoll_uring::{BoundedBuf, IoBuf, IoBufMut, Slice}; use std::os::fd::{AsRawFd, FromRawFd, IntoRawFd, OwnedFd, RawFd}; -use std::os::unix::fs::FileExt; use std::sync::atomic::{AtomicBool, AtomicUsize, Ordering}; use tokio::sync::{RwLock, RwLockReadGuard, RwLockWriteGuard}; use tokio::time::Instant; -use utils::fs_ext; -mod io_engine; +pub use pageserver_api::models::virtual_file as api; +pub(crate) mod io_engine; +mod metadata; mod open_options; -pub use 
io_engine::IoEngineKind; +pub(crate) use io_engine::IoEngineKind; +pub(crate) use metadata::Metadata; pub(crate) use open_options::*; /// @@ -403,52 +404,57 @@ impl VirtualFile { Ok(vfile) } - /// Async & [`VirtualFile`]-enabled version of [`::utils::crashsafe::overwrite`]. - pub async fn crashsafe_overwrite( - final_path: &Utf8Path, - tmp_path: &Utf8Path, - content: &[u8], + /// Async version of [`::utils::crashsafe::overwrite`]. + /// + /// # NB: + /// + /// Doesn't actually use the [`VirtualFile`] file descriptor cache, but, + /// it did at an earlier time. + /// And it will use this module's [`io_engine`] in the near future, so, leaving it here. + pub async fn crashsafe_overwrite + Send, Buf: IoBuf + Send>( + final_path: Utf8PathBuf, + tmp_path: Utf8PathBuf, + content: B, ) -> std::io::Result<()> { - let Some(final_path_parent) = final_path.parent() else { - return Err(std::io::Error::from_raw_os_error( - nix::errno::Errno::EINVAL as i32, - )); - }; - std::fs::remove_file(tmp_path).or_else(fs_ext::ignore_not_found)?; - let mut file = Self::open_with_options( - tmp_path, - OpenOptions::new() - .write(true) - // Use `create_new` so that, if we race with ourselves or something else, - // we bail out instead of causing damage. - .create_new(true), - ) - .await?; - file.write_all(content).await?; - file.sync_all().await?; - drop(file); // before the rename, that's important! - // renames are atomic - std::fs::rename(tmp_path, final_path)?; - // Only open final path parent dirfd now, so that this operation only - // ever holds one VirtualFile fd at a time. That's important because - // the current `find_victim_slot` impl might pick the same slot for both - // VirtualFile., and it eventually does a blocking write lock instead of - // try_lock. - let final_parent_dirfd = - Self::open_with_options(final_path_parent, OpenOptions::new().read(true)).await?; - final_parent_dirfd.sync_all().await?; - Ok(()) + // TODO: use tokio_epoll_uring if configured as `io_engine`. 
+ // See https://github.com/neondatabase/neon/issues/6663 + + tokio::task::spawn_blocking(move || { + let slice_storage; + let content_len = content.bytes_init(); + let content = if content.bytes_init() > 0 { + slice_storage = Some(content.slice(0..content_len)); + slice_storage.as_deref().expect("just set it to Some()") + } else { + &[] + }; + utils::crashsafe::overwrite(&final_path, &tmp_path, content) + }) + .await + .expect("blocking task is never aborted") } /// Call File::sync_all() on the underlying File. pub async fn sync_all(&self) -> Result<(), Error> { - with_file!(self, StorageIoOperation::Fsync, |file_guard| file_guard - .with_std_file(|std_file| std_file.sync_all())) + with_file!(self, StorageIoOperation::Fsync, |file_guard| { + let (_file_guard, res) = io_engine::get().sync_all(file_guard).await; + res + }) } - pub async fn metadata(&self) -> Result { - with_file!(self, StorageIoOperation::Metadata, |file_guard| file_guard - .with_std_file(|std_file| std_file.metadata())) + /// Call File::sync_data() on the underlying File. 
+ pub async fn sync_data(&self) -> Result<(), Error> { + with_file!(self, StorageIoOperation::Fsync, |file_guard| { + let (_file_guard, res) = io_engine::get().sync_data(file_guard).await; + res + }) + } + + pub async fn metadata(&self) -> Result { + with_file!(self, StorageIoOperation::Metadata, |file_guard| { + let (_file_guard, res) = io_engine::get().metadata(file_guard).await; + res + }) } /// Helper function internal to `VirtualFile` that looks up the underlying File, @@ -555,7 +561,18 @@ impl VirtualFile { B: IoBufMut + Send, { let (buf, res) = - read_exact_at_impl(buf, offset, |buf, offset| self.read_at(buf, offset)).await; + read_exact_at_impl(buf, offset, None, |buf, offset| self.read_at(buf, offset)).await; + res.map(|()| buf) + } + + pub async fn read_exact_at_n(&self, buf: B, offset: u64, count: usize) -> Result + where + B: IoBufMut + Send, + { + let (buf, res) = read_exact_at_impl(buf, offset, Some(count), |buf, offset| { + self.read_at(buf, offset) + }) + .await; res.map(|()| buf) } @@ -575,50 +592,88 @@ impl VirtualFile { } // Copied from https://doc.rust-lang.org/1.72.0/src/std/os/unix/fs.rs.html#219-235 - pub async fn write_all_at(&self, mut buf: &[u8], mut offset: u64) -> Result<(), Error> { + pub async fn write_all_at, Buf: IoBuf + Send>( + &self, + buf: B, + mut offset: u64, + ) -> (B::Buf, Result<(), Error>) { + let buf_len = buf.bytes_init(); + if buf_len == 0 { + return (Slice::into_inner(buf.slice_full()), Ok(())); + } + let mut buf = buf.slice(0..buf_len); while !buf.is_empty() { - match self.write_at(buf, offset).await { + let res; + (buf, res) = self.write_at(buf, offset).await; + match res { Ok(0) => { - return Err(Error::new( - std::io::ErrorKind::WriteZero, - "failed to write whole buffer", - )); + return ( + Slice::into_inner(buf), + Err(Error::new( + std::io::ErrorKind::WriteZero, + "failed to write whole buffer", + )), + ); } Ok(n) => { - buf = &buf[n..]; + buf = buf.slice(n..); offset += n as u64; } - Err(ref e) if e.kind() == 
std::io::ErrorKind::Interrupted => {} - Err(e) => return Err(e), + Err(e) if e.kind() == std::io::ErrorKind::Interrupted => {} + Err(e) => return (Slice::into_inner(buf), Err(e)), } } - Ok(()) + (Slice::into_inner(buf), Ok(())) } - pub async fn write_all(&mut self, mut buf: &[u8]) -> Result<(), Error> { + /// Writes `buf.slice(0..buf.bytes_init())`. + /// Returns the IoBuf that is underlying the BoundedBuf `buf`. + /// I.e., the returned value's `bytes_init()` method returns something different than the `bytes_init()` that was passed in. + /// It's quite brittle and easy to mis-use, so, we return the size in the Ok() variant. + pub async fn write_all, Buf: IoBuf + Send>( + &mut self, + buf: B, + ) -> (B::Buf, Result) { + let nbytes = buf.bytes_init(); + if nbytes == 0 { + return (Slice::into_inner(buf.slice_full()), Ok(0)); + } + let mut buf = buf.slice(0..nbytes); while !buf.is_empty() { - match self.write(buf).await { + let res; + (buf, res) = self.write(buf).await; + match res { Ok(0) => { - return Err(Error::new( - std::io::ErrorKind::WriteZero, - "failed to write whole buffer", - )); + return ( + Slice::into_inner(buf), + Err(Error::new( + std::io::ErrorKind::WriteZero, + "failed to write whole buffer", + )), + ); } Ok(n) => { - buf = &buf[n..]; + buf = buf.slice(n..); } Err(ref e) if e.kind() == std::io::ErrorKind::Interrupted => {} - Err(e) => return Err(e), + Err(e) => return (Slice::into_inner(buf), Err(e)), } } - Ok(()) + (Slice::into_inner(buf), Ok(nbytes)) } - async fn write(&mut self, buf: &[u8]) -> Result { + async fn write( + &mut self, + buf: Slice, + ) -> (Slice, Result) { let pos = self.pos; - let n = self.write_at(buf, pos).await?; + let (buf, res) = self.write_at(buf, pos).await; + let n = match res { + Ok(n) => n, + Err(e) => return (buf, Err(e)), + }; self.pos += n as u64; - Ok(n) + (buf, Ok(n)) } pub(crate) async fn read_at(&self, buf: B, offset: u64) -> (B, Result) @@ -646,16 +701,30 @@ impl VirtualFile { }) } - async fn write_at(&self, buf: 
&[u8], offset: u64) -> Result { - let result = with_file!(self, StorageIoOperation::Write, |file_guard| { - file_guard.with_std_file(|std_file| std_file.write_at(buf, offset)) - }); - if let Ok(size) = result { - STORAGE_IO_SIZE - .with_label_values(&["write", &self.tenant_id, &self.shard_id, &self.timeline_id]) - .add(size as i64); - } - result + async fn write_at( + &self, + buf: Slice, + offset: u64, + ) -> (Slice, Result) { + let file_guard = match self.lock_file().await { + Ok(file_guard) => file_guard, + Err(e) => return (buf, Err(e)), + }; + observe_duration!(StorageIoOperation::Write, { + let ((_file_guard, buf), result) = + io_engine::get().write_at(file_guard, offset, buf).await; + if let Ok(size) = result { + STORAGE_IO_SIZE + .with_label_values(&[ + "write", + &self.tenant_id, + &self.shard_id, + &self.timeline_id, + ]) + .add(size as i64); + } + (buf, result) + }) } } @@ -663,6 +732,7 @@ impl VirtualFile { pub async fn read_exact_at_impl( buf: B, mut offset: u64, + count: Option, mut read_at: F, ) -> (B, std::io::Result<()>) where @@ -670,8 +740,15 @@ where F: FnMut(tokio_epoll_uring::Slice, u64) -> Fut, Fut: std::future::Future, std::io::Result)>, { - use tokio_epoll_uring::BoundedBuf; - let mut buf: tokio_epoll_uring::Slice = buf.slice_full(); // includes all the uninitialized memory + let mut buf: tokio_epoll_uring::Slice = match count { + Some(count) => { + assert!(count <= buf.bytes_total()); + assert!(count > 0); + buf.slice(..count) // may include uninitialized memory + } + None => buf.slice_full(), // includes all the uninitialized memory + }; + while buf.bytes_total() != 0 { let res; (buf, res) = read_at(buf, offset).await; @@ -761,7 +838,7 @@ mod test_read_exact_at_impl { result: Ok(vec![b'a', b'b', b'c', b'd', b'e']), }]), })); - let (buf, res) = read_exact_at_impl(buf, 0, |buf, offset| { + let (buf, res) = read_exact_at_impl(buf, 0, None, |buf, offset| { let mock_read_at = Arc::clone(&mock_read_at); async move { 
mock_read_at.lock().await.read_at(buf, offset).await } }) @@ -770,13 +847,33 @@ mod test_read_exact_at_impl { assert_eq!(buf, vec![b'a', b'b', b'c', b'd', b'e']); } + #[tokio::test] + async fn test_with_count() { + let buf = Vec::with_capacity(5); + let mock_read_at = Arc::new(tokio::sync::Mutex::new(MockReadAt { + expectations: VecDeque::from(vec![Expectation { + offset: 0, + bytes_total: 3, + result: Ok(vec![b'a', b'b', b'c']), + }]), + })); + + let (buf, res) = read_exact_at_impl(buf, 0, Some(3), |buf, offset| { + let mock_read_at = Arc::clone(&mock_read_at); + async move { mock_read_at.lock().await.read_at(buf, offset).await } + }) + .await; + assert!(res.is_ok()); + assert_eq!(buf, vec![b'a', b'b', b'c']); + } + #[tokio::test] async fn test_empty_buf_issues_no_syscall() { let buf = Vec::new(); let mock_read_at = Arc::new(tokio::sync::Mutex::new(MockReadAt { expectations: VecDeque::new(), })); - let (_buf, res) = read_exact_at_impl(buf, 0, |buf, offset| { + let (_buf, res) = read_exact_at_impl(buf, 0, None, |buf, offset| { let mock_read_at = Arc::clone(&mock_read_at); async move { mock_read_at.lock().await.read_at(buf, offset).await } }) @@ -801,7 +898,7 @@ mod test_read_exact_at_impl { }, ]), })); - let (buf, res) = read_exact_at_impl(buf, 0, |buf, offset| { + let (buf, res) = read_exact_at_impl(buf, 0, None, |buf, offset| { let mock_read_at = Arc::clone(&mock_read_at); async move { mock_read_at.lock().await.read_at(buf, offset).await } }) @@ -832,7 +929,7 @@ mod test_read_exact_at_impl { }, ]), })); - let (_buf, res) = read_exact_at_impl(buf, 0, |buf, offset| { + let (_buf, res) = read_exact_at_impl(buf, 0, None, |buf, offset| { let mock_read_at = Arc::clone(&mock_read_at); async move { mock_read_at.lock().await.read_at(buf, offset).await } }) @@ -1025,6 +1122,7 @@ mod tests { use rand::Rng; use std::future::Future; use std::io::Write; + use std::os::unix::fs::FileExt; use std::sync::Arc; enum MaybeVirtualFile { @@ -1045,10 +1143,23 @@ mod tests { 
MaybeVirtualFile::File(file) => file.read_exact_at(&mut buf, offset).map(|()| buf), } } - async fn write_all_at(&self, buf: &[u8], offset: u64) -> Result<(), Error> { + async fn write_all_at, Buf: IoBuf + Send>( + &self, + buf: B, + offset: u64, + ) -> Result<(), Error> { match self { - MaybeVirtualFile::VirtualFile(file) => file.write_all_at(buf, offset).await, - MaybeVirtualFile::File(file) => file.write_all_at(buf, offset), + MaybeVirtualFile::VirtualFile(file) => { + let (_buf, res) = file.write_all_at(buf, offset).await; + res + } + MaybeVirtualFile::File(file) => { + let buf_len = buf.bytes_init(); + if buf_len == 0 { + return Ok(()); + } + file.write_all_at(&buf.slice(0..buf_len), offset) + } } } async fn seek(&mut self, pos: SeekFrom) -> Result { @@ -1057,10 +1168,22 @@ mod tests { MaybeVirtualFile::File(file) => file.seek(pos), } } - async fn write_all(&mut self, buf: &[u8]) -> Result<(), Error> { + async fn write_all, Buf: IoBuf + Send>( + &mut self, + buf: B, + ) -> Result<(), Error> { match self { - MaybeVirtualFile::VirtualFile(file) => file.write_all(buf).await, - MaybeVirtualFile::File(file) => file.write_all(buf), + MaybeVirtualFile::VirtualFile(file) => { + let (_buf, res) = file.write_all(buf).await; + res.map(|_| ()) + } + MaybeVirtualFile::File(file) => { + let buf_len = buf.bytes_init(); + if buf_len == 0 { + return Ok(()); + } + file.write_all(&buf.slice(0..buf_len)) + } } } @@ -1135,7 +1258,7 @@ mod tests { .to_owned(), ) .await?; - file_a.write_all(b"foobar").await?; + file_a.write_all(b"foobar".to_vec()).await?; // cannot read from a file opened in write-only mode let _ = file_a.read_string().await.unwrap_err(); @@ -1144,7 +1267,7 @@ mod tests { let mut file_a = openfunc(path_a, OpenOptions::new().read(true).to_owned()).await?; // cannot write to a file opened in read-only mode - let _ = file_a.write_all(b"bar").await.unwrap_err(); + let _ = file_a.write_all(b"bar".to_vec()).await.unwrap_err(); // Try simple read assert_eq!("foobar", 
file_a.read_string().await?); @@ -1186,8 +1309,8 @@ mod tests { .to_owned(), ) .await?; - file_b.write_all_at(b"BAR", 3).await?; - file_b.write_all_at(b"FOO", 0).await?; + file_b.write_all_at(b"BAR".to_vec(), 3).await?; + file_b.write_all_at(b"FOO".to_vec(), 0).await?; assert_eq!(file_b.read_string_at(2, 3).await?, "OBA"); @@ -1287,7 +1410,7 @@ mod tests { let path = testdir.join("myfile"); let tmp_path = testdir.join("myfile.tmp"); - VirtualFile::crashsafe_overwrite(&path, &tmp_path, b"foo") + VirtualFile::crashsafe_overwrite(path.clone(), tmp_path.clone(), b"foo".to_vec()) .await .unwrap(); let mut file = MaybeVirtualFile::from(VirtualFile::open(&path).await.unwrap()); @@ -1296,7 +1419,7 @@ mod tests { assert!(!tmp_path.exists()); drop(file); - VirtualFile::crashsafe_overwrite(&path, &tmp_path, b"bar") + VirtualFile::crashsafe_overwrite(path.clone(), tmp_path.clone(), b"bar".to_vec()) .await .unwrap(); let mut file = MaybeVirtualFile::from(VirtualFile::open(&path).await.unwrap()); @@ -1318,7 +1441,7 @@ mod tests { std::fs::write(&tmp_path, "some preexisting junk that should be removed").unwrap(); assert!(tmp_path.exists()); - VirtualFile::crashsafe_overwrite(&path, &tmp_path, b"foo") + VirtualFile::crashsafe_overwrite(path.clone(), tmp_path.clone(), b"foo".to_vec()) .await .unwrap(); diff --git a/pageserver/src/virtual_file/io_engine.rs b/pageserver/src/virtual_file/io_engine.rs index f7b46fe653..e369d28711 100644 --- a/pageserver/src/virtual_file/io_engine.rs +++ b/pageserver/src/virtual_file/io_engine.rs @@ -7,67 +7,114 @@ //! //! Then use [`get`] and [`super::OpenOptions`]. 
-#[derive( - Copy, - Clone, - PartialEq, - Eq, - Hash, - strum_macros::EnumString, - strum_macros::Display, - serde_with::DeserializeFromStr, - serde_with::SerializeDisplay, - Debug, -)] -#[strum(serialize_all = "kebab-case")] -pub enum IoEngineKind { +use tokio_epoll_uring::{IoBuf, Slice}; +use tracing::Instrument; + +pub(crate) use super::api::IoEngineKind; +#[derive(Clone, Copy)] +#[repr(u8)] +pub(crate) enum IoEngine { + NotSet, StdFs, #[cfg(target_os = "linux")] TokioEpollUring, } -static IO_ENGINE: once_cell::sync::OnceCell = once_cell::sync::OnceCell::new(); - -#[cfg(not(test))] -pub(super) fn init(engine: IoEngineKind) { - if IO_ENGINE.set(engine).is_err() { - panic!("called twice"); +impl From for IoEngine { + fn from(value: IoEngineKind) -> Self { + match value { + IoEngineKind::StdFs => IoEngine::StdFs, + #[cfg(target_os = "linux")] + IoEngineKind::TokioEpollUring => IoEngine::TokioEpollUring, + } } - crate::metrics::virtual_file_io_engine::KIND - .with_label_values(&[&format!("{engine}")]) - .set(1); } -pub(super) fn get() -> &'static IoEngineKind { - #[cfg(test)] - { - let env_var_name = "NEON_PAGESERVER_UNIT_TEST_VIRTUAL_FILE_IOENGINE"; - IO_ENGINE.get_or_init(|| match std::env::var(env_var_name) { - Ok(v) => match v.parse::() { - Ok(engine_kind) => engine_kind, - Err(e) => { - panic!("invalid VirtualFile io engine for env var {env_var_name}: {e:#}: {v:?}") - } - }, - Err(std::env::VarError::NotPresent) => { - crate::config::defaults::DEFAULT_VIRTUAL_FILE_IO_ENGINE - .parse() - .unwrap() - } - Err(std::env::VarError::NotUnicode(_)) => { - panic!("env var {env_var_name} is not unicode"); - } +impl TryFrom for IoEngine { + type Error = u8; + + fn try_from(value: u8) -> Result { + Ok(match value { + v if v == (IoEngine::NotSet as u8) => IoEngine::NotSet, + v if v == (IoEngine::StdFs as u8) => IoEngine::StdFs, + #[cfg(target_os = "linux")] + v if v == (IoEngine::TokioEpollUring as u8) => IoEngine::TokioEpollUring, + x => return Err(x), }) } - 
#[cfg(not(test))] - IO_ENGINE.get().unwrap() } -use std::os::unix::prelude::FileExt; +static IO_ENGINE: AtomicU8 = AtomicU8::new(IoEngine::NotSet as u8); -use super::FileGuard; +pub(crate) fn set(engine_kind: IoEngineKind) { + let engine: IoEngine = engine_kind.into(); + IO_ENGINE.store(engine as u8, std::sync::atomic::Ordering::Relaxed); + #[cfg(not(test))] + { + let metric = &crate::metrics::virtual_file_io_engine::KIND; + metric.reset(); + metric + .with_label_values(&[&format!("{engine_kind}")]) + .set(1); + } +} -impl IoEngineKind { +#[cfg(not(test))] +pub(super) fn init(engine_kind: IoEngineKind) { + set(engine_kind); +} + +/// Longer-term, this API should only be used by [`super::VirtualFile`]. +pub(crate) fn get() -> IoEngine { + let cur = IoEngine::try_from(IO_ENGINE.load(Ordering::Relaxed)).unwrap(); + if cfg!(test) { + let env_var_name = "NEON_PAGESERVER_UNIT_TEST_VIRTUAL_FILE_IOENGINE"; + match cur { + IoEngine::NotSet => { + let kind = match std::env::var(env_var_name) { + Ok(v) => match v.parse::() { + Ok(engine_kind) => engine_kind, + Err(e) => { + panic!("invalid VirtualFile io engine for env var {env_var_name}: {e:#}: {v:?}") + } + }, + Err(std::env::VarError::NotPresent) => { + crate::config::defaults::DEFAULT_VIRTUAL_FILE_IO_ENGINE + .parse() + .unwrap() + } + Err(std::env::VarError::NotUnicode(_)) => { + panic!("env var {env_var_name} is not unicode"); + } + }; + self::set(kind); + self::get() + } + x => x, + } + } else { + cur + } +} + +use std::{ + os::unix::prelude::FileExt, + sync::atomic::{AtomicU8, Ordering}, +}; + +use super::{FileGuard, Metadata}; + +#[cfg(target_os = "linux")] +fn epoll_uring_error_to_std(e: tokio_epoll_uring::Error) -> std::io::Error { + match e { + tokio_epoll_uring::Error::Op(e) => e, + tokio_epoll_uring::Error::System(system) => { + std::io::Error::new(std::io::ErrorKind::Other, system) + } + } +} + +impl IoEngine { pub(super) async fn read_at( &self, file_guard: FileGuard, @@ -78,7 +125,8 @@ impl IoEngineKind { B: 
tokio_epoll_uring::BoundedBufMut + Send, { match self { - IoEngineKind::StdFs => { + IoEngine::NotSet => panic!("not initialized"), + IoEngine::StdFs => { // SAFETY: `dst` only lives at most as long as this match arm, during which buf remains valid memory. let dst = unsafe { std::slice::from_raw_parts_mut(buf.stable_mut_ptr(), buf.bytes_total()) @@ -96,19 +144,112 @@ impl IoEngineKind { ((file_guard, buf), res) } #[cfg(target_os = "linux")] - IoEngineKind::TokioEpollUring => { + IoEngine::TokioEpollUring => { let system = tokio_epoll_uring::thread_local_system().await; let (resources, res) = system.read(file_guard, offset, buf).await; + (resources, res.map_err(epoll_uring_error_to_std)) + } + } + } + pub(super) async fn sync_all(&self, file_guard: FileGuard) -> (FileGuard, std::io::Result<()>) { + match self { + IoEngine::NotSet => panic!("not initialized"), + IoEngine::StdFs => { + let res = file_guard.with_std_file(|std_file| std_file.sync_all()); + (file_guard, res) + } + #[cfg(target_os = "linux")] + IoEngine::TokioEpollUring => { + let system = tokio_epoll_uring::thread_local_system().await; + let (resources, res) = system.fsync(file_guard).await; + (resources, res.map_err(epoll_uring_error_to_std)) + } + } + } + pub(super) async fn sync_data( + &self, + file_guard: FileGuard, + ) -> (FileGuard, std::io::Result<()>) { + match self { + IoEngine::NotSet => panic!("not initialized"), + IoEngine::StdFs => { + let res = file_guard.with_std_file(|std_file| std_file.sync_data()); + (file_guard, res) + } + #[cfg(target_os = "linux")] + IoEngine::TokioEpollUring => { + let system = tokio_epoll_uring::thread_local_system().await; + let (resources, res) = system.fdatasync(file_guard).await; + (resources, res.map_err(epoll_uring_error_to_std)) + } + } + } + pub(super) async fn metadata( + &self, + file_guard: FileGuard, + ) -> (FileGuard, std::io::Result) { + match self { + IoEngine::NotSet => panic!("not initialized"), + IoEngine::StdFs => { + let res = + 
file_guard.with_std_file(|std_file| std_file.metadata().map(Metadata::from)); + (file_guard, res) + } + #[cfg(target_os = "linux")] + IoEngine::TokioEpollUring => { + let system = tokio_epoll_uring::thread_local_system().await; + let (resources, res) = system.statx(file_guard).await; ( resources, - res.map_err(|e| match e { - tokio_epoll_uring::Error::Op(e) => e, - tokio_epoll_uring::Error::System(system) => { - std::io::Error::new(std::io::ErrorKind::Other, system) - } - }), + res.map_err(epoll_uring_error_to_std).map(Metadata::from), ) } } } + pub(super) async fn write_at( + &self, + file_guard: FileGuard, + offset: u64, + buf: Slice, + ) -> ((FileGuard, Slice), std::io::Result) { + match self { + IoEngine::NotSet => panic!("not initialized"), + IoEngine::StdFs => { + let result = file_guard.with_std_file(|std_file| std_file.write_at(&buf, offset)); + ((file_guard, buf), result) + } + #[cfg(target_os = "linux")] + IoEngine::TokioEpollUring => { + let system = tokio_epoll_uring::thread_local_system().await; + let (resources, res) = system.write(file_guard, offset, buf).await; + (resources, res.map_err(epoll_uring_error_to_std)) + } + } + } + + /// If we switch a user of [`tokio::fs`] to use [`super::io_engine`], + /// they'd start blocking the executor thread if [`IoEngine::StdFs`] is configured + /// whereas before the switch to [`super::io_engine`], that wasn't the case. + /// This method helps avoid such a regression. + /// + /// Panics if the `spawn_blocking` fails, see [`tokio::task::JoinError`] for reasons why that can happen. 
+ pub(crate) async fn spawn_blocking_and_block_on_if_std(&self, work: Fut) -> R + where + Fut: 'static + Send + std::future::Future, + R: 'static + Send, + { + match self { + IoEngine::NotSet => panic!("not initialized"), + IoEngine::StdFs => { + let span = tracing::info_span!("spawn_blocking_block_on_if_std"); + tokio::task::spawn_blocking({ + move || tokio::runtime::Handle::current().block_on(work.instrument(span)) + }) + .await + .expect("failed to join blocking code most likely it panicked, panicking as well") + } + #[cfg(target_os = "linux")] + IoEngine::TokioEpollUring => work.await, + } + } } diff --git a/pageserver/src/virtual_file/metadata.rs b/pageserver/src/virtual_file/metadata.rs new file mode 100644 index 0000000000..f530c50988 --- /dev/null +++ b/pageserver/src/virtual_file/metadata.rs @@ -0,0 +1,30 @@ +use std::fs; + +pub enum Metadata { + StdFs(fs::Metadata), + #[cfg(target_os = "linux")] + TokioEpollUring(Box), +} + +#[cfg(target_os = "linux")] +impl From> for Metadata { + fn from(value: Box) -> Self { + Metadata::TokioEpollUring(value) + } +} + +impl From for Metadata { + fn from(value: std::fs::Metadata) -> Self { + Metadata::StdFs(value) + } +} + +impl Metadata { + pub fn len(&self) -> u64 { + match self { + Metadata::StdFs(metadata) => metadata.len(), + #[cfg(target_os = "linux")] + Metadata::TokioEpollUring(statx) => statx.stx_size, + } + } +} diff --git a/pageserver/src/virtual_file/open_options.rs b/pageserver/src/virtual_file/open_options.rs index 1e5ffe15cc..f75edb0bac 100644 --- a/pageserver/src/virtual_file/open_options.rs +++ b/pageserver/src/virtual_file/open_options.rs @@ -1,6 +1,6 @@ //! 
Enum-dispatch to the `OpenOptions` type of the respective [`super::IoEngineKind`]; -use super::IoEngineKind; +use super::io_engine::IoEngine; use std::{os::fd::OwnedFd, path::Path}; #[derive(Debug, Clone)] @@ -13,9 +13,10 @@ pub enum OpenOptions { impl Default for OpenOptions { fn default() -> Self { match super::io_engine::get() { - IoEngineKind::StdFs => Self::StdFs(std::fs::OpenOptions::new()), + IoEngine::NotSet => panic!("io engine not set"), + IoEngine::StdFs => Self::StdFs(std::fs::OpenOptions::new()), #[cfg(target_os = "linux")] - IoEngineKind::TokioEpollUring => { + IoEngine::TokioEpollUring => { Self::TokioEpollUring(tokio_epoll_uring::ops::open_at::OpenOptions::new()) } } diff --git a/pageserver/src/walingest.rs b/pageserver/src/walingest.rs index 3183608862..63a2b30d09 100644 --- a/pageserver/src/walingest.rs +++ b/pageserver/src/walingest.rs @@ -334,6 +334,12 @@ impl WalIngest { { self.checkpoint.oldestXid = xlog_checkpoint.oldestXid; } + trace!( + "xlog_checkpoint.oldestActiveXid={}, checkpoint.oldestActiveXid={}", + xlog_checkpoint.oldestActiveXid, + self.checkpoint.oldestActiveXid + ); + self.checkpoint.oldestActiveXid = xlog_checkpoint.oldestActiveXid; // Write a new checkpoint key-value pair on every checkpoint record, even // if nothing really changed. 
Not strictly required, but it seems nice to @@ -346,7 +352,7 @@ impl WalIngest { let info = decoded.xl_info & pg_constants::XLR_RMGR_INFO_MASK; if info == pg_constants::XLOG_LOGICAL_MESSAGE { - let xlrec = XlLogicalMessage::decode(&mut buf); + let xlrec = crate::walrecord::XlLogicalMessage::decode(&mut buf); let prefix = std::str::from_utf8(&buf[0..xlrec.prefix_size - 1])?; let message = &buf[xlrec.prefix_size..xlrec.prefix_size + xlrec.message_size]; if prefix == "neon-test" { @@ -360,6 +366,13 @@ impl WalIngest { } } } + pg_constants::RM_STANDBY_ID => { + let info = decoded.xl_info & pg_constants::XLR_RMGR_INFO_MASK; + if info == pg_constants::XLOG_RUNNING_XACTS { + let xlrec = crate::walrecord::XlRunningXacts::decode(&mut buf); + self.checkpoint.oldestActiveXid = xlrec.oldest_running_xid; + } + } _x => { // TODO: should probably log & fail here instead of blindly // doing something without understanding the protocol @@ -1033,7 +1046,23 @@ impl WalIngest { // Copy content debug!("copying rel {} to {}, {} blocks", src_rel, dst_rel, nblocks); for blknum in 0..nblocks { - debug!("copying block {} from {} to {}", blknum, src_rel, dst_rel); + // Sharding: + // - src and dst are always on the same shard, because they differ only by dbNode, and + // dbNode is not included in the hash inputs for sharding. + // - This WAL command is replayed on all shards, but each shard only copies the blocks + // that belong to it. 
+ let src_key = rel_block_to_key(src_rel, blknum); + if !self.shard.is_key_local(&src_key) { + debug!( + "Skipping non-local key {} during XLOG_DBASE_CREATE", + src_key + ); + continue; + } + debug!( + "copying block {} from {} ({}) to {}", + blknum, src_rel, src_key, dst_rel + ); let content = modification .tline @@ -1347,16 +1376,22 @@ impl WalIngest { self.checkpoint.nextMultiOffset = xlrec.moff + xlrec.nmembers; self.checkpoint_modified = true; } - let max_mbr_xid = xlrec.members.iter().fold(0u32, |acc, mbr| { - if mbr.xid.wrapping_sub(acc) as i32 > 0 { - mbr.xid + let max_mbr_xid = xlrec.members.iter().fold(None, |acc, mbr| { + if let Some(max_xid) = acc { + if mbr.xid.wrapping_sub(max_xid) as i32 > 0 { + Some(mbr.xid) + } else { + acc + } } else { - acc + Some(mbr.xid) } }); - if self.checkpoint.update_next_xid(max_mbr_xid) { - self.checkpoint_modified = true; + if let Some(max_xid) = max_mbr_xid { + if self.checkpoint.update_next_xid(max_xid) { + self.checkpoint_modified = true; + } } Ok(()) } @@ -1632,8 +1667,6 @@ mod tests { use super::*; use crate::tenant::harness::*; use crate::tenant::remote_timeline_client::{remote_initdb_archive_path, INITDB_PATH}; - use crate::tenant::Timeline; - use postgres_ffi::v14::xlog_utils::SIZEOF_CHECKPOINT; use postgres_ffi::RELSEG_SIZE; use crate::DEFAULT_PG_VERSION; @@ -1673,22 +1706,22 @@ mod tests { let mut m = tline.begin_modification(Lsn(0x20)); walingest.put_rel_creation(&mut m, TESTREL_A, &ctx).await?; walingest - .put_rel_page_image(&mut m, TESTREL_A, 0, TEST_IMG("foo blk 0 at 2"), &ctx) + .put_rel_page_image(&mut m, TESTREL_A, 0, test_img("foo blk 0 at 2"), &ctx) .await?; m.commit(&ctx).await?; let mut m = tline.begin_modification(Lsn(0x30)); walingest - .put_rel_page_image(&mut m, TESTREL_A, 0, TEST_IMG("foo blk 0 at 3"), &ctx) + .put_rel_page_image(&mut m, TESTREL_A, 0, test_img("foo blk 0 at 3"), &ctx) .await?; m.commit(&ctx).await?; let mut m = tline.begin_modification(Lsn(0x40)); walingest - 
.put_rel_page_image(&mut m, TESTREL_A, 1, TEST_IMG("foo blk 1 at 4"), &ctx) + .put_rel_page_image(&mut m, TESTREL_A, 1, test_img("foo blk 1 at 4"), &ctx) .await?; m.commit(&ctx).await?; let mut m = tline.begin_modification(Lsn(0x50)); walingest - .put_rel_page_image(&mut m, TESTREL_A, 2, TEST_IMG("foo blk 2 at 5"), &ctx) + .put_rel_page_image(&mut m, TESTREL_A, 2, test_img("foo blk 2 at 5"), &ctx) .await?; m.commit(&ctx).await?; @@ -1729,46 +1762,46 @@ mod tests { tline .get_rel_page_at_lsn(TESTREL_A, 0, Version::Lsn(Lsn(0x20)), false, &ctx) .await?, - TEST_IMG("foo blk 0 at 2") + test_img("foo blk 0 at 2") ); assert_eq!( tline .get_rel_page_at_lsn(TESTREL_A, 0, Version::Lsn(Lsn(0x30)), false, &ctx) .await?, - TEST_IMG("foo blk 0 at 3") + test_img("foo blk 0 at 3") ); assert_eq!( tline .get_rel_page_at_lsn(TESTREL_A, 0, Version::Lsn(Lsn(0x40)), false, &ctx) .await?, - TEST_IMG("foo blk 0 at 3") + test_img("foo blk 0 at 3") ); assert_eq!( tline .get_rel_page_at_lsn(TESTREL_A, 1, Version::Lsn(Lsn(0x40)), false, &ctx) .await?, - TEST_IMG("foo blk 1 at 4") + test_img("foo blk 1 at 4") ); assert_eq!( tline .get_rel_page_at_lsn(TESTREL_A, 0, Version::Lsn(Lsn(0x50)), false, &ctx) .await?, - TEST_IMG("foo blk 0 at 3") + test_img("foo blk 0 at 3") ); assert_eq!( tline .get_rel_page_at_lsn(TESTREL_A, 1, Version::Lsn(Lsn(0x50)), false, &ctx) .await?, - TEST_IMG("foo blk 1 at 4") + test_img("foo blk 1 at 4") ); assert_eq!( tline .get_rel_page_at_lsn(TESTREL_A, 2, Version::Lsn(Lsn(0x50)), false, &ctx) .await?, - TEST_IMG("foo blk 2 at 5") + test_img("foo blk 2 at 5") ); // Truncate last block @@ -1790,13 +1823,13 @@ mod tests { tline .get_rel_page_at_lsn(TESTREL_A, 0, Version::Lsn(Lsn(0x60)), false, &ctx) .await?, - TEST_IMG("foo blk 0 at 3") + test_img("foo blk 0 at 3") ); assert_eq!( tline .get_rel_page_at_lsn(TESTREL_A, 1, Version::Lsn(Lsn(0x60)), false, &ctx) .await?, - TEST_IMG("foo blk 1 at 4") + test_img("foo blk 1 at 4") ); // should still see the truncated block with 
older LSN @@ -1810,7 +1843,7 @@ mod tests { tline .get_rel_page_at_lsn(TESTREL_A, 2, Version::Lsn(Lsn(0x50)), false, &ctx) .await?, - TEST_IMG("foo blk 2 at 5") + test_img("foo blk 2 at 5") ); // Truncate to zero length @@ -1829,7 +1862,7 @@ mod tests { // Extend from 0 to 2 blocks, leaving a gap let mut m = tline.begin_modification(Lsn(0x70)); walingest - .put_rel_page_image(&mut m, TESTREL_A, 1, TEST_IMG("foo blk 1"), &ctx) + .put_rel_page_image(&mut m, TESTREL_A, 1, test_img("foo blk 1"), &ctx) .await?; m.commit(&ctx).await?; assert_eq!( @@ -1848,13 +1881,13 @@ mod tests { tline .get_rel_page_at_lsn(TESTREL_A, 1, Version::Lsn(Lsn(0x70)), false, &ctx) .await?, - TEST_IMG("foo blk 1") + test_img("foo blk 1") ); // Extend a lot more, leaving a big gap that spans across segments let mut m = tline.begin_modification(Lsn(0x80)); walingest - .put_rel_page_image(&mut m, TESTREL_A, 1500, TEST_IMG("foo blk 1500"), &ctx) + .put_rel_page_image(&mut m, TESTREL_A, 1500, test_img("foo blk 1500"), &ctx) .await?; m.commit(&ctx).await?; assert_eq!( @@ -1875,7 +1908,7 @@ mod tests { tline .get_rel_page_at_lsn(TESTREL_A, 1500, Version::Lsn(Lsn(0x80)), false, &ctx) .await?, - TEST_IMG("foo blk 1500") + test_img("foo blk 1500") ); Ok(()) @@ -1893,7 +1926,7 @@ mod tests { let mut m = tline.begin_modification(Lsn(0x20)); walingest - .put_rel_page_image(&mut m, TESTREL_A, 0, TEST_IMG("foo blk 0 at 2"), &ctx) + .put_rel_page_image(&mut m, TESTREL_A, 0, test_img("foo blk 0 at 2"), &ctx) .await?; m.commit(&ctx).await?; @@ -1930,7 +1963,7 @@ mod tests { // Re-create it let mut m = tline.begin_modification(Lsn(0x40)); walingest - .put_rel_page_image(&mut m, TESTREL_A, 0, TEST_IMG("foo blk 0 at 4"), &ctx) + .put_rel_page_image(&mut m, TESTREL_A, 0, test_img("foo blk 0 at 4"), &ctx) .await?; m.commit(&ctx).await?; @@ -1968,7 +2001,7 @@ mod tests { for blkno in 0..relsize { let data = format!("foo blk {} at {}", blkno, Lsn(0x20)); walingest - .put_rel_page_image(&mut m, TESTREL_A, blkno, 
TEST_IMG(&data), &ctx) + .put_rel_page_image(&mut m, TESTREL_A, blkno, test_img(&data), &ctx) .await?; } m.commit(&ctx).await?; @@ -2006,7 +2039,7 @@ mod tests { tline .get_rel_page_at_lsn(TESTREL_A, blkno, Version::Lsn(lsn), false, &ctx) .await?, - TEST_IMG(&data) + test_img(&data) ); } @@ -2033,7 +2066,7 @@ mod tests { tline .get_rel_page_at_lsn(TESTREL_A, blkno, Version::Lsn(Lsn(0x60)), false, &ctx) .await?, - TEST_IMG(&data) + test_img(&data) ); } @@ -2051,7 +2084,7 @@ mod tests { tline .get_rel_page_at_lsn(TESTREL_A, blkno, Version::Lsn(Lsn(0x50)), false, &ctx) .await?, - TEST_IMG(&data) + test_img(&data) ); } @@ -2062,7 +2095,7 @@ mod tests { for blkno in 0..relsize { let data = format!("foo blk {} at {}", blkno, lsn); walingest - .put_rel_page_image(&mut m, TESTREL_A, blkno, TEST_IMG(&data), &ctx) + .put_rel_page_image(&mut m, TESTREL_A, blkno, test_img(&data), &ctx) .await?; } m.commit(&ctx).await?; @@ -2087,7 +2120,7 @@ mod tests { tline .get_rel_page_at_lsn(TESTREL_A, blkno, Version::Lsn(Lsn(0x80)), false, &ctx) .await?, - TEST_IMG(&data) + test_img(&data) ); } @@ -2108,7 +2141,7 @@ mod tests { for blknum in 0..RELSEG_SIZE + 1 { lsn += 0x10; let mut m = tline.begin_modification(Lsn(lsn)); - let img = TEST_IMG(&format!("foo blk {} at {}", blknum, Lsn(lsn))); + let img = test_img(&format!("foo blk {} at {}", blknum, Lsn(lsn))); walingest .put_rel_page_image(&mut m, TESTREL_A, blknum as BlockNumber, img, &ctx) .await?; diff --git a/pageserver/src/walrecord.rs b/pageserver/src/walrecord.rs index ff6bc9194b..ae2d996879 100644 --- a/pageserver/src/walrecord.rs +++ b/pageserver/src/walrecord.rs @@ -44,6 +44,11 @@ pub enum NeonWalRecord { moff: MultiXactOffset, members: Vec, }, + /// Update the map of AUX files, either writing or dropping an entry + AuxFile { + file_path: String, + content: Option, + }, } impl NeonWalRecord { @@ -768,6 +773,42 @@ impl XlLogicalMessage { } } +#[repr(C)] +#[derive(Debug)] +pub struct XlRunningXacts { + pub xcnt: u32, + pub subxcnt: 
u32, + pub subxid_overflow: bool, + pub next_xid: TransactionId, + pub oldest_running_xid: TransactionId, + pub latest_completed_xid: TransactionId, + pub xids: Vec, +} + +impl XlRunningXacts { + pub fn decode(buf: &mut Bytes) -> XlRunningXacts { + let xcnt = buf.get_u32_le(); + let subxcnt = buf.get_u32_le(); + let subxid_overflow = buf.get_u32_le() != 0; + let next_xid = buf.get_u32_le(); + let oldest_running_xid = buf.get_u32_le(); + let latest_completed_xid = buf.get_u32_le(); + let mut xids = Vec::new(); + for _ in 0..(xcnt + subxcnt) { + xids.push(buf.get_u32_le()); + } + XlRunningXacts { + xcnt, + subxcnt, + subxid_overflow, + next_xid, + oldest_running_xid, + latest_completed_xid, + xids, + } + } +} + /// Main routine to decode a WAL record and figure out which blocks are modified // // See xlogrecord.h for details diff --git a/pageserver/src/walredo.rs b/pageserver/src/walredo.rs index 45a3fbb626..fb98d25a33 100644 --- a/pageserver/src/walredo.rs +++ b/pageserver/src/walredo.rs @@ -17,70 +17,30 @@ //! records. It achieves it by dropping privileges before replaying //! any WAL records, so that even if an attacker hijacks the Postgres //! process, he cannot escape out of it. -//! -use anyhow::Context; -use byteorder::{ByteOrder, LittleEndian}; -use bytes::{BufMut, Bytes, BytesMut}; -use pageserver_api::shard::TenantShardId; -use serde::Serialize; -use std::collections::VecDeque; -use std::io; -use std::ops::{Deref, DerefMut}; -use std::os::unix::prelude::CommandExt; -use std::process::Stdio; -use std::process::{Child, Command}; -use std::sync::{Arc, RwLock}; -use std::time::Duration; -use std::time::Instant; -use tokio::io::{AsyncReadExt, AsyncWriteExt}; -use tracing::*; -use utils::poison::Poison; -use utils::{bin_ser::BeSer, lsn::Lsn}; -#[cfg(feature = "testing")] -use std::sync::atomic::{AtomicUsize, Ordering}; +/// Process lifecycle and abstracction for the IPC protocol. +mod process; + +/// Code to apply [`NeonWalRecord`]s. 
+pub(crate) mod apply_neon; use crate::config::PageServerConf; use crate::metrics::{ - WalRedoKillCause, WAL_REDO_BYTES_HISTOGRAM, WAL_REDO_PROCESS_COUNTERS, - WAL_REDO_PROCESS_LAUNCH_DURATION_HISTOGRAM, WAL_REDO_RECORDS_HISTOGRAM, - WAL_REDO_RECORD_COUNTER, WAL_REDO_TIME, + WAL_REDO_BYTES_HISTOGRAM, WAL_REDO_PROCESS_LAUNCH_DURATION_HISTOGRAM, + WAL_REDO_RECORDS_HISTOGRAM, WAL_REDO_TIME, }; use crate::repository::Key; use crate::walrecord::NeonWalRecord; - -use pageserver_api::key::{key_to_rel_block, key_to_slru_block}; -use pageserver_api::reltag::{RelTag, SlruKind}; -use postgres_ffi::pg_constants; -use postgres_ffi::relfile_utils::VISIBILITYMAP_FORKNUM; -use postgres_ffi::v14::nonrelfile_utils::{ - mx_offset_to_flags_bitshift, mx_offset_to_flags_offset, mx_offset_to_member_offset, - transaction_id_set_status, -}; -use postgres_ffi::BLCKSZ; - -/// -/// `RelTag` + block number (`blknum`) gives us a unique id of the page in the cluster. -/// -/// In Postgres `BufferTag` structure is used for exactly the same purpose. -/// [See more related comments here](https://github.com/postgres/postgres/blob/99c5852e20a0987eca1c38ba0c09329d4076b6a0/src/include/storage/buf_internals.h#L91). 
-/// -#[derive(Debug, PartialEq, Eq, PartialOrd, Ord, Clone, Copy, Serialize)] -pub(crate) struct BufferTag { - pub rel: RelTag, - pub blknum: u32, -} - -struct ProcessInput { - stdin: tokio::process::ChildStdin, - n_requests: usize, -} - -struct ProcessOutput { - stdout: tokio::process::ChildStdout, - pending_responses: VecDeque>, - n_processed_responses: usize, -} +use anyhow::Context; +use bytes::{Bytes, BytesMut}; +use pageserver_api::key::key_to_rel_block; +use pageserver_api::models::WalRedoManagerStatus; +use pageserver_api::shard::TenantShardId; +use std::sync::{Arc, RwLock}; +use std::time::Duration; +use std::time::Instant; +use tracing::*; +use utils::lsn::Lsn; /// /// This is the real implementation that uses a Postgres process to @@ -93,22 +53,7 @@ pub struct PostgresRedoManager { tenant_shard_id: TenantShardId, conf: &'static PageServerConf, last_redo_at: std::sync::Mutex>, - redo_process: RwLock>>, -} - -/// Can this request be served by neon redo functions -/// or we need to pass it to wal-redo postgres process? -fn can_apply_in_neon(rec: &NeonWalRecord) -> bool { - // Currently, we don't have bespoken Rust code to replay any - // Postgres WAL records. But everything else is handled in neon. 
- #[allow(clippy::match_like_matches_macro)] - match rec { - NeonWalRecord::Postgres { - will_init: _, - rec: _, - } => false, - _ => true, - } + redo_process: RwLock>>, } /// @@ -138,10 +83,10 @@ impl PostgresRedoManager { let base_img_lsn = base_img.as_ref().map(|p| p.0).unwrap_or(Lsn::INVALID); let mut img = base_img.map(|p| p.1); - let mut batch_neon = can_apply_in_neon(&records[0].1); + let mut batch_neon = apply_neon::can_apply_in_neon(&records[0].1); let mut batch_start = 0; for (i, record) in records.iter().enumerate().skip(1) { - let rec_neon = can_apply_in_neon(&record.1); + let rec_neon = apply_neon::can_apply_in_neon(&record.1); if rec_neon != batch_neon { let result = if batch_neon { @@ -178,6 +123,20 @@ impl PostgresRedoManager { .await } } + + pub(crate) fn status(&self) -> Option { + Some(WalRedoManagerStatus { + last_redo_at: { + let at = *self.last_redo_at.lock().unwrap(); + at.and_then(|at| { + let age = at.elapsed(); + // map any chrono errors silently to None here + chrono::Utc::now().checked_sub_signed(chrono::Duration::from_std(age).ok()?) 
+ }) + }, + pid: self.redo_process.read().unwrap().as_ref().map(|p| p.id()), + }) + } } impl PostgresRedoManager { @@ -232,7 +191,7 @@ impl PostgresRedoManager { let mut n_attempts = 0u32; loop { // launch the WAL redo process on first use - let proc: Arc = { + let proc: Arc = { let proc_guard = self.redo_process.read().unwrap(); match &*proc_guard { None => { @@ -241,17 +200,23 @@ impl PostgresRedoManager { let mut proc_guard = self.redo_process.write().unwrap(); match &*proc_guard { None => { - let timer = - WAL_REDO_PROCESS_LAUNCH_DURATION_HISTOGRAM.start_timer(); + let start = Instant::now(); let proc = Arc::new( - WalRedoProcess::launch( + process::WalRedoProcess::launch( self.conf, self.tenant_shard_id, pg_version, ) .context("launch walredo process")?, ); - timer.observe_duration(); + let duration = start.elapsed(); + WAL_REDO_PROCESS_LAUNCH_DURATION_HISTOGRAM + .observe(duration.as_secs_f64()); + info!( + duration_ms = duration.as_millis(), + pid = proc.id(), + "launched walredo process" + ); *proc_guard = Some(Arc::clone(&proc)); proc } @@ -265,9 +230,8 @@ impl PostgresRedoManager { let started_at = std::time::Instant::now(); // Relational WAL records are applied using wal-redo-postgres - let buf_tag = BufferTag { rel, blknum }; let result = proc - .apply_wal_records(buf_tag, &base_img, records) + .apply_wal_records(rel, blknum, &base_img, records) .await .context("apply_wal_records"); @@ -298,7 +262,7 @@ impl PostgresRedoManager { // next request will launch a new one. 
if let Err(e) = result.as_ref() { error!( - "error applying {} WAL records {}..{} ({} bytes) to base image with LSN {} to reconstruct page image at LSN {} n_attempts={}: {:?}", + "error applying {} WAL records {}..{} ({} bytes) to key {key}, from base image with LSN {} to reconstruct page image at LSN {} n_attempts={}: {:?}", records.len(), records.first().map(|p| p.0).unwrap_or(Lsn(0)), records.last().map(|p| p.0).unwrap_or(Lsn(0)), @@ -395,721 +359,12 @@ impl PostgresRedoManager { _record_lsn: Lsn, record: &NeonWalRecord, ) -> anyhow::Result<()> { - match record { - NeonWalRecord::Postgres { - will_init: _, - rec: _, - } => { - anyhow::bail!("tried to pass postgres wal record to neon WAL redo"); - } - NeonWalRecord::ClearVisibilityMapFlags { - new_heap_blkno, - old_heap_blkno, - flags, - } => { - // sanity check that this is modifying the correct relation - let (rel, blknum) = key_to_rel_block(key).context("invalid record")?; - assert!( - rel.forknum == VISIBILITYMAP_FORKNUM, - "ClearVisibilityMapFlags record on unexpected rel {}", - rel - ); - if let Some(heap_blkno) = *new_heap_blkno { - // Calculate the VM block and offset that corresponds to the heap block. - let map_block = pg_constants::HEAPBLK_TO_MAPBLOCK(heap_blkno); - let map_byte = pg_constants::HEAPBLK_TO_MAPBYTE(heap_blkno); - let map_offset = pg_constants::HEAPBLK_TO_OFFSET(heap_blkno); - - // Check that we're modifying the correct VM block. 
- assert!(map_block == blknum); - - // equivalent to PageGetContents(page) - let map = &mut page[pg_constants::MAXALIGN_SIZE_OF_PAGE_HEADER_DATA..]; - - map[map_byte as usize] &= !(flags << map_offset); - } - - // Repeat for 'old_heap_blkno', if any - if let Some(heap_blkno) = *old_heap_blkno { - let map_block = pg_constants::HEAPBLK_TO_MAPBLOCK(heap_blkno); - let map_byte = pg_constants::HEAPBLK_TO_MAPBYTE(heap_blkno); - let map_offset = pg_constants::HEAPBLK_TO_OFFSET(heap_blkno); - - assert!(map_block == blknum); - - let map = &mut page[pg_constants::MAXALIGN_SIZE_OF_PAGE_HEADER_DATA..]; - - map[map_byte as usize] &= !(flags << map_offset); - } - } - // Non-relational WAL records are handled here, with custom code that has the - // same effects as the corresponding Postgres WAL redo function. - NeonWalRecord::ClogSetCommitted { xids, timestamp } => { - let (slru_kind, segno, blknum) = - key_to_slru_block(key).context("invalid record")?; - assert_eq!( - slru_kind, - SlruKind::Clog, - "ClogSetCommitted record with unexpected key {}", - key - ); - for &xid in xids { - let pageno = xid / pg_constants::CLOG_XACTS_PER_PAGE; - let expected_segno = pageno / pg_constants::SLRU_PAGES_PER_SEGMENT; - let expected_blknum = pageno % pg_constants::SLRU_PAGES_PER_SEGMENT; - - // Check that we're modifying the correct CLOG block. 
- assert!( - segno == expected_segno, - "ClogSetCommitted record for XID {} with unexpected key {}", - xid, - key - ); - assert!( - blknum == expected_blknum, - "ClogSetCommitted record for XID {} with unexpected key {}", - xid, - key - ); - - transaction_id_set_status( - xid, - pg_constants::TRANSACTION_STATUS_COMMITTED, - page, - ); - } - - // Append the timestamp - if page.len() == BLCKSZ as usize + 8 { - page.truncate(BLCKSZ as usize); - } - if page.len() == BLCKSZ as usize { - page.extend_from_slice(×tamp.to_be_bytes()); - } else { - warn!( - "CLOG blk {} in seg {} has invalid size {}", - blknum, - segno, - page.len() - ); - } - } - NeonWalRecord::ClogSetAborted { xids } => { - let (slru_kind, segno, blknum) = - key_to_slru_block(key).context("invalid record")?; - assert_eq!( - slru_kind, - SlruKind::Clog, - "ClogSetAborted record with unexpected key {}", - key - ); - for &xid in xids { - let pageno = xid / pg_constants::CLOG_XACTS_PER_PAGE; - let expected_segno = pageno / pg_constants::SLRU_PAGES_PER_SEGMENT; - let expected_blknum = pageno % pg_constants::SLRU_PAGES_PER_SEGMENT; - - // Check that we're modifying the correct CLOG block. - assert!( - segno == expected_segno, - "ClogSetAborted record for XID {} with unexpected key {}", - xid, - key - ); - assert!( - blknum == expected_blknum, - "ClogSetAborted record for XID {} with unexpected key {}", - xid, - key - ); - - transaction_id_set_status(xid, pg_constants::TRANSACTION_STATUS_ABORTED, page); - } - } - NeonWalRecord::MultixactOffsetCreate { mid, moff } => { - let (slru_kind, segno, blknum) = - key_to_slru_block(key).context("invalid record")?; - assert_eq!( - slru_kind, - SlruKind::MultiXactOffsets, - "MultixactOffsetCreate record with unexpected key {}", - key - ); - // Compute the block and offset to modify. - // See RecordNewMultiXact in PostgreSQL sources. 
- let pageno = mid / pg_constants::MULTIXACT_OFFSETS_PER_PAGE as u32; - let entryno = mid % pg_constants::MULTIXACT_OFFSETS_PER_PAGE as u32; - let offset = (entryno * 4) as usize; - - // Check that we're modifying the correct multixact-offsets block. - let expected_segno = pageno / pg_constants::SLRU_PAGES_PER_SEGMENT; - let expected_blknum = pageno % pg_constants::SLRU_PAGES_PER_SEGMENT; - assert!( - segno == expected_segno, - "MultiXactOffsetsCreate record for multi-xid {} with unexpected key {}", - mid, - key - ); - assert!( - blknum == expected_blknum, - "MultiXactOffsetsCreate record for multi-xid {} with unexpected key {}", - mid, - key - ); - - LittleEndian::write_u32(&mut page[offset..offset + 4], *moff); - } - NeonWalRecord::MultixactMembersCreate { moff, members } => { - let (slru_kind, segno, blknum) = - key_to_slru_block(key).context("invalid record")?; - assert_eq!( - slru_kind, - SlruKind::MultiXactMembers, - "MultixactMembersCreate record with unexpected key {}", - key - ); - for (i, member) in members.iter().enumerate() { - let offset = moff + i as u32; - - // Compute the block and offset to modify. - // See RecordNewMultiXact in PostgreSQL sources. - let pageno = offset / pg_constants::MULTIXACT_MEMBERS_PER_PAGE as u32; - let memberoff = mx_offset_to_member_offset(offset); - let flagsoff = mx_offset_to_flags_offset(offset); - let bshift = mx_offset_to_flags_bitshift(offset); - - // Check that we're modifying the correct multixact-members block. 
- let expected_segno = pageno / pg_constants::SLRU_PAGES_PER_SEGMENT; - let expected_blknum = pageno % pg_constants::SLRU_PAGES_PER_SEGMENT; - assert!( - segno == expected_segno, - "MultiXactMembersCreate record for offset {} with unexpected key {}", - moff, - key - ); - assert!( - blknum == expected_blknum, - "MultiXactMembersCreate record for offset {} with unexpected key {}", - moff, - key - ); - - let mut flagsval = LittleEndian::read_u32(&page[flagsoff..flagsoff + 4]); - flagsval &= !(((1 << pg_constants::MXACT_MEMBER_BITS_PER_XACT) - 1) << bshift); - flagsval |= member.status << bshift; - LittleEndian::write_u32(&mut page[flagsoff..flagsoff + 4], flagsval); - LittleEndian::write_u32(&mut page[memberoff..memberoff + 4], member.xid); - } - } - } + apply_neon::apply_in_neon(record, key, page)?; Ok(()) } } -/// -/// Command with ability not to give all file descriptors to child process -/// -trait CloseFileDescriptors: CommandExt { - /// - /// Close file descriptors (other than stdin, stdout, stderr) in child process - /// - fn close_fds(&mut self) -> &mut Command; -} - -impl CloseFileDescriptors for C { - fn close_fds(&mut self) -> &mut Command { - // SAFETY: Code executed inside pre_exec should have async-signal-safety, - // which means it should be safe to execute inside a signal handler. - // The precise meaning depends on platform. See `man signal-safety` - // for the linux definition. - // - // The set_fds_cloexec_threadsafe function is documented to be - // async-signal-safe. - // - // Aside from this function, the rest of the code is re-entrant and - // doesn't make any syscalls. We're just passing constants. - // - // NOTE: It's easy to indirectly cause a malloc or lock a mutex, - // which is not async-signal-safe. Be careful. 
- unsafe { - self.pre_exec(move || { - close_fds::set_fds_cloexec_threadsafe(3, &[]); - Ok(()) - }) - } - } -} - -struct WalRedoProcess { - #[allow(dead_code)] - conf: &'static PageServerConf, - tenant_shard_id: TenantShardId, - // Some() on construction, only becomes None on Drop. - child: Option, - stdout: tokio::sync::Mutex>, - stdin: tokio::sync::Mutex>, - /// Counter to separate same sized walredo inputs failing at the same millisecond. - #[cfg(feature = "testing")] - dump_sequence: AtomicUsize, -} - -impl WalRedoProcess { - // - // Start postgres binary in special WAL redo mode. - // - #[instrument(skip_all,fields(tenant_id=%tenant_shard_id.tenant_id, shard_id=%tenant_shard_id.shard_slug(), pg_version=pg_version))] - fn launch( - conf: &'static PageServerConf, - tenant_shard_id: TenantShardId, - pg_version: u32, - ) -> anyhow::Result { - let pg_bin_dir_path = conf.pg_bin_dir(pg_version).context("pg_bin_dir")?; // TODO these should be infallible. - let pg_lib_dir_path = conf.pg_lib_dir(pg_version).context("pg_lib_dir")?; - - // Start postgres itself - let child = Command::new(pg_bin_dir_path.join("postgres")) - .arg("--wal-redo") - .stdin(Stdio::piped()) - .stderr(Stdio::piped()) - .stdout(Stdio::piped()) - .env_clear() - .env("LD_LIBRARY_PATH", &pg_lib_dir_path) - .env("DYLD_LIBRARY_PATH", &pg_lib_dir_path) - // The redo process is not trusted, and runs in seccomp mode that - // doesn't allow it to open any files. We have to also make sure it - // doesn't inherit any file descriptors from the pageserver, that - // would allow an attacker to read any files that happen to be open - // in the pageserver. - // - // The Rust standard library makes sure to mark any file descriptors with - // as close-on-exec by default, but that's not enough, since we use - // libraries that directly call libc open without setting that flag. 
- .close_fds() - .spawn_no_leak_child(tenant_shard_id) - .context("spawn process")?; - WAL_REDO_PROCESS_COUNTERS.started.inc(); - let mut child = scopeguard::guard(child, |child| { - error!("killing wal-redo-postgres process due to a problem during launch"); - child.kill_and_wait(WalRedoKillCause::Startup); - }); - - let stdin = child.stdin.take().unwrap(); - let stdout = child.stdout.take().unwrap(); - let stderr = child.stderr.take().unwrap(); - let stderr = tokio::process::ChildStderr::from_std(stderr) - .context("convert to tokio::ChildStderr")?; - let stdin = - tokio::process::ChildStdin::from_std(stdin).context("convert to tokio::ChildStdin")?; - let stdout = tokio::process::ChildStdout::from_std(stdout) - .context("convert to tokio::ChildStdout")?; - - // all fallible operations post-spawn are complete, so get rid of the guard - let child = scopeguard::ScopeGuard::into_inner(child); - - tokio::spawn( - async move { - scopeguard::defer! { - debug!("wal-redo-postgres stderr_logger_task finished"); - crate::metrics::WAL_REDO_PROCESS_COUNTERS.active_stderr_logger_tasks_finished.inc(); - } - debug!("wal-redo-postgres stderr_logger_task started"); - crate::metrics::WAL_REDO_PROCESS_COUNTERS.active_stderr_logger_tasks_started.inc(); - - use tokio::io::AsyncBufReadExt; - let mut stderr_lines = tokio::io::BufReader::new(stderr); - let mut buf = Vec::new(); - let res = loop { - buf.clear(); - // TODO we don't trust the process to cap its stderr length. - // Currently it can do unbounded Vec allocation. 
- match stderr_lines.read_until(b'\n', &mut buf).await { - Ok(0) => break Ok(()), // eof - Ok(num_bytes) => { - let output = String::from_utf8_lossy(&buf[..num_bytes]); - error!(%output, "received output"); - } - Err(e) => { - break Err(e); - } - } - }; - match res { - Ok(()) => (), - Err(e) => { - error!(error=?e, "failed to read from walredo stderr"); - } - } - }.instrument(tracing::info_span!(parent: None, "wal-redo-postgres-stderr", pid = child.id(), tenant_id = %tenant_shard_id.tenant_id, shard_id = %tenant_shard_id.shard_slug(), %pg_version)) - ); - - Ok(Self { - conf, - tenant_shard_id, - child: Some(child), - stdin: tokio::sync::Mutex::new(Poison::new( - "stdin", - ProcessInput { - stdin, - n_requests: 0, - }, - )), - stdout: tokio::sync::Mutex::new(Poison::new( - "stdout", - ProcessOutput { - stdout, - pending_responses: VecDeque::new(), - n_processed_responses: 0, - }, - )), - #[cfg(feature = "testing")] - dump_sequence: AtomicUsize::default(), - }) - } - - fn id(&self) -> u32 { - self.child - .as_ref() - .expect("must not call this during Drop") - .id() - } - - // Apply given WAL records ('records') over an old page image. Returns - // new page image. - // - #[instrument(skip_all, fields(tenant_id=%self.tenant_shard_id.tenant_id, shard_id=%self.tenant_shard_id.shard_slug(), pid=%self.id()))] - async fn apply_wal_records( - &self, - tag: BufferTag, - base_img: &Option, - records: &[(Lsn, NeonWalRecord)], - ) -> anyhow::Result { - // Serialize all the messages to send the WAL redo process first. - // - // This could be problematic if there are millions of records to replay, - // but in practice the number of records is usually so small that it doesn't - // matter, and it's better to keep this code simple. - // - // Most requests start with a before-image with BLCKSZ bytes, followed by - // by some other WAL records. Start with a buffer that can hold that - // comfortably. 
- let mut writebuf: Vec = Vec::with_capacity((BLCKSZ as usize) * 3); - build_begin_redo_for_block_msg(tag, &mut writebuf); - if let Some(img) = base_img { - build_push_page_msg(tag, img, &mut writebuf); - } - for (lsn, rec) in records.iter() { - if let NeonWalRecord::Postgres { - will_init: _, - rec: postgres_rec, - } = rec - { - build_apply_record_msg(*lsn, postgres_rec, &mut writebuf); - } else { - anyhow::bail!("tried to pass neon wal record to postgres WAL redo"); - } - } - build_get_page_msg(tag, &mut writebuf); - WAL_REDO_RECORD_COUNTER.inc_by(records.len() as u64); - - let res = self.apply_wal_records0(&writebuf).await; - - if res.is_err() { - // not all of these can be caused by this particular input, however these are so rare - // in tests so capture all. - self.record_and_log(&writebuf); - } - - res - } - - async fn apply_wal_records0(&self, writebuf: &[u8]) -> anyhow::Result { - let request_no = { - let mut lock_guard = self.stdin.lock().await; - let mut poison_guard = lock_guard.check_and_arm()?; - let input = poison_guard.data_mut(); - input - .stdin - .write_all(writebuf) - .await - .context("write to walredo stdin")?; - let request_no = input.n_requests; - input.n_requests += 1; - poison_guard.disarm(); - request_no - }; - - // To improve walredo performance we separate sending requests and receiving - // responses. Them are protected by different mutexes (output and input). - // If thread T1, T2, T3 send requests D1, D2, D3 to walredo process - // then there is not warranty that T1 will first granted output mutex lock. - // To address this issue we maintain number of sent requests, number of processed - // responses and ring buffer with pending responses. After sending response - // (under input mutex), threads remembers request number. Then it releases - // input mutex, locks output mutex and fetch in ring buffer all responses until - // its stored request number. 
The it takes correspondent element from - // pending responses ring buffer and truncate all empty elements from the front, - // advancing processed responses number. - - let mut lock_guard = self.stdout.lock().await; - let mut poison_guard = lock_guard.check_and_arm()?; - let output = poison_guard.data_mut(); - let n_processed_responses = output.n_processed_responses; - while n_processed_responses + output.pending_responses.len() <= request_no { - // We expect the WAL redo process to respond with an 8k page image. We read it - // into this buffer. - let mut resultbuf = vec![0; BLCKSZ.into()]; - output - .stdout - .read_exact(&mut resultbuf) - .await - .context("read walredo stdout")?; - output - .pending_responses - .push_back(Some(Bytes::from(resultbuf))); - } - // Replace our request's response with None in `pending_responses`. - // Then make space in the ring buffer by clearing out any seqence of contiguous - // `None`'s from the front of `pending_responses`. - // NB: We can't pop_front() because other requests' responses because another - // requester might have grabbed the output mutex before us: - // T1: grab input mutex - // T1: send request_no 23 - // T1: release input mutex - // T2: grab input mutex - // T2: send request_no 24 - // T2: release input mutex - // T2: grab output mutex - // T2: n_processed_responses + output.pending_responses.len() <= request_no - // 23 0 24 - // T2: enters poll loop that reads stdout - // T2: put response for 23 into pending_responses - // T2: put response for 24 into pending_resposnes - // pending_responses now looks like this: Front Some(response_23) Some(response_24) Back - // T2: takes its response_24 - // pending_responses now looks like this: Front Some(response_23) None Back - // T2: does the while loop below - // pending_responses now looks like this: Front Some(response_23) None Back - // T2: releases output mutex - // T1: grabs output mutex - // T1: n_processed_responses + output.pending_responses.len() > request_no 
- // 23 2 23 - // T1: skips poll loop that reads stdout - // T1: takes its response_23 - // pending_responses now looks like this: Front None None Back - // T2: does the while loop below - // pending_responses now looks like this: Front Back - // n_processed_responses now has value 25 - let res = output.pending_responses[request_no - n_processed_responses] - .take() - .expect("we own this request_no, nobody else is supposed to take it"); - while let Some(front) = output.pending_responses.front() { - if front.is_none() { - output.pending_responses.pop_front(); - output.n_processed_responses += 1; - } else { - break; - } - } - poison_guard.disarm(); - Ok(res) - } - - #[cfg(feature = "testing")] - fn record_and_log(&self, writebuf: &[u8]) { - let millis = std::time::SystemTime::now() - .duration_since(std::time::SystemTime::UNIX_EPOCH) - .unwrap() - .as_millis(); - - let seq = self.dump_sequence.fetch_add(1, Ordering::Relaxed); - - // these files will be collected to an allure report - let filename = format!("walredo-{millis}-{}-{seq}.walredo", writebuf.len()); - - let path = self.conf.tenant_path(&self.tenant_shard_id).join(&filename); - - use std::io::Write; - let res = std::fs::OpenOptions::new() - .write(true) - .create_new(true) - .read(true) - .open(path) - .and_then(|mut f| f.write_all(writebuf)); - - // trip up allowed_errors - if let Err(e) = res { - tracing::error!(target=%filename, length=writebuf.len(), "failed to write out the walredo errored input: {e}"); - } else { - tracing::error!(filename, "erroring walredo input saved"); - } - } - - #[cfg(not(feature = "testing"))] - fn record_and_log(&self, _: &[u8]) {} -} - -impl Drop for WalRedoProcess { - fn drop(&mut self) { - self.child - .take() - .expect("we only do this once") - .kill_and_wait(WalRedoKillCause::WalRedoProcessDrop); - // no way to wait for stderr_logger_task from Drop because that is async only - } -} - -/// Wrapper type around `std::process::Child` which guarantees that the child -/// will 
be killed and waited-for by this process before being dropped. -struct NoLeakChild { - tenant_id: TenantShardId, - child: Option, -} - -impl Deref for NoLeakChild { - type Target = Child; - - fn deref(&self) -> &Self::Target { - self.child.as_ref().expect("must not use from drop") - } -} - -impl DerefMut for NoLeakChild { - fn deref_mut(&mut self) -> &mut Self::Target { - self.child.as_mut().expect("must not use from drop") - } -} - -impl NoLeakChild { - fn spawn(tenant_id: TenantShardId, command: &mut Command) -> io::Result { - let child = command.spawn()?; - Ok(NoLeakChild { - tenant_id, - child: Some(child), - }) - } - - fn kill_and_wait(mut self, cause: WalRedoKillCause) { - let child = match self.child.take() { - Some(child) => child, - None => return, - }; - Self::kill_and_wait_impl(child, cause); - } - - #[instrument(skip_all, fields(pid=child.id(), ?cause))] - fn kill_and_wait_impl(mut child: Child, cause: WalRedoKillCause) { - scopeguard::defer! { - WAL_REDO_PROCESS_COUNTERS.killed_by_cause[cause].inc(); - } - let res = child.kill(); - if let Err(e) = res { - // This branch is very unlikely because: - // - We (= pageserver) spawned this process successfully, so, we're allowed to kill it. - // - This is the only place that calls .kill() - // - We consume `self`, so, .kill() can't be called twice. - // - If the process exited by itself or was killed by someone else, - // .kill() will still succeed because we haven't wait()'ed yet. - // - // So, if we arrive here, we have really no idea what happened, - // whether the PID stored in self.child is still valid, etc. - // If this function were fallible, we'd return an error, but - // since it isn't, all we can do is log an error and proceed - // with the wait(). 
- error!(error = %e, "failed to SIGKILL; subsequent wait() might fail or wait for wrong process"); - } - - match child.wait() { - Ok(exit_status) => { - info!(exit_status = %exit_status, "wait successful"); - } - Err(e) => { - error!(error = %e, "wait error; might leak the child process; it will show as zombie (defunct)"); - } - } - } -} - -impl Drop for NoLeakChild { - fn drop(&mut self) { - let child = match self.child.take() { - Some(child) => child, - None => return, - }; - let tenant_shard_id = self.tenant_id; - // Offload the kill+wait of the child process into the background. - // If someone stops the runtime, we'll leak the child process. - // We can ignore that case because we only stop the runtime on pageserver exit. - tokio::runtime::Handle::current().spawn(async move { - tokio::task::spawn_blocking(move || { - // Intentionally don't inherit the tracing context from whoever is dropping us. - // This thread here is going to outlive of our dropper. - let span = tracing::info_span!( - "walredo", - tenant_id = %tenant_shard_id.tenant_id, - shard_id = %tenant_shard_id.shard_slug() - ); - let _entered = span.enter(); - Self::kill_and_wait_impl(child, WalRedoKillCause::NoLeakChildDrop); - }) - .await - }); - } -} - -trait NoLeakChildCommandExt { - fn spawn_no_leak_child(&mut self, tenant_id: TenantShardId) -> io::Result; -} - -impl NoLeakChildCommandExt for Command { - fn spawn_no_leak_child(&mut self, tenant_id: TenantShardId) -> io::Result { - NoLeakChild::spawn(tenant_id, self) - } -} - -// Functions for constructing messages to send to the postgres WAL redo -// process. See pgxn/neon_walredo/walredoproc.c for -// explanation of the protocol. 
- -fn build_begin_redo_for_block_msg(tag: BufferTag, buf: &mut Vec) { - let len = 4 + 1 + 4 * 4; - - buf.put_u8(b'B'); - buf.put_u32(len as u32); - - tag.ser_into(buf) - .expect("serialize BufferTag should always succeed"); -} - -fn build_push_page_msg(tag: BufferTag, base_img: &[u8], buf: &mut Vec) { - assert!(base_img.len() == 8192); - - let len = 4 + 1 + 4 * 4 + base_img.len(); - - buf.put_u8(b'P'); - buf.put_u32(len as u32); - tag.ser_into(buf) - .expect("serialize BufferTag should always succeed"); - buf.put(base_img); -} - -fn build_apply_record_msg(endlsn: Lsn, rec: &[u8], buf: &mut Vec) { - let len = 4 + 8 + rec.len(); - - buf.put_u8(b'A'); - buf.put_u32(len as u32); - buf.put_u64(endlsn.0); - buf.put(rec); -} - -fn build_get_page_msg(tag: BufferTag, buf: &mut Vec) { - let len = 4 + 1 + 4 * 4; - - buf.put_u8(b'G'); - buf.put_u32(len as u32); - tag.ser_into(buf) - .expect("serialize BufferTag should always succeed"); -} - #[cfg(test)] mod tests { use super::PostgresRedoManager; @@ -1118,6 +373,7 @@ mod tests { use bytes::Bytes; use pageserver_api::shard::TenantShardId; use std::str::FromStr; + use tracing::Instrument; use utils::{id::TenantId, lsn::Lsn}; #[tokio::test] @@ -1142,6 +398,7 @@ mod tests { short_records(), 14, ) + .instrument(h.span()) .await .unwrap(); @@ -1169,6 +426,7 @@ mod tests { short_records(), 14, ) + .instrument(h.span()) .await .unwrap(); @@ -1189,6 +447,7 @@ mod tests { short_records(), 16, /* 16 currently produces stderr output on startup, which adds a nice extra edge */ ) + .instrument(h.span()) .await .unwrap_err(); } @@ -1217,6 +476,7 @@ mod tests { // underscored because unused, except for removal at drop _repo_dir: camino_tempfile::Utf8TempDir, manager: PostgresRedoManager, + tenant_shard_id: TenantShardId, } impl RedoHarness { @@ -1233,7 +493,11 @@ mod tests { Ok(RedoHarness { _repo_dir: repo_dir, manager, + tenant_shard_id, }) } + fn span(&self) -> tracing::Span { + tracing::info_span!("RedoHarness", 
tenant_id=%self.tenant_shard_id.tenant_id, shard_id=%self.tenant_shard_id.shard_slug()) + } } } diff --git a/pageserver/src/walredo/apply_neon.rs b/pageserver/src/walredo/apply_neon.rs new file mode 100644 index 0000000000..247704e2a5 --- /dev/null +++ b/pageserver/src/walredo/apply_neon.rs @@ -0,0 +1,301 @@ +use crate::pgdatadir_mapping::AuxFilesDirectory; +use crate::walrecord::NeonWalRecord; +use anyhow::Context; +use byteorder::{ByteOrder, LittleEndian}; +use bytes::{BufMut, BytesMut}; +use pageserver_api::key::{key_to_rel_block, key_to_slru_block, Key}; +use pageserver_api::reltag::SlruKind; +use postgres_ffi::pg_constants; +use postgres_ffi::relfile_utils::VISIBILITYMAP_FORKNUM; +use postgres_ffi::v14::nonrelfile_utils::{ + mx_offset_to_flags_bitshift, mx_offset_to_flags_offset, mx_offset_to_member_offset, + transaction_id_set_status, +}; +use postgres_ffi::BLCKSZ; +use tracing::*; +use utils::bin_ser::BeSer; + +/// Can this request be served by neon redo functions +/// or we need to pass it to wal-redo postgres process? +pub(crate) fn can_apply_in_neon(rec: &NeonWalRecord) -> bool { + // Currently, we don't have bespoken Rust code to replay any + // Postgres WAL records. But everything else is handled in neon. 
+ #[allow(clippy::match_like_matches_macro)] + match rec { + NeonWalRecord::Postgres { + will_init: _, + rec: _, + } => false, + _ => true, + } +} + +pub(crate) fn apply_in_neon( + record: &NeonWalRecord, + key: Key, + page: &mut BytesMut, +) -> Result<(), anyhow::Error> { + match record { + NeonWalRecord::Postgres { + will_init: _, + rec: _, + } => { + anyhow::bail!("tried to pass postgres wal record to neon WAL redo"); + } + NeonWalRecord::ClearVisibilityMapFlags { + new_heap_blkno, + old_heap_blkno, + flags, + } => { + // sanity check that this is modifying the correct relation + let (rel, blknum) = key_to_rel_block(key).context("invalid record")?; + assert!( + rel.forknum == VISIBILITYMAP_FORKNUM, + "ClearVisibilityMapFlags record on unexpected rel {}", + rel + ); + if let Some(heap_blkno) = *new_heap_blkno { + // Calculate the VM block and offset that corresponds to the heap block. + let map_block = pg_constants::HEAPBLK_TO_MAPBLOCK(heap_blkno); + let map_byte = pg_constants::HEAPBLK_TO_MAPBYTE(heap_blkno); + let map_offset = pg_constants::HEAPBLK_TO_OFFSET(heap_blkno); + + // Check that we're modifying the correct VM block. + assert!(map_block == blknum); + + // equivalent to PageGetContents(page) + let map = &mut page[pg_constants::MAXALIGN_SIZE_OF_PAGE_HEADER_DATA..]; + + map[map_byte as usize] &= !(flags << map_offset); + } + + // Repeat for 'old_heap_blkno', if any + if let Some(heap_blkno) = *old_heap_blkno { + let map_block = pg_constants::HEAPBLK_TO_MAPBLOCK(heap_blkno); + let map_byte = pg_constants::HEAPBLK_TO_MAPBYTE(heap_blkno); + let map_offset = pg_constants::HEAPBLK_TO_OFFSET(heap_blkno); + + assert!(map_block == blknum); + + let map = &mut page[pg_constants::MAXALIGN_SIZE_OF_PAGE_HEADER_DATA..]; + + map[map_byte as usize] &= !(flags << map_offset); + } + } + // Non-relational WAL records are handled here, with custom code that has the + // same effects as the corresponding Postgres WAL redo function. 
+ NeonWalRecord::ClogSetCommitted { xids, timestamp } => { + let (slru_kind, segno, blknum) = key_to_slru_block(key).context("invalid record")?; + assert_eq!( + slru_kind, + SlruKind::Clog, + "ClogSetCommitted record with unexpected key {}", + key + ); + for &xid in xids { + let pageno = xid / pg_constants::CLOG_XACTS_PER_PAGE; + let expected_segno = pageno / pg_constants::SLRU_PAGES_PER_SEGMENT; + let expected_blknum = pageno % pg_constants::SLRU_PAGES_PER_SEGMENT; + + // Check that we're modifying the correct CLOG block. + assert!( + segno == expected_segno, + "ClogSetCommitted record for XID {} with unexpected key {}", + xid, + key + ); + assert!( + blknum == expected_blknum, + "ClogSetCommitted record for XID {} with unexpected key {}", + xid, + key + ); + + transaction_id_set_status(xid, pg_constants::TRANSACTION_STATUS_COMMITTED, page); + } + + // Append the timestamp + if page.len() == BLCKSZ as usize + 8 { + page.truncate(BLCKSZ as usize); + } + if page.len() == BLCKSZ as usize { + page.extend_from_slice(&timestamp.to_be_bytes()); + } else { + warn!( + "CLOG blk {} in seg {} has invalid size {}", + blknum, + segno, + page.len() + ); + } + } + NeonWalRecord::ClogSetAborted { xids } => { + let (slru_kind, segno, blknum) = key_to_slru_block(key).context("invalid record")?; + assert_eq!( + slru_kind, + SlruKind::Clog, + "ClogSetAborted record with unexpected key {}", + key + ); + for &xid in xids { + let pageno = xid / pg_constants::CLOG_XACTS_PER_PAGE; + let expected_segno = pageno / pg_constants::SLRU_PAGES_PER_SEGMENT; + let expected_blknum = pageno % pg_constants::SLRU_PAGES_PER_SEGMENT; + + // Check that we're modifying the correct CLOG block. 
+ assert!( + segno == expected_segno, + "ClogSetAborted record for XID {} with unexpected key {}", + xid, + key + ); + assert!( + blknum == expected_blknum, + "ClogSetAborted record for XID {} with unexpected key {}", + xid, + key + ); + + transaction_id_set_status(xid, pg_constants::TRANSACTION_STATUS_ABORTED, page); + } + } + NeonWalRecord::MultixactOffsetCreate { mid, moff } => { + let (slru_kind, segno, blknum) = key_to_slru_block(key).context("invalid record")?; + assert_eq!( + slru_kind, + SlruKind::MultiXactOffsets, + "MultixactOffsetCreate record with unexpected key {}", + key + ); + // Compute the block and offset to modify. + // See RecordNewMultiXact in PostgreSQL sources. + let pageno = mid / pg_constants::MULTIXACT_OFFSETS_PER_PAGE as u32; + let entryno = mid % pg_constants::MULTIXACT_OFFSETS_PER_PAGE as u32; + let offset = (entryno * 4) as usize; + + // Check that we're modifying the correct multixact-offsets block. + let expected_segno = pageno / pg_constants::SLRU_PAGES_PER_SEGMENT; + let expected_blknum = pageno % pg_constants::SLRU_PAGES_PER_SEGMENT; + assert!( + segno == expected_segno, + "MultiXactOffsetsCreate record for multi-xid {} with unexpected key {}", + mid, + key + ); + assert!( + blknum == expected_blknum, + "MultiXactOffsetsCreate record for multi-xid {} with unexpected key {}", + mid, + key + ); + + LittleEndian::write_u32(&mut page[offset..offset + 4], *moff); + } + NeonWalRecord::MultixactMembersCreate { moff, members } => { + let (slru_kind, segno, blknum) = key_to_slru_block(key).context("invalid record")?; + assert_eq!( + slru_kind, + SlruKind::MultiXactMembers, + "MultixactMembersCreate record with unexpected key {}", + key + ); + for (i, member) in members.iter().enumerate() { + let offset = moff + i as u32; + + // Compute the block and offset to modify. + // See RecordNewMultiXact in PostgreSQL sources. 
+ let pageno = offset / pg_constants::MULTIXACT_MEMBERS_PER_PAGE as u32; + let memberoff = mx_offset_to_member_offset(offset); + let flagsoff = mx_offset_to_flags_offset(offset); + let bshift = mx_offset_to_flags_bitshift(offset); + + // Check that we're modifying the correct multixact-members block. + let expected_segno = pageno / pg_constants::SLRU_PAGES_PER_SEGMENT; + let expected_blknum = pageno % pg_constants::SLRU_PAGES_PER_SEGMENT; + assert!( + segno == expected_segno, + "MultiXactMembersCreate record for offset {} with unexpected key {}", + moff, + key + ); + assert!( + blknum == expected_blknum, + "MultiXactMembersCreate record for offset {} with unexpected key {}", + moff, + key + ); + + let mut flagsval = LittleEndian::read_u32(&page[flagsoff..flagsoff + 4]); + flagsval &= !(((1 << pg_constants::MXACT_MEMBER_BITS_PER_XACT) - 1) << bshift); + flagsval |= member.status << bshift; + LittleEndian::write_u32(&mut page[flagsoff..flagsoff + 4], flagsval); + LittleEndian::write_u32(&mut page[memberoff..memberoff + 4], member.xid); + } + } + NeonWalRecord::AuxFile { file_path, content } => { + let mut dir = AuxFilesDirectory::des(page)?; + dir.upsert(file_path.clone(), content.clone()); + + page.clear(); + let mut writer = page.writer(); + dir.ser_into(&mut writer)?; + } + } + Ok(()) +} + +#[cfg(test)] +mod test { + use bytes::Bytes; + use pageserver_api::key::AUX_FILES_KEY; + + use super::*; + use std::collections::HashMap; + + /// Test [`apply_in_neon`]'s handling of NeonWalRecord::AuxFile + #[test] + fn apply_aux_file_deltas() -> anyhow::Result<()> { + let base_dir = AuxFilesDirectory { + files: HashMap::from([ + ("two".to_string(), Bytes::from_static(b"content0")), + ("three".to_string(), Bytes::from_static(b"contentX")), + ]), + }; + let base_image = AuxFilesDirectory::ser(&base_dir)?; + + let deltas = vec![ + // Insert + NeonWalRecord::AuxFile { + file_path: "one".to_string(), + content: Some(Bytes::from_static(b"content1")), + }, + // Update + 
NeonWalRecord::AuxFile { + file_path: "two".to_string(), + content: Some(Bytes::from_static(b"content99")), + }, + // Delete + NeonWalRecord::AuxFile { + file_path: "three".to_string(), + content: None, + }, + ]; + + let file_path = AUX_FILES_KEY; + let mut page = BytesMut::from_iter(base_image); + + for record in deltas { + apply_in_neon(&record, file_path, &mut page)?; + } + + let reconstructed = AuxFilesDirectory::des(&page)?; + let expect = HashMap::from([ + ("one".to_string(), Bytes::from_static(b"content1")), + ("two".to_string(), Bytes::from_static(b"content99")), + ]); + + assert_eq!(reconstructed.files, expect); + + Ok(()) + } +} diff --git a/pageserver/src/walredo/process.rs b/pageserver/src/walredo/process.rs new file mode 100644 index 0000000000..d4e804137f --- /dev/null +++ b/pageserver/src/walredo/process.rs @@ -0,0 +1,362 @@ +use self::no_leak_child::NoLeakChild; +use crate::{ + config::PageServerConf, + metrics::{WalRedoKillCause, WAL_REDO_PROCESS_COUNTERS, WAL_REDO_RECORD_COUNTER}, + walrecord::NeonWalRecord, +}; +use anyhow::Context; +use bytes::Bytes; +use pageserver_api::{reltag::RelTag, shard::TenantShardId}; +use postgres_ffi::BLCKSZ; +#[cfg(feature = "testing")] +use std::sync::atomic::AtomicUsize; +use std::{ + collections::VecDeque, + process::{Command, Stdio}, +}; +use tokio::io::{AsyncReadExt, AsyncWriteExt}; +use tracing::{debug, error, instrument, Instrument}; +use utils::{lsn::Lsn, poison::Poison}; + +mod no_leak_child; +/// The IPC protocol that pageserver and walredo process speak over their shared pipe. +mod protocol; + +pub struct WalRedoProcess { + #[allow(dead_code)] + conf: &'static PageServerConf, + tenant_shard_id: TenantShardId, + // Some() on construction, only becomes None on Drop. + child: Option, + stdout: tokio::sync::Mutex>, + stdin: tokio::sync::Mutex>, + /// Counter to separate same sized walredo inputs failing at the same millisecond. 
+ #[cfg(feature = "testing")] + dump_sequence: AtomicUsize, +} + +struct ProcessInput { + stdin: tokio::process::ChildStdin, + n_requests: usize, +} + +struct ProcessOutput { + stdout: tokio::process::ChildStdout, + pending_responses: VecDeque>, + n_processed_responses: usize, +} + +impl WalRedoProcess { + // + // Start postgres binary in special WAL redo mode. + // + #[instrument(skip_all,fields(pg_version=pg_version))] + pub(crate) fn launch( + conf: &'static PageServerConf, + tenant_shard_id: TenantShardId, + pg_version: u32, + ) -> anyhow::Result { + crate::span::debug_assert_current_span_has_tenant_id(); + + let pg_bin_dir_path = conf.pg_bin_dir(pg_version).context("pg_bin_dir")?; // TODO these should be infallible. + let pg_lib_dir_path = conf.pg_lib_dir(pg_version).context("pg_lib_dir")?; + + use no_leak_child::NoLeakChildCommandExt; + // Start postgres itself + let child = Command::new(pg_bin_dir_path.join("postgres")) + // the first arg must be --wal-redo so the child process enters into walredo mode + .arg("--wal-redo") + // the child doesn't process this arg, but, having it in the argv helps indentify the + // walredo process for a particular tenant when debugging a pagserver + .args(["--tenant-shard-id", &format!("{tenant_shard_id}")]) + .stdin(Stdio::piped()) + .stderr(Stdio::piped()) + .stdout(Stdio::piped()) + .env_clear() + .env("LD_LIBRARY_PATH", &pg_lib_dir_path) + .env("DYLD_LIBRARY_PATH", &pg_lib_dir_path) + // NB: The redo process is not trusted after we sent it the first + // walredo work. Before that, it is trusted. Specifically, we trust + // it to + // 1. close all file descriptors except stdin, stdout, stderr because + // pageserver might not be 100% diligent in setting FD_CLOEXEC on all + // the files it opens, and + // 2. to use seccomp to sandbox itself before processing the first + // walredo request. 
+ .spawn_no_leak_child(tenant_shard_id) + .context("spawn process")?; + WAL_REDO_PROCESS_COUNTERS.started.inc(); + let mut child = scopeguard::guard(child, |child| { + error!("killing wal-redo-postgres process due to a problem during launch"); + child.kill_and_wait(WalRedoKillCause::Startup); + }); + + let stdin = child.stdin.take().unwrap(); + let stdout = child.stdout.take().unwrap(); + let stderr = child.stderr.take().unwrap(); + let stderr = tokio::process::ChildStderr::from_std(stderr) + .context("convert to tokio::ChildStderr")?; + let stdin = + tokio::process::ChildStdin::from_std(stdin).context("convert to tokio::ChildStdin")?; + let stdout = tokio::process::ChildStdout::from_std(stdout) + .context("convert to tokio::ChildStdout")?; + + // all fallible operations post-spawn are complete, so get rid of the guard + let child = scopeguard::ScopeGuard::into_inner(child); + + tokio::spawn( + async move { + scopeguard::defer! { + debug!("wal-redo-postgres stderr_logger_task finished"); + crate::metrics::WAL_REDO_PROCESS_COUNTERS.active_stderr_logger_tasks_finished.inc(); + } + debug!("wal-redo-postgres stderr_logger_task started"); + crate::metrics::WAL_REDO_PROCESS_COUNTERS.active_stderr_logger_tasks_started.inc(); + + use tokio::io::AsyncBufReadExt; + let mut stderr_lines = tokio::io::BufReader::new(stderr); + let mut buf = Vec::new(); + let res = loop { + buf.clear(); + // TODO we don't trust the process to cap its stderr length. + // Currently it can do unbounded Vec allocation. 
+ match stderr_lines.read_until(b'\n', &mut buf).await { + Ok(0) => break Ok(()), // eof + Ok(num_bytes) => { + let output = String::from_utf8_lossy(&buf[..num_bytes]); + error!(%output, "received output"); + } + Err(e) => { + break Err(e); + } + } + }; + match res { + Ok(()) => (), + Err(e) => { + error!(error=?e, "failed to read from walredo stderr"); + } + } + }.instrument(tracing::info_span!(parent: None, "wal-redo-postgres-stderr", pid = child.id(), tenant_id = %tenant_shard_id.tenant_id, shard_id = %tenant_shard_id.shard_slug(), %pg_version)) + ); + + Ok(Self { + conf, + tenant_shard_id, + child: Some(child), + stdin: tokio::sync::Mutex::new(Poison::new( + "stdin", + ProcessInput { + stdin, + n_requests: 0, + }, + )), + stdout: tokio::sync::Mutex::new(Poison::new( + "stdout", + ProcessOutput { + stdout, + pending_responses: VecDeque::new(), + n_processed_responses: 0, + }, + )), + #[cfg(feature = "testing")] + dump_sequence: AtomicUsize::default(), + }) + } + + pub(crate) fn id(&self) -> u32 { + self.child + .as_ref() + .expect("must not call this during Drop") + .id() + } + + // Apply given WAL records ('records') over an old page image. Returns + // new page image. + // + #[instrument(skip_all, fields(tenant_id=%self.tenant_shard_id.tenant_id, shard_id=%self.tenant_shard_id.shard_slug(), pid=%self.id()))] + pub(crate) async fn apply_wal_records( + &self, + rel: RelTag, + blknum: u32, + base_img: &Option, + records: &[(Lsn, NeonWalRecord)], + ) -> anyhow::Result { + let tag = protocol::BufferTag { rel, blknum }; + + // Serialize all the messages to send the WAL redo process first. + // + // This could be problematic if there are millions of records to replay, + // but in practice the number of records is usually so small that it doesn't + // matter, and it's better to keep this code simple. + // + // Most requests start with a before-image with BLCKSZ bytes, followed by + // by some other WAL records. Start with a buffer that can hold that + // comfortably. 
+ let mut writebuf: Vec = Vec::with_capacity((BLCKSZ as usize) * 3); + protocol::build_begin_redo_for_block_msg(tag, &mut writebuf); + if let Some(img) = base_img { + protocol::build_push_page_msg(tag, img, &mut writebuf); + } + for (lsn, rec) in records.iter() { + if let NeonWalRecord::Postgres { + will_init: _, + rec: postgres_rec, + } = rec + { + protocol::build_apply_record_msg(*lsn, postgres_rec, &mut writebuf); + } else { + anyhow::bail!("tried to pass neon wal record to postgres WAL redo"); + } + } + protocol::build_get_page_msg(tag, &mut writebuf); + WAL_REDO_RECORD_COUNTER.inc_by(records.len() as u64); + + let res = self.apply_wal_records0(&writebuf).await; + + if res.is_err() { + // not all of these can be caused by this particular input, however these are so rare + // in tests so capture all. + self.record_and_log(&writebuf); + } + + res + } + + async fn apply_wal_records0(&self, writebuf: &[u8]) -> anyhow::Result { + let request_no = { + let mut lock_guard = self.stdin.lock().await; + let mut poison_guard = lock_guard.check_and_arm()?; + let input = poison_guard.data_mut(); + input + .stdin + .write_all(writebuf) + .await + .context("write to walredo stdin")?; + let request_no = input.n_requests; + input.n_requests += 1; + poison_guard.disarm(); + request_no + }; + + // To improve walredo performance we separate sending requests and receiving + // responses. Them are protected by different mutexes (output and input). + // If thread T1, T2, T3 send requests D1, D2, D3 to walredo process + // then there is not warranty that T1 will first granted output mutex lock. + // To address this issue we maintain number of sent requests, number of processed + // responses and ring buffer with pending responses. After sending response + // (under input mutex), threads remembers request number. Then it releases + // input mutex, locks output mutex and fetch in ring buffer all responses until + // its stored request number. 
The it takes correspondent element from + // pending responses ring buffer and truncate all empty elements from the front, + // advancing processed responses number. + + let mut lock_guard = self.stdout.lock().await; + let mut poison_guard = lock_guard.check_and_arm()?; + let output = poison_guard.data_mut(); + let n_processed_responses = output.n_processed_responses; + while n_processed_responses + output.pending_responses.len() <= request_no { + // We expect the WAL redo process to respond with an 8k page image. We read it + // into this buffer. + let mut resultbuf = vec![0; BLCKSZ.into()]; + output + .stdout + .read_exact(&mut resultbuf) + .await + .context("read walredo stdout")?; + output + .pending_responses + .push_back(Some(Bytes::from(resultbuf))); + } + // Replace our request's response with None in `pending_responses`. + // Then make space in the ring buffer by clearing out any seqence of contiguous + // `None`'s from the front of `pending_responses`. + // NB: We can't pop_front() because other requests' responses because another + // requester might have grabbed the output mutex before us: + // T1: grab input mutex + // T1: send request_no 23 + // T1: release input mutex + // T2: grab input mutex + // T2: send request_no 24 + // T2: release input mutex + // T2: grab output mutex + // T2: n_processed_responses + output.pending_responses.len() <= request_no + // 23 0 24 + // T2: enters poll loop that reads stdout + // T2: put response for 23 into pending_responses + // T2: put response for 24 into pending_resposnes + // pending_responses now looks like this: Front Some(response_23) Some(response_24) Back + // T2: takes its response_24 + // pending_responses now looks like this: Front Some(response_23) None Back + // T2: does the while loop below + // pending_responses now looks like this: Front Some(response_23) None Back + // T2: releases output mutex + // T1: grabs output mutex + // T1: n_processed_responses + output.pending_responses.len() > request_no 
+ // 23 2 23 + // T1: skips poll loop that reads stdout + // T1: takes its response_23 + // pending_responses now looks like this: Front None None Back + // T2: does the while loop below + // pending_responses now looks like this: Front Back + // n_processed_responses now has value 25 + let res = output.pending_responses[request_no - n_processed_responses] + .take() + .expect("we own this request_no, nobody else is supposed to take it"); + while let Some(front) = output.pending_responses.front() { + if front.is_none() { + output.pending_responses.pop_front(); + output.n_processed_responses += 1; + } else { + break; + } + } + poison_guard.disarm(); + Ok(res) + } + + #[cfg(feature = "testing")] + fn record_and_log(&self, writebuf: &[u8]) { + use std::sync::atomic::Ordering; + + let millis = std::time::SystemTime::now() + .duration_since(std::time::SystemTime::UNIX_EPOCH) + .unwrap() + .as_millis(); + + let seq = self.dump_sequence.fetch_add(1, Ordering::Relaxed); + + // these files will be collected to an allure report + let filename = format!("walredo-{millis}-{}-{seq}.walredo", writebuf.len()); + + let path = self.conf.tenant_path(&self.tenant_shard_id).join(&filename); + + use std::io::Write; + let res = std::fs::OpenOptions::new() + .write(true) + .create_new(true) + .read(true) + .open(path) + .and_then(|mut f| f.write_all(writebuf)); + + // trip up allowed_errors + if let Err(e) = res { + tracing::error!(target=%filename, length=writebuf.len(), "failed to write out the walredo errored input: {e}"); + } else { + tracing::error!(filename, "erroring walredo input saved"); + } + } + + #[cfg(not(feature = "testing"))] + fn record_and_log(&self, _: &[u8]) {} +} + +impl Drop for WalRedoProcess { + fn drop(&mut self) { + self.child + .take() + .expect("we only do this once") + .kill_and_wait(WalRedoKillCause::WalRedoProcessDrop); + // no way to wait for stderr_logger_task from Drop because that is async only + } +} diff --git 
a/pageserver/src/walredo/process/no_leak_child.rs b/pageserver/src/walredo/process/no_leak_child.rs new file mode 100644 index 0000000000..1a0d7039df --- /dev/null +++ b/pageserver/src/walredo/process/no_leak_child.rs @@ -0,0 +1,124 @@ +use tracing::instrument; +use tracing::{error, info}; + +use crate::metrics::WalRedoKillCause; +use crate::metrics::WAL_REDO_PROCESS_COUNTERS; + +use std::io; +use std::process::Command; + +use std::ops::DerefMut; + +use std::ops::Deref; + +use std::process::Child; + +use pageserver_api::shard::TenantShardId; + +/// Wrapper type around `std::process::Child` which guarantees that the child +/// will be killed and waited-for by this process before being dropped. +pub(crate) struct NoLeakChild { + pub(crate) tenant_id: TenantShardId, + pub(crate) child: Option, +} + +impl Deref for NoLeakChild { + type Target = Child; + + fn deref(&self) -> &Self::Target { + self.child.as_ref().expect("must not use from drop") + } +} + +impl DerefMut for NoLeakChild { + fn deref_mut(&mut self) -> &mut Self::Target { + self.child.as_mut().expect("must not use from drop") + } +} + +impl NoLeakChild { + pub(crate) fn spawn(tenant_id: TenantShardId, command: &mut Command) -> io::Result { + let child = command.spawn()?; + Ok(NoLeakChild { + tenant_id, + child: Some(child), + }) + } + + pub(crate) fn kill_and_wait(mut self, cause: WalRedoKillCause) { + let child = match self.child.take() { + Some(child) => child, + None => return, + }; + Self::kill_and_wait_impl(child, cause); + } + + #[instrument(skip_all, fields(pid=child.id(), ?cause))] + pub(crate) fn kill_and_wait_impl(mut child: Child, cause: WalRedoKillCause) { + scopeguard::defer! { + WAL_REDO_PROCESS_COUNTERS.killed_by_cause[cause].inc(); + } + let res = child.kill(); + if let Err(e) = res { + // This branch is very unlikely because: + // - We (= pageserver) spawned this process successfully, so, we're allowed to kill it. 
+ // - This is the only place that calls .kill() + // - We consume `self`, so, .kill() can't be called twice. + // - If the process exited by itself or was killed by someone else, + // .kill() will still succeed because we haven't wait()'ed yet. + // + // So, if we arrive here, we have really no idea what happened, + // whether the PID stored in self.child is still valid, etc. + // If this function were fallible, we'd return an error, but + // since it isn't, all we can do is log an error and proceed + // with the wait(). + error!(error = %e, "failed to SIGKILL; subsequent wait() might fail or wait for wrong process"); + } + + match child.wait() { + Ok(exit_status) => { + info!(exit_status = %exit_status, "wait successful"); + } + Err(e) => { + error!(error = %e, "wait error; might leak the child process; it will show as zombie (defunct)"); + } + } + } +} + +impl Drop for NoLeakChild { + fn drop(&mut self) { + let child = match self.child.take() { + Some(child) => child, + None => return, + }; + let tenant_shard_id = self.tenant_id; + // Offload the kill+wait of the child process into the background. + // If someone stops the runtime, we'll leak the child process. + // We can ignore that case because we only stop the runtime on pageserver exit. + tokio::runtime::Handle::current().spawn(async move { + tokio::task::spawn_blocking(move || { + // Intentionally don't inherit the tracing context from whoever is dropping us. + // This thread here is going to outlive of our dropper. 
+ let span = tracing::info_span!( + "walredo", + tenant_id = %tenant_shard_id.tenant_id, + shard_id = %tenant_shard_id.shard_slug() + ); + let _entered = span.enter(); + Self::kill_and_wait_impl(child, WalRedoKillCause::NoLeakChildDrop); + }) + .await + }); + } +} + +pub(crate) trait NoLeakChildCommandExt { + fn spawn_no_leak_child(&mut self, tenant_id: TenantShardId) -> io::Result; +} + +impl NoLeakChildCommandExt for Command { + fn spawn_no_leak_child(&mut self, tenant_id: TenantShardId) -> io::Result { + NoLeakChild::spawn(tenant_id, self) + } +} diff --git a/pageserver/src/walredo/process/protocol.rs b/pageserver/src/walredo/process/protocol.rs new file mode 100644 index 0000000000..b703344cc8 --- /dev/null +++ b/pageserver/src/walredo/process/protocol.rs @@ -0,0 +1,57 @@ +use bytes::BufMut; +use pageserver_api::reltag::RelTag; +use serde::Serialize; +use utils::bin_ser::BeSer; +use utils::lsn::Lsn; + +/// +/// `RelTag` + block number (`blknum`) gives us a unique id of the page in the cluster. +/// +/// In Postgres `BufferTag` structure is used for exactly the same purpose. +/// [See more related comments here](https://github.com/postgres/postgres/blob/99c5852e20a0987eca1c38ba0c09329d4076b6a0/src/include/storage/buf_internals.h#L91). 
+/// +#[derive(Debug, PartialEq, Eq, PartialOrd, Ord, Clone, Copy, Serialize)] +pub(crate) struct BufferTag { + pub rel: RelTag, + pub blknum: u32, +} + +pub(crate) fn build_begin_redo_for_block_msg(tag: BufferTag, buf: &mut Vec) { + let len = 4 + 1 + 4 * 4; + + buf.put_u8(b'B'); + buf.put_u32(len as u32); + + tag.ser_into(buf) + .expect("serialize BufferTag should always succeed"); +} + +pub(crate) fn build_push_page_msg(tag: BufferTag, base_img: &[u8], buf: &mut Vec) { + assert!(base_img.len() == 8192); + + let len = 4 + 1 + 4 * 4 + base_img.len(); + + buf.put_u8(b'P'); + buf.put_u32(len as u32); + tag.ser_into(buf) + .expect("serialize BufferTag should always succeed"); + buf.put(base_img); +} + +pub(crate) fn build_apply_record_msg(endlsn: Lsn, rec: &[u8], buf: &mut Vec) { + let len = 4 + 8 + rec.len(); + + buf.put_u8(b'A'); + buf.put_u32(len as u32); + buf.put_u64(endlsn.0); + buf.put(rec); +} + +pub(crate) fn build_get_page_msg(tag: BufferTag, buf: &mut Vec) { + let len = 4 + 1 + 4 * 4; + + buf.put_u8(b'G'); + buf.put_u32(len as u32); + tag.ser_into(buf) + .expect("serialize BufferTag should always succeed"); +} diff --git a/pgxn/hnsw/hnsw.c b/pgxn/hnsw/hnsw.c index 45bf78ed3b..e624cb831f 100644 --- a/pgxn/hnsw/hnsw.c +++ b/pgxn/hnsw/hnsw.c @@ -149,7 +149,7 @@ hnsw_check_available_memory(Size requested) struct sysinfo si; Size total; if (sysinfo(&si) < 0) - elog(ERROR, "Failed to get amount of RAM: %n"); + elog(ERROR, "Failed to get amount of RAM: %m"); total = si.totalram*si.mem_unit; if ((Size)NBuffers*BLCKSZ + requested >= total) diff --git a/pgxn/neon/Makefile b/pgxn/neon/Makefile index c6b224a14d..0bcb9545a6 100644 --- a/pgxn/neon/Makefile +++ b/pgxn/neon/Makefile @@ -21,7 +21,7 @@ SHLIB_LINK_INTERNAL = $(libpq) SHLIB_LINK = -lcurl EXTENSION = neon -DATA = neon--1.0.sql neon--1.0--1.1.sql +DATA = neon--1.0.sql neon--1.0--1.1.sql neon--1.1--1.2.sql neon--1.2--1.3.sql neon--1.3--1.2.sql neon--1.2--1.1.sql neon--1.1--1.0.sql PGFILEDESC = "neon - cloud 
storage for PostgreSQL" EXTRA_CLEAN = \ diff --git a/pgxn/neon/control_plane_connector.c b/pgxn/neon/control_plane_connector.c index f6f006cba4..93252e6b29 100644 --- a/pgxn/neon/control_plane_connector.c +++ b/pgxn/neon/control_plane_connector.c @@ -35,16 +35,17 @@ #include "utils/memutils.h" #include "utils/jsonb.h" +#include "control_plane_connector.h" +#include "neon_utils.h" + static ProcessUtility_hook_type PreviousProcessUtilityHook = NULL; +static const char *jwt_token = NULL; + /* GUCs */ static char *ConsoleURL = NULL; static bool ForwardDDL = true; -/* Curl structures for sending the HTTP requests */ -static CURL *CurlHandle; -static struct curl_slist *ContentHeader = NULL; - /* * CURL docs say that this buffer must exist until we call curl_easy_cleanup * (which we never do), so we make this a static @@ -113,6 +114,8 @@ ConstructDeltaMessage() if (RootTable.db_table) { JsonbValue dbs; + HASH_SEQ_STATUS status; + DbEntry *entry; dbs.type = jbvString; dbs.val.string.val = "dbs"; @@ -120,9 +123,6 @@ ConstructDeltaMessage() pushJsonbValue(&state, WJB_KEY, &dbs); pushJsonbValue(&state, WJB_BEGIN_ARRAY, NULL); - HASH_SEQ_STATUS status; - DbEntry *entry; - hash_seq_init(&status, RootTable.db_table); while ((entry = hash_seq_search(&status)) != NULL) { @@ -168,8 +168,9 @@ ConstructDeltaMessage() #else const char *logdetail; #endif + char *encrypted_password; PushKeyValue(&state, "password", (char *) entry->password); - char *encrypted_password = get_role_password(entry->name, &logdetail); + encrypted_password = get_role_password(entry->name, &logdetail); if (encrypted_password) { @@ -226,6 +227,8 @@ ErrorWriteCallback(char *ptr, size_t size, size_t nmemb, void *userdata) static void SendDeltasToControlPlane() { + static CURL *handle = NULL; + if (!RootTable.db_table && !RootTable.role_table) return; if (!ConsoleURL) @@ -236,29 +239,57 @@ SendDeltasToControlPlane() if (!ForwardDDL) return; - char *message = ConstructDeltaMessage(); - ErrorString str = {}; + if 
(handle == NULL) + { + struct curl_slist *headers = NULL; - curl_easy_setopt(CurlHandle, CURLOPT_CUSTOMREQUEST, "PATCH"); - curl_easy_setopt(CurlHandle, CURLOPT_HTTPHEADER, ContentHeader); - curl_easy_setopt(CurlHandle, CURLOPT_POSTFIELDS, message); - curl_easy_setopt(CurlHandle, CURLOPT_URL, ConsoleURL); - curl_easy_setopt(CurlHandle, CURLOPT_ERRORBUFFER, CurlErrorBuf); - curl_easy_setopt(CurlHandle, CURLOPT_TIMEOUT, 3L /* seconds */ ); - curl_easy_setopt(CurlHandle, CURLOPT_WRITEDATA, &str); - curl_easy_setopt(CurlHandle, CURLOPT_WRITEFUNCTION, ErrorWriteCallback); + headers = curl_slist_append(headers, "Content-Type: application/json"); + if (headers == NULL) + { + elog(ERROR, "Failed to set Content-Type header"); + } + + if (jwt_token) + { + char auth_header[8192]; + + snprintf(auth_header, sizeof(auth_header), "Authorization: Bearer %s", jwt_token); + headers = curl_slist_append(headers, auth_header); + if (headers == NULL) + { + elog(ERROR, "Failed to set Authorization header"); + } + } + + handle = alloc_curl_handle(); + + curl_easy_setopt(handle, CURLOPT_CUSTOMREQUEST, "PATCH"); + curl_easy_setopt(handle, CURLOPT_HTTPHEADER, headers); + curl_easy_setopt(handle, CURLOPT_URL, ConsoleURL); + curl_easy_setopt(handle, CURLOPT_ERRORBUFFER, CurlErrorBuf); + curl_easy_setopt(handle, CURLOPT_TIMEOUT, 3L /* seconds */ ); + curl_easy_setopt(handle, CURLOPT_WRITEFUNCTION, ErrorWriteCallback); + } + + char *message = ConstructDeltaMessage(); + ErrorString str; + + str.size = 0; + + curl_easy_setopt(handle, CURLOPT_POSTFIELDS, message); + curl_easy_setopt(handle, CURLOPT_WRITEDATA, &str); const int num_retries = 5; - int curl_status; + CURLcode curl_status; for (int i = 0; i < num_retries; i++) { - if ((curl_status = curl_easy_perform(CurlHandle)) == 0) + if ((curl_status = curl_easy_perform(handle)) == 0) break; elog(LOG, "Curl request failed on attempt %d: %s", i, CurlErrorBuf); pg_usleep(1000 * 1000); } - if (curl_status != 0) + if (curl_status != CURLE_OK) { 
elog(ERROR, "Failed to perform curl request: %s", CurlErrorBuf); } @@ -266,13 +297,11 @@ SendDeltasToControlPlane() { long response_code; - if (curl_easy_getinfo(CurlHandle, CURLINFO_RESPONSE_CODE, &response_code) != CURLE_UNKNOWN_OPTION) + if (curl_easy_getinfo(handle, CURLINFO_RESPONSE_CODE, &response_code) != CURLE_UNKNOWN_OPTION) { - bool error_exists = str.size != 0; - if (response_code != 200) { - if (error_exists) + if (str.size != 0) { elog(ERROR, "Received HTTP code %ld from control plane: %s", @@ -803,7 +832,7 @@ NeonProcessUtility( } } -extern void +void InitControlPlaneConnector() { PreviousProcessUtilityHook = ProcessUtility_hook; @@ -835,34 +864,10 @@ InitControlPlaneConnector() NULL, NULL); - const char *jwt_token = getenv("NEON_CONTROL_PLANE_TOKEN"); - + jwt_token = getenv("NEON_CONTROL_PLANE_TOKEN"); if (!jwt_token) { elog(LOG, "Missing NEON_CONTROL_PLANE_TOKEN environment variable, forwarding will not be authenticated"); } - if (curl_global_init(CURL_GLOBAL_DEFAULT)) - { - elog(ERROR, "Failed to initialize curl"); - } - if ((CurlHandle = curl_easy_init()) == NULL) - { - elog(ERROR, "Failed to initialize curl handle"); - } - if ((ContentHeader = curl_slist_append(ContentHeader, "Content-Type: application/json")) == NULL) - { - elog(ERROR, "Failed to initialize content header"); - } - - if (jwt_token) - { - char auth_header[8192]; - - snprintf(auth_header, sizeof(auth_header), "Authorization: Bearer %s", jwt_token); - if ((ContentHeader = curl_slist_append(ContentHeader, auth_header)) == NULL) - { - elog(ERROR, "Failed to initialize authorization header"); - } - } } diff --git a/pgxn/neon/control_plane_connector.h b/pgxn/neon/control_plane_connector.h index 12d6a97562..7eed449200 100644 --- a/pgxn/neon/control_plane_connector.h +++ b/pgxn/neon/control_plane_connector.h @@ -1,6 +1,6 @@ #ifndef CONTROL_PLANE_CONNECTOR_H #define CONTROL_PLANE_CONNECTOR_H -void InitControlPlaneConnector(); +void InitControlPlaneConnector(void); #endif diff --git 
a/pgxn/neon/extension_server.c b/pgxn/neon/extension_server.c index d9a75142f1..e38af08f89 100644 --- a/pgxn/neon/extension_server.c +++ b/pgxn/neon/extension_server.c @@ -14,6 +14,9 @@ #include "utils/guc.h" +#include "extension_server.h" +#include "neon_utils.h" + static int extension_server_port = 0; static download_extension_file_hook_type prev_download_extension_file_hook = NULL; @@ -31,15 +34,18 @@ static download_extension_file_hook_type prev_download_extension_file_hook = NUL static bool neon_download_extension_file_http(const char *filename, bool is_library) { - CURL *curl; + static CURL *handle = NULL; + CURLcode res; char *compute_ctl_url; - char *postdata; bool ret = false; - if ((curl = curl_easy_init()) == NULL) + if (handle == NULL) { - elog(ERROR, "Failed to initialize curl handle"); + handle = alloc_curl_handle(); + + curl_easy_setopt(handle, CURLOPT_CUSTOMREQUEST, "POST"); + curl_easy_setopt(handle, CURLOPT_TIMEOUT, 3L /* seconds */ ); } compute_ctl_url = psprintf("http://localhost:%d/extension_server/%s%s", @@ -47,28 +53,22 @@ neon_download_extension_file_http(const char *filename, bool is_library) elog(LOG, "Sending request to compute_ctl: %s", compute_ctl_url); - curl_easy_setopt(curl, CURLOPT_CUSTOMREQUEST, "POST"); - curl_easy_setopt(curl, CURLOPT_URL, compute_ctl_url); - curl_easy_setopt(curl, CURLOPT_TIMEOUT, 3L /* seconds */ ); + curl_easy_setopt(handle, CURLOPT_URL, compute_ctl_url); - if (curl) + /* Perform the request, res will get the return code */ + res = curl_easy_perform(handle); + /* Check for errors */ + if (res == CURLE_OK) { - /* Perform the request, res will get the return code */ - res = curl_easy_perform(curl); - /* Check for errors */ - if (res == CURLE_OK) - { - ret = true; - } - else - { - /* Don't error here because postgres will try to find the file */ - /* and will fail with some proper error message if it's not found. 
*/ - elog(WARNING, "neon_download_extension_file_http failed: %s\n", curl_easy_strerror(res)); - } - - /* always cleanup */ - curl_easy_cleanup(curl); + ret = true; + } + else + { + /* + * Don't error here because postgres will try to find the file and will + * fail with some proper error message if it's not found. + */ + elog(WARNING, "neon_download_extension_file_http failed: %s\n", curl_easy_strerror(res)); } return ret; diff --git a/pgxn/neon/extension_server.h b/pgxn/neon/extension_server.h new file mode 100644 index 0000000000..3e67708b85 --- /dev/null +++ b/pgxn/neon/extension_server.h @@ -0,0 +1,17 @@ +/*------------------------------------------------------------------------- + * + * extension_server.h + * Request compute_ctl to download extension files. + * + * IDENTIFICATION + * contrib/neon/extension_server.h + * + *------------------------------------------------------------------------- + */ + +#ifndef EXTENSION_SERVER_H +#define EXTENSION_SERVER_H + +void pg_init_extension_server(void); + +#endif /* EXTENSION_SERVER_H */ diff --git a/pgxn/neon/file_cache.c b/pgxn/neon/file_cache.c index 21db666caa..25275ef31f 100644 --- a/pgxn/neon/file_cache.c +++ b/pgxn/neon/file_cache.c @@ -25,6 +25,8 @@ #include "funcapi.h" #include "miscadmin.h" #include "pagestore_client.h" +#include "common/hashfn.h" +#include "lib/hyperloglog.h" #include "pgstat.h" #include "postmaster/bgworker.h" #include RELFILEINFO_HDR @@ -60,6 +62,7 @@ #define BLOCKS_PER_CHUNK 128 /* 1Mb chunk */ #define MB ((uint64)1024*1024) +#define HYPER_LOG_LOG_BIT_WIDTH 10 #define SIZE_MB_TO_CHUNKS(size) ((uint32)((size) * MB / BLCKSZ / BLOCKS_PER_CHUNK)) typedef struct FileCacheEntry @@ -84,6 +87,8 @@ typedef struct FileCacheControl uint64 writes; dlist_head lru; /* double linked list for LRU replacement * algorithm */ + hyperLogLogState wss_estimation; /* estimation of working set size */ + uint8_t hyperloglog_hashes[(1 << HYPER_LOG_LOG_BIT_WIDTH) + 1]; } FileCacheControl; static HTAB *lfc_hash; 
@@ -232,6 +237,14 @@ lfc_shmem_startup(void) lfc_ctl->writes = 0; dlist_init(&lfc_ctl->lru); + /* Initialize hyper-log-log structure for estimating working set size */ + initHyperLogLog(&lfc_ctl->wss_estimation, HYPER_LOG_LOG_BIT_WIDTH); + + /* We need hashes in shared memory */ + pfree(lfc_ctl->wss_estimation.hashesArr); + memset(lfc_ctl->hyperloglog_hashes, 0, sizeof lfc_ctl->hyperloglog_hashes); + lfc_ctl->wss_estimation.hashesArr = lfc_ctl->hyperloglog_hashes; + /* Recreate file cache on restart */ fd = BasicOpenFile(lfc_path, O_RDWR | O_CREAT | O_TRUNC); if (fd < 0) @@ -314,6 +327,9 @@ lfc_change_limit_hook(int newval, void *extra) lfc_ctl->used -= 1; } lfc_ctl->limit = new_size; + if (new_size == 0) { + lfc_ctl->generation += 1; + } neon_log(DEBUG1, "set local file cache limit to %d", new_size); LWLockRelease(lfc_lock); @@ -526,10 +542,16 @@ lfc_read(NRelFileInfo rinfo, ForkNumber forkNum, BlockNumber blkno, } entry = hash_search_with_hash_value(lfc_hash, &tag, hash, HASH_FIND, NULL); + + /* Approximate working set */ + tag.blockNum = blkno; + addHyperLogLog(&lfc_ctl->wss_estimation, hash_bytes((uint8_t const*)&tag, sizeof(tag))); + if (entry == NULL || (entry->bitmap[chunk_offs >> 5] & (1 << (chunk_offs & 31))) == 0) { /* Page is not cached */ lfc_ctl->misses += 1; + pgBufferUsage.file_cache.misses += 1; LWLockRelease(lfc_lock); return false; } @@ -555,6 +577,7 @@ lfc_read(NRelFileInfo rinfo, ForkNumber forkNum, BlockNumber blkno, { Assert(LFC_ENABLED()); lfc_ctl->hits += 1; + pgBufferUsage.file_cache.hits += 1; Assert(entry->access_count > 0); if (--entry->access_count == 0) dlist_push_tail(&lfc_ctl->lru, &entry->lru_node); @@ -962,3 +985,21 @@ local_cache_pages(PG_FUNCTION_ARGS) else SRF_RETURN_DONE(funcctx); } + +PG_FUNCTION_INFO_V1(approximate_working_set_size); + +Datum +approximate_working_set_size(PG_FUNCTION_ARGS) +{ + int32 dc = -1; + if (lfc_size_limit != 0) + { + bool reset = PG_GETARG_BOOL(0); + LWLockAcquire(lfc_lock, reset ? 
LW_EXCLUSIVE : LW_SHARED); + dc = (int32) estimateHyperLogLog(&lfc_ctl->wss_estimation); + if (reset) + memset(lfc_ctl->hyperloglog_hashes, 0, sizeof lfc_ctl->hyperloglog_hashes); + LWLockRelease(lfc_lock); + } + PG_RETURN_INT32(dc); +} diff --git a/pgxn/neon/libpagestore.c b/pgxn/neon/libpagestore.c index 0eb1acbfb0..e31de3c6b5 100644 --- a/pgxn/neon/libpagestore.c +++ b/pgxn/neon/libpagestore.c @@ -316,6 +316,7 @@ pageserver_connect(shardno_t shard_no, int elevel) static uint64_t delay_us = MIN_RECONNECT_INTERVAL_USEC; TimestampTz now; uint64_t us_since_last_connect; + bool broke_from_loop = false; Assert(page_servers[shard_no].conn == NULL); @@ -328,18 +329,14 @@ pageserver_connect(shardno_t shard_no, int elevel) now = GetCurrentTimestamp(); us_since_last_connect = now - last_connect_time; - if (us_since_last_connect < delay_us) + if (us_since_last_connect < MAX_RECONNECT_INTERVAL_USEC) { - pg_usleep(delay_us - us_since_last_connect); + pg_usleep(delay_us); delay_us *= 2; - if (delay_us > MAX_RECONNECT_INTERVAL_USEC) - delay_us = MAX_RECONNECT_INTERVAL_USEC; - last_connect_time = GetCurrentTimestamp(); } else { delay_us = MIN_RECONNECT_INTERVAL_USEC; - last_connect_time = now; } /* @@ -366,6 +363,7 @@ pageserver_connect(shardno_t shard_no, int elevel) values[n] = NULL; n++; conn = PQconnectdbParams(keywords, values, 1); + last_connect_time = GetCurrentTimestamp(); if (PQstatus(conn) == CONNECTION_BAD) { @@ -421,7 +419,9 @@ pageserver_connect(shardno_t shard_no, int elevel) neon_shard_log(shard_no, elevel, "could not complete handshake with pageserver: %s", msg); - return false; + /* Returning from inside PG_TRY is bad, so we break/return later */ + broke_from_loop = true; + break; } } } @@ -434,6 +434,11 @@ pageserver_connect(shardno_t shard_no, int elevel) } PG_END_TRY(); + if (broke_from_loop) + { + return false; + } + neon_shard_log(shard_no, LOG, "libpagestore: connected to '%s'", connstr); page_servers[shard_no].conn = conn; page_servers[shard_no].wes = 
wes; diff --git a/pgxn/neon/neon--1.1--1.0.sql b/pgxn/neon/neon--1.1--1.0.sql new file mode 100644 index 0000000000..e83e3104e8 --- /dev/null +++ b/pgxn/neon/neon--1.1--1.0.sql @@ -0,0 +1,6 @@ +-- the order of operations is important here +-- because the view depends on the function + +DROP VIEW IF EXISTS neon_lfc_stats CASCADE; + +DROP FUNCTION IF EXISTS neon_get_lfc_stats CASCADE; diff --git a/pgxn/neon/neon--1.1--1.2.sql b/pgxn/neon/neon--1.1--1.2.sql new file mode 100644 index 0000000000..5818b4ffe5 --- /dev/null +++ b/pgxn/neon/neon--1.1--1.2.sql @@ -0,0 +1,29 @@ +\echo Use "ALTER EXTENSION neon UPDATE TO '1.2'" to load this file. \quit + +-- Create a convenient view similar to pg_stat_database +-- that exposes all lfc stat values in one row. +CREATE OR REPLACE VIEW NEON_STAT_FILE_CACHE AS + WITH lfc_stats AS ( + SELECT + stat_name, + count + FROM neon_get_lfc_stats() AS t(stat_name text, count bigint) + ), + lfc_values AS ( + SELECT + MAX(CASE WHEN stat_name = 'file_cache_misses' THEN count ELSE NULL END) AS file_cache_misses, + MAX(CASE WHEN stat_name = 'file_cache_hits' THEN count ELSE NULL END) AS file_cache_hits, + MAX(CASE WHEN stat_name = 'file_cache_used' THEN count ELSE NULL END) AS file_cache_used, + MAX(CASE WHEN stat_name = 'file_cache_writes' THEN count ELSE NULL END) AS file_cache_writes, + -- Calculate the file_cache_hit_ratio within the same CTE for simplicity + CASE + WHEN MAX(CASE WHEN stat_name = 'file_cache_misses' THEN count ELSE 0 END) + MAX(CASE WHEN stat_name = 'file_cache_hits' THEN count ELSE 0 END) = 0 THEN NULL + ELSE ROUND((MAX(CASE WHEN stat_name = 'file_cache_hits' THEN count ELSE 0 END)::DECIMAL / + (MAX(CASE WHEN stat_name = 'file_cache_hits' THEN count ELSE 0 END) + MAX(CASE WHEN stat_name = 'file_cache_misses' THEN count ELSE 0 END))) * 100, 2) + END AS file_cache_hit_ratio + FROM lfc_stats + ) +SELECT file_cache_misses, file_cache_hits, file_cache_used, file_cache_writes, file_cache_hit_ratio from lfc_values; + +-- 
externalize the view to all users in role pg_monitor +GRANT SELECT ON NEON_STAT_FILE_CACHE TO PG_MONITOR; \ No newline at end of file diff --git a/pgxn/neon/neon--1.2--1.1.sql b/pgxn/neon/neon--1.2--1.1.sql new file mode 100644 index 0000000000..c9f6a40f73 --- /dev/null +++ b/pgxn/neon/neon--1.2--1.1.sql @@ -0,0 +1 @@ +DROP VIEW IF EXISTS NEON_STAT_FILE_CACHE CASCADE; diff --git a/pgxn/neon/neon--1.2--1.3.sql b/pgxn/neon/neon--1.2--1.3.sql new file mode 100644 index 0000000000..9583008777 --- /dev/null +++ b/pgxn/neon/neon--1.2--1.3.sql @@ -0,0 +1,9 @@ +\echo Use "ALTER EXTENSION neon UPDATE TO '1.3'" to load this file. \quit + +CREATE FUNCTION approximate_working_set_size(reset bool) +RETURNS integer +AS 'MODULE_PATHNAME', 'approximate_working_set_size' +LANGUAGE C PARALLEL SAFE; + +GRANT EXECUTE ON FUNCTION approximate_working_set_size(bool) TO pg_monitor; + diff --git a/pgxn/neon/neon--1.3--1.2.sql b/pgxn/neon/neon--1.3--1.2.sql new file mode 100644 index 0000000000..2733a15c75 --- /dev/null +++ b/pgxn/neon/neon--1.3--1.2.sql @@ -0,0 +1 @@ +DROP FUNCTION IF EXISTS approximate_working_set_size(bool) CASCADE; diff --git a/pgxn/neon/neon.c b/pgxn/neon/neon.c index b930fdb3ca..6ede78a576 100644 --- a/pgxn/neon/neon.c +++ b/pgxn/neon/neon.c @@ -11,17 +11,25 @@ #include "postgres.h" #include "fmgr.h" +#include "miscadmin.h" #include "access/xact.h" #include "access/xlog.h" #include "storage/buf_internals.h" #include "storage/bufmgr.h" #include "catalog/pg_type.h" +#include "postmaster/bgworker.h" +#include "postmaster/interrupt.h" +#include "replication/slot.h" #include "replication/walsender.h" +#include "storage/procsignal.h" +#include "tcop/tcopprot.h" #include "funcapi.h" #include "access/htup_details.h" #include "utils/pg_lsn.h" #include "utils/guc.h" +#include "utils/wait_event.h" +#include "extension_server.h" #include "neon.h" #include "walproposer.h" #include "pagestore_client.h" @@ -30,6 +38,234 @@ PG_MODULE_MAGIC; void _PG_init(void); +static int 
logical_replication_max_snap_files = 300; +bool primary_is_running = false; + +static void +InitLogicalReplicationMonitor(void) +{ + BackgroundWorker bgw; + + DefineCustomIntVariable( + "neon.logical_replication_max_snap_files", + "Maximum allowed logical replication .snap files", + NULL, + &logical_replication_max_snap_files, + 300, 0, INT_MAX, + PGC_SIGHUP, + 0, + NULL, NULL, NULL); + + memset(&bgw, 0, sizeof(bgw)); + bgw.bgw_flags = BGWORKER_SHMEM_ACCESS; + bgw.bgw_start_time = BgWorkerStart_RecoveryFinished; + snprintf(bgw.bgw_library_name, BGW_MAXLEN, "neon"); + snprintf(bgw.bgw_function_name, BGW_MAXLEN, "LogicalSlotsMonitorMain"); + snprintf(bgw.bgw_name, BGW_MAXLEN, "Logical replication monitor"); + snprintf(bgw.bgw_type, BGW_MAXLEN, "Logical replication monitor"); + bgw.bgw_restart_time = 5; + bgw.bgw_notify_pid = 0; + bgw.bgw_main_arg = (Datum) 0; + + RegisterBackgroundWorker(&bgw); +} + +static int +LsnDescComparator(const void *a, const void *b) +{ + XLogRecPtr lsn1 = *((const XLogRecPtr *) a); + XLogRecPtr lsn2 = *((const XLogRecPtr *) b); + + if (lsn1 < lsn2) + return 1; + else if (lsn1 == lsn2) + return 0; + else + return -1; +} + +/* + * Look at .snap files and calculate minimum allowed restart_lsn of slot so that + * next gc would leave not more than logical_replication_max_snap_files; all + * slots having lower restart_lsn should be dropped. 
+ */ +static XLogRecPtr +get_num_snap_files_lsn_threshold(void) +{ + DIR *dirdesc; + struct dirent *de; + char *snap_path = "pg_logical/snapshots/"; + int lsns_allocated = 1024; + int lsns_num = 0; + XLogRecPtr *lsns; + XLogRecPtr cutoff; + + if (logical_replication_max_snap_files <= 0) /* limit disabled; 0 must not reach lsns[limit - 1] below */ + return 0; + + lsns = palloc(sizeof(XLogRecPtr) * lsns_allocated); + + /* find all .snap files and get their lsns */ + dirdesc = AllocateDir(snap_path); + while ((de = ReadDir(dirdesc, snap_path)) != NULL) + { + XLogRecPtr lsn; + uint32 hi; + uint32 lo; + + if (strcmp(de->d_name, ".") == 0 || + strcmp(de->d_name, "..") == 0) + continue; + + if (sscanf(de->d_name, "%X-%X.snap", &hi, &lo) != 2) + { + ereport(LOG, + (errmsg("could not parse file name as .snap file \"%s\"", de->d_name))); + continue; + } + + lsn = ((uint64) hi) << 32 | lo; + elog(DEBUG5, "found snap file %X/%X", LSN_FORMAT_ARGS(lsn)); + if (lsns_allocated == lsns_num) + { + lsns_allocated *= 2; + lsns = repalloc(lsns, sizeof(XLogRecPtr) * lsns_allocated); + } + lsns[lsns_num++] = lsn; + } + /* sort by lsn desc */ + qsort(lsns, lsns_num, sizeof(XLogRecPtr), LsnDescComparator); + /* and take cutoff at logical_replication_max_snap_files */ + if (logical_replication_max_snap_files > lsns_num) + cutoff = 0; + /* have less files than cutoff */ + else + { + cutoff = lsns[logical_replication_max_snap_files - 1]; + elog(LOG, "ls_monitor: dropping logical slots with restart_lsn lower %X/%X, found %d .snap files, limit is %d", + LSN_FORMAT_ARGS(cutoff), lsns_num, logical_replication_max_snap_files); + } + pfree(lsns); + FreeDir(dirdesc); + return cutoff; +} + +#define LS_MONITOR_CHECK_INTERVAL 10000 /* ms */ + +/* + * Unused logical replication slots pin WAL and prevent deletion of snapshots. + * WAL bloat is guarded by max_slot_wal_keep_size; this bgw removes slots which + * need too many .snap files. 
+ */ +PGDLLEXPORT void +LogicalSlotsMonitorMain(Datum main_arg) +{ + /* Establish signal handlers. 
+ */ + pqsignal(SIGUSR1, procsignal_sigusr1_handler); + pqsignal(SIGHUP, SignalHandlerForConfigReload); + pqsignal(SIGTERM, die); + + BackgroundWorkerUnblockSignals(); + + for (;;) + { + XLogRecPtr cutoff_lsn; + + /* + * If there are too many .snap files, just drop all logical slots to + * prevent aux files bloat. + */ + cutoff_lsn = get_num_snap_files_lsn_threshold(); + if (cutoff_lsn > 0) + { + for (int i = 0; i < max_replication_slots; i++) + { + char slot_name[NAMEDATALEN]; + ReplicationSlot *s = &ReplicationSlotCtl->replication_slots[i]; + XLogRecPtr restart_lsn; + + /* find the name */ + LWLockAcquire(ReplicationSlotControlLock, LW_SHARED); + /* Consider only logical replication slots */ + if (!s->in_use || !SlotIsLogical(s)) + { + LWLockRelease(ReplicationSlotControlLock); + continue; + } + + /* do we need to drop it? */ + SpinLockAcquire(&s->mutex); + restart_lsn = s->data.restart_lsn; + SpinLockRelease(&s->mutex); + if (restart_lsn >= cutoff_lsn) + { + LWLockRelease(ReplicationSlotControlLock); + continue; + } + + strlcpy(slot_name, s->data.name.data, NAMEDATALEN); + elog(LOG, "ls_monitor: dropping slot %s with restart_lsn %X/%X below horizon %X/%X", + slot_name, LSN_FORMAT_ARGS(restart_lsn), LSN_FORMAT_ARGS(cutoff_lsn)); + LWLockRelease(ReplicationSlotControlLock); + + /* now try to drop it, killing owner before if any */ + for (;;) + { + pid_t active_pid; + + SpinLockAcquire(&s->mutex); + active_pid = s->active_pid; + SpinLockRelease(&s->mutex); + + if (active_pid == 0) + { + /* + * Slot is released, try to drop it. Though of course + * it could have been reacquired, so drop can ERROR + * out. Similarly it could have been dropped in the + * meanwhile. + * + * In principle we could remove pg_try/pg_catch, that + * would restart the whole bgworker. 
+ */ + ConditionVariableCancelSleep(); + PG_TRY(); + { + ReplicationSlotDrop(slot_name, true); + elog(LOG, "ls_monitor: slot %s dropped", slot_name); + } + PG_CATCH(); + { + /* log ERROR and reset elog stack */ + EmitErrorReport(); + FlushErrorState(); + elog(LOG, "ls_monitor: failed to drop slot %s", slot_name); + } + PG_END_TRY(); + break; + } + else + { + /* kill the owner and wait for release */ + elog(LOG, "ls_monitor: killing slot %s owner %d", slot_name, active_pid); + (void) kill(active_pid, SIGTERM); + /* We shouldn't get stuck, but to be safe add timeout. */ + ConditionVariableTimedSleep(&s->active_cv, 1000, WAIT_EVENT_REPLICATION_SLOT_DROP); + } + } + } + } + + (void) WaitLatch(MyLatch, + WL_LATCH_SET | WL_EXIT_ON_PM_DEATH | WL_TIMEOUT, + LS_MONITOR_CHECK_INTERVAL, + PG_WAIT_EXTENSION); + ResetLatch(MyLatch); + CHECK_FOR_INTERRUPTS(); + } +} + + void _PG_init(void) { @@ -44,10 +280,21 @@ _PG_init(void) pg_init_libpagestore(); pg_init_walproposer(); + InitLogicalReplicationMonitor(); + InitControlPlaneConnector(); pg_init_extension_server(); + DefineCustomBoolVariable( + "neon.primary_is_running", + "true if the primary was running at replica startup. 
false otherwise", + NULL, + &primary_is_running, + false, + PGC_POSTMASTER, + 0, + NULL, NULL, NULL); /* * Important: This must happen after other parts of the extension are * loaded, otherwise any settings to GUCs that were set before the diff --git a/pgxn/neon/neon.control b/pgxn/neon/neon.control index 4e4cb9f372..cee2f336f2 100644 --- a/pgxn/neon/neon.control +++ b/pgxn/neon/neon.control @@ -1,5 +1,6 @@ # neon extension comment = 'cloud storage for PostgreSQL' -default_version = '1.1' +default_version = '1.3' module_pathname = '$libdir/neon' relocatable = true +trusted = true diff --git a/pgxn/neon/neon.h b/pgxn/neon/neon.h index c3afecc679..a0f8c97497 100644 --- a/pgxn/neon/neon.h +++ b/pgxn/neon/neon.h @@ -25,12 +25,11 @@ extern int wal_acceptor_connection_timeout; extern void pg_init_libpagestore(void); extern void pg_init_walproposer(void); -extern void pg_init_extension_server(void); - extern uint64 BackpressureThrottlingTime(void); extern void replication_feedback_get_lsns(XLogRecPtr *writeLsn, XLogRecPtr *flushLsn, XLogRecPtr *applyLsn); extern void PGDLLEXPORT WalProposerSync(int argc, char *argv[]); extern void PGDLLEXPORT WalProposerMain(Datum main_arg); +PGDLLEXPORT void LogicalSlotsMonitorMain(Datum main_arg); #endif /* NEON_H */ diff --git a/pgxn/neon/neon_utils.c b/pgxn/neon/neon_utils.c index 9135847aaf..1fb4ed9522 100644 --- a/pgxn/neon/neon_utils.c +++ b/pgxn/neon/neon_utils.c @@ -1,8 +1,12 @@ - #include +#ifndef WALPROPOSER_LIB +#include +#endif + #include "postgres.h" +#include "neon_utils.h" #include "lib/stringinfo.h" #include "libpq/pqformat.h" @@ -11,7 +15,7 @@ * * Returns -1 if the character is not a hexadecimal digit. 
*/ -int +static int HexDecodeChar(char c) { if (c >= '0' && c <= '9') @@ -114,3 +118,48 @@ disable_core_dump() fprintf(stderr, "WARNING: disable cores setrlimit failed: %s", strerror(save_errno)); } } + +#ifndef WALPROPOSER_LIB + +/* + * On macOS with a libcurl that has IPv6 support, curl_global_init() calls + * SCDynamicStoreCopyProxies(), which makes the program multithreaded. An ideal + * place to call curl_global_init() would be _PG_init(), but Neon has to be + * added to shared_preload_libraries, which are loaded in the Postmaster + * process. The Postmaster is not supposed to become multithreaded at any point + * in its lifecycle. Postgres doesn't have any good hook that I know of to + * initialize per-backend structures, so we have to check this on any + * allocation of a CURL handle. + * + * Free the allocated CURL handle with curl_easy_cleanup(3). + * + * https://developer.apple.com/documentation/systemconfiguration/1517088-scdynamicstorecopyproxies + */ +CURL * +alloc_curl_handle(void) +{ + static bool curl_initialized = false; + + CURL *handle; + + if (unlikely(!curl_initialized)) + { + /* Protected by mutex internally */ + if (curl_global_init(CURL_GLOBAL_DEFAULT)) + { + elog(ERROR, "Failed to initialize curl"); + } + + curl_initialized = true; + } + + handle = curl_easy_init(); + if (handle == NULL) + { + elog(ERROR, "Failed to initialize curl handle"); + } + + return handle; +} + +#endif diff --git a/pgxn/neon/neon_utils.h b/pgxn/neon/neon_utils.h index a86f1e061c..89683714f1 100644 --- a/pgxn/neon/neon_utils.h +++ b/pgxn/neon/neon_utils.h @@ -1,11 +1,23 @@ #ifndef __NEON_UTILS_H__ #define __NEON_UTILS_H__ +#include "lib/stringinfo.h" + +#ifndef WALPROPOSER_LIB +#include +#endif + bool HexDecodeString(uint8 *result, char *input, int nbytes); uint32 pq_getmsgint32_le(StringInfo msg); uint64 pq_getmsgint64_le(StringInfo msg); void pq_sendint32_le(StringInfo buf, uint32 i); void pq_sendint64_le(StringInfo buf, uint64 i); -extern void disable_core_dump(); 
+void disable_core_dump(void); + +#ifndef WALPROPOSER_LIB + +CURL * alloc_curl_handle(void); + +#endif #endif /* __NEON_UTILS_H__ */ diff --git a/pgxn/neon/pagestore_client.h b/pgxn/neon/pagestore_client.h index 8c02f357bc..2889ffacae 100644 --- a/pgxn/neon/pagestore_client.h +++ b/pgxn/neon/pagestore_client.h @@ -15,6 +15,7 @@ #include "neon_pgversioncompat.h" +#include "access/slru.h" #include "access/xlogdefs.h" #include RELFILEINFO_HDR #include "lib/stringinfo.h" @@ -34,6 +35,7 @@ typedef enum T_NeonNblocksRequest, T_NeonGetPageRequest, T_NeonDbSizeRequest, + T_NeonGetSlruSegmentRequest, /* pagestore -> pagestore_client */ T_NeonExistsResponse = 100, @@ -41,6 +43,7 @@ typedef enum T_NeonGetPageResponse, T_NeonErrorResponse, T_NeonDbSizeResponse, + T_NeonGetSlruSegmentResponse, } NeonMessageTag; /* base struct for c-style inheritance */ @@ -59,6 +62,13 @@ typedef struct (errmsg(NEON_TAG "[shard %d] " fmt, shard_no, ##__VA_ARGS__), \ errhidestmt(true), errhidecontext(true), errposition(0), internalerrposition(0))) +/* SLRUs downloadable from page server */ +typedef enum { + SLRU_CLOG, + SLRU_MULTIXACT_MEMBERS, + SLRU_MULTIXACT_OFFSETS +} SlruKind; + /* * supertype of all the Neon*Request structs below * @@ -101,6 +111,13 @@ typedef struct BlockNumber blkno; } NeonGetPageRequest; +typedef struct +{ + NeonRequest req; + SlruKind kind; + int segno; +} NeonGetSlruSegmentRequest; + /* supertype of all the Neon*Response structs below */ typedef struct { @@ -140,6 +157,14 @@ typedef struct * message */ } NeonErrorResponse; +typedef struct +{ + NeonMessageTag tag; + int n_blocks; + char data[BLCKSZ * SLRU_PAGES_PER_SEGMENT]; +} NeonGetSlruSegmentResponse; + + extern StringInfoData nm_pack_request(NeonRequest *msg); extern NeonResponse *nm_unpack_response(StringInfo s); extern char *nm_to_string(NeonMessage *msg); diff --git a/pgxn/neon/pagestore_smgr.c b/pgxn/neon/pagestore_smgr.c index 1fa802e6f4..0256de2b9a 100644 --- a/pgxn/neon/pagestore_smgr.c +++ 
b/pgxn/neon/pagestore_smgr.c @@ -1043,12 +1043,25 @@ nm_pack_request(NeonRequest *msg) break; } + case T_NeonGetSlruSegmentRequest: + { + NeonGetSlruSegmentRequest *msg_req = (NeonGetSlruSegmentRequest *) msg; + + pq_sendbyte(&s, msg_req->req.latest); + pq_sendint64(&s, msg_req->req.lsn); + pq_sendbyte(&s, msg_req->kind); + pq_sendint32(&s, msg_req->segno); + + break; + } + /* pagestore -> pagestore_client. We never need to create these. */ case T_NeonExistsResponse: case T_NeonNblocksResponse: case T_NeonGetPageResponse: case T_NeonErrorResponse: case T_NeonDbSizeResponse: + case T_NeonGetSlruSegmentResponse: default: neon_log(ERROR, "unexpected neon message tag 0x%02x", msg->tag); break; @@ -1135,6 +1148,20 @@ nm_unpack_response(StringInfo s) break; } + case T_NeonGetSlruSegmentResponse: + { + NeonGetSlruSegmentResponse *msg_resp; + int n_blocks = pq_getmsgint(s, 4); + msg_resp = palloc(sizeof(NeonGetSlruSegmentResponse)); + msg_resp->tag = tag; + msg_resp->n_blocks = n_blocks; + memcpy(msg_resp->data, pq_getmsgbytes(s, n_blocks * BLCKSZ), n_blocks * BLCKSZ); + pq_getmsgend(s); + + resp = (NeonResponse *) msg_resp; + break; + } + /* * pagestore_client -> pagestore * @@ -1144,6 +1171,7 @@ nm_unpack_response(StringInfo s) case T_NeonNblocksRequest: case T_NeonGetPageRequest: case T_NeonDbSizeRequest: + case T_NeonGetSlruSegmentRequest: default: neon_log(ERROR, "unexpected neon message tag 0x%02x", tag); break; @@ -1213,7 +1241,18 @@ nm_to_string(NeonMessage *msg) appendStringInfoChar(&s, '}'); break; } + case T_NeonGetSlruSegmentRequest: + { + NeonGetSlruSegmentRequest *msg_req = (NeonGetSlruSegmentRequest *) msg; + appendStringInfoString(&s, "{\"type\": \"NeonGetSlruSegmentRequest\""); + appendStringInfo(&s, ", \"kind\": %u", msg_req->kind); + appendStringInfo(&s, ", \"segno\": %u", msg_req->segno); + appendStringInfo(&s, ", \"lsn\": \"%X/%X\"", LSN_FORMAT_ARGS(msg_req->req.lsn)); + appendStringInfo(&s, ", \"latest\": %d", msg_req->req.latest); + 
appendStringInfoChar(&s, '}'); + break; + } /* pagestore -> pagestore_client */ case T_NeonExistsResponse: { @@ -1267,6 +1306,17 @@ nm_to_string(NeonMessage *msg) msg_resp->db_size); appendStringInfoChar(&s, '}'); + break; + } + case T_NeonGetSlruSegmentResponse: + { + NeonGetSlruSegmentResponse *msg_resp = (NeonGetSlruSegmentResponse *) msg; + + appendStringInfoString(&s, "{\"type\": \"NeonGetSlruSegmentResponse\""); + appendStringInfo(&s, ", \"n_blocks\": %u}", + msg_resp->n_blocks); + appendStringInfoChar(&s, '}'); + break; } @@ -1838,7 +1888,6 @@ neon_zeroextend(SMgrRelation reln, ForkNumber forkNum, BlockNumber blocknum, int nblocks, bool skipFsync) { const PGAlignedBlock buffer = {0}; - BlockNumber curblocknum = blocknum; int remblocks = nblocks; XLogRecPtr lsn = 0; @@ -2739,6 +2788,74 @@ neon_end_unlogged_build(SMgrRelation reln) unlogged_build_phase = UNLOGGED_BUILD_NOT_IN_PROGRESS; } +#define STRPREFIX(str, prefix) (strncmp(str, prefix, strlen(prefix)) == 0) + +static int +neon_read_slru_segment(SMgrRelation reln, const char* path, int segno, void* buffer) +{ + XLogRecPtr request_lsn; + /* + * GetRedoStartLsn() returns LSN of basebackup. + * We need to download SLRU segments only once after node startup, + * then SLRUs are maintained locally. 
+ */ + request_lsn = GetRedoStartLsn(); + request_lsn = nm_adjust_lsn(request_lsn); + SlruKind kind; + + if (STRPREFIX(path, "pg_xact")) + kind = SLRU_CLOG; + else if (STRPREFIX(path, "pg_multixact/members")) + kind = SLRU_MULTIXACT_MEMBERS; + else if (STRPREFIX(path, "pg_multixact/offsets")) + kind = SLRU_MULTIXACT_OFFSETS; + else + return -1; + + NeonResponse *resp; + NeonGetSlruSegmentRequest request = { + .req.tag = T_NeonGetSlruSegmentRequest, + .req.latest = false, + .req.lsn = request_lsn, + + .kind = kind, + .segno = segno + }; + int n_blocks; + shardno_t shard_no = 0; /* All SLRUs are at shard 0 */ + do + { + while (!page_server->send(shard_no, &request.req) || !page_server->flush(shard_no)); + consume_prefetch_responses(); + resp = page_server->receive(shard_no); + } while (resp == NULL); + + switch (resp->tag) + { + case T_NeonGetSlruSegmentResponse: + n_blocks = ((NeonGetSlruSegmentResponse *) resp)->n_blocks; + memcpy(buffer, ((NeonGetSlruSegmentResponse *) resp)->data, n_blocks*BLCKSZ); + break; + + case T_NeonErrorResponse: + ereport(ERROR, + (errcode(ERRCODE_IO_ERROR), + errmsg(NEON_TAG "could not read SLRU %d segment %d at lsn %X/%08X", + kind, + segno, + LSN_FORMAT_ARGS(request_lsn)), + errdetail("page server returned error: %s", + ((NeonErrorResponse *) resp)->message))); + break; + + default: + neon_log(ERROR, "unexpected response from page server with tag 0x%02x", resp->tag); + } + pfree(resp); + + return n_blocks; +} + static void AtEOXact_neon(XactEvent event, void *arg) { @@ -2797,6 +2914,8 @@ static const struct f_smgr neon_smgr = .smgr_start_unlogged_build = neon_start_unlogged_build, .smgr_finish_unlogged_build_phase_1 = neon_finish_unlogged_build_phase_1, .smgr_end_unlogged_build = neon_end_unlogged_build, + + .smgr_read_slru_segment = neon_read_slru_segment, }; const f_smgr * @@ -2959,14 +3078,6 @@ neon_redo_read_buffer_filter(XLogReaderState *record, uint8 block_id) XLogRecGetBlockTag(record, block_id, &rinfo, &forknum, &blkno); #endif 
- /* - * Out of an abundance of caution, we always run redo on shared catalogs, - * regardless of whether the block is stored in shared buffers. See also - * this function's top comment. - */ - if (!OidIsValid(NInfoGetDbOid(rinfo))) - return false; - CopyNRelFileInfoToBufTag(tag, rinfo); tag.forkNum = forknum; tag.blockNum = blkno; @@ -2980,17 +3091,28 @@ neon_redo_read_buffer_filter(XLogReaderState *record, uint8 block_id) */ LWLockAcquire(partitionLock, LW_SHARED); - /* Try to find the relevant buffer */ - buffer = BufTableLookup(&tag, hash); - - no_redo_needed = buffer < 0; + /* + * Out of an abundance of caution, we always run redo on shared catalogs, + * regardless of whether the block is stored in shared buffers. See also + * this function's top comment. + */ + if (!OidIsValid(NInfoGetDbOid(rinfo))) + { + no_redo_needed = false; + } + else + { + /* Try to find the relevant buffer */ + buffer = BufTableLookup(&tag, hash); + no_redo_needed = buffer < 0; + } /* In both cases st lwlsn past this WAL record */ SetLastWrittenLSNForBlock(end_recptr, rinfo, forknum, blkno); /* * we don't have the buffer in memory, update lwLsn past this record, also - * evict page fro file cache + * evict page from file cache */ if (no_redo_needed) lfc_evict(rinfo, forknum, blkno); diff --git a/pgxn/neon/walproposer.c b/pgxn/neon/walproposer.c index 171af7d2aa..9ff0493352 100644 --- a/pgxn/neon/walproposer.c +++ b/pgxn/neon/walproposer.c @@ -688,7 +688,7 @@ RecvAcceptorGreeting(Safekeeper *sk) if (!AsyncReadMessage(sk, (AcceptorProposerMessage *) &sk->greetResponse)) return; - wp_log(LOG, "received AcceptorGreeting from safekeeper %s:%s", sk->host, sk->port); + wp_log(LOG, "received AcceptorGreeting from safekeeper %s:%s, term=" INT64_FORMAT, sk->host, sk->port, sk->greetResponse.term); /* Protocol is all good, move to voting. 
*/ sk->state = SS_VOTING; @@ -922,6 +922,7 @@ static void DetermineEpochStartLsn(WalProposer *wp) { TermHistory *dth; + int n_ready = 0; wp->propEpochStartLsn = InvalidXLogRecPtr; wp->donorEpoch = 0; @@ -932,6 +933,8 @@ DetermineEpochStartLsn(WalProposer *wp) { if (wp->safekeeper[i].state == SS_IDLE) { + n_ready++; + if (GetEpoch(&wp->safekeeper[i]) > wp->donorEpoch || (GetEpoch(&wp->safekeeper[i]) == wp->donorEpoch && wp->safekeeper[i].voteResponse.flushLsn > wp->propEpochStartLsn)) @@ -958,6 +961,16 @@ DetermineEpochStartLsn(WalProposer *wp) } } + if (n_ready < wp->quorum) + { + /* + * This is a rare case that can be triggered if safekeeper has voted and disconnected. + * In this case, its state will not be SS_IDLE and its vote cannot be used, because + * we clean up `voteResponse` in `ShutdownConnection`. + */ + wp_log(FATAL, "missing majority of votes, collected %d, expected %d, got %d", wp->n_votes, wp->quorum, n_ready); + } + /* * If propEpochStartLsn is 0, it means flushLsn is 0 everywhere, we are bootstrapping * and nothing was committed yet. Start streaming then from the basebackup LSN. @@ -1207,7 +1220,7 @@ PrepareAppendRequest(WalProposer *wp, AppendRequestHeader *req, XLogRecPtr begin req->epochStartLsn = wp->propEpochStartLsn; req->beginLsn = beginLsn; req->endLsn = endLsn; - req->commitLsn = GetAcknowledgedByQuorumWALPosition(wp); + req->commitLsn = wp->commitLsn; req->truncateLsn = wp->truncateLsn; req->proposerId = wp->greetRequest.proposerId; } @@ -1392,7 +1405,7 @@ static bool RecvAppendResponses(Safekeeper *sk) { WalProposer *wp = sk->wp; - XLogRecPtr minQuorumLsn; + XLogRecPtr newCommitLsn; bool readAnything = false; while (true) @@ -1431,23 +1444,24 @@ RecvAppendResponses(Safekeeper *sk) if (!readAnything) return sk->state == SS_ACTIVE; - HandleSafekeeperResponse(wp); - + /* update commit_lsn */ + newCommitLsn = GetAcknowledgedByQuorumWALPosition(wp); /* - * Also send the new commit lsn to all the safekeepers. 
+ * Send the new value to all safekeepers. */ - minQuorumLsn = GetAcknowledgedByQuorumWALPosition(wp); - if (minQuorumLsn > wp->lastSentCommitLsn) + if (newCommitLsn > wp->commitLsn) { + wp->commitLsn = newCommitLsn; BroadcastAppendRequest(wp); - wp->lastSentCommitLsn = minQuorumLsn; } + HandleSafekeeperResponse(wp); + return sk->state == SS_ACTIVE; } /* Parse a PageserverFeedback message, or the PageserverFeedback part of an AppendResponse */ -void +static void ParsePageserverFeedbackMessage(WalProposer *wp, StringInfo reply_message, PageserverFeedback *rf) { uint8 nkeys; @@ -1577,9 +1591,9 @@ GetAcknowledgedByQuorumWALPosition(WalProposer *wp) Safekeeper * GetDonor(WalProposer *wp, XLogRecPtr *donor_lsn) { - *donor_lsn = InvalidXLogRecPtr; Safekeeper *donor = NULL; int i; + *donor_lsn = InvalidXLogRecPtr; if (wp->n_votes < wp->quorum) { @@ -1619,11 +1633,9 @@ GetDonor(WalProposer *wp, XLogRecPtr *donor_lsn) static void HandleSafekeeperResponse(WalProposer *wp) { - XLogRecPtr minQuorumLsn; XLogRecPtr candidateTruncateLsn; - minQuorumLsn = GetAcknowledgedByQuorumWALPosition(wp); - wp->api.process_safekeeper_feedback(wp, minQuorumLsn); + wp->api.process_safekeeper_feedback(wp); /* * Try to advance truncateLsn -- the last record flushed to all @@ -1636,7 +1648,7 @@ HandleSafekeeperResponse(WalProposer *wp) * can't commit entries from previous term' in Raft); 2) */ candidateTruncateLsn = CalculateMinFlushLsn(wp); - candidateTruncateLsn = Min(candidateTruncateLsn, minQuorumLsn); + candidateTruncateLsn = Min(candidateTruncateLsn, wp->commitLsn); if (candidateTruncateLsn > wp->truncateLsn) { wp->truncateLsn = candidateTruncateLsn; diff --git a/pgxn/neon/walproposer.h b/pgxn/neon/walproposer.h index 688d8e6e52..bc674fd979 100644 --- a/pgxn/neon/walproposer.h +++ b/pgxn/neon/walproposer.h @@ -486,6 +486,8 @@ typedef struct walproposer_api * * On success, the data is placed in *buf. It is valid until the next call * to this function. 
+ * + * Returns PG_ASYNC_READ_FAIL on closed connection. */ PGAsyncReadResult (*conn_async_read) (Safekeeper *sk, char **buf, int *amount); @@ -532,6 +534,13 @@ typedef struct walproposer_api * Returns 0 if timeout is reached, 1 if some event happened. Updates * events mask to indicate events and sets sk to the safekeeper which has * an event. + * + * On timeout, events is set to WL_NO_EVENTS. On socket event, events is + * set to WL_SOCKET_READABLE and/or WL_SOCKET_WRITEABLE. When socket is + * closed, events is set to WL_SOCKET_READABLE. + * + * WL_SOCKET_WRITEABLE is usually set only when we need to flush the buffer. + * It can be returned only if caller asked for this event in the last *_event_set call. */ int (*wait_event_set) (WalProposer *wp, long timeout, Safekeeper **sk, uint32 *events); @@ -555,7 +564,7 @@ typedef struct walproposer_api * backpressure feedback and to confirm WAL persistence (has been commited * on the quorum of safekeepers). */ - void (*process_safekeeper_feedback) (WalProposer *wp, XLogRecPtr commitLsn); + void (*process_safekeeper_feedback) (WalProposer *wp); /* * Write a log message to the internal log processor. 
This is used only @@ -637,8 +646,8 @@ typedef struct WalProposer /* WAL has been generated up to this point */ XLogRecPtr availableLsn; - /* last commitLsn broadcasted to safekeepers */ - XLogRecPtr lastSentCommitLsn; + /* cached GetAcknowledgedByQuorumWALPosition result */ + XLogRecPtr commitLsn; ProposerGreeting greetRequest; diff --git a/pgxn/neon/walproposer_pg.c b/pgxn/neon/walproposer_pg.c index 61a2a54809..8eec2f02c1 100644 --- a/pgxn/neon/walproposer_pg.c +++ b/pgxn/neon/walproposer_pg.c @@ -68,6 +68,8 @@ static WalproposerShmemState *walprop_shared; static WalProposerConfig walprop_config; static XLogRecPtr sentPtr = InvalidXLogRecPtr; static const walproposer_api walprop_pg; +static volatile sig_atomic_t got_SIGUSR2 = false; +static bool reported_sigusr2 = false; static void nwp_shmem_startup_hook(void); static void nwp_register_gucs(void); @@ -101,6 +103,8 @@ static void add_nwr_event_set(Safekeeper *sk, uint32 events); static void update_nwr_event_set(Safekeeper *sk, uint32 events); static void rm_safekeeper_event_set(Safekeeper *to_remove, bool is_sk); +static void CheckGracefulShutdown(WalProposer *wp); + static XLogRecPtr GetLogRepRestartLSN(WalProposer *wp); static void @@ -398,7 +402,7 @@ walprop_pg_get_shmem_state(WalProposer *wp) return walprop_shared; } -void +static void replication_feedback_set(PageserverFeedback *rf) { SpinLockAcquire(&walprop_shared->mutex); @@ -492,6 +496,24 @@ walprop_pg_init_standalone_sync_safekeepers(void) BackgroundWorkerUnblockSignals(); } +/* + * We pretend to be a walsender process, and the lifecycle of a walsender is + * slightly different than other procesess. At shutdown, walsender processes + * stay alive until the very end, after the checkpointer has written the + * shutdown checkpoint. When the checkpointer exits, the postmaster sends all + * remaining walsender processes SIGUSR2. On receiving SIGUSR2, we try to send + * the remaining WAL, and then exit. 
This ensures that the checkpoint record + * reaches durable storage (in safekeepers), before the server shuts down + * completely. + */ +static void +walprop_sigusr2(SIGNAL_ARGS) +{ + got_SIGUSR2 = true; + + SetLatch(MyLatch); +} + static void walprop_pg_init_bgworker(void) { @@ -503,6 +525,7 @@ walprop_pg_init_bgworker(void) pqsignal(SIGUSR1, procsignal_sigusr1_handler); pqsignal(SIGHUP, SignalHandlerForConfigReload); pqsignal(SIGTERM, die); + pqsignal(SIGUSR2, walprop_sigusr2); BackgroundWorkerUnblockSignals(); @@ -1026,7 +1049,7 @@ static void StartProposerReplication(WalProposer *wp, StartReplicationCmd *cmd) { XLogRecPtr FlushPtr; - TimeLineID currTLI; + __attribute__((unused)) TimeLineID currTLI; #if PG_VERSION_NUM < 150000 if (ThisTimeLineID == 0) @@ -1075,14 +1098,26 @@ StartProposerReplication(WalProposer *wp, StartReplicationCmd *cmd) #endif /* - * When we first start replication the standby will be behind the primary. - * For some applications, for example synchronous replication, it is - * important to have a clear state for this initial catchup mode, so we - * can trigger actions when we change streaming state later. We may stay - * in this state for a long time, which is exactly why we want to be able - * to monitor whether or not we are still here. + * XXX: Move straight to STOPPING state, skipping the STREAMING state. + * + * This is a bit weird. Normal walsenders stay in STREAMING state, until + * the checkpointer signals them that it is about to start writing the + * shutdown checkpoint. The walsenders acknowledge that they have received + * that signal by switching to STOPPING state. That tells the walsenders + * that they must not write any new WAL. + * + * However, we cannot easily intercept that signal from the checkpointer. + * It's sent by WalSndInitStopping(), using + * SendProcSignal(PROCSIGNAL_WALSND_INIT_STOPPING). It's received by + * HandleWalSndInitStopping, which sets a process-local got_STOPPING flag. 
+ * However, that's all private to walsender.c. + * + * We don't need to do anything special upon receiving the signal, the + * walproposer doesn't write any WAL anyway, so we skip the STREAMING + * state and go directly to STOPPING mode. That way, the checkpointer + * won't wait for us. */ - WalSndSetState(WALSNDSTATE_CATCHUP); + WalSndSetState(WALSNDSTATE_STOPPING); /* * Don't allow a request to stream from a future point in WAL that hasn't @@ -1122,6 +1157,8 @@ StartProposerReplication(WalProposer *wp, StartReplicationCmd *cmd) static void WalSndLoop(WalProposer *wp) { + XLogRecPtr flushPtr; + /* Clear any already-pending wakeups */ ResetLatch(MyLatch); @@ -1130,9 +1167,6 @@ WalSndLoop(WalProposer *wp) CHECK_FOR_INTERRUPTS(); XLogBroadcastWalProposer(wp); - - if (MyWalSnd->state == WALSNDSTATE_CATCHUP) - WalSndSetState(WALSNDSTATE_STREAMING); WalProposerPoll(wp); } } @@ -1230,7 +1264,6 @@ WalProposerRecovery(WalProposer *wp, Safekeeper *sk) TimeLineID timeline; XLogRecPtr startpos; XLogRecPtr endpos; - uint64 download_range_mb; startpos = GetLogRepRestartLSN(wp); if (startpos == InvalidXLogRecPtr) @@ -1745,6 +1778,9 @@ walprop_pg_wait_event_set(WalProposer *wp, long timeout, Safekeeper **sk, uint32 { ConditionVariableCancelSleep(); ResetLatch(MyLatch); + + CheckGracefulShutdown(wp); + *events = WL_LATCH_SET; return 1; } @@ -1798,6 +1834,41 @@ walprop_pg_finish_sync_safekeepers(WalProposer *wp, XLogRecPtr lsn) exit(0); } +/* + * Like vanilla walsender, on sigusr2 send all remaining WAL and exit. + * + * Note that unlike sync-safekeepers waiting here is not reliable: we + * don't check that majority of safekeepers received and persisted + * commit_lsn -- only that walproposer reached it (which immediately + * broadcasts new value). Doing that without incurring redundant control + * file syncing would need wp -> sk protocol change. 
OTOH unlike + * sync-safekeepers which must bump commit_lsn or basebackup will fail, + * this catchup is important only for tests where safekeepers/network + * don't crash on their own. + */ +static void +CheckGracefulShutdown(WalProposer *wp) +{ + if (got_SIGUSR2) + { + if (!reported_sigusr2) + { + XLogRecPtr flushPtr = walprop_pg_get_flush_rec_ptr(wp); + + wpg_log(LOG, "walproposer will send and wait for remaining WAL between %X/%X and %X/%X", + LSN_FORMAT_ARGS(wp->commitLsn), LSN_FORMAT_ARGS(flushPtr)); + reported_sigusr2 = true; + } + + if (wp->commitLsn >= walprop_pg_get_flush_rec_ptr(wp)) + { + wpg_log(LOG, "walproposer sent all WAL up to %X/%X, exiting", + LSN_FORMAT_ARGS(wp->commitLsn)); + proc_exit(0); + } + } +} + /* * Choose most advanced PageserverFeedback and set it to *rf. */ @@ -1878,7 +1949,7 @@ CombineHotStanbyFeedbacks(HotStandbyFeedback *hs, WalProposer *wp) * None of that is functional in sync-safekeepers. */ static void -walprop_pg_process_safekeeper_feedback(WalProposer *wp, XLogRecPtr commitLsn) +walprop_pg_process_safekeeper_feedback(WalProposer *wp) { HotStandbyFeedback hsFeedback; XLogRecPtr oldDiskConsistentLsn; @@ -1893,10 +1964,10 @@ walprop_pg_process_safekeeper_feedback(WalProposer *wp, XLogRecPtr commitLsn) replication_feedback_set(&quorumFeedback.rf); SetZenithCurrentClusterSize(quorumFeedback.rf.currentClusterSize); - if (commitLsn > quorumFeedback.flushLsn || oldDiskConsistentLsn != quorumFeedback.rf.disk_consistent_lsn) + if (wp->commitLsn > quorumFeedback.flushLsn || oldDiskConsistentLsn != quorumFeedback.rf.disk_consistent_lsn) { - if (commitLsn > quorumFeedback.flushLsn) - quorumFeedback.flushLsn = commitLsn; + if (wp->commitLsn > quorumFeedback.flushLsn) + quorumFeedback.flushLsn = wp->commitLsn; /* * Advance the replication slot to commitLsn. 
WAL before it is @@ -1929,6 +2000,8 @@ walprop_pg_process_safekeeper_feedback(WalProposer *wp, XLogRecPtr commitLsn) XidFromFullTransactionId(hsFeedback.catalog_xmin), EpochFromFullTransactionId(hsFeedback.catalog_xmin)); } + + CheckGracefulShutdown(wp); } static XLogRecPtr diff --git a/pgxn/neon_test_utils/neon_test_utils--1.0.sql b/pgxn/neon_test_utils/neon_test_utils--1.0.sql index 402981a9a6..23340e352e 100644 --- a/pgxn/neon_test_utils/neon_test_utils--1.0.sql +++ b/pgxn/neon_test_utils/neon_test_utils--1.0.sql @@ -7,6 +7,24 @@ AS 'MODULE_PATHNAME', 'test_consume_xids' LANGUAGE C STRICT PARALLEL UNSAFE; +CREATE FUNCTION test_consume_cpu(seconds int) +RETURNS VOID +AS 'MODULE_PATHNAME', 'test_consume_cpu' +LANGUAGE C STRICT +PARALLEL UNSAFE; + +CREATE FUNCTION test_consume_memory(megabytes int) +RETURNS VOID +AS 'MODULE_PATHNAME', 'test_consume_memory' +LANGUAGE C STRICT +PARALLEL UNSAFE; + +CREATE FUNCTION test_release_memory(megabytes int DEFAULT NULL) +RETURNS VOID +AS 'MODULE_PATHNAME', 'test_release_memory' +LANGUAGE C +PARALLEL UNSAFE; + CREATE FUNCTION clear_buffer_cache() RETURNS VOID AS 'MODULE_PATHNAME', 'clear_buffer_cache' diff --git a/pgxn/neon_test_utils/neon_test_utils.control b/pgxn/neon_test_utils/neon_test_utils.control index 94e6720503..5219571f11 100644 --- a/pgxn/neon_test_utils/neon_test_utils.control +++ b/pgxn/neon_test_utils/neon_test_utils.control @@ -3,3 +3,4 @@ comment = 'helpers for neon testing and debugging' default_version = '1.0' module_pathname = '$libdir/neon_test_utils' relocatable = true +trusted = true diff --git a/pgxn/neon_test_utils/neontest.c b/pgxn/neon_test_utils/neontest.c index aa644efd40..82ce5be9f6 100644 --- a/pgxn/neon_test_utils/neontest.c +++ b/pgxn/neon_test_utils/neontest.c @@ -21,10 +21,12 @@ #include "miscadmin.h" #include "storage/buf_internals.h" #include "storage/bufmgr.h" +#include "storage/fd.h" #include "utils/builtins.h" #include "utils/pg_lsn.h" #include "utils/rel.h" #include "utils/varlena.h" 
+#include "utils/wait_event.h" #include "../neon/pagestore_client.h" PG_MODULE_MAGIC; @@ -32,6 +34,9 @@ PG_MODULE_MAGIC; extern void _PG_init(void); PG_FUNCTION_INFO_V1(test_consume_xids); +PG_FUNCTION_INFO_V1(test_consume_cpu); +PG_FUNCTION_INFO_V1(test_consume_memory); +PG_FUNCTION_INFO_V1(test_release_memory); PG_FUNCTION_INFO_V1(clear_buffer_cache); PG_FUNCTION_INFO_V1(get_raw_page_at_lsn); PG_FUNCTION_INFO_V1(get_raw_page_at_lsn_ex); @@ -97,6 +102,117 @@ test_consume_xids(PG_FUNCTION_ARGS) PG_RETURN_VOID(); } + +/* + * test_consume_cpu(seconds int). Keeps one CPU busy for the given number of seconds. + */ +Datum +test_consume_cpu(PG_FUNCTION_ARGS) +{ + int32 seconds = PG_GETARG_INT32(0); + TimestampTz start; + uint64 total_iterations = 0; + + start = GetCurrentTimestamp(); + + for (;;) + { + TimestampTz elapsed; + + elapsed = GetCurrentTimestamp() - start; + if (elapsed > (TimestampTz) seconds * USECS_PER_SEC) + break; + + /* keep spinning */ + for (int i = 0; i < 1000000; i++) + total_iterations++; + elog(DEBUG2, "test_consume_cpu(): %lu iterations in total", total_iterations); + + CHECK_FOR_INTERRUPTS(); + } + + PG_RETURN_VOID(); +} + +static MemoryContext consume_cxt = NULL; +static slist_head consumed_memory_chunks; +static int64 num_memory_chunks; + +/* + * test_consume_memory(megabytes int). + * + * Consume given amount of memory. The allocation is made in TopMemoryContext, + * so it outlives the function, until you call test_release_memory to + * explicitly release it, or close the session. + */ +Datum +test_consume_memory(PG_FUNCTION_ARGS) +{ + int32 megabytes = PG_GETARG_INT32(0); + + /* + * Consume the memory in a new memory context, so that it's convenient to + * release and to display it separately in a possible memory context dump. 
+ */ + if (consume_cxt == NULL) + consume_cxt = AllocSetContextCreate(TopMemoryContext, + "test_consume_memory", + ALLOCSET_DEFAULT_SIZES); + + for (int32 i = 0; i < megabytes; i++) + { + char *p; + + p = MemoryContextAllocZero(consume_cxt, 1024 * 1024); + + /* touch the memory, so that it's really allocated by the kernel */ + for (int j = 0; j < 1024 * 1024; j += 1024) + p[j] = j % 0xFF; + + slist_push_head(&consumed_memory_chunks, (slist_node *) p); + num_memory_chunks++; + } + + PG_RETURN_VOID(); +} + +/* + * test_release_memory(megabytes int). NULL releases all + */ +Datum +test_release_memory(PG_FUNCTION_ARGS) +{ + if (PG_ARGISNULL(0)) + { + if (consume_cxt) + { + MemoryContextDelete(consume_cxt); + consume_cxt = NULL; + num_memory_chunks = 0; + } + } + else + { + int32 chunks_to_release = PG_GETARG_INT32(0); + + if (chunks_to_release > num_memory_chunks) + { + elog(WARNING, "only %lu MB is consumed, releasing it all", num_memory_chunks); + chunks_to_release = num_memory_chunks; + } + + for (int32 i = 0; i < chunks_to_release; i++) + { + slist_node *chunk = slist_pop_head_node(&consumed_memory_chunks); + + pfree(chunk); + num_memory_chunks--; + } + } + + PG_RETURN_VOID(); +} + /* * Flush the buffer cache, evicting all pages that are not currently pinned. */ diff --git a/pgxn/neon_walredo/walredoproc.c b/pgxn/neon_walredo/walredoproc.c index bdc50b0aa9..c4ab22636b 100644 --- a/pgxn/neon_walredo/walredoproc.c +++ b/pgxn/neon_walredo/walredoproc.c @@ -140,9 +140,45 @@ static XLogReaderState *reader_state; #define TRACE DEBUG5 #ifdef HAVE_LIBSECCOMP + + +/* + * https://man7.org/linux/man-pages/man2/close_range.2.html + * + * The `close_range` syscall is available as of Linux 5.9. + * + * The `close_range` libc wrapper is only available in glibc >= 2.34. + * Debian Bullseye ships a libc package based on glibc 2.31. + * => write the wrapper ourselves, using the syscall number from the kernel headers. 
+ * + * If the Linux uAPI headers don't define the system call number, + * fail the build deliberately rather than ifdef'ing it to ENOSYS. + * We prefer a compile time over a runtime error for walredo. + */ +#include +#include +#include + +static int +close_range_syscall(unsigned int start_fd, unsigned int count, unsigned int flags) +{ + return syscall(__NR_close_range, start_fd, count, flags); +} + static void enter_seccomp_mode(void) { + + /* + * The pageserver process relies on us to close all the file descriptors + * it potentially leaked to us, _before_ we start processing potentially dangerous + * wal records. See the comment in the Rust code that launches this process. + */ + int err; + if (err = close_range_syscall(3, ~0U, 0)) { + ereport(FATAL, (errcode(ERRCODE_SYSTEM_ERROR), errmsg("seccomp: could not close files >= fd 3"))); + } + PgSeccompRule syscalls[] = { /* Hard requirements */ @@ -184,6 +220,9 @@ enter_seccomp_mode(void) } #endif /* HAVE_LIBSECCOMP */ +PGDLLEXPORT void +WalRedoMain(int argc, char *argv[]); + /* * Entry point for the WAL redo process. * @@ -771,6 +810,9 @@ ApplyRecord(StringInfo input_message) ErrorContextCallback errcallback; #if PG_VERSION_NUM >= 150000 DecodedXLogRecord *decoded; +#define STATIC_DECODEBUF_SIZE (64 * 1024) + static char *static_decodebuf = NULL; + size_t required_space; #endif /* @@ -800,7 +842,19 @@ ApplyRecord(StringInfo input_message) XLogBeginRead(reader_state, lsn); #if PG_VERSION_NUM >= 150000 - decoded = (DecodedXLogRecord *) XLogReadRecordAlloc(reader_state, record->xl_tot_len, true); + /* + * For reasonably small records, reuse a fixed size buffer to reduce + * palloc overhead. 
+ */ + required_space = DecodeXLogRecordRequiredSpace(record->xl_tot_len); + if (required_space <= STATIC_DECODEBUF_SIZE) + { + if (static_decodebuf == NULL) + static_decodebuf = MemoryContextAlloc(TopMemoryContext, STATIC_DECODEBUF_SIZE); + decoded = (DecodedXLogRecord *) static_decodebuf; + } + else + decoded = palloc(required_space); if (!DecodeXLogRecord(reader_state, decoded, record, lsn, &errormsg)) elog(ERROR, "failed to decode WAL record: %s", errormsg); @@ -809,37 +863,15 @@ ApplyRecord(StringInfo input_message) /* Record the location of the next record. */ decoded->next_lsn = reader_state->NextRecPtr; - /* - * If it's in the decode buffer, mark the decode buffer space as - * occupied. - */ - if (!decoded->oversized) - { - /* The new decode buffer head must be MAXALIGNed. */ - Assert(decoded->size == MAXALIGN(decoded->size)); - if ((char *) decoded == reader_state->decode_buffer) - reader_state->decode_buffer_tail = reader_state->decode_buffer + decoded->size; - else - reader_state->decode_buffer_tail += decoded->size; - } - - /* Insert it into the queue of decoded records. */ - Assert(reader_state->decode_queue_tail != decoded); - if (reader_state->decode_queue_tail) - reader_state->decode_queue_tail->next = decoded; - reader_state->decode_queue_tail = decoded; - if (!reader_state->decode_queue_head) - reader_state->decode_queue_head = decoded; - /* * Update the pointers to the beginning and one-past-the-end of this * record, again for the benefit of historical code that expected the * decoder to track this rather than accessing these fields of the record * itself. 
*/ - reader_state->record = reader_state->decode_queue_head; - reader_state->ReadRecPtr = reader_state->record->lsn; - reader_state->EndRecPtr = reader_state->record->next_lsn; + reader_state->record = decoded; + reader_state->ReadRecPtr = decoded->lsn; + reader_state->EndRecPtr = decoded->next_lsn; } #else /* @@ -879,8 +911,9 @@ ApplyRecord(StringInfo input_message) elog(TRACE, "applied WAL record with LSN %X/%X", (uint32) (lsn >> 32), (uint32) lsn); + #if PG_VERSION_NUM >= 150000 - if (decoded && decoded->oversized) + if ((char *) decoded != static_decodebuf) pfree(decoded); #endif } diff --git a/poetry.lock b/poetry.lock index 1644b2b299..832d7c4334 100644 --- a/poetry.lock +++ b/poetry.lock @@ -2,87 +2,87 @@ [[package]] name = "aiohttp" -version = "3.9.0" +version = "3.9.2" description = "Async http client/server framework (asyncio)" optional = false python-versions = ">=3.8" files = [ - {file = "aiohttp-3.9.0-cp310-cp310-macosx_10_9_universal2.whl", hash = "sha256:6896b8416be9ada4d22cd359d7cb98955576ce863eadad5596b7cdfbf3e17c6c"}, - {file = "aiohttp-3.9.0-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:1736d87dad8ef46a8ec9cddd349fa9f7bd3a064c47dd6469c0d6763d3d49a4fc"}, - {file = "aiohttp-3.9.0-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:8c9e5f4d7208cda1a2bb600e29069eecf857e6980d0ccc922ccf9d1372c16f4b"}, - {file = "aiohttp-3.9.0-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:8488519aa05e636c5997719fe543c8daf19f538f4fa044f3ce94bee608817cff"}, - {file = "aiohttp-3.9.0-cp310-cp310-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:5ab16c254e2312efeb799bc3c06897f65a133b38b69682bf75d1f1ee1a9c43a9"}, - {file = "aiohttp-3.9.0-cp310-cp310-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:7a94bde005a8f926d0fa38b88092a03dea4b4875a61fbcd9ac6f4351df1b57cd"}, - {file = "aiohttp-3.9.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = 
"sha256:4b777c9286b6c6a94f50ddb3a6e730deec327e9e2256cb08b5530db0f7d40fd8"}, - {file = "aiohttp-3.9.0-cp310-cp310-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:571760ad7736b34d05597a1fd38cbc7d47f7b65deb722cb8e86fd827404d1f6b"}, - {file = "aiohttp-3.9.0-cp310-cp310-musllinux_1_1_aarch64.whl", hash = "sha256:deac0a32aec29608eb25d730f4bc5a261a65b6c48ded1ed861d2a1852577c932"}, - {file = "aiohttp-3.9.0-cp310-cp310-musllinux_1_1_i686.whl", hash = "sha256:4ee1b4152bc3190cc40ddd6a14715e3004944263ea208229ab4c297712aa3075"}, - {file = "aiohttp-3.9.0-cp310-cp310-musllinux_1_1_ppc64le.whl", hash = "sha256:3607375053df58ed6f23903aa10cf3112b1240e8c799d243bbad0f7be0666986"}, - {file = "aiohttp-3.9.0-cp310-cp310-musllinux_1_1_s390x.whl", hash = "sha256:65b0a70a25456d329a5e1426702dde67be0fb7a4ead718005ba2ca582d023a94"}, - {file = "aiohttp-3.9.0-cp310-cp310-musllinux_1_1_x86_64.whl", hash = "sha256:5a2eb5311a37fe105aa35f62f75a078537e1a9e4e1d78c86ec9893a3c97d7a30"}, - {file = "aiohttp-3.9.0-cp310-cp310-win32.whl", hash = "sha256:2cbc14a13fb6b42d344e4f27746a4b03a2cb0c1c3c5b932b0d6ad8881aa390e3"}, - {file = "aiohttp-3.9.0-cp310-cp310-win_amd64.whl", hash = "sha256:ac9669990e2016d644ba8ae4758688534aabde8dbbc81f9af129c3f5f01ca9cd"}, - {file = "aiohttp-3.9.0-cp311-cp311-macosx_10_9_universal2.whl", hash = "sha256:f8e05f5163528962ce1d1806fce763ab893b1c5b7ace0a3538cd81a90622f844"}, - {file = "aiohttp-3.9.0-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:4afa8f71dba3a5a2e1e1282a51cba7341ae76585345c43d8f0e624882b622218"}, - {file = "aiohttp-3.9.0-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:f929f4c9b9a00f3e6cc0587abb95ab9c05681f8b14e0fe1daecfa83ea90f8318"}, - {file = "aiohttp-3.9.0-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:28185e36a78d247c55e9fbea2332d16aefa14c5276a582ce7a896231c6b1c208"}, - {file = "aiohttp-3.9.0-cp311-cp311-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = 
"sha256:a486ddf57ab98b6d19ad36458b9f09e6022de0381674fe00228ca7b741aacb2f"}, - {file = "aiohttp-3.9.0-cp311-cp311-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:70e851f596c00f40a2f00a46126c95c2e04e146015af05a9da3e4867cfc55911"}, - {file = "aiohttp-3.9.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:c5b7bf8fe4d39886adc34311a233a2e01bc10eb4e842220235ed1de57541a896"}, - {file = "aiohttp-3.9.0-cp311-cp311-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:c67a51ea415192c2e53e4e048c78bab82d21955b4281d297f517707dc836bf3d"}, - {file = "aiohttp-3.9.0-cp311-cp311-musllinux_1_1_aarch64.whl", hash = "sha256:694df243f394629bcae2d8ed94c589a181e8ba8604159e6e45e7b22e58291113"}, - {file = "aiohttp-3.9.0-cp311-cp311-musllinux_1_1_i686.whl", hash = "sha256:3dd8119752dd30dd7bca7d4bc2a92a59be6a003e4e5c2cf7e248b89751b8f4b7"}, - {file = "aiohttp-3.9.0-cp311-cp311-musllinux_1_1_ppc64le.whl", hash = "sha256:eb6dfd52063186ac97b4caa25764cdbcdb4b10d97f5c5f66b0fa95052e744eb7"}, - {file = "aiohttp-3.9.0-cp311-cp311-musllinux_1_1_s390x.whl", hash = "sha256:d97c3e286d0ac9af6223bc132dc4bad6540b37c8d6c0a15fe1e70fb34f9ec411"}, - {file = "aiohttp-3.9.0-cp311-cp311-musllinux_1_1_x86_64.whl", hash = "sha256:816f4db40555026e4cdda604a1088577c1fb957d02f3f1292e0221353403f192"}, - {file = "aiohttp-3.9.0-cp311-cp311-win32.whl", hash = "sha256:3abf0551874fecf95f93b58f25ef4fc9a250669a2257753f38f8f592db85ddea"}, - {file = "aiohttp-3.9.0-cp311-cp311-win_amd64.whl", hash = "sha256:e18d92c3e9e22553a73e33784fcb0ed484c9874e9a3e96c16a8d6a1e74a0217b"}, - {file = "aiohttp-3.9.0-cp312-cp312-macosx_10_9_universal2.whl", hash = "sha256:99ae01fb13a618b9942376df77a1f50c20a281390dad3c56a6ec2942e266220d"}, - {file = "aiohttp-3.9.0-cp312-cp312-macosx_10_9_x86_64.whl", hash = "sha256:05857848da443c8c12110d99285d499b4e84d59918a21132e45c3f0804876994"}, - {file = "aiohttp-3.9.0-cp312-cp312-macosx_11_0_arm64.whl", hash = 
"sha256:317719d7f824eba55857fe0729363af58e27c066c731bc62cd97bc9c3d9c7ea4"}, - {file = "aiohttp-3.9.0-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:a1e3b3c107ccb0e537f309f719994a55621acd2c8fdf6d5ce5152aed788fb940"}, - {file = "aiohttp-3.9.0-cp312-cp312-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:45820ddbb276113ead8d4907a7802adb77548087ff5465d5c554f9aa3928ae7d"}, - {file = "aiohttp-3.9.0-cp312-cp312-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:05a183f1978802588711aed0dea31e697d760ce9055292db9dc1604daa9a8ded"}, - {file = "aiohttp-3.9.0-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:51a4cd44788ea0b5e6bb8fa704597af3a30be75503a7ed1098bc5b8ffdf6c982"}, - {file = "aiohttp-3.9.0-cp312-cp312-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:673343fbc0c1ac44d0d2640addc56e97a052504beacd7ade0dc5e76d3a4c16e8"}, - {file = "aiohttp-3.9.0-cp312-cp312-musllinux_1_1_aarch64.whl", hash = "sha256:7e8a3b79b6d186a9c99761fd4a5e8dd575a48d96021f220ac5b5fa856e5dd029"}, - {file = "aiohttp-3.9.0-cp312-cp312-musllinux_1_1_i686.whl", hash = "sha256:6777a390e41e78e7c45dab43a4a0196c55c3b8c30eebe017b152939372a83253"}, - {file = "aiohttp-3.9.0-cp312-cp312-musllinux_1_1_ppc64le.whl", hash = "sha256:7ae5f99a32c53731c93ac3075abd3e1e5cfbe72fc3eaac4c27c9dd64ba3b19fe"}, - {file = "aiohttp-3.9.0-cp312-cp312-musllinux_1_1_s390x.whl", hash = "sha256:f1e4f254e9c35d8965d377e065c4a8a55d396fe87c8e7e8429bcfdeeb229bfb3"}, - {file = "aiohttp-3.9.0-cp312-cp312-musllinux_1_1_x86_64.whl", hash = "sha256:11ca808f9a6b63485059f5f6e164ef7ec826483c1212a44f268b3653c91237d8"}, - {file = "aiohttp-3.9.0-cp312-cp312-win32.whl", hash = "sha256:de3cc86f4ea8b4c34a6e43a7306c40c1275e52bfa9748d869c6b7d54aa6dad80"}, - {file = "aiohttp-3.9.0-cp312-cp312-win_amd64.whl", hash = "sha256:ca4fddf84ac7d8a7d0866664936f93318ff01ee33e32381a115b19fb5a4d1202"}, - {file = 
"aiohttp-3.9.0-cp38-cp38-macosx_10_9_universal2.whl", hash = "sha256:f09960b5bb1017d16c0f9e9f7fc42160a5a49fa1e87a175fd4a2b1a1833ea0af"}, - {file = "aiohttp-3.9.0-cp38-cp38-macosx_10_9_x86_64.whl", hash = "sha256:8303531e2c17b1a494ffaeba48f2da655fe932c4e9a2626c8718403c83e5dd2b"}, - {file = "aiohttp-3.9.0-cp38-cp38-macosx_11_0_arm64.whl", hash = "sha256:4790e44f46a4aa07b64504089def5744d3b6780468c4ec3a1a36eb7f2cae9814"}, - {file = "aiohttp-3.9.0-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:a1d7edf74a36de0e5ca50787e83a77cf352f5504eb0ffa3f07000a911ba353fb"}, - {file = "aiohttp-3.9.0-cp38-cp38-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:94697c7293199c2a2551e3e3e18438b4cba293e79c6bc2319f5fd652fccb7456"}, - {file = "aiohttp-3.9.0-cp38-cp38-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:a1b66dbb8a7d5f50e9e2ea3804b01e766308331d0cac76eb30c563ac89c95985"}, - {file = "aiohttp-3.9.0-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:9623cfd9e85b76b83ef88519d98326d4731f8d71869867e47a0b979ffec61c73"}, - {file = "aiohttp-3.9.0-cp38-cp38-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:f32c86dc967ab8c719fd229ce71917caad13cc1e8356ee997bf02c5b368799bf"}, - {file = "aiohttp-3.9.0-cp38-cp38-musllinux_1_1_aarch64.whl", hash = "sha256:f50b4663c3e0262c3a361faf440761fbef60ccdde5fe8545689a4b3a3c149fb4"}, - {file = "aiohttp-3.9.0-cp38-cp38-musllinux_1_1_i686.whl", hash = "sha256:dcf71c55ec853826cd70eadb2b6ac62ec577416442ca1e0a97ad875a1b3a0305"}, - {file = "aiohttp-3.9.0-cp38-cp38-musllinux_1_1_ppc64le.whl", hash = "sha256:42fe4fd9f0dfcc7be4248c162d8056f1d51a04c60e53366b0098d1267c4c9da8"}, - {file = "aiohttp-3.9.0-cp38-cp38-musllinux_1_1_s390x.whl", hash = "sha256:76a86a9989ebf82ee61e06e2bab408aec4ea367dc6da35145c3352b60a112d11"}, - {file = "aiohttp-3.9.0-cp38-cp38-musllinux_1_1_x86_64.whl", hash = 
"sha256:f9e09a1c83521d770d170b3801eea19b89f41ccaa61d53026ed111cb6f088887"}, - {file = "aiohttp-3.9.0-cp38-cp38-win32.whl", hash = "sha256:a00ce44c21612d185c5275c5cba4bab8d7c1590f248638b667ed8a782fa8cd6f"}, - {file = "aiohttp-3.9.0-cp38-cp38-win_amd64.whl", hash = "sha256:d5b9345ab92ebe6003ae11d8092ce822a0242146e6fa270889b9ba965457ca40"}, - {file = "aiohttp-3.9.0-cp39-cp39-macosx_10_9_universal2.whl", hash = "sha256:98d21092bf2637c5fa724a428a69e8f5955f2182bff61f8036827cf6ce1157bf"}, - {file = "aiohttp-3.9.0-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:35a68cd63ca6aaef5707888f17a70c36efe62b099a4e853d33dc2e9872125be8"}, - {file = "aiohttp-3.9.0-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:3d7f6235c7475658acfc1769d968e07ab585c79f6ca438ddfecaa9a08006aee2"}, - {file = "aiohttp-3.9.0-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:db04d1de548f7a62d1dd7e7cdf7c22893ee168e22701895067a28a8ed51b3735"}, - {file = "aiohttp-3.9.0-cp39-cp39-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:536b01513d67d10baf6f71c72decdf492fb7433c5f2f133e9a9087379d4b6f31"}, - {file = "aiohttp-3.9.0-cp39-cp39-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:87c8b0a6487e8109427ccf638580865b54e2e3db4a6e0e11c02639231b41fc0f"}, - {file = "aiohttp-3.9.0-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:7276fe0017664414fdc3618fca411630405f1aaf0cc3be69def650eb50441787"}, - {file = "aiohttp-3.9.0-cp39-cp39-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:23170247ef89ffa842a02bbfdc425028574d9e010611659abeb24d890bc53bb8"}, - {file = "aiohttp-3.9.0-cp39-cp39-musllinux_1_1_aarch64.whl", hash = "sha256:b1a2ea8252cacc7fd51df5a56d7a2bb1986ed39be9397b51a08015727dfb69bd"}, - {file = "aiohttp-3.9.0-cp39-cp39-musllinux_1_1_i686.whl", hash = "sha256:2d71abc15ff7047412ef26bf812dfc8d0d1020d664617f4913df2df469f26b76"}, - {file = "aiohttp-3.9.0-cp39-cp39-musllinux_1_1_ppc64le.whl", hash = 
"sha256:2d820162c8c2bdbe97d328cd4f417c955ca370027dce593345e437b2e9ffdc4d"}, - {file = "aiohttp-3.9.0-cp39-cp39-musllinux_1_1_s390x.whl", hash = "sha256:2779f5e7c70f7b421915fd47db332c81de365678180a9f3ab404088f87ba5ff9"}, - {file = "aiohttp-3.9.0-cp39-cp39-musllinux_1_1_x86_64.whl", hash = "sha256:366bc870d7ac61726f32a489fbe3d1d8876e87506870be66b01aeb84389e967e"}, - {file = "aiohttp-3.9.0-cp39-cp39-win32.whl", hash = "sha256:1df43596b826022b14998f0460926ce261544fedefe0d2f653e1b20f49e96454"}, - {file = "aiohttp-3.9.0-cp39-cp39-win_amd64.whl", hash = "sha256:9c196b30f1b1aa3363a69dd69079ae9bec96c2965c4707eaa6914ba099fb7d4f"}, - {file = "aiohttp-3.9.0.tar.gz", hash = "sha256:09f23292d29135025e19e8ff4f0a68df078fe4ee013bca0105b2e803989de92d"}, + {file = "aiohttp-3.9.2-cp310-cp310-macosx_10_9_universal2.whl", hash = "sha256:772fbe371788e61c58d6d3d904268e48a594ba866804d08c995ad71b144f94cb"}, + {file = "aiohttp-3.9.2-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:edd4f1af2253f227ae311ab3d403d0c506c9b4410c7fc8d9573dec6d9740369f"}, + {file = "aiohttp-3.9.2-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:cfee9287778399fdef6f8a11c9e425e1cb13cc9920fd3a3df8f122500978292b"}, + {file = "aiohttp-3.9.2-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:3cc158466f6a980a6095ee55174d1de5730ad7dec251be655d9a6a9dd7ea1ff9"}, + {file = "aiohttp-3.9.2-cp310-cp310-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:54ec82f45d57c9a65a1ead3953b51c704f9587440e6682f689da97f3e8defa35"}, + {file = "aiohttp-3.9.2-cp310-cp310-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:abeb813a18eb387f0d835ef51f88568540ad0325807a77a6e501fed4610f864e"}, + {file = "aiohttp-3.9.2-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:cc91d07280d7d169f3a0f9179d8babd0ee05c79d4d891447629ff0d7d8089ec2"}, + {file = "aiohttp-3.9.2-cp310-cp310-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = 
"sha256:b65e861f4bebfb660f7f0f40fa3eb9f2ab9af10647d05dac824390e7af8f75b7"}, + {file = "aiohttp-3.9.2-cp310-cp310-musllinux_1_1_aarch64.whl", hash = "sha256:04fd8ffd2be73d42bcf55fd78cde7958eeee6d4d8f73c3846b7cba491ecdb570"}, + {file = "aiohttp-3.9.2-cp310-cp310-musllinux_1_1_i686.whl", hash = "sha256:3d8d962b439a859b3ded9a1e111a4615357b01620a546bc601f25b0211f2da81"}, + {file = "aiohttp-3.9.2-cp310-cp310-musllinux_1_1_ppc64le.whl", hash = "sha256:8ceb658afd12b27552597cf9a65d9807d58aef45adbb58616cdd5ad4c258c39e"}, + {file = "aiohttp-3.9.2-cp310-cp310-musllinux_1_1_s390x.whl", hash = "sha256:0e4ee4df741670560b1bc393672035418bf9063718fee05e1796bf867e995fad"}, + {file = "aiohttp-3.9.2-cp310-cp310-musllinux_1_1_x86_64.whl", hash = "sha256:2dec87a556f300d3211decf018bfd263424f0690fcca00de94a837949fbcea02"}, + {file = "aiohttp-3.9.2-cp310-cp310-win32.whl", hash = "sha256:3e1a800f988ce7c4917f34096f81585a73dbf65b5c39618b37926b1238cf9bc4"}, + {file = "aiohttp-3.9.2-cp310-cp310-win_amd64.whl", hash = "sha256:ea510718a41b95c236c992b89fdfc3d04cc7ca60281f93aaada497c2b4e05c46"}, + {file = "aiohttp-3.9.2-cp311-cp311-macosx_10_9_universal2.whl", hash = "sha256:6aaa6f99256dd1b5756a50891a20f0d252bd7bdb0854c5d440edab4495c9f973"}, + {file = "aiohttp-3.9.2-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:a27d8c70ad87bcfce2e97488652075a9bdd5b70093f50b10ae051dfe5e6baf37"}, + {file = "aiohttp-3.9.2-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:54287bcb74d21715ac8382e9de146d9442b5f133d9babb7e5d9e453faadd005e"}, + {file = "aiohttp-3.9.2-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:5bb3d05569aa83011fcb346b5266e00b04180105fcacc63743fc2e4a1862a891"}, + {file = "aiohttp-3.9.2-cp311-cp311-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:c8534e7d69bb8e8d134fe2be9890d1b863518582f30c9874ed7ed12e48abe3c4"}, + {file = "aiohttp-3.9.2-cp311-cp311-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = 
"sha256:4bd9d5b989d57b41e4ff56ab250c5ddf259f32db17159cce630fd543376bd96b"}, + {file = "aiohttp-3.9.2-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:fa6904088e6642609981f919ba775838ebf7df7fe64998b1a954fb411ffb4663"}, + {file = "aiohttp-3.9.2-cp311-cp311-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:bda42eb410be91b349fb4ee3a23a30ee301c391e503996a638d05659d76ea4c2"}, + {file = "aiohttp-3.9.2-cp311-cp311-musllinux_1_1_aarch64.whl", hash = "sha256:193cc1ccd69d819562cc7f345c815a6fc51d223b2ef22f23c1a0f67a88de9a72"}, + {file = "aiohttp-3.9.2-cp311-cp311-musllinux_1_1_i686.whl", hash = "sha256:b9f1cb839b621f84a5b006848e336cf1496688059d2408e617af33e3470ba204"}, + {file = "aiohttp-3.9.2-cp311-cp311-musllinux_1_1_ppc64le.whl", hash = "sha256:d22a0931848b8c7a023c695fa2057c6aaac19085f257d48baa24455e67df97ec"}, + {file = "aiohttp-3.9.2-cp311-cp311-musllinux_1_1_s390x.whl", hash = "sha256:4112d8ba61fbd0abd5d43a9cb312214565b446d926e282a6d7da3f5a5aa71d36"}, + {file = "aiohttp-3.9.2-cp311-cp311-musllinux_1_1_x86_64.whl", hash = "sha256:c4ad4241b52bb2eb7a4d2bde060d31c2b255b8c6597dd8deac2f039168d14fd7"}, + {file = "aiohttp-3.9.2-cp311-cp311-win32.whl", hash = "sha256:ee2661a3f5b529f4fc8a8ffee9f736ae054adfb353a0d2f78218be90617194b3"}, + {file = "aiohttp-3.9.2-cp311-cp311-win_amd64.whl", hash = "sha256:4deae2c165a5db1ed97df2868ef31ca3cc999988812e82386d22937d9d6fed52"}, + {file = "aiohttp-3.9.2-cp312-cp312-macosx_10_9_universal2.whl", hash = "sha256:6f4cdba12539215aaecf3c310ce9d067b0081a0795dd8a8805fdb67a65c0572a"}, + {file = "aiohttp-3.9.2-cp312-cp312-macosx_10_9_x86_64.whl", hash = "sha256:84e843b33d5460a5c501c05539809ff3aee07436296ff9fbc4d327e32aa3a326"}, + {file = "aiohttp-3.9.2-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:8008d0f451d66140a5aa1c17e3eedc9d56e14207568cd42072c9d6b92bf19b52"}, + {file = "aiohttp-3.9.2-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = 
"sha256:61c47ab8ef629793c086378b1df93d18438612d3ed60dca76c3422f4fbafa792"}, + {file = "aiohttp-3.9.2-cp312-cp312-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:bc71f748e12284312f140eaa6599a520389273174b42c345d13c7e07792f4f57"}, + {file = "aiohttp-3.9.2-cp312-cp312-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:a1c3a4d0ab2f75f22ec80bca62385db2e8810ee12efa8c9e92efea45c1849133"}, + {file = "aiohttp-3.9.2-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:9a87aa0b13bbee025faa59fa58861303c2b064b9855d4c0e45ec70182bbeba1b"}, + {file = "aiohttp-3.9.2-cp312-cp312-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:e2cc0d04688b9f4a7854c56c18aa7af9e5b0a87a28f934e2e596ba7e14783192"}, + {file = "aiohttp-3.9.2-cp312-cp312-musllinux_1_1_aarch64.whl", hash = "sha256:1956e3ac376b1711c1533266dec4efd485f821d84c13ce1217d53e42c9e65f08"}, + {file = "aiohttp-3.9.2-cp312-cp312-musllinux_1_1_i686.whl", hash = "sha256:114da29f39eccd71b93a0fcacff178749a5c3559009b4a4498c2c173a6d74dff"}, + {file = "aiohttp-3.9.2-cp312-cp312-musllinux_1_1_ppc64le.whl", hash = "sha256:3f17999ae3927d8a9a823a1283b201344a0627272f92d4f3e3a4efe276972fe8"}, + {file = "aiohttp-3.9.2-cp312-cp312-musllinux_1_1_s390x.whl", hash = "sha256:f31df6a32217a34ae2f813b152a6f348154f948c83213b690e59d9e84020925c"}, + {file = "aiohttp-3.9.2-cp312-cp312-musllinux_1_1_x86_64.whl", hash = "sha256:7a75307ffe31329928a8d47eae0692192327c599113d41b278d4c12b54e1bd11"}, + {file = "aiohttp-3.9.2-cp312-cp312-win32.whl", hash = "sha256:972b63d589ff8f305463593050a31b5ce91638918da38139b9d8deaba9e0fed7"}, + {file = "aiohttp-3.9.2-cp312-cp312-win_amd64.whl", hash = "sha256:200dc0246f0cb5405c80d18ac905c8350179c063ea1587580e3335bfc243ba6a"}, + {file = "aiohttp-3.9.2-cp38-cp38-macosx_10_9_universal2.whl", hash = "sha256:158564d0d1020e0d3fe919a81d97aadad35171e13e7b425b244ad4337fc6793a"}, + {file = "aiohttp-3.9.2-cp38-cp38-macosx_10_9_x86_64.whl", hash = 
"sha256:da1346cd0ccb395f0ed16b113ebb626fa43b7b07fd7344fce33e7a4f04a8897a"}, + {file = "aiohttp-3.9.2-cp38-cp38-macosx_11_0_arm64.whl", hash = "sha256:eaa9256de26ea0334ffa25f1913ae15a51e35c529a1ed9af8e6286dd44312554"}, + {file = "aiohttp-3.9.2-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:1543e7fb00214fb4ccead42e6a7d86f3bb7c34751ec7c605cca7388e525fd0b4"}, + {file = "aiohttp-3.9.2-cp38-cp38-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:186e94570433a004e05f31f632726ae0f2c9dee4762a9ce915769ce9c0a23d89"}, + {file = "aiohttp-3.9.2-cp38-cp38-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:d52d20832ac1560f4510d68e7ba8befbc801a2b77df12bd0cd2bcf3b049e52a4"}, + {file = "aiohttp-3.9.2-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:1c45e4e815ac6af3b72ca2bde9b608d2571737bb1e2d42299fc1ffdf60f6f9a1"}, + {file = "aiohttp-3.9.2-cp38-cp38-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:aa906b9bdfd4a7972dd0628dbbd6413d2062df5b431194486a78f0d2ae87bd55"}, + {file = "aiohttp-3.9.2-cp38-cp38-musllinux_1_1_aarch64.whl", hash = "sha256:68bbee9e17d66f17bb0010aa15a22c6eb28583edcc8b3212e2b8e3f77f3ebe2a"}, + {file = "aiohttp-3.9.2-cp38-cp38-musllinux_1_1_i686.whl", hash = "sha256:4c189b64bd6d9a403a1a3f86a3ab3acbc3dc41a68f73a268a4f683f89a4dec1f"}, + {file = "aiohttp-3.9.2-cp38-cp38-musllinux_1_1_ppc64le.whl", hash = "sha256:8a7876f794523123bca6d44bfecd89c9fec9ec897a25f3dd202ee7fc5c6525b7"}, + {file = "aiohttp-3.9.2-cp38-cp38-musllinux_1_1_s390x.whl", hash = "sha256:d23fba734e3dd7b1d679b9473129cd52e4ec0e65a4512b488981a56420e708db"}, + {file = "aiohttp-3.9.2-cp38-cp38-musllinux_1_1_x86_64.whl", hash = "sha256:b141753be581fab842a25cb319f79536d19c2a51995d7d8b29ee290169868eab"}, + {file = "aiohttp-3.9.2-cp38-cp38-win32.whl", hash = "sha256:103daf41ff3b53ba6fa09ad410793e2e76c9d0269151812e5aba4b9dd674a7e8"}, + {file = "aiohttp-3.9.2-cp38-cp38-win_amd64.whl", hash = 
"sha256:328918a6c2835861ff7afa8c6d2c70c35fdaf996205d5932351bdd952f33fa2f"}, + {file = "aiohttp-3.9.2-cp39-cp39-macosx_10_9_universal2.whl", hash = "sha256:5264d7327c9464786f74e4ec9342afbbb6ee70dfbb2ec9e3dfce7a54c8043aa3"}, + {file = "aiohttp-3.9.2-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:07205ae0015e05c78b3288c1517afa000823a678a41594b3fdc870878d645305"}, + {file = "aiohttp-3.9.2-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:ae0a1e638cffc3ec4d4784b8b4fd1cf28968febc4bd2718ffa25b99b96a741bd"}, + {file = "aiohttp-3.9.2-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:d43302a30ba1166325974858e6ef31727a23bdd12db40e725bec0f759abce505"}, + {file = "aiohttp-3.9.2-cp39-cp39-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:16a967685907003765855999af11a79b24e70b34dc710f77a38d21cd9fc4f5fe"}, + {file = "aiohttp-3.9.2-cp39-cp39-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:6fa3ee92cd441d5c2d07ca88d7a9cef50f7ec975f0117cd0c62018022a184308"}, + {file = "aiohttp-3.9.2-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:0b500c5ad9c07639d48615a770f49618130e61be36608fc9bc2d9bae31732b8f"}, + {file = "aiohttp-3.9.2-cp39-cp39-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:c07327b368745b1ce2393ae9e1aafed7073d9199e1dcba14e035cc646c7941bf"}, + {file = "aiohttp-3.9.2-cp39-cp39-musllinux_1_1_aarch64.whl", hash = "sha256:cc7d6502c23a0ec109687bf31909b3fb7b196faf198f8cff68c81b49eb316ea9"}, + {file = "aiohttp-3.9.2-cp39-cp39-musllinux_1_1_i686.whl", hash = "sha256:07be2be7071723c3509ab5c08108d3a74f2181d4964e869f2504aaab68f8d3e8"}, + {file = "aiohttp-3.9.2-cp39-cp39-musllinux_1_1_ppc64le.whl", hash = "sha256:122468f6fee5fcbe67cb07014a08c195b3d4c41ff71e7b5160a7bcc41d585a5f"}, + {file = "aiohttp-3.9.2-cp39-cp39-musllinux_1_1_s390x.whl", hash = "sha256:00a9abcea793c81e7f8778ca195a1714a64f6d7436c4c0bb168ad2a212627000"}, + {file = 
"aiohttp-3.9.2-cp39-cp39-musllinux_1_1_x86_64.whl", hash = "sha256:7a9825fdd64ecac5c670234d80bb52bdcaa4139d1f839165f548208b3779c6c6"}, + {file = "aiohttp-3.9.2-cp39-cp39-win32.whl", hash = "sha256:5422cd9a4a00f24c7244e1b15aa9b87935c85fb6a00c8ac9b2527b38627a9211"}, + {file = "aiohttp-3.9.2-cp39-cp39-win_amd64.whl", hash = "sha256:7d579dcd5d82a86a46f725458418458fa43686f6a7b252f2966d359033ffc8ab"}, + {file = "aiohttp-3.9.2.tar.gz", hash = "sha256:b0ad0a5e86ce73f5368a164c10ada10504bf91869c05ab75d982c6048217fbf7"}, ] [package.dependencies] @@ -158,6 +158,28 @@ files = [ attrs = ">=16.0.0" pluggy = ">=0.4.0" +[[package]] +name = "anyio" +version = "4.3.0" +description = "High level compatibility layer for multiple asynchronous event loop implementations" +optional = false +python-versions = ">=3.8" +files = [ + {file = "anyio-4.3.0-py3-none-any.whl", hash = "sha256:048e05d0f6caeed70d731f3db756d35dcc1f35747c8c403364a8332c630441b8"}, + {file = "anyio-4.3.0.tar.gz", hash = "sha256:f75253795a87df48568485fd18cdd2a3fa5c4f7c5be8e5e36637733fce06fed6"}, +] + +[package.dependencies] +exceptiongroup = {version = ">=1.0.2", markers = "python_version < \"3.11\""} +idna = ">=2.8" +sniffio = ">=1.1" +typing-extensions = {version = ">=4.1", markers = "python_version < \"3.11\""} + +[package.extras] +doc = ["Sphinx (>=7)", "packaging", "sphinx-autodoc-typehints (>=1.2.0)", "sphinx-rtd-theme"] +test = ["anyio[trio]", "coverage[toml] (>=7)", "exceptiongroup (>=1.2.0)", "hypothesis (>=4.0)", "psutil (>=5.9)", "pytest (>=7.0)", "pytest-mock (>=3.6.1)", "trustme", "uvloop (>=0.17)"] +trio = ["trio (>=0.23)"] + [[package]] name = "async-timeout" version = "4.0.3" @@ -836,47 +858,56 @@ files = [ [[package]] name = "cryptography" -version = "41.0.6" +version = "42.0.4" description = "cryptography is a package which provides cryptographic recipes and primitives to Python developers." 
optional = false python-versions = ">=3.7" files = [ - {file = "cryptography-41.0.6-cp37-abi3-macosx_10_12_universal2.whl", hash = "sha256:0f27acb55a4e77b9be8d550d762b0513ef3fc658cd3eb15110ebbcbd626db12c"}, - {file = "cryptography-41.0.6-cp37-abi3-macosx_10_12_x86_64.whl", hash = "sha256:ae236bb8760c1e55b7a39b6d4d32d2279bc6c7c8500b7d5a13b6fb9fc97be35b"}, - {file = "cryptography-41.0.6-cp37-abi3-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:afda76d84b053923c27ede5edc1ed7d53e3c9f475ebaf63c68e69f1403c405a8"}, - {file = "cryptography-41.0.6-cp37-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:da46e2b5df770070412c46f87bac0849b8d685c5f2679771de277a422c7d0b86"}, - {file = "cryptography-41.0.6-cp37-abi3-manylinux_2_28_aarch64.whl", hash = "sha256:ff369dd19e8fe0528b02e8df9f2aeb2479f89b1270d90f96a63500afe9af5cae"}, - {file = "cryptography-41.0.6-cp37-abi3-manylinux_2_28_x86_64.whl", hash = "sha256:b648fe2a45e426aaee684ddca2632f62ec4613ef362f4d681a9a6283d10e079d"}, - {file = "cryptography-41.0.6-cp37-abi3-musllinux_1_1_aarch64.whl", hash = "sha256:5daeb18e7886a358064a68dbcaf441c036cbdb7da52ae744e7b9207b04d3908c"}, - {file = "cryptography-41.0.6-cp37-abi3-musllinux_1_1_x86_64.whl", hash = "sha256:068bc551698c234742c40049e46840843f3d98ad7ce265fd2bd4ec0d11306596"}, - {file = "cryptography-41.0.6-cp37-abi3-win32.whl", hash = "sha256:2132d5865eea673fe6712c2ed5fb4fa49dba10768bb4cc798345748380ee3660"}, - {file = "cryptography-41.0.6-cp37-abi3-win_amd64.whl", hash = "sha256:48783b7e2bef51224020efb61b42704207dde583d7e371ef8fc2a5fb6c0aabc7"}, - {file = "cryptography-41.0.6-pp310-pypy310_pp73-macosx_10_12_x86_64.whl", hash = "sha256:8efb2af8d4ba9dbc9c9dd8f04d19a7abb5b49eab1f3694e7b5a16a5fc2856f5c"}, - {file = "cryptography-41.0.6-pp310-pypy310_pp73-manylinux_2_28_aarch64.whl", hash = "sha256:c5a550dc7a3b50b116323e3d376241829fd326ac47bc195e04eb33a8170902a9"}, - {file = "cryptography-41.0.6-pp310-pypy310_pp73-manylinux_2_28_x86_64.whl", hash = 
"sha256:85abd057699b98fce40b41737afb234fef05c67e116f6f3650782c10862c43da"}, - {file = "cryptography-41.0.6-pp310-pypy310_pp73-win_amd64.whl", hash = "sha256:f39812f70fc5c71a15aa3c97b2bbe213c3f2a460b79bd21c40d033bb34a9bf36"}, - {file = "cryptography-41.0.6-pp38-pypy38_pp73-macosx_10_12_x86_64.whl", hash = "sha256:742ae5e9a2310e9dade7932f9576606836ed174da3c7d26bc3d3ab4bd49b9f65"}, - {file = "cryptography-41.0.6-pp38-pypy38_pp73-manylinux_2_28_aarch64.whl", hash = "sha256:35f3f288e83c3f6f10752467c48919a7a94b7d88cc00b0668372a0d2ad4f8ead"}, - {file = "cryptography-41.0.6-pp38-pypy38_pp73-manylinux_2_28_x86_64.whl", hash = "sha256:4d03186af98b1c01a4eda396b137f29e4e3fb0173e30f885e27acec8823c1b09"}, - {file = "cryptography-41.0.6-pp38-pypy38_pp73-win_amd64.whl", hash = "sha256:b27a7fd4229abef715e064269d98a7e2909ebf92eb6912a9603c7e14c181928c"}, - {file = "cryptography-41.0.6-pp39-pypy39_pp73-macosx_10_12_x86_64.whl", hash = "sha256:398ae1fc711b5eb78e977daa3cbf47cec20f2c08c5da129b7a296055fbb22aed"}, - {file = "cryptography-41.0.6-pp39-pypy39_pp73-manylinux_2_28_aarch64.whl", hash = "sha256:7e00fb556bda398b99b0da289ce7053639d33b572847181d6483ad89835115f6"}, - {file = "cryptography-41.0.6-pp39-pypy39_pp73-manylinux_2_28_x86_64.whl", hash = "sha256:60e746b11b937911dc70d164060d28d273e31853bb359e2b2033c9e93e6f3c43"}, - {file = "cryptography-41.0.6-pp39-pypy39_pp73-win_amd64.whl", hash = "sha256:3288acccef021e3c3c10d58933f44e8602cf04dba96d9796d70d537bb2f4bbc4"}, - {file = "cryptography-41.0.6.tar.gz", hash = "sha256:422e3e31d63743855e43e5a6fcc8b4acab860f560f9321b0ee6269cc7ed70cc3"}, + {file = "cryptography-42.0.4-cp37-abi3-macosx_10_12_universal2.whl", hash = "sha256:ffc73996c4fca3d2b6c1c8c12bfd3ad00def8621da24f547626bf06441400449"}, + {file = "cryptography-42.0.4-cp37-abi3-macosx_10_12_x86_64.whl", hash = "sha256:db4b65b02f59035037fde0998974d84244a64c3265bdef32a827ab9b63d61b18"}, + {file = "cryptography-42.0.4-cp37-abi3-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = 
"sha256:dad9c385ba8ee025bb0d856714f71d7840020fe176ae0229de618f14dae7a6e2"}, + {file = "cryptography-42.0.4-cp37-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:69b22ab6506a3fe483d67d1ed878e1602bdd5912a134e6202c1ec672233241c1"}, + {file = "cryptography-42.0.4-cp37-abi3-manylinux_2_28_aarch64.whl", hash = "sha256:e09469a2cec88fb7b078e16d4adec594414397e8879a4341c6ace96013463d5b"}, + {file = "cryptography-42.0.4-cp37-abi3-manylinux_2_28_x86_64.whl", hash = "sha256:3e970a2119507d0b104f0a8e281521ad28fc26f2820687b3436b8c9a5fcf20d1"}, + {file = "cryptography-42.0.4-cp37-abi3-musllinux_1_1_aarch64.whl", hash = "sha256:e53dc41cda40b248ebc40b83b31516487f7db95ab8ceac1f042626bc43a2f992"}, + {file = "cryptography-42.0.4-cp37-abi3-musllinux_1_1_x86_64.whl", hash = "sha256:c3a5cbc620e1e17009f30dd34cb0d85c987afd21c41a74352d1719be33380885"}, + {file = "cryptography-42.0.4-cp37-abi3-musllinux_1_2_aarch64.whl", hash = "sha256:6bfadd884e7280df24d26f2186e4e07556a05d37393b0f220a840b083dc6a824"}, + {file = "cryptography-42.0.4-cp37-abi3-musllinux_1_2_x86_64.whl", hash = "sha256:01911714117642a3f1792c7f376db572aadadbafcd8d75bb527166009c9f1d1b"}, + {file = "cryptography-42.0.4-cp37-abi3-win32.whl", hash = "sha256:fb0cef872d8193e487fc6bdb08559c3aa41b659a7d9be48b2e10747f47863925"}, + {file = "cryptography-42.0.4-cp37-abi3-win_amd64.whl", hash = "sha256:c1f25b252d2c87088abc8bbc4f1ecbf7c919e05508a7e8628e6875c40bc70923"}, + {file = "cryptography-42.0.4-cp39-abi3-macosx_10_12_universal2.whl", hash = "sha256:15a1fb843c48b4a604663fa30af60818cd28f895572386e5f9b8a665874c26e7"}, + {file = "cryptography-42.0.4-cp39-abi3-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:a1327f280c824ff7885bdeef8578f74690e9079267c1c8bd7dc5cc5aa065ae52"}, + {file = "cryptography-42.0.4-cp39-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:6ffb03d419edcab93b4b19c22ee80c007fb2d708429cecebf1dd3258956a563a"}, + {file = 
"cryptography-42.0.4-cp39-abi3-manylinux_2_28_aarch64.whl", hash = "sha256:1df6fcbf60560d2113b5ed90f072dc0b108d64750d4cbd46a21ec882c7aefce9"}, + {file = "cryptography-42.0.4-cp39-abi3-manylinux_2_28_x86_64.whl", hash = "sha256:44a64043f743485925d3bcac548d05df0f9bb445c5fcca6681889c7c3ab12764"}, + {file = "cryptography-42.0.4-cp39-abi3-musllinux_1_1_aarch64.whl", hash = "sha256:3c6048f217533d89f2f8f4f0fe3044bf0b2090453b7b73d0b77db47b80af8dff"}, + {file = "cryptography-42.0.4-cp39-abi3-musllinux_1_1_x86_64.whl", hash = "sha256:6d0fbe73728c44ca3a241eff9aefe6496ab2656d6e7a4ea2459865f2e8613257"}, + {file = "cryptography-42.0.4-cp39-abi3-musllinux_1_2_aarch64.whl", hash = "sha256:887623fe0d70f48ab3f5e4dbf234986b1329a64c066d719432d0698522749929"}, + {file = "cryptography-42.0.4-cp39-abi3-musllinux_1_2_x86_64.whl", hash = "sha256:ce8613beaffc7c14f091497346ef117c1798c202b01153a8cc7b8e2ebaaf41c0"}, + {file = "cryptography-42.0.4-cp39-abi3-win32.whl", hash = "sha256:810bcf151caefc03e51a3d61e53335cd5c7316c0a105cc695f0959f2c638b129"}, + {file = "cryptography-42.0.4-cp39-abi3-win_amd64.whl", hash = "sha256:a0298bdc6e98ca21382afe914c642620370ce0470a01e1bef6dd9b5354c36854"}, + {file = "cryptography-42.0.4-pp310-pypy310_pp73-macosx_10_12_x86_64.whl", hash = "sha256:5f8907fcf57392cd917892ae83708761c6ff3c37a8e835d7246ff0ad251d9298"}, + {file = "cryptography-42.0.4-pp310-pypy310_pp73-manylinux_2_28_aarch64.whl", hash = "sha256:12d341bd42cdb7d4937b0cabbdf2a94f949413ac4504904d0cdbdce4a22cbf88"}, + {file = "cryptography-42.0.4-pp310-pypy310_pp73-manylinux_2_28_x86_64.whl", hash = "sha256:1cdcdbd117681c88d717437ada72bdd5be9de117f96e3f4d50dab3f59fd9ab20"}, + {file = "cryptography-42.0.4-pp310-pypy310_pp73-win_amd64.whl", hash = "sha256:0e89f7b84f421c56e7ff69f11c441ebda73b8a8e6488d322ef71746224c20fce"}, + {file = "cryptography-42.0.4-pp39-pypy39_pp73-macosx_10_12_x86_64.whl", hash = "sha256:f1e85a178384bf19e36779d91ff35c7617c885da487d689b05c1366f9933ad74"}, + {file = 
"cryptography-42.0.4-pp39-pypy39_pp73-manylinux_2_28_aarch64.whl", hash = "sha256:d2a27aca5597c8a71abbe10209184e1a8e91c1fd470b5070a2ea60cafec35bcd"}, + {file = "cryptography-42.0.4-pp39-pypy39_pp73-manylinux_2_28_x86_64.whl", hash = "sha256:4e36685cb634af55e0677d435d425043967ac2f3790ec652b2b88ad03b85c27b"}, + {file = "cryptography-42.0.4-pp39-pypy39_pp73-win_amd64.whl", hash = "sha256:f47be41843200f7faec0683ad751e5ef11b9a56a220d57f300376cd8aba81660"}, + {file = "cryptography-42.0.4.tar.gz", hash = "sha256:831a4b37accef30cccd34fcb916a5d7b5be3cbbe27268a02832c3e450aea39cb"}, ] [package.dependencies] -cffi = ">=1.12" +cffi = {version = ">=1.12", markers = "platform_python_implementation != \"PyPy\""} [package.extras] docs = ["sphinx (>=5.3.0)", "sphinx-rtd-theme (>=1.1.1)"] -docstest = ["pyenchant (>=1.6.11)", "sphinxcontrib-spelling (>=4.0.1)", "twine (>=1.12.0)"] +docstest = ["pyenchant (>=1.6.11)", "readme-renderer", "sphinxcontrib-spelling (>=4.0.1)"] nox = ["nox"] -pep8test = ["black", "check-sdist", "mypy", "ruff"] +pep8test = ["check-sdist", "click", "mypy", "ruff"] sdist = ["build"] ssh = ["bcrypt (>=3.1.5)"] -test = ["pretend", "pytest (>=6.2.0)", "pytest-benchmark", "pytest-cov", "pytest-xdist"] +test = ["certifi", "pretend", "pytest (>=6.2.0)", "pytest-benchmark", "pytest-cov", "pytest-xdist"] test-randomorder = ["pytest-randomly"] [[package]] @@ -1064,6 +1095,100 @@ files = [ {file = "graphql_core-3.2.1-py3-none-any.whl", hash = "sha256:f83c658e4968998eed1923a2e3e3eddd347e005ac0315fbb7ca4d70ea9156323"}, ] +[[package]] +name = "h11" +version = "0.14.0" +description = "A pure-Python, bring-your-own-I/O implementation of HTTP/1.1" +optional = false +python-versions = ">=3.7" +files = [ + {file = "h11-0.14.0-py3-none-any.whl", hash = "sha256:e3fe4ac4b851c468cc8363d500db52c2ead036020723024a109d37346efaa761"}, + {file = "h11-0.14.0.tar.gz", hash = "sha256:8f19fbbe99e72420ff35c00b27a34cb9937e902a8b810e2c88300c6f0a3b699d"}, +] + +[[package]] +name = "h2" +version = 
"4.1.0" +description = "HTTP/2 State-Machine based protocol implementation" +optional = false +python-versions = ">=3.6.1" +files = [ + {file = "h2-4.1.0-py3-none-any.whl", hash = "sha256:03a46bcf682256c95b5fd9e9a99c1323584c3eec6440d379b9903d709476bc6d"}, + {file = "h2-4.1.0.tar.gz", hash = "sha256:a83aca08fbe7aacb79fec788c9c0bac936343560ed9ec18b82a13a12c28d2abb"}, +] + +[package.dependencies] +hpack = ">=4.0,<5" +hyperframe = ">=6.0,<7" + +[[package]] +name = "hpack" +version = "4.0.0" +description = "Pure-Python HPACK header compression" +optional = false +python-versions = ">=3.6.1" +files = [ + {file = "hpack-4.0.0-py3-none-any.whl", hash = "sha256:84a076fad3dc9a9f8063ccb8041ef100867b1878b25ef0ee63847a5d53818a6c"}, + {file = "hpack-4.0.0.tar.gz", hash = "sha256:fc41de0c63e687ebffde81187a948221294896f6bdc0ae2312708df339430095"}, +] + +[[package]] +name = "httpcore" +version = "1.0.3" +description = "A minimal low-level HTTP client." +optional = false +python-versions = ">=3.8" +files = [ + {file = "httpcore-1.0.3-py3-none-any.whl", hash = "sha256:9a6a501c3099307d9fd76ac244e08503427679b1e81ceb1d922485e2f2462ad2"}, + {file = "httpcore-1.0.3.tar.gz", hash = "sha256:5c0f9546ad17dac4d0772b0808856eb616eb8b48ce94f49ed819fd6982a8a544"}, +] + +[package.dependencies] +certifi = "*" +h11 = ">=0.13,<0.15" + +[package.extras] +asyncio = ["anyio (>=4.0,<5.0)"] +http2 = ["h2 (>=3,<5)"] +socks = ["socksio (==1.*)"] +trio = ["trio (>=0.22.0,<0.24.0)"] + +[[package]] +name = "httpx" +version = "0.26.0" +description = "The next generation HTTP client." 
+optional = false +python-versions = ">=3.8" +files = [ + {file = "httpx-0.26.0-py3-none-any.whl", hash = "sha256:8915f5a3627c4d47b73e8202457cb28f1266982d1159bd5779d86a80c0eab1cd"}, + {file = "httpx-0.26.0.tar.gz", hash = "sha256:451b55c30d5185ea6b23c2c793abf9bb237d2a7dfb901ced6ff69ad37ec1dfaf"}, +] + +[package.dependencies] +anyio = "*" +certifi = "*" +h2 = {version = ">=3,<5", optional = true, markers = "extra == \"http2\""} +httpcore = "==1.*" +idna = "*" +sniffio = "*" + +[package.extras] +brotli = ["brotli", "brotlicffi"] +cli = ["click (==8.*)", "pygments (==2.*)", "rich (>=10,<14)"] +http2 = ["h2 (>=3,<5)"] +socks = ["socksio (==1.*)"] + +[[package]] +name = "hyperframe" +version = "6.0.1" +description = "HTTP/2 framing layer for Python" +optional = false +python-versions = ">=3.6.1" +files = [ + {file = "hyperframe-6.0.1-py3-none-any.whl", hash = "sha256:0ec6bafd80d8ad2195c4f03aacba3a8265e57bc4cff261e802bf39970ed02a15"}, + {file = "hyperframe-6.0.1.tar.gz", hash = "sha256:ae510046231dc8e9ecb1a6586f63d2347bf4c8905914aa84ba585ae85f28a914"}, +] + [[package]] name = "idna" version = "3.3" @@ -1900,6 +2025,20 @@ pytest = [ {version = ">=6.2.4", markers = "python_version >= \"3.10\""}, ] +[[package]] +name = "pytest-repeat" +version = "0.9.3" +description = "pytest plugin for repeating tests" +optional = false +python-versions = ">=3.7" +files = [ + {file = "pytest_repeat-0.9.3-py3-none-any.whl", hash = "sha256:26ab2df18226af9d5ce441c858f273121e92ff55f5bb311d25755b8d7abdd8ed"}, + {file = "pytest_repeat-0.9.3.tar.gz", hash = "sha256:ffd3836dfcd67bb270bec648b330e20be37d2966448c4148c4092d1e8aba8185"}, +] + +[package.dependencies] +pytest = "*" + [[package]] name = "pytest-rerunfailures" version = "13.0" @@ -2043,6 +2182,7 @@ files = [ {file = "PyYAML-6.0.1-cp311-cp311-win_amd64.whl", hash = "sha256:bf07ee2fef7014951eeb99f56f39c9bb4af143d8aa3c21b1677805985307da34"}, {file = "PyYAML-6.0.1-cp312-cp312-macosx_10_9_x86_64.whl", hash = 
"sha256:855fb52b0dc35af121542a76b9a84f8d1cd886ea97c84703eaa6d88e37a2ad28"}, {file = "PyYAML-6.0.1-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:40df9b996c2b73138957fe23a16a4f0ba614f4c0efce1e9406a184b6d07fa3a9"}, + {file = "PyYAML-6.0.1-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:a08c6f0fe150303c1c6b71ebcd7213c2858041a7e01975da3a99aed1e7a378ef"}, {file = "PyYAML-6.0.1-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:6c22bec3fbe2524cde73d7ada88f6566758a8f7227bfbf93a408a9d86bcc12a0"}, {file = "PyYAML-6.0.1-cp312-cp312-musllinux_1_1_x86_64.whl", hash = "sha256:8d4e9c88387b0f5c7d5f281e55304de64cf7f9c0021a3525bd3b1c542da3b0e4"}, {file = "PyYAML-6.0.1-cp312-cp312-win32.whl", hash = "sha256:d483d2cdf104e7c9fa60c544d92981f12ad66a457afae824d146093b8c294c54"}, @@ -2132,28 +2272,28 @@ pyasn1 = ">=0.1.3" [[package]] name = "ruff" -version = "0.1.11" +version = "0.2.2" description = "An extremely fast Python linter and code formatter, written in Rust." 
optional = false python-versions = ">=3.7" files = [ - {file = "ruff-0.1.11-py3-none-macosx_10_12_x86_64.macosx_11_0_arm64.macosx_10_12_universal2.whl", hash = "sha256:a7f772696b4cdc0a3b2e527fc3c7ccc41cdcb98f5c80fdd4f2b8c50eb1458196"}, - {file = "ruff-0.1.11-py3-none-macosx_10_12_x86_64.whl", hash = "sha256:934832f6ed9b34a7d5feea58972635c2039c7a3b434fe5ba2ce015064cb6e955"}, - {file = "ruff-0.1.11-py3-none-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:ea0d3e950e394c4b332bcdd112aa566010a9f9c95814844a7468325290aabfd9"}, - {file = "ruff-0.1.11-py3-none-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:9bd4025b9c5b429a48280785a2b71d479798a69f5c2919e7d274c5f4b32c3607"}, - {file = "ruff-0.1.11-py3-none-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:e1ad00662305dcb1e987f5ec214d31f7d6a062cae3e74c1cbccef15afd96611d"}, - {file = "ruff-0.1.11-py3-none-manylinux_2_17_ppc64.manylinux2014_ppc64.whl", hash = "sha256:4b077ce83f47dd6bea1991af08b140e8b8339f0ba8cb9b7a484c30ebab18a23f"}, - {file = "ruff-0.1.11-py3-none-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:c4a88efecec23c37b11076fe676e15c6cdb1271a38f2b415e381e87fe4517f18"}, - {file = "ruff-0.1.11-py3-none-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:5b25093dad3b055667730a9b491129c42d45e11cdb7043b702e97125bcec48a1"}, - {file = "ruff-0.1.11-py3-none-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:231d8fb11b2cc7c0366a326a66dafc6ad449d7fcdbc268497ee47e1334f66f77"}, - {file = "ruff-0.1.11-py3-none-musllinux_1_2_aarch64.whl", hash = "sha256:09c415716884950080921dd6237767e52e227e397e2008e2bed410117679975b"}, - {file = "ruff-0.1.11-py3-none-musllinux_1_2_armv7l.whl", hash = "sha256:0f58948c6d212a6b8d41cd59e349751018797ce1727f961c2fa755ad6208ba45"}, - {file = "ruff-0.1.11-py3-none-musllinux_1_2_i686.whl", hash = "sha256:190a566c8f766c37074d99640cd9ca3da11d8deae2deae7c9505e68a4a30f740"}, - {file = 
"ruff-0.1.11-py3-none-musllinux_1_2_x86_64.whl", hash = "sha256:6464289bd67b2344d2a5d9158d5eb81025258f169e69a46b741b396ffb0cda95"}, - {file = "ruff-0.1.11-py3-none-win32.whl", hash = "sha256:9b8f397902f92bc2e70fb6bebfa2139008dc72ae5177e66c383fa5426cb0bf2c"}, - {file = "ruff-0.1.11-py3-none-win_amd64.whl", hash = "sha256:eb85ee287b11f901037a6683b2374bb0ec82928c5cbc984f575d0437979c521a"}, - {file = "ruff-0.1.11-py3-none-win_arm64.whl", hash = "sha256:97ce4d752f964ba559c7023a86e5f8e97f026d511e48013987623915431c7ea9"}, - {file = "ruff-0.1.11.tar.gz", hash = "sha256:f9d4d88cb6eeb4dfe20f9f0519bd2eaba8119bde87c3d5065c541dbae2b5a2cb"}, + {file = "ruff-0.2.2-py3-none-macosx_10_12_x86_64.macosx_11_0_arm64.macosx_10_12_universal2.whl", hash = "sha256:0a9efb032855ffb3c21f6405751d5e147b0c6b631e3ca3f6b20f917572b97eb6"}, + {file = "ruff-0.2.2-py3-none-macosx_10_12_x86_64.whl", hash = "sha256:d450b7fbff85913f866a5384d8912710936e2b96da74541c82c1b458472ddb39"}, + {file = "ruff-0.2.2-py3-none-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:ecd46e3106850a5c26aee114e562c329f9a1fbe9e4821b008c4404f64ff9ce73"}, + {file = "ruff-0.2.2-py3-none-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:5e22676a5b875bd72acd3d11d5fa9075d3a5f53b877fe7b4793e4673499318ba"}, + {file = "ruff-0.2.2-py3-none-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:1695700d1e25a99d28f7a1636d85bafcc5030bba9d0578c0781ba1790dbcf51c"}, + {file = "ruff-0.2.2-py3-none-manylinux_2_17_ppc64.manylinux2014_ppc64.whl", hash = "sha256:b0c232af3d0bd8f521806223723456ffebf8e323bd1e4e82b0befb20ba18388e"}, + {file = "ruff-0.2.2-py3-none-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:f63d96494eeec2fc70d909393bcd76c69f35334cdbd9e20d089fb3f0640216ca"}, + {file = "ruff-0.2.2-py3-none-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:6a61ea0ff048e06de273b2e45bd72629f470f5da8f71daf09fe481278b175001"}, + {file = 
"ruff-0.2.2-py3-none-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:5e1439c8f407e4f356470e54cdecdca1bd5439a0673792dbe34a2b0a551a2fe3"}, + {file = "ruff-0.2.2-py3-none-musllinux_1_2_aarch64.whl", hash = "sha256:940de32dc8853eba0f67f7198b3e79bc6ba95c2edbfdfac2144c8235114d6726"}, + {file = "ruff-0.2.2-py3-none-musllinux_1_2_armv7l.whl", hash = "sha256:0c126da55c38dd917621552ab430213bdb3273bb10ddb67bc4b761989210eb6e"}, + {file = "ruff-0.2.2-py3-none-musllinux_1_2_i686.whl", hash = "sha256:3b65494f7e4bed2e74110dac1f0d17dc8e1f42faaa784e7c58a98e335ec83d7e"}, + {file = "ruff-0.2.2-py3-none-musllinux_1_2_x86_64.whl", hash = "sha256:1ec49be4fe6ddac0503833f3ed8930528e26d1e60ad35c2446da372d16651ce9"}, + {file = "ruff-0.2.2-py3-none-win32.whl", hash = "sha256:d920499b576f6c68295bc04e7b17b6544d9d05f196bb3aac4358792ef6f34325"}, + {file = "ruff-0.2.2-py3-none-win_amd64.whl", hash = "sha256:cc9a91ae137d687f43a44c900e5d95e9617cb37d4c989e462980ba27039d239d"}, + {file = "ruff-0.2.2-py3-none-win_arm64.whl", hash = "sha256:c9d15fc41e6054bfc7200478720570078f0b41c9ae4f010bcc16bd6f4d1aacdd"}, + {file = "ruff-0.2.2.tar.gz", hash = "sha256:e62ed7f36b3068a30ba39193a14274cd706bc486fad521276458022f7bccb31d"}, ] [[package]] @@ -2215,6 +2355,17 @@ files = [ {file = "six-1.16.0.tar.gz", hash = "sha256:1e61c37477a1626458e36f7b1d82aa5c9b094fa4802892072e49de9c60c4c926"}, ] +[[package]] +name = "sniffio" +version = "1.3.0" +description = "Sniff out which async library your code is running under" +optional = false +python-versions = ">=3.7" +files = [ + {file = "sniffio-1.3.0-py3-none-any.whl", hash = "sha256:eecefdce1e5bbfb7ad2eeaabf7c1eeb404d7757c379bd1f7e5cce9d8bf425384"}, + {file = "sniffio-1.3.0.tar.gz", hash = "sha256:e60305c5e5d314f5389259b7f22aaa33d8f7dee49763119234af3755c55b9101"}, +] + [[package]] name = "sshpubkeys" version = "3.3.1" @@ -2668,4 +2819,4 @@ cffi = ["cffi (>=1.11)"] [metadata] lock-version = "2.0" python-versions = "^3.9" -content-hash = 
"9cf2734cafd5b6963165d398f1b24621193d5284d0bc7cc26a720a014f523860" +content-hash = "af9d5b45310c12411bfe67cb9677d2236808d0780ca1bd81525d2763a928f7f9" diff --git a/proxy/Cargo.toml b/proxy/Cargo.toml index f075c718a7..d8112c8bf0 100644 --- a/proxy/Cargo.toml +++ b/proxy/Cargo.toml @@ -19,6 +19,7 @@ chrono.workspace = true clap.workspace = true consumption_metrics.workspace = true dashmap.workspace = true +env_logger.workspace = true futures.workspace = true git-version.workspace = true hashbrown.workspace = true @@ -31,6 +32,7 @@ hyper-tungstenite.workspace = true hyper.workspace = true ipnet.workspace = true itertools.workspace = true +lasso = { workspace = true, features = ["multi-threaded"] } md5.workspace = true metrics.workspace = true once_cell.workspace = true @@ -58,11 +60,14 @@ scopeguard.workspace = true serde.workspace = true serde_json.workspace = true sha2.workspace = true +smol_str.workspace = true +smallvec.workspace = true socket2.workspace = true sync_wrapper.workspace = true task-local-extensions.workspace = true thiserror.workspace = true -tls-listener.workspace = true +tikv-jemallocator.workspace = true +tikv-jemalloc-ctl = { workspace = true, features = ["use_std"] } tokio-postgres.workspace = true tokio-rustls.workspace = true tokio-util.workspace = true @@ -72,6 +77,7 @@ tracing-subscriber.workspace = true tracing-utils.workspace = true tracing.workspace = true url.workspace = true +urlencoding.workspace = true utils.workspace = true uuid.workspace = true webpki-roots.workspace = true @@ -80,7 +86,6 @@ native-tls.workspace = true postgres-native-tls.workspace = true postgres-protocol.workspace = true redis.workspace = true -smol_str.workspace = true workspace_hack.workspace = true @@ -90,3 +95,4 @@ rcgen.workspace = true rstest.workspace = true tokio-postgres-rustls.workspace = true walkdir.workspace = true +rand_distr = "0.4" diff --git a/proxy/src/auth.rs b/proxy/src/auth.rs index 8d1b861a66..8c44823c98 100644 --- a/proxy/src/auth.rs +++ 
b/proxy/src/auth.rs @@ -5,7 +5,8 @@ pub use backend::BackendType; mod credentials; pub use credentials::{ - check_peer_addr_is_in_list, endpoint_sni, ComputeUserInfoMaybeEndpoint, IpPattern, + check_peer_addr_is_in_list, endpoint_sni, ComputeUserInfoMaybeEndpoint, + ComputeUserInfoParseError, IpPattern, }; mod password_hack; @@ -14,9 +15,13 @@ use password_hack::PasswordHackPayload; mod flow; pub use flow::*; +use tokio::time::error::Elapsed; -use crate::{console, error::UserFacingError}; -use std::io; +use crate::{ + console, + error::{ReportableError, UserFacingError}, +}; +use std::{io, net::IpAddr}; use thiserror::Error; /// Convenience wrapper for the authentication error. @@ -31,9 +36,6 @@ pub enum AuthErrorImpl { #[error(transparent)] GetAuthInfo(#[from] console::errors::GetAuthInfoError), - #[error(transparent)] - WakeCompute(#[from] console::errors::WakeComputeError), - /// SASL protocol errors (includes [SCRAM](crate::scram)). #[error(transparent)] Sasl(#[from] crate::sasl::Error), @@ -60,13 +62,17 @@ pub enum AuthErrorImpl { Io(#[from] io::Error), #[error( - "This IP address is not allowed to connect to this endpoint. \ - Please add it to the allowed list in the Neon console." + "This IP address {0} is not allowed to connect to this endpoint. \ + Please add it to the allowed list in the Neon console. \ + Make sure to check for IPv4 or IPv6 addresses." )] - IpAddressNotAllowed, + IpAddressNotAllowed(IpAddr), #[error("Too many connections to this endpoint. 
Please try again later.")] TooManyConnections, + + #[error("Authentication timed out")] + UserTimeout(Elapsed), } #[derive(Debug, Error)] @@ -82,8 +88,8 @@ impl AuthError { AuthErrorImpl::AuthFailed(user.into()).into() } - pub fn ip_address_not_allowed() -> Self { - AuthErrorImpl::IpAddressNotAllowed.into() + pub fn ip_address_not_allowed(ip: IpAddr) -> Self { + AuthErrorImpl::IpAddressNotAllowed(ip).into() } pub fn too_many_connections() -> Self { @@ -93,6 +99,10 @@ impl AuthError { pub fn is_auth_failed(&self) -> bool { matches!(self.0.as_ref(), AuthErrorImpl::AuthFailed(_)) } + + pub fn user_timeout(elapsed: Elapsed) -> Self { + AuthErrorImpl::UserTimeout(elapsed).into() + } } impl> From for AuthError { @@ -107,15 +117,34 @@ impl UserFacingError for AuthError { match self.0.as_ref() { Link(e) => e.to_string_client(), GetAuthInfo(e) => e.to_string_client(), - WakeCompute(e) => e.to_string_client(), Sasl(e) => e.to_string_client(), AuthFailed(_) => self.to_string(), BadAuthMethod(_) => self.to_string(), MalformedPassword(_) => self.to_string(), MissingEndpointName => self.to_string(), Io(_) => "Internal error".to_string(), - IpAddressNotAllowed => self.to_string(), + IpAddressNotAllowed(_) => self.to_string(), TooManyConnections => self.to_string(), + UserTimeout(_) => self.to_string(), + } + } +} + +impl ReportableError for AuthError { + fn get_error_kind(&self) -> crate::error::ErrorKind { + use AuthErrorImpl::*; + match self.0.as_ref() { + Link(e) => e.get_error_kind(), + GetAuthInfo(e) => e.get_error_kind(), + Sasl(e) => e.get_error_kind(), + AuthFailed(_) => crate::error::ErrorKind::User, + BadAuthMethod(_) => crate::error::ErrorKind::User, + MalformedPassword(_) => crate::error::ErrorKind::User, + MissingEndpointName => crate::error::ErrorKind::User, + Io(_) => crate::error::ErrorKind::ClientDisconnect, + IpAddressNotAllowed(_) => crate::error::ErrorKind::User, + TooManyConnections => crate::error::ErrorKind::RateLimit, + UserTimeout(_) => 
crate::error::ErrorKind::User, } } } diff --git a/proxy/src/auth/backend.rs b/proxy/src/auth/backend.rs index b1634906c9..11af85caa4 100644 --- a/proxy/src/auth/backend.rs +++ b/proxy/src/auth/backend.rs @@ -9,11 +9,10 @@ use crate::auth::credentials::check_peer_addr_is_in_list; use crate::auth::validate_password_and_exchange; use crate::cache::Cached; use crate::console::errors::GetAuthInfoError; -use crate::console::provider::ConsoleBackend; -use crate::console::AuthSecret; +use crate::console::provider::{CachedRoleSecret, ConsoleBackend}; +use crate::console::{AuthSecret, NodeInfo}; use crate::context::RequestMonitoring; -use crate::proxy::connect_compute::handle_try_wake; -use crate::proxy::retry::retry_after; +use crate::proxy::connect_compute::ComputeConnectBackend; use crate::proxy::NeonOptions; use crate::stream::Stream; use crate::{ @@ -27,14 +26,26 @@ use crate::{ stream, url, }; use crate::{scram, EndpointCacheKey, EndpointId, RoleName}; -use futures::TryFutureExt; -use std::borrow::Cow; -use std::ops::ControlFlow; use std::sync::Arc; use tokio::io::{AsyncRead, AsyncWrite}; -use tracing::{error, info, warn}; +use tracing::info; -use super::IpPattern; +/// Alternative to [`std::borrow::Cow`] but doesn't need `T: ToOwned` as we don't need that functionality +pub enum MaybeOwned<'a, T> { + Owned(T), + Borrowed(&'a T), +} + +impl std::ops::Deref for MaybeOwned<'_, T> { + type Target = T; + + fn deref(&self) -> &Self::Target { + match self { + MaybeOwned::Owned(t) => t, + MaybeOwned::Borrowed(t) => t, + } + } +} /// This type serves two purposes: /// @@ -44,22 +55,22 @@ use super::IpPattern; /// * However, when we substitute `T` with [`ComputeUserInfoMaybeEndpoint`], /// this helps us provide the credentials only to those auth /// backends which require them for the authentication process. -pub enum BackendType<'a, T> { +pub enum BackendType<'a, T, D> { /// Cloud API (V2). 
- Console(Cow<'a, ConsoleBackend>, T), + Console(MaybeOwned<'a, ConsoleBackend>, T), /// Authentication via a web browser. - Link(Cow<'a, url::ApiUrl>), - #[cfg(test)] - /// Test backend. - Test(&'a dyn TestBackend), + Link(MaybeOwned<'a, url::ApiUrl>, D), } pub trait TestBackend: Send + Sync + 'static { fn wake_compute(&self) -> Result; - fn get_allowed_ips(&self) -> Result, console::errors::GetAuthInfoError>; + fn get_allowed_ips_and_secret( + &self, + ) -> Result<(CachedAllowedIps, Option), console::errors::GetAuthInfoError>; + fn get_role_secret(&self) -> Result; } -impl std::fmt::Display for BackendType<'_, ()> { +impl std::fmt::Display for BackendType<'_, (), ()> { fn fmt(&self, fmt: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { use BackendType::*; match self { @@ -67,64 +78,57 @@ impl std::fmt::Display for BackendType<'_, ()> { ConsoleBackend::Console(endpoint) => { fmt.debug_tuple("Console").field(&endpoint.url()).finish() } - #[cfg(feature = "testing")] + #[cfg(any(test, feature = "testing"))] ConsoleBackend::Postgres(endpoint) => { fmt.debug_tuple("Postgres").field(&endpoint.url()).finish() } + #[cfg(test)] + ConsoleBackend::Test(_) => fmt.debug_tuple("Test").finish(), }, - Link(url) => fmt.debug_tuple("Link").field(&url.as_str()).finish(), - #[cfg(test)] - Test(_) => fmt.debug_tuple("Test").finish(), + Link(url, _) => fmt.debug_tuple("Link").field(&url.as_str()).finish(), } } } -impl BackendType<'_, T> { +impl BackendType<'_, T, D> { /// Very similar to [`std::option::Option::as_ref`]. /// This helps us pass structured config to async tasks. 
- pub fn as_ref(&self) -> BackendType<'_, &T> { + pub fn as_ref(&self) -> BackendType<'_, &T, &D> { use BackendType::*; match self { - Console(c, x) => Console(Cow::Borrowed(c), x), - Link(c) => Link(Cow::Borrowed(c)), - #[cfg(test)] - Test(x) => Test(*x), + Console(c, x) => Console(MaybeOwned::Borrowed(c), x), + Link(c, x) => Link(MaybeOwned::Borrowed(c), x), } } } -impl<'a, T> BackendType<'a, T> { +impl<'a, T, D> BackendType<'a, T, D> { /// Very similar to [`std::option::Option::map`]. /// Maps [`BackendType`] to [`BackendType`] by applying /// a function to a contained value. - pub fn map(self, f: impl FnOnce(T) -> R) -> BackendType<'a, R> { + pub fn map(self, f: impl FnOnce(T) -> R) -> BackendType<'a, R, D> { use BackendType::*; match self { Console(c, x) => Console(c, f(x)), - Link(c) => Link(c), - #[cfg(test)] - Test(x) => Test(x), + Link(c, x) => Link(c, x), } } } - -impl<'a, T, E> BackendType<'a, Result> { +impl<'a, T, D, E> BackendType<'a, Result, D> { /// Very similar to [`std::option::Option::transpose`]. /// This is most useful for error handling. - pub fn transpose(self) -> Result, E> { + pub fn transpose(self) -> Result, E> { use BackendType::*; match self { Console(c, x) => x.map(|x| Console(c, x)), - Link(c) => Ok(Link(c)), - #[cfg(test)] - Test(x) => Ok(Test(x)), + Link(c, x) => Ok(Link(c, x)), } } } -pub struct ComputeCredentials { +pub struct ComputeCredentials { pub info: ComputeUserInfo, - pub keys: T, + pub keys: ComputeCredentialKeys, } #[derive(Debug, Clone)] @@ -147,7 +151,6 @@ impl ComputeUserInfo { } pub enum ComputeCredentialKeys { - #[cfg(feature = "testing")] Password(Vec), AuthKeys(AuthKeys), } @@ -182,28 +185,35 @@ async fn auth_quirks( client: &mut stream::PqStream>, allow_cleartext: bool, config: &'static AuthenticationConfig, -) -> auth::Result> { +) -> auth::Result { // If there's no project so far, that entails that client doesn't // support SNI or other means of passing the endpoint (project) name. 
// We now expect to see a very specific payload in the place of password. let (info, unauthenticated_password) = match user_info.try_into() { Err(info) => { - let res = hacks::password_hack_no_authentication(info, client, &mut ctx.latency_timer) - .await?; - ctx.set_endpoint_id(Some(res.info.endpoint.clone())); - (res.info, Some(res.keys)) + let res = hacks::password_hack_no_authentication(ctx, info, client).await?; + + ctx.set_endpoint_id(res.info.endpoint.clone()); + let password = match res.keys { + ComputeCredentialKeys::Password(p) => p, + _ => unreachable!("password hack should return a password"), + }; + (res.info, Some(password)) } Ok(info) => (info, None), }; info!("fetching user's authentication info"); - let allowed_ips = api.get_allowed_ips(ctx, &info).await?; + let (allowed_ips, maybe_secret) = api.get_allowed_ips_and_secret(ctx, &info).await?; // check allowed list if !check_peer_addr_is_in_list(&ctx.peer_addr, &allowed_ips) { - return Err(auth::AuthError::ip_address_not_allowed()); + return Err(auth::AuthError::ip_address_not_allowed(ctx.peer_addr)); } - let cached_secret = api.get_role_secret(ctx, &info).await?; + let cached_secret = match maybe_secret { + Some(secret) => secret, + None => api.get_role_secret(ctx, &info).await?, + }; let secret = cached_secret.value.clone().unwrap_or_else(|| { // If we don't have an authentication secret, we mock one to @@ -242,7 +252,7 @@ async fn authenticate_with_secret( unauthenticated_password: Option>, allow_cleartext: bool, config: &'static AuthenticationConfig, -) -> auth::Result> { +) -> auth::Result { if let Some(password) = unauthenticated_password { let auth_outcome = validate_password_and_exchange(&password, secret)?; let keys = match auth_outcome { @@ -264,66 +274,22 @@ async fn authenticate_with_secret( // Perform cleartext auth if we're allowed to do that. // Currently, we use it for websocket connections (latency). 
if allow_cleartext { - return hacks::authenticate_cleartext(info, client, &mut ctx.latency_timer, secret).await; + ctx.set_auth_method(crate::context::AuthMethod::Cleartext); + return hacks::authenticate_cleartext(ctx, info, client, secret).await; } // Finally, proceed with the main auth flow (SCRAM-based). - classic::authenticate(info, client, config, &mut ctx.latency_timer, secret).await + classic::authenticate(ctx, info, client, config, secret).await } -/// Authenticate the user and then wake a compute (or retrieve an existing compute session from cache) -/// only if authentication was successfuly. -async fn auth_and_wake_compute( - ctx: &mut RequestMonitoring, - api: &impl console::Api, - user_info: ComputeUserInfoMaybeEndpoint, - client: &mut stream::PqStream>, - allow_cleartext: bool, - config: &'static AuthenticationConfig, -) -> auth::Result<(CachedNodeInfo, ComputeUserInfo)> { - let compute_credentials = - auth_quirks(ctx, api, user_info, client, allow_cleartext, config).await?; - - let mut num_retries = 0; - let mut node = loop { - let wake_res = api.wake_compute(ctx, &compute_credentials.info).await; - match handle_try_wake(wake_res, num_retries) { - Err(e) => { - error!(error = ?e, num_retries, retriable = false, "couldn't wake compute node"); - return Err(e.into()); - } - Ok(ControlFlow::Continue(e)) => { - warn!(error = ?e, num_retries, retriable = true, "couldn't wake compute node"); - } - Ok(ControlFlow::Break(n)) => break n, - } - - let wait_duration = retry_after(num_retries); - num_retries += 1; - tokio::time::sleep(wait_duration).await; - }; - - ctx.set_project(node.aux.clone()); - - match compute_credentials.keys { - #[cfg(feature = "testing")] - ComputeCredentialKeys::Password(password) => node.config.password(password), - ComputeCredentialKeys::AuthKeys(auth_keys) => node.config.auth_keys(auth_keys), - }; - - Ok((node, compute_credentials.info)) -} - -impl<'a> BackendType<'a, ComputeUserInfoMaybeEndpoint> { +impl<'a> BackendType<'a, 
ComputeUserInfoMaybeEndpoint, &()> { /// Get compute endpoint name from the credentials. pub fn get_endpoint(&self) -> Option { use BackendType::*; match self { Console(_, user_info) => user_info.endpoint_id.clone(), - Link(_) => Some("link".into()), - #[cfg(test)] - Test(_) => Some("test".into()), + Link(_, _) => Some("link".into()), } } @@ -333,9 +299,7 @@ impl<'a> BackendType<'a, ComputeUserInfoMaybeEndpoint> { match self { Console(_, user_info) => &user_info.user, - Link(_) => "link", - #[cfg(test)] - Test(_) => "test", + Link(_, _) => "link", } } @@ -347,7 +311,7 @@ impl<'a> BackendType<'a, ComputeUserInfoMaybeEndpoint> { client: &mut stream::PqStream>, allow_cleartext: bool, config: &'static AuthenticationConfig, - ) -> auth::Result<(CachedNodeInfo, BackendType<'a, ComputeUserInfo>)> { + ) -> auth::Result> { use BackendType::*; let res = match self { @@ -358,25 +322,17 @@ impl<'a> BackendType<'a, ComputeUserInfoMaybeEndpoint> { "performing authentication using the console" ); - let (cache_info, user_info) = - auth_and_wake_compute(ctx, &*api, user_info, client, allow_cleartext, config) - .await?; - (cache_info, BackendType::Console(api, user_info)) + let credentials = + auth_quirks(ctx, &*api, user_info, client, allow_cleartext, config).await?; + BackendType::Console(api, credentials) } // NOTE: this auth backend doesn't use client credentials. 
- Link(url) => { + Link(url, _) => { info!("performing link authentication"); - let node_info = link::authenticate(&url, client).await?; + let info = link::authenticate(ctx, &url, client).await?; - ( - CachedNodeInfo::new_uncached(node_info), - BackendType::Link(url), - ) - } - #[cfg(test)] - Test(_) => { - unreachable!("this function should never be called in the test backend") + BackendType::Link(url, info) } }; @@ -385,33 +341,70 @@ impl<'a> BackendType<'a, ComputeUserInfoMaybeEndpoint> { } } -impl BackendType<'_, ComputeUserInfo> { - pub async fn get_allowed_ips( +impl BackendType<'_, ComputeUserInfo, &()> { + pub async fn get_role_secret( &self, ctx: &mut RequestMonitoring, - ) -> Result { + ) -> Result { use BackendType::*; match self { - Console(api, user_info) => api.get_allowed_ips(ctx, user_info).await, - Link(_) => Ok(Cached::new_uncached(Arc::new(vec![]))), - #[cfg(test)] - Test(x) => Ok(Cached::new_uncached(Arc::new(x.get_allowed_ips()?))), + Console(api, user_info) => api.get_role_secret(ctx, user_info).await, + Link(_, _) => Ok(Cached::new_uncached(None)), } } - /// When applicable, wake the compute node, gaining its connection info in the process. - /// The link auth flow doesn't support this, so we return [`None`] in that case. 
- pub async fn wake_compute( + pub async fn get_allowed_ips_and_secret( &self, ctx: &mut RequestMonitoring, - ) -> Result, console::errors::WakeComputeError> { + ) -> Result<(CachedAllowedIps, Option), GetAuthInfoError> { use BackendType::*; - match self { - Console(api, user_info) => api.wake_compute(ctx, user_info).map_ok(Some).await, - Link(_) => Ok(None), - #[cfg(test)] - Test(x) => x.wake_compute().map(Some), + Console(api, user_info) => api.get_allowed_ips_and_secret(ctx, user_info).await, + Link(_, _) => Ok((Cached::new_uncached(Arc::new(vec![])), None)), + } + } +} + +#[async_trait::async_trait] +impl ComputeConnectBackend for BackendType<'_, ComputeCredentials, NodeInfo> { + async fn wake_compute( + &self, + ctx: &mut RequestMonitoring, + ) -> Result { + use BackendType::*; + + match self { + Console(api, creds) => api.wake_compute(ctx, &creds.info).await, + Link(_, info) => Ok(Cached::new_uncached(info.clone())), + } + } + + fn get_keys(&self) -> Option<&ComputeCredentialKeys> { + match self { + BackendType::Console(_, creds) => Some(&creds.keys), + BackendType::Link(_, _) => None, + } + } +} + +#[async_trait::async_trait] +impl ComputeConnectBackend for BackendType<'_, ComputeCredentials, &()> { + async fn wake_compute( + &self, + ctx: &mut RequestMonitoring, + ) -> Result { + use BackendType::*; + + match self { + Console(api, creds) => api.wake_compute(ctx, &creds.info).await, + Link(_, _) => unreachable!("link auth flow doesn't support waking the compute"), + } + } + + fn get_keys(&self) -> Option<&ComputeCredentialKeys> { + match self { + BackendType::Console(_, creds) => Some(&creds.keys), + BackendType::Link(_, _) => None, } } } diff --git a/proxy/src/auth/backend/classic.rs b/proxy/src/auth/backend/classic.rs index 358b335b88..b98fa63120 100644 --- a/proxy/src/auth/backend/classic.rs +++ b/proxy/src/auth/backend/classic.rs @@ -4,7 +4,7 @@ use crate::{ compute, config::AuthenticationConfig, console::AuthSecret, - metrics::LatencyTimer, + 
context::RequestMonitoring, sasl, stream::{PqStream, Stream}, }; @@ -12,28 +12,26 @@ use tokio::io::{AsyncRead, AsyncWrite}; use tracing::{info, warn}; pub(super) async fn authenticate( + ctx: &mut RequestMonitoring, creds: ComputeUserInfo, client: &mut PqStream>, config: &'static AuthenticationConfig, - latency_timer: &mut LatencyTimer, secret: AuthSecret, -) -> auth::Result> { +) -> auth::Result { let flow = AuthFlow::new(client); let scram_keys = match secret { - #[cfg(feature = "testing")] + #[cfg(any(test, feature = "testing"))] AuthSecret::Md5(_) => { info!("auth endpoint chooses MD5"); return Err(auth::AuthError::bad_auth_method("MD5")); } AuthSecret::Scram(secret) => { info!("auth endpoint chooses SCRAM"); - let scram = auth::Scram(&secret); + let scram = auth::Scram(&secret, &mut *ctx); let auth_outcome = tokio::time::timeout( config.scram_protocol_timeout, async { - // pause the timer while we communicate with the client - let _paused = latency_timer.pause(); flow.begin(scram).await.map_err(|error| { warn!(?error, "error sending scram acknowledgement"); @@ -45,9 +43,9 @@ pub(super) async fn authenticate( } ) .await - .map_err(|error| { - warn!("error processing scram messages error = authentication timed out, execution time exeeded {} seconds", config.scram_protocol_timeout.as_secs()); - auth::io::Error::new(auth::io::ErrorKind::TimedOut, error) + .map_err(|e| { + warn!("error processing scram messages error = authentication timed out, execution time exceeded {} seconds", config.scram_protocol_timeout.as_secs()); + auth::AuthError::user_timeout(e) })??; let client_key = match auth_outcome { diff --git a/proxy/src/auth/backend/hacks.rs b/proxy/src/auth/backend/hacks.rs index b6c1a92d3c..f7241be4a9 100644 --- a/proxy/src/auth/backend/hacks.rs +++ b/proxy/src/auth/backend/hacks.rs @@ -4,7 +4,7 @@ use super::{ use crate::{ auth::{self, AuthFlow}, console::AuthSecret, - metrics::LatencyTimer, + context::RequestMonitoring, sasl, stream::{self, Stream}, }; @@ 
-16,21 +16,25 @@ use tracing::{info, warn}; /// These properties are benefical for serverless JS workers, so we /// use this mechanism for websocket connections. pub async fn authenticate_cleartext( + ctx: &mut RequestMonitoring, info: ComputeUserInfo, client: &mut stream::PqStream>, - latency_timer: &mut LatencyTimer, secret: AuthSecret, -) -> auth::Result> { +) -> auth::Result { warn!("cleartext auth flow override is enabled, proceeding"); + ctx.set_auth_method(crate::context::AuthMethod::Cleartext); // pause the timer while we communicate with the client - let _paused = latency_timer.pause(); + let paused = ctx.latency_timer.pause(crate::metrics::Waiting::Client); - let auth_outcome = AuthFlow::new(client) + let auth_flow = AuthFlow::new(client) .begin(auth::CleartextPassword(secret)) - .await? - .authenticate() .await?; + drop(paused); + // cleartext auth is only allowed to the ws/http protocol. + // If we're here, we already received the password in the first message. + // Scram protocol will be executed on the proxy side. + let auth_outcome = auth_flow.authenticate().await?; let keys = match auth_outcome { sasl::Outcome::Success(key) => key, @@ -47,14 +51,15 @@ pub async fn authenticate_cleartext( /// Similar to [`authenticate_cleartext`], but there's a specific password format, /// and passwords are not yet validated (we don't know how to validate them!) 
pub async fn password_hack_no_authentication( + ctx: &mut RequestMonitoring, info: ComputeUserInfoNoEndpoint, client: &mut stream::PqStream>, - latency_timer: &mut LatencyTimer, -) -> auth::Result>> { +) -> auth::Result { warn!("project not specified, resorting to the password hack auth flow"); + ctx.set_auth_method(crate::context::AuthMethod::Cleartext); // pause the timer while we communicate with the client - let _paused = latency_timer.pause(); + let _paused = ctx.latency_timer.pause(crate::metrics::Waiting::Client); let payload = AuthFlow::new(client) .begin(auth::PasswordHack) @@ -71,6 +76,6 @@ pub async fn password_hack_no_authentication( options: info.options, endpoint: payload.endpoint, }, - keys: payload.password, + keys: ComputeCredentialKeys::Password(payload.password), }) } diff --git a/proxy/src/auth/backend/link.rs b/proxy/src/auth/backend/link.rs index a7ddd257b3..7db76f3d9e 100644 --- a/proxy/src/auth/backend/link.rs +++ b/proxy/src/auth/backend/link.rs @@ -1,7 +1,8 @@ use crate::{ auth, compute, console::{self, provider::NodeInfo}, - error::UserFacingError, + context::RequestMonitoring, + error::{ReportableError, UserFacingError}, stream::PqStream, waiters, }; @@ -13,10 +14,6 @@ use tracing::{info, info_span}; #[derive(Debug, Error)] pub enum LinkAuthError { - /// Authentication error reported by the console. 
- #[error("Authentication failed: {0}")] - AuthFailed(String), - #[error(transparent)] WaiterRegister(#[from] waiters::RegisterError), @@ -29,10 +26,16 @@ pub enum LinkAuthError { impl UserFacingError for LinkAuthError { fn to_string_client(&self) -> String { - use LinkAuthError::*; + "Internal error".to_string() + } +} + +impl ReportableError for LinkAuthError { + fn get_error_kind(&self) -> crate::error::ErrorKind { match self { - AuthFailed(_) => self.to_string(), - _ => "Internal error".to_string(), + LinkAuthError::WaiterRegister(_) => crate::error::ErrorKind::Service, + LinkAuthError::WaiterWait(_) => crate::error::ErrorKind::Service, + LinkAuthError::Io(_) => crate::error::ErrorKind::ClientDisconnect, } } } @@ -54,9 +57,12 @@ pub fn new_psql_session_id() -> String { } pub(super) async fn authenticate( + ctx: &mut RequestMonitoring, link_uri: &reqwest::Url, client: &mut PqStream, ) -> auth::Result { + ctx.set_auth_method(crate::context::AuthMethod::Web); + // registering waiter can fail if we get unlucky with rng. // just try again. let (psql_session_id, waiter) = loop { @@ -94,6 +100,11 @@ pub(super) async fn authenticate( .dbname(&db_info.dbname) .user(&db_info.user); + ctx.set_user(db_info.user.into()); + ctx.set_project(db_info.aux.clone()); + let cold_start_info = db_info.aux.cold_start_info.clone().unwrap_or_default(); + info!(?cold_start_info, "woken up a compute node"); + // Backwards compatibility. pg_sni_proxy uses "--" in domain names // while direct connections do not. Once we migrate to pg_sni_proxy // everywhere, we can remove this. diff --git a/proxy/src/auth/credentials.rs b/proxy/src/auth/credentials.rs index bdb79f2517..89773aa1ff 100644 --- a/proxy/src/auth/credentials.rs +++ b/proxy/src/auth/credentials.rs @@ -1,8 +1,13 @@ //! User credentials used in authentication. 
use crate::{ - auth::password_hack::parse_endpoint_param, context::RequestMonitoring, error::UserFacingError, - metrics::NUM_CONNECTION_ACCEPTED_BY_SNI, proxy::NeonOptions, EndpointId, RoleName, + auth::password_hack::parse_endpoint_param, + context::RequestMonitoring, + error::{ReportableError, UserFacingError}, + metrics::NUM_CONNECTION_ACCEPTED_BY_SNI, + proxy::NeonOptions, + serverless::SERVERLESS_DRIVER_SNI, + EndpointId, RoleName, }; use itertools::Itertools; use pq_proto::StartupMessageParams; @@ -38,6 +43,12 @@ pub enum ComputeUserInfoParseError { impl UserFacingError for ComputeUserInfoParseError {} +impl ReportableError for ComputeUserInfoParseError { + fn get_error_kind(&self) -> crate::error::ErrorKind { + crate::error::ErrorKind::User + } +} + /// Various client credentials which we use for authentication. /// Note that we don't store any kind of client key or password here. #[derive(Debug, Clone, PartialEq, Eq)] @@ -54,10 +65,10 @@ impl ComputeUserInfoMaybeEndpoint { } } -pub fn endpoint_sni<'a>( - sni: &'a str, +pub fn endpoint_sni( + sni: &str, common_names: &HashSet, -) -> Result<&'a str, ComputeUserInfoParseError> { +) -> Result, ComputeUserInfoParseError> { let Some((subdomain, common_name)) = sni.split_once('.') else { return Err(ComputeUserInfoParseError::UnknownCommonName { cn: sni.into() }); }; @@ -66,7 +77,10 @@ pub fn endpoint_sni<'a>( cn: common_name.into(), }); } - Ok(subdomain) + if subdomain == SERVERLESS_DRIVER_SNI { + return Ok(None); + } + Ok(Some(EndpointId::from(subdomain))) } impl ComputeUserInfoMaybeEndpoint { @@ -85,7 +99,9 @@ impl ComputeUserInfoMaybeEndpoint { // record the values if we have them ctx.set_application(params.get("application_name").map(SmolStr::from)); ctx.set_user(user.clone()); - ctx.set_endpoint_id(sni.map(EndpointId::from)); + if let Some(dbname) = params.get("database") { + ctx.set_dbname(dbname.into()); + } // Project name might be passed via PG's command-line options. 
let endpoint_option = params @@ -103,7 +119,7 @@ impl ComputeUserInfoMaybeEndpoint { let endpoint_from_domain = if let Some(sni_str) = sni { if let Some(cn) = common_names { - Some(EndpointId::from(endpoint_sni(sni_str, cn)?)) + endpoint_sni(sni_str, cn)? } else { None } @@ -117,14 +133,18 @@ impl ComputeUserInfoMaybeEndpoint { Some(Err(InconsistentProjectNames { domain, option })) } // Invariant: project name may not contain certain characters. - (a, b) => a.or(b).map(|name| match project_name_valid(&name) { + (a, b) => a.or(b).map(|name| match project_name_valid(name.as_ref()) { false => Err(MalformedProjectName(name)), true => Ok(name), }), } .transpose()?; - info!(%user, project = endpoint.as_deref(), "credentials"); + if let Some(ep) = &endpoint { + ctx.set_endpoint_id(ep.clone()); + } + + info!(%user, "credentials"); if sni.is_some() { info!("Connection with sni"); NUM_CONNECTION_ACCEPTED_BY_SNI @@ -146,7 +166,7 @@ impl ComputeUserInfoMaybeEndpoint { Ok(Self { user, - endpoint_id: endpoint.map(EndpointId::from), + endpoint_id: endpoint, options, }) } diff --git a/proxy/src/auth/flow.rs b/proxy/src/auth/flow.rs index 3151a77263..788381b6c0 100644 --- a/proxy/src/auth/flow.rs +++ b/proxy/src/auth/flow.rs @@ -4,9 +4,11 @@ use super::{backend::ComputeCredentialKeys, AuthErrorImpl, PasswordHackPayload}; use crate::{ config::TlsServerEndPoint, console::AuthSecret, + context::RequestMonitoring, sasl, scram, stream::{PqStream, Stream}, }; +use postgres_protocol::authentication::sasl::{SCRAM_SHA_256, SCRAM_SHA_256_PLUS}; use pq_proto::{BeAuthenticationSaslMessage, BeMessage, BeMessage as Be}; use std::io; use tokio::io::{AsyncRead, AsyncWrite}; @@ -23,7 +25,7 @@ pub trait AuthMethod { pub struct Begin; /// Use [SCRAM](crate::scram)-based auth in [`AuthFlow`]. 
-pub struct Scram<'a>(pub &'a scram::ServerSecret); +pub struct Scram<'a>(pub &'a scram::ServerSecret, pub &'a mut RequestMonitoring); impl AuthMethod for Scram<'_> { #[inline(always)] @@ -138,6 +140,11 @@ impl AuthFlow<'_, S, CleartextPassword> { impl AuthFlow<'_, S, Scram<'_>> { /// Perform user authentication. Raise an error in case authentication failed. pub async fn authenticate(self) -> super::Result> { + let Scram(secret, ctx) = self.state; + + // pause the timer while we communicate with the client + let _paused = ctx.latency_timer.pause(crate::metrics::Waiting::Client); + // Initial client message contains the chosen auth method's name. let msg = self.stream.read_password_message().await?; let sasl = sasl::FirstMessage::parse(&msg) @@ -148,9 +155,15 @@ impl AuthFlow<'_, S, Scram<'_>> { return Err(super::AuthError::bad_auth_method(sasl.method)); } + match sasl.method { + SCRAM_SHA_256 => ctx.auth_method = Some(crate::context::AuthMethod::ScramSha256), + SCRAM_SHA_256_PLUS => { + ctx.auth_method = Some(crate::context::AuthMethod::ScramSha256Plus) + } + _ => {} + } info!("client chooses {}", sasl.method); - let secret = self.state.0; let outcome = sasl::SaslStream::new(self.stream, sasl.message) .authenticate(scram::Exchange::new( secret, @@ -167,12 +180,12 @@ impl AuthFlow<'_, S, Scram<'_>> { } } -pub(super) fn validate_password_and_exchange( +pub(crate) fn validate_password_and_exchange( password: &[u8], secret: AuthSecret, ) -> super::Result> { match secret { - #[cfg(feature = "testing")] + #[cfg(any(test, feature = "testing"))] AuthSecret::Md5(_) => { // test only Ok(sasl::Outcome::Success(ComputeCredentialKeys::Password( diff --git a/proxy/src/bin/pg_sni_router.rs b/proxy/src/bin/pg_sni_router.rs index 1edbc1e7e7..385f7820cb 100644 --- a/proxy/src/bin/pg_sni_router.rs +++ b/proxy/src/bin/pg_sni_router.rs @@ -10,10 +10,11 @@ use itertools::Itertools; use proxy::config::TlsServerEndPoint; use proxy::context::RequestMonitoring; use 
proxy::proxy::run_until_cancelled; +use rustls::pki_types::PrivateKeyDer; use tokio::net::TcpListener; use anyhow::{anyhow, bail, ensure, Context}; -use clap::{self, Arg}; +use clap::Arg; use futures::TryFutureExt; use proxy::console::messages::MetricsAuxInfo; use proxy::stream::{PqStream, Stream}; @@ -76,37 +77,40 @@ async fn main() -> anyhow::Result<()> { (Some(key_path), Some(cert_path)) => { let key = { let key_bytes = std::fs::read(key_path).context("TLS key file")?; - let mut keys = rustls_pemfile::pkcs8_private_keys(&mut &key_bytes[..]) - .context(format!("Failed to read TLS keys at '{key_path}'"))?; + + let mut keys = + rustls_pemfile::pkcs8_private_keys(&mut &key_bytes[..]).collect_vec(); ensure!(keys.len() == 1, "keys.len() = {} (should be 1)", keys.len()); - keys.pop().map(rustls::PrivateKey).unwrap() + PrivateKeyDer::Pkcs8( + keys.pop() + .unwrap() + .context(format!("Failed to read TLS keys at '{key_path}'"))?, + ) }; let cert_chain_bytes = std::fs::read(cert_path) .context(format!("Failed to read TLS cert file at '{cert_path}.'"))?; - let cert_chain = { + let cert_chain: Vec<_> = { rustls_pemfile::certs(&mut &cert_chain_bytes[..]) - .context(format!( - "Failed to read TLS certificate chain from bytes from file at '{cert_path}'." - ))? - .into_iter() - .map(rustls::Certificate) - .collect_vec() + .try_collect() + .with_context(|| { + format!("Failed to read TLS certificate chain from bytes from file at '{cert_path}'.") + })? }; // needed for channel bindings let first_cert = cert_chain.first().context("missing certificate")?; let tls_server_end_point = TlsServerEndPoint::new(first_cert)?; - let tls_config = rustls::ServerConfig::builder() - .with_safe_default_cipher_suites() - .with_safe_default_kx_groups() - .with_protocol_versions(&[&rustls::version::TLS13, &rustls::version::TLS12])? - .with_no_client_auth() - .with_single_cert(cert_chain, key)? 
- .into(); + let tls_config = rustls::ServerConfig::builder_with_protocol_versions(&[ + &rustls::version::TLS13, + &rustls::version::TLS12, + ]) + .with_no_client_auth() + .with_single_cert(cert_chain, key)? + .into(); (tls_config, tls_server_end_point) } @@ -171,16 +175,8 @@ async fn task_main( .context("failed to set socket option")?; info!(%peer_addr, "serving"); - let mut ctx = - RequestMonitoring::new(session_id, peer_addr.ip(), "sni_router", "sni"); - handle_client( - &mut ctx, - dest_suffix, - tls_config, - tls_server_end_point, - socket, - ) - .await + let ctx = RequestMonitoring::new(session_id, peer_addr.ip(), "sni_router", "sni"); + handle_client(ctx, dest_suffix, tls_config, tls_server_end_point, socket).await } .unwrap_or_else(|e| { // Acknowledge that the task has finished with an error. @@ -240,13 +236,15 @@ async fn ssl_handshake( ?unexpected, "unexpected startup packet, rejecting connection" ); - stream.throw_error_str(ERR_INSECURE_CONNECTION).await? + stream + .throw_error_str(ERR_INSECURE_CONNECTION, proxy::error::ErrorKind::User) + .await? 
} } } async fn handle_client( - ctx: &mut RequestMonitoring, + mut ctx: RequestMonitoring, dest_suffix: Arc, tls_config: Arc, tls_server_end_point: TlsServerEndPoint, @@ -272,5 +270,10 @@ async fn handle_client( let client = tokio::net::TcpStream::connect(destination).await?; let metrics_aux: MetricsAuxInfo = Default::default(); - proxy::proxy::proxy_pass(ctx, tls_stream, client, metrics_aux).await + + // doesn't yet matter as pg-sni-router doesn't report analytics logs + ctx.set_success(); + ctx.log(); + + proxy::proxy::passthrough::proxy_pass(tls_stream, client, metrics_aux).await } diff --git a/proxy/src/bin/proxy.rs b/proxy/src/bin/proxy.rs index ba113a89eb..b3d4fc0411 100644 --- a/proxy/src/bin/proxy.rs +++ b/proxy/src/bin/proxy.rs @@ -1,5 +1,8 @@ use futures::future::Either; use proxy::auth; +use proxy::auth::backend::MaybeOwned; +use proxy::cancellation::CancelMap; +use proxy::cancellation::CancellationHandler; use proxy::config::AuthenticationConfig; use proxy::config::CacheOptions; use proxy::config::HttpConfig; @@ -11,16 +14,18 @@ use proxy::rate_limiter::EndpointRateLimiter; use proxy::rate_limiter::RateBucketInfo; use proxy::rate_limiter::RateLimiterConfig; use proxy::redis::notifications; +use proxy::redis::publisher::RedisPublisherClient; use proxy::serverless::GlobalConnPoolOptions; use proxy::usage_metrics; use anyhow::bail; use proxy::config::{self, ProxyConfig}; use proxy::serverless; +use std::net::SocketAddr; use std::pin::pin; use std::sync::Arc; -use std::{borrow::Cow, net::SocketAddr}; use tokio::net::TcpListener; +use tokio::sync::Mutex; use tokio::task::JoinSet; use tokio_util::sync::CancellationToken; use tracing::info; @@ -32,6 +37,9 @@ project_build_tag!(BUILD_TAG); use clap::{Parser, ValueEnum}; +#[global_allocator] +static GLOBAL: tikv_jemallocator::Jemalloc = tikv_jemallocator::Jemalloc; + #[derive(Clone, Debug, ValueEnum)] enum AuthBackend { Console, @@ -84,6 +92,9 @@ struct ProxyCliArgs { /// path to directory with TLS certificates 
for client postgres connections #[clap(long)] certs_dir: Option, + /// timeout for the TLS handshake + #[clap(long, default_value = "15s", value_parser = humantime::parse_duration)] + handshake_timeout: tokio::time::Duration, /// http endpoint to receive periodic metric updates #[clap(long)] metric_collection_endpoint: Option, @@ -122,6 +133,9 @@ struct ProxyCliArgs { /// Can be given multiple times for different bucket sizes. #[clap(long, default_values_t = RateBucketInfo::DEFAULT_SET)] endpoint_rps_limit: Vec, + /// Redis rate limiter max number of requests per second. + #[clap(long, default_values_t = RateBucketInfo::DEFAULT_SET)] + redis_rps_limit: Vec, /// Initial limit for dynamic rate limiter. Makes sense only if `rate_limit_algorithm` is *not* `None`. #[clap(long, default_value_t = 100)] initial_limit: usize, @@ -161,6 +175,10 @@ struct SqlOverHttpArgs { #[clap(long, default_value_t = 20)] sql_over_http_pool_max_conns_per_endpoint: usize, + /// How many connections to pool for each endpoint. 
Excess connections are discarded + #[clap(long, default_value_t = 20000)] + sql_over_http_pool_max_total_conns: usize, + /// How long pooled connections should remain idle for before closing #[clap(long, default_value = "5m", value_parser = humantime::parse_duration)] sql_over_http_idle_timeout: tokio::time::Duration, @@ -187,6 +205,13 @@ async fn main() -> anyhow::Result<()> { info!("Build_tag: {BUILD_TAG}"); ::metrics::set_build_info_metric(GIT_VERSION, BUILD_TAG); + match proxy::jemalloc::MetricRecorder::new(prometheus::default_registry()) { + Ok(t) => { + t.start(); + } + Err(e) => tracing::error!(error = ?e, "could not start jemalloc metrics loop"), + } + let args = ProxyCliArgs::parse(); let config = build_config(&args)?; @@ -207,6 +232,19 @@ async fn main() -> anyhow::Result<()> { let cancellation_token = CancellationToken::new(); let endpoint_rate_limiter = Arc::new(EndpointRateLimiter::new(&config.endpoint_rps_limit)); + let cancel_map = CancelMap::default(); + let redis_publisher = match &args.redis_notifications { + Some(url) => Some(Arc::new(Mutex::new(RedisPublisherClient::new( + url, + args.region.clone(), + &config.redis_rps_limit, + )?))), + None => None, + }; + let cancellation_handler = Arc::new(CancellationHandler::new( + cancel_map.clone(), + redis_publisher, + )); // client facing tasks. these will exit on error or on cancellation // cancellation returns Ok(()) @@ -216,6 +254,7 @@ async fn main() -> anyhow::Result<()> { proxy_listener, cancellation_token.clone(), endpoint_rate_limiter.clone(), + cancellation_handler.clone(), )); // TODO: rename the argument to something like serverless. 
@@ -230,6 +269,7 @@ async fn main() -> anyhow::Result<()> { serverless_listener, cancellation_token.clone(), endpoint_rate_limiter.clone(), + cancellation_handler.clone(), )); } @@ -249,18 +289,18 @@ async fn main() -> anyhow::Result<()> { } if let auth::BackendType::Console(api, _) = &config.auth_backend { - match &**api { - proxy::console::provider::ConsoleBackend::Console(api) => { - let cache = api.caches.project_info.clone(); - if let Some(url) = args.redis_notifications { - info!("Starting redis notifications listener ({url})"); - maintenance_tasks - .spawn(notifications::task_main(url.to_owned(), cache.clone())); - } - maintenance_tasks.spawn(async move { cache.clone().gc_worker().await }); + if let proxy::console::provider::ConsoleBackend::Console(api) = &**api { + let cache = api.caches.project_info.clone(); + if let Some(url) = args.redis_notifications { + info!("Starting redis notifications listener ({url})"); + maintenance_tasks.spawn(notifications::task_main( + url.to_owned(), + cache.clone(), + cancel_map.clone(), + args.region.clone(), + )); } - #[cfg(feature = "testing")] - proxy::console::provider::ConsoleBackend::Postgres(_) => {} + maintenance_tasks.spawn(async move { cache.clone().gc_worker().await }); } } @@ -359,18 +399,18 @@ fn build_config(args: &ProxyCliArgs) -> anyhow::Result<&'static ProxyConfig> { let api = console::provider::neon::Api::new(endpoint, caches, locks); let api = console::provider::ConsoleBackend::Console(api); - auth::BackendType::Console(Cow::Owned(api), ()) + auth::BackendType::Console(MaybeOwned::Owned(api), ()) } #[cfg(feature = "testing")] AuthBackend::Postgres => { let url = args.auth_endpoint.parse()?; let api = console::provider::mock::Api::new(url); let api = console::provider::ConsoleBackend::Postgres(api); - auth::BackendType::Console(Cow::Owned(api), ()) + auth::BackendType::Console(MaybeOwned::Owned(api), ()) } AuthBackend::Link => { let url = args.uri.parse()?; - auth::BackendType::Link(Cow::Owned(url)) + 
auth::BackendType::Link(MaybeOwned::Owned(url), ()) } }; let http_config = HttpConfig { @@ -381,6 +421,7 @@ fn build_config(args: &ProxyCliArgs) -> anyhow::Result<&'static ProxyConfig> { pool_shards: args.sql_over_http.sql_over_http_pool_shards, idle_timeout: args.sql_over_http.sql_over_http_idle_timeout, opt_in: args.sql_over_http.sql_over_http_pool_opt_in, + max_total_conns: args.sql_over_http.sql_over_http_pool_max_total_conns, }, }; let authentication_config = AuthenticationConfig { @@ -389,6 +430,8 @@ fn build_config(args: &ProxyCliArgs) -> anyhow::Result<&'static ProxyConfig> { let mut endpoint_rps_limit = args.endpoint_rps_limit.clone(); RateBucketInfo::validate(&mut endpoint_rps_limit)?; + let mut redis_rps_limit = args.redis_rps_limit.clone(); + RateBucketInfo::validate(&mut redis_rps_limit)?; let config = Box::leak(Box::new(ProxyConfig { tls_config, @@ -400,6 +443,8 @@ fn build_config(args: &ProxyCliArgs) -> anyhow::Result<&'static ProxyConfig> { require_client_ip: args.require_client_ip, disable_ip_check_for_http: args.disable_ip_check_for_http, endpoint_rps_limit, + redis_rps_limit, + handshake_timeout: args.handshake_timeout, // TODO: add this argument region: args.region.clone(), })); diff --git a/proxy/src/cache/project_info.rs b/proxy/src/cache/project_info.rs index 6f37868a8c..6e3eb8c1b0 100644 --- a/proxy/src/cache/project_info.rs +++ b/proxy/src/cache/project_info.rs @@ -12,15 +12,18 @@ use tokio::time::Instant; use tracing::{debug, info}; use crate::{ - auth::IpPattern, config::ProjectInfoCacheOptions, console::AuthSecret, EndpointId, ProjectId, - RoleName, + auth::IpPattern, + config::ProjectInfoCacheOptions, + console::AuthSecret, + intern::{EndpointIdInt, ProjectIdInt, RoleNameInt}, + EndpointId, ProjectId, RoleName, }; use super::{Cache, Cached}; pub trait ProjectInfoCache { - fn invalidate_allowed_ips_for_project(&self, project_id: &ProjectId); - fn invalidate_role_secret_for_project(&self, project_id: &ProjectId, role_name: &RoleName); + 
fn invalidate_allowed_ips_for_project(&self, project_id: ProjectIdInt); + fn invalidate_role_secret_for_project(&self, project_id: ProjectIdInt, role_name: RoleNameInt); fn enable_ttl(&self); fn disable_ttl(&self); } @@ -47,7 +50,7 @@ impl From for Entry { #[derive(Default)] struct EndpointInfo { - secret: std::collections::HashMap>>, + secret: std::collections::HashMap>>, allowed_ips: Option>>>, } @@ -60,11 +63,11 @@ impl EndpointInfo { } pub fn get_role_secret( &self, - role_name: &RoleName, + role_name: RoleNameInt, valid_since: Instant, ignore_cache_since: Option, ) -> Option<(Option, bool)> { - if let Some(secret) = self.secret.get(role_name) { + if let Some(secret) = self.secret.get(&role_name) { if valid_since < secret.created_at { return Some(( secret.value.clone(), @@ -93,8 +96,8 @@ impl EndpointInfo { pub fn invalidate_allowed_ips(&mut self) { self.allowed_ips = None; } - pub fn invalidate_role_secret(&mut self, role_name: &RoleName) { - self.secret.remove(role_name); + pub fn invalidate_role_secret(&mut self, role_name: RoleNameInt) { + self.secret.remove(&role_name); } } @@ -106,9 +109,9 @@ impl EndpointInfo { /// One may ask, why the data is stored per project, when on the user request there is only data about the endpoint available? /// On the cplane side updates are done per project (or per branch), so it's easier to invalidate the whole project cache. 
pub struct ProjectInfoCacheImpl { - cache: DashMap, + cache: DashMap, - project2ep: DashMap>, + project2ep: DashMap>, config: ProjectInfoCacheOptions, start_time: Instant, @@ -116,11 +119,11 @@ pub struct ProjectInfoCacheImpl { } impl ProjectInfoCache for ProjectInfoCacheImpl { - fn invalidate_allowed_ips_for_project(&self, project_id: &ProjectId) { + fn invalidate_allowed_ips_for_project(&self, project_id: ProjectIdInt) { info!("invalidating allowed ips for project `{}`", project_id); let endpoints = self .project2ep - .get(project_id) + .get(&project_id) .map(|kv| kv.value().clone()) .unwrap_or_default(); for endpoint_id in endpoints { @@ -129,14 +132,14 @@ impl ProjectInfoCache for ProjectInfoCacheImpl { } } } - fn invalidate_role_secret_for_project(&self, project_id: &ProjectId, role_name: &RoleName) { + fn invalidate_role_secret_for_project(&self, project_id: ProjectIdInt, role_name: RoleNameInt) { info!( "invalidating role secret for project_id `{}` and role_name `{}`", - project_id, role_name + project_id, role_name, ); let endpoints = self .project2ep - .get(project_id) + .get(&project_id) .map(|kv| kv.value().clone()) .unwrap_or_default(); for endpoint_id in endpoints { @@ -173,15 +176,17 @@ impl ProjectInfoCacheImpl { endpoint_id: &EndpointId, role_name: &RoleName, ) -> Option>> { + let endpoint_id = EndpointIdInt::get(endpoint_id)?; + let role_name = RoleNameInt::get(role_name)?; let (valid_since, ignore_cache_since) = self.get_cache_times(); - let endpoint_info = self.cache.get(endpoint_id)?; + let endpoint_info = self.cache.get(&endpoint_id)?; let (value, ignore_cache) = endpoint_info.get_role_secret(role_name, valid_since, ignore_cache_since)?; if !ignore_cache { let cached = Cached { token: Some(( self, - CachedLookupInfo::new_role_secret(endpoint_id.clone(), role_name.clone()), + CachedLookupInfo::new_role_secret(endpoint_id, role_name), )), value, }; @@ -193,13 +198,14 @@ impl ProjectInfoCacheImpl { &self, endpoint_id: &EndpointId, ) -> Option>>> { 
+ let endpoint_id = EndpointIdInt::get(endpoint_id)?; let (valid_since, ignore_cache_since) = self.get_cache_times(); - let endpoint_info = self.cache.get(endpoint_id)?; + let endpoint_info = self.cache.get(&endpoint_id)?; let value = endpoint_info.get_allowed_ips(valid_since, ignore_cache_since); let (value, ignore_cache) = value?; if !ignore_cache { let cached = Cached { - token: Some((self, CachedLookupInfo::new_allowed_ips(endpoint_id.clone()))), + token: Some((self, CachedLookupInfo::new_allowed_ips(endpoint_id))), value, }; return Some(cached); @@ -213,14 +219,17 @@ impl ProjectInfoCacheImpl { role_name: &RoleName, secret: Option, ) { + let project_id = ProjectIdInt::from(project_id); + let endpoint_id = EndpointIdInt::from(endpoint_id); + let role_name = RoleNameInt::from(role_name); if self.cache.len() >= self.config.size { // If there are too many entries, wait until the next gc cycle. return; } - self.inser_project2endpoint(project_id, endpoint_id); - let mut entry = self.cache.entry(endpoint_id.clone()).or_default(); + self.insert_project2endpoint(project_id, endpoint_id); + let mut entry = self.cache.entry(endpoint_id).or_default(); if entry.secret.len() < self.config.max_roles { - entry.secret.insert(role_name.clone(), secret.into()); + entry.secret.insert(role_name, secret.into()); } } pub fn insert_allowed_ips( @@ -229,22 +238,21 @@ impl ProjectInfoCacheImpl { endpoint_id: &EndpointId, allowed_ips: Arc>, ) { + let project_id = ProjectIdInt::from(project_id); + let endpoint_id = EndpointIdInt::from(endpoint_id); if self.cache.len() >= self.config.size { // If there are too many entries, wait until the next gc cycle. 
return; } - self.inser_project2endpoint(project_id, endpoint_id); - self.cache - .entry(endpoint_id.clone()) - .or_default() - .allowed_ips = Some(allowed_ips.into()); + self.insert_project2endpoint(project_id, endpoint_id); + self.cache.entry(endpoint_id).or_default().allowed_ips = Some(allowed_ips.into()); } - fn inser_project2endpoint(&self, project_id: &ProjectId, endpoint_id: &EndpointId) { - if let Some(mut endpoints) = self.project2ep.get_mut(project_id) { - endpoints.insert(endpoint_id.clone()); + fn insert_project2endpoint(&self, project_id: ProjectIdInt, endpoint_id: EndpointIdInt) { + if let Some(mut endpoints) = self.project2ep.get_mut(&project_id) { + endpoints.insert(endpoint_id); } else { self.project2ep - .insert(project_id.clone(), HashSet::from([endpoint_id.clone()])); + .insert(project_id, HashSet::from([endpoint_id])); } } fn get_cache_times(&self) -> (Instant, Option) { @@ -300,18 +308,18 @@ impl ProjectInfoCacheImpl { /// This is used to invalidate cache entries. pub struct CachedLookupInfo { /// Search by this key. 
- endpoint_id: EndpointId, + endpoint_id: EndpointIdInt, lookup_type: LookupType, } impl CachedLookupInfo { - pub(self) fn new_role_secret(endpoint_id: EndpointId, role_name: RoleName) -> Self { + pub(self) fn new_role_secret(endpoint_id: EndpointIdInt, role_name: RoleNameInt) -> Self { Self { endpoint_id, lookup_type: LookupType::RoleSecret(role_name), } } - pub(self) fn new_allowed_ips(endpoint_id: EndpointId) -> Self { + pub(self) fn new_allowed_ips(endpoint_id: EndpointIdInt) -> Self { Self { endpoint_id, lookup_type: LookupType::AllowedIps, @@ -320,7 +328,7 @@ impl CachedLookupInfo { } enum LookupType { - RoleSecret(RoleName), + RoleSecret(RoleNameInt), AllowedIps, } @@ -335,7 +343,7 @@ impl Cache for ProjectInfoCacheImpl { match &key.lookup_type { LookupType::RoleSecret(role_name) => { if let Some(mut endpoint_info) = self.cache.get_mut(&key.endpoint_id) { - endpoint_info.invalidate_role_secret(role_name); + endpoint_info.invalidate_role_secret(*role_name); } } LookupType::AllowedIps => { @@ -350,8 +358,7 @@ impl Cache for ProjectInfoCacheImpl { #[cfg(test)] mod tests { use super::*; - use crate::{console::AuthSecret, scram::ServerSecret}; - use std::{sync::Arc, time::Duration}; + use crate::scram::ServerSecret; #[tokio::test] async fn test_project_info_cache_settings() { @@ -457,7 +464,7 @@ mod tests { assert_eq!(cached.value, secret2); // The only way to invalidate this value is to invalidate via the api. 
- cache.invalidate_role_secret_for_project(&project_id, &user2); + cache.invalidate_role_secret_for_project((&project_id).into(), (&user2).into()); assert!(cache.get_role_secret(&endpoint_id, &user2).is_none()); let cached = cache.get_allowed_ips(&endpoint_id).unwrap(); diff --git a/proxy/src/cancellation.rs b/proxy/src/cancellation.rs index a5eb3544b4..c9607909b3 100644 --- a/proxy/src/cancellation.rs +++ b/proxy/src/cancellation.rs @@ -1,73 +1,154 @@ -use anyhow::{bail, Context}; +use async_trait::async_trait; use dashmap::DashMap; use pq_proto::CancelKeyData; -use std::net::SocketAddr; +use std::{net::SocketAddr, sync::Arc}; +use thiserror::Error; use tokio::net::TcpStream; +use tokio::sync::Mutex; use tokio_postgres::{CancelToken, NoTls}; use tracing::info; +use uuid::Uuid; + +use crate::{ + error::ReportableError, metrics::NUM_CANCELLATION_REQUESTS, + redis::publisher::RedisPublisherClient, +}; + +pub type CancelMap = Arc>>; /// Enables serving `CancelRequest`s. -#[derive(Default)] -pub struct CancelMap(DashMap>); +/// +/// If there is a `RedisPublisherClient` available, it will be used to publish the cancellation key to other proxy instances. +pub struct CancellationHandler { + map: CancelMap, + redis_client: Option>>, +} -impl CancelMap { +#[derive(Debug, Error)] +pub enum CancelError { + #[error("{0}")] + IO(#[from] std::io::Error), + #[error("{0}")] + Postgres(#[from] tokio_postgres::Error), +} + +impl ReportableError for CancelError { + fn get_error_kind(&self) -> crate::error::ErrorKind { + match self { + CancelError::IO(_) => crate::error::ErrorKind::Compute, + CancelError::Postgres(e) if e.as_db_error().is_some() => { + crate::error::ErrorKind::Postgres + } + CancelError::Postgres(_) => crate::error::ErrorKind::Compute, + } + } +} + +impl CancellationHandler { + pub fn new(map: CancelMap, redis_client: Option>>) -> Self { + Self { map, redis_client } + } /// Cancel a running query for the corresponding connection. 
- pub async fn cancel_session(&self, key: CancelKeyData) -> anyhow::Result<()> { + pub async fn cancel_session( + &self, + key: CancelKeyData, + session_id: Uuid, + ) -> Result<(), CancelError> { + let from = "from_client"; // NB: we should immediately release the lock after cloning the token. - let cancel_closure = self - .0 - .get(&key) - .and_then(|x| x.clone()) - .with_context(|| format!("query cancellation key not found: {key}"))?; - + let Some(cancel_closure) = self.map.get(&key).and_then(|x| x.clone()) else { + tracing::warn!("query cancellation key not found: {key}"); + if let Some(redis_client) = &self.redis_client { + NUM_CANCELLATION_REQUESTS + .with_label_values(&[from, "not_found"]) + .inc(); + info!("publishing cancellation key to Redis"); + match redis_client.lock().await.try_publish(key, session_id).await { + Ok(()) => { + info!("cancellation key successfuly published to Redis"); + } + Err(e) => { + tracing::error!("failed to publish a message: {e}"); + return Err(CancelError::IO(std::io::Error::new( + std::io::ErrorKind::Other, + e.to_string(), + ))); + } + } + } + return Ok(()); + }; + NUM_CANCELLATION_REQUESTS + .with_label_values(&[from, "found"]) + .inc(); info!("cancelling query per user's request using key {key}"); cancel_closure.try_cancel_query().await } /// Run async action within an ephemeral session identified by [`CancelKeyData`]. - pub async fn with_session<'a, F, R, V>(&'a self, f: F) -> anyhow::Result - where - F: FnOnce(Session<'a>) -> R, - R: std::future::Future>, - { + pub fn get_session(self: Arc) -> Session { // HACK: We'd rather get the real backend_pid but tokio_postgres doesn't // expose it and we don't want to do another roundtrip to query // for it. The client will be able to notice that this is not the // actual backend_pid, but backend_pid is not used for anything // so it doesn't matter. 
- let key = rand::random(); + let key = loop { + let key = rand::random(); - // Random key collisions are unlikely to happen here, but they're still possible, - // which is why we have to take care not to rewrite an existing key. - match self.0.entry(key) { - dashmap::mapref::entry::Entry::Occupied(_) => { - bail!("query cancellation key already exists: {key}") + // Random key collisions are unlikely to happen here, but they're still possible, + // which is why we have to take care not to rewrite an existing key. + match self.map.entry(key) { + dashmap::mapref::entry::Entry::Occupied(_) => continue, + dashmap::mapref::entry::Entry::Vacant(e) => { + e.insert(None); + } } - dashmap::mapref::entry::Entry::Vacant(e) => { - e.insert(None); - } - } - - // This will guarantee that the session gets dropped - // as soon as the future is finished. - scopeguard::defer! { - self.0.remove(&key); - info!("dropped query cancellation key {key}"); - } + break key; + }; info!("registered new query cancellation key {key}"); - let session = Session::new(key, self); - f(session).await + Session { + key, + cancellation_handler: self, + } } #[cfg(test)] fn contains(&self, session: &Session) -> bool { - self.0.contains_key(&session.key) + self.map.contains_key(&session.key) } #[cfg(test)] fn is_empty(&self) -> bool { - self.0.is_empty() + self.map.is_empty() + } +} + +#[async_trait] +pub trait NotificationsCancellationHandler { + async fn cancel_session_no_publish(&self, key: CancelKeyData) -> Result<(), CancelError>; +} + +#[async_trait] +impl NotificationsCancellationHandler for CancellationHandler { + async fn cancel_session_no_publish(&self, key: CancelKeyData) -> Result<(), CancelError> { + let from = "from_redis"; + let cancel_closure = self.map.get(&key).and_then(|x| x.clone()); + match cancel_closure { + Some(cancel_closure) => { + NUM_CANCELLATION_REQUESTS + .with_label_values(&[from, "found"]) + .inc(); + cancel_closure.try_cancel_query().await + } + None => { + 
NUM_CANCELLATION_REQUESTS + .with_label_values(&[from, "not_found"]) + .inc(); + tracing::warn!("query cancellation key not found: {key}"); + Ok(()) + } + } } } @@ -87,72 +168,59 @@ impl CancelClosure { cancel_token, } } - /// Cancels the query running on user's compute node. - pub async fn try_cancel_query(self) -> anyhow::Result<()> { + pub async fn try_cancel_query(self) -> Result<(), CancelError> { let socket = TcpStream::connect(self.socket_addr).await?; self.cancel_token.cancel_query_raw(socket, NoTls).await?; - + info!("query was cancelled"); Ok(()) } } /// Helper for registering query cancellation tokens. -pub struct Session<'a> { +pub struct Session { /// The user-facing key identifying this session. key: CancelKeyData, /// The [`CancelMap`] this session belongs to. - cancel_map: &'a CancelMap, + cancellation_handler: Arc, } -impl<'a> Session<'a> { - fn new(key: CancelKeyData, cancel_map: &'a CancelMap) -> Self { - Self { key, cancel_map } +impl Session { + /// Store the cancel token for the given session. + /// This enables query cancellation in `crate::proxy::prepare_client_connection`. + pub fn enable_query_cancellation(&self, cancel_closure: CancelClosure) -> CancelKeyData { + info!("enabling query cancellation for this session"); + self.cancellation_handler + .map + .insert(self.key, Some(cancel_closure)); + + self.key } } -impl Session<'_> { - /// Store the cancel token for the given session. - /// This enables query cancellation in `crate::proxy::prepare_client_connection`. 
- pub fn enable_query_cancellation(self, cancel_closure: CancelClosure) -> CancelKeyData { - info!("enabling query cancellation for this session"); - self.cancel_map.0.insert(self.key, Some(cancel_closure)); - - self.key +impl Drop for Session { + fn drop(&mut self) { + self.cancellation_handler.map.remove(&self.key); + info!("dropped query cancellation key {}", &self.key); } } #[cfg(test)] mod tests { use super::*; - use once_cell::sync::Lazy; #[tokio::test] async fn check_session_drop() -> anyhow::Result<()> { - static CANCEL_MAP: Lazy = Lazy::new(Default::default); - - let (tx, rx) = tokio::sync::oneshot::channel(); - let task = tokio::spawn(CANCEL_MAP.with_session(|session| async move { - assert!(CANCEL_MAP.contains(&session)); - - tx.send(()).expect("failed to send"); - futures::future::pending::<()>().await; // sleep forever - - Ok(()) - })); - - // Wait until the task has been spawned. - rx.await.context("failed to hear from the task")?; - - // Drop the session's entry by cancelling the task. - task.abort(); - let error = task.await.expect_err("task should have failed"); - if !error.is_cancelled() { - anyhow::bail!(error); - } + let cancellation_handler = Arc::new(CancellationHandler { + map: CancelMap::default(), + redis_client: None, + }); + let session = cancellation_handler.clone().get_session(); + assert!(cancellation_handler.contains(&session)); + drop(session); // Check that the session has been dropped. 
- assert!(CANCEL_MAP.is_empty()); + assert!(cancellation_handler.is_empty()); Ok(()) } diff --git a/proxy/src/compute.rs b/proxy/src/compute.rs index aef1aab733..b61c1fb9ef 100644 --- a/proxy/src/compute.rs +++ b/proxy/src/compute.rs @@ -1,6 +1,10 @@ use crate::{ - auth::parse_endpoint_param, cancellation::CancelClosure, console::errors::WakeComputeError, - context::RequestMonitoring, error::UserFacingError, metrics::NUM_DB_CONNECTIONS_GAUGE, + auth::parse_endpoint_param, + cancellation::CancelClosure, + console::{errors::WakeComputeError, messages::MetricsAuxInfo}, + context::RequestMonitoring, + error::{ReportableError, UserFacingError}, + metrics::NUM_DB_CONNECTIONS_GAUGE, proxy::neon_option, }; use futures::{FutureExt, TryFutureExt}; @@ -58,6 +62,20 @@ impl UserFacingError for ConnectionError { } } +impl ReportableError for ConnectionError { + fn get_error_kind(&self) -> crate::error::ErrorKind { + match self { + ConnectionError::Postgres(e) if e.as_db_error().is_some() => { + crate::error::ErrorKind::Postgres + } + ConnectionError::Postgres(_) => crate::error::ErrorKind::Compute, + ConnectionError::CouldNotConnect(_) => crate::error::ErrorKind::Compute, + ConnectionError::TlsError(_) => crate::error::ErrorKind::Compute, + ConnectionError::WakeComputeError(e) => e.get_error_kind(), + } + } +} + /// A pair of `ClientKey` & `ServerKey` for `SCRAM-SHA-256`. pub type ScramKeys = tokio_postgres::config::ScramKeys<32>; @@ -75,7 +93,7 @@ impl ConnCfg { } /// Reuse password or auth keys from the other config. - pub fn reuse_password(&mut self, other: &Self) { + pub fn reuse_password(&mut self, other: Self) { if let Some(password) = other.get_password() { self.password(password); } @@ -235,6 +253,8 @@ pub struct PostgresConnection { pub params: std::collections::HashMap, /// Query cancellation token. pub cancel_closure: CancelClosure, + /// Labels for proxy's metrics. 
+ pub aux: MetricsAuxInfo, _guage: IntCounterPairGuard, } @@ -245,6 +265,7 @@ impl ConnCfg { &self, ctx: &mut RequestMonitoring, allow_self_signed_compute: bool, + aux: MetricsAuxInfo, timeout: Duration, ) -> Result { let (socket_addr, stream, host) = self.connect_raw(timeout).await?; @@ -279,6 +300,7 @@ impl ConnCfg { stream, params, cancel_closure, + aux, _guage: NUM_DB_CONNECTIONS_GAUGE .with_label_values(&[ctx.protocol]) .guard(), diff --git a/proxy/src/config.rs b/proxy/src/config.rs index 2c46458a49..437ec9f401 100644 --- a/proxy/src/config.rs +++ b/proxy/src/config.rs @@ -1,6 +1,10 @@ use crate::{auth, rate_limiter::RateBucketInfo, serverless::GlobalConnPoolOptions}; use anyhow::{bail, ensure, Context, Ok}; -use rustls::{sign, Certificate, PrivateKey}; +use itertools::Itertools; +use rustls::{ + crypto::ring::sign, + pki_types::{CertificateDer, PrivateKeyDer}, +}; use sha2::{Digest, Sha256}; use std::{ collections::{HashMap, HashSet}, @@ -13,7 +17,7 @@ use x509_parser::oid_registry; pub struct ProxyConfig { pub tls_config: Option, - pub auth_backend: auth::BackendType<'static, ()>, + pub auth_backend: auth::BackendType<'static, (), ()>, pub metric_collection: Option, pub allow_self_signed_compute: bool, pub http_config: HttpConfig, @@ -21,7 +25,9 @@ pub struct ProxyConfig { pub require_client_ip: bool, pub disable_ip_check_for_http: bool, pub endpoint_rps_limit: Vec, + pub redis_rps_limit: Vec, pub region: String, + pub handshake_timeout: Duration, } #[derive(Debug)] @@ -86,14 +92,14 @@ pub fn configure_tls( let cert_resolver = Arc::new(cert_resolver); - let config = rustls::ServerConfig::builder() - .with_safe_default_cipher_suites() - .with_safe_default_kx_groups() - // allow TLS 1.2 to be compatible with older client libraries - .with_protocol_versions(&[&rustls::version::TLS13, &rustls::version::TLS12])? 
- .with_no_client_auth() - .with_cert_resolver(cert_resolver.clone()) - .into(); + // allow TLS 1.2 to be compatible with older client libraries + let config = rustls::ServerConfig::builder_with_protocol_versions(&[ + &rustls::version::TLS13, + &rustls::version::TLS12, + ]) + .with_no_client_auth() + .with_cert_resolver(cert_resolver.clone()) + .into(); Ok(TlsConfig { config, @@ -131,14 +137,14 @@ pub enum TlsServerEndPoint { } impl TlsServerEndPoint { - pub fn new(cert: &Certificate) -> anyhow::Result { + pub fn new(cert: &CertificateDer) -> anyhow::Result { let sha256_oids = [ // I'm explicitly not adding MD5 or SHA1 here... They're bad. oid_registry::OID_SIG_ECDSA_WITH_SHA256, oid_registry::OID_PKCS1_SHA256WITHRSA, ]; - let pem = x509_parser::parse_x509_certificate(&cert.0) + let pem = x509_parser::parse_x509_certificate(cert) .context("Failed to parse PEM object from cerficiate")? .1; @@ -148,8 +154,7 @@ impl TlsServerEndPoint { let oid = pem.signature_algorithm.oid(); let alg = reg.get(oid); if sha256_oids.contains(oid) { - let tls_server_end_point: [u8; 32] = - Sha256::new().chain_update(&cert.0).finalize().into(); + let tls_server_end_point: [u8; 32] = Sha256::new().chain_update(cert).finalize().into(); info!(subject = %pem.subject, signature_algorithm = alg.map(|a| a.description()), tls_server_end_point = %base64::encode(tls_server_end_point), "determined channel binding"); Ok(Self::Sha256(tls_server_end_point)) } else { @@ -163,7 +168,7 @@ impl TlsServerEndPoint { } } -#[derive(Default)] +#[derive(Default, Debug)] pub struct CertResolver { certs: HashMap, TlsServerEndPoint)>, default: Option<(Arc, TlsServerEndPoint)>, @@ -183,11 +188,14 @@ impl CertResolver { let priv_key = { let key_bytes = std::fs::read(key_path) .context(format!("Failed to read TLS keys at '{key_path}'"))?; - let mut keys = rustls_pemfile::pkcs8_private_keys(&mut &key_bytes[..]) - .context(format!("Failed to parse TLS keys at '{key_path}'"))?; + let mut keys = 
rustls_pemfile::pkcs8_private_keys(&mut &key_bytes[..]).collect_vec(); ensure!(keys.len() == 1, "keys.len() = {} (should be 1)", keys.len()); - keys.pop().map(rustls::PrivateKey).unwrap() + PrivateKeyDer::Pkcs8( + keys.pop() + .unwrap() + .context(format!("Failed to parse TLS keys at '{key_path}'"))?, + ) }; let cert_chain_bytes = std::fs::read(cert_path) @@ -195,14 +203,10 @@ impl CertResolver { let cert_chain = { rustls_pemfile::certs(&mut &cert_chain_bytes[..]) + .try_collect() .with_context(|| { - format!( - "Failed to read TLS certificate chain from bytes from file at '{cert_path}'." - ) + format!("Failed to read TLS certificate chain from bytes from file at '{cert_path}'.") })? - .into_iter() - .map(rustls::Certificate) - .collect() }; self.add_cert(priv_key, cert_chain, is_default) @@ -210,15 +214,15 @@ impl CertResolver { pub fn add_cert( &mut self, - priv_key: PrivateKey, - cert_chain: Vec, + priv_key: PrivateKeyDer<'static>, + cert_chain: Vec>, is_default: bool, ) -> anyhow::Result<()> { let key = sign::any_supported_type(&priv_key).context("invalid private key")?; let first_cert = &cert_chain[0]; let tls_server_end_point = TlsServerEndPoint::new(first_cert)?; - let pem = x509_parser::parse_x509_certificate(&first_cert.0) + let pem = x509_parser::parse_x509_certificate(first_cert) .context("Failed to parse PEM object from cerficiate")? .1; diff --git a/proxy/src/console/messages.rs b/proxy/src/console/messages.rs index 6ef9bcf4eb..102076f2c6 100644 --- a/proxy/src/console/messages.rs +++ b/proxy/src/console/messages.rs @@ -1,4 +1,4 @@ -use serde::Deserialize; +use serde::{Deserialize, Serialize}; use std::fmt; use crate::auth::IpPattern; @@ -98,31 +98,17 @@ pub struct MetricsAuxInfo { pub endpoint_id: EndpointId, pub project_id: ProjectId, pub branch_id: BranchId, + pub cold_start_info: Option, } -impl MetricsAuxInfo { - /// Definitions of labels for traffic metric. - pub const TRAFFIC_LABELS: &'static [&'static str] = &[ - // Received (rx) / sent (tx). 
- "direction", - // ID of a project. - "project_id", - // ID of an endpoint within a project. - "endpoint_id", - // ID of a branch within a project (snapshot). - "branch_id", - ]; - - /// Values of labels for traffic metric. - // TODO: add more type safety (validate arity & positions). - pub fn traffic_labels(&self, direction: &'static str) -> [&str; 4] { - [ - direction, - &self.project_id, - &self.endpoint_id, - &self.branch_id, - ] - } +#[derive(Debug, Default, Serialize, Deserialize, Clone)] +#[serde(rename_all = "snake_case")] +pub enum ColdStartInfo { + #[default] + Unknown = 0, + Warm = 1, + PoolHit = 2, + PoolMiss = 3, } #[cfg(test)] @@ -135,6 +121,7 @@ mod tests { "endpoint_id": "endpoint", "project_id": "project", "branch_id": "branch", + "cold_start_info": "unknown", }) } diff --git a/proxy/src/console/mgmt.rs b/proxy/src/console/mgmt.rs index 373138b09e..c7a2d467c0 100644 --- a/proxy/src/console/mgmt.rs +++ b/proxy/src/console/mgmt.rs @@ -4,7 +4,7 @@ use crate::{ }; use anyhow::Context; use once_cell::sync::Lazy; -use postgres_backend::{self, AuthType, PostgresBackend, PostgresBackendTCP, QueryError}; +use postgres_backend::{AuthType, PostgresBackend, PostgresBackendTCP, QueryError}; use pq_proto::{BeMessage, SINGLE_COL_ROWDESC}; use std::{convert::Infallible, future}; use tokio::net::{TcpListener, TcpStream}; diff --git a/proxy/src/console/provider.rs b/proxy/src/console/provider.rs index a6dfbd79db..8609606273 100644 --- a/proxy/src/console/provider.rs +++ b/proxy/src/console/provider.rs @@ -1,10 +1,13 @@ -#[cfg(feature = "testing")] +#[cfg(any(test, feature = "testing"))] pub mod mock; pub mod neon; use super::messages::MetricsAuxInfo; use crate::{ - auth::{backend::ComputeUserInfo, IpPattern}, + auth::{ + backend::{ComputeCredentialKeys, ComputeUserInfo}, + IpPattern, + }, cache::{project_info::ProjectInfoCacheImpl, Cached, TimedLru}, compute, config::{CacheOptions, ProjectInfoCacheOptions}, @@ -20,7 +23,7 @@ use tracing::info; pub mod errors { use 
crate::{ - error::{io_error, UserFacingError}, + error::{io_error, ReportableError, UserFacingError}, http, proxy::retry::ShouldRetry, }; @@ -70,7 +73,7 @@ pub mod errors { // Status 406: endpoint is disabled (we don't allow connections). format!("{REQUEST_FAILED}: endpoint is disabled") } - http::StatusCode::LOCKED => { + http::StatusCode::LOCKED | http::StatusCode::UNPROCESSABLE_ENTITY => { // Status 423: project might be in maintenance mode (or bad state), or quotas exceeded. format!("{REQUEST_FAILED}: endpoint is temporary unavailable. check your quotas and/or contact our support") } @@ -81,6 +84,37 @@ pub mod errors { } } + impl ReportableError for ApiError { + fn get_error_kind(&self) -> crate::error::ErrorKind { + match self { + ApiError::Console { + status: http::StatusCode::NOT_FOUND | http::StatusCode::NOT_ACCEPTABLE, + .. + } => crate::error::ErrorKind::User, + ApiError::Console { + status: http::StatusCode::UNPROCESSABLE_ENTITY, + text, + } if text.contains("compute time quota of non-primary branches is exceeded") => { + crate::error::ErrorKind::User + } + ApiError::Console { + status: http::StatusCode::LOCKED, + text, + } if text.contains("quota exceeded") + || text.contains("the limit for current plan reached") => + { + crate::error::ErrorKind::User + } + ApiError::Console { + status: http::StatusCode::TOO_MANY_REQUESTS, + .. + } => crate::error::ErrorKind::ServiceRateLimit, + ApiError::Console { .. } => crate::error::ErrorKind::ControlPlane, + ApiError::Transport(_) => crate::error::ErrorKind::ControlPlane, + } + } + } + impl ShouldRetry for ApiError { fn could_retry(&self) -> bool { match self { @@ -92,6 +126,11 @@ pub mod errors { status: http::StatusCode::BAD_REQUEST, .. 
} => true, + // don't retry when quotas are exceeded + Self::Console { + status: http::StatusCode::UNPROCESSABLE_ENTITY, + ref text, + } => !text.contains("compute time quota of non-primary branches is exceeded"), // locked can be returned when the endpoint was in transition // or when quotas are exceeded. don't retry when quotas are exceeded Self::Console { @@ -150,6 +189,16 @@ pub mod errors { } } } + + impl ReportableError for GetAuthInfoError { + fn get_error_kind(&self) -> crate::error::ErrorKind { + match self { + GetAuthInfoError::BadSecret => crate::error::ErrorKind::ControlPlane, + GetAuthInfoError::ApiError(_) => crate::error::ErrorKind::ControlPlane, + } + } + } + #[derive(Debug, Error)] pub enum WakeComputeError { #[error("Console responded with a malformed compute address: {0}")] @@ -194,12 +243,22 @@ pub mod errors { } } } + + impl ReportableError for WakeComputeError { + fn get_error_kind(&self) -> crate::error::ErrorKind { + match self { + WakeComputeError::BadComputeAddress(_) => crate::error::ErrorKind::ControlPlane, + WakeComputeError::ApiError(e) => e.get_error_kind(), + WakeComputeError::TimeoutError => crate::error::ErrorKind::ServiceRateLimit, + } + } + } } /// Auth secret which is managed by the cloud. #[derive(Clone, Eq, PartialEq, Debug)] pub enum AuthSecret { - #[cfg(feature = "testing")] + #[cfg(any(test, feature = "testing"))] /// Md5 hash of user's password. 
Md5([u8; 16]), @@ -232,6 +291,34 @@ pub struct NodeInfo { pub allow_self_signed_compute: bool, } +impl NodeInfo { + pub async fn connect( + &self, + ctx: &mut RequestMonitoring, + timeout: Duration, + ) -> Result { + self.config + .connect( + ctx, + self.allow_self_signed_compute, + self.aux.clone(), + timeout, + ) + .await + } + pub fn reuse_settings(&mut self, other: Self) { + self.allow_self_signed_compute = other.allow_self_signed_compute; + self.config.reuse_password(other.config); + } + + pub fn set_keys(&mut self, keys: &ComputeCredentialKeys) { + match keys { + ComputeCredentialKeys::Password(password) => self.config.password(password), + ComputeCredentialKeys::AuthKeys(auth_keys) => self.config.auth_keys(*auth_keys), + }; + } +} + pub type NodeInfoCache = TimedLru; pub type CachedNodeInfo = Cached<&'static NodeInfoCache>; pub type CachedRoleSecret = Cached<&'static ProjectInfoCacheImpl, Option>; @@ -250,11 +337,11 @@ pub trait Api { user_info: &ComputeUserInfo, ) -> Result; - async fn get_allowed_ips( + async fn get_allowed_ips_and_secret( &self, ctx: &mut RequestMonitoring, user_info: &ComputeUserInfo, - ) -> Result; + ) -> Result<(CachedAllowedIps, Option), errors::GetAuthInfoError>; /// Wake up the compute node and return the corresponding connection info. async fn wake_compute( @@ -264,13 +351,16 @@ pub trait Api { ) -> Result; } -#[derive(Clone)] +#[non_exhaustive] pub enum ConsoleBackend { /// Current Cloud API (V2). Console(neon::Api), /// Local mock of Cloud API (V2). 
- #[cfg(feature = "testing")] + #[cfg(any(test, feature = "testing"))] Postgres(mock::Api), + /// Internal testing + #[cfg(test)] + Test(Box), } #[async_trait] @@ -283,21 +373,25 @@ impl Api for ConsoleBackend { use ConsoleBackend::*; match self { Console(api) => api.get_role_secret(ctx, user_info).await, - #[cfg(feature = "testing")] + #[cfg(any(test, feature = "testing"))] Postgres(api) => api.get_role_secret(ctx, user_info).await, + #[cfg(test)] + Test(_) => unreachable!("this function should never be called in the test backend"), } } - async fn get_allowed_ips( + async fn get_allowed_ips_and_secret( &self, ctx: &mut RequestMonitoring, user_info: &ComputeUserInfo, - ) -> Result { + ) -> Result<(CachedAllowedIps, Option), errors::GetAuthInfoError> { use ConsoleBackend::*; match self { - Console(api) => api.get_allowed_ips(ctx, user_info).await, - #[cfg(feature = "testing")] - Postgres(api) => api.get_allowed_ips(ctx, user_info).await, + Console(api) => api.get_allowed_ips_and_secret(ctx, user_info).await, + #[cfg(any(test, feature = "testing"))] + Postgres(api) => api.get_allowed_ips_and_secret(ctx, user_info).await, + #[cfg(test)] + Test(api) => api.get_allowed_ips_and_secret(), } } @@ -310,8 +404,10 @@ impl Api for ConsoleBackend { match self { Console(api) => api.wake_compute(ctx, user_info).await, - #[cfg(feature = "testing")] + #[cfg(any(test, feature = "testing"))] Postgres(api) => api.wake_compute(ctx, user_info).await, + #[cfg(test)] + Test(api) => api.wake_compute(), } } } diff --git a/proxy/src/console/provider/mock.rs b/proxy/src/console/provider/mock.rs index 55f395a403..0579ef6fc4 100644 --- a/proxy/src/console/provider/mock.rs +++ b/proxy/src/console/provider/mock.rs @@ -157,14 +157,17 @@ impl super::Api for Api { )) } - async fn get_allowed_ips( + async fn get_allowed_ips_and_secret( &self, _ctx: &mut RequestMonitoring, user_info: &ComputeUserInfo, - ) -> Result { - Ok(Cached::new_uncached(Arc::new( - 
self.do_get_auth_info(user_info).await?.allowed_ips, - ))) + ) -> Result<(CachedAllowedIps, Option), GetAuthInfoError> { + Ok(( + Cached::new_uncached(Arc::new( + self.do_get_auth_info(user_info).await?.allowed_ips, + )), + None, + )) } #[tracing::instrument(skip_all)] @@ -173,9 +176,7 @@ impl super::Api for Api { _ctx: &mut RequestMonitoring, _user_info: &ComputeUserInfo, ) -> Result { - self.do_wake_compute() - .map_ok(CachedNodeInfo::new_uncached) - .await + self.do_wake_compute().map_ok(Cached::new_uncached).await } } diff --git a/proxy/src/console/provider/neon.rs b/proxy/src/console/provider/neon.rs index 33618faed8..3b2e0cc204 100644 --- a/proxy/src/console/provider/neon.rs +++ b/proxy/src/console/provider/neon.rs @@ -6,7 +6,9 @@ use super::{ ApiCaches, ApiLocks, AuthInfo, AuthSecret, CachedAllowedIps, CachedNodeInfo, CachedRoleSecret, NodeInfo, }; -use crate::{auth::backend::ComputeUserInfo, compute, http, scram}; +use crate::{ + auth::backend::ComputeUserInfo, compute, console::messages::ColdStartInfo, http, scram, +}; use crate::{ cache::Cached, context::RequestMonitoring, @@ -19,7 +21,6 @@ use tokio::time::Instant; use tokio_postgres::config::SslMode; use tracing::{error, info, info_span, warn, Instrument}; -#[derive(Clone)] pub struct Api { endpoint: http::Endpoint, pub caches: &'static ApiCaches, @@ -73,7 +74,9 @@ impl Api { info!(url = request.url().as_str(), "sending http request"); let start = Instant::now(); + let pause = ctx.latency_timer.pause(crate::metrics::Waiting::Cplane); let response = self.endpoint.execute(request).await?; + drop(pause); info!(duration = ?start.elapsed(), "received http response"); let body = match parse_body::(response).await { Ok(body) => body, @@ -133,7 +136,9 @@ impl Api { info!(url = request.url().as_str(), "sending http request"); let start = Instant::now(); + let pause = ctx.latency_timer.pause(crate::metrics::Waiting::Cplane); let response = self.endpoint.execute(request).await?; + drop(pause); info!(duration = 
?start.elapsed(), "received http response"); let body = parse_body::(response).await?; @@ -189,22 +194,23 @@ impl super::Api for Api { ep, Arc::new(auth_info.allowed_ips), ); + ctx.set_project_id(project_id); } // When we just got a secret, we don't need to invalidate it. Ok(Cached::new_uncached(auth_info.secret)) } - async fn get_allowed_ips( + async fn get_allowed_ips_and_secret( &self, ctx: &mut RequestMonitoring, user_info: &ComputeUserInfo, - ) -> Result { + ) -> Result<(CachedAllowedIps, Option), GetAuthInfoError> { let ep = &user_info.endpoint; if let Some(allowed_ips) = self.caches.project_info.get_allowed_ips(ep) { ALLOWED_IPS_BY_CACHE_OUTCOME .with_label_values(&["hit"]) .inc(); - return Ok(allowed_ips); + return Ok((allowed_ips, None)); } ALLOWED_IPS_BY_CACHE_OUTCOME .with_label_values(&["miss"]) @@ -222,8 +228,12 @@ impl super::Api for Api { self.caches .project_info .insert_allowed_ips(&project_id, ep, allowed_ips.clone()); + ctx.set_project_id(project_id); } - Ok(Cached::new_uncached(allowed_ips)) + Ok(( + Cached::new_uncached(allowed_ips), + Some(Cached::new_uncached(auth_info.secret)), + )) } #[tracing::instrument(skip_all)] @@ -250,11 +260,15 @@ impl super::Api for Api { if permit.should_check_cache() { if let Some(cached) = self.caches.node_info.get(&key) { info!(key = &*key, "found cached compute node info"); + ctx.set_cold_start_info(ColdStartInfo::Warm); return Ok(cached); } } let node = self.do_wake_compute(ctx, user_info).await?; + ctx.set_project(node.aux.clone()); + let cold_start_info = node.aux.cold_start_info.clone().unwrap_or_default(); + info!(?cold_start_info, "woken up a compute node"); let (_, cached) = self.caches.node_info.insert(key.clone(), node); info!(key = &*key, "created a cache entry for compute node info"); diff --git a/proxy/src/context.rs b/proxy/src/context.rs index 9e2ea10031..7ca830cdb4 100644 --- a/proxy/src/context.rs +++ b/proxy/src/context.rs @@ -5,18 +5,22 @@ use once_cell::sync::OnceCell; use smol_str::SmolStr; 
use std::net::IpAddr; use tokio::sync::mpsc; +use tracing::{field::display, info_span, Span}; use uuid::Uuid; use crate::{ - console::messages::MetricsAuxInfo, error::ErrorKind, metrics::LatencyTimer, BranchId, - EndpointId, ProjectId, RoleName, + console::messages::{ColdStartInfo, MetricsAuxInfo}, + error::ErrorKind, + metrics::{LatencyTimer, ENDPOINT_ERRORS_BY_KIND, ERROR_BY_KIND}, + BranchId, DbName, EndpointId, ProjectId, RoleName, }; +use self::parquet::RequestData; + pub mod parquet; -static LOG_CHAN: OnceCell> = OnceCell::new(); +static LOG_CHAN: OnceCell> = OnceCell::new(); -#[derive(Clone)] /// Context data for a single request to connect to a database. /// /// This data should **not** be used for connection logic, only for observability and limiting purposes. @@ -27,22 +31,35 @@ pub struct RequestMonitoring { pub protocol: &'static str, first_packet: chrono::DateTime, region: &'static str, + pub span: Span, // filled in as they are discovered project: Option, branch: Option, endpoint_id: Option, + dbname: Option, user: Option, application: Option, error_kind: Option, + pub(crate) auth_method: Option, success: bool, + cold_start_info: Option, // extra // This sender is here to keep the request monitoring channel open while requests are taking place. 
- sender: Option>, + sender: Option>, pub latency_timer: LatencyTimer, } +#[derive(Clone, Debug)] +pub enum AuthMethod { + // aka link aka passwordless + Web, + ScramSha256, + ScramSha256Plus, + Cleartext, +} + impl RequestMonitoring { pub fn new( session_id: Uuid, @@ -50,20 +67,32 @@ impl RequestMonitoring { protocol: &'static str, region: &'static str, ) -> Self { + let span = info_span!( + "connect_request", + %protocol, + ?session_id, + %peer_addr, + ep = tracing::field::Empty, + ); + Self { peer_addr, session_id, protocol, first_packet: Utc::now(), region, + span, project: None, branch: None, endpoint_id: None, + dbname: None, user: None, application: None, error_kind: None, + auth_method: None, success: false, + cold_start_info: None, sender: LOG_CHAN.get().and_then(|tx| tx.upgrade()), latency_timer: LatencyTimer::new(protocol), @@ -83,37 +112,68 @@ impl RequestMonitoring { ) } - pub fn set_project(&mut self, x: MetricsAuxInfo) { - self.branch = Some(x.branch_id); - self.endpoint_id = Some(x.endpoint_id); - self.project = Some(x.project_id); + pub fn set_cold_start_info(&mut self, info: ColdStartInfo) { + self.cold_start_info = Some(info); } - pub fn set_endpoint_id(&mut self, endpoint_id: Option) { - self.endpoint_id = endpoint_id.or_else(|| self.endpoint_id.clone()); + pub fn set_project(&mut self, x: MetricsAuxInfo) { + self.set_endpoint_id(x.endpoint_id); + self.branch = Some(x.branch_id); + self.project = Some(x.project_id); + self.cold_start_info = x.cold_start_info; + } + + pub fn set_project_id(&mut self, project_id: ProjectId) { + self.project = Some(project_id); + } + + pub fn set_endpoint_id(&mut self, endpoint_id: EndpointId) { + self.span.record("ep", display(&endpoint_id)); + crate::metrics::CONNECTING_ENDPOINTS + .with_label_values(&[self.protocol]) + .measure(&endpoint_id); + self.endpoint_id = Some(endpoint_id); } pub fn set_application(&mut self, app: Option) { self.application = app.or_else(|| self.application.clone()); } + pub fn 
set_dbname(&mut self, dbname: DbName) { + self.dbname = Some(dbname); + } + pub fn set_user(&mut self, user: RoleName) { self.user = Some(user); } + pub fn set_auth_method(&mut self, auth_method: AuthMethod) { + self.auth_method = Some(auth_method); + } + + pub fn set_error_kind(&mut self, kind: ErrorKind) { + ERROR_BY_KIND + .with_label_values(&[kind.to_metric_label()]) + .inc(); + if let Some(ep) = &self.endpoint_id { + ENDPOINT_ERRORS_BY_KIND + .with_label_values(&[kind.to_metric_label()]) + .measure(ep); + } + self.error_kind = Some(kind); + } + pub fn set_success(&mut self) { self.success = true; } - pub fn log(&mut self) { - if let Some(tx) = self.sender.take() { - let _: Result<(), _> = tx.send(self.clone()); - } - } + pub fn log(self) {} } impl Drop for RequestMonitoring { fn drop(&mut self) { - self.log() + if let Some(tx) = self.sender.take() { + let _: Result<(), _> = tx.send(RequestData::from(&*self)); + } } } diff --git a/proxy/src/context/parquet.rs b/proxy/src/context/parquet.rs index 1e9e723938..a2be1c4186 100644 --- a/proxy/src/context/parquet.rs +++ b/proxy/src/context/parquet.rs @@ -1,7 +1,7 @@ use std::{sync::Arc, time::SystemTime}; use anyhow::Context; -use bytes::BytesMut; +use bytes::{buf::Writer, BufMut, BytesMut}; use chrono::{Datelike, Timelike}; use futures::{Stream, StreamExt}; use parquet::{ @@ -13,7 +13,7 @@ use parquet::{ }, record::RecordWriter, }; -use remote_storage::{GenericRemoteStorage, RemotePath, RemoteStorageConfig}; +use remote_storage::{GenericRemoteStorage, RemotePath, RemoteStorageConfig, TimeoutOrCancel}; use tokio::{sync::mpsc, time}; use tokio_util::sync::CancellationToken; use tracing::{debug, info, Span}; @@ -74,7 +74,7 @@ pub(crate) const FAILED_UPLOAD_MAX_RETRIES: u32 = 10; // * after each rowgroup write, we check the length of the file and upload to s3 if large enough #[derive(parquet_derive::ParquetRecordWriter)] -struct RequestData { +pub struct RequestData { region: &'static str, protocol: &'static str, /// 
Must be UTC. The derive macro doesn't like the timezones @@ -84,19 +84,23 @@ struct RequestData { username: Option, application_name: Option, endpoint_id: Option, + database: Option, project: Option, branch: Option, + auth_method: Option<&'static str>, error: Option<&'static str>, /// Success is counted if we form a HTTP response with sql rows inside /// Or if we make it to proxy_pass success: bool, + /// Indicates if the cplane started the new compute node for this request. + cold_start_info: Option<&'static str>, /// Tracks time from session start (HTTP request/libpq TCP handshake) /// Through to success/failure duration_us: u64, } -impl From for RequestData { - fn from(value: RequestMonitoring) -> Self { +impl From<&RequestMonitoring> for RequestData { + fn from(value: &RequestMonitoring) -> Self { Self { session_id: value.session_id, peer_addr: value.peer_addr.to_string(), @@ -104,12 +108,25 @@ impl From for RequestData { username: value.user.as_deref().map(String::from), application_name: value.application.as_deref().map(String::from), endpoint_id: value.endpoint_id.as_deref().map(String::from), + database: value.dbname.as_deref().map(String::from), project: value.project.as_deref().map(String::from), branch: value.branch.as_deref().map(String::from), + auth_method: value.auth_method.as_ref().map(|x| match x { + super::AuthMethod::Web => "web", + super::AuthMethod::ScramSha256 => "scram_sha_256", + super::AuthMethod::ScramSha256Plus => "scram_sha_256_plus", + super::AuthMethod::Cleartext => "cleartext", + }), protocol: value.protocol, region: value.region, - error: value.error_kind.as_ref().map(|e| e.to_str()), + error: value.error_kind.as_ref().map(|e| e.to_metric_label()), success: value.success, + cold_start_info: value.cold_start_info.as_ref().map(|x| match x { + crate::console::messages::ColdStartInfo::Unknown => "unknown", + crate::console::messages::ColdStartInfo::Warm => "warm", + crate::console::messages::ColdStartInfo::PoolHit => "pool_hit", + 
crate::console::messages::ColdStartInfo::PoolMiss => "pool_miss", + }), duration_us: SystemTime::from(value.first_packet) .elapsed() .unwrap_or_default() @@ -192,8 +209,9 @@ async fn worker_inner( let mut rows = Vec::with_capacity(config.rows_per_group); let schema = rows.as_slice().schema()?; - let file = BytesWriter::default(); - let mut w = SerializedFileWriter::new(file, schema.clone(), config.propeties.clone())?; + let buffer = BytesMut::new(); + let w = buffer.writer(); + let mut w = SerializedFileWriter::new(w, schema.clone(), config.propeties.clone())?; let mut last_upload = time::Instant::now(); @@ -221,20 +239,23 @@ async fn worker_inner( } if !w.flushed_row_groups().is_empty() { - let _: BytesWriter = upload_parquet(w, len, &storage).await?; + let _: Writer = upload_parquet(w, len, &storage).await?; } Ok(()) } -async fn flush_rows( +async fn flush_rows( rows: Vec, - mut w: SerializedFileWriter, + mut w: SerializedFileWriter, ) -> anyhow::Result<( Vec, - SerializedFileWriter, + SerializedFileWriter, RowGroupMetaDataPtr, -)> { +)> +where + W: std::io::Write + Send + 'static, +{ let span = Span::current(); let (mut rows, w, rg_meta) = tokio::task::spawn_blocking(move || { let _enter = span.enter(); @@ -258,10 +279,10 @@ async fn flush_rows( } async fn upload_parquet( - w: SerializedFileWriter, + w: SerializedFileWriter>, len: i64, storage: &GenericRemoteStorage, -) -> anyhow::Result { +) -> anyhow::Result> { let len_uncompressed = w .flushed_row_groups() .iter() @@ -270,11 +291,12 @@ async fn upload_parquet( // I don't know how compute intensive this is, although it probably isn't much... better be safe than sorry. 
// finish method only available on the fork: https://github.com/apache/arrow-rs/issues/5253 - let (mut file, metadata) = tokio::task::spawn_blocking(move || w.finish()) + let (writer, metadata) = tokio::task::spawn_blocking(move || w.finish()) .await .unwrap()?; - let data = file.buf.split().freeze(); + let mut buffer = writer.into_inner(); + let data = buffer.split().freeze(); let compression = len as f64 / len_uncompressed as f64; let size = data.len(); @@ -300,39 +322,27 @@ async fn upload_parquet( let path = RemotePath::from_string(&format!( "{year:04}/{month:02}/{day:02}/{hour:02}/requests_{id}.parquet" ))?; + let cancel = CancellationToken::new(); backoff::retry( || async { let stream = futures::stream::once(futures::future::ready(Ok(data.clone()))); - storage.upload(stream, data.len(), &path, None).await + storage + .upload(stream, data.len(), &path, None, &cancel) + .await }, - |_e| false, + TimeoutOrCancel::caused_by_cancel, FAILED_UPLOAD_WARN_THRESHOLD, FAILED_UPLOAD_MAX_RETRIES, "request_data_upload", // we don't want cancellation to interrupt here, so we make a dummy cancel token - backoff::Cancel::new(CancellationToken::new(), || anyhow::anyhow!("Cancelled")), + &cancel, ) .await + .ok_or_else(|| anyhow::Error::new(TimeoutOrCancel::Cancel)) + .and_then(|x| x) .context("request_data_upload")?; - Ok(file) -} - -// why doesn't BytesMut impl io::Write? 
-#[derive(Default)] -struct BytesWriter { - buf: BytesMut, -} - -impl std::io::Write for BytesWriter { - fn write(&mut self, buf: &[u8]) -> std::io::Result { - self.buf.extend_from_slice(buf); - Ok(buf.len()) - } - - fn flush(&mut self) -> std::io::Result<()> { - Ok(()) - } + Ok(buffer.writer()) } #[cfg(test)] @@ -414,7 +424,8 @@ mod tests { ) .unwrap(), max_keys_per_list_response: DEFAULT_MAX_KEYS_PER_LIST_RESPONSE, - }) + }), + timeout: RemoteStorageConfig::DEFAULT_TIMEOUT, }) ); assert_eq!(parquet_upload.parquet_upload_row_group_size, 100); @@ -441,12 +452,15 @@ mod tests { application_name: Some("test".to_owned()), username: Some(hex::encode(rng.gen::<[u8; 4]>())), endpoint_id: Some(hex::encode(rng.gen::<[u8; 16]>())), + database: Some(hex::encode(rng.gen::<[u8; 16]>())), project: Some(hex::encode(rng.gen::<[u8; 16]>())), branch: Some(hex::encode(rng.gen::<[u8; 16]>())), + auth_method: None, protocol: ["tcp", "ws", "http"][rng.gen_range(0..3)], region: "us-east-1", error: None, success: rng.gen(), + cold_start_info: Some("no"), duration_us: rng.gen_range(0..30_000_000), } } @@ -465,6 +479,7 @@ mod tests { ) -> Vec<(u64, usize, i64)> { let remote_storage_config = RemoteStorageConfig { storage: RemoteStorageKind::LocalFs(tmpdir.to_path_buf()), + timeout: std::time::Duration::from_secs(120), }; let storage = GenericRemoteStorage::from_config(&remote_storage_config).unwrap(); @@ -515,16 +530,16 @@ mod tests { assert_eq!( file_stats, [ - (1087635, 3, 6000), - (1087288, 3, 6000), - (1087444, 3, 6000), - (1087572, 3, 6000), - (1087468, 3, 6000), - (1087500, 3, 6000), - (1087533, 3, 6000), - (1087566, 3, 6000), - (362671, 1, 2000) - ], + (1314406, 3, 6000), + (1314399, 3, 6000), + (1314459, 3, 6000), + (1314416, 3, 6000), + (1314546, 3, 6000), + (1314388, 3, 6000), + (1314180, 3, 6000), + (1314416, 3, 6000), + (438359, 1, 2000) + ] ); tmpdir.close().unwrap(); @@ -553,12 +568,12 @@ mod tests { assert_eq!( file_stats, [ - (1028637, 5, 10000), - (1031969, 5, 10000), - 
(1019900, 5, 10000), - (1020365, 5, 10000), - (1025010, 5, 10000) - ], + (1220668, 5, 10000), + (1226818, 5, 10000), + (1228612, 5, 10000), + (1227974, 5, 10000), + (1219252, 5, 10000) + ] ); tmpdir.close().unwrap(); @@ -589,12 +604,12 @@ mod tests { assert_eq!( file_stats, [ - (1210770, 6, 12000), - (1211036, 6, 12000), - (1210990, 6, 12000), - (1210861, 6, 12000), - (202073, 1, 2000) - ], + (1206315, 5, 10000), + (1206046, 5, 10000), + (1206339, 5, 10000), + (1206327, 5, 10000), + (1206582, 5, 10000) + ] ); tmpdir.close().unwrap(); @@ -618,16 +633,16 @@ mod tests { assert_eq!( file_stats, [ - (1087635, 3, 6000), - (1087288, 3, 6000), - (1087444, 3, 6000), - (1087572, 3, 6000), - (1087468, 3, 6000), - (1087500, 3, 6000), - (1087533, 3, 6000), - (1087566, 3, 6000), - (362671, 1, 2000) - ], + (1314406, 3, 6000), + (1314399, 3, 6000), + (1314459, 3, 6000), + (1314416, 3, 6000), + (1314546, 3, 6000), + (1314388, 3, 6000), + (1314180, 3, 6000), + (1314416, 3, 6000), + (438359, 1, 2000) + ] ); tmpdir.close().unwrap(); @@ -663,7 +678,7 @@ mod tests { // files are smaller than the size threshold, but they took too long to fill so were flushed early assert_eq!( file_stats, - [(545264, 2, 3001), (545025, 2, 3000), (544857, 2, 2999)], + [(658837, 2, 3001), (658551, 2, 3000), (658347, 2, 2999)] ); tmpdir.close().unwrap(); diff --git a/proxy/src/error.rs b/proxy/src/error.rs index 5b2dd7ecfd..4614f3913d 100644 --- a/proxy/src/error.rs +++ b/proxy/src/error.rs @@ -17,7 +17,7 @@ pub fn log_error(e: E) -> E { /// NOTE: This trait should not be implemented for [`anyhow::Error`], since it /// is way too convenient and tends to proliferate all across the codebase, /// ultimately leading to accidental leaks of sensitive data. -pub trait UserFacingError: fmt::Display { +pub trait UserFacingError: ReportableError { /// Format the error for client, stripping all sensitive info. 
/// /// Although this might be a no-op for many types, it's highly @@ -29,36 +29,58 @@ pub trait UserFacingError: fmt::Display { } } -#[derive(Clone)] +#[derive(Copy, Clone, Debug, Eq, PartialEq)] pub enum ErrorKind { /// Wrong password, unknown endpoint, protocol violation, etc... User, /// Network error between user and proxy. Not necessarily user error - Disconnect, + ClientDisconnect, - /// Proxy self-imposed rate limits + /// Proxy self-imposed user rate limits RateLimit, + /// Proxy self-imposed service-wise rate limits + ServiceRateLimit, + /// internal errors Service, /// Error communicating with control plane ControlPlane, + /// Postgres error + Postgres, + /// Error communicating with compute Compute, } impl ErrorKind { - pub fn to_str(&self) -> &'static str { + pub fn to_metric_label(&self) -> &'static str { match self { - ErrorKind::User => "request failed due to user error", - ErrorKind::Disconnect => "client disconnected", - ErrorKind::RateLimit => "request cancelled due to rate limit", - ErrorKind::Service => "internal service error", - ErrorKind::ControlPlane => "non-retryable control plane error", - ErrorKind::Compute => "non-retryable compute error (or exhausted retry capacity)", + ErrorKind::User => "user", + ErrorKind::ClientDisconnect => "clientdisconnect", + ErrorKind::RateLimit => "ratelimit", + ErrorKind::ServiceRateLimit => "serviceratelimit", + ErrorKind::Service => "service", + ErrorKind::ControlPlane => "controlplane", + ErrorKind::Postgres => "postgres", + ErrorKind::Compute => "compute", + } + } +} + +pub trait ReportableError: fmt::Display + Send + 'static { + fn get_error_kind(&self) -> ErrorKind; +} + +impl ReportableError for tokio_postgres::error::Error { + fn get_error_kind(&self) -> ErrorKind { + if self.as_db_error().is_some() { + ErrorKind::Postgres + } else { + ErrorKind::Compute } } } diff --git a/proxy/src/intern.rs b/proxy/src/intern.rs new file mode 100644 index 0000000000..a6519bdff9 --- /dev/null +++ 
b/proxy/src/intern.rs @@ -0,0 +1,237 @@ +use std::{ + hash::BuildHasherDefault, marker::PhantomData, num::NonZeroUsize, ops::Index, sync::OnceLock, +}; + +use lasso::{Capacity, MemoryLimits, Spur, ThreadedRodeo}; +use rustc_hash::FxHasher; + +use crate::{BranchId, EndpointId, ProjectId, RoleName}; + +pub trait InternId: Sized + 'static { + fn get_interner() -> &'static StringInterner; +} + +pub struct StringInterner { + inner: ThreadedRodeo>, + _id: PhantomData, +} + +#[derive(PartialEq, Debug, Clone, Copy, Eq, Hash)] +pub struct InternedString { + inner: Spur, + _id: PhantomData, +} + +impl std::fmt::Display for InternedString { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + self.as_str().fmt(f) + } +} + +impl InternedString { + pub fn as_str(&self) -> &'static str { + Id::get_interner().inner.resolve(&self.inner) + } + pub fn get(s: &str) -> Option { + Id::get_interner().get(s) + } +} + +impl AsRef for InternedString { + fn as_ref(&self) -> &str { + self.as_str() + } +} + +impl std::ops::Deref for InternedString { + type Target = str; + fn deref(&self) -> &str { + self.as_str() + } +} + +impl<'de, Id: InternId> serde::de::Deserialize<'de> for InternedString { + fn deserialize>(d: D) -> Result { + struct Visitor(PhantomData); + impl<'de, Id: InternId> serde::de::Visitor<'de> for Visitor { + type Value = InternedString; + + fn expecting(&self, formatter: &mut std::fmt::Formatter) -> std::fmt::Result { + formatter.write_str("a string") + } + + fn visit_str(self, v: &str) -> Result + where + E: serde::de::Error, + { + Ok(Id::get_interner().get_or_intern(v)) + } + } + d.deserialize_str(Visitor::(PhantomData)) + } +} + +impl serde::Serialize for InternedString { + fn serialize(&self, s: S) -> Result { + self.as_str().serialize(s) + } +} + +impl StringInterner { + pub fn new() -> Self { + StringInterner { + inner: ThreadedRodeo::with_capacity_memory_limits_and_hasher( + Capacity::new(2500, NonZeroUsize::new(1 << 16).unwrap()), + // unbounded + 
MemoryLimits::for_memory_usage(usize::MAX), + BuildHasherDefault::::default(), + ), + _id: PhantomData, + } + } + + pub fn is_empty(&self) -> bool { + self.inner.is_empty() + } + + pub fn len(&self) -> usize { + self.inner.len() + } + + pub fn current_memory_usage(&self) -> usize { + self.inner.current_memory_usage() + } + + pub fn get_or_intern(&self, s: &str) -> InternedString { + InternedString { + inner: self.inner.get_or_intern(s), + _id: PhantomData, + } + } + + pub fn get(&self, s: &str) -> Option> { + Some(InternedString { + inner: self.inner.get(s)?, + _id: PhantomData, + }) + } +} + +impl Index> for StringInterner { + type Output = str; + + fn index(&self, index: InternedString) -> &Self::Output { + self.inner.resolve(&index.inner) + } +} + +impl Default for StringInterner { + fn default() -> Self { + Self::new() + } +} + +#[derive(Copy, Clone, Debug, PartialEq, Eq, Hash)] +pub struct RoleNameTag; +impl InternId for RoleNameTag { + fn get_interner() -> &'static StringInterner { + pub static ROLE_NAMES: OnceLock> = OnceLock::new(); + ROLE_NAMES.get_or_init(Default::default) + } +} +pub type RoleNameInt = InternedString; +impl From<&RoleName> for RoleNameInt { + fn from(value: &RoleName) -> Self { + RoleNameTag::get_interner().get_or_intern(value) + } +} + +#[derive(Copy, Clone, Debug, PartialEq, Eq, Hash)] +pub struct EndpointIdTag; +impl InternId for EndpointIdTag { + fn get_interner() -> &'static StringInterner { + pub static ROLE_NAMES: OnceLock> = OnceLock::new(); + ROLE_NAMES.get_or_init(Default::default) + } +} +pub type EndpointIdInt = InternedString; +impl From<&EndpointId> for EndpointIdInt { + fn from(value: &EndpointId) -> Self { + EndpointIdTag::get_interner().get_or_intern(value) + } +} + +#[derive(Copy, Clone, Debug, PartialEq, Eq, Hash)] +pub struct BranchIdTag; +impl InternId for BranchIdTag { + fn get_interner() -> &'static StringInterner { + pub static ROLE_NAMES: OnceLock> = OnceLock::new(); + ROLE_NAMES.get_or_init(Default::default) + } 
+} +pub type BranchIdInt = InternedString; +impl From<&BranchId> for BranchIdInt { + fn from(value: &BranchId) -> Self { + BranchIdTag::get_interner().get_or_intern(value) + } +} + +#[derive(Copy, Clone, Debug, PartialEq, Eq, Hash)] +pub struct ProjectIdTag; +impl InternId for ProjectIdTag { + fn get_interner() -> &'static StringInterner { + pub static ROLE_NAMES: OnceLock> = OnceLock::new(); + ROLE_NAMES.get_or_init(Default::default) + } +} +pub type ProjectIdInt = InternedString; +impl From<&ProjectId> for ProjectIdInt { + fn from(value: &ProjectId) -> Self { + ProjectIdTag::get_interner().get_or_intern(value) + } +} + +#[cfg(test)] +mod tests { + use std::sync::OnceLock; + + use crate::intern::StringInterner; + + use super::InternId; + + struct MyId; + impl InternId for MyId { + fn get_interner() -> &'static StringInterner { + pub static ROLE_NAMES: OnceLock> = OnceLock::new(); + ROLE_NAMES.get_or_init(Default::default) + } + } + + #[test] + fn push_many_strings() { + use rand::{rngs::StdRng, Rng, SeedableRng}; + use rand_distr::Zipf; + + let endpoint_dist = Zipf::new(500000, 0.8).unwrap(); + let endpoints = StdRng::seed_from_u64(272488357).sample_iter(endpoint_dist); + + let interner = MyId::get_interner(); + + const N: usize = 100_000; + let mut verify = Vec::with_capacity(N); + for endpoint in endpoints.take(N) { + let endpoint = format!("ep-string-interning-{endpoint}"); + let key = interner.get_or_intern(&endpoint); + verify.push((endpoint, key)); + } + + for (s, key) in verify { + assert_eq!(interner[key], s); + } + + // 2031616/59861 = 34 bytes per string + assert_eq!(interner.len(), 59_861); + // will have other overhead for the internal hashmaps that are not accounted for. 
+ assert_eq!(interner.current_memory_usage(), 2_031_616); + } +} diff --git a/proxy/src/jemalloc.rs b/proxy/src/jemalloc.rs new file mode 100644 index 0000000000..ed20798d56 --- /dev/null +++ b/proxy/src/jemalloc.rs @@ -0,0 +1,100 @@ +use std::time::Duration; + +use metrics::IntGauge; +use prometheus::{register_int_gauge_with_registry, Registry}; +use tikv_jemalloc_ctl::{config, epoch, epoch_mib, stats, version}; + +pub struct MetricRecorder { + epoch: epoch_mib, + active: stats::active_mib, + active_gauge: IntGauge, + allocated: stats::allocated_mib, + allocated_gauge: IntGauge, + mapped: stats::mapped_mib, + mapped_gauge: IntGauge, + metadata: stats::metadata_mib, + metadata_gauge: IntGauge, + resident: stats::resident_mib, + resident_gauge: IntGauge, + retained: stats::retained_mib, + retained_gauge: IntGauge, +} + +impl MetricRecorder { + pub fn new(registry: &Registry) -> Result { + tracing::info!( + config = config::malloc_conf::read()?, + version = version::read()?, + "starting jemalloc recorder" + ); + + Ok(Self { + epoch: epoch::mib()?, + active: stats::active::mib()?, + active_gauge: register_int_gauge_with_registry!( + "jemalloc_active_bytes", + "Total number of bytes in active pages allocated by the process", + registry + )?, + allocated: stats::allocated::mib()?, + allocated_gauge: register_int_gauge_with_registry!( + "jemalloc_allocated_bytes", + "Total number of bytes allocated by the process", + registry + )?, + mapped: stats::mapped::mib()?, + mapped_gauge: register_int_gauge_with_registry!( + "jemalloc_mapped_bytes", + "Total number of bytes in active extents mapped by the allocator", + registry + )?, + metadata: stats::metadata::mib()?, + metadata_gauge: register_int_gauge_with_registry!( + "jemalloc_metadata_bytes", + "Total number of bytes dedicated to jemalloc metadata", + registry + )?, + resident: stats::resident::mib()?, + resident_gauge: register_int_gauge_with_registry!( + "jemalloc_resident_bytes", + "Total number of bytes in physically 
resident data pages mapped by the allocator", + registry + )?, + retained: stats::retained::mib()?, + retained_gauge: register_int_gauge_with_registry!( + "jemalloc_retained_bytes", + "Total number of bytes in virtual memory mappings that were retained rather than being returned to the operating system", + registry + )?, + }) + } + + fn _poll(&self) -> Result<(), anyhow::Error> { + self.epoch.advance()?; + self.active_gauge.set(self.active.read()? as i64); + self.allocated_gauge.set(self.allocated.read()? as i64); + self.mapped_gauge.set(self.mapped.read()? as i64); + self.metadata_gauge.set(self.metadata.read()? as i64); + self.resident_gauge.set(self.resident.read()? as i64); + self.retained_gauge.set(self.retained.read()? as i64); + Ok(()) + } + + #[inline] + pub fn poll(&self) { + if let Err(error) = self._poll() { + tracing::warn!(%error, "Failed to poll jemalloc stats"); + } + } + + pub fn start(self) -> tokio::task::JoinHandle<()> { + tokio::task::spawn(async move { + let mut interval = tokio::time::interval(Duration::from_secs(15)); + interval.set_missed_tick_behavior(tokio::time::MissedTickBehavior::Skip); + loop { + self.poll(); + interval.tick().await; + } + }) + } +} diff --git a/proxy/src/lib.rs b/proxy/src/lib.rs index a9e4a38302..da7c7f3ed2 100644 --- a/proxy/src/lib.rs +++ b/proxy/src/lib.rs @@ -16,6 +16,8 @@ pub mod console; pub mod context; pub mod error; pub mod http; +pub mod intern; +pub mod jemalloc; pub mod logging; pub mod metrics; pub mod parse; diff --git a/proxy/src/metrics.rs b/proxy/src/metrics.rs index 6e4cbb3f3a..02ebcd6aaa 100644 --- a/proxy/src/metrics.rs +++ b/proxy/src/metrics.rs @@ -1,14 +1,13 @@ use ::metrics::{ - exponential_buckets, register_int_counter_pair_vec, register_int_counter_vec, - IntCounterPairVec, IntCounterVec, -}; -use prometheus::{ - register_histogram, register_histogram_vec, register_int_gauge_vec, Histogram, HistogramVec, - IntGaugeVec, + exponential_buckets, register_histogram, register_histogram_vec, 
register_hll_vec, + register_int_counter_pair_vec, register_int_counter_vec, register_int_gauge, + register_int_gauge_vec, Histogram, HistogramVec, HyperLogLogVec, IntCounterPairVec, + IntCounterVec, IntGauge, IntGaugeVec, }; +use metrics::{register_int_counter, register_int_counter_pair, IntCounter, IntCounterPair}; use once_cell::sync::Lazy; -use tokio::time; +use tokio::time::{self, Instant}; pub static NUM_DB_CONNECTIONS_GAUGE: Lazy = Lazy::new(|| { register_int_counter_pair_vec!( @@ -47,9 +46,9 @@ pub static COMPUTE_CONNECTION_LATENCY: Lazy = Lazy::new(|| { register_histogram_vec!( "proxy_compute_connection_latency_seconds", "Time it took for proxy to establish a connection to the compute endpoint", - // http/ws/tcp, true/false, true/false, success/failure - // 3 * 2 * 2 * 2 = 24 counters - &["protocol", "cache_miss", "pool_miss", "outcome"], + // http/ws/tcp, true/false, true/false, success/failure, client/client_and_cplane + // 3 * 2 * 2 * 2 * 2 = 48 counters + &["protocol", "cache_miss", "pool_miss", "outcome", "excluded"], // largest bucket = 2^16 * 0.5ms = 32s exponential_buckets(0.0005, 2.0, 16).unwrap(), ) @@ -115,12 +114,73 @@ pub static ALLOWED_IPS_NUMBER: Lazy = Lazy::new(|| { .unwrap() }); -#[derive(Clone)] +pub static HTTP_CONTENT_LENGTH: Lazy = Lazy::new(|| { + register_histogram!( + "proxy_http_conn_content_length_bytes", + "Time it took for proxy to establish a connection to the compute endpoint", + // largest bucket = 3^16 * 0.05ms = 2.15s + exponential_buckets(8.0, 2.0, 20).unwrap() + ) + .unwrap() +}); + +pub static GC_LATENCY: Lazy = Lazy::new(|| { + register_histogram!( + "proxy_http_pool_reclaimation_lag_seconds", + "Time it takes to reclaim unused connection pools", + // 1us -> 65ms + exponential_buckets(1e-6, 2.0, 16).unwrap(), + ) + .unwrap() +}); + +pub static ENDPOINT_POOLS: Lazy = Lazy::new(|| { + register_int_counter_pair!( + "proxy_http_pool_endpoints_registered_total", + "Number of endpoints we have registered pools for", + 
"proxy_http_pool_endpoints_unregistered_total", + "Number of endpoints we have unregistered pools for", + ) + .unwrap() +}); + +pub static NUM_OPEN_CLIENTS_IN_HTTP_POOL: Lazy = Lazy::new(|| { + register_int_gauge!( + "proxy_http_pool_opened_connections", + "Number of opened connections to a database.", + ) + .unwrap() +}); + +pub static NUM_CANCELLATION_REQUESTS: Lazy = Lazy::new(|| { + register_int_counter_vec!( + "proxy_cancellation_requests_total", + "Number of cancellation requests (per found/not_found).", + &["source", "kind"], + ) + .unwrap() +}); + +pub enum Waiting { + Cplane, + Client, + Compute, +} + +#[derive(Default)] +struct Accumulated { + cplane: time::Duration, + client: time::Duration, + compute: time::Duration, +} + pub struct LatencyTimer { // time since the stopwatch was started - start: Option, + start: time::Instant, + // time since the stopwatch was stopped + stop: Option, // accumulated time on the stopwatch - pub accumulated: std::time::Duration, + accumulated: Accumulated, // label data protocol: &'static str, cache_miss: bool, @@ -130,13 +190,16 @@ pub struct LatencyTimer { pub struct LatencyTimerPause<'a> { timer: &'a mut LatencyTimer, + start: time::Instant, + waiting_for: Waiting, } impl LatencyTimer { pub fn new(protocol: &'static str) -> Self { Self { - start: Some(time::Instant::now()), - accumulated: std::time::Duration::ZERO, + start: time::Instant::now(), + stop: None, + accumulated: Accumulated::default(), protocol, cache_miss: false, // by default we don't do pooling @@ -146,11 +209,12 @@ impl LatencyTimer { } } - pub fn pause(&mut self) -> LatencyTimerPause<'_> { - // stop the stopwatch and record the time that we have accumulated - let start = self.start.take().expect("latency timer should be started"); - self.accumulated += start.elapsed(); - LatencyTimerPause { timer: self } + pub fn pause(&mut self, waiting_for: Waiting) -> LatencyTimerPause<'_> { + LatencyTimerPause { + timer: self, + start: Instant::now(), + waiting_for, 
+ } } pub fn cache_miss(&mut self) { @@ -163,8 +227,7 @@ impl LatencyTimer { pub fn success(&mut self) { // stop the stopwatch and record the time that we have accumulated - let start = self.start.take().expect("latency timer should be started"); - self.accumulated += start.elapsed(); + self.stop = Some(time::Instant::now()); // success self.outcome = "success"; @@ -173,23 +236,42 @@ impl LatencyTimer { impl Drop for LatencyTimerPause<'_> { fn drop(&mut self) { - // start the stopwatch again - self.timer.start = Some(time::Instant::now()); + let dur = self.start.elapsed(); + match self.waiting_for { + Waiting::Cplane => self.timer.accumulated.cplane += dur, + Waiting::Client => self.timer.accumulated.client += dur, + Waiting::Compute => self.timer.accumulated.compute += dur, + } } } impl Drop for LatencyTimer { fn drop(&mut self) { - let duration = - self.start.map(|start| start.elapsed()).unwrap_or_default() + self.accumulated; + let duration = self + .stop + .unwrap_or_else(time::Instant::now) + .duration_since(self.start); + // Excluding cplane communication from the accumulated time. COMPUTE_CONNECTION_LATENCY .with_label_values(&[ self.protocol, bool_to_str(self.cache_miss), bool_to_str(self.pool_miss), self.outcome, + "client", ]) - .observe(duration.as_secs_f64()) + .observe((duration.saturating_sub(self.accumulated.client)).as_secs_f64()); + // Exclude client and cplane communication from the accumulated time. 
+ let accumulated_total = self.accumulated.client + self.accumulated.cplane; + COMPUTE_CONNECTION_LATENCY + .with_label_values(&[ + self.protocol, + bool_to_str(self.cache_miss), + bool_to_str(self.pool_miss), + self.outcome, + "client_and_cplane", + ]) + .observe((duration.saturating_sub(accumulated_total)).as_secs_f64()); } } @@ -211,15 +293,6 @@ pub static NUM_WAKEUP_FAILURES: Lazy = Lazy::new(|| { .unwrap() }); -pub static NUM_BYTES_PROXIED_PER_CLIENT_COUNTER: Lazy = Lazy::new(|| { - register_int_counter_vec!( - "proxy_io_bytes_per_client", - "Number of bytes sent/received between client and backend.", - crate::console::messages::MetricsAuxInfo::TRAFFIC_LABELS, - ) - .unwrap() -}); - pub static NUM_BYTES_PROXIED_COUNTER: Lazy = Lazy::new(|| { register_int_counter_vec!( "proxy_io_bytes", @@ -236,3 +309,49 @@ pub const fn bool_to_str(x: bool) -> &'static str { "false" } } + +pub static CONNECTING_ENDPOINTS: Lazy> = Lazy::new(|| { + register_hll_vec!( + 32, + "proxy_connecting_endpoints", + "HLL approximate cardinality of endpoints that are connecting", + &["protocol"], + ) + .unwrap() +}); + +pub static ERROR_BY_KIND: Lazy = Lazy::new(|| { + register_int_counter_vec!( + "proxy_errors_total", + "Number of errors by a given classification", + &["type"], + ) + .unwrap() +}); + +pub static ENDPOINT_ERRORS_BY_KIND: Lazy> = Lazy::new(|| { + register_hll_vec!( + 32, + "proxy_endpoints_affected_by_errors", + "Number of endpoints affected by errors of a given classification", + &["type"], + ) + .unwrap() +}); + +pub static REDIS_BROKEN_MESSAGES: Lazy = Lazy::new(|| { + register_int_counter_vec!( + "proxy_redis_errors_total", + "Number of errors by a given classification", + &["channel"], + ) + .unwrap() +}); + +pub static TLS_HANDSHAKE_FAILURES: Lazy = Lazy::new(|| { + register_int_counter!( + "proxy_tls_handshake_failures", + "Number of TLS handshake failures", + ) + .unwrap() +}); diff --git a/proxy/src/protocol2.rs b/proxy/src/protocol2.rs index 1d8931be85..f476cb9b37 
100644 --- a/proxy/src/protocol2.rs +++ b/proxy/src/protocol2.rs @@ -1,22 +1,27 @@ //! Proxy Protocol V2 implementation use std::{ - future::poll_fn, - future::Future, + future::{poll_fn, Future}, io, net::SocketAddr, pin::{pin, Pin}, + sync::Mutex, task::{ready, Context, Poll}, }; use bytes::{Buf, BytesMut}; +use hyper::server::accept::Accept; use hyper::server::conn::{AddrIncoming, AddrStream}; +use metrics::IntCounterPairGuard; use pin_project_lite::pin_project; -use tls_listener::AsyncAccept; use tokio::io::{AsyncRead, AsyncReadExt, AsyncWrite, ReadBuf}; +use uuid::Uuid; + +use crate::metrics::NUM_CLIENT_CONNECTION_GAUGE; pub struct ProxyProtocolAccept { pub incoming: AddrIncoming, + pub protocol: &'static str, } pin_project! { @@ -326,21 +331,84 @@ impl AsyncRead for WithClientIp { } } -impl AsyncAccept for ProxyProtocolAccept { - type Connection = WithClientIp; +impl Accept for ProxyProtocolAccept { + type Conn = WithConnectionGuard>; type Error = io::Error; fn poll_accept( mut self: Pin<&mut Self>, cx: &mut Context<'_>, - ) -> Poll>> { + ) -> Poll>> { let conn = ready!(Pin::new(&mut self.incoming).poll_accept(cx)?); + tracing::info!(protocol = self.protocol, "accepted new TCP connection"); let Some(conn) = conn else { return Poll::Ready(None); }; - Poll::Ready(Some(Ok(WithClientIp::new(conn)))) + Poll::Ready(Some(Ok(WithConnectionGuard { + inner: WithClientIp::new(conn), + connection_id: Uuid::new_v4(), + gauge: Mutex::new(Some( + NUM_CLIENT_CONNECTION_GAUGE + .with_label_values(&[self.protocol]) + .guard(), + )), + }))) + } +} + +pin_project! 
{ + pub struct WithConnectionGuard { + #[pin] + pub inner: T, + pub connection_id: Uuid, + pub gauge: Mutex>, + } +} + +impl AsyncWrite for WithConnectionGuard { + #[inline] + fn poll_write( + self: Pin<&mut Self>, + cx: &mut Context<'_>, + buf: &[u8], + ) -> Poll> { + self.project().inner.poll_write(cx, buf) + } + + #[inline] + fn poll_flush(self: Pin<&mut Self>, cx: &mut Context<'_>) -> Poll> { + self.project().inner.poll_flush(cx) + } + + #[inline] + fn poll_shutdown(self: Pin<&mut Self>, cx: &mut Context<'_>) -> Poll> { + self.project().inner.poll_shutdown(cx) + } + + #[inline] + fn poll_write_vectored( + self: Pin<&mut Self>, + cx: &mut Context<'_>, + bufs: &[io::IoSlice<'_>], + ) -> Poll> { + self.project().inner.poll_write_vectored(cx, bufs) + } + + #[inline] + fn is_write_vectored(&self) -> bool { + self.inner.is_write_vectored() + } +} + +impl AsyncRead for WithConnectionGuard { + fn poll_read( + self: Pin<&mut Self>, + cx: &mut Context<'_>, + buf: &mut ReadBuf<'_>, + ) -> Poll> { + self.project().inner.poll_read(cx, buf) } } diff --git a/proxy/src/proxy.rs b/proxy/src/proxy.rs index 087cc7f7a9..ab5bf5d494 100644 --- a/proxy/src/proxy.rs +++ b/proxy/src/proxy.rs @@ -2,42 +2,45 @@ mod tests; pub mod connect_compute; +mod copy_bidirectional; +pub mod handshake; +pub mod passthrough; pub mod retry; +pub mod wake_compute; use crate::{ auth, - cancellation::{self, CancelMap}, + cancellation::{self, CancellationHandler}, compute, - config::{AuthenticationConfig, ProxyConfig, TlsConfig}, - console::messages::MetricsAuxInfo, + config::{ProxyConfig, TlsConfig}, context::RequestMonitoring, - metrics::{ - NUM_BYTES_PROXIED_COUNTER, NUM_BYTES_PROXIED_PER_CLIENT_COUNTER, - NUM_CLIENT_CONNECTION_GAUGE, NUM_CONNECTION_REQUESTS_GAUGE, - }, + error::ReportableError, + metrics::{NUM_CLIENT_CONNECTION_GAUGE, NUM_CONNECTION_REQUESTS_GAUGE}, protocol2::WithClientIp, + proxy::handshake::{handshake, HandshakeData}, rate_limiter::EndpointRateLimiter, stream::{PqStream, Stream}, - 
usage_metrics::{Ids, USAGE_METRICS}, EndpointCacheKey, }; -use anyhow::{bail, Context}; use futures::TryFutureExt; use itertools::Itertools; +use metrics::IntCounterPairGuard; use once_cell::sync::OnceCell; -use pq_proto::{BeMessage as Be, FeStartupPacket, StartupMessageParams}; +use pq_proto::{BeMessage as Be, StartupMessageParams}; use regex::Regex; use smol_str::{format_smolstr, SmolStr}; use std::sync::Arc; +use thiserror::Error; use tokio::io::{AsyncRead, AsyncWrite, AsyncWriteExt}; use tokio_util::sync::CancellationToken; -use tracing::{error, info, info_span, Instrument}; -use utils::measured_stream::MeasuredStream; +use tracing::{error, info, Instrument}; -use self::connect_compute::{connect_to_compute, TcpMechanism}; +use self::{ + connect_compute::{connect_to_compute, TcpMechanism}, + passthrough::ProxyPassthrough, +}; const ERR_INSECURE_CONNECTION: &str = "connection is insecure (try using `sslmode=require`)"; -const ERR_PROTO_VIOLATION: &str = "protocol violation"; pub async fn run_until_cancelled( f: F, @@ -59,6 +62,7 @@ pub async fn task_main( listener: tokio::net::TcpListener, cancellation_token: CancellationToken, endpoint_rate_limiter: Arc, + cancellation_handler: Arc, ) -> anyhow::Result<()> { scopeguard::defer! 
{ info!("proxy has shut down"); @@ -69,57 +73,84 @@ pub async fn task_main( socket2::SockRef::from(&listener).set_keepalive(true)?; let connections = tokio_util::task::task_tracker::TaskTracker::new(); - let cancel_map = Arc::new(CancelMap::default()); while let Some(accept_result) = run_until_cancelled(listener.accept(), &cancellation_token).await { let (socket, peer_addr) = accept_result?; + let conn_gauge = NUM_CLIENT_CONNECTION_GAUGE + .with_label_values(&["tcp"]) + .guard(); + let session_id = uuid::Uuid::new_v4(); - let cancel_map = Arc::clone(&cancel_map); + let cancellation_handler = Arc::clone(&cancellation_handler); let endpoint_rate_limiter = endpoint_rate_limiter.clone(); - connections.spawn( - async move { - info!("accepted postgres client connection"); + tracing::info!(protocol = "tcp", %session_id, "accepted new TCP connection"); - let mut socket = WithClientIp::new(socket); - let mut peer_addr = peer_addr.ip(); - if let Some(addr) = socket.wait_for_addr().await? { - peer_addr = addr.ip(); - tracing::Span::current().record("peer_addr", &tracing::field::display(addr)); - } else if config.require_client_ip { - bail!("missing required client IP"); + connections.spawn(async move { + let mut socket = WithClientIp::new(socket); + let mut peer_addr = peer_addr.ip(); + match socket.wait_for_addr().await { + Ok(Some(addr)) => peer_addr = addr.ip(), + Err(e) => { + error!("per-client task finished with an error: {e:#}"); + return; } - - let mut ctx = RequestMonitoring::new(session_id, peer_addr, "tcp", &config.region); - - socket - .inner - .set_nodelay(true) - .context("failed to set socket option")?; - - handle_client( - config, - &mut ctx, - &cancel_map, - socket, - ClientMode::Tcp, - endpoint_rate_limiter, - ) - .await + Ok(None) if config.require_client_ip => { + error!("missing required client IP"); + return; + } + Ok(None) => {} } - .instrument(info_span!( - "handle_client", - ?session_id, - peer_addr = tracing::field::Empty - )) - .unwrap_or_else(move 
|e| { - // Acknowledge that the task has finished with an error. - error!(?session_id, "per-client task finished with an error: {e:#}"); - }), - ); + + match socket.inner.set_nodelay(true) { + Ok(()) => {}, + Err(e) => { + error!("per-client task finished with an error: failed to set socket option: {e:#}"); + return; + }, + }; + + let mut ctx = RequestMonitoring::new(session_id, peer_addr, "tcp", &config.region); + let span = ctx.span.clone(); + + let res = handle_client( + config, + &mut ctx, + cancellation_handler, + socket, + ClientMode::Tcp, + endpoint_rate_limiter, + conn_gauge, + ) + .instrument(span.clone()) + .await; + + match res { + Err(e) => { + // todo: log and push to ctx the error kind + ctx.set_error_kind(e.get_error_kind()); + ctx.log(); + error!(parent: &span, "per-client task finished with an error: {e:#}"); + } + Ok(None) => { + ctx.set_success(); + ctx.log(); + } + Ok(Some(p)) => { + ctx.set_success(); + ctx.log(); + match p.proxy_pass().instrument(span.clone()).await { + Ok(()) => {} + Err(e) => { + error!(parent: &span, "per-client task finished with an error: {e:#}"); + } + } + } + } + }); } connections.close(); @@ -138,14 +169,14 @@ pub enum ClientMode { /// Abstracts the logic of handling TCP vs WS clients impl ClientMode { - fn allow_cleartext(&self) -> bool { + pub fn allow_cleartext(&self) -> bool { match self { ClientMode::Tcp => false, ClientMode::Websockets { .. } => true, } } - fn allow_self_signed_compute(&self, config: &ProxyConfig) -> bool { + pub fn allow_self_signed_compute(&self, config: &ProxyConfig) -> bool { match self { ClientMode::Tcp => config.allow_self_signed_compute, ClientMode::Websockets { .. } => false, @@ -168,164 +199,150 @@ impl ClientMode { } } +#[derive(Debug, Error)] +// almost all errors should be reported to the user, but there's a few cases where we cannot +// 1. Cancellation: we are not allowed to tell the client any cancellation statuses for security reasons +// 2. 
Handshake: handshake reports errors if it can, otherwise if the handshake fails due to protocol violation, +// we cannot be sure the client even understands our error message +// 3. PrepareClient: The client disconnected, so we can't tell them anyway... +pub enum ClientRequestError { + #[error("{0}")] + Cancellation(#[from] cancellation::CancelError), + #[error("{0}")] + Handshake(#[from] handshake::HandshakeError), + #[error("{0}")] + HandshakeTimeout(#[from] tokio::time::error::Elapsed), + #[error("{0}")] + PrepareClient(#[from] std::io::Error), + #[error("{0}")] + ReportedError(#[from] crate::stream::ReportedError), +} + +impl ReportableError for ClientRequestError { + fn get_error_kind(&self) -> crate::error::ErrorKind { + match self { + ClientRequestError::Cancellation(e) => e.get_error_kind(), + ClientRequestError::Handshake(e) => e.get_error_kind(), + ClientRequestError::HandshakeTimeout(_) => crate::error::ErrorKind::RateLimit, + ClientRequestError::ReportedError(e) => e.get_error_kind(), + ClientRequestError::PrepareClient(_) => crate::error::ErrorKind::ClientDisconnect, + } + } +} + pub async fn handle_client( config: &'static ProxyConfig, ctx: &mut RequestMonitoring, - cancel_map: &CancelMap, + cancellation_handler: Arc, stream: S, mode: ClientMode, endpoint_rate_limiter: Arc, -) -> anyhow::Result<()> { - info!( - protocol = ctx.protocol, - "handling interactive connection from client" - ); + conn_gauge: IntCounterPairGuard, +) -> Result>, ClientRequestError> { + info!("handling interactive connection from client"); let proto = ctx.protocol; - let _client_gauge = NUM_CLIENT_CONNECTION_GAUGE - .with_label_values(&[proto]) - .guard(); let _request_gauge = NUM_CONNECTION_REQUESTS_GAUGE .with_label_values(&[proto]) .guard(); let tls = config.tls_config.as_ref(); - let pause = ctx.latency_timer.pause(); - let do_handshake = handshake(stream, mode.handshake_tls(tls), cancel_map); - let (mut stream, params) = match do_handshake.await? 
{ - Some(x) => x, - None => return Ok(()), // it's a cancellation request - }; + let pause = ctx.latency_timer.pause(crate::metrics::Waiting::Client); + let do_handshake = handshake(stream, mode.handshake_tls(tls)); + let (mut stream, params) = + match tokio::time::timeout(config.handshake_timeout, do_handshake).await?? { + HandshakeData::Startup(stream, params) => (stream, params), + HandshakeData::Cancel(cancel_key_data) => { + return Ok(cancellation_handler + .cancel_session(cancel_key_data, ctx.session_id) + .await + .map(|()| None)?) + } + }; drop(pause); + let hostname = mode.hostname(stream.get_ref()); + + let common_names = tls.map(|tls| &tls.common_names); + // Extract credentials which we're going to use for auth. - let user_info = { - let hostname = mode.hostname(stream.get_ref()); + let result = config + .auth_backend + .as_ref() + .map(|_| auth::ComputeUserInfoMaybeEndpoint::parse(ctx, ¶ms, hostname, common_names)) + .transpose(); - let common_names = tls.map(|tls| &tls.common_names); - let result = config - .auth_backend - .as_ref() - .map(|_| { - auth::ComputeUserInfoMaybeEndpoint::parse(ctx, ¶ms, hostname, common_names) - }) - .transpose(); + let user_info = match result { + Ok(user_info) => user_info, + Err(e) => stream.throw_error(e).await?, + }; - match result { - Ok(user_info) => user_info, - Err(e) => stream.throw_error(e).await?, + // check rate limit + if let Some(ep) = user_info.get_endpoint() { + if !endpoint_rate_limiter.check(ep) { + return stream + .throw_error(auth::AuthError::too_many_connections()) + .await?; + } + } + + let user = user_info.get_user().to_owned(); + let user_info = match user_info + .authenticate( + ctx, + &mut stream, + mode.allow_cleartext(), + &config.authentication_config, + ) + .await + { + Ok(auth_result) => auth_result, + Err(e) => { + let db = params.get("database"); + let app = params.get("application_name"); + let params_span = tracing::info_span!("", ?user, ?db, ?app); + + return 
stream.throw_error(e).instrument(params_span).await?; } }; - ctx.set_endpoint_id(user_info.get_endpoint()); - - let client = Client::new( - stream, - user_info, - ¶ms, + let mut node = connect_to_compute( + ctx, + &TcpMechanism { params: ¶ms }, + &user_info, mode.allow_self_signed_compute(config), - endpoint_rate_limiter, - ); - cancel_map - .with_session(|session| { - client.connect_to_db(ctx, session, mode, &config.authentication_config) - }) - .await -} + ) + .or_else(|e| stream.throw_error(e)) + .await?; -/// Establish a (most probably, secure) connection with the client. -/// For better testing experience, `stream` can be any object satisfying the traits. -/// It's easier to work with owned `stream` here as we need to upgrade it to TLS; -/// we also take an extra care of propagating only the select handshake errors to client. -#[tracing::instrument(skip_all)] -async fn handshake( - stream: S, - mut tls: Option<&TlsConfig>, - cancel_map: &CancelMap, -) -> anyhow::Result>, StartupMessageParams)>> { - // Client may try upgrading to each protocol only once - let (mut tried_ssl, mut tried_gss) = (false, false); + let session = cancellation_handler.get_session(); + prepare_client_connection(&node, &session, &mut stream).await?; - let mut stream = PqStream::new(Stream::from_raw(stream)); - loop { - let msg = stream.read_startup_packet().await?; - info!("received {msg:?}"); + // Before proxy passing, forward to compute whatever data is left in the + // PqStream input buffer. Normally there is none, but our serverless npm + // driver in pipeline mode sends startup, password and first query + // immediately after opening the connection. + let (stream, read_buf) = stream.into_inner(); + node.stream.write_all(&read_buf).await?; - use FeStartupPacket::*; - match msg { - SslRequest => match stream.get_ref() { - Stream::Raw { .. 
} if !tried_ssl => { - tried_ssl = true; - - // We can't perform TLS handshake without a config - let enc = tls.is_some(); - stream.write_message(&Be::EncryptionResponse(enc)).await?; - if let Some(tls) = tls.take() { - // Upgrade raw stream into a secure TLS-backed stream. - // NOTE: We've consumed `tls`; this fact will be used later. - - let (raw, read_buf) = stream.into_inner(); - // TODO: Normally, client doesn't send any data before - // server says TLS handshake is ok and read_buf is empy. - // However, you could imagine pipelining of postgres - // SSLRequest + TLS ClientHello in one hunk similar to - // pipelining in our node js driver. We should probably - // support that by chaining read_buf with the stream. - if !read_buf.is_empty() { - bail!("data is sent before server replied with EncryptionResponse"); - } - let tls_stream = raw.upgrade(tls.to_server_config()).await?; - - let (_, tls_server_end_point) = tls - .cert_resolver - .resolve(tls_stream.get_ref().1.server_name()) - .context("missing certificate")?; - - stream = PqStream::new(Stream::Tls { - tls: Box::new(tls_stream), - tls_server_end_point, - }); - } - } - _ => bail!(ERR_PROTO_VIOLATION), - }, - GssEncRequest => match stream.get_ref() { - Stream::Raw { .. } if !tried_gss => { - tried_gss = true; - - // Currently, we don't support GSSAPI - stream.write_message(&Be::EncryptionResponse(false)).await?; - } - _ => bail!(ERR_PROTO_VIOLATION), - }, - StartupMessage { params, .. } => { - // Check that the config has been consumed during upgrade - // OR we didn't provide it at all (for dev purposes). 
- if tls.is_some() { - stream.throw_error_str(ERR_INSECURE_CONNECTION).await?; - } - - info!(session_type = "normal", "successful handshake"); - break Ok(Some((stream, params))); - } - CancelRequest(cancel_key_data) => { - cancel_map.cancel_session(cancel_key_data).await?; - - info!(session_type = "cancellation", "successful handshake"); - break Ok(None); - } - } - } + Ok(Some(ProxyPassthrough { + client: stream, + aux: node.aux.clone(), + compute: node, + req: _request_gauge, + conn: conn_gauge, + cancel: session, + })) } /// Finish client connection initialization: confirm auth success, send params, etc. #[tracing::instrument(skip_all)] async fn prepare_client_connection( node: &compute::PostgresConnection, - session: cancellation::Session<'_>, + session: &cancellation::Session, stream: &mut PqStream, -) -> anyhow::Result<()> { +) -> Result<(), std::io::Error> { // Register compute's query cancellation token and produce a new, unique one. // The new token (cancel_key_data) will be sent to the client. let cancel_key_data = session.enable_query_cancellation(node.cancel_closure.clone()); @@ -349,151 +366,6 @@ async fn prepare_client_connection( Ok(()) } -/// Forward bytes in both directions (client <-> compute). -#[tracing::instrument(skip_all)] -pub async fn proxy_pass( - ctx: &mut RequestMonitoring, - client: impl AsyncRead + AsyncWrite + Unpin, - compute: impl AsyncRead + AsyncWrite + Unpin, - aux: MetricsAuxInfo, -) -> anyhow::Result<()> { - ctx.set_success(); - ctx.log(); - - let usage = USAGE_METRICS.register(Ids { - endpoint_id: aux.endpoint_id.clone(), - branch_id: aux.branch_id.clone(), - }); - - let m_sent = NUM_BYTES_PROXIED_COUNTER.with_label_values(&["tx"]); - let m_sent2 = NUM_BYTES_PROXIED_PER_CLIENT_COUNTER.with_label_values(&aux.traffic_labels("tx")); - let mut client = MeasuredStream::new( - client, - |_| {}, - |cnt| { - // Number of bytes we sent to the client (outbound). 
- m_sent.inc_by(cnt as u64); - m_sent2.inc_by(cnt as u64); - usage.record_egress(cnt as u64); - }, - ); - - let m_recv = NUM_BYTES_PROXIED_COUNTER.with_label_values(&["rx"]); - let m_recv2 = NUM_BYTES_PROXIED_PER_CLIENT_COUNTER.with_label_values(&aux.traffic_labels("rx")); - let mut compute = MeasuredStream::new( - compute, - |_| {}, - |cnt| { - // Number of bytes the client sent to the compute node (inbound). - m_recv.inc_by(cnt as u64); - m_recv2.inc_by(cnt as u64); - }, - ); - - // Starting from here we only proxy the client's traffic. - info!("performing the proxy pass..."); - let _ = tokio::io::copy_bidirectional(&mut client, &mut compute).await?; - - Ok(()) -} - -/// Thin connection context. -struct Client<'a, S> { - /// The underlying libpq protocol stream. - stream: PqStream>, - /// Client credentials that we care about. - user_info: auth::BackendType<'a, auth::ComputeUserInfoMaybeEndpoint>, - /// KV-dictionary with PostgreSQL connection params. - params: &'a StartupMessageParams, - /// Allow self-signed certificates (for testing). - allow_self_signed_compute: bool, - /// Rate limiter for endpoints - endpoint_rate_limiter: Arc, -} - -impl<'a, S> Client<'a, S> { - /// Construct a new connection context. - fn new( - stream: PqStream>, - user_info: auth::BackendType<'a, auth::ComputeUserInfoMaybeEndpoint>, - params: &'a StartupMessageParams, - allow_self_signed_compute: bool, - endpoint_rate_limiter: Arc, - ) -> Self { - Self { - stream, - user_info, - params, - allow_self_signed_compute, - endpoint_rate_limiter, - } - } -} - -impl Client<'_, S> { - /// Let the client authenticate and connect to the designated compute node. - // Instrumentation logs endpoint name everywhere. Doesn't work for link - // auth; strictly speaking we don't know endpoint name in its case. 
- #[tracing::instrument(name = "", fields(ep = %self.user_info.get_endpoint().unwrap_or_default()), skip_all)] - async fn connect_to_db( - self, - ctx: &mut RequestMonitoring, - session: cancellation::Session<'_>, - mode: ClientMode, - config: &'static AuthenticationConfig, - ) -> anyhow::Result<()> { - let Self { - mut stream, - user_info, - params, - allow_self_signed_compute, - endpoint_rate_limiter, - } = self; - - // check rate limit - if let Some(ep) = user_info.get_endpoint() { - if !endpoint_rate_limiter.check(ep) { - return stream - .throw_error(auth::AuthError::too_many_connections()) - .await; - } - } - - let user = user_info.get_user().to_owned(); - let auth_result = match user_info - .authenticate(ctx, &mut stream, mode.allow_cleartext(), config) - .await - { - Ok(auth_result) => auth_result, - Err(e) => { - let db = params.get("database"); - let app = params.get("application_name"); - let params_span = tracing::info_span!("", ?user, ?db, ?app); - - return stream.throw_error(e).instrument(params_span).await; - } - }; - - let (mut node_info, user_info) = auth_result; - - node_info.allow_self_signed_compute = allow_self_signed_compute; - - let aux = node_info.aux.clone(); - let mut node = connect_to_compute(ctx, &TcpMechanism { params }, node_info, &user_info) - .or_else(|e| stream.throw_error(e)) - .await?; - - prepare_client_connection(&node, session, &mut stream).await?; - // Before proxy passing, forward to compute whatever data is left in the - // PqStream input buffer. Normally there is none, but our serverless npm - // driver in pipeline mode sends startup, password and first query - // immediately after opening the connection. 
- let (stream, read_buf) = stream.into_inner(); - node.stream.write_all(&read_buf).await?; - proxy_pass(ctx, stream, node.stream, aux).await - } -} - #[derive(Debug, Clone, PartialEq, Eq, Default)] pub struct NeonOptions(Vec<(SmolStr, SmolStr)>); @@ -508,6 +380,11 @@ impl NeonOptions { Self::parse_from_iter(StartupMessageParams::parse_options_raw(options)) } + pub fn is_ephemeral(&self) -> bool { + // Currently, neon endpoint options are all reserved for ephemeral endpoints. + !self.0.is_empty() + } + fn parse_from_iter<'a>(options: impl Iterator) -> Self { let mut options = options .filter_map(neon_option) diff --git a/proxy/src/proxy/connect_compute.rs b/proxy/src/proxy/connect_compute.rs index 8bbe88aa51..c76e2ff6d9 100644 --- a/proxy/src/proxy/connect_compute.rs +++ b/proxy/src/proxy/connect_compute.rs @@ -1,15 +1,17 @@ use crate::{ - auth, + auth::backend::ComputeCredentialKeys, compute::{self, PostgresConnection}, - console::{self, errors::WakeComputeError, Api}, + console::{self, errors::WakeComputeError, CachedNodeInfo, NodeInfo}, context::RequestMonitoring, - metrics::{bool_to_str, NUM_CONNECTION_FAILURES, NUM_WAKEUP_FAILURES}, - proxy::retry::{retry_after, ShouldRetry}, + error::ReportableError, + metrics::NUM_CONNECTION_FAILURES, + proxy::{ + retry::{retry_after, ShouldRetry}, + wake_compute::wake_compute, + }, }; use async_trait::async_trait; -use hyper::StatusCode; use pq_proto::StartupMessageParams; -use std::ops::ControlFlow; use tokio::time; use tracing::{error, info, warn}; @@ -19,7 +21,7 @@ const CONNECT_TIMEOUT: time::Duration = time::Duration::from_secs(2); /// (e.g. the compute node's address might've changed at the wrong time). /// Invalidate the cache entry (if any) to prevent subsequent errors. 
#[tracing::instrument(name = "invalidate_cache", skip_all)] -pub fn invalidate_cache(node_info: console::CachedNodeInfo) -> compute::ConnCfg { +pub fn invalidate_cache(node_info: console::CachedNodeInfo) -> NodeInfo { let is_cached = node_info.cached(); if is_cached { warn!("invalidating stalled compute node info cache entry"); @@ -30,28 +32,13 @@ pub fn invalidate_cache(node_info: console::CachedNodeInfo) -> compute::ConnCfg }; NUM_CONNECTION_FAILURES.with_label_values(&[label]).inc(); - node_info.invalidate().config -} - -/// Try to connect to the compute node once. -#[tracing::instrument(name = "connect_once", fields(pid = tracing::field::Empty), skip_all)] -async fn connect_to_compute_once( - ctx: &mut RequestMonitoring, - node_info: &console::CachedNodeInfo, - timeout: time::Duration, -) -> Result { - let allow_self_signed_compute = node_info.allow_self_signed_compute; - - node_info - .config - .connect(ctx, allow_self_signed_compute, timeout) - .await + node_info.invalidate() } #[async_trait] pub trait ConnectMechanism { type Connection; - type ConnectError; + type ConnectError: ReportableError; type Error: From; async fn connect_once( &self, @@ -63,6 +50,16 @@ pub trait ConnectMechanism { fn update_connect_config(&self, conf: &mut compute::ConnCfg); } +#[async_trait] +pub trait ComputeConnectBackend { + async fn wake_compute( + &self, + ctx: &mut RequestMonitoring, + ) -> Result; + + fn get_keys(&self) -> Option<&ComputeCredentialKeys>; +} + pub struct TcpMechanism<'a> { /// KV-dictionary with PostgreSQL connection params. 
pub params: &'a StartupMessageParams, @@ -74,13 +71,14 @@ impl ConnectMechanism for TcpMechanism<'_> { type ConnectError = compute::ConnectionError; type Error = compute::ConnectionError; + #[tracing::instrument(fields(pid = tracing::field::Empty), skip_all)] async fn connect_once( &self, ctx: &mut RequestMonitoring, node_info: &console::CachedNodeInfo, timeout: time::Duration, ) -> Result { - connect_to_compute_once(ctx, node_info, timeout).await + node_info.connect(ctx, timeout).await } fn update_connect_config(&self, config: &mut compute::ConnCfg) { @@ -88,56 +86,30 @@ impl ConnectMechanism for TcpMechanism<'_> { } } -fn report_error(e: &WakeComputeError, retry: bool) { - use crate::console::errors::ApiError; - let retry = bool_to_str(retry); - let kind = match e { - WakeComputeError::BadComputeAddress(_) => "bad_compute_address", - WakeComputeError::ApiError(ApiError::Transport(_)) => "api_transport_error", - WakeComputeError::ApiError(ApiError::Console { - status: StatusCode::LOCKED, - ref text, - }) if text.contains("written data quota exceeded") - || text.contains("the limit for current plan reached") => - { - "quota_exceeded" - } - WakeComputeError::ApiError(ApiError::Console { - status: StatusCode::LOCKED, - .. - }) => "api_console_locked", - WakeComputeError::ApiError(ApiError::Console { - status: StatusCode::BAD_REQUEST, - .. - }) => "api_console_bad_request", - WakeComputeError::ApiError(ApiError::Console { status, .. }) - if status.is_server_error() => - { - "api_console_other_server_error" - } - WakeComputeError::ApiError(ApiError::Console { .. }) => "api_console_other_error", - WakeComputeError::TimeoutError => "timeout_error", - }; - NUM_WAKEUP_FAILURES.with_label_values(&[retry, kind]).inc(); -} - /// Try to connect to the compute node, retrying if necessary. /// This function might update `node_info`, so we take it by `&mut`. 
#[tracing::instrument(skip_all)] -pub async fn connect_to_compute( +pub async fn connect_to_compute( ctx: &mut RequestMonitoring, mechanism: &M, - mut node_info: console::CachedNodeInfo, - user_info: &auth::BackendType<'_, auth::backend::ComputeUserInfo>, + user_info: &B, + allow_self_signed_compute: bool, ) -> Result where M::ConnectError: ShouldRetry + std::fmt::Debug, M::Error: From, { + let mut num_retries = 0; + let mut node_info = wake_compute(&mut num_retries, ctx, user_info).await?; + if let Some(keys) = user_info.get_keys() { + node_info.set_keys(keys); + } + node_info.allow_self_signed_compute = allow_self_signed_compute; + // let mut node_info = credentials.get_node_info(ctx, user_info).await?; mechanism.update_connect_config(&mut node_info.config); // try once - let (config, err) = match mechanism + let err = match mechanism .connect_once(ctx, &node_info, CONNECT_TIMEOUT) .await { @@ -145,57 +117,35 @@ where ctx.latency_timer.success(); return Ok(res); } - Err(e) => { - error!(error = ?e, "could not connect to compute node"); - (invalidate_cache(node_info), e) - } + Err(e) => e, }; - ctx.latency_timer.cache_miss(); + error!(error = ?err, "could not connect to compute node"); - let mut num_retries = 1; - - // if we failed to connect, it's likely that the compute node was suspended, wake a new compute node - info!("compute node's state has likely changed; requesting a wake-up"); - let node_info = loop { - let wake_res = match user_info { - auth::BackendType::Console(api, user_info) => api.wake_compute(ctx, user_info).await, - // nothing to do? 
- auth::BackendType::Link(_) => return Err(err.into()), - // test backend - #[cfg(test)] - auth::BackendType::Test(x) => x.wake_compute(), - }; - - match handle_try_wake(wake_res, num_retries) { - Err(e) => { - error!(error = ?e, num_retries, retriable = false, "couldn't wake compute node"); - report_error(&e, false); - return Err(e.into()); - } - // failed to wake up but we can continue to retry - Ok(ControlFlow::Continue(e)) => { - report_error(&e, true); - warn!(error = ?e, num_retries, retriable = true, "couldn't wake compute node"); - } - // successfully woke up a compute node and can break the wakeup loop - Ok(ControlFlow::Break(mut node_info)) => { - node_info.config.reuse_password(&config); - mechanism.update_connect_config(&mut node_info.config); - break node_info; - } + let node_info = if !node_info.cached() { + // If we just recieved this from cplane and dodn't get it from cache, we shouldn't retry. + // Do not need to retrieve a new node_info, just return the old one. + if !err.should_retry(num_retries) { + return Err(err.into()); } + node_info + } else { + // if we failed to connect, it's likely that the compute node was suspended, wake a new compute node + info!("compute node's state has likely changed; requesting a wake-up"); + ctx.latency_timer.cache_miss(); + let old_node_info = invalidate_cache(node_info); + let mut node_info = wake_compute(&mut num_retries, ctx, user_info).await?; + node_info.reuse_settings(old_node_info); - let wait_duration = retry_after(num_retries); - num_retries += 1; - - time::sleep(wait_duration).await; + mechanism.update_connect_config(&mut node_info.config); + node_info }; // now that we have a new node, try connect to it repeatedly. // this can error for a few reasons, for instance: // * DNS connection settings haven't quite propagated yet info!("wake_compute success. 
attempting to connect"); + num_retries = 1; loop { match mechanism .connect_once(ctx, &node_info, CONNECT_TIMEOUT) @@ -221,23 +171,3 @@ where time::sleep(wait_duration).await; } } - -/// Attempts to wake up the compute node. -/// * Returns Ok(Continue(e)) if there was an error waking but retries are acceptable -/// * Returns Ok(Break(node)) if the wakeup succeeded -/// * Returns Err(e) if there was an error -pub fn handle_try_wake( - result: Result, - num_retries: u32, -) -> Result, WakeComputeError> { - match result { - Err(err) => match &err { - WakeComputeError::ApiError(api) if api.should_retry(num_retries) => { - Ok(ControlFlow::Continue(err)) - } - _ => Err(err), - }, - // Ready to try again. - Ok(new) => Ok(ControlFlow::Break(new)), - } -} diff --git a/proxy/src/proxy/copy_bidirectional.rs b/proxy/src/proxy/copy_bidirectional.rs new file mode 100644 index 0000000000..684be74f9a --- /dev/null +++ b/proxy/src/proxy/copy_bidirectional.rs @@ -0,0 +1,274 @@ +use tokio::io::{AsyncRead, AsyncWrite, ReadBuf}; +use tracing::info; + +use std::future::poll_fn; +use std::io; +use std::pin::Pin; +use std::task::{ready, Context, Poll}; + +#[derive(Debug)] +enum TransferState { + Running(CopyBuffer), + ShuttingDown(u64), + Done(u64), +} + +fn transfer_one_direction( + cx: &mut Context<'_>, + state: &mut TransferState, + r: &mut A, + w: &mut B, +) -> Poll> +where + A: AsyncRead + AsyncWrite + Unpin + ?Sized, + B: AsyncRead + AsyncWrite + Unpin + ?Sized, +{ + let mut r = Pin::new(r); + let mut w = Pin::new(w); + loop { + match state { + TransferState::Running(buf) => { + let count = ready!(buf.poll_copy(cx, r.as_mut(), w.as_mut()))?; + *state = TransferState::ShuttingDown(count); + } + TransferState::ShuttingDown(count) => { + ready!(w.as_mut().poll_shutdown(cx))?; + *state = TransferState::Done(*count); + } + TransferState::Done(count) => return Poll::Ready(Ok(*count)), + } + } +} + +#[tracing::instrument(skip_all)] +pub(super) async fn copy_bidirectional_client_compute( + 
client: &mut Client, + compute: &mut Compute, +) -> Result<(u64, u64), std::io::Error> +where + Client: AsyncRead + AsyncWrite + Unpin + ?Sized, + Compute: AsyncRead + AsyncWrite + Unpin + ?Sized, +{ + let mut client_to_compute = TransferState::Running(CopyBuffer::new()); + let mut compute_to_client = TransferState::Running(CopyBuffer::new()); + + poll_fn(|cx| { + let mut client_to_compute_result = + transfer_one_direction(cx, &mut client_to_compute, client, compute)?; + let mut compute_to_client_result = + transfer_one_direction(cx, &mut compute_to_client, compute, client)?; + + // Early termination checks from compute to client. + if let TransferState::Done(_) = compute_to_client { + if let TransferState::Running(buf) = &client_to_compute { + info!("Compute is done, terminate client"); + // Initiate shutdown + client_to_compute = TransferState::ShuttingDown(buf.amt); + client_to_compute_result = + transfer_one_direction(cx, &mut client_to_compute, client, compute)?; + } + } + + // Early termination checks from compute to client. + if let TransferState::Done(_) = client_to_compute { + if let TransferState::Running(buf) = &compute_to_client { + info!("Client is done, terminate compute"); + // Initiate shutdown + compute_to_client = TransferState::ShuttingDown(buf.amt); + compute_to_client_result = + transfer_one_direction(cx, &mut compute_to_client, client, compute)?; + } + } + + // It is not a problem if ready! returns early ... 
(comment remains the same) + let client_to_compute = ready!(client_to_compute_result); + let compute_to_client = ready!(compute_to_client_result); + + Poll::Ready(Ok((client_to_compute, compute_to_client))) + }) + .await +} + +#[derive(Debug)] +pub(super) struct CopyBuffer { + read_done: bool, + need_flush: bool, + pos: usize, + cap: usize, + amt: u64, + buf: Box<[u8]>, +} +const DEFAULT_BUF_SIZE: usize = 8 * 1024; + +impl CopyBuffer { + pub(super) fn new() -> Self { + Self { + read_done: false, + need_flush: false, + pos: 0, + cap: 0, + amt: 0, + buf: vec![0; DEFAULT_BUF_SIZE].into_boxed_slice(), + } + } + + fn poll_fill_buf( + &mut self, + cx: &mut Context<'_>, + reader: Pin<&mut R>, + ) -> Poll> + where + R: AsyncRead + ?Sized, + { + let me = &mut *self; + let mut buf = ReadBuf::new(&mut me.buf); + buf.set_filled(me.cap); + + let res = reader.poll_read(cx, &mut buf); + if let Poll::Ready(Ok(())) = res { + let filled_len = buf.filled().len(); + me.read_done = me.cap == filled_len; + me.cap = filled_len; + } + res + } + + fn poll_write_buf( + &mut self, + cx: &mut Context<'_>, + mut reader: Pin<&mut R>, + mut writer: Pin<&mut W>, + ) -> Poll> + where + R: AsyncRead + ?Sized, + W: AsyncWrite + ?Sized, + { + let me = &mut *self; + match writer.as_mut().poll_write(cx, &me.buf[me.pos..me.cap]) { + Poll::Pending => { + // Top up the buffer towards full if we can read a bit more + // data - this should improve the chances of a large write + if !me.read_done && me.cap < me.buf.len() { + ready!(me.poll_fill_buf(cx, reader.as_mut()))?; + } + Poll::Pending + } + res => res, + } + } + + pub(super) fn poll_copy( + &mut self, + cx: &mut Context<'_>, + mut reader: Pin<&mut R>, + mut writer: Pin<&mut W>, + ) -> Poll> + where + R: AsyncRead + ?Sized, + W: AsyncWrite + ?Sized, + { + loop { + // If our buffer is empty, then we need to read some data to + // continue. 
+ if self.pos == self.cap && !self.read_done { + self.pos = 0; + self.cap = 0; + + match self.poll_fill_buf(cx, reader.as_mut()) { + Poll::Ready(Ok(())) => (), + Poll::Ready(Err(err)) => return Poll::Ready(Err(err)), + Poll::Pending => { + // Try flushing when the reader has no progress to avoid deadlock + // when the reader depends on buffered writer. + if self.need_flush { + ready!(writer.as_mut().poll_flush(cx))?; + self.need_flush = false; + } + + return Poll::Pending; + } + } + } + + // If our buffer has some data, let's write it out! + while self.pos < self.cap { + let i = ready!(self.poll_write_buf(cx, reader.as_mut(), writer.as_mut()))?; + if i == 0 { + return Poll::Ready(Err(io::Error::new( + io::ErrorKind::WriteZero, + "write zero byte into writer", + ))); + } else { + self.pos += i; + self.amt += i as u64; + self.need_flush = true; + } + } + + // If pos larger than cap, this loop will never stop. + // In particular, user's wrong poll_write implementation returning + // incorrect written length may lead to thread blocking. + debug_assert!( + self.pos <= self.cap, + "writer returned length larger than input slice" + ); + + // If we've written all the data and we've seen EOF, flush out the + // data and finish the transfer. 
+ if self.pos == self.cap && self.read_done { + ready!(writer.as_mut().poll_flush(cx))?; + return Poll::Ready(Ok(self.amt)); + } + } + } +} + +#[cfg(test)] +mod tests { + use super::*; + use tokio::io::AsyncWriteExt; + + #[tokio::test] + async fn test_client_to_compute() { + let (mut client_client, mut client_proxy) = tokio::io::duplex(8); // Create a mock duplex stream + let (mut compute_proxy, mut compute_client) = tokio::io::duplex(32); // Create a mock duplex stream + + // Simulate 'a' finishing while there's still data for 'b' + client_client.write_all(b"hello").await.unwrap(); + client_client.shutdown().await.unwrap(); + compute_client.write_all(b"Neon").await.unwrap(); + compute_client.shutdown().await.unwrap(); + + let result = copy_bidirectional_client_compute(&mut client_proxy, &mut compute_proxy) + .await + .unwrap(); + + // Assert correct transferred amounts + let (client_to_compute_count, compute_to_client_count) = result; + assert_eq!(client_to_compute_count, 5); // 'hello' was transferred + assert_eq!(compute_to_client_count, 4); // response only partially transferred or not at all + } + + #[tokio::test] + async fn test_compute_to_client() { + let (mut client_client, mut client_proxy) = tokio::io::duplex(32); // Create a mock duplex stream + let (mut compute_proxy, mut compute_client) = tokio::io::duplex(8); // Create a mock duplex stream + + // Simulate 'a' finishing while there's still data for 'b' + compute_client.write_all(b"hello").await.unwrap(); + compute_client.shutdown().await.unwrap(); + client_client + .write_all(b"Neon Serverless Postgres") + .await + .unwrap(); + + let result = copy_bidirectional_client_compute(&mut client_proxy, &mut compute_proxy) + .await + .unwrap(); + + // Assert correct transferred amounts + let (client_to_compute_count, compute_to_client_count) = result; + assert_eq!(compute_to_client_count, 5); // 'hello' was transferred + assert!(client_to_compute_count <= 8); // response only partially transferred or not at all 
+ } +} diff --git a/proxy/src/proxy/handshake.rs b/proxy/src/proxy/handshake.rs new file mode 100644 index 0000000000..4665e07d23 --- /dev/null +++ b/proxy/src/proxy/handshake.rs @@ -0,0 +1,140 @@ +use pq_proto::{BeMessage as Be, CancelKeyData, FeStartupPacket, StartupMessageParams}; +use thiserror::Error; +use tokio::io::{AsyncRead, AsyncWrite}; +use tracing::info; + +use crate::{ + config::TlsConfig, + error::ReportableError, + proxy::ERR_INSECURE_CONNECTION, + stream::{PqStream, Stream, StreamUpgradeError}, +}; + +#[derive(Error, Debug)] +pub enum HandshakeError { + #[error("data is sent before server replied with EncryptionResponse")] + EarlyData, + + #[error("protocol violation")] + ProtocolViolation, + + #[error("missing certificate")] + MissingCertificate, + + #[error("{0}")] + StreamUpgradeError(#[from] StreamUpgradeError), + + #[error("{0}")] + Io(#[from] std::io::Error), + + #[error("{0}")] + ReportedError(#[from] crate::stream::ReportedError), +} + +impl ReportableError for HandshakeError { + fn get_error_kind(&self) -> crate::error::ErrorKind { + match self { + HandshakeError::EarlyData => crate::error::ErrorKind::User, + HandshakeError::ProtocolViolation => crate::error::ErrorKind::User, + // This error should not happen, but will if we have no default certificate and + // the client sends no SNI extension. + // If they provide SNI then we can be sure there is a certificate that matches. 
+ HandshakeError::MissingCertificate => crate::error::ErrorKind::Service, + HandshakeError::StreamUpgradeError(upgrade) => match upgrade { + StreamUpgradeError::AlreadyTls => crate::error::ErrorKind::Service, + StreamUpgradeError::Io(_) => crate::error::ErrorKind::ClientDisconnect, + }, + HandshakeError::Io(_) => crate::error::ErrorKind::ClientDisconnect, + HandshakeError::ReportedError(e) => e.get_error_kind(), + } + } +} + +pub enum HandshakeData { + Startup(PqStream>, StartupMessageParams), + Cancel(CancelKeyData), +} + +/// Establish a (most probably, secure) connection with the client. +/// For better testing experience, `stream` can be any object satisfying the traits. +/// It's easier to work with owned `stream` here as we need to upgrade it to TLS; +/// we also take an extra care of propagating only the select handshake errors to client. +#[tracing::instrument(skip_all)] +pub async fn handshake( + stream: S, + mut tls: Option<&TlsConfig>, +) -> Result, HandshakeError> { + // Client may try upgrading to each protocol only once + let (mut tried_ssl, mut tried_gss) = (false, false); + + let mut stream = PqStream::new(Stream::from_raw(stream)); + loop { + let msg = stream.read_startup_packet().await?; + info!("received {msg:?}"); + + use FeStartupPacket::*; + match msg { + SslRequest => match stream.get_ref() { + Stream::Raw { .. } if !tried_ssl => { + tried_ssl = true; + + // We can't perform TLS handshake without a config + let enc = tls.is_some(); + stream.write_message(&Be::EncryptionResponse(enc)).await?; + if let Some(tls) = tls.take() { + // Upgrade raw stream into a secure TLS-backed stream. + // NOTE: We've consumed `tls`; this fact will be used later. + + let (raw, read_buf) = stream.into_inner(); + // TODO: Normally, client doesn't send any data before + // server says TLS handshake is ok and read_buf is empy. 
+ // However, you could imagine pipelining of postgres + // SSLRequest + TLS ClientHello in one hunk similar to + // pipelining in our node js driver. We should probably + // support that by chaining read_buf with the stream. + if !read_buf.is_empty() { + return Err(HandshakeError::EarlyData); + } + let tls_stream = raw.upgrade(tls.to_server_config()).await?; + + let (_, tls_server_end_point) = tls + .cert_resolver + .resolve(tls_stream.get_ref().1.server_name()) + .ok_or(HandshakeError::MissingCertificate)?; + + stream = PqStream::new(Stream::Tls { + tls: Box::new(tls_stream), + tls_server_end_point, + }); + } + } + _ => return Err(HandshakeError::ProtocolViolation), + }, + GssEncRequest => match stream.get_ref() { + Stream::Raw { .. } if !tried_gss => { + tried_gss = true; + + // Currently, we don't support GSSAPI + stream.write_message(&Be::EncryptionResponse(false)).await?; + } + _ => return Err(HandshakeError::ProtocolViolation), + }, + StartupMessage { params, .. } => { + // Check that the config has been consumed during upgrade + // OR we didn't provide it at all (for dev purposes). 
+ if tls.is_some() { + return stream + .throw_error_str(ERR_INSECURE_CONNECTION, crate::error::ErrorKind::User) + .await?; + } + + info!(session_type = "normal", "successful handshake"); + break Ok(HandshakeData::Startup(stream, params)); + } + CancelRequest(cancel_key_data) => { + info!(session_type = "cancellation", "successful handshake"); + break Ok(HandshakeData::Cancel(cancel_key_data)); + } + } + } +} diff --git a/proxy/src/proxy/passthrough.rs b/proxy/src/proxy/passthrough.rs new file mode 100644 index 0000000000..b2f682fd2f --- /dev/null +++ b/proxy/src/proxy/passthrough.rs @@ -0,0 +1,74 @@ +use crate::{ + cancellation, + compute::PostgresConnection, + console::messages::MetricsAuxInfo, + metrics::NUM_BYTES_PROXIED_COUNTER, + stream::Stream, + usage_metrics::{Ids, USAGE_METRICS}, +}; +use metrics::IntCounterPairGuard; +use tokio::io::{AsyncRead, AsyncWrite}; +use tracing::info; +use utils::measured_stream::MeasuredStream; + +/// Forward bytes in both directions (client <-> compute). +#[tracing::instrument(skip_all)] +pub async fn proxy_pass( + client: impl AsyncRead + AsyncWrite + Unpin, + compute: impl AsyncRead + AsyncWrite + Unpin, + aux: MetricsAuxInfo, +) -> anyhow::Result<()> { + let usage = USAGE_METRICS.register(Ids { + endpoint_id: aux.endpoint_id.clone(), + branch_id: aux.branch_id.clone(), + }); + + let m_sent = NUM_BYTES_PROXIED_COUNTER.with_label_values(&["tx"]); + let mut client = MeasuredStream::new( + client, + |_| {}, + |cnt| { + // Number of bytes we sent to the client (outbound). + m_sent.inc_by(cnt as u64); + usage.record_egress(cnt as u64); + }, + ); + + let m_recv = NUM_BYTES_PROXIED_COUNTER.with_label_values(&["rx"]); + let mut compute = MeasuredStream::new( + compute, + |_| {}, + |cnt| { + // Number of bytes the client sent to the compute node (inbound). + m_recv.inc_by(cnt as u64); + }, + ); + + // Starting from here we only proxy the client's traffic. 
+ info!("performing the proxy pass..."); + let _ = crate::proxy::copy_bidirectional::copy_bidirectional_client_compute( + &mut client, + &mut compute, + ) + .await?; + + Ok(()) +} + +pub struct ProxyPassthrough { + pub client: Stream, + pub compute: PostgresConnection, + pub aux: MetricsAuxInfo, + + pub req: IntCounterPairGuard, + pub conn: IntCounterPairGuard, + pub cancel: cancellation::Session, +} + +impl ProxyPassthrough { + pub async fn proxy_pass(self) -> anyhow::Result<()> { + let res = proxy_pass(self.client, self.compute.stream, self.aux).await; + self.compute.cancel_closure.try_cancel_query().await?; + res + } +} diff --git a/proxy/src/proxy/tests.rs b/proxy/src/proxy/tests.rs index a552a857b9..5d0340e852 100644 --- a/proxy/src/proxy/tests.rs +++ b/proxy/src/proxy/tests.rs @@ -2,17 +2,25 @@ mod mitm; +use std::time::Duration; + use super::connect_compute::ConnectMechanism; use super::retry::ShouldRetry; use super::*; -use crate::auth::backend::{ComputeUserInfo, TestBackend}; -use crate::auth::IpPattern; +use crate::auth::backend::{ + ComputeCredentialKeys, ComputeCredentials, ComputeUserInfo, MaybeOwned, TestBackend, +}; use crate::config::CertResolver; +use crate::console::caches::NodeInfoCache; +use crate::console::provider::{CachedAllowedIps, CachedRoleSecret, ConsoleBackend}; use crate::console::{self, CachedNodeInfo, NodeInfo}; +use crate::error::ErrorKind; use crate::proxy::retry::{retry_after, NUM_RETRIES_CONNECT}; -use crate::{auth, http, sasl, scram}; +use crate::{http, sasl, scram}; +use anyhow::{bail, Context}; use async_trait::async_trait; use rstest::rstest; +use rustls::pki_types; use tokio_postgres::config::SslMode; use tokio_postgres::tls::{MakeTlsConnect, NoTls}; use tokio_postgres_rustls::{MakeRustlsConnect, RustlsStream}; @@ -21,7 +29,11 @@ use tokio_postgres_rustls::{MakeRustlsConnect, RustlsStream}; fn generate_certs( hostname: &str, common_name: &str, -) -> anyhow::Result<(rustls::Certificate, rustls::Certificate, 
rustls::PrivateKey)> { +) -> anyhow::Result<( + pki_types::CertificateDer<'static>, + pki_types::CertificateDer<'static>, + pki_types::PrivateKeyDer<'static>, +)> { let ca = rcgen::Certificate::from_params({ let mut params = rcgen::CertificateParams::default(); params.is_ca = rcgen::IsCa::Ca(rcgen::BasicConstraints::Unconstrained); @@ -38,9 +50,9 @@ fn generate_certs( })?; Ok(( - rustls::Certificate(ca.serialize_der()?), - rustls::Certificate(cert.serialize_der_with_signer(&ca)?), - rustls::PrivateKey(cert.serialize_private_key_der()), + pki_types::CertificateDer::from(ca.serialize_der()?), + pki_types::CertificateDer::from(cert.serialize_der_with_signer(&ca)?), + pki_types::PrivateKeyDer::Pkcs8(cert.serialize_private_key_der().into()), )) } @@ -75,9 +87,8 @@ fn generate_tls_config<'a>( let tls_config = { let config = rustls::ServerConfig::builder() - .with_safe_defaults() .with_no_client_auth() - .with_single_cert(vec![cert.clone()], key.clone())? + .with_single_cert(vec![cert.clone()], key.clone_key())? .into(); let mut cert_resolver = CertResolver::new(); @@ -94,10 +105,9 @@ fn generate_tls_config<'a>( let client_config = { let config = rustls::ClientConfig::builder() - .with_safe_defaults() .with_root_certificates({ let mut store = rustls::RootCertStore::empty(); - store.add(&ca)?; + store.add(ca)?; store }) .with_no_client_auth(); @@ -126,9 +136,8 @@ struct Scram(scram::ServerSecret); impl Scram { fn new(password: &str) -> anyhow::Result { - let salt = rand::random::<[u8; 16]>(); - let secret = scram::ServerSecret::build(password, &salt, 256) - .context("failed to generate scram secret")?; + let secret = + scram::ServerSecret::build(password).context("failed to generate scram secret")?; Ok(Scram(secret)) } @@ -144,7 +153,7 @@ impl TestAuth for Scram { stream: &mut PqStream>, ) -> anyhow::Result<()> { let outcome = auth::AuthFlow::new(stream) - .begin(auth::Scram(&self.0)) + .begin(auth::Scram(&self.0, &mut RequestMonitoring::test())) .await? 
.authenticate() .await?; @@ -163,11 +172,11 @@ async fn dummy_proxy( tls: Option, auth: impl TestAuth + Send, ) -> anyhow::Result<()> { - let cancel_map = CancelMap::default(); let client = WithClientIp::new(client); - let (mut stream, _params) = handshake(client, tls.as_ref(), &cancel_map) - .await? - .context("handshake failed")?; + let mut stream = match handshake(client, tls.as_ref()).await? { + HandshakeData::Startup(stream, _) => stream, + HandshakeData::Cancel(_) => bail!("cancellation not supported"), + }; auth.authenticate(&mut stream).await?; @@ -371,9 +380,11 @@ enum ConnectAction { Fail, } +#[derive(Clone)] struct TestConnectMechanism { counter: Arc>, sequence: Vec, + cache: &'static NodeInfoCache, } impl TestConnectMechanism { @@ -392,6 +403,12 @@ impl TestConnectMechanism { Self { counter: Arc::new(std::sync::Mutex::new(0)), sequence, + cache: Box::leak(Box::new(NodeInfoCache::new( + "test", + 1, + Duration::from_secs(100), + false, + ))), } } } @@ -402,6 +419,13 @@ struct TestConnection; #[derive(Debug)] struct TestConnectError { retryable: bool, + kind: crate::error::ErrorKind, +} + +impl ReportableError for TestConnectError { + fn get_error_kind(&self) -> crate::error::ErrorKind { + self.kind + } } impl std::fmt::Display for TestConnectError { @@ -435,8 +459,14 @@ impl ConnectMechanism for TestConnectMechanism { *counter += 1; match action { ConnectAction::Connect => Ok(TestConnection), - ConnectAction::Retry => Err(TestConnectError { retryable: true }), - ConnectAction::Fail => Err(TestConnectError { retryable: false }), + ConnectAction::Retry => Err(TestConnectError { + retryable: true, + kind: ErrorKind::Compute, + }), + ConnectAction::Fail => Err(TestConnectError { + retryable: false, + kind: ErrorKind::Compute, + }), x => panic!("expecting action {:?}, connect is called instead", x), } } @@ -450,7 +480,7 @@ impl TestBackend for TestConnectMechanism { let action = self.sequence[*counter]; *counter += 1; match action { - ConnectAction::Wake => 
Ok(helper_create_cached_node_info()), + ConnectAction::Wake => Ok(helper_create_cached_node_info(self.cache)), ConnectAction::WakeFail => { let err = console::errors::ApiError::Console { status: http::StatusCode::FORBIDDEN, @@ -471,35 +501,52 @@ impl TestBackend for TestConnectMechanism { } } - fn get_allowed_ips(&self) -> Result, console::errors::GetAuthInfoError> { + fn get_allowed_ips_and_secret( + &self, + ) -> Result<(CachedAllowedIps, Option), console::errors::GetAuthInfoError> + { + unimplemented!("not used in tests") + } + fn get_role_secret(&self) -> Result { unimplemented!("not used in tests") } } -fn helper_create_cached_node_info() -> CachedNodeInfo { +fn helper_create_cached_node_info(cache: &'static NodeInfoCache) -> CachedNodeInfo { let node = NodeInfo { config: compute::ConnCfg::new(), aux: Default::default(), allow_self_signed_compute: false, }; - CachedNodeInfo::new_uncached(node) + let (_, node) = cache.insert("key".into(), node); + node } fn helper_create_connect_info( mechanism: &TestConnectMechanism, -) -> (CachedNodeInfo, auth::BackendType<'_, ComputeUserInfo>) { - let cache = helper_create_cached_node_info(); - let user_info = auth::BackendType::Test(mechanism); - (cache, user_info) +) -> auth::BackendType<'static, ComputeCredentials, &()> { + let user_info = auth::BackendType::Console( + MaybeOwned::Owned(ConsoleBackend::Test(Box::new(mechanism.clone()))), + ComputeCredentials { + info: ComputeUserInfo { + endpoint: "endpoint".into(), + user: "user".into(), + options: NeonOptions::parse_options_raw(""), + }, + keys: ComputeCredentialKeys::Password("password".into()), + }, + ); + user_info } #[tokio::test] async fn connect_to_compute_success() { + let _ = env_logger::try_init(); use ConnectAction::*; let mut ctx = RequestMonitoring::test(); - let mechanism = TestConnectMechanism::new(vec![Connect]); - let (cache, user_info) = helper_create_connect_info(&mechanism); - connect_to_compute(&mut ctx, &mechanism, cache, &user_info) + let mechanism 
= TestConnectMechanism::new(vec![Wake, Connect]); + let user_info = helper_create_connect_info(&mechanism); + connect_to_compute(&mut ctx, &mechanism, &user_info, false) .await .unwrap(); mechanism.verify(); @@ -507,11 +554,12 @@ async fn connect_to_compute_success() { #[tokio::test] async fn connect_to_compute_retry() { + let _ = env_logger::try_init(); use ConnectAction::*; let mut ctx = RequestMonitoring::test(); - let mechanism = TestConnectMechanism::new(vec![Retry, Wake, Retry, Connect]); - let (cache, user_info) = helper_create_connect_info(&mechanism); - connect_to_compute(&mut ctx, &mechanism, cache, &user_info) + let mechanism = TestConnectMechanism::new(vec![Wake, Retry, Wake, Connect]); + let user_info = helper_create_connect_info(&mechanism); + connect_to_compute(&mut ctx, &mechanism, &user_info, false) .await .unwrap(); mechanism.verify(); @@ -520,11 +568,12 @@ async fn connect_to_compute_retry() { /// Test that we don't retry if the error is not retryable. #[tokio::test] async fn connect_to_compute_non_retry_1() { + let _ = env_logger::try_init(); use ConnectAction::*; let mut ctx = RequestMonitoring::test(); - let mechanism = TestConnectMechanism::new(vec![Retry, Wake, Retry, Fail]); - let (cache, user_info) = helper_create_connect_info(&mechanism); - connect_to_compute(&mut ctx, &mechanism, cache, &user_info) + let mechanism = TestConnectMechanism::new(vec![Wake, Retry, Wake, Fail]); + let user_info = helper_create_connect_info(&mechanism); + connect_to_compute(&mut ctx, &mechanism, &user_info, false) .await .unwrap_err(); mechanism.verify(); @@ -533,11 +582,12 @@ async fn connect_to_compute_non_retry_1() { /// Even for non-retryable errors, we should retry at least once. 
#[tokio::test] async fn connect_to_compute_non_retry_2() { + let _ = env_logger::try_init(); use ConnectAction::*; let mut ctx = RequestMonitoring::test(); - let mechanism = TestConnectMechanism::new(vec![Fail, Wake, Retry, Connect]); - let (cache, user_info) = helper_create_connect_info(&mechanism); - connect_to_compute(&mut ctx, &mechanism, cache, &user_info) + let mechanism = TestConnectMechanism::new(vec![Wake, Fail, Wake, Connect]); + let user_info = helper_create_connect_info(&mechanism); + connect_to_compute(&mut ctx, &mechanism, &user_info, false) .await .unwrap(); mechanism.verify(); @@ -546,15 +596,16 @@ async fn connect_to_compute_non_retry_2() { /// Retry for at most `NUM_RETRIES_CONNECT` times. #[tokio::test] async fn connect_to_compute_non_retry_3() { + let _ = env_logger::try_init(); assert_eq!(NUM_RETRIES_CONNECT, 16); use ConnectAction::*; let mut ctx = RequestMonitoring::test(); let mechanism = TestConnectMechanism::new(vec![ - Retry, Wake, Retry, Retry, Retry, Retry, Retry, Retry, Retry, Retry, Retry, Retry, Retry, - Retry, Retry, Retry, Retry, /* the 17th time */ Retry, + Wake, Retry, Wake, Retry, Retry, Retry, Retry, Retry, Retry, Retry, Retry, Retry, Retry, + Retry, Retry, Retry, Retry, Retry, /* the 17th time */ Retry, ]); - let (cache, user_info) = helper_create_connect_info(&mechanism); - connect_to_compute(&mut ctx, &mechanism, cache, &user_info) + let user_info = helper_create_connect_info(&mechanism); + connect_to_compute(&mut ctx, &mechanism, &user_info, false) .await .unwrap_err(); mechanism.verify(); @@ -563,11 +614,12 @@ async fn connect_to_compute_non_retry_3() { /// Should retry wake compute. 
#[tokio::test] async fn wake_retry() { + let _ = env_logger::try_init(); use ConnectAction::*; let mut ctx = RequestMonitoring::test(); - let mechanism = TestConnectMechanism::new(vec![Retry, WakeRetry, Wake, Connect]); - let (cache, user_info) = helper_create_connect_info(&mechanism); - connect_to_compute(&mut ctx, &mechanism, cache, &user_info) + let mechanism = TestConnectMechanism::new(vec![WakeRetry, Wake, Connect]); + let user_info = helper_create_connect_info(&mechanism); + connect_to_compute(&mut ctx, &mechanism, &user_info, false) .await .unwrap(); mechanism.verify(); @@ -576,11 +628,12 @@ async fn wake_retry() { /// Wake failed with a non-retryable error. #[tokio::test] async fn wake_non_retry() { + let _ = env_logger::try_init(); use ConnectAction::*; let mut ctx = RequestMonitoring::test(); - let mechanism = TestConnectMechanism::new(vec![Retry, WakeFail]); - let (cache, user_info) = helper_create_connect_info(&mechanism); - connect_to_compute(&mut ctx, &mechanism, cache, &user_info) + let mechanism = TestConnectMechanism::new(vec![WakeRetry, WakeFail]); + let user_info = helper_create_connect_info(&mechanism); + connect_to_compute(&mut ctx, &mechanism, &user_info, false) .await .unwrap_err(); mechanism.verify(); diff --git a/proxy/src/proxy/tests/mitm.rs b/proxy/src/proxy/tests/mitm.rs index a0a84a1dc0..e0c2d836f4 100644 --- a/proxy/src/proxy/tests/mitm.rs +++ b/proxy/src/proxy/tests/mitm.rs @@ -11,7 +11,6 @@ use bytes::{Bytes, BytesMut}; use futures::{SinkExt, StreamExt}; use postgres_protocol::message::frontend; use tokio::io::{AsyncReadExt, DuplexStream}; -use tokio_postgres::config::SslMode; use tokio_postgres::tls::TlsConnect; use tokio_util::codec::{Decoder, Encoder}; @@ -35,12 +34,10 @@ async fn proxy_mitm( tokio::spawn(async move { // begin handshake with end_server let end_server = connect_tls(server2, client_config2.make_tls_connect().unwrap()).await; - // process handshake with end_client - let (end_client, startup) = - handshake(client1, 
Some(&server_config1), &CancelMap::default()) - .await - .unwrap() - .unwrap(); + let (end_client, startup) = match handshake(client1, Some(&server_config1)).await.unwrap() { + HandshakeData::Startup(stream, params) => (stream, params), + HandshakeData::Cancel(_) => panic!("cancellation not supported"), + }; let mut end_server = tokio_util::codec::Framed::new(end_server, PgFrame); let (end_client, buf) = end_client.framed.into_inner(); diff --git a/proxy/src/proxy/wake_compute.rs b/proxy/src/proxy/wake_compute.rs new file mode 100644 index 0000000000..bfe4b7ec3a --- /dev/null +++ b/proxy/src/proxy/wake_compute.rs @@ -0,0 +1,95 @@ +use crate::console::{errors::WakeComputeError, provider::CachedNodeInfo}; +use crate::context::RequestMonitoring; +use crate::metrics::{bool_to_str, NUM_WAKEUP_FAILURES}; +use crate::proxy::retry::retry_after; +use hyper::StatusCode; +use std::ops::ControlFlow; +use tracing::{error, warn}; + +use super::connect_compute::ComputeConnectBackend; +use super::retry::ShouldRetry; + +pub async fn wake_compute( + num_retries: &mut u32, + ctx: &mut RequestMonitoring, + api: &B, +) -> Result { + loop { + let wake_res = api.wake_compute(ctx).await; + match handle_try_wake(wake_res, *num_retries) { + Err(e) => { + error!(error = ?e, num_retries, retriable = false, "couldn't wake compute node"); + report_error(&e, false); + return Err(e); + } + Ok(ControlFlow::Continue(e)) => { + warn!(error = ?e, num_retries, retriable = true, "couldn't wake compute node"); + report_error(&e, true); + } + Ok(ControlFlow::Break(n)) => return Ok(n), + } + + let wait_duration = retry_after(*num_retries); + *num_retries += 1; + tokio::time::sleep(wait_duration).await; + } +} + +/// Attempts to wake up the compute node. 
+/// * Returns Ok(Continue(e)) if there was an error waking but retries are acceptable +/// * Returns Ok(Break(node)) if the wakeup succeeded +/// * Returns Err(e) if there was an error +pub fn handle_try_wake( + result: Result, + num_retries: u32, +) -> Result, WakeComputeError> { + match result { + Err(err) => match &err { + WakeComputeError::ApiError(api) if api.should_retry(num_retries) => { + Ok(ControlFlow::Continue(err)) + } + _ => Err(err), + }, + // Ready to try again. + Ok(new) => Ok(ControlFlow::Break(new)), + } +} + +fn report_error(e: &WakeComputeError, retry: bool) { + use crate::console::errors::ApiError; + let retry = bool_to_str(retry); + let kind = match e { + WakeComputeError::BadComputeAddress(_) => "bad_compute_address", + WakeComputeError::ApiError(ApiError::Transport(_)) => "api_transport_error", + WakeComputeError::ApiError(ApiError::Console { + status: StatusCode::LOCKED, + ref text, + }) if text.contains("written data quota exceeded") + || text.contains("the limit for current plan reached") => + { + "quota_exceeded" + } + WakeComputeError::ApiError(ApiError::Console { + status: StatusCode::UNPROCESSABLE_ENTITY, + ref text, + }) if text.contains("compute time quota of non-primary branches is exceeded") => { + "quota_exceeded" + } + WakeComputeError::ApiError(ApiError::Console { + status: StatusCode::LOCKED, + .. + }) => "api_console_locked", + WakeComputeError::ApiError(ApiError::Console { + status: StatusCode::BAD_REQUEST, + .. + }) => "api_console_bad_request", + WakeComputeError::ApiError(ApiError::Console { status, .. }) + if status.is_server_error() => + { + "api_console_other_server_error" + } + WakeComputeError::ApiError(ApiError::Console { .. 
}) => "api_console_other_error", + WakeComputeError::TimeoutError => "timeout_error", + }; + NUM_WAKEUP_FAILURES.with_label_values(&[retry, kind]).inc(); +} diff --git a/proxy/src/rate_limiter.rs b/proxy/src/rate_limiter.rs index b26386d159..f0da4ead23 100644 --- a/proxy/src/rate_limiter.rs +++ b/proxy/src/rate_limiter.rs @@ -4,4 +4,4 @@ mod limiter; pub use aimd::Aimd; pub use limit_algorithm::{AimdConfig, Fixed, RateLimitAlgorithm, RateLimiterConfig}; pub use limiter::Limiter; -pub use limiter::{EndpointRateLimiter, RateBucketInfo}; +pub use limiter::{EndpointRateLimiter, RateBucketInfo, RedisRateLimiter}; diff --git a/proxy/src/rate_limiter/limiter.rs b/proxy/src/rate_limiter/limiter.rs index cbae72711c..3181060e2f 100644 --- a/proxy/src/rate_limiter/limiter.rs +++ b/proxy/src/rate_limiter/limiter.rs @@ -22,6 +22,44 @@ use super::{ RateLimiterConfig, }; +pub struct RedisRateLimiter { + data: Vec, + info: &'static [RateBucketInfo], +} + +impl RedisRateLimiter { + pub fn new(info: &'static [RateBucketInfo]) -> Self { + Self { + data: vec![ + RateBucket { + start: Instant::now(), + count: 0, + }; + info.len() + ], + info, + } + } + + /// Check that number of connections is below `max_rps` rps. + pub fn check(&mut self) -> bool { + let now = Instant::now(); + + let should_allow_request = self + .data + .iter_mut() + .zip(self.info) + .all(|(bucket, info)| bucket.should_allow_request(info, now)); + + if should_allow_request { + // only increment the bucket counts if the request will actually be accepted + self.data.iter_mut().for_each(RateBucket::inc); + } + + should_allow_request + } +} + // Simple per-endpoint rate limiter. // // Check that number of connections to the endpoint is below `max_rps` rps. 
diff --git a/proxy/src/redis.rs b/proxy/src/redis.rs index c2a91bed97..35d6db074e 100644 --- a/proxy/src/redis.rs +++ b/proxy/src/redis.rs @@ -1 +1,2 @@ pub mod notifications; +pub mod publisher; diff --git a/proxy/src/redis/notifications.rs b/proxy/src/redis/notifications.rs index 9cd70b109b..6ae848c0d2 100644 --- a/proxy/src/redis/notifications.rs +++ b/proxy/src/redis/notifications.rs @@ -1,35 +1,45 @@ use std::{convert::Infallible, sync::Arc}; use futures::StreamExt; +use pq_proto::CancelKeyData; use redis::aio::PubSub; -use serde::Deserialize; +use serde::{Deserialize, Serialize}; +use uuid::Uuid; -use crate::{cache::project_info::ProjectInfoCache, ProjectId, RoleName}; +use crate::{ + cache::project_info::ProjectInfoCache, + cancellation::{CancelMap, CancellationHandler, NotificationsCancellationHandler}, + intern::{ProjectIdInt, RoleNameInt}, + metrics::REDIS_BROKEN_MESSAGES, +}; -const CHANNEL_NAME: &str = "neondb-proxy-ws-updates"; +const CPLANE_CHANNEL_NAME: &str = "neondb-proxy-ws-updates"; +pub(crate) const PROXY_CHANNEL_NAME: &str = "neondb-proxy-to-proxy-updates"; const RECONNECT_TIMEOUT: std::time::Duration = std::time::Duration::from_secs(20); const INVALIDATION_LAG: std::time::Duration = std::time::Duration::from_secs(20); -struct ConsoleRedisClient { +struct RedisConsumerClient { client: redis::Client, } -impl ConsoleRedisClient { +impl RedisConsumerClient { pub fn new(url: &str) -> anyhow::Result { let client = redis::Client::open(url)?; Ok(Self { client }) } async fn try_connect(&self) -> anyhow::Result { let mut conn = self.client.get_async_connection().await?.into_pubsub(); - tracing::info!("subscribing to a channel `{CHANNEL_NAME}`"); - conn.subscribe(CHANNEL_NAME).await?; + tracing::info!("subscribing to a channel `{CPLANE_CHANNEL_NAME}`"); + conn.subscribe(CPLANE_CHANNEL_NAME).await?; + tracing::info!("subscribing to a channel `{PROXY_CHANNEL_NAME}`"); + conn.subscribe(PROXY_CHANNEL_NAME).await?; Ok(conn) } } -#[derive(Clone, Debug, 
Deserialize, Eq, PartialEq)] +#[derive(Clone, Debug, Serialize, Deserialize, Eq, PartialEq)] #[serde(tag = "topic", content = "data")] -enum Notification { +pub(crate) enum Notification { #[serde( rename = "/allowed_ips_updated", deserialize_with = "deserialize_json_string" @@ -42,16 +52,25 @@ enum Notification { deserialize_with = "deserialize_json_string" )] PasswordUpdate { password_update: PasswordUpdate }, + #[serde(rename = "/cancel_session")] + Cancel(CancelSession), } -#[derive(Clone, Debug, Deserialize, Eq, PartialEq)] -struct AllowedIpsUpdate { - project_id: ProjectId, +#[derive(Clone, Debug, Serialize, Deserialize, Eq, PartialEq)] +pub(crate) struct AllowedIpsUpdate { + project_id: ProjectIdInt, } -#[derive(Clone, Debug, Deserialize, Eq, PartialEq)] -struct PasswordUpdate { - project_id: ProjectId, - role_name: RoleName, +#[derive(Clone, Debug, Serialize, Deserialize, Eq, PartialEq)] +pub(crate) struct PasswordUpdate { + project_id: ProjectIdInt, + role_name: RoleNameInt, } +#[derive(Clone, Debug, Serialize, Deserialize, Eq, PartialEq)] +pub(crate) struct CancelSession { + pub region_id: Option, + pub cancel_key_data: CancelKeyData, + pub session_id: Uuid, +} + fn deserialize_json_string<'de, D, T>(deserializer: D) -> Result where T: for<'de2> serde::Deserialize<'de2>, @@ -61,60 +80,128 @@ where serde_json::from_str(&s).map_err(::custom) } +struct MessageHandler< + C: ProjectInfoCache + Send + Sync + 'static, + H: NotificationsCancellationHandler + Send + Sync + 'static, +> { + cache: Arc, + cancellation_handler: Arc, + region_id: String, +} + +impl< + C: ProjectInfoCache + Send + Sync + 'static, + H: NotificationsCancellationHandler + Send + Sync + 'static, + > MessageHandler +{ + pub fn new(cache: Arc, cancellation_handler: Arc, region_id: String) -> Self { + Self { + cache, + cancellation_handler, + region_id, + } + } + pub fn disable_ttl(&self) { + self.cache.disable_ttl(); + } + pub fn enable_ttl(&self) { + self.cache.enable_ttl(); + } + 
#[tracing::instrument(skip(self, msg), fields(session_id = tracing::field::Empty))] + async fn handle_message(&self, msg: redis::Msg) -> anyhow::Result<()> { + use Notification::*; + let payload: String = msg.get_payload()?; + tracing::debug!(?payload, "received a message payload"); + + let msg: Notification = match serde_json::from_str(&payload) { + Ok(msg) => msg, + Err(e) => { + REDIS_BROKEN_MESSAGES + .with_label_values(&[msg.get_channel_name()]) + .inc(); + tracing::error!("broken message: {e}"); + return Ok(()); + } + }; + tracing::debug!(?msg, "received a message"); + match msg { + Cancel(cancel_session) => { + tracing::Span::current().record( + "session_id", + &tracing::field::display(cancel_session.session_id), + ); + if let Some(cancel_region) = cancel_session.region_id { + // If the message is not for this region, ignore it. + if cancel_region != self.region_id { + return Ok(()); + } + } + // This instance of cancellation_handler doesn't have a RedisPublisherClient so it can't publish the message. + match self + .cancellation_handler + .cancel_session_no_publish(cancel_session.cancel_key_data) + .await + { + Ok(()) => {} + Err(e) => { + tracing::error!("failed to cancel session: {e}"); + } + } + } + _ => { + invalidate_cache(self.cache.clone(), msg.clone()); + // It might happen that the invalid entry is on the way to be cached. + // To make sure that the entry is invalidated, let's repeat the invalidation in INVALIDATION_LAG seconds. + // TODO: include the version (or the timestamp) in the message and invalidate only if the entry is cached before the message. 
+ let cache = self.cache.clone(); + tokio::spawn(async move { + tokio::time::sleep(INVALIDATION_LAG).await; + invalidate_cache(cache, msg); + }); + } + } + + Ok(()) + } +} + fn invalidate_cache(cache: Arc, msg: Notification) { use Notification::*; match msg { AllowedIpsUpdate { allowed_ips_update } => { - cache.invalidate_allowed_ips_for_project(&allowed_ips_update.project_id) + cache.invalidate_allowed_ips_for_project(allowed_ips_update.project_id) } PasswordUpdate { password_update } => cache.invalidate_role_secret_for_project( - &password_update.project_id, - &password_update.role_name, + password_update.project_id, + password_update.role_name, ), + Cancel(_) => unreachable!("cancel message should be handled separately"), } } -#[tracing::instrument(skip(cache))] -fn handle_message(msg: redis::Msg, cache: Arc) -> anyhow::Result<()> -where - C: ProjectInfoCache + Send + Sync + 'static, -{ - let payload: String = msg.get_payload()?; - tracing::debug!(?payload, "received a message payload"); - - let msg: Notification = match serde_json::from_str(&payload) { - Ok(msg) => msg, - Err(e) => { - tracing::error!("broken message: {e}"); - return Ok(()); - } - }; - tracing::debug!(?msg, "received a message"); - invalidate_cache(cache.clone(), msg.clone()); - // It might happen that the invalid entry is on the way to be cached. - // To make sure that the entry is invalidated, let's repeat the invalidation in INVALIDATION_LAG seconds. - // TODO: include the version (or the timestamp) in the message and invalidate only if the entry is cached before the message. - tokio::spawn(async move { - tokio::time::sleep(INVALIDATION_LAG).await; - invalidate_cache(cache, msg.clone()); - }); - - Ok(()) -} - /// Handle console's invalidation messages. 
#[tracing::instrument(name = "console_notifications", skip_all)] -pub async fn task_main(url: String, cache: Arc) -> anyhow::Result +pub async fn task_main( + url: String, + cache: Arc, + cancel_map: CancelMap, + region_id: String, +) -> anyhow::Result where C: ProjectInfoCache + Send + Sync + 'static, { cache.enable_ttl(); + let handler = MessageHandler::new( + cache, + Arc::new(CancellationHandler::new(cancel_map, None)), + region_id, + ); loop { - let redis = ConsoleRedisClient::new(&url)?; + let redis = RedisConsumerClient::new(&url)?; let conn = match redis.try_connect().await { Ok(conn) => { - cache.disable_ttl(); + handler.disable_ttl(); conn } Err(e) => { @@ -127,7 +214,7 @@ where }; let mut stream = conn.into_on_message(); while let Some(msg) = stream.next().await { - match handle_message(msg, cache.clone()) { + match handler.handle_message(msg).await { Ok(()) => {} Err(e) => { tracing::error!("failed to handle message: {e}, will try to reconnect"); @@ -135,18 +222,20 @@ where } } } - cache.enable_ttl(); + handler.enable_ttl(); } } #[cfg(test)] mod tests { + use crate::{ProjectId, RoleName}; + use super::*; use serde_json::json; #[test] fn parse_allowed_ips() -> anyhow::Result<()> { - let project_id = "new_project".to_string(); + let project_id: ProjectId = "new_project".into(); let data = format!("{{\"project_id\": \"{project_id}\"}}"); let text = json!({ "type": "message", @@ -161,7 +250,7 @@ mod tests { result, Notification::AllowedIpsUpdate { allowed_ips_update: AllowedIpsUpdate { - project_id: project_id.into() + project_id: (&project_id).into() } } ); @@ -171,8 +260,8 @@ mod tests { #[test] fn parse_password_updated() -> anyhow::Result<()> { - let project_id = "new_project".to_string(); - let role_name = "new_role".to_string(); + let project_id: ProjectId = "new_project".into(); + let role_name: RoleName = "new_role".into(); let data = format!("{{\"project_id\": \"{project_id}\", \"role_name\": \"{role_name}\"}}"); let text = json!({ "type": 
"message", @@ -187,12 +276,39 @@ mod tests { result, Notification::PasswordUpdate { password_update: PasswordUpdate { - project_id: project_id.into(), - role_name: role_name.into() + project_id: (&project_id).into(), + role_name: (&role_name).into(), } } ); + Ok(()) + } + #[test] + fn parse_cancel_session() -> anyhow::Result<()> { + let cancel_key_data = CancelKeyData { + backend_pid: 42, + cancel_key: 41, + }; + let uuid = uuid::Uuid::new_v4(); + let msg = Notification::Cancel(CancelSession { + cancel_key_data, + region_id: None, + session_id: uuid, + }); + let text = serde_json::to_string(&msg)?; + let result: Notification = serde_json::from_str(&text)?; + assert_eq!(msg, result); + + let msg = Notification::Cancel(CancelSession { + cancel_key_data, + region_id: Some("region".to_string()), + session_id: uuid, + }); + let text = serde_json::to_string(&msg)?; + let result: Notification = serde_json::from_str(&text)?; + assert_eq!(msg, result,); + Ok(()) } } diff --git a/proxy/src/redis/publisher.rs b/proxy/src/redis/publisher.rs new file mode 100644 index 0000000000..f85593afdd --- /dev/null +++ b/proxy/src/redis/publisher.rs @@ -0,0 +1,80 @@ +use pq_proto::CancelKeyData; +use redis::AsyncCommands; +use uuid::Uuid; + +use crate::rate_limiter::{RateBucketInfo, RedisRateLimiter}; + +use super::notifications::{CancelSession, Notification, PROXY_CHANNEL_NAME}; + +pub struct RedisPublisherClient { + client: redis::Client, + publisher: Option, + region_id: String, + limiter: RedisRateLimiter, +} + +impl RedisPublisherClient { + pub fn new( + url: &str, + region_id: String, + info: &'static [RateBucketInfo], + ) -> anyhow::Result { + let client = redis::Client::open(url)?; + Ok(Self { + client, + publisher: None, + region_id, + limiter: RedisRateLimiter::new(info), + }) + } + pub async fn try_publish( + &mut self, + cancel_key_data: CancelKeyData, + session_id: Uuid, + ) -> anyhow::Result<()> { + if !self.limiter.check() { + tracing::info!("Rate limit exceeded. 
Skipping cancellation message"); + return Err(anyhow::anyhow!("Rate limit exceeded")); + } + match self.publish(cancel_key_data, session_id).await { + Ok(()) => return Ok(()), + Err(e) => { + tracing::error!("failed to publish a message: {e}"); + self.publisher = None; + } + } + tracing::info!("Publisher is disconnected. Reconnectiong..."); + self.try_connect().await?; + self.publish(cancel_key_data, session_id).await + } + + async fn publish( + &mut self, + cancel_key_data: CancelKeyData, + session_id: Uuid, + ) -> anyhow::Result<()> { + let conn = self + .publisher + .as_mut() + .ok_or_else(|| anyhow::anyhow!("not connected"))?; + let payload = serde_json::to_string(&Notification::Cancel(CancelSession { + region_id: Some(self.region_id.clone()), + cancel_key_data, + session_id, + }))?; + conn.publish(PROXY_CHANNEL_NAME, payload).await?; + Ok(()) + } + pub async fn try_connect(&mut self) -> anyhow::Result<()> { + match self.client.get_async_connection().await { + Ok(conn) => { + self.publisher = Some(conn); + } + Err(e) => { + tracing::error!("failed to connect to redis: {e}"); + return Err(e.into()); + } + } + Ok(()) + } +} diff --git a/proxy/src/sasl.rs b/proxy/src/sasl.rs index da1cf21c6a..1cf8b53e11 100644 --- a/proxy/src/sasl.rs +++ b/proxy/src/sasl.rs @@ -10,7 +10,7 @@ mod channel_binding; mod messages; mod stream; -use crate::error::UserFacingError; +use crate::error::{ReportableError, UserFacingError}; use std::io; use thiserror::Error; @@ -48,6 +48,18 @@ impl UserFacingError for Error { } } +impl ReportableError for Error { + fn get_error_kind(&self) -> crate::error::ErrorKind { + match self { + Error::ChannelBindingFailed(_) => crate::error::ErrorKind::User, + Error::ChannelBindingBadMethod(_) => crate::error::ErrorKind::User, + Error::BadClientMessage(_) => crate::error::ErrorKind::User, + Error::MissingBinding => crate::error::ErrorKind::Service, + Error::Io(_) => crate::error::ErrorKind::ClientDisconnect, + } + } +} + /// A convenient result type for 
SASL exchange. pub type Result = std::result::Result; diff --git a/proxy/src/scram.rs b/proxy/src/scram.rs index 49a7a13043..a95e734d06 100644 --- a/proxy/src/scram.rs +++ b/proxy/src/scram.rs @@ -12,9 +12,6 @@ mod messages; mod secret; mod signature; -#[cfg(any(test, doc))] -mod password; - pub use exchange::{exchange, Exchange}; pub use key::ScramKey; pub use secret::ServerSecret; @@ -59,27 +56,21 @@ fn sha256<'a>(parts: impl IntoIterator) -> [u8; 32] { #[cfg(test)] mod tests { + use postgres_protocol::authentication::sasl::{ChannelBinding, ScramSha256}; + use crate::sasl::{Mechanism, Step}; - use super::{password::SaltedPassword, Exchange, ServerSecret}; + use super::{Exchange, ServerSecret}; #[test] - fn happy_path() { + fn snapshot() { let iterations = 4096; - let salt_base64 = "QSXCR+Q6sek8bf92"; - let pw = SaltedPassword::new( - b"pencil", - base64::decode(salt_base64).unwrap().as_slice(), - iterations, - ); + let salt = "QSXCR+Q6sek8bf92"; + let stored_key = "FO+9jBb3MUukt6jJnzjPZOWc5ow/Pu6JtPyju0aqaE8="; + let server_key = "qxJ1SbmSAi5EcS0J5Ck/cKAm/+Ixa+Kwp63f4OHDgzo="; + let secret = format!("SCRAM-SHA-256${iterations}:{salt}${stored_key}:{server_key}",); + let secret = ServerSecret::parse(&secret).unwrap(); - let secret = ServerSecret { - iterations, - salt_base64: salt_base64.to_owned(), - stored_key: pw.client_key().sha256(), - server_key: pw.server_key(), - doomed: false, - }; const NONCE: [u8; 18] = [ 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, ]; @@ -121,4 +112,33 @@ mod tests { ] ); } + + fn run_round_trip_test(server_password: &str, client_password: &str) { + let scram_secret = ServerSecret::build(server_password).unwrap(); + let sasl_client = + ScramSha256::new(client_password.as_bytes(), ChannelBinding::unsupported()); + + let outcome = super::exchange( + &scram_secret, + sasl_client, + crate::config::TlsServerEndPoint::Undefined, + ) + .unwrap(); + + match outcome { + crate::sasl::Outcome::Success(_) => {} + 
crate::sasl::Outcome::Failure(r) => panic!("{r}"), + } + } + + #[test] + fn round_trip() { + run_round_trip_test("pencil", "pencil") + } + + #[test] + #[should_panic(expected = "password doesn't match")] + fn failure() { + run_round_trip_test("pencil", "eraser") + } } diff --git a/proxy/src/scram/key.rs b/proxy/src/scram/key.rs index 66c2c6b207..973126e729 100644 --- a/proxy/src/scram/key.rs +++ b/proxy/src/scram/key.rs @@ -3,7 +3,7 @@ /// Faithfully taken from PostgreSQL. pub const SCRAM_KEY_LEN: usize = 32; -/// One of the keys derived from the [password](super::password::SaltedPassword). +/// One of the keys derived from the user's password. /// We use the same structure for all keys, i.e. /// `ClientKey`, `StoredKey`, and `ServerKey`. #[derive(Clone, Default, PartialEq, Eq, Debug)] diff --git a/proxy/src/scram/password.rs b/proxy/src/scram/password.rs deleted file mode 100644 index 022f2842dd..0000000000 --- a/proxy/src/scram/password.rs +++ /dev/null @@ -1,74 +0,0 @@ -//! Password hashing routines. - -use super::key::ScramKey; - -pub const SALTED_PASSWORD_LEN: usize = 32; - -/// Salted hashed password is essential for [key](super::key) derivation. -#[repr(transparent)] -pub struct SaltedPassword { - bytes: [u8; SALTED_PASSWORD_LEN], -} - -impl SaltedPassword { - /// See `scram-common.c : scram_SaltedPassword` for details. - /// Further reading: (see `PBKDF2`). - pub fn new(password: &[u8], salt: &[u8], iterations: u32) -> SaltedPassword { - pbkdf2::pbkdf2_hmac_array::(password, salt, iterations).into() - } - - /// Derive `ClientKey` from a salted hashed password. - pub fn client_key(&self) -> ScramKey { - super::hmac_sha256(&self.bytes, [b"Client Key".as_ref()]).into() - } - - /// Derive `ServerKey` from a salted hashed password. 
- pub fn server_key(&self) -> ScramKey { - super::hmac_sha256(&self.bytes, [b"Server Key".as_ref()]).into() - } -} - -impl From<[u8; SALTED_PASSWORD_LEN]> for SaltedPassword { - #[inline(always)] - fn from(bytes: [u8; SALTED_PASSWORD_LEN]) -> Self { - Self { bytes } - } -} - -#[cfg(test)] -mod tests { - use super::SaltedPassword; - - fn legacy_pbkdf2_impl(password: &[u8], salt: &[u8], iterations: u32) -> SaltedPassword { - let one = 1_u32.to_be_bytes(); // magic - - let mut current = super::super::hmac_sha256(password, [salt, &one]); - let mut result = current; - for _ in 1..iterations { - current = super::super::hmac_sha256(password, [current.as_ref()]); - // TODO: result = current.zip(result).map(|(x, y)| x ^ y), issue #80094 - for (i, x) in current.iter().enumerate() { - result[i] ^= x; - } - } - - result.into() - } - - #[test] - fn pbkdf2() { - let password = "a-very-secure-password"; - let salt = "such-a-random-salt"; - let iterations = 4096; - let output = [ - 203, 18, 206, 81, 4, 154, 193, 100, 147, 41, 211, 217, 177, 203, 69, 210, 194, 211, - 101, 1, 248, 156, 96, 0, 8, 223, 30, 87, 158, 41, 20, 42, - ]; - - let actual = SaltedPassword::new(password.as_bytes(), salt.as_bytes(), iterations); - let expected = legacy_pbkdf2_impl(password.as_bytes(), salt.as_bytes(), iterations); - - assert_eq!(actual.bytes, output); - assert_eq!(actual.bytes, expected.bytes); - } -} diff --git a/proxy/src/scram/secret.rs b/proxy/src/scram/secret.rs index 041548014a..fb3c45816e 100644 --- a/proxy/src/scram/secret.rs +++ b/proxy/src/scram/secret.rs @@ -3,7 +3,7 @@ use super::base64_decode_array; use super::key::ScramKey; -/// Server secret is produced from [password](super::password::SaltedPassword) +/// Server secret is produced from user's password, /// and is used throughout the authentication process. #[derive(Clone, Eq, PartialEq, Debug)] pub struct ServerSecret { @@ -59,21 +59,10 @@ impl ServerSecret { /// Build a new server secret from the prerequisites. 
/// XXX: We only use this function in tests. #[cfg(test)] - pub fn build(password: &str, salt: &[u8], iterations: u32) -> Option { - // TODO: implement proper password normalization required by the RFC - if !password.is_ascii() { - return None; - } - - let password = super::password::SaltedPassword::new(password.as_bytes(), salt, iterations); - - Some(Self { - iterations, - salt_base64: base64::encode(salt), - stored_key: password.client_key().sha256(), - server_key: password.server_key(), - doomed: false, - }) + pub fn build(password: &str) -> Option { + Self::parse(&postgres_protocol::password::scram_sha_256( + password.as_bytes(), + )) } } @@ -103,20 +92,4 @@ mod tests { assert_eq!(base64::encode(parsed.stored_key), stored_key); assert_eq!(base64::encode(parsed.server_key), server_key); } - - #[test] - fn build_scram_secret() { - let salt = b"salt"; - let secret = ServerSecret::build("password", salt, 4096).unwrap(); - assert_eq!(secret.iterations, 4096); - assert_eq!(secret.salt_base64, base64::encode(salt)); - assert_eq!( - base64::encode(secret.stored_key.as_ref()), - "lF4cRm/Jky763CN4HtxdHnjV4Q8AWTNlKvGmEFFU8IQ=" - ); - assert_eq!( - base64::encode(secret.server_key.as_ref()), - "ub8OgRsftnk2ccDMOt7ffHXNcikRkQkq1lh4xaAqrSw=" - ); - } } diff --git a/proxy/src/serverless.rs b/proxy/src/serverless.rs index 8af008394a..68f68eaba1 100644 --- a/proxy/src/serverless.rs +++ b/proxy/src/serverless.rs @@ -2,8 +2,11 @@ //! //! Handles both SQL over HTTP and SQL over Websockets. 
+mod backend; mod conn_pool; +mod json; mod sql_over_http; +pub mod tls_listener; mod websocket; pub use conn_pool::GlobalConnPoolOptions; @@ -17,46 +20,46 @@ pub use reqwest_middleware::{ClientWithMiddleware, Error}; pub use reqwest_retry::{policies::ExponentialBackoff, RetryTransientMiddleware}; use tokio_util::task::TaskTracker; -use crate::config::TlsConfig; use crate::context::RequestMonitoring; -use crate::metrics::NUM_CLIENT_CONNECTION_GAUGE; -use crate::protocol2::{ProxyProtocolAccept, WithClientIp}; +use crate::protocol2::{ProxyProtocolAccept, WithClientIp, WithConnectionGuard}; use crate::rate_limiter::EndpointRateLimiter; -use crate::{cancellation::CancelMap, config::ProxyConfig}; -use futures::StreamExt; +use crate::serverless::backend::PoolingBackend; +use crate::{cancellation::CancellationHandler, config::ProxyConfig}; use hyper::{ - server::{ - accept, - conn::{AddrIncoming, AddrStream}, - }, + server::conn::{AddrIncoming, AddrStream}, Body, Method, Request, Response, }; +use std::convert::Infallible; use std::net::IpAddr; +use std::sync::Arc; use std::task::Poll; -use std::{future::ready, sync::Arc}; use tls_listener::TlsListener; use tokio::net::TcpListener; use tokio_util::sync::CancellationToken; -use tracing::{error, info, info_span, warn, Instrument}; +use tracing::{error, info, warn, Instrument}; use utils::http::{error::ApiError, json::json_response}; +pub const SERVERLESS_DRIVER_SNI: &str = "api"; + pub async fn task_main( config: &'static ProxyConfig, ws_listener: TcpListener, cancellation_token: CancellationToken, endpoint_rate_limiter: Arc, + cancellation_handler: Arc, ) -> anyhow::Result<()> { scopeguard::defer! 
{ info!("websocket server has shut down"); } - let conn_pool = conn_pool::GlobalConnPool::new(config); - - let conn_pool2 = Arc::clone(&conn_pool); - tokio::spawn(async move { - conn_pool2.gc_worker(StdRng::from_entropy()).await; - }); + let conn_pool = conn_pool::GlobalConnPool::new(&config.http_config); + { + let conn_pool = Arc::clone(&conn_pool); + tokio::spawn(async move { + conn_pool.gc_worker(StdRng::from_entropy()).await; + }); + } // shutdown the connection pool tokio::spawn({ @@ -70,6 +73,11 @@ pub async fn task_main( } }); + let backend = Arc::new(PoolingBackend { + pool: Arc::clone(&conn_pool), + config, + }); + let tls_config = match config.tls_config.as_ref() { Some(config) => config, None => { @@ -77,79 +85,84 @@ pub async fn task_main( return Ok(()); } }; - let tls_acceptor: tokio_rustls::TlsAcceptor = tls_config.to_server_config().into(); + let mut tls_server_config = rustls::ServerConfig::clone(&tls_config.to_server_config()); + // prefer http2, but support http/1.1 + tls_server_config.alpn_protocols = vec![b"h2".to_vec(), b"http/1.1".to_vec()]; + let tls_acceptor: tokio_rustls::TlsAcceptor = Arc::new(tls_server_config).into(); let mut addr_incoming = AddrIncoming::from_listener(ws_listener)?; let _ = addr_incoming.set_nodelay(true); let addr_incoming = ProxyProtocolAccept { incoming: addr_incoming, + protocol: "http", }; let ws_connections = tokio_util::task::task_tracker::TaskTracker::new(); ws_connections.close(); // allows `ws_connections.wait to complete` - let tls_listener = TlsListener::new(tls_acceptor, addr_incoming).filter(|conn| { - if let Err(err) = conn { - error!("failed to accept TLS connection for websockets: {err:?}"); - ready(false) - } else { - ready(true) - } - }); + let tls_listener = TlsListener::new( + tls_acceptor, + addr_incoming, + "http", + config.handshake_timeout, + ); let make_svc = hyper::service::make_service_fn( - |stream: &tokio_rustls::server::TlsStream>| { - let (io, tls) = stream.get_ref(); - let client_addr = 
io.client_addr(); - let remote_addr = io.inner.remote_addr(); - let sni_name = tls.server_name().map(|s| s.to_string()); - let conn_pool = conn_pool.clone(); + |stream: &tokio_rustls::server::TlsStream< + WithConnectionGuard>, + >| { + let (conn, _) = stream.get_ref(); + + // this is jank. should dissapear with hyper 1.0 migration. + let gauge = conn + .gauge + .lock() + .expect("lock should not be poisoned") + .take() + .expect("gauge should be set on connection start"); + + let client_addr = conn.inner.client_addr(); + let remote_addr = conn.inner.inner.remote_addr(); + let backend = backend.clone(); let ws_connections = ws_connections.clone(); let endpoint_rate_limiter = endpoint_rate_limiter.clone(); - + let cancellation_handler = cancellation_handler.clone(); async move { let peer_addr = match client_addr { Some(addr) => addr, None if config.require_client_ip => bail!("missing required client ip"), None => remote_addr, }; - Ok(MetricService::new(hyper::service::service_fn( - move |req: Request| { - let sni_name = sni_name.clone(); - let conn_pool = conn_pool.clone(); + Ok(MetricService::new( + hyper::service::service_fn(move |req: Request| { + let backend = backend.clone(); let ws_connections = ws_connections.clone(); let endpoint_rate_limiter = endpoint_rate_limiter.clone(); + let cancellation_handler = cancellation_handler.clone(); async move { - let cancel_map = Arc::new(CancelMap::default()); - let session_id = uuid::Uuid::new_v4(); - - request_handler( - req, - config, - tls_config, - conn_pool, - ws_connections, - cancel_map, - session_id, - sni_name, - peer_addr.ip(), - endpoint_rate_limiter, + Ok::<_, Infallible>( + request_handler( + req, + config, + backend, + ws_connections, + cancellation_handler, + peer_addr.ip(), + endpoint_rate_limiter, + ) + .await + .map_or_else(|e| e.into_response(), |r| r), ) - .instrument(info_span!( - "serverless", - session = %session_id, - %peer_addr, - )) - .await } - }, - ))) + }), + gauge, + )) } }, ); - 
hyper::Server::builder(accept::from_stream(tls_listener)) + hyper::Server::builder(tls_listener) .serve(make_svc) .with_graceful_shutdown(cancellation_token.cancelled()) .await?; @@ -166,13 +179,8 @@ struct MetricService { } impl MetricService { - fn new(inner: S) -> MetricService { - MetricService { - inner, - _gauge: NUM_CLIENT_CONNECTION_GAUGE - .with_label_values(&["http"]) - .guard(), - } + fn new(inner: S, _gauge: IntCounterPairGuard) -> MetricService { + MetricService { inner, _gauge } } } @@ -197,15 +205,14 @@ where async fn request_handler( mut request: Request, config: &'static ProxyConfig, - tls: &'static TlsConfig, - conn_pool: Arc, + backend: Arc, ws_connections: TaskTracker, - cancel_map: Arc, - session_id: uuid::Uuid, - sni_hostname: Option, + cancellation_handler: Arc, peer_addr: IpAddr, endpoint_rate_limiter: Arc, ) -> Result, ApiError> { + let session_id = uuid::Uuid::new_v4(); + let host = request .headers() .get("host") @@ -215,45 +222,40 @@ async fn request_handler( // Check if the request is a websocket upgrade request. if hyper_tungstenite::is_upgrade_request(&request) { - info!(session_id = ?session_id, "performing websocket upgrade"); + let ctx = RequestMonitoring::new(session_id, peer_addr, "ws", &config.region); + let span = ctx.span.clone(); + info!(parent: &span, "performing websocket upgrade"); let (response, websocket) = hyper_tungstenite::upgrade(&mut request, None) .map_err(|e| ApiError::BadRequest(e.into()))?; ws_connections.spawn( async move { - let mut ctx = RequestMonitoring::new(session_id, peer_addr, "ws", &config.region); - if let Err(e) = websocket::serve_websocket( config, - &mut ctx, + ctx, websocket, - &cancel_map, + cancellation_handler, host, endpoint_rate_limiter, ) .await { - error!(session_id = ?session_id, "error in websocket connection: {e:#}"); + error!("error in websocket connection: {e:#}"); } } - .in_current_span(), + .instrument(span), ); // Return the response so the spawned future can continue. 
Ok(response) } else if request.uri().path() == "/sql" && request.method() == Method::POST { - let mut ctx = RequestMonitoring::new(session_id, peer_addr, "http", &config.region); + let ctx = RequestMonitoring::new(session_id, peer_addr, "http", &config.region); + let span = ctx.span.clone(); - sql_over_http::handle( - tls, - &config.http_config, - &mut ctx, - request, - sni_hostname, - conn_pool, - ) - .await + sql_over_http::handle(config, ctx, request, backend) + .instrument(span) + .await } else if request.uri().path() == "/sql" && request.method() == Method::OPTIONS { Response::builder() .header("Allow", "OPTIONS, POST") diff --git a/proxy/src/serverless/backend.rs b/proxy/src/serverless/backend.rs new file mode 100644 index 0000000000..9b3ca8d447 --- /dev/null +++ b/proxy/src/serverless/backend.rs @@ -0,0 +1,187 @@ +use std::{sync::Arc, time::Duration}; + +use async_trait::async_trait; +use tracing::{field::display, info}; + +use crate::{ + auth::{backend::ComputeCredentials, check_peer_addr_is_in_list, AuthError}, + compute, + config::ProxyConfig, + console::{ + errors::{GetAuthInfoError, WakeComputeError}, + messages::ColdStartInfo, + CachedNodeInfo, + }, + context::RequestMonitoring, + error::{ErrorKind, ReportableError, UserFacingError}, + proxy::connect_compute::ConnectMechanism, +}; + +use super::conn_pool::{poll_client, Client, ConnInfo, GlobalConnPool}; + +pub struct PoolingBackend { + pub pool: Arc>, + pub config: &'static ProxyConfig, +} + +impl PoolingBackend { + pub async fn authenticate( + &self, + ctx: &mut RequestMonitoring, + conn_info: &ConnInfo, + ) -> Result { + let user_info = conn_info.user_info.clone(); + let backend = self.config.auth_backend.as_ref().map(|_| user_info.clone()); + let (allowed_ips, maybe_secret) = backend.get_allowed_ips_and_secret(ctx).await?; + if !check_peer_addr_is_in_list(&ctx.peer_addr, &allowed_ips) { + return Err(AuthError::ip_address_not_allowed(ctx.peer_addr)); + } + let cached_secret = match maybe_secret { + 
Some(secret) => secret, + None => backend.get_role_secret(ctx).await?, + }; + + let secret = match cached_secret.value.clone() { + Some(secret) => secret, + None => { + // If we don't have an authentication secret, for the http flow we can just return an error. + info!("authentication info not found"); + return Err(AuthError::auth_failed(&*user_info.user)); + } + }; + let auth_outcome = + crate::auth::validate_password_and_exchange(&conn_info.password, secret)?; + let res = match auth_outcome { + crate::sasl::Outcome::Success(key) => Ok(key), + crate::sasl::Outcome::Failure(reason) => { + info!("auth backend failed with an error: {reason}"); + Err(AuthError::auth_failed(&*conn_info.user_info.user)) + } + }; + res.map(|key| ComputeCredentials { + info: user_info, + keys: key, + }) + } + + // Wake up the destination if needed. Code here is a bit involved because + // we reuse the code from the usual proxy and we need to prepare few structures + // that this code expects. + #[tracing::instrument(fields(pid = tracing::field::Empty), skip_all)] + pub async fn connect_to_compute( + &self, + ctx: &mut RequestMonitoring, + conn_info: ConnInfo, + keys: ComputeCredentials, + force_new: bool, + ) -> Result, HttpConnError> { + let maybe_client = if !force_new { + info!("pool: looking for an existing connection"); + self.pool.get(ctx, &conn_info).await? 
+ } else { + info!("pool: pool is disabled"); + None + }; + + if let Some(client) = maybe_client { + ctx.set_cold_start_info(ColdStartInfo::Warm); + return Ok(client); + } + let conn_id = uuid::Uuid::new_v4(); + tracing::Span::current().record("conn_id", display(conn_id)); + info!(%conn_id, "pool: opening a new connection '{conn_info}'"); + let backend = self.config.auth_backend.as_ref().map(|_| keys); + crate::proxy::connect_compute::connect_to_compute( + ctx, + &TokioMechanism { + conn_id, + conn_info, + pool: self.pool.clone(), + }, + &backend, + false, // do not allow self signed compute for http flow + ) + .await + } +} + +#[derive(Debug, thiserror::Error)] +pub enum HttpConnError { + #[error("pooled connection closed at inconsistent state")] + ConnectionClosedAbruptly(#[from] tokio::sync::watch::error::SendError), + #[error("could not connection to compute")] + ConnectionError(#[from] tokio_postgres::Error), + + #[error("could not get auth info")] + GetAuthInfo(#[from] GetAuthInfoError), + #[error("user not authenticated")] + AuthError(#[from] AuthError), + #[error("wake_compute returned error")] + WakeCompute(#[from] WakeComputeError), +} + +impl ReportableError for HttpConnError { + fn get_error_kind(&self) -> ErrorKind { + match self { + HttpConnError::ConnectionClosedAbruptly(_) => ErrorKind::Compute, + HttpConnError::ConnectionError(p) => p.get_error_kind(), + HttpConnError::GetAuthInfo(a) => a.get_error_kind(), + HttpConnError::AuthError(a) => a.get_error_kind(), + HttpConnError::WakeCompute(w) => w.get_error_kind(), + } + } +} + +impl UserFacingError for HttpConnError { + fn to_string_client(&self) -> String { + match self { + HttpConnError::ConnectionClosedAbruptly(_) => self.to_string(), + HttpConnError::ConnectionError(p) => p.to_string(), + HttpConnError::GetAuthInfo(c) => c.to_string_client(), + HttpConnError::AuthError(c) => c.to_string_client(), + HttpConnError::WakeCompute(c) => c.to_string_client(), + } + } +} + +struct TokioMechanism { + 
pool: Arc>, + conn_info: ConnInfo, + conn_id: uuid::Uuid, +} + +#[async_trait] +impl ConnectMechanism for TokioMechanism { + type Connection = Client; + type ConnectError = tokio_postgres::Error; + type Error = HttpConnError; + + async fn connect_once( + &self, + ctx: &mut RequestMonitoring, + node_info: &CachedNodeInfo, + timeout: Duration, + ) -> Result { + let mut config = (*node_info.config).clone(); + let config = config + .user(&self.conn_info.user_info.user) + .password(&*self.conn_info.password) + .dbname(&self.conn_info.dbname) + .connect_timeout(timeout); + + let (client, connection) = config.connect(tokio_postgres::NoTls).await?; + + tracing::Span::current().record("pid", &tracing::field::display(client.get_process_id())); + Ok(poll_client( + self.pool.clone(), + ctx, + self.conn_info.clone(), + client, + connection, + self.conn_id, + node_info.aux.clone(), + )) + } + + fn update_connect_config(&self, _config: &mut compute::ConnCfg) {} +} diff --git a/proxy/src/serverless/conn_pool.rs b/proxy/src/serverless/conn_pool.rs index 5a7279ae63..c7e8eaef76 100644 --- a/proxy/src/serverless/conn_pool.rs +++ b/proxy/src/serverless/conn_pool.rs @@ -1,17 +1,9 @@ -use anyhow::Context; -use async_trait::async_trait; use dashmap::DashMap; use futures::{future::poll_fn, Future}; -use metrics::{register_int_counter_pair, IntCounterPair, IntCounterPairGuard}; -use once_cell::sync::Lazy; +use metrics::IntCounterPairGuard; use parking_lot::RwLock; -use pbkdf2::{ - password_hash::{PasswordHashString, PasswordHasher, PasswordVerifier, SaltString}, - Params, Pbkdf2, -}; -use prometheus::{exponential_buckets, register_histogram, Histogram}; use rand::Rng; -use smol_str::SmolStr; +use smallvec::SmallVec; use std::{collections::HashMap, pin::pin, sync::Arc, sync::Weak, time::Duration}; use std::{ fmt, @@ -21,30 +13,28 @@ use std::{ ops::Deref, sync::atomic::{self, AtomicUsize}, }; -use tokio::time::{self, Instant}; -use tokio_postgres::{AsyncMessage, ReadyForQueryStatus}; +use 
tokio::time::Instant; +use tokio_postgres::tls::NoTlsStream; +use tokio_postgres::{AsyncMessage, ReadyForQueryStatus, Socket}; +use crate::console::messages::MetricsAuxInfo; +use crate::metrics::{ENDPOINT_POOLS, GC_LATENCY, NUM_OPEN_CLIENTS_IN_HTTP_POOL}; +use crate::usage_metrics::{Ids, MetricCounter, USAGE_METRICS}; use crate::{ - auth::{self, backend::ComputeUserInfo, check_peer_addr_is_in_list}, - console::{self, messages::MetricsAuxInfo}, - context::RequestMonitoring, - metrics::NUM_DB_CONNECTIONS_GAUGE, - proxy::connect_compute::ConnectMechanism, - usage_metrics::{Ids, MetricCounter, USAGE_METRICS}, + auth::backend::ComputeUserInfo, context::RequestMonitoring, metrics::NUM_DB_CONNECTIONS_GAUGE, DbName, EndpointCacheKey, RoleName, }; -use crate::{compute, config}; use tracing::{debug, error, warn, Span}; use tracing::{info, info_span, Instrument}; -pub const APP_NAME: SmolStr = SmolStr::new_inline("/sql_over_http"); +use super::backend::HttpConnError; #[derive(Debug, Clone)] pub struct ConnInfo { pub user_info: ComputeUserInfo, pub dbname: DbName, - pub password: SmolStr, + pub password: SmallVec<[u8; 16]>, } impl ConnInfo { @@ -53,8 +43,13 @@ impl ConnInfo { (self.dbname.clone(), self.user_info.user.clone()) } - pub fn endpoint_cache_key(&self) -> EndpointCacheKey { - self.user_info.endpoint_cache_key() + pub fn endpoint_cache_key(&self) -> Option { + // We don't want to cache http connections for ephemeral endpoints. + if self.user_info.options.is_ephemeral() { + None + } else { + Some(self.user_info.endpoint_cache_key()) + } } } @@ -72,39 +67,51 @@ impl fmt::Display for ConnInfo { } } -struct ConnPoolEntry { - conn: ClientInner, +struct ConnPoolEntry { + conn: ClientInner, _last_access: std::time::Instant, } // Per-endpoint connection pool, (dbname, username) -> DbUserConnPool // Number of open connections is limited by the `max_conns_per_endpoint`. 
-pub struct EndpointConnPool { - pools: HashMap<(DbName, RoleName), DbUserConnPool>, +pub struct EndpointConnPool { + pools: HashMap<(DbName, RoleName), DbUserConnPool>, total_conns: usize, max_conns: usize, _guard: IntCounterPairGuard, + global_connections_count: Arc, + global_pool_size_max_conns: usize, } -impl EndpointConnPool { - fn get_conn_entry(&mut self, db_user: (DbName, RoleName)) -> Option { +impl EndpointConnPool { + fn get_conn_entry(&mut self, db_user: (DbName, RoleName)) -> Option> { let Self { - pools, total_conns, .. + pools, + total_conns, + global_connections_count, + .. } = self; - pools - .get_mut(&db_user) - .and_then(|pool_entries| pool_entries.get_conn_entry(total_conns)) + pools.get_mut(&db_user).and_then(|pool_entries| { + pool_entries.get_conn_entry(total_conns, global_connections_count.clone()) + }) } fn remove_client(&mut self, db_user: (DbName, RoleName), conn_id: uuid::Uuid) -> bool { let Self { - pools, total_conns, .. + pools, + total_conns, + global_connections_count, + .. 
} = self; if let Some(pool) = pools.get_mut(&db_user) { let old_len = pool.conns.len(); pool.conns.retain(|conn| conn.conn.conn_id != conn_id); let new_len = pool.conns.len(); let removed = old_len - new_len; + if removed > 0 { + global_connections_count.fetch_sub(removed, atomic::Ordering::Relaxed); + NUM_OPEN_CLIENTS_IN_HTTP_POOL.sub(removed as i64); + } *total_conns -= removed; removed > 0 } else { @@ -112,12 +119,22 @@ impl EndpointConnPool { } } - fn put(pool: &RwLock, conn_info: &ConnInfo, client: ClientInner) -> anyhow::Result<()> { + fn put(pool: &RwLock, conn_info: &ConnInfo, client: ClientInner) { let conn_id = client.conn_id; - if client.inner.is_closed() { + if client.is_closed() { info!(%conn_id, "pool: throwing away connection '{conn_info}' because connection is closed"); - return Ok(()); + return; + } + let global_max_conn = pool.read().global_pool_size_max_conns; + if pool + .read() + .global_connections_count + .load(atomic::Ordering::Relaxed) + >= global_max_conn + { + info!(%conn_id, "pool: throwing away connection '{conn_info}' because pool is full"); + return; } // return connection to the pool @@ -127,18 +144,19 @@ impl EndpointConnPool { let mut pool = pool.write(); if pool.total_conns < pool.max_conns { - // we create this db-user entry in get, so it should not be None - if let Some(pool_entries) = pool.pools.get_mut(&conn_info.db_and_user()) { - pool_entries.conns.push(ConnPoolEntry { - conn: client, - _last_access: std::time::Instant::now(), - }); + let pool_entries = pool.pools.entry(conn_info.db_and_user()).or_default(); + pool_entries.conns.push(ConnPoolEntry { + conn: client, + _last_access: std::time::Instant::now(), + }); - returned = true; - per_db_size = pool_entries.conns.len(); + returned = true; + per_db_size = pool_entries.conns.len(); - pool.total_conns += 1; - } + pool.total_conns += 1; + pool.global_connections_count + .fetch_add(1, atomic::Ordering::Relaxed); + NUM_OPEN_CLIENTS_IN_HTTP_POOL.inc(); } pool.total_conns @@ 
-150,54 +168,64 @@ impl EndpointConnPool { } else { info!(%conn_id, "pool: throwing away connection '{conn_info}' because pool is full, total_conns={total_conns}"); } - - Ok(()) } } -/// 4096 is the number of rounds that SCRAM-SHA-256 recommends. -/// It's not the 600,000 that OWASP recommends... but our passwords are high entropy anyway. -/// -/// Still takes 1.4ms to hash on my hardware. -/// We don't want to ruin the latency improvements of using the pool by making password verification take too long -const PARAMS: Params = Params { - rounds: 4096, - output_length: 32, -}; - -#[derive(Default)] -pub struct DbUserConnPool { - conns: Vec, - password_hash: Option, +impl Drop for EndpointConnPool { + fn drop(&mut self) { + if self.total_conns > 0 { + self.global_connections_count + .fetch_sub(self.total_conns, atomic::Ordering::Relaxed); + NUM_OPEN_CLIENTS_IN_HTTP_POOL.sub(self.total_conns as i64); + } + } } -impl DbUserConnPool { - fn clear_closed_clients(&mut self, conns: &mut usize) { +pub struct DbUserConnPool { + conns: Vec>, +} + +impl Default for DbUserConnPool { + fn default() -> Self { + Self { conns: Vec::new() } + } +} + +impl DbUserConnPool { + fn clear_closed_clients(&mut self, conns: &mut usize) -> usize { let old_len = self.conns.len(); - self.conns.retain(|conn| !conn.conn.inner.is_closed()); + self.conns.retain(|conn| !conn.conn.is_closed()); let new_len = self.conns.len(); let removed = old_len - new_len; *conns -= removed; + removed } - fn get_conn_entry(&mut self, conns: &mut usize) -> Option { - self.clear_closed_clients(conns); + fn get_conn_entry( + &mut self, + conns: &mut usize, + global_connections_count: Arc, + ) -> Option> { + let mut removed = self.clear_closed_clients(conns); let conn = self.conns.pop(); if conn.is_some() { *conns -= 1; + removed += 1; } + global_connections_count.fetch_sub(removed, atomic::Ordering::Relaxed); + NUM_OPEN_CLIENTS_IN_HTTP_POOL.sub(removed as i64); conn } } -pub struct GlobalConnPool { +pub struct 
GlobalConnPool { // endpoint -> per-endpoint connection pool // // That should be a fairly conteded map, so return reference to the per-endpoint // pool as early as possible and release the lock. - global_pool: DashMap>>, + global_pool: DashMap>>>, /// Number of endpoint-connection pools /// @@ -206,7 +234,10 @@ pub struct GlobalConnPool { /// It's only used for diagnostics. global_pool_size: AtomicUsize, - proxy_config: &'static crate::config::ProxyConfig, + /// Total number of connections in the pool + global_connections_count: Arc, + + config: &'static crate::config::HttpConfig, } #[derive(Debug, Clone, Copy)] @@ -224,45 +255,39 @@ pub struct GlobalConnPoolOptions { pub idle_timeout: Duration, pub opt_in: bool, + + // Total number of connections in the pool. + pub max_total_conns: usize, } -pub static GC_LATENCY: Lazy = Lazy::new(|| { - register_histogram!( - "proxy_http_pool_reclaimation_lag_seconds", - "Time it takes to reclaim unused connection pools", - // 1us -> 65ms - exponential_buckets(1e-6, 2.0, 16).unwrap(), - ) - .unwrap() -}); - -pub static ENDPOINT_POOLS: Lazy = Lazy::new(|| { - register_int_counter_pair!( - "proxy_http_pool_endpoints_registered_total", - "Number of endpoints we have registered pools for", - "proxy_http_pool_endpoints_unregistered_total", - "Number of endpoints we have unregistered pools for", - ) - .unwrap() -}); - -impl GlobalConnPool { - pub fn new(config: &'static crate::config::ProxyConfig) -> Arc { - let shards = config.http_config.pool_options.pool_shards; +impl GlobalConnPool { + pub fn new(config: &'static crate::config::HttpConfig) -> Arc { + let shards = config.pool_options.pool_shards; Arc::new(Self { global_pool: DashMap::with_shard_amount(shards), global_pool_size: AtomicUsize::new(0), - proxy_config: config, + config, + global_connections_count: Arc::new(AtomicUsize::new(0)), }) } + #[cfg(test)] + pub fn get_global_connections_count(&self) -> usize { + self.global_connections_count + .load(atomic::Ordering::Relaxed) + 
} + + pub fn get_idle_timeout(&self) -> Duration { + self.config.pool_options.idle_timeout + } + pub fn shutdown(&self) { // drops all strong references to endpoint-pools self.global_pool.clear(); } pub async fn gc_worker(&self, mut rng: impl Rng) { - let epoch = self.proxy_config.http_config.pool_options.gc_epoch; + let epoch = self.config.pool_options.gc_epoch; let mut interval = tokio::time::interval(epoch / (self.global_pool.shards().len()) as u32); loop { interval.tick().await; @@ -280,6 +305,7 @@ impl GlobalConnPool { let timer = GC_LATENCY.start_timer(); let current_len = shard.len(); + let mut clients_removed = 0; shard.retain(|endpoint, x| { // if the current endpoint pool is unique (no other strong or weak references) // then it is currently not in use by any connections. @@ -289,9 +315,9 @@ impl GlobalConnPool { } = pool.get_mut(); // ensure that closed clients are removed - pools - .iter_mut() - .for_each(|(_, db_pool)| db_pool.clear_closed_clients(total_conns)); + pools.iter_mut().for_each(|(_, db_pool)| { + clients_removed += db_pool.clear_closed_clients(total_conns); + }); // we only remove this pool if it has no active connections if *total_conns == 0 { @@ -302,10 +328,20 @@ impl GlobalConnPool { true }); + let new_len = shard.len(); drop(shard); timer.observe_duration(); + // Do logging outside of the lock. + if clients_removed > 0 { + let size = self + .global_connections_count + .fetch_sub(clients_removed, atomic::Ordering::Relaxed) + - clients_removed; + NUM_OPEN_CLIENTS_IN_HTTP_POOL.sub(clients_removed as i64); + info!("pool: performed global pool gc. 
removed {clients_removed} clients, total number of clients in pool is {size}"); + } let removed = current_len - new_len; if removed > 0 { @@ -320,131 +356,47 @@ impl GlobalConnPool { pub async fn get( self: &Arc, ctx: &mut RequestMonitoring, - conn_info: ConnInfo, - force_new: bool, - ) -> anyhow::Result { - let mut client: Option = None; + conn_info: &ConnInfo, + ) -> Result>, HttpConnError> { + let mut client: Option> = None; + let Some(endpoint) = conn_info.endpoint_cache_key() else { + return Ok(None); + }; - let mut hash_valid = false; - let mut endpoint_pool = Weak::new(); - if !force_new { - let pool = self.get_or_create_endpoint_pool(&conn_info.endpoint_cache_key()); - endpoint_pool = Arc::downgrade(&pool); - let mut hash = None; - - // find a pool entry by (dbname, username) if exists - { - let pool = pool.read(); - if let Some(pool_entries) = pool.pools.get(&conn_info.db_and_user()) { - if !pool_entries.conns.is_empty() { - hash = pool_entries.password_hash.clone(); - } - } - } - - // a connection exists in the pool, verify the password hash - if let Some(hash) = hash { - let pw = conn_info.password.clone(); - let validate = tokio::task::spawn_blocking(move || { - Pbkdf2.verify_password(pw.as_bytes(), &hash.password_hash()) - }) - .await?; - - // if the hash is invalid, don't error - // we will continue with the regular connection flow - if validate.is_ok() { - hash_valid = true; - if let Some(entry) = pool.write().get_conn_entry(conn_info.db_and_user()) { - client = Some(entry.conn) - } - } - } + let endpoint_pool = self.get_or_create_endpoint_pool(&endpoint); + if let Some(entry) = endpoint_pool + .write() + .get_conn_entry(conn_info.db_and_user()) + { + client = Some(entry.conn) } + let endpoint_pool = Arc::downgrade(&endpoint_pool); // ok return cached connection if found and establish a new one otherwise - let new_client = if let Some(client) = client { - ctx.set_project(client.aux.clone()); - if client.inner.is_closed() { - let conn_id = 
uuid::Uuid::new_v4(); - info!(%conn_id, "pool: cached connection '{conn_info}' is closed, opening a new one"); - connect_to_compute( - self.proxy_config, - ctx, - &conn_info, - conn_id, - endpoint_pool.clone(), - ) - .await + if let Some(client) = client { + if client.is_closed() { + info!("pool: cached connection '{conn_info}' is closed, opening a new one"); + return Ok(None); } else { - info!("pool: reusing connection '{conn_info}'"); - client.session.send(ctx.session_id)?; + tracing::Span::current().record("conn_id", tracing::field::display(client.conn_id)); tracing::Span::current().record( "pid", &tracing::field::display(client.inner.get_process_id()), ); + info!("pool: reusing connection '{conn_info}'"); + client.session.send(ctx.session_id)?; ctx.latency_timer.pool_hit(); ctx.latency_timer.success(); - return Ok(Client::new(client, conn_info, endpoint_pool).await); + return Ok(Some(Client::new(client, conn_info.clone(), endpoint_pool))); } - } else { - let conn_id = uuid::Uuid::new_v4(); - info!(%conn_id, "pool: opening a new connection '{conn_info}'"); - connect_to_compute( - self.proxy_config, - ctx, - &conn_info, - conn_id, - endpoint_pool.clone(), - ) - .await - }; - if let Ok(client) = &new_client { - tracing::Span::current().record( - "pid", - &tracing::field::display(client.inner.get_process_id()), - ); } - - match &new_client { - // clear the hash. 
it's no longer valid - // TODO: update tokio-postgres fork to allow access to this error kind directly - Err(err) - if hash_valid && err.to_string().contains("password authentication failed") => - { - let pool = self.get_or_create_endpoint_pool(&conn_info.endpoint_cache_key()); - let mut pool = pool.write(); - if let Some(entry) = pool.pools.get_mut(&conn_info.db_and_user()) { - entry.password_hash = None; - } - } - // new password is valid and we should insert/update it - Ok(_) if !force_new && !hash_valid => { - let pw = conn_info.password.clone(); - let new_hash = tokio::task::spawn_blocking(move || { - let salt = SaltString::generate(rand::rngs::OsRng); - Pbkdf2 - .hash_password_customized(pw.as_bytes(), None, None, PARAMS, &salt) - .map(|s| s.serialize()) - }) - .await??; - - let pool = self.get_or_create_endpoint_pool(&conn_info.endpoint_cache_key()); - let mut pool = pool.write(); - pool.pools - .entry(conn_info.db_and_user()) - .or_default() - .password_hash = Some(new_hash); - } - _ => {} - } - let new_client = new_client?; - Ok(Client::new(new_client, conn_info, endpoint_pool).await) + Ok(None) } fn get_or_create_endpoint_pool( - &self, + self: &Arc, endpoint: &EndpointCacheKey, - ) -> Arc> { + ) -> Arc>> { // fast path if let Some(pool) = self.global_pool.get(endpoint) { return pool.clone(); @@ -454,12 +406,10 @@ impl GlobalConnPool { let new_pool = Arc::new(RwLock::new(EndpointConnPool { pools: HashMap::new(), total_conns: 0, - max_conns: self - .proxy_config - .http_config - .pool_options - .max_conns_per_endpoint, + max_conns: self.config.pool_options.max_conns_per_endpoint, _guard: ENDPOINT_POOLS.guard(), + global_connections_count: self.global_connections_count.clone(), + global_pool_size_max_conns: self.config.pool_options.max_total_conns, })); // find or create a pool for this endpoint @@ -488,196 +438,130 @@ impl GlobalConnPool { } } -struct TokioMechanism<'a> { - pool: Weak>, - conn_info: &'a ConnInfo, - conn_id: uuid::Uuid, - idle: Duration, -} 
- -#[async_trait] -impl ConnectMechanism for TokioMechanism<'_> { - type Connection = ClientInner; - type ConnectError = tokio_postgres::Error; - type Error = anyhow::Error; - - async fn connect_once( - &self, - ctx: &mut RequestMonitoring, - node_info: &console::CachedNodeInfo, - timeout: time::Duration, - ) -> Result { - connect_to_compute_once( - ctx, - node_info, - self.conn_info, - timeout, - self.conn_id, - self.pool.clone(), - self.idle, - ) - .await - } - - fn update_connect_config(&self, _config: &mut compute::ConnCfg) {} -} - -// Wake up the destination if needed. Code here is a bit involved because -// we reuse the code from the usual proxy and we need to prepare few structures -// that this code expects. -#[tracing::instrument(fields(pid = tracing::field::Empty), skip_all)] -async fn connect_to_compute( - config: &config::ProxyConfig, +pub fn poll_client( + global_pool: Arc>, ctx: &mut RequestMonitoring, - conn_info: &ConnInfo, + conn_info: ConnInfo, + client: C, + mut connection: tokio_postgres::Connection, conn_id: uuid::Uuid, - pool: Weak>, -) -> anyhow::Result { - ctx.set_application(Some(APP_NAME)); - let backend = config - .auth_backend - .as_ref() - .map(|_| conn_info.user_info.clone()); - - if !config.disable_ip_check_for_http { - let allowed_ips = backend.get_allowed_ips(ctx).await?; - if !check_peer_addr_is_in_list(&ctx.peer_addr, &allowed_ips) { - return Err(auth::AuthError::ip_address_not_allowed().into()); - } - } - let node_info = backend - .wake_compute(ctx) - .await? 
- .context("missing cache entry from wake_compute")?; - - ctx.set_project(node_info.aux.clone()); - - crate::proxy::connect_compute::connect_to_compute( - ctx, - &TokioMechanism { - conn_id, - conn_info, - pool, - idle: config.http_config.pool_options.idle_timeout, - }, - node_info, - &backend, - ) - .await -} - -async fn connect_to_compute_once( - ctx: &mut RequestMonitoring, - node_info: &console::CachedNodeInfo, - conn_info: &ConnInfo, - timeout: time::Duration, - conn_id: uuid::Uuid, - pool: Weak>, - idle: Duration, -) -> Result { - let mut config = (*node_info.config).clone(); - let mut session = ctx.session_id; - - let (client, mut connection) = config - .user(&conn_info.user_info.user) - .password(&*conn_info.password) - .dbname(&conn_info.dbname) - .connect_timeout(timeout) - .connect(tokio_postgres::NoTls) - .await?; - + aux: MetricsAuxInfo, +) -> Client { let conn_gauge = NUM_DB_CONNECTIONS_GAUGE .with_label_values(&[ctx.protocol]) .guard(); - - tracing::Span::current().record("pid", &tracing::field::display(client.get_process_id())); - - let (tx, mut rx) = tokio::sync::watch::channel(session); + let mut session_id = ctx.session_id; + let (tx, mut rx) = tokio::sync::watch::channel(session_id); let span = info_span!(parent: None, "connection", %conn_id); span.in_scope(|| { - info!(%conn_info, %session, "new connection"); + info!(%conn_info, %session_id, "new connection"); }); + let pool = match conn_info.endpoint_cache_key() { + Some(endpoint) => Arc::downgrade(&global_pool.get_or_create_endpoint_pool(&endpoint)), + None => Weak::new(), + }; + let pool_clone = pool.clone(); let db_user = conn_info.db_and_user(); + let idle = global_pool.get_idle_timeout(); tokio::spawn( - async move { - let _conn_gauge = conn_gauge; - let mut idle_timeout = pin!(tokio::time::sleep(idle)); - poll_fn(move |cx| { - if matches!(rx.has_changed(), Ok(true)) { - session = *rx.borrow_and_update(); - info!(%session, "changed session"); - idle_timeout.as_mut().reset(Instant::now() + 
idle); - } + async move { + let _conn_gauge = conn_gauge; + let mut idle_timeout = pin!(tokio::time::sleep(idle)); + poll_fn(move |cx| { + if matches!(rx.has_changed(), Ok(true)) { + session_id = *rx.borrow_and_update(); + info!(%session_id, "changed session"); + idle_timeout.as_mut().reset(Instant::now() + idle); + } - // 5 minute idle connection timeout - if idle_timeout.as_mut().poll(cx).is_ready() { - idle_timeout.as_mut().reset(Instant::now() + idle); - info!("connection idle"); - if let Some(pool) = pool.clone().upgrade() { - // remove client from pool - should close the connection if it's idle. - // does nothing if the client is currently checked-out and in-use - if pool.write().remove_client(db_user.clone(), conn_id) { - info!("idle connection removed"); - } - } - } - - loop { - let message = ready!(connection.poll_message(cx)); - - match message { - Some(Ok(AsyncMessage::Notice(notice))) => { - info!(%session, "notice: {}", notice); - } - Some(Ok(AsyncMessage::Notification(notif))) => { - warn!(%session, pid = notif.process_id(), channel = notif.channel(), "notification received"); - } - Some(Ok(_)) => { - warn!(%session, "unknown message"); - } - Some(Err(e)) => { - error!(%session, "connection error: {}", e); - break - } - None => { - info!("connection closed"); - break - } - } - } - - // remove from connection pool + // 5 minute idle connection timeout + if idle_timeout.as_mut().poll(cx).is_ready() { + idle_timeout.as_mut().reset(Instant::now() + idle); + info!("connection idle"); if let Some(pool) = pool.clone().upgrade() { + // remove client from pool - should close the connection if it's idle. 
+ // does nothing if the client is currently checked-out and in-use if pool.write().remove_client(db_user.clone(), conn_id) { - info!("closed connection removed"); + info!("idle connection removed"); } } + } - Poll::Ready(()) - }).await; + loop { + let message = ready!(connection.poll_message(cx)); - } - .instrument(span) - ); + match message { + Some(Ok(AsyncMessage::Notice(notice))) => { + info!(%session_id, "notice: {}", notice); + } + Some(Ok(AsyncMessage::Notification(notif))) => { + warn!(%session_id, pid = notif.process_id(), channel = notif.channel(), "notification received"); + } + Some(Ok(_)) => { + warn!(%session_id, "unknown message"); + } + Some(Err(e)) => { + error!(%session_id, "connection error: {}", e); + break + } + None => { + info!("connection closed"); + break + } + } + } - Ok(ClientInner { + // remove from connection pool + if let Some(pool) = pool.clone().upgrade() { + if pool.write().remove_client(db_user.clone(), conn_id) { + info!("closed connection removed"); + } + } + + Poll::Ready(()) + }).await; + + } + .instrument(span)); + let inner = ClientInner { inner: client, session: tx, - aux: node_info.aux.clone(), + aux, conn_id, - }) + }; + Client::new(inner, conn_info, pool_clone) } -struct ClientInner { - inner: tokio_postgres::Client, +struct ClientInner { + inner: C, session: tokio::sync::watch::Sender, aux: MetricsAuxInfo, conn_id: uuid::Uuid, } -impl Client { +pub trait ClientInnerExt: Sync + Send + 'static { + fn is_closed(&self) -> bool; + fn get_process_id(&self) -> i32; +} + +impl ClientInnerExt for tokio_postgres::Client { + fn is_closed(&self) -> bool { + self.is_closed() + } + fn get_process_id(&self) -> i32 { + self.get_process_id() + } +} + +impl ClientInner { + pub fn is_closed(&self) -> bool { + self.inner.is_closed() + } +} + +impl Client { pub fn metrics(&self) -> Arc { let aux = &self.inner.as_ref().unwrap().aux; USAGE_METRICS.register(Ids { @@ -687,80 +571,60 @@ impl Client { } } -pub struct Client { - conn_id: 
uuid::Uuid, +pub struct Client { span: Span, - inner: Option, + inner: Option>, conn_info: ConnInfo, - pool: Weak>, + pool: Weak>>, } -pub struct Discard<'a> { - conn_id: uuid::Uuid, +pub struct Discard<'a, C: ClientInnerExt> { conn_info: &'a ConnInfo, - pool: &'a mut Weak>, + pool: &'a mut Weak>>, } -impl Client { - pub(self) async fn new( - inner: ClientInner, +impl Client { + pub(self) fn new( + inner: ClientInner, conn_info: ConnInfo, - pool: Weak>, + pool: Weak>>, ) -> Self { Self { - conn_id: inner.conn_id, inner: Some(inner), span: Span::current(), conn_info, pool, } } - pub fn inner(&mut self) -> (&mut tokio_postgres::Client, Discard<'_>) { + pub fn inner(&mut self) -> (&mut C, Discard<'_, C>) { let Self { inner, pool, - conn_id, conn_info, span: _, } = self; - ( - &mut inner - .as_mut() - .expect("client inner should not be removed") - .inner, - Discard { - pool, - conn_info, - conn_id: *conn_id, - }, - ) - } - - pub fn check_idle(&mut self, status: ReadyForQueryStatus) { - self.inner().1.check_idle(status) - } - pub fn discard(&mut self) { - self.inner().1.discard() + let inner = inner.as_mut().expect("client inner should not be removed"); + (&mut inner.inner, Discard { pool, conn_info }) } } -impl Discard<'_> { +impl Discard<'_, C> { pub fn check_idle(&mut self, status: ReadyForQueryStatus) { let conn_info = &self.conn_info; if status != ReadyForQueryStatus::Idle && std::mem::take(self.pool).strong_count() > 0 { - info!(conn_id = %self.conn_id, "pool: throwing away connection '{conn_info}' because connection is not idle") + info!("pool: throwing away connection '{conn_info}' because connection is not idle") } } pub fn discard(&mut self) { let conn_info = &self.conn_info; if std::mem::take(self.pool).strong_count() > 0 { - info!(conn_id = %self.conn_id, "pool: throwing away connection '{conn_info}' because connection is potentially in a broken state") + info!("pool: throwing away connection '{conn_info}' because connection is potentially in a broken 
state") } } } -impl Deref for Client { - type Target = tokio_postgres::Client; +impl Deref for Client { + type Target = C; fn deref(&self) -> &Self::Target { &self @@ -771,8 +635,8 @@ impl Deref for Client { } } -impl Drop for Client { - fn drop(&mut self) { +impl Client { + fn do_drop(&mut self) -> Option { let conn_info = self.conn_info.clone(); let client = self .inner @@ -781,10 +645,162 @@ impl Drop for Client { if let Some(conn_pool) = std::mem::take(&mut self.pool).upgrade() { let current_span = self.span.clone(); // return connection to the pool - tokio::task::spawn_blocking(move || { + return Some(move || { let _span = current_span.enter(); - let _ = EndpointConnPool::put(&conn_pool, &conn_info, client); + EndpointConnPool::put(&conn_pool, &conn_info, client); }); } + None + } +} + +impl Drop for Client { + fn drop(&mut self) { + if let Some(drop) = self.do_drop() { + tokio::task::spawn_blocking(drop); + } + } +} + +#[cfg(test)] +mod tests { + use std::{mem, sync::atomic::AtomicBool}; + + use super::*; + + struct MockClient(Arc); + impl MockClient { + fn new(is_closed: bool) -> Self { + MockClient(Arc::new(is_closed.into())) + } + } + impl ClientInnerExt for MockClient { + fn is_closed(&self) -> bool { + self.0.load(atomic::Ordering::Relaxed) + } + fn get_process_id(&self) -> i32 { + 0 + } + } + + fn create_inner() -> ClientInner { + create_inner_with(MockClient::new(false)) + } + + fn create_inner_with(client: MockClient) -> ClientInner { + ClientInner { + inner: client, + session: tokio::sync::watch::Sender::new(uuid::Uuid::new_v4()), + aux: Default::default(), + conn_id: uuid::Uuid::new_v4(), + } + } + + #[tokio::test] + async fn test_pool() { + let _ = env_logger::try_init(); + let config = Box::leak(Box::new(crate::config::HttpConfig { + pool_options: GlobalConnPoolOptions { + max_conns_per_endpoint: 2, + gc_epoch: Duration::from_secs(1), + pool_shards: 2, + idle_timeout: Duration::from_secs(1), + opt_in: false, + max_total_conns: 3, + }, + 
request_timeout: Duration::from_secs(1), + })); + let pool = GlobalConnPool::new(config); + let conn_info = ConnInfo { + user_info: ComputeUserInfo { + user: "user".into(), + endpoint: "endpoint".into(), + options: Default::default(), + }, + dbname: "dbname".into(), + password: "password".as_bytes().into(), + }; + let ep_pool = Arc::downgrade( + &pool.get_or_create_endpoint_pool(&conn_info.endpoint_cache_key().unwrap()), + ); + { + let mut client = Client::new(create_inner(), conn_info.clone(), ep_pool.clone()); + assert_eq!(0, pool.get_global_connections_count()); + client.inner().1.discard(); + // Discard should not add the connection from the pool. + assert_eq!(0, pool.get_global_connections_count()); + } + { + let mut client = Client::new(create_inner(), conn_info.clone(), ep_pool.clone()); + client.do_drop().unwrap()(); + mem::forget(client); // drop the client + assert_eq!(1, pool.get_global_connections_count()); + } + { + let mut closed_client = Client::new( + create_inner_with(MockClient::new(true)), + conn_info.clone(), + ep_pool.clone(), + ); + closed_client.do_drop().unwrap()(); + mem::forget(closed_client); // drop the client + // The closed client shouldn't be added to the pool. + assert_eq!(1, pool.get_global_connections_count()); + } + let is_closed: Arc = Arc::new(false.into()); + { + let mut client = Client::new( + create_inner_with(MockClient(is_closed.clone())), + conn_info.clone(), + ep_pool.clone(), + ); + client.do_drop().unwrap()(); + mem::forget(client); // drop the client + + // The client should be added to the pool. + assert_eq!(2, pool.get_global_connections_count()); + } + { + let mut client = Client::new(create_inner(), conn_info, ep_pool); + client.do_drop().unwrap()(); + mem::forget(client); // drop the client + + // The client shouldn't be added to the pool. Because the ep-pool is full. 
+ assert_eq!(2, pool.get_global_connections_count()); + } + + let conn_info = ConnInfo { + user_info: ComputeUserInfo { + user: "user".into(), + endpoint: "endpoint-2".into(), + options: Default::default(), + }, + dbname: "dbname".into(), + password: "password".as_bytes().into(), + }; + let ep_pool = Arc::downgrade( + &pool.get_or_create_endpoint_pool(&conn_info.endpoint_cache_key().unwrap()), + ); + { + let mut client = Client::new(create_inner(), conn_info.clone(), ep_pool.clone()); + client.do_drop().unwrap()(); + mem::forget(client); // drop the client + assert_eq!(3, pool.get_global_connections_count()); + } + { + let mut client = Client::new(create_inner(), conn_info.clone(), ep_pool.clone()); + client.do_drop().unwrap()(); + mem::forget(client); // drop the client + + // The client shouldn't be added to the pool. Because the global pool is full. + assert_eq!(3, pool.get_global_connections_count()); + } + + is_closed.store(true, atomic::Ordering::Relaxed); + // Do gc for all shards. + pool.gc(0); + pool.gc(1); + // Closed client should be removed from the pool. + assert_eq!(2, pool.get_global_connections_count()); } } diff --git a/proxy/src/serverless/json.rs b/proxy/src/serverless/json.rs new file mode 100644 index 0000000000..c22c63e85b --- /dev/null +++ b/proxy/src/serverless/json.rs @@ -0,0 +1,462 @@ +use serde_json::Map; +use serde_json::Value; +use tokio_postgres::types::Kind; +use tokio_postgres::types::Type; +use tokio_postgres::Row; + +// +// Convert json non-string types to strings, so that they can be passed to Postgres +// as parameters. 
+// +pub fn json_to_pg_text(json: Vec) -> Vec> { + json.iter().map(json_value_to_pg_text).collect() +} + +fn json_value_to_pg_text(value: &Value) -> Option { + match value { + // special care for nulls + Value::Null => None, + + // convert to text with escaping + v @ (Value::Bool(_) | Value::Number(_) | Value::Object(_)) => Some(v.to_string()), + + // avoid escaping here, as we pass this as a parameter + Value::String(s) => Some(s.to_string()), + + // special care for arrays + Value::Array(_) => json_array_to_pg_array(value), + } +} + +// +// Serialize a JSON array to a Postgres array. Contrary to the strings in the params +// in the array we need to escape the strings. Postgres is okay with arrays of form +// '{1,"2",3}'::int[], so we don't check that array holds values of the same type, leaving +// it for Postgres to check. +// +// Example of the same escaping in node-postgres: packages/pg/lib/utils.js +// +fn json_array_to_pg_array(value: &Value) -> Option { + match value { + // special care for nulls + Value::Null => None, + + // convert to text with escaping + // here string needs to be escaped, as it is part of the array + v @ (Value::Bool(_) | Value::Number(_) | Value::String(_)) => Some(v.to_string()), + v @ Value::Object(_) => json_array_to_pg_array(&Value::String(v.to_string())), + + // recurse into array + Value::Array(arr) => { + let vals = arr + .iter() + .map(json_array_to_pg_array) + .map(|v| v.unwrap_or_else(|| "NULL".to_string())) + .collect::>() + .join(","); + + Some(format!("{{{}}}", vals)) + } + } +} + +#[derive(Debug, thiserror::Error)] +pub enum JsonConversionError { + #[error("internal error compute returned invalid data: {0}")] + AsTextError(tokio_postgres::Error), + #[error("parse int error: {0}")] + ParseIntError(#[from] std::num::ParseIntError), + #[error("parse float error: {0}")] + ParseFloatError(#[from] std::num::ParseFloatError), + #[error("parse json error: {0}")] + ParseJsonError(#[from] serde_json::Error), + #[error("unbalanced 
array")] + UnbalancedArray, +} + +// +// Convert postgres row with text-encoded values to JSON object +// +pub fn pg_text_row_to_json( + row: &Row, + columns: &[Type], + raw_output: bool, + array_mode: bool, +) -> Result { + let iter = row + .columns() + .iter() + .zip(columns) + .enumerate() + .map(|(i, (column, typ))| { + let name = column.name(); + let pg_value = row.as_text(i).map_err(JsonConversionError::AsTextError)?; + let json_value = if raw_output { + match pg_value { + Some(v) => Value::String(v.to_string()), + None => Value::Null, + } + } else { + pg_text_to_json(pg_value, typ)? + }; + Ok((name.to_string(), json_value)) + }); + + if array_mode { + // drop keys and aggregate into array + let arr = iter + .map(|r| r.map(|(_key, val)| val)) + .collect::, JsonConversionError>>()?; + Ok(Value::Array(arr)) + } else { + let obj = iter.collect::, JsonConversionError>>()?; + Ok(Value::Object(obj)) + } +} + +// +// Convert postgres text-encoded value to JSON value +// +fn pg_text_to_json(pg_value: Option<&str>, pg_type: &Type) -> Result { + if let Some(val) = pg_value { + if let Kind::Array(elem_type) = pg_type.kind() { + return pg_array_parse(val, elem_type); + } + + match *pg_type { + Type::BOOL => Ok(Value::Bool(val == "t")), + Type::INT2 | Type::INT4 => { + let val = val.parse::()?; + Ok(Value::Number(serde_json::Number::from(val))) + } + Type::FLOAT4 | Type::FLOAT8 => { + let fval = val.parse::()?; + let num = serde_json::Number::from_f64(fval); + if let Some(num) = num { + Ok(Value::Number(num)) + } else { + // Pass Nan, Inf, -Inf as strings + // JS JSON.stringify() does converts them to null, but we + // want to preserve them, so we pass them as strings + Ok(Value::String(val.to_string())) + } + } + Type::JSON | Type::JSONB => Ok(serde_json::from_str(val)?), + _ => Ok(Value::String(val.to_string())), + } + } else { + Ok(Value::Null) + } +} + +// +// Parse postgres array into JSON array. 
+// +// This is a bit involved because we need to handle nested arrays and quoted +// values. Unlike postgres we don't check that all nested arrays have the same +// dimensions, we just return them as is. +// +fn pg_array_parse(pg_array: &str, elem_type: &Type) -> Result { + _pg_array_parse(pg_array, elem_type, false).map(|(v, _)| v) +} + +fn _pg_array_parse( + pg_array: &str, + elem_type: &Type, + nested: bool, +) -> Result<(Value, usize), JsonConversionError> { + let mut pg_array_chr = pg_array.char_indices(); + let mut level = 0; + let mut quote = false; + let mut entries: Vec = Vec::new(); + let mut entry = String::new(); + + // skip bounds decoration + if let Some('[') = pg_array.chars().next() { + for (_, c) in pg_array_chr.by_ref() { + if c == '=' { + break; + } + } + } + + fn push_checked( + entry: &mut String, + entries: &mut Vec, + elem_type: &Type, + ) -> Result<(), JsonConversionError> { + if !entry.is_empty() { + // While in usual postgres response we get nulls as None and everything else + // as Some(&str), in arrays we get NULL as unquoted 'NULL' string (while + // string with value 'NULL' will be represented by '"NULL"'). So catch NULLs + // here while we have quotation info and convert them to None. 
+ if entry == "NULL" { + entries.push(pg_text_to_json(None, elem_type)?); + } else { + entries.push(pg_text_to_json(Some(entry), elem_type)?); + } + entry.clear(); + } + + Ok(()) + } + + while let Some((mut i, mut c)) = pg_array_chr.next() { + let mut escaped = false; + + if c == '\\' { + escaped = true; + (i, c) = pg_array_chr.next().unwrap(); + } + + match c { + '{' if !quote => { + level += 1; + if level > 1 { + let (res, off) = _pg_array_parse(&pg_array[i..], elem_type, true)?; + entries.push(res); + for _ in 0..off - 1 { + pg_array_chr.next(); + } + } + } + '}' if !quote => { + level -= 1; + if level == 0 { + push_checked(&mut entry, &mut entries, elem_type)?; + if nested { + return Ok((Value::Array(entries), i)); + } + } + } + '"' if !escaped => { + if quote { + // end of quoted string, so push it manually without any checks + // for emptiness or nulls + entries.push(pg_text_to_json(Some(&entry), elem_type)?); + entry.clear(); + } + quote = !quote; + } + ',' if !quote => { + push_checked(&mut entry, &mut entries, elem_type)?; + } + _ => { + entry.push(c); + } + } + } + + if level != 0 { + return Err(JsonConversionError::UnbalancedArray); + } + + Ok((Value::Array(entries), 0)) +} + +#[cfg(test)] +mod tests { + use super::*; + use serde_json::json; + + #[test] + fn test_atomic_types_to_pg_params() { + let json = vec![Value::Bool(true), Value::Bool(false)]; + let pg_params = json_to_pg_text(json); + assert_eq!( + pg_params, + vec![Some("true".to_owned()), Some("false".to_owned())] + ); + + let json = vec![Value::Number(serde_json::Number::from(42))]; + let pg_params = json_to_pg_text(json); + assert_eq!(pg_params, vec![Some("42".to_owned())]); + + let json = vec![Value::String("foo\"".to_string())]; + let pg_params = json_to_pg_text(json); + assert_eq!(pg_params, vec![Some("foo\"".to_owned())]); + + let json = vec![Value::Null]; + let pg_params = json_to_pg_text(json); + assert_eq!(pg_params, vec![None]); + } + + #[test] + fn test_json_array_to_pg_array() { + // 
atoms and escaping + let json = "[true, false, null, \"NULL\", 42, \"foo\", \"bar\\\"-\\\\\"]"; + let json: Value = serde_json::from_str(json).unwrap(); + let pg_params = json_to_pg_text(vec![json]); + assert_eq!( + pg_params, + vec![Some( + "{true,false,NULL,\"NULL\",42,\"foo\",\"bar\\\"-\\\\\"}".to_owned() + )] + ); + + // nested arrays + let json = "[[true, false], [null, 42], [\"foo\", \"bar\\\"-\\\\\"]]"; + let json: Value = serde_json::from_str(json).unwrap(); + let pg_params = json_to_pg_text(vec![json]); + assert_eq!( + pg_params, + vec![Some( + "{{true,false},{NULL,42},{\"foo\",\"bar\\\"-\\\\\"}}".to_owned() + )] + ); + // array of objects + let json = r#"[{"foo": 1},{"bar": 2}]"#; + let json: Value = serde_json::from_str(json).unwrap(); + let pg_params = json_to_pg_text(vec![json]); + assert_eq!( + pg_params, + vec![Some(r#"{"{\"foo\":1}","{\"bar\":2}"}"#.to_owned())] + ); + } + + #[test] + fn test_atomic_types_parse() { + assert_eq!( + pg_text_to_json(Some("foo"), &Type::TEXT).unwrap(), + json!("foo") + ); + assert_eq!(pg_text_to_json(None, &Type::TEXT).unwrap(), json!(null)); + assert_eq!(pg_text_to_json(Some("42"), &Type::INT4).unwrap(), json!(42)); + assert_eq!(pg_text_to_json(Some("42"), &Type::INT2).unwrap(), json!(42)); + assert_eq!( + pg_text_to_json(Some("42"), &Type::INT8).unwrap(), + json!("42") + ); + assert_eq!( + pg_text_to_json(Some("42.42"), &Type::FLOAT8).unwrap(), + json!(42.42) + ); + assert_eq!( + pg_text_to_json(Some("42.42"), &Type::FLOAT4).unwrap(), + json!(42.42) + ); + assert_eq!( + pg_text_to_json(Some("NaN"), &Type::FLOAT4).unwrap(), + json!("NaN") + ); + assert_eq!( + pg_text_to_json(Some("Infinity"), &Type::FLOAT4).unwrap(), + json!("Infinity") + ); + assert_eq!( + pg_text_to_json(Some("-Infinity"), &Type::FLOAT4).unwrap(), + json!("-Infinity") + ); + + let json: Value = + serde_json::from_str("{\"s\":\"str\",\"n\":42,\"f\":4.2,\"a\":[null,3,\"a\"]}") + .unwrap(); + assert_eq!( + pg_text_to_json( + 
Some(r#"{"s":"str","n":42,"f":4.2,"a":[null,3,"a"]}"#), + &Type::JSONB + ) + .unwrap(), + json + ); + } + + #[test] + fn test_pg_array_parse_text() { + fn pt(pg_arr: &str) -> Value { + pg_array_parse(pg_arr, &Type::TEXT).unwrap() + } + assert_eq!( + pt(r#"{"aa\"\\\,a",cha,"bbbb"}"#), + json!(["aa\"\\,a", "cha", "bbbb"]) + ); + assert_eq!( + pt(r#"{{"foo","bar"},{"bee","bop"}}"#), + json!([["foo", "bar"], ["bee", "bop"]]) + ); + assert_eq!( + pt(r#"{{{{"foo",NULL,"bop",bup}}}}"#), + json!([[[["foo", null, "bop", "bup"]]]]) + ); + assert_eq!( + pt(r#"{{"1",2,3},{4,NULL,6},{NULL,NULL,NULL}}"#), + json!([["1", "2", "3"], ["4", null, "6"], [null, null, null]]) + ); + } + + #[test] + fn test_pg_array_parse_bool() { + fn pb(pg_arr: &str) -> Value { + pg_array_parse(pg_arr, &Type::BOOL).unwrap() + } + assert_eq!(pb(r#"{t,f,t}"#), json!([true, false, true])); + assert_eq!(pb(r#"{{t,f,t}}"#), json!([[true, false, true]])); + assert_eq!( + pb(r#"{{t,f},{f,t}}"#), + json!([[true, false], [false, true]]) + ); + assert_eq!( + pb(r#"{{t,NULL},{NULL,f}}"#), + json!([[true, null], [null, false]]) + ); + } + + #[test] + fn test_pg_array_parse_numbers() { + fn pn(pg_arr: &str, ty: &Type) -> Value { + pg_array_parse(pg_arr, ty).unwrap() + } + assert_eq!(pn(r#"{1,2,3}"#, &Type::INT4), json!([1, 2, 3])); + assert_eq!(pn(r#"{1,2,3}"#, &Type::INT2), json!([1, 2, 3])); + assert_eq!(pn(r#"{1,2,3}"#, &Type::INT8), json!(["1", "2", "3"])); + assert_eq!(pn(r#"{1,2,3}"#, &Type::FLOAT4), json!([1.0, 2.0, 3.0])); + assert_eq!(pn(r#"{1,2,3}"#, &Type::FLOAT8), json!([1.0, 2.0, 3.0])); + assert_eq!( + pn(r#"{1.1,2.2,3.3}"#, &Type::FLOAT4), + json!([1.1, 2.2, 3.3]) + ); + assert_eq!( + pn(r#"{1.1,2.2,3.3}"#, &Type::FLOAT8), + json!([1.1, 2.2, 3.3]) + ); + assert_eq!( + pn(r#"{NaN,Infinity,-Infinity}"#, &Type::FLOAT4), + json!(["NaN", "Infinity", "-Infinity"]) + ); + assert_eq!( + pn(r#"{NaN,Infinity,-Infinity}"#, &Type::FLOAT8), + json!(["NaN", "Infinity", "-Infinity"]) + ); + } + + #[test] + fn 
test_pg_array_with_decoration() { + fn p(pg_arr: &str) -> Value { + pg_array_parse(pg_arr, &Type::INT2).unwrap() + } + assert_eq!( + p(r#"[1:1][-2:-1][3:5]={{{1,2,3},{4,5,6}}}"#), + json!([[[1, 2, 3], [4, 5, 6]]]) + ); + } + + #[test] + fn test_pg_array_parse_json() { + fn pt(pg_arr: &str) -> Value { + pg_array_parse(pg_arr, &Type::JSONB).unwrap() + } + assert_eq!(pt(r#"{"{}"}"#), json!([{}])); + assert_eq!( + pt(r#"{"{\"foo\": 1, \"bar\": 2}"}"#), + json!([{"foo": 1, "bar": 2}]) + ); + assert_eq!( + pt(r#"{"{\"foo\": 1}", "{\"bar\": 2}"}"#), + json!([{"foo": 1}, {"bar": 2}]) + ); + assert_eq!( + pt(r#"{{"{\"foo\": 1}", "{\"bar\": 2}"}}"#), + json!([[{"foo": 1}, {"bar": 2}]]) + ); + } +} diff --git a/proxy/src/serverless/sql_over_http.rs b/proxy/src/serverless/sql_over_http.rs index f108ab34ab..86c278030f 100644 --- a/proxy/src/serverless/sql_over_http.rs +++ b/proxy/src/serverless/sql_over_http.rs @@ -1,8 +1,11 @@ +use std::pin::pin; use std::sync::Arc; -use anyhow::bail; -use futures::pin_mut; +use futures::future::select; +use futures::future::try_join; +use futures::future::Either; use futures::StreamExt; +use futures::TryFutureExt; use hyper::body::HttpBody; use hyper::header; use hyper::http::HeaderName; @@ -11,40 +14,53 @@ use hyper::Response; use hyper::StatusCode; use hyper::{Body, HeaderMap, Request}; use serde_json::json; -use serde_json::Map; use serde_json::Value; +use tokio::time; use tokio_postgres::error::DbError; use tokio_postgres::error::ErrorPosition; -use tokio_postgres::types::Kind; -use tokio_postgres::types::Type; +use tokio_postgres::error::SqlState; use tokio_postgres::GenericClient; use tokio_postgres::IsolationLevel; +use tokio_postgres::NoTls; use tokio_postgres::ReadyForQueryStatus; -use tokio_postgres::Row; use tokio_postgres::Transaction; +use tokio_util::sync::CancellationToken; use tracing::error; -use tracing::instrument; +use tracing::info; use url::Url; use utils::http::error::ApiError; use utils::http::json::json_response; use 
crate::auth::backend::ComputeUserInfo; use crate::auth::endpoint_sni; -use crate::config::HttpConfig; +use crate::auth::ComputeUserInfoParseError; +use crate::config::ProxyConfig; use crate::config::TlsConfig; use crate::context::RequestMonitoring; +use crate::error::ErrorKind; +use crate::error::ReportableError; +use crate::error::UserFacingError; +use crate::metrics::HTTP_CONTENT_LENGTH; use crate::metrics::NUM_CONNECTION_REQUESTS_GAUGE; use crate::proxy::NeonOptions; -use crate::EndpointId; +use crate::serverless::backend::HttpConnError; +use crate::DbName; use crate::RoleName; +use super::backend::PoolingBackend; use super::conn_pool::ConnInfo; -use super::conn_pool::GlobalConnPool; +use super::json::json_to_pg_text; +use super::json::pg_text_row_to_json; +use super::json::JsonConversionError; #[derive(serde::Deserialize)] +#[serde(rename_all = "camelCase")] struct QueryData { query: String, - params: Vec, + #[serde(deserialize_with = "bytes_to_pg_text")] + params: Vec>, + #[serde(default)] + array_mode: Option, } #[derive(serde::Deserialize)] @@ -61,7 +77,6 @@ enum Payload { const MAX_RESPONSE_SIZE: usize = 10 * 1024 * 1024; // 10 MiB const MAX_REQUEST_SIZE: u64 = 10 * 1024 * 1024; // 10 MiB -const SERVERLESS_DRIVER_SNI_HOSTNAME_FIRST_PART: &str = "api"; static RAW_TEXT_OUTPUT: HeaderName = HeaderName::from_static("neon-raw-text-output"); static ARRAY_MODE: HeaderName = HeaderName::from_static("neon-array-mode"); @@ -72,135 +87,111 @@ static TXN_DEFERRABLE: HeaderName = HeaderName::from_static("neon-batch-deferrab static HEADER_VALUE_TRUE: HeaderValue = HeaderValue::from_static("true"); -// -// Convert json non-string types to strings, so that they can be passed to Postgres -// as parameters. 
-// -fn json_to_pg_text(json: Vec) -> Vec> { - json.iter() - .map(|value| { - match value { - // special care for nulls - Value::Null => None, - - // convert to text with escaping - v @ (Value::Bool(_) | Value::Number(_) | Value::Object(_)) => Some(v.to_string()), - - // avoid escaping here, as we pass this as a parameter - Value::String(s) => Some(s.to_string()), - - // special care for arrays - Value::Array(_) => json_array_to_pg_array(value), - } - }) - .collect() +fn bytes_to_pg_text<'de, D>(deserializer: D) -> Result>, D::Error> +where + D: serde::de::Deserializer<'de>, +{ + // TODO: consider avoiding the allocation here. + let json: Vec = serde::de::Deserialize::deserialize(deserializer)?; + Ok(json_to_pg_text(json)) } -// -// Serialize a JSON array to a Postgres array. Contrary to the strings in the params -// in the array we need to escape the strings. Postgres is okay with arrays of form -// '{1,"2",3}'::int[], so we don't check that array holds values of the same type, leaving -// it for Postgres to check. 
-// -// Example of the same escaping in node-postgres: packages/pg/lib/utils.js -// -fn json_array_to_pg_array(value: &Value) -> Option { - match value { - // special care for nulls - Value::Null => None, +#[derive(Debug, thiserror::Error)] +pub enum ConnInfoError { + #[error("invalid header: {0}")] + InvalidHeader(&'static str), + #[error("invalid connection string: {0}")] + UrlParseError(#[from] url::ParseError), + #[error("incorrect scheme")] + IncorrectScheme, + #[error("missing database name")] + MissingDbName, + #[error("invalid database name")] + InvalidDbName, + #[error("missing username")] + MissingUsername, + #[error("invalid username: {0}")] + InvalidUsername(#[from] std::string::FromUtf8Error), + #[error("missing password")] + MissingPassword, + #[error("missing hostname")] + MissingHostname, + #[error("invalid hostname: {0}")] + InvalidEndpoint(#[from] ComputeUserInfoParseError), + #[error("malformed endpoint")] + MalformedEndpoint, +} - // convert to text with escaping - // here string needs to be escaped, as it is part of the array - v @ (Value::Bool(_) | Value::Number(_) | Value::String(_)) => Some(v.to_string()), - v @ Value::Object(_) => json_array_to_pg_array(&Value::String(v.to_string())), +impl ReportableError for ConnInfoError { + fn get_error_kind(&self) -> ErrorKind { + ErrorKind::User + } +} - // recurse into array - Value::Array(arr) => { - let vals = arr - .iter() - .map(json_array_to_pg_array) - .map(|v| v.unwrap_or_else(|| "NULL".to_string())) - .collect::>() - .join(","); - - Some(format!("{{{}}}", vals)) - } +impl UserFacingError for ConnInfoError { + fn to_string_client(&self) -> String { + self.to_string() } } fn get_conn_info( ctx: &mut RequestMonitoring, headers: &HeaderMap, - sni_hostname: Option, tls: &TlsConfig, -) -> Result { +) -> Result { + // HTTP only uses cleartext (for now and likely always) + ctx.set_auth_method(crate::context::AuthMethod::Cleartext); + let connection_string = headers .get("Neon-Connection-String") - 
.ok_or(anyhow::anyhow!("missing connection string"))? - .to_str()?; + .ok_or(ConnInfoError::InvalidHeader("Neon-Connection-String"))? + .to_str() + .map_err(|_| ConnInfoError::InvalidHeader("Neon-Connection-String"))?; let connection_url = Url::parse(connection_string)?; let protocol = connection_url.scheme(); if protocol != "postgres" && protocol != "postgresql" { - return Err(anyhow::anyhow!( - "connection string must start with postgres: or postgresql:" - )); + return Err(ConnInfoError::IncorrectScheme); } let mut url_path = connection_url .path_segments() - .ok_or(anyhow::anyhow!("missing database name"))?; + .ok_or(ConnInfoError::MissingDbName)?; - let dbname = url_path - .next() - .ok_or(anyhow::anyhow!("invalid database name"))?; + let dbname: DbName = url_path.next().ok_or(ConnInfoError::InvalidDbName)?.into(); + ctx.set_dbname(dbname.clone()); - let username = RoleName::from(connection_url.username()); + let username = RoleName::from(urlencoding::decode(connection_url.username())?); if username.is_empty() { - return Err(anyhow::anyhow!("missing username")); + return Err(ConnInfoError::MissingUsername); } ctx.set_user(username.clone()); let password = connection_url .password() - .ok_or(anyhow::anyhow!("no password"))?; - - // TLS certificate selector now based on SNI hostname, so if we are running here - // we are sure that SNI hostname is set to one of the configured domain names. - let sni_hostname = sni_hostname.ok_or(anyhow::anyhow!("no SNI hostname set"))?; + .ok_or(ConnInfoError::MissingPassword)?; + let password = urlencoding::decode_binary(password.as_bytes()); let hostname = connection_url .host_str() - .ok_or(anyhow::anyhow!("no host"))?; + .ok_or(ConnInfoError::MissingHostname)?; - let host_header = headers - .get("host") - .and_then(|h| h.to_str().ok()) - .and_then(|h| h.split(':').next()); - - // sni_hostname has to be either the same as hostname or the one used in serverless driver. - if !check_matches(&sni_hostname, hostname)? 
{ - return Err(anyhow::anyhow!("mismatched SNI hostname and hostname")); - } else if let Some(h) = host_header { - if h != sni_hostname { - return Err(anyhow::anyhow!("mismatched host header and hostname")); - } - } - - let endpoint = endpoint_sni(hostname, &tls.common_names)?; - - let endpoint: EndpointId = endpoint.into(); - ctx.set_endpoint_id(Some(endpoint.clone())); + let endpoint = + endpoint_sni(hostname, &tls.common_names)?.ok_or(ConnInfoError::MalformedEndpoint)?; + ctx.set_endpoint_id(endpoint.clone()); let pairs = connection_url.query_pairs(); let mut options = Option::None; for (key, value) in pairs { - if key == "options" { - options = Some(NeonOptions::parse_options_raw(&value)); - break; + match &*key { + "options" => { + options = Some(NeonOptions::parse_options_raw(&value)); + } + "application_name" => ctx.set_application(Some(value.into())), + _ => {} } } @@ -212,130 +203,142 @@ fn get_conn_info( Ok(ConnInfo { user_info, - dbname: dbname.into(), - password: password.into(), + dbname, + password: match password { + std::borrow::Cow::Borrowed(b) => b.into(), + std::borrow::Cow::Owned(b) => b.into(), + }, }) } -fn check_matches(sni_hostname: &str, hostname: &str) -> Result { - if sni_hostname == hostname { - return Ok(true); - } - let (sni_hostname_first, sni_hostname_rest) = sni_hostname - .split_once('.') - .ok_or_else(|| anyhow::anyhow!("Unexpected sni format."))?; - let (_, hostname_rest) = hostname - .split_once('.') - .ok_or_else(|| anyhow::anyhow!("Unexpected hostname format."))?; - Ok(sni_hostname_rest == hostname_rest - && sni_hostname_first == SERVERLESS_DRIVER_SNI_HOSTNAME_FIRST_PART) -} - // TODO: return different http error codes pub async fn handle( - tls: &'static TlsConfig, - config: &'static HttpConfig, - ctx: &mut RequestMonitoring, + config: &'static ProxyConfig, + mut ctx: RequestMonitoring, request: Request, - sni_hostname: Option, - conn_pool: Arc, + backend: Arc, ) -> Result, ApiError> { - let result = tokio::time::timeout( - 
config.request_timeout, - handle_inner(tls, config, ctx, request, sni_hostname, conn_pool), - ) - .await; + let cancel = CancellationToken::new(); + let cancel2 = cancel.clone(); + let handle = tokio::spawn(async move { + time::sleep(config.http_config.request_timeout).await; + cancel2.cancel(); + }); + + let result = handle_inner(cancel, config, &mut ctx, request, backend).await; + handle.abort(); + let mut response = match result { - Ok(r) => match r { - Ok(r) => r, - Err(e) => { - let mut message = format!("{:?}", e); - let db_error = e - .downcast_ref::() - .and_then(|e| e.as_db_error()); - fn get<'a, T: serde::Serialize>( - db: Option<&'a DbError>, - x: impl FnOnce(&'a DbError) -> T, - ) -> Value { - db.map(x) - .and_then(|t| serde_json::to_value(t).ok()) - .unwrap_or_default() - } + Ok(r) => { + ctx.set_success(); + r + } + Err(e @ SqlOverHttpError::Cancelled(_)) => { + let error_kind = e.get_error_kind(); + ctx.set_error_kind(error_kind); - if let Some(db_error) = db_error { - db_error.message().clone_into(&mut message); - } - - let position = db_error.and_then(|db| db.position()); - let (position, internal_position, internal_query) = match position { - Some(ErrorPosition::Original(position)) => ( - Value::String(position.to_string()), - Value::Null, - Value::Null, - ), - Some(ErrorPosition::Internal { position, query }) => ( - Value::Null, - Value::String(position.to_string()), - Value::String(query.clone()), - ), - None => (Value::Null, Value::Null, Value::Null), - }; - - let code = get(db_error, |db| db.code().code()); - let severity = get(db_error, |db| db.severity()); - let detail = get(db_error, |db| db.detail()); - let hint = get(db_error, |db| db.hint()); - let where_ = get(db_error, |db| db.where_()); - let table = get(db_error, |db| db.table()); - let column = get(db_error, |db| db.column()); - let schema = get(db_error, |db| db.schema()); - let datatype = get(db_error, |db| db.datatype()); - let constraint = get(db_error, |db| db.constraint()); - 
let file = get(db_error, |db| db.file()); - let line = get(db_error, |db| db.line().map(|l| l.to_string())); - let routine = get(db_error, |db| db.routine()); - - error!( - ?code, - "sql-over-http per-client task finished with an error: {e:#}" - ); - // TODO: this shouldn't always be bad request. - json_response( - StatusCode::BAD_REQUEST, - json!({ - "message": message, - "code": code, - "detail": detail, - "hint": hint, - "position": position, - "internalPosition": internal_position, - "internalQuery": internal_query, - "severity": severity, - "where": where_, - "table": table, - "column": column, - "schema": schema, - "dataType": datatype, - "constraint": constraint, - "file": file, - "line": line, - "routine": routine, - }), - )? - } - }, - Err(_) => { let message = format!( - "HTTP-Connection timed out, execution time exeeded {} seconds", - config.request_timeout.as_secs() + "Query cancelled, runtime exceeded. SQL queries over HTTP must not exceed {} seconds of runtime. Please consider using our websocket based connections", + config.http_config.request_timeout.as_secs_f64() ); - error!(message); + + tracing::info!( + kind=error_kind.to_metric_label(), + error=%e, + msg=message, + "forwarding error to user" + ); + json_response( - StatusCode::GATEWAY_TIMEOUT, - json!({ "message": message, "code": StatusCode::GATEWAY_TIMEOUT.as_u16() }), + StatusCode::BAD_REQUEST, + json!({ "message": message, "code": SqlState::PROTOCOL_VIOLATION.code() }), + )? 
+ } + Err(e) => { + let error_kind = e.get_error_kind(); + ctx.set_error_kind(error_kind); + + let mut message = e.to_string_client(); + let db_error = match &e { + SqlOverHttpError::ConnectCompute(HttpConnError::ConnectionError(e)) + | SqlOverHttpError::Postgres(e) => e.as_db_error(), + _ => None, + }; + fn get<'a, T: serde::Serialize>( + db: Option<&'a DbError>, + x: impl FnOnce(&'a DbError) -> T, + ) -> Value { + db.map(x) + .and_then(|t| serde_json::to_value(t).ok()) + .unwrap_or_default() + } + + if let Some(db_error) = db_error { + db_error.message().clone_into(&mut message); + } + + let position = db_error.and_then(|db| db.position()); + let (position, internal_position, internal_query) = match position { + Some(ErrorPosition::Original(position)) => ( + Value::String(position.to_string()), + Value::Null, + Value::Null, + ), + Some(ErrorPosition::Internal { position, query }) => ( + Value::Null, + Value::String(position.to_string()), + Value::String(query.clone()), + ), + None => (Value::Null, Value::Null, Value::Null), + }; + + let code = get(db_error, |db| db.code().code()); + let severity = get(db_error, |db| db.severity()); + let detail = get(db_error, |db| db.detail()); + let hint = get(db_error, |db| db.hint()); + let where_ = get(db_error, |db| db.where_()); + let table = get(db_error, |db| db.table()); + let column = get(db_error, |db| db.column()); + let schema = get(db_error, |db| db.schema()); + let datatype = get(db_error, |db| db.datatype()); + let constraint = get(db_error, |db| db.constraint()); + let file = get(db_error, |db| db.file()); + let line = get(db_error, |db| db.line().map(|l| l.to_string())); + let routine = get(db_error, |db| db.routine()); + + tracing::info!( + kind=error_kind.to_metric_label(), + error=%e, + msg=message, + "forwarding error to user" + ); + + // TODO: this shouldn't always be bad request. 
+ json_response( + StatusCode::BAD_REQUEST, + json!({ + "message": message, + "code": code, + "detail": detail, + "hint": hint, + "position": position, + "internalPosition": internal_position, + "internalQuery": internal_query, + "severity": severity, + "where": where_, + "table": table, + "column": column, + "schema": schema, + "dataType": datatype, + "constraint": constraint, + "file": file, + "line": line, + "routine": routine, + }), )? } }; + response.headers_mut().insert( "Access-Control-Allow-Origin", hyper::http::HeaderValue::from_static("*"), @@ -343,34 +346,123 @@ pub async fn handle( Ok(response) } -#[instrument(name = "sql-over-http", fields(pid = tracing::field::Empty), skip_all)] +#[derive(Debug, thiserror::Error)] +pub enum SqlOverHttpError { + #[error("{0}")] + ReadPayload(#[from] ReadPayloadError), + #[error("{0}")] + ConnectCompute(#[from] HttpConnError), + #[error("{0}")] + ConnInfo(#[from] ConnInfoError), + #[error("request is too large (max is {MAX_REQUEST_SIZE} bytes)")] + RequestTooLarge, + #[error("response is too large (max is {MAX_RESPONSE_SIZE} bytes)")] + ResponseTooLarge, + #[error("invalid isolation level")] + InvalidIsolationLevel, + #[error("{0}")] + Postgres(#[from] tokio_postgres::Error), + #[error("{0}")] + JsonConversion(#[from] JsonConversionError), + #[error("{0}")] + Cancelled(SqlOverHttpCancel), +} + +impl ReportableError for SqlOverHttpError { + fn get_error_kind(&self) -> ErrorKind { + match self { + SqlOverHttpError::ReadPayload(e) => e.get_error_kind(), + SqlOverHttpError::ConnectCompute(e) => e.get_error_kind(), + SqlOverHttpError::ConnInfo(e) => e.get_error_kind(), + SqlOverHttpError::RequestTooLarge => ErrorKind::User, + SqlOverHttpError::ResponseTooLarge => ErrorKind::User, + SqlOverHttpError::InvalidIsolationLevel => ErrorKind::User, + SqlOverHttpError::Postgres(p) => p.get_error_kind(), + SqlOverHttpError::JsonConversion(_) => ErrorKind::Postgres, + SqlOverHttpError::Cancelled(c) => c.get_error_kind(), + } + } +} + 
+impl UserFacingError for SqlOverHttpError { + fn to_string_client(&self) -> String { + match self { + SqlOverHttpError::ReadPayload(p) => p.to_string(), + SqlOverHttpError::ConnectCompute(c) => c.to_string_client(), + SqlOverHttpError::ConnInfo(c) => c.to_string_client(), + SqlOverHttpError::RequestTooLarge => self.to_string(), + SqlOverHttpError::ResponseTooLarge => self.to_string(), + SqlOverHttpError::InvalidIsolationLevel => self.to_string(), + SqlOverHttpError::Postgres(p) => p.to_string(), + SqlOverHttpError::JsonConversion(_) => "could not parse postgres response".to_string(), + SqlOverHttpError::Cancelled(_) => self.to_string(), + } + } +} + +#[derive(Debug, thiserror::Error)] +pub enum ReadPayloadError { + #[error("could not read the HTTP request body: {0}")] + Read(#[from] hyper::Error), + #[error("could not parse the HTTP request body: {0}")] + Parse(#[from] serde_json::Error), +} + +impl ReportableError for ReadPayloadError { + fn get_error_kind(&self) -> ErrorKind { + match self { + ReadPayloadError::Read(_) => ErrorKind::ClientDisconnect, + ReadPayloadError::Parse(_) => ErrorKind::User, + } + } +} + +#[derive(Debug, thiserror::Error)] +pub enum SqlOverHttpCancel { + #[error("query was cancelled")] + Postgres, + #[error("query was cancelled while stuck trying to connect to the database")] + Connect, +} + +impl ReportableError for SqlOverHttpCancel { + fn get_error_kind(&self) -> ErrorKind { + match self { + SqlOverHttpCancel::Postgres => ErrorKind::RateLimit, + SqlOverHttpCancel::Connect => ErrorKind::ServiceRateLimit, + } + } +} + async fn handle_inner( - tls: &'static TlsConfig, - config: &'static HttpConfig, + cancel: CancellationToken, + config: &'static ProxyConfig, ctx: &mut RequestMonitoring, request: Request, - sni_hostname: Option, - conn_pool: Arc, -) -> anyhow::Result> { + backend: Arc, +) -> Result, SqlOverHttpError> { let _request_gauge = NUM_CONNECTION_REQUESTS_GAUGE - .with_label_values(&["http"]) + .with_label_values(&[ctx.protocol]) 
.guard(); + info!("handling interactive connection from client"); // // Determine the destination and connection params // let headers = request.headers(); - let conn_info = get_conn_info(ctx, headers, sni_hostname, tls)?; + // TLS config should be there. + let conn_info = get_conn_info(ctx, headers, config.tls_config.as_ref().unwrap())?; + info!(user = conn_info.user_info.user.as_str(), "credentials"); // Determine the output options. Default behaviour is 'false'. Anything that is not // strictly 'true' assumed to be false. let raw_output = headers.get(&RAW_TEXT_OUTPUT) == Some(&HEADER_VALUE_TRUE); - let array_mode = headers.get(&ARRAY_MODE) == Some(&HEADER_VALUE_TRUE); + let default_array_mode = headers.get(&ARRAY_MODE) == Some(&HEADER_VALUE_TRUE); // Allow connection pooling only if explicitly requested // or if we have decided that http pool is no longer opt-in - let allow_pool = - !config.pool_options.opt_in || headers.get(&ALLOW_POOL) == Some(&HEADER_VALUE_TRUE); + let allow_pool = !config.http_config.pool_options.opt_in + || headers.get(&ALLOW_POOL) == Some(&HEADER_VALUE_TRUE); // isolation level, read only and deferrable @@ -381,7 +473,7 @@ async fn handle_inner( b"ReadUncommitted" => IsolationLevel::ReadUncommitted, b"ReadCommitted" => IsolationLevel::ReadCommitted, b"RepeatableRead" => IsolationLevel::RepeatableRead, - _ => bail!("invalid isolation level"), + _ => return Err(SqlOverHttpError::InvalidIsolationLevel), }), None => None, }; @@ -389,28 +481,54 @@ async fn handle_inner( let txn_read_only = headers.get(&TXN_READ_ONLY) == Some(&HEADER_VALUE_TRUE); let txn_deferrable = headers.get(&TXN_DEFERRABLE) == Some(&HEADER_VALUE_TRUE); - let paused = ctx.latency_timer.pause(); let request_content_length = match request.body().size_hint().upper() { Some(v) => v, None => MAX_REQUEST_SIZE + 1, }; - drop(paused); + info!(request_content_length, "request size in bytes"); + HTTP_CONTENT_LENGTH.observe(request_content_length as f64); // we don't have a streaming 
request support yet so this is to prevent OOM // from a malicious user sending an extremely large request body if request_content_length > MAX_REQUEST_SIZE { - return Err(anyhow::anyhow!( - "request is too large (max is {MAX_REQUEST_SIZE} bytes)" - )); + return Err(SqlOverHttpError::RequestTooLarge); } - // - // Read the query and query params from the request body - // - let body = hyper::body::to_bytes(request.into_body()).await?; - let payload: Payload = serde_json::from_slice(&body)?; + let fetch_and_process_request = async { + let body = hyper::body::to_bytes(request.into_body()).await?; + info!(length = body.len(), "request payload read"); + let payload: Payload = serde_json::from_slice(&body)?; + Ok::(payload) // Adjust error type accordingly + } + .map_err(SqlOverHttpError::from); - let mut client = conn_pool.get(ctx, conn_info, !allow_pool).await?; + let authenticate_and_connect = async { + let keys = backend.authenticate(ctx, &conn_info).await?; + let client = backend + .connect_to_compute(ctx, conn_info, keys, !allow_pool) + .await?; + // not strictly necessary to mark success here, + // but it's just insurance for if we forget it somewhere else + ctx.latency_timer.success(); + Ok::<_, HttpConnError>(client) + } + .map_err(SqlOverHttpError::from); + + // Run both operations in parallel + let (payload, mut client) = match select( + try_join( + pin!(fetch_and_process_request), + pin!(authenticate_and_connect), + ), + pin!(cancel.cancelled()), + ) + .await + { + Either::Left((result, _cancelled)) => result?, + Either::Right((_cancelled, _)) => { + return Err(SqlOverHttpError::Cancelled(SqlOverHttpCancel::Connect)) + } + }; let mut response = Response::builder() .status(StatusCode::OK) @@ -420,86 +538,140 @@ async fn handle_inner( // Now execute the query and return the result // let mut size = 0; - let result = - match payload { - Payload::Single(stmt) => { - let (status, results) = - query_to_json(&*client, stmt, &mut 0, raw_output, array_mode) - .await - 
.map_err(|e| { - client.discard(); - e - })?; - client.check_idle(status); - results - } - Payload::Batch(statements) => { - let (inner, mut discard) = client.inner(); - let mut builder = inner.build_transaction(); - if let Some(isolation_level) = txn_isolation_level { - builder = builder.isolation_level(isolation_level); + let result = match payload { + Payload::Single(stmt) => { + let mut size = 0; + let (inner, mut discard) = client.inner(); + let cancel_token = inner.cancel_token(); + let query = pin!(query_to_json( + &*inner, + stmt, + &mut size, + raw_output, + default_array_mode + )); + let cancelled = pin!(cancel.cancelled()); + let res = select(query, cancelled).await; + match res { + Either::Left((Ok((status, results)), _cancelled)) => { + discard.check_idle(status); + results } - if txn_read_only { - builder = builder.read_only(true); - } - if txn_deferrable { - builder = builder.deferrable(true); - } - - let transaction = builder.start().await.map_err(|e| { - // if we cannot start a transaction, we should return immediately - // and not return to the pool. 
connection is clearly broken + Either::Left((Err(e), _cancelled)) => { discard.discard(); - e - })?; - - let results = - match query_batch(&transaction, statements, &mut size, raw_output, array_mode) - .await - { - Ok(results) => { - let status = transaction.commit().await.map_err(|e| { - // if we cannot commit - for now don't return connection to pool - // TODO: get a query status from the error - discard.discard(); - e - })?; + return Err(e); + } + Either::Right((_cancelled, query)) => { + if let Err(err) = cancel_token.cancel_query(NoTls).await { + tracing::error!(?err, "could not cancel query"); + } + match time::timeout(time::Duration::from_millis(100), query).await { + Ok(Ok((status, results))) => { discard.check_idle(status); results } - Err(err) => { - let status = transaction.rollback().await.map_err(|e| { - // if we cannot rollback - for now don't return connection to pool - // TODO: get a query status from the error + Ok(Err(error)) => { + let db_error = match &error { + SqlOverHttpError::ConnectCompute( + HttpConnError::ConnectionError(e), + ) + | SqlOverHttpError::Postgres(e) => e.as_db_error(), + _ => None, + }; + + // if errored for some other reason, it might not be safe to return + if !db_error.is_some_and(|e| *e.code() == SqlState::QUERY_CANCELED) { discard.discard(); - e - })?; - discard.check_idle(status); - return Err(err); + } + + return Err(SqlOverHttpError::Cancelled(SqlOverHttpCancel::Postgres)); } - }; - - if txn_read_only { - response = response.header( - TXN_READ_ONLY.clone(), - HeaderValue::try_from(txn_read_only.to_string())?, - ); + Err(_timeout) => { + discard.discard(); + return Err(SqlOverHttpError::Cancelled(SqlOverHttpCancel::Postgres)); + } + } } - if txn_deferrable { - response = response.header( - TXN_DEFERRABLE.clone(), - HeaderValue::try_from(txn_deferrable.to_string())?, - ); - } - if let Some(txn_isolation_level) = txn_isolation_level_raw { - response = response.header(TXN_ISOLATION_LEVEL.clone(), txn_isolation_level); - } 
- json!({ "results": results }) } - }; + } + Payload::Batch(statements) => { + info!("starting transaction"); + let (inner, mut discard) = client.inner(); + let cancel_token = inner.cancel_token(); + let mut builder = inner.build_transaction(); + if let Some(isolation_level) = txn_isolation_level { + builder = builder.isolation_level(isolation_level); + } + if txn_read_only { + builder = builder.read_only(true); + } + if txn_deferrable { + builder = builder.deferrable(true); + } + + let transaction = builder.start().await.map_err(|e| { + // if we cannot start a transaction, we should return immediately + // and not return to the pool. connection is clearly broken + discard.discard(); + e + })?; + + let results = match query_batch( + cancel.child_token(), + &transaction, + statements, + &mut size, + raw_output, + default_array_mode, + ) + .await + { + Ok(results) => { + info!("commit"); + let status = transaction.commit().await.map_err(|e| { + // if we cannot commit - for now don't return connection to pool + // TODO: get a query status from the error + discard.discard(); + e + })?; + discard.check_idle(status); + results + } + Err(SqlOverHttpError::Cancelled(_)) => { + if let Err(err) = cancel_token.cancel_query(NoTls).await { + tracing::error!(?err, "could not cancel query"); + } + // TODO: after cancelling, wait to see if we can get a status. maybe the connection is still safe. 
+ discard.discard(); + + return Err(SqlOverHttpError::Cancelled(SqlOverHttpCancel::Postgres)); + } + Err(err) => { + info!("rollback"); + let status = transaction.rollback().await.map_err(|e| { + // if we cannot rollback - for now don't return connection to pool + // TODO: get a query status from the error + discard.discard(); + e + })?; + discard.check_idle(status); + return Err(err); + } + }; + + if txn_read_only { + response = response.header(TXN_READ_ONLY.clone(), &HEADER_VALUE_TRUE); + } + if txn_deferrable { + response = response.header(TXN_DEFERRABLE.clone(), &HEADER_VALUE_TRUE); + } + if let Some(txn_isolation_level) = txn_isolation_level_raw { + response = response.header(TXN_ISOLATION_LEVEL.clone(), txn_isolation_level); + } + json!({ "results": results }) + } + }; - ctx.set_success(); - ctx.log(); let metrics = client.metrics(); // how could this possibly fail @@ -519,19 +691,37 @@ async fn handle_inner( } async fn query_batch( + cancel: CancellationToken, transaction: &Transaction<'_>, queries: BatchQueryData, total_size: &mut usize, raw_output: bool, array_mode: bool, -) -> anyhow::Result> { +) -> Result, SqlOverHttpError> { let mut results = Vec::with_capacity(queries.queries.len()); let mut current_size = 0; for stmt in queries.queries { - // TODO: maybe we should check that the transaction bit is set here - let (_, values) = - query_to_json(transaction, stmt, &mut current_size, raw_output, array_mode).await?; - results.push(values); + let query = pin!(query_to_json( + transaction, + stmt, + &mut current_size, + raw_output, + array_mode + )); + let cancelled = pin!(cancel.cancelled()); + let res = select(query, cancelled).await; + match res { + // TODO: maybe we should check that the transaction bit is set here + Either::Left((Ok((_, values)), _cancelled)) => { + results.push(values); + } + Either::Left((Err(e), _cancelled)) => { + return Err(e); + } + Either::Right((_cancelled, _)) => { + return 
Err(SqlOverHttpError::Cancelled(SqlOverHttpCancel::Postgres)); + } + } } *total_size += current_size; Ok(results) @@ -542,15 +732,16 @@ async fn query_to_json( data: QueryData, current_size: &mut usize, raw_output: bool, - array_mode: bool, -) -> anyhow::Result<(ReadyForQueryStatus, Value)> { - let query_params = json_to_pg_text(data.params); - let row_stream = client.query_raw_txt(&data.query, query_params).await?; + default_array_mode: bool, +) -> Result<(ReadyForQueryStatus, Value), SqlOverHttpError> { + info!("executing query"); + let query_params = data.params; + let mut row_stream = std::pin::pin!(client.query_raw_txt(&data.query, query_params).await?); + info!("finished executing query"); // Manually drain the stream into a vector to leave row_stream hanging // around to get a command tag. Also check that the response is not too // big. - pin_mut!(row_stream); let mut rows: Vec = Vec::new(); while let Some(row) = row_stream.next().await { let row = row?; @@ -559,9 +750,7 @@ async fn query_to_json( // we don't have a streaming response support yet so this is to prevent OOM // from a malicious query (eg a cross join) if *current_size > MAX_RESPONSE_SIZE { - return Err(anyhow::anyhow!( - "response is too large (max is {MAX_RESPONSE_SIZE} bytes)" - )); + return Err(SqlOverHttpError::ResponseTooLarge); } } @@ -580,6 +769,13 @@ async fn query_to_json( } .and_then(|s| s.parse::().ok()); + info!( + rows = rows.len(), + ?ready, + command_tag, + "finished reading rows" + ); + let mut fields = vec![]; let mut columns = vec![]; @@ -596,6 +792,8 @@ async fn query_to_json( columns.push(client.get_type(c.type_oid()).await?); } + let array_mode = data.array_mode.unwrap_or(default_array_mode); + // convert rows to JSON let rows = rows .iter() @@ -614,389 +812,3 @@ async fn query_to_json( }), )) } - -// -// Convert postgres row with text-encoded values to JSON object -// -pub fn pg_text_row_to_json( - row: &Row, - columns: &[Type], - raw_output: bool, - array_mode: bool, -) 
-> Result { - let iter = row - .columns() - .iter() - .zip(columns) - .enumerate() - .map(|(i, (column, typ))| { - let name = column.name(); - let pg_value = row.as_text(i)?; - let json_value = if raw_output { - match pg_value { - Some(v) => Value::String(v.to_string()), - None => Value::Null, - } - } else { - pg_text_to_json(pg_value, typ)? - }; - Ok((name.to_string(), json_value)) - }); - - if array_mode { - // drop keys and aggregate into array - let arr = iter - .map(|r| r.map(|(_key, val)| val)) - .collect::, anyhow::Error>>()?; - Ok(Value::Array(arr)) - } else { - let obj = iter.collect::, anyhow::Error>>()?; - Ok(Value::Object(obj)) - } -} - -// -// Convert postgres text-encoded value to JSON value -// -pub fn pg_text_to_json(pg_value: Option<&str>, pg_type: &Type) -> Result { - if let Some(val) = pg_value { - if let Kind::Array(elem_type) = pg_type.kind() { - return pg_array_parse(val, elem_type); - } - - match *pg_type { - Type::BOOL => Ok(Value::Bool(val == "t")), - Type::INT2 | Type::INT4 => { - let val = val.parse::()?; - Ok(Value::Number(serde_json::Number::from(val))) - } - Type::FLOAT4 | Type::FLOAT8 => { - let fval = val.parse::()?; - let num = serde_json::Number::from_f64(fval); - if let Some(num) = num { - Ok(Value::Number(num)) - } else { - // Pass Nan, Inf, -Inf as strings - // JS JSON.stringify() does converts them to null, but we - // want to preserve them, so we pass them as strings - Ok(Value::String(val.to_string())) - } - } - Type::JSON | Type::JSONB => Ok(serde_json::from_str(val)?), - _ => Ok(Value::String(val.to_string())), - } - } else { - Ok(Value::Null) - } -} - -// -// Parse postgres array into JSON array. -// -// This is a bit involved because we need to handle nested arrays and quoted -// values. Unlike postgres we don't check that all nested arrays have the same -// dimensions, we just return them as is. 
-// -fn pg_array_parse(pg_array: &str, elem_type: &Type) -> Result { - _pg_array_parse(pg_array, elem_type, false).map(|(v, _)| v) -} - -fn _pg_array_parse( - pg_array: &str, - elem_type: &Type, - nested: bool, -) -> Result<(Value, usize), anyhow::Error> { - let mut pg_array_chr = pg_array.char_indices(); - let mut level = 0; - let mut quote = false; - let mut entries: Vec = Vec::new(); - let mut entry = String::new(); - - // skip bounds decoration - if let Some('[') = pg_array.chars().next() { - for (_, c) in pg_array_chr.by_ref() { - if c == '=' { - break; - } - } - } - - fn push_checked( - entry: &mut String, - entries: &mut Vec, - elem_type: &Type, - ) -> Result<(), anyhow::Error> { - if !entry.is_empty() { - // While in usual postgres response we get nulls as None and everything else - // as Some(&str), in arrays we get NULL as unquoted 'NULL' string (while - // string with value 'NULL' will be represented by '"NULL"'). So catch NULLs - // here while we have quotation info and convert them to None. 
- if entry == "NULL" { - entries.push(pg_text_to_json(None, elem_type)?); - } else { - entries.push(pg_text_to_json(Some(entry), elem_type)?); - } - entry.clear(); - } - - Ok(()) - } - - while let Some((mut i, mut c)) = pg_array_chr.next() { - let mut escaped = false; - - if c == '\\' { - escaped = true; - (i, c) = pg_array_chr.next().unwrap(); - } - - match c { - '{' if !quote => { - level += 1; - if level > 1 { - let (res, off) = _pg_array_parse(&pg_array[i..], elem_type, true)?; - entries.push(res); - for _ in 0..off - 1 { - pg_array_chr.next(); - } - } - } - '}' if !quote => { - level -= 1; - if level == 0 { - push_checked(&mut entry, &mut entries, elem_type)?; - if nested { - return Ok((Value::Array(entries), i)); - } - } - } - '"' if !escaped => { - if quote { - // end of quoted string, so push it manually without any checks - // for emptiness or nulls - entries.push(pg_text_to_json(Some(&entry), elem_type)?); - entry.clear(); - } - quote = !quote; - } - ',' if !quote => { - push_checked(&mut entry, &mut entries, elem_type)?; - } - _ => { - entry.push(c); - } - } - } - - if level != 0 { - return Err(anyhow::anyhow!("unbalanced array")); - } - - Ok((Value::Array(entries), 0)) -} - -#[cfg(test)] -mod tests { - use super::*; - use serde_json::json; - - #[test] - fn test_atomic_types_to_pg_params() { - let json = vec![Value::Bool(true), Value::Bool(false)]; - let pg_params = json_to_pg_text(json); - assert_eq!( - pg_params, - vec![Some("true".to_owned()), Some("false".to_owned())] - ); - - let json = vec![Value::Number(serde_json::Number::from(42))]; - let pg_params = json_to_pg_text(json); - assert_eq!(pg_params, vec![Some("42".to_owned())]); - - let json = vec![Value::String("foo\"".to_string())]; - let pg_params = json_to_pg_text(json); - assert_eq!(pg_params, vec![Some("foo\"".to_owned())]); - - let json = vec![Value::Null]; - let pg_params = json_to_pg_text(json); - assert_eq!(pg_params, vec![None]); - } - - #[test] - fn test_json_array_to_pg_array() { - // 
atoms and escaping - let json = "[true, false, null, \"NULL\", 42, \"foo\", \"bar\\\"-\\\\\"]"; - let json: Value = serde_json::from_str(json).unwrap(); - let pg_params = json_to_pg_text(vec![json]); - assert_eq!( - pg_params, - vec![Some( - "{true,false,NULL,\"NULL\",42,\"foo\",\"bar\\\"-\\\\\"}".to_owned() - )] - ); - - // nested arrays - let json = "[[true, false], [null, 42], [\"foo\", \"bar\\\"-\\\\\"]]"; - let json: Value = serde_json::from_str(json).unwrap(); - let pg_params = json_to_pg_text(vec![json]); - assert_eq!( - pg_params, - vec![Some( - "{{true,false},{NULL,42},{\"foo\",\"bar\\\"-\\\\\"}}".to_owned() - )] - ); - // array of objects - let json = r#"[{"foo": 1},{"bar": 2}]"#; - let json: Value = serde_json::from_str(json).unwrap(); - let pg_params = json_to_pg_text(vec![json]); - assert_eq!( - pg_params, - vec![Some(r#"{"{\"foo\":1}","{\"bar\":2}"}"#.to_owned())] - ); - } - - #[test] - fn test_atomic_types_parse() { - assert_eq!( - pg_text_to_json(Some("foo"), &Type::TEXT).unwrap(), - json!("foo") - ); - assert_eq!(pg_text_to_json(None, &Type::TEXT).unwrap(), json!(null)); - assert_eq!(pg_text_to_json(Some("42"), &Type::INT4).unwrap(), json!(42)); - assert_eq!(pg_text_to_json(Some("42"), &Type::INT2).unwrap(), json!(42)); - assert_eq!( - pg_text_to_json(Some("42"), &Type::INT8).unwrap(), - json!("42") - ); - assert_eq!( - pg_text_to_json(Some("42.42"), &Type::FLOAT8).unwrap(), - json!(42.42) - ); - assert_eq!( - pg_text_to_json(Some("42.42"), &Type::FLOAT4).unwrap(), - json!(42.42) - ); - assert_eq!( - pg_text_to_json(Some("NaN"), &Type::FLOAT4).unwrap(), - json!("NaN") - ); - assert_eq!( - pg_text_to_json(Some("Infinity"), &Type::FLOAT4).unwrap(), - json!("Infinity") - ); - assert_eq!( - pg_text_to_json(Some("-Infinity"), &Type::FLOAT4).unwrap(), - json!("-Infinity") - ); - - let json: Value = - serde_json::from_str("{\"s\":\"str\",\"n\":42,\"f\":4.2,\"a\":[null,3,\"a\"]}") - .unwrap(); - assert_eq!( - pg_text_to_json( - 
Some(r#"{"s":"str","n":42,"f":4.2,"a":[null,3,"a"]}"#), - &Type::JSONB - ) - .unwrap(), - json - ); - } - - #[test] - fn test_pg_array_parse_text() { - fn pt(pg_arr: &str) -> Value { - pg_array_parse(pg_arr, &Type::TEXT).unwrap() - } - assert_eq!( - pt(r#"{"aa\"\\\,a",cha,"bbbb"}"#), - json!(["aa\"\\,a", "cha", "bbbb"]) - ); - assert_eq!( - pt(r#"{{"foo","bar"},{"bee","bop"}}"#), - json!([["foo", "bar"], ["bee", "bop"]]) - ); - assert_eq!( - pt(r#"{{{{"foo",NULL,"bop",bup}}}}"#), - json!([[[["foo", null, "bop", "bup"]]]]) - ); - assert_eq!( - pt(r#"{{"1",2,3},{4,NULL,6},{NULL,NULL,NULL}}"#), - json!([["1", "2", "3"], ["4", null, "6"], [null, null, null]]) - ); - } - - #[test] - fn test_pg_array_parse_bool() { - fn pb(pg_arr: &str) -> Value { - pg_array_parse(pg_arr, &Type::BOOL).unwrap() - } - assert_eq!(pb(r#"{t,f,t}"#), json!([true, false, true])); - assert_eq!(pb(r#"{{t,f,t}}"#), json!([[true, false, true]])); - assert_eq!( - pb(r#"{{t,f},{f,t}}"#), - json!([[true, false], [false, true]]) - ); - assert_eq!( - pb(r#"{{t,NULL},{NULL,f}}"#), - json!([[true, null], [null, false]]) - ); - } - - #[test] - fn test_pg_array_parse_numbers() { - fn pn(pg_arr: &str, ty: &Type) -> Value { - pg_array_parse(pg_arr, ty).unwrap() - } - assert_eq!(pn(r#"{1,2,3}"#, &Type::INT4), json!([1, 2, 3])); - assert_eq!(pn(r#"{1,2,3}"#, &Type::INT2), json!([1, 2, 3])); - assert_eq!(pn(r#"{1,2,3}"#, &Type::INT8), json!(["1", "2", "3"])); - assert_eq!(pn(r#"{1,2,3}"#, &Type::FLOAT4), json!([1.0, 2.0, 3.0])); - assert_eq!(pn(r#"{1,2,3}"#, &Type::FLOAT8), json!([1.0, 2.0, 3.0])); - assert_eq!( - pn(r#"{1.1,2.2,3.3}"#, &Type::FLOAT4), - json!([1.1, 2.2, 3.3]) - ); - assert_eq!( - pn(r#"{1.1,2.2,3.3}"#, &Type::FLOAT8), - json!([1.1, 2.2, 3.3]) - ); - assert_eq!( - pn(r#"{NaN,Infinity,-Infinity}"#, &Type::FLOAT4), - json!(["NaN", "Infinity", "-Infinity"]) - ); - assert_eq!( - pn(r#"{NaN,Infinity,-Infinity}"#, &Type::FLOAT8), - json!(["NaN", "Infinity", "-Infinity"]) - ); - } - - #[test] - fn 
test_pg_array_with_decoration() { - fn p(pg_arr: &str) -> Value { - pg_array_parse(pg_arr, &Type::INT2).unwrap() - } - assert_eq!( - p(r#"[1:1][-2:-1][3:5]={{{1,2,3},{4,5,6}}}"#), - json!([[[1, 2, 3], [4, 5, 6]]]) - ); - } - #[test] - fn test_pg_array_parse_json() { - fn pt(pg_arr: &str) -> Value { - pg_array_parse(pg_arr, &Type::JSONB).unwrap() - } - assert_eq!(pt(r#"{"{}"}"#), json!([{}])); - assert_eq!( - pt(r#"{"{\"foo\": 1, \"bar\": 2}"}"#), - json!([{"foo": 1, "bar": 2}]) - ); - assert_eq!( - pt(r#"{"{\"foo\": 1}", "{\"bar\": 2}"}"#), - json!([{"foo": 1}, {"bar": 2}]) - ); - assert_eq!( - pt(r#"{{"{\"foo\": 1}", "{\"bar\": 2}"}}"#), - json!([[{"foo": 1}, {"bar": 2}]]) - ); - } -} diff --git a/proxy/src/serverless/tls_listener.rs b/proxy/src/serverless/tls_listener.rs new file mode 100644 index 0000000000..cce02e3850 --- /dev/null +++ b/proxy/src/serverless/tls_listener.rs @@ -0,0 +1,130 @@ +use std::{ + convert::Infallible, + pin::Pin, + task::{Context, Poll}, + time::Duration, +}; + +use hyper::server::{accept::Accept, conn::AddrStream}; +use pin_project_lite::pin_project; +use tokio::{ + io::{AsyncRead, AsyncWrite}, + task::JoinSet, + time::timeout, +}; +use tokio_rustls::{server::TlsStream, TlsAcceptor}; +use tracing::{info, warn}; + +use crate::{ + metrics::TLS_HANDSHAKE_FAILURES, + protocol2::{WithClientIp, WithConnectionGuard}, +}; + +pin_project! { + /// Wraps a `Stream` of connections (such as a TCP listener) so that each connection is itself + /// encrypted using TLS. + pub(crate) struct TlsListener { + #[pin] + listener: A, + tls: TlsAcceptor, + waiting: JoinSet>>, + timeout: Duration, + protocol: &'static str, + } +} + +impl TlsListener { + /// Create a `TlsListener` with default options. 
+ pub(crate) fn new( + tls: TlsAcceptor, + listener: A, + protocol: &'static str, + timeout: Duration, + ) -> Self { + TlsListener { + listener, + tls, + waiting: JoinSet::new(), + timeout, + protocol, + } + } +} + +impl Accept for TlsListener +where + A: Accept>>, + A::Error: std::error::Error, + A::Conn: AsyncRead + AsyncWrite + Unpin + Send + 'static, +{ + type Conn = TlsStream; + + type Error = Infallible; + + fn poll_accept( + self: Pin<&mut Self>, + cx: &mut Context<'_>, + ) -> Poll>> { + let mut this = self.project(); + + loop { + match this.listener.as_mut().poll_accept(cx) { + Poll::Pending => break, + Poll::Ready(Some(Ok(mut conn))) => { + let t = *this.timeout; + let tls = this.tls.clone(); + let protocol = *this.protocol; + this.waiting.spawn(async move { + let peer_addr = match conn.inner.wait_for_addr().await { + Ok(Some(addr)) => addr, + Err(e) => { + tracing::error!("failed to accept TCP connection: invalid PROXY protocol V2 header: {e:#}"); + return None; + } + Ok(None) => conn.inner.inner.remote_addr() + }; + + let accept = tls.accept(conn); + match timeout(t, accept).await { + Ok(Ok(conn)) => Some(conn), + // The handshake failed, try getting another connection from the queue + Ok(Err(e)) => { + TLS_HANDSHAKE_FAILURES.inc(); + warn!(%peer_addr, protocol, "failed to accept TLS connection: {e:?}"); + None + } + // The handshake timed out, try getting another connection from the queue + Err(_) => { + TLS_HANDSHAKE_FAILURES.inc(); + warn!(%peer_addr, protocol, "failed to accept TLS connection: timeout"); + None + } + } + }); + } + Poll::Ready(Some(Err(e))) => { + tracing::error!("error accepting TCP connection: {e}"); + continue; + } + Poll::Ready(None) => return Poll::Ready(None), + } + } + + loop { + return match this.waiting.poll_join_next(cx) { + Poll::Ready(Some(Ok(Some(conn)))) => { + info!(protocol = this.protocol, "accepted new TLS connection"); + Poll::Ready(Some(Ok(conn))) + } + // The handshake failed to complete, try getting another 
connection from the queue + Poll::Ready(Some(Ok(None))) => continue, + // The handshake panicked or was cancelled. ignore and get another connection + Poll::Ready(Some(Err(e))) => { + tracing::warn!("handshake aborted: {e}"); + continue; + } + _ => Poll::Pending, + }; + } + } +} diff --git a/proxy/src/serverless/websocket.rs b/proxy/src/serverless/websocket.rs index a6529c920a..a72ede6d0a 100644 --- a/proxy/src/serverless/websocket.rs +++ b/proxy/src/serverless/websocket.rs @@ -1,8 +1,9 @@ use crate::{ - cancellation::CancelMap, + cancellation::CancellationHandler, config::ProxyConfig, context::RequestMonitoring, - error::io_error, + error::{io_error, ReportableError}, + metrics::NUM_CLIENT_CONNECTION_GAUGE, proxy::{handle_client, ClientMode}, rate_limiter::EndpointRateLimiter, }; @@ -131,23 +132,46 @@ impl AsyncBufRead for WebSocketRw { pub async fn serve_websocket( config: &'static ProxyConfig, - ctx: &mut RequestMonitoring, + mut ctx: RequestMonitoring, websocket: HyperWebsocket, - cancel_map: &CancelMap, + cancellation_handler: Arc, hostname: Option, endpoint_rate_limiter: Arc, ) -> anyhow::Result<()> { let websocket = websocket.await?; - handle_client( + let conn_gauge = NUM_CLIENT_CONNECTION_GAUGE + .with_label_values(&["ws"]) + .guard(); + + let res = handle_client( config, - ctx, - cancel_map, + &mut ctx, + cancellation_handler, WebSocketRw::new(websocket), ClientMode::Websockets { hostname }, endpoint_rate_limiter, + conn_gauge, ) - .await?; - Ok(()) + .await; + + match res { + Err(e) => { + // todo: log and push to ctx the error kind + ctx.set_error_kind(e.get_error_kind()); + ctx.log(); + Err(e.into()) + } + Ok(None) => { + ctx.set_success(); + ctx.log(); + Ok(()) + } + Ok(Some(p)) => { + ctx.set_success(); + ctx.log(); + p.proxy_pass().await + } + } } #[cfg(test)] diff --git a/proxy/src/stream.rs b/proxy/src/stream.rs index f48b3fe39f..b6b7a85659 100644 --- a/proxy/src/stream.rs +++ b/proxy/src/stream.rs @@ -1,6 +1,6 @@ use 
crate::config::TlsServerEndPoint; -use crate::error::UserFacingError; -use anyhow::bail; +use crate::error::{ErrorKind, ReportableError, UserFacingError}; +use crate::metrics::TLS_HANDSHAKE_FAILURES; use bytes::BytesMut; use pq_proto::framed::{ConnectionError, Framed}; @@ -73,6 +73,30 @@ impl PqStream { } } +#[derive(Debug)] +pub struct ReportedError { + source: anyhow::Error, + error_kind: ErrorKind, +} + +impl std::fmt::Display for ReportedError { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + self.source.fmt(f) + } +} + +impl std::error::Error for ReportedError { + fn source(&self) -> Option<&(dyn std::error::Error + 'static)> { + self.source.source() + } +} + +impl ReportableError for ReportedError { + fn get_error_kind(&self) -> ErrorKind { + self.error_kind + } +} + impl PqStream { /// Write the message into an internal buffer, but don't flush the underlying stream. pub fn write_message_noflush(&mut self, message: &BeMessage<'_>) -> io::Result<&mut Self> { @@ -98,24 +122,52 @@ impl PqStream { /// Write the error message using [`Self::write_message`], then re-throw it. /// Allowing string literals is safe under the assumption they might not contain any runtime info. /// This method exists due to `&str` not implementing `Into`. 
- pub async fn throw_error_str(&mut self, error: &'static str) -> anyhow::Result { - tracing::info!("forwarding error to user: {error}"); - self.write_message(&BeMessage::ErrorResponse(error, None)) - .await?; - bail!(error) + pub async fn throw_error_str( + &mut self, + msg: &'static str, + error_kind: ErrorKind, + ) -> Result { + tracing::info!( + kind = error_kind.to_metric_label(), + msg, + "forwarding error to user" + ); + + // already error case, ignore client IO error + let _: Result<_, std::io::Error> = self + .write_message(&BeMessage::ErrorResponse(msg, None)) + .await; + + Err(ReportedError { + source: anyhow::anyhow!(msg), + error_kind, + }) } /// Write the error message using [`Self::write_message`], then re-throw it. /// Trait [`UserFacingError`] acts as an allowlist for error types. - pub async fn throw_error(&mut self, error: E) -> anyhow::Result + pub async fn throw_error(&mut self, error: E) -> Result where E: UserFacingError + Into, { + let error_kind = error.get_error_kind(); let msg = error.to_string_client(); - tracing::info!("forwarding error to user: {msg}"); - self.write_message(&BeMessage::ErrorResponse(&msg, None)) - .await?; - bail!(error) + tracing::info!( + kind=error_kind.to_metric_label(), + error=%error, + msg, + "forwarding error to user" + ); + + // already error case, ignore client IO error + let _: Result<_, std::io::Error> = self + .write_message(&BeMessage::ErrorResponse(&msg, None)) + .await; + + Err(ReportedError { + source: anyhow::anyhow!(error), + error_kind, + }) } } @@ -173,7 +225,10 @@ impl Stream { /// If possible, upgrade raw stream into a secure TLS-based stream. pub async fn upgrade(self, cfg: Arc) -> Result, StreamUpgradeError> { match self { - Stream::Raw { raw } => Ok(tokio_rustls::TlsAcceptor::from(cfg).accept(raw).await?), + Stream::Raw { raw } => Ok(tokio_rustls::TlsAcceptor::from(cfg) + .accept(raw) + .await + .inspect_err(|_| TLS_HANDSHAKE_FAILURES.inc())?), Stream::Tls { .. 
} => Err(StreamUpgradeError::AlreadyTls), } } diff --git a/pyproject.toml b/pyproject.toml index 24e075b489..6dff112a5e 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -33,22 +33,27 @@ psutil = "^5.9.4" types-psutil = "^5.9.5.12" types-toml = "^0.10.8.6" pytest-httpserver = "^1.0.8" -aiohttp = "3.9.0" +aiohttp = "3.9.2" pytest-rerunfailures = "^13.0" types-pytest-lazy-fixture = "^0.6.3.3" pytest-split = "^0.8.1" zstandard = "^0.21.0" +httpx = {extras = ["http2"], version = "^0.26.0"} +pytest-repeat = "^0.9.3" [tool.poetry.group.dev.dependencies] mypy = "==1.3.0" -ruff = "^0.1.11" +ruff = "^0.2.2" [build-system] requires = ["poetry-core>=1.0.0"] build-backend = "poetry.core.masonry.api" [tool.mypy] -exclude = "^vendor/" +exclude = [ + "^vendor/", + "^target/", +] check_untyped_defs = true # Help mypy find imports when running against list of individual files. # Without this line it would behave differently when executed on the entire project. @@ -72,7 +77,13 @@ ignore_missing_imports = true [tool.ruff] target-version = "py39" -extend-exclude = ["vendor/"] +extend-exclude = [ + "vendor/", + "target/", +] +line-length = 100 # this setting is rather guidance, it won't fail if it can't make the shorter + +[tool.ruff.lint] ignore = [ "E501", # Line too long, we don't want to be too strict about it ] @@ -83,4 +94,3 @@ select = [ "W", # pycodestyle "B", # bugbear ] -line-length = 100 # this setting is rather guidance, it won't fail if it can't make the shorter diff --git a/rust-toolchain.toml b/rust-toolchain.toml index 9b5a965f7d..b0949c32b1 100644 --- a/rust-toolchain.toml +++ b/rust-toolchain.toml @@ -1,5 +1,5 @@ [toolchain] -channel = "1.75.0" +channel = "1.76.0" profile = "default" # The default profile includes rustc, rust-std, cargo, rust-docs, rustfmt and clippy. 
# https://rust-lang.github.io/rustup/concepts/profiles.html diff --git a/s3_scrubber/src/checks.rs b/s3_scrubber/src/checks.rs index 7b9f96dce3..7c0f699958 100644 --- a/s3_scrubber/src/checks.rs +++ b/s3_scrubber/src/checks.rs @@ -11,7 +11,7 @@ use utils::id::TimelineId; use crate::cloud_admin_api::BranchData; use crate::metadata_stream::stream_listing; use crate::{download_object_with_retries, RootTarget, TenantShardTimelineId}; -use futures_util::{pin_mut, StreamExt}; +use futures_util::StreamExt; use pageserver::tenant::remote_timeline_client::parse_remote_index_path; use pageserver::tenant::storage_layer::LayerFileName; use pageserver::tenant::IndexPart; @@ -285,8 +285,7 @@ pub(crate) async fn list_timeline_blobs( let mut index_parts: Vec = Vec::new(); let mut initdb_archive: bool = false; - let stream = stream_listing(s3_client, &timeline_dir_target); - pin_mut!(stream); + let mut stream = std::pin::pin!(stream_listing(s3_client, &timeline_dir_target)); while let Some(obj) = stream.next().await { let obj = obj?; let key = obj.key(); diff --git a/s3_scrubber/src/cloud_admin_api.rs b/s3_scrubber/src/cloud_admin_api.rs index 151421c84f..45cac23690 100644 --- a/s3_scrubber/src/cloud_admin_api.rs +++ b/s3_scrubber/src/cloud_admin_api.rs @@ -1,11 +1,7 @@ -#![allow(unused)] - -use std::str::FromStr; use std::time::Duration; use chrono::{DateTime, Utc}; use hex::FromHex; -use pageserver::tenant::Tenant; use reqwest::{header, Client, StatusCode, Url}; use serde::Deserialize; use tokio::sync::Semaphore; @@ -290,7 +286,7 @@ impl CloudAdminApiClient { tokio::time::sleep(Duration::from_millis(500)).await; continue; } - status => { + _status => { return Err(Error::new( "List active projects".to_string(), ErrorKind::ResponseStatus(response.status()), diff --git a/s3_scrubber/src/garbage.rs b/s3_scrubber/src/garbage.rs index 93bb115883..7a08dffc66 100644 --- a/s3_scrubber/src/garbage.rs +++ b/s3_scrubber/src/garbage.rs @@ -12,7 +12,7 @@ use aws_sdk_s3::{ types::{Delete, 
ObjectIdentifier}, Client, }; -use futures_util::{pin_mut, TryStreamExt}; +use futures_util::TryStreamExt; use pageserver_api::shard::TenantShardId; use serde::{Deserialize, Serialize}; use tokio_stream::StreamExt; @@ -199,12 +199,12 @@ async fn find_garbage_inner( } } }); - let tenants_checked = tenants_checked.try_buffer_unordered(CONSOLE_CONCURRENCY); + let mut tenants_checked = + std::pin::pin!(tenants_checked.try_buffer_unordered(CONSOLE_CONCURRENCY)); // Process the results of Tenant checks. If a Tenant is garbage, it goes into // the `GarbageList`. Else it goes into `active_tenants` for more detailed timeline // checks if they are enabled by the `depth` parameter. - pin_mut!(tenants_checked); let mut garbage = GarbageList::new(node_kind, bucket_config); let mut active_tenants: Vec = vec![]; let mut counter = 0; @@ -267,10 +267,10 @@ async fn find_garbage_inner( .map(|r| (ttid, r)) } }); - let timelines_checked = timelines_checked.try_buffer_unordered(CONSOLE_CONCURRENCY); + let mut timelines_checked = + std::pin::pin!(timelines_checked.try_buffer_unordered(CONSOLE_CONCURRENCY)); // Update the GarbageList with any timelines which appear not to exist. 
- pin_mut!(timelines_checked); while let Some(result) = timelines_checked.next().await { let (ttid, console_result) = result?; if garbage.maybe_append(GarbageEntity::Timeline(ttid), console_result) { @@ -425,9 +425,9 @@ pub async fn purge_garbage( } } }); - let get_objects_results = get_objects_results.try_buffer_unordered(S3_CONCURRENCY); + let mut get_objects_results = + std::pin::pin!(get_objects_results.try_buffer_unordered(S3_CONCURRENCY)); - pin_mut!(get_objects_results); let mut objects_to_delete = Vec::new(); while let Some(result) = get_objects_results.next().await { let mut object_list = result?; diff --git a/s3_scrubber/src/scan_metadata.rs b/s3_scrubber/src/scan_metadata.rs index 4b63bb3884..6ff9783875 100644 --- a/s3_scrubber/src/scan_metadata.rs +++ b/s3_scrubber/src/scan_metadata.rs @@ -7,7 +7,7 @@ use crate::checks::{ use crate::metadata_stream::{stream_tenant_timelines, stream_tenants}; use crate::{init_remote, BucketConfig, NodeKind, RootTarget, TenantShardTimelineId}; use aws_sdk_s3::Client; -use futures_util::{pin_mut, StreamExt, TryStreamExt}; +use futures_util::{StreamExt, TryStreamExt}; use histogram::Histogram; use pageserver::tenant::remote_timeline_client::remote_layer_path; use pageserver::tenant::IndexPart; @@ -226,7 +226,7 @@ pub async fn scan_metadata( Ok((ttid, data)) } let timelines = timelines.map_ok(|ttid| report_on_timeline(&s3_client, &target, ttid)); - let timelines = timelines.try_buffered(CONCURRENCY); + let mut timelines = std::pin::pin!(timelines.try_buffered(CONCURRENCY)); // We must gather all the TenantShardTimelineId->S3TimelineBlobData for each tenant, because different // shards in the same tenant might refer to one anothers' keys if a shard split has happened. @@ -309,7 +309,6 @@ pub async fn scan_metadata( // all results for the same tenant will be adjacent. We accumulate these, // and then call `analyze_tenant` to flush, when we see the next tenant ID. 
let mut summary = MetadataSummary::new(); - pin_mut!(timelines); while let Some(i) = timelines.next().await { let (ttid, data) = i?; summary.update_data(&data); diff --git a/safekeeper/Cargo.toml b/safekeeper/Cargo.toml index 364cad7892..cb4a1def1f 100644 --- a/safekeeper/Cargo.toml +++ b/safekeeper/Cargo.toml @@ -61,3 +61,10 @@ tokio-stream.workspace = true utils.workspace = true workspace_hack.workspace = true + +[dev-dependencies] +walproposer.workspace = true +rand.workspace = true +desim.workspace = true +tracing.workspace = true +tracing-subscriber = { workspace = true, features = ["json"] } diff --git a/safekeeper/src/auth.rs b/safekeeper/src/auth.rs index bf4905aaa7..dd9058c468 100644 --- a/safekeeper/src/auth.rs +++ b/safekeeper/src/auth.rs @@ -12,8 +12,12 @@ pub fn check_permission(claims: &Claims, tenant_id: Option) -> Result< } Ok(()) } - (Scope::PageServerApi, _) => Err(AuthError( - "PageServerApi scope makes no sense for Safekeeper".into(), + (Scope::Admin | Scope::PageServerApi | Scope::GenerationsApi, _) => Err(AuthError( + format!( + "JWT scope '{:?}' is ineligible for Safekeeper auth", + claims.scope + ) + .into(), )), (Scope::SafekeeperData, _) => Ok(()), } diff --git a/safekeeper/src/bin/safekeeper.rs b/safekeeper/src/bin/safekeeper.rs index 33047051df..3c4c81e499 100644 --- a/safekeeper/src/bin/safekeeper.rs +++ b/safekeeper/src/bin/safekeeper.rs @@ -166,6 +166,10 @@ struct Args { /// useful for debugging. #[arg(long)] current_thread_runtime: bool, + /// Keep horizon for walsenders, i.e. don't remove WAL segments that are + /// still needed for existing replication connection. + #[arg(long)] + walsenders_keep_horizon: bool, } // Like PathBufValueParser, but allows empty string. 
@@ -295,6 +299,7 @@ async fn main() -> anyhow::Result<()> { pg_tenant_only_auth, http_auth, current_thread_runtime: args.current_thread_runtime, + walsenders_keep_horizon: args.walsenders_keep_horizon, }; // initialize sentry if SENTRY_DSN is provided diff --git a/safekeeper/src/control_file.rs b/safekeeper/src/control_file.rs index f1daddd7c3..d822c87c0e 100644 --- a/safekeeper/src/control_file.rs +++ b/safekeeper/src/control_file.rs @@ -3,8 +3,9 @@ use anyhow::{bail, ensure, Context, Result}; use byteorder::{LittleEndian, ReadBytesExt, WriteBytesExt}; use camino::Utf8PathBuf; -use tokio::fs::{self, File}; +use tokio::fs::File; use tokio::io::AsyncWriteExt; +use utils::crashsafe::durable_rename; use std::io::Read; use std::ops::Deref; @@ -18,8 +19,6 @@ use utils::{bin_ser::LeSer, id::TenantTimelineId}; use crate::SafeKeeperConf; -use std::convert::TryInto; - pub const SK_MAGIC: u32 = 0xcafeceefu32; pub const SK_FORMAT_VERSION: u32 = 7; @@ -203,35 +202,8 @@ impl Storage for FileStorage { ) })?; - // fsync the file - if !self.conf.no_sync { - control_partial.sync_all().await.with_context(|| { - format!( - "failed to sync partial control file at {}", - control_partial_path - ) - })?; - } - let control_path = self.timeline_dir.join(CONTROL_FILE_NAME); - - // rename should be atomic - fs::rename(&control_partial_path, &control_path).await?; - // this sync is not required by any standard but postgres does this (see durable_rename) - if !self.conf.no_sync { - let new_f = File::open(&control_path).await?; - new_f - .sync_all() - .await - .with_context(|| format!("failed to sync control file at: {}", &control_path))?; - - // fsync the directory (linux specific) - let tli_dir = File::open(&self.timeline_dir).await?; - tli_dir - .sync_all() - .await - .context("failed to sync control file directory")?; - } + durable_rename(&control_partial_path, &control_path, !self.conf.no_sync).await?; // update internal state self.state = s.clone(); @@ -245,11 +217,9 @@ impl Storage for 
FileStorage { #[cfg(test)] mod test { - use super::FileStorage; use super::*; - use crate::SafeKeeperConf; - use anyhow::Result; - use utils::{id::TenantTimelineId, lsn::Lsn}; + use tokio::fs; + use utils::lsn::Lsn; fn stub_conf() -> SafeKeeperConf { let workdir = camino_tempfile::tempdir().unwrap().into_path(); diff --git a/safekeeper/src/handler.rs b/safekeeper/src/handler.rs index 761541168c..f45bfb95fa 100644 --- a/safekeeper/src/handler.rs +++ b/safekeeper/src/handler.rs @@ -2,8 +2,7 @@ //! protocol commands. use anyhow::Context; -use std::str::FromStr; -use std::str::{self}; +use std::str::{self, FromStr}; use std::sync::Arc; use tokio::io::{AsyncRead, AsyncWrite}; use tracing::{debug, info, info_span, Instrument}; @@ -16,8 +15,8 @@ use crate::safekeeper::Term; use crate::timeline::TimelineError; use crate::wal_service::ConnectionId; use crate::{GlobalTimelines, SafeKeeperConf}; +use postgres_backend::PostgresBackend; use postgres_backend::QueryError; -use postgres_backend::{self, PostgresBackend}; use postgres_ffi::PG_TLI; use pq_proto::{BeMessage, FeStartupPacket, RowDescriptor, INT4_OID, TEXT_OID}; use regex::Regex; diff --git a/safekeeper/src/http/routes.rs b/safekeeper/src/http/routes.rs index 919b6b2982..a0c0c7ca4c 100644 --- a/safekeeper/src/http/routes.rs +++ b/safekeeper/src/http/routes.rs @@ -28,7 +28,7 @@ use crate::safekeeper::Term; use crate::safekeeper::{ServerInfo, TermLsn}; use crate::send_wal::WalSenderState; use crate::timeline::PeerInfo; -use crate::{copy_timeline, debug_dump, pull_timeline}; +use crate::{copy_timeline, debug_dump, patch_control_file, pull_timeline}; use crate::timelines_global_map::TimelineDeleteForceResult; use crate::GlobalTimelines; @@ -465,6 +465,26 @@ async fn dump_debug_handler(mut request: Request) -> Result Ok(response) } +async fn patch_control_file_handler( + mut request: Request, +) -> Result, ApiError> { + check_permission(&request, None)?; + + let ttid = TenantTimelineId::new( + parse_request_param(&request, 
"tenant_id")?, + parse_request_param(&request, "timeline_id")?, + ); + + let tli = GlobalTimelines::get(ttid).map_err(ApiError::from)?; + + let patch_request: patch_control_file::Request = json_request(&mut request).await?; + let response = patch_control_file::handle_request(tli, patch_request) + .await + .map_err(ApiError::InternalServerError)?; + + json_response(StatusCode::OK, response) +} + /// Safekeeper http router. pub fn make_router(conf: SafeKeeperConf) -> RouterBuilder { let mut router = endpoint::make_router(); @@ -526,6 +546,10 @@ pub fn make_router(conf: SafeKeeperConf) -> RouterBuilder "/v1/tenant/:tenant_id/timeline/:source_timeline_id/copy", |r| request_span(r, timeline_copy_handler), ) + .patch( + "/v1/tenant/:tenant_id/timeline/:timeline_id/control_file", + |r| request_span(r, patch_control_file_handler), + ) // for tests .post("/v1/record_safekeeper_info/:tenant_id/:timeline_id", |r| { request_span(r, record_safekeeper_info) diff --git a/safekeeper/src/lib.rs b/safekeeper/src/lib.rs index f18a1ec22d..ce4b4d7bd0 100644 --- a/safekeeper/src/lib.rs +++ b/safekeeper/src/lib.rs @@ -22,6 +22,7 @@ pub mod handler; pub mod http; pub mod json_ctrl; pub mod metrics; +pub mod patch_control_file; pub mod pull_timeline; pub mod receive_wal; pub mod recovery; @@ -77,6 +78,7 @@ pub struct SafeKeeperConf { pub pg_tenant_only_auth: Option>, pub http_auth: Option>, pub current_thread_runtime: bool, + pub walsenders_keep_horizon: bool, } impl SafeKeeperConf { @@ -120,6 +122,7 @@ impl SafeKeeperConf { heartbeat_timeout: Duration::new(5, 0), max_offloader_lag_bytes: defaults::DEFAULT_MAX_OFFLOADER_LAG_BYTES, current_thread_runtime: false, + walsenders_keep_horizon: false, } } } diff --git a/safekeeper/src/metrics.rs b/safekeeper/src/metrics.rs index fbba2e00fc..e541527b6a 100644 --- a/safekeeper/src/metrics.rs +++ b/safekeeper/src/metrics.rs @@ -140,6 +140,13 @@ pub static BROKER_ITERATION_TIMELINES: Lazy = Lazy::new(|| { ) .expect("Failed to register 
safekeeper_broker_iteration_timelines histogram vec") }); +pub static RECEIVED_PS_FEEDBACKS: Lazy = Lazy::new(|| { + register_int_counter!( + "safekeeper_received_ps_feedbacks_total", + "Number of pageserver feedbacks received" + ) + .expect("Failed to register safekeeper_received_ps_feedbacks_total counter") +}); pub const LABEL_UNKNOWN: &str = "unknown"; @@ -301,7 +308,8 @@ pub async fn time_io_closure>( #[derive(Clone)] pub struct FullTimelineInfo { pub ttid: TenantTimelineId, - pub ps_feedback: PageserverFeedback, + pub ps_feedback_count: u64, + pub last_ps_feedback: PageserverFeedback, pub wal_backup_active: bool, pub timeline_is_active: bool, pub num_computes: u32, @@ -327,6 +335,7 @@ pub struct TimelineCollector { remote_consistent_lsn: GenericGaugeVec, ps_last_received_lsn: GenericGaugeVec, feedback_last_time_seconds: GenericGaugeVec, + ps_feedback_count: GenericGaugeVec, timeline_active: GenericGaugeVec, wal_backup_active: GenericGaugeVec, connected_computes: IntGaugeVec, @@ -430,6 +439,15 @@ impl TimelineCollector { .unwrap(); descs.extend(feedback_last_time_seconds.desc().into_iter().cloned()); + let ps_feedback_count = GenericGaugeVec::new( + Opts::new( + "safekeeper_ps_feedback_count_total", + "Number of feedbacks received from the pageserver", + ), + &["tenant_id", "timeline_id"], + ) + .unwrap(); + let timeline_active = GenericGaugeVec::new( Opts::new( "safekeeper_timeline_active", @@ -538,6 +556,7 @@ impl TimelineCollector { remote_consistent_lsn, ps_last_received_lsn, feedback_last_time_seconds, + ps_feedback_count, timeline_active, wal_backup_active, connected_computes, @@ -570,6 +589,7 @@ impl Collector for TimelineCollector { self.remote_consistent_lsn.reset(); self.ps_last_received_lsn.reset(); self.feedback_last_time_seconds.reset(); + self.ps_feedback_count.reset(); self.timeline_active.reset(); self.wal_backup_active.reset(); self.connected_computes.reset(); @@ -646,9 +666,12 @@ impl Collector for TimelineCollector { 
self.ps_last_received_lsn .with_label_values(labels) - .set(tli.ps_feedback.last_received_lsn.0); + .set(tli.last_ps_feedback.last_received_lsn.0); + self.ps_feedback_count + .with_label_values(labels) + .set(tli.ps_feedback_count); if let Ok(unix_time) = tli - .ps_feedback + .last_ps_feedback .replytime .duration_since(SystemTime::UNIX_EPOCH) { @@ -679,6 +702,7 @@ impl Collector for TimelineCollector { mfs.extend(self.remote_consistent_lsn.collect()); mfs.extend(self.ps_last_received_lsn.collect()); mfs.extend(self.feedback_last_time_seconds.collect()); + mfs.extend(self.ps_feedback_count.collect()); mfs.extend(self.timeline_active.collect()); mfs.extend(self.wal_backup_active.collect()); mfs.extend(self.connected_computes.collect()); @@ -695,9 +719,11 @@ impl Collector for TimelineCollector { // report total number of timelines self.timelines_count.set(timelines_count as i64); + mfs.extend(self.timelines_count.collect()); + self.active_timelines_count .set(active_timelines_count as i64); - mfs.extend(self.timelines_count.collect()); + mfs.extend(self.active_timelines_count.collect()); mfs } diff --git a/safekeeper/src/patch_control_file.rs b/safekeeper/src/patch_control_file.rs new file mode 100644 index 0000000000..2136d1b5f7 --- /dev/null +++ b/safekeeper/src/patch_control_file.rs @@ -0,0 +1,85 @@ +use std::sync::Arc; + +use serde::{Deserialize, Serialize}; +use serde_json::Value; +use tracing::info; + +use crate::{state::TimelinePersistentState, timeline::Timeline}; + +#[derive(Deserialize, Debug, Clone)] +pub struct Request { + /// JSON object with fields to update + pub updates: serde_json::Value, + /// List of fields to apply + pub apply_fields: Vec, +} + +#[derive(Serialize)] +pub struct Response { + pub old_control_file: TimelinePersistentState, + pub new_control_file: TimelinePersistentState, +} + +/// Patch control file with given request. Will update the persistent state using +/// fields from the request and persist the new state on disk. 
+pub async fn handle_request(tli: Arc, request: Request) -> anyhow::Result { + let response = tli + .map_control_file(|state| { + let old_control_file = state.clone(); + let new_control_file = state_apply_diff(&old_control_file, &request)?; + + info!( + "patching control file, old: {:?}, new: {:?}, patch: {:?}", + old_control_file, new_control_file, request + ); + *state = new_control_file.clone(); + + Ok(Response { + old_control_file, + new_control_file, + }) + }) + .await?; + + Ok(response) +} + +fn state_apply_diff( + state: &TimelinePersistentState, + request: &Request, +) -> anyhow::Result { + let mut json_value = serde_json::to_value(state)?; + + if let Value::Object(a) = &mut json_value { + if let Value::Object(b) = &request.updates { + json_apply_diff(a, b, &request.apply_fields)?; + } else { + anyhow::bail!("request.updates is not a json object") + } + } else { + anyhow::bail!("TimelinePersistentState is not a json object") + } + + let new_state: TimelinePersistentState = serde_json::from_value(json_value)?; + Ok(new_state) +} + +fn json_apply_diff( + object: &mut serde_json::Map, + updates: &serde_json::Map, + apply_keys: &Vec, +) -> anyhow::Result<()> { + for key in apply_keys { + if let Some(new_value) = updates.get(key) { + if let Some(existing_value) = object.get_mut(key) { + *existing_value = new_value.clone(); + } else { + anyhow::bail!("key not found in original object: {}", key); + } + } else { + anyhow::bail!("key not found in request.updates: {}", key); + } + } + + Ok(()) +} diff --git a/safekeeper/src/receive_wal.rs b/safekeeper/src/receive_wal.rs index 9ce9b049ba..015b53bb2e 100644 --- a/safekeeper/src/receive_wal.rs +++ b/safekeeper/src/receive_wal.rs @@ -36,11 +36,15 @@ use tokio::time::Instant; use tracing::*; use utils::id::TenantTimelineId; use utils::lsn::Lsn; +use utils::pageserver_feedback::PageserverFeedback; + +const DEFAULT_FEEDBACK_CAPACITY: usize = 8; /// Registry of WalReceivers (compute connections). 
Timeline holds it (wrapped /// in Arc). pub struct WalReceivers { mutex: Mutex, + pageserver_feedback_tx: tokio::sync::broadcast::Sender, } /// Id under which walreceiver is registered in shmem. @@ -48,8 +52,12 @@ type WalReceiverId = usize; impl WalReceivers { pub fn new() -> Arc { + let (pageserver_feedback_tx, _) = + tokio::sync::broadcast::channel(DEFAULT_FEEDBACK_CAPACITY); + Arc::new(WalReceivers { mutex: Mutex::new(WalReceiversShared { slots: Vec::new() }), + pageserver_feedback_tx, }) } @@ -116,6 +124,12 @@ impl WalReceivers { let mut shared = self.mutex.lock(); shared.slots[id] = None; } + + /// Broadcast pageserver feedback to connected walproposers. + pub fn broadcast_pageserver_feedback(&self, feedback: PageserverFeedback) { + // Err means there is no subscribers, it is fine. + let _ = self.pageserver_feedback_tx.send(feedback); + } } /// Only a few connections are expected (normally one), so store in Vec. @@ -197,17 +211,28 @@ impl SafekeeperPostgresHandler { // sends, so this avoids deadlocks. let mut pgb_reader = pgb.split().context("START_WAL_PUSH split")?; let peer_addr = *pgb.get_peer_addr(); - let network_reader = NetworkReader { + let mut network_reader = NetworkReader { ttid: self.ttid, conn_id: self.conn_id, pgb_reader: &mut pgb_reader, peer_addr, acceptor_handle: &mut acceptor_handle, }; - let res = tokio::select! { - // todo: add read|write .context to these errors - r = network_reader.run(msg_tx, msg_rx, reply_tx) => r, - r = network_write(pgb, reply_rx) => r, + + // Read first message and create timeline if needed. + let res = network_reader.read_first_message().await; + + let res = if let Ok((tli, next_msg)) = res { + let pageserver_feedback_rx: tokio::sync::broadcast::Receiver = + tli.get_walreceivers().pageserver_feedback_tx.subscribe(); + + tokio::select! 
{ + // todo: add read|write .context to these errors + r = network_reader.run(msg_tx, msg_rx, reply_tx, tli.clone(), next_msg) => r, + r = network_write(pgb, reply_rx, pageserver_feedback_rx) => r, + } + } else { + res.map(|_| ()) }; // Join pg backend back. @@ -251,12 +276,9 @@ struct NetworkReader<'a, IO> { } impl<'a, IO: AsyncRead + AsyncWrite + Unpin> NetworkReader<'a, IO> { - async fn run( - self, - msg_tx: Sender, - msg_rx: Receiver, - reply_tx: Sender, - ) -> Result<(), CopyStreamHandlerEnd> { + async fn read_first_message( + &mut self, + ) -> Result<(Arc, ProposerAcceptorMessage), CopyStreamHandlerEnd> { // Receive information about server to create timeline, if not yet. let next_msg = read_message(self.pgb_reader).await?; let tli = match next_msg { @@ -278,9 +300,19 @@ impl<'a, IO: AsyncRead + AsyncWrite + Unpin> NetworkReader<'a, IO> { ))) } }; + Ok((tli, next_msg)) + } + async fn run( + self, + msg_tx: Sender, + msg_rx: Receiver, + reply_tx: Sender, + tli: Arc, + next_msg: ProposerAcceptorMessage, + ) -> Result<(), CopyStreamHandlerEnd> { *self.acceptor_handle = Some(WalAcceptor::spawn( - tli.clone(), + tli, msg_rx, reply_tx, Some(self.conn_id), @@ -320,18 +352,46 @@ async fn read_network_loop( async fn network_write( pgb_writer: &mut PostgresBackend, mut reply_rx: Receiver, + mut pageserver_feedback_rx: tokio::sync::broadcast::Receiver, ) -> Result<(), CopyStreamHandlerEnd> { let mut buf = BytesMut::with_capacity(128); + // storing append_response to inject PageserverFeedback into it + let mut last_append_response = None; + loop { - match reply_rx.recv().await { - Some(msg) => { - buf.clear(); - msg.serialize(&mut buf)?; - pgb_writer.write_message(&BeMessage::CopyData(&buf)).await?; + // trying to read either AcceptorProposerMessage or PageserverFeedback + let msg = tokio::select! 
{ + reply = reply_rx.recv() => { + if let Some(msg) = reply { + if let AcceptorProposerMessage::AppendResponse(append_response) = &msg { + last_append_response = Some(append_response.clone()); + } + Some(msg) + } else { + return Ok(()); // chan closed, WalAcceptor terminated + } } - None => return Ok(()), // chan closed, WalAcceptor terminated - } + + feedback = pageserver_feedback_rx.recv() => + match (feedback, &last_append_response) { + (Ok(feedback), Some(append_response)) => { + // clone AppendResponse and inject PageserverFeedback into it + let mut append_response = append_response.clone(); + append_response.pageserver_feedback = Some(feedback); + Some(AcceptorProposerMessage::AppendResponse(append_response)) + } + _ => None, + } + }; + + let Some(msg) = msg else { + continue; + }; + + buf.clear(); + msg.serialize(&mut buf)?; + pgb_writer.write_message(&BeMessage::CopyData(&buf)).await?; } } diff --git a/safekeeper/src/safekeeper.rs b/safekeeper/src/safekeeper.rs index d66db9b652..d7c8fa6955 100644 --- a/safekeeper/src/safekeeper.rs +++ b/safekeeper/src/safekeeper.rs @@ -4,7 +4,7 @@ use anyhow::{bail, Context, Result}; use byteorder::{LittleEndian, ReadBytesExt}; use bytes::{Buf, BufMut, Bytes, BytesMut}; -use postgres_ffi::{TimeLineID, XLogSegNo, MAX_SEND_SIZE}; +use postgres_ffi::{TimeLineID, MAX_SEND_SIZE}; use serde::{Deserialize, Serialize}; use std::cmp::max; use std::cmp::min; @@ -321,7 +321,7 @@ pub struct AppendRequestHeader { } /// Report safekeeper state to proposer -#[derive(Debug, Serialize)] +#[derive(Debug, Serialize, Clone)] pub struct AppendResponse { // Current term of the safekeeper; if it is higher than proposer's, the // compute is out of date. 
@@ -334,7 +334,7 @@ pub struct AppendResponse { // a criterion for walproposer --sync mode exit pub commit_lsn: Lsn, pub hs_feedback: HotStandbyFeedback, - pub pageserver_feedback: PageserverFeedback, + pub pageserver_feedback: Option, } impl AppendResponse { @@ -344,7 +344,7 @@ impl AppendResponse { flush_lsn: Lsn(0), commit_lsn: Lsn(0), hs_feedback: HotStandbyFeedback::empty(), - pageserver_feedback: PageserverFeedback::empty(), + pageserver_feedback: None, } } } @@ -462,7 +462,11 @@ impl AcceptorProposerMessage { buf.put_u64_le(msg.hs_feedback.xmin); buf.put_u64_le(msg.hs_feedback.catalog_xmin); - msg.pageserver_feedback.serialize(buf); + // AsyncReadMessage in walproposer.c will not try to decode pageserver_feedback + // if it is not present. + if let Some(ref msg) = msg.pageserver_feedback { + msg.serialize(buf); + } } } @@ -681,7 +685,7 @@ where commit_lsn: self.state.commit_lsn, // will be filled by the upper code to avoid bothering safekeeper hs_feedback: HotStandbyFeedback::empty(), - pageserver_feedback: PageserverFeedback::empty(), + pageserver_feedback: None, }; trace!("formed AppendResponse {:?}", ar); ar @@ -946,28 +950,12 @@ where } Ok(()) } - - /// Get oldest segno we still need to keep. We hold WAL till it is consumed - /// by all of 1) pageserver (remote_consistent_lsn) 2) peers 3) s3 - /// offloading. - /// While it is safe to use inmem values for determining horizon, - /// we use persistent to make possible normal states less surprising. 
- pub fn get_horizon_segno(&self, wal_backup_enabled: bool) -> XLogSegNo { - let mut horizon_lsn = min( - self.state.remote_consistent_lsn, - self.state.peer_horizon_lsn, - ); - if wal_backup_enabled { - horizon_lsn = min(horizon_lsn, self.state.backup_lsn); - } - horizon_lsn.segment_number(self.state.server.wal_seg_size as usize) - } } #[cfg(test)] mod tests { use futures::future::BoxFuture; - use postgres_ffi::WAL_SEGMENT_SIZE; + use postgres_ffi::{XLogSegNo, WAL_SEGMENT_SIZE}; use super::*; use crate::{ diff --git a/safekeeper/src/send_wal.rs b/safekeeper/src/send_wal.rs index ee3e4c8ead..7da5fd00b0 100644 --- a/safekeeper/src/send_wal.rs +++ b/safekeeper/src/send_wal.rs @@ -2,6 +2,8 @@ //! with the "START_REPLICATION" message, and registry of walsenders. use crate::handler::SafekeeperPostgresHandler; +use crate::metrics::RECEIVED_PS_FEEDBACKS; +use crate::receive_wal::WalReceivers; use crate::safekeeper::{Term, TermLsn}; use crate::timeline::Timeline; use crate::wal_service::ConnectionId; @@ -21,7 +23,7 @@ use utils::failpoint_support; use utils::id::TenantTimelineId; use utils::pageserver_feedback::PageserverFeedback; -use std::cmp::{max, min}; +use std::cmp::min; use std::net::SocketAddr; use std::str; use std::sync::Arc; @@ -90,12 +92,14 @@ pub struct StandbyFeedback { /// WalSenders registry. Timeline holds it (wrapped in Arc). pub struct WalSenders { mutex: Mutex, + walreceivers: Arc, } impl WalSenders { - pub fn new() -> Arc { + pub fn new(walreceivers: Arc) -> Arc { Arc::new(WalSenders { mutex: Mutex::new(WalSendersShared::new()), + walreceivers, }) } @@ -136,22 +140,44 @@ impl WalSenders { self.mutex.lock().slots.iter().flatten().cloned().collect() } - /// Get aggregated pageserver feedback. - pub fn get_ps_feedback(self: &Arc) -> PageserverFeedback { - self.mutex.lock().agg_ps_feedback + /// Get LSN of the most lagging pageserver receiver. Return None if there are no + /// active walsenders. 
+ pub fn laggard_lsn(self: &Arc) -> Option { + self.mutex + .lock() + .slots + .iter() + .flatten() + .filter_map(|s| match s.feedback { + ReplicationFeedback::Pageserver(feedback) => Some(feedback.last_received_lsn), + ReplicationFeedback::Standby(_) => None, + }) + .min() } - /// Get aggregated pageserver and hot standby feedback (we send them to compute). - pub fn get_feedbacks(self: &Arc) -> (PageserverFeedback, HotStandbyFeedback) { + /// Returns total counter of pageserver feedbacks received and last feedback. + pub fn get_ps_feedback_stats(self: &Arc) -> (u64, PageserverFeedback) { let shared = self.mutex.lock(); - (shared.agg_ps_feedback, shared.agg_hs_feedback) + (shared.ps_feedback_counter, shared.last_ps_feedback) + } + + /// Get aggregated hot standby feedback (we send it to compute). + pub fn get_hotstandby(self: &Arc) -> HotStandbyFeedback { + self.mutex.lock().agg_hs_feedback } /// Record new pageserver feedback, update aggregated values. fn record_ps_feedback(self: &Arc, id: WalSenderId, feedback: &PageserverFeedback) { let mut shared = self.mutex.lock(); shared.get_slot_mut(id).feedback = ReplicationFeedback::Pageserver(*feedback); - shared.update_ps_feedback(); + shared.last_ps_feedback = *feedback; + shared.ps_feedback_counter += 1; + drop(shared); + + RECEIVED_PS_FEEDBACKS.inc(); + + // send feedback to connected walproposers + self.walreceivers.broadcast_pageserver_feedback(*feedback); } /// Record standby reply. 
@@ -207,8 +233,10 @@ impl WalSenders { struct WalSendersShared { // aggregated over all walsenders value agg_hs_feedback: HotStandbyFeedback, - // aggregated over all walsenders value - agg_ps_feedback: PageserverFeedback, + // last feedback ever received from any pageserver, empty if none + last_ps_feedback: PageserverFeedback, + // total counter of pageserver feedbacks received + ps_feedback_counter: u64, slots: Vec>, } @@ -216,7 +244,8 @@ impl WalSendersShared { fn new() -> Self { WalSendersShared { agg_hs_feedback: HotStandbyFeedback::empty(), - agg_ps_feedback: PageserverFeedback::empty(), + last_ps_feedback: PageserverFeedback::empty(), + ps_feedback_counter: 0, slots: Vec::new(), } } @@ -261,37 +290,6 @@ impl WalSendersShared { } self.agg_hs_feedback = agg; } - - /// Update aggregated pageserver feedback. LSNs (last_received, - /// disk_consistent, remote_consistent) and reply timestamp are just - /// maximized; timeline_size if taken from feedback with highest - /// last_received lsn. This is generally reasonable, but we might want to - /// implement other policies once multiple pageservers start to be actively - /// used. - fn update_ps_feedback(&mut self) { - let init = PageserverFeedback::empty(); - let acc = - self.slots - .iter() - .flatten() - .fold(init, |mut acc, ws_state| match ws_state.feedback { - ReplicationFeedback::Pageserver(feedback) => { - if feedback.last_received_lsn > acc.last_received_lsn { - acc.current_timeline_size = feedback.current_timeline_size; - } - acc.last_received_lsn = - max(feedback.last_received_lsn, acc.last_received_lsn); - acc.disk_consistent_lsn = - max(feedback.disk_consistent_lsn, acc.disk_consistent_lsn); - acc.remote_consistent_lsn = - max(feedback.remote_consistent_lsn, acc.remote_consistent_lsn); - acc.replytime = max(feedback.replytime, acc.replytime); - acc - } - ReplicationFeedback::Standby(_) => acc, - }); - self.agg_ps_feedback = acc; - } } // Serialized is used only for pretty printing in json. 
@@ -428,7 +426,7 @@ impl SafekeeperPostgresHandler { }; let mut reply_reader = ReplyReader { reader, - ws_guard, + ws_guard: ws_guard.clone(), tli, }; @@ -437,6 +435,18 @@ impl SafekeeperPostgresHandler { r = sender.run() => r, r = reply_reader.run() => r, }; + + let ws_state = ws_guard + .walsenders + .mutex + .lock() + .get_slot(ws_guard.id) + .clone(); + info!( + "finished streaming to {}, feedback={:?}", + ws_state.addr, ws_state.feedback, + ); + // Join pg backend back. pgb.unsplit(reply_reader.reader)?; @@ -718,7 +728,6 @@ async fn wait_for_lsn( #[cfg(test)] mod tests { - use postgres_protocol::PG_EPOCH; use utils::id::{TenantId, TimelineId}; use super::*; @@ -777,27 +786,4 @@ mod tests { wss.update_hs_feedback(); assert_eq!(wss.agg_hs_feedback.xmin, 42); } - - // form pageserver feedback with given last_record_lsn / tli size and the - // rest set to dummy values. - fn ps_feedback(current_timeline_size: u64, last_received_lsn: Lsn) -> ReplicationFeedback { - ReplicationFeedback::Pageserver(PageserverFeedback { - current_timeline_size, - last_received_lsn, - disk_consistent_lsn: Lsn::INVALID, - remote_consistent_lsn: Lsn::INVALID, - replytime: *PG_EPOCH, - }) - } - - // test that ps aggregation works as expected - #[test] - fn test_ps_feedback() { - let mut wss = WalSendersShared::new(); - push_feedback(&mut wss, ps_feedback(8, Lsn(42))); - push_feedback(&mut wss, ps_feedback(4, Lsn(84))); - wss.update_ps_feedback(); - assert_eq!(wss.agg_ps_feedback.current_timeline_size, 4); - assert_eq!(wss.agg_ps_feedback.last_received_lsn, Lsn(84)); - } } diff --git a/safekeeper/src/timeline.rs b/safekeeper/src/timeline.rs index ec7dd7d89b..4901b86acf 100644 --- a/safekeeper/src/timeline.rs +++ b/safekeeper/src/timeline.rs @@ -286,6 +286,29 @@ impl SharedState { .cloned() .collect() } + + /// Get oldest segno we still need to keep. We hold WAL till it is consumed + /// by all of 1) pageserver (remote_consistent_lsn) 2) peers 3) s3 + /// offloading. 
+ /// While it is safe to use inmem values for determining horizon, + /// we use persistent to make possible normal states less surprising. + fn get_horizon_segno( + &self, + wal_backup_enabled: bool, + extra_horizon_lsn: Option, + ) -> XLogSegNo { + let state = &self.sk.state; + + use std::cmp::min; + let mut horizon_lsn = min(state.remote_consistent_lsn, state.peer_horizon_lsn); + if wal_backup_enabled { + horizon_lsn = min(horizon_lsn, state.backup_lsn); + } + if let Some(extra_horizon_lsn) = extra_horizon_lsn { + horizon_lsn = min(horizon_lsn, extra_horizon_lsn); + } + horizon_lsn.segment_number(state.server.wal_seg_size as usize) + } } #[derive(Debug, thiserror::Error)] @@ -353,6 +376,12 @@ pub struct Timeline { /// Directory where timeline state is stored. pub timeline_dir: Utf8PathBuf, + + /// Should we keep WAL on disk for active replication connections. + /// Especially useful for sharding, when different shards process WAL + /// with different speed. + // TODO: add `Arc` here instead of adding each field separately. 
+ walsenders_keep_horizon: bool, } impl Timeline { @@ -373,6 +402,7 @@ impl Timeline { ))); let (cancellation_tx, cancellation_rx) = watch::channel(false); + let walreceivers = WalReceivers::new(); Ok(Timeline { ttid, wal_backup_launcher_tx, @@ -381,11 +411,12 @@ impl Timeline { term_flush_lsn_watch_tx, term_flush_lsn_watch_rx, mutex: Mutex::new(shared_state), - walsenders: WalSenders::new(), - walreceivers: WalReceivers::new(), + walsenders: WalSenders::new(walreceivers.clone()), + walreceivers, cancellation_rx, cancellation_tx, timeline_dir: conf.timeline_dir(&ttid), + walsenders_keep_horizon: conf.walsenders_keep_horizon, }) } @@ -405,6 +436,7 @@ impl Timeline { let state = TimelinePersistentState::new(&ttid, server_info, vec![], commit_lsn, local_start_lsn); + let walreceivers = WalReceivers::new(); Ok(Timeline { ttid, wal_backup_launcher_tx, @@ -413,11 +445,12 @@ impl Timeline { term_flush_lsn_watch_tx, term_flush_lsn_watch_rx, mutex: Mutex::new(SharedState::create_new(conf, &ttid, state)?), - walsenders: WalSenders::new(), - walreceivers: WalReceivers::new(), + walsenders: WalSenders::new(walreceivers.clone()), + walreceivers, cancellation_rx, cancellation_tx, timeline_dir: conf.timeline_dir(&ttid), + walsenders_keep_horizon: conf.walsenders_keep_horizon, }) } @@ -625,12 +658,9 @@ impl Timeline { let mut shared_state = self.write_shared_state().await; rmsg = shared_state.sk.process_msg(msg).await?; - // if this is AppendResponse, fill in proper pageserver and hot - // standby feedback. + // if this is AppendResponse, fill in proper hot standby feedback. 
if let Some(AcceptorProposerMessage::AppendResponse(ref mut resp)) = rmsg { - let (ps_feedback, hs_feedback) = self.walsenders.get_feedbacks(); - resp.hs_feedback = hs_feedback; - resp.pageserver_feedback = ps_feedback; + resp.hs_feedback = self.walsenders.get_hotstandby(); } commit_lsn = shared_state.sk.state.inmem.commit_lsn; @@ -817,10 +847,20 @@ impl Timeline { bail!(TimelineError::Cancelled(self.ttid)); } + // If enabled, we use LSN of the most lagging walsender as a WAL removal horizon. + // This allows to get better read speed for pageservers that are lagging behind, + // at the cost of keeping more WAL on disk. + let replication_horizon_lsn = if self.walsenders_keep_horizon { + self.walsenders.laggard_lsn() + } else { + None + }; + let horizon_segno: XLogSegNo; let remover = { let shared_state = self.write_shared_state().await; - horizon_segno = shared_state.sk.get_horizon_segno(wal_backup_enabled); + horizon_segno = + shared_state.get_horizon_segno(wal_backup_enabled, replication_horizon_lsn); if horizon_segno <= 1 || horizon_segno <= shared_state.last_removed_segno { return Ok(()); // nothing to do } @@ -857,12 +897,13 @@ impl Timeline { return None; } - let ps_feedback = self.walsenders.get_ps_feedback(); + let (ps_feedback_count, last_ps_feedback) = self.walsenders.get_ps_feedback_stats(); let state = self.write_shared_state().await; if state.active { Some(FullTimelineInfo { ttid: self.ttid, - ps_feedback, + ps_feedback_count, + last_ps_feedback, wal_backup_active: state.wal_backup_active, timeline_is_active: state.active, num_computes: self.walreceivers.get_num() as u32, @@ -901,6 +942,20 @@ impl Timeline { file_open, } } + + /// Apply a function to the control file state and persist it. 
+ pub async fn map_control_file( + &self, + f: impl FnOnce(&mut TimelinePersistentState) -> Result, + ) -> Result { + let mut state = self.write_shared_state().await; + let mut persistent_state = state.sk.state.start_change(); + // If f returns error, we abort the change and don't persist anything. + let res = f(&mut persistent_state)?; + // If persisting fails, we abort the change and return error. + state.sk.state.finish_change(&persistent_state).await?; + Ok(res) + } } /// Deletes directory and it's contents. Returns false if directory does not exist. diff --git a/safekeeper/src/wal_backup.rs b/safekeeper/src/wal_backup.rs index c47381351d..944d80f777 100644 --- a/safekeeper/src/wal_backup.rs +++ b/safekeeper/src/wal_backup.rs @@ -10,6 +10,7 @@ use utils::id::NodeId; use std::cmp::min; use std::collections::{HashMap, HashSet}; +use std::num::NonZeroU32; use std::pin::Pin; use std::sync::Arc; use std::time::Duration; @@ -510,7 +511,11 @@ async fn backup_object( let file = tokio_util::io::ReaderStream::with_capacity(file, BUFFER_SIZE); - storage.upload_storage_object(file, size, target_file).await + let cancel = CancellationToken::new(); + + storage + .upload_storage_object(file, size, target_file, &cancel) + .await } pub async fn read_object( @@ -525,8 +530,10 @@ pub async fn read_object( info!("segment download about to start from remote path {file_path:?} at offset {offset}"); + let cancel = CancellationToken::new(); + let download = storage - .download_storage_object(Some((offset, None)), file_path) + .download_storage_object(Some((offset, None)), file_path, &cancel) .await .with_context(|| { format!("Failed to open WAL segment download stream for remote path {file_path:?}") @@ -546,6 +553,10 @@ pub async fn delete_timeline(ttid: &TenantTimelineId) -> Result<()> { let ttid_path = Utf8Path::new(&ttid.tenant_id.to_string()).join(ttid.timeline_id.to_string()); let remote_path = RemotePath::new(&ttid_path)?; + // see DEFAULT_MAX_KEYS_PER_LIST_RESPONSE + // const 
Option unwrap is not stable, otherwise it would be const. + let batch_size: NonZeroU32 = NonZeroU32::new(1000).unwrap(); + // A backoff::retry is used here for two reasons: // - To provide a backoff rather than busy-polling the API on errors // - To absorb transient 429/503 conditions without hitting our error @@ -554,20 +565,41 @@ pub async fn delete_timeline(ttid: &TenantTimelineId) -> Result<()> { // Note: listing segments might take a long time if there are many of them. // We don't currently have http requests timeout cancellation, but if/once // we have listing should get streaming interface to make progress. - let token = CancellationToken::new(); // not really used + + let cancel = CancellationToken::new(); // not really used backoff::retry( || async { - let files = storage.list_files(Some(&remote_path)).await?; - storage.delete_objects(&files).await?; - Ok(()) + // Do list-delete in batch_size batches to make progress even if there a lot of files. + // Alternatively we could make list_files return iterator, but it is more complicated and + // I'm not sure deleting while iterating is expected in s3. + loop { + let files = storage + .list_files(Some(&remote_path), Some(batch_size), &cancel) + .await?; + if files.is_empty() { + return Ok(()); // done + } + // (at least) s3 results are sorted, so can log min/max: + // "List results are always returned in UTF-8 binary order." 
+ info!( + "deleting batch of {} WAL segments [{}-{}]", + files.len(), + files.first().unwrap().object_name().unwrap_or(""), + files.last().unwrap().object_name().unwrap_or("") + ); + storage.delete_objects(&files, &cancel).await?; + } }, + // consider TimeoutOrCancel::caused_by_cancel when using cancellation |_| false, 3, 10, "executing WAL segments deletion batch", - backoff::Cancel::new(token, || anyhow::anyhow!("canceled")), + &cancel, ) - .await?; + .await + .ok_or_else(|| anyhow::anyhow!("canceled")) + .and_then(|x| x)?; Ok(()) } @@ -593,7 +625,12 @@ pub async fn copy_s3_segments( let remote_path = RemotePath::new(&relative_dst_path)?; - let files = storage.list_files(Some(&remote_path)).await?; + let cancel = CancellationToken::new(); + + let files = storage + .list_files(Some(&remote_path), None, &cancel) + .await?; + let uploaded_segments = &files .iter() .filter_map(|file| file.object_name().map(ToOwned::to_owned)) @@ -621,7 +658,7 @@ pub async fn copy_s3_segments( let from = RemotePath::new(&relative_src_path.join(&segment_name))?; let to = RemotePath::new(&relative_dst_path.join(&segment_name))?; - storage.copy_object(&from, &to).await?; + storage.copy_object(&from, &to, &cancel).await?; } info!( diff --git a/safekeeper/src/wal_service.rs b/safekeeper/src/wal_service.rs index bceaad1e16..4a97eb3993 100644 --- a/safekeeper/src/wal_service.rs +++ b/safekeeper/src/wal_service.rs @@ -68,7 +68,7 @@ async fn handle_socket( // is not Unpin, and all pgbackend/framed/tokio dependencies require stream // to be Unpin. Which is reasonable, as indeed something like TimeoutReader // shouldn't be moved. 
- tokio::pin!(socket); + let socket = std::pin::pin!(socket); let traffic_metrics = TrafficMetrics::new(); if let Some(current_az) = conf.availability_zone.as_deref() { diff --git a/safekeeper/src/wal_storage.rs b/safekeeper/src/wal_storage.rs index ed6190042a..8bbd95e9e8 100644 --- a/safekeeper/src/wal_storage.rs +++ b/safekeeper/src/wal_storage.rs @@ -21,6 +21,7 @@ use tokio::fs::{self, remove_file, File, OpenOptions}; use tokio::io::{AsyncRead, AsyncWriteExt}; use tokio::io::{AsyncReadExt, AsyncSeekExt}; use tracing::*; +use utils::crashsafe::durable_rename; use crate::metrics::{time_io_closure, WalStorageMetrics, REMOVED_WAL_SEGMENTS}; use crate::state::TimelinePersistentState; @@ -196,15 +197,6 @@ impl PhysicalStorage { Ok(()) } - /// Call fsync if config requires so. - async fn fsync_file(&mut self, file: &File) -> Result<()> { - if !self.conf.no_sync { - self.metrics - .observe_flush_seconds(time_io_closure(file.sync_all()).await?); - } - Ok(()) - } - /// Open or create WAL segment file. Caller must call seek to the wanted position. /// Returns `file` and `is_partial`. async fn open_or_create(&mut self, segno: XLogSegNo) -> Result<(File, bool)> { @@ -223,15 +215,33 @@ impl PhysicalStorage { Ok((file, true)) } else { // Create and fill new partial file + // + // We're using fdatasync during WAL writing, so file size must not + // change; to this end it is filled with zeros here. To avoid using + // half initialized segment, first bake it under tmp filename and + // then rename. + let tmp_path = self.timeline_dir.join("waltmp"); let mut file = OpenOptions::new() .create(true) .write(true) - .open(&wal_file_partial_path) + .open(&tmp_path) .await - .with_context(|| format!("Failed to open log file {:?}", &wal_file_path))?; + .with_context(|| format!("Failed to open tmp wal file {:?}", &tmp_path))?; write_zeroes(&mut file, self.wal_seg_size).await?; - self.fsync_file(&file).await?; + + // Note: this doesn't get into observe_flush_seconds metric. 
But + // segment init should be separate metric, if any. + if let Err(e) = + durable_rename(&tmp_path, &wal_file_partial_path, !self.conf.no_sync).await + { + // Probably rename succeeded, but fsync of it failed. Remove + // the file then to avoid using it. + remove_file(wal_file_partial_path) + .await + .or_else(utils::fs_ext::ignore_not_found)?; + return Err(e.into()); + } Ok((file, true)) } } @@ -718,6 +728,11 @@ const ZERO_BLOCK: &[u8] = &[0u8; XLOG_BLCKSZ]; /// Helper for filling file with zeroes. async fn write_zeroes(file: &mut File, mut count: usize) -> Result<()> { + fail::fail_point!("sk-write-zeroes", |_| { + info!("write_zeroes hit failpoint"); + Err(anyhow::anyhow!("failpoint: sk-write-zeroes")) + }); + while count >= XLOG_BLCKSZ { file.write_all(ZERO_BLOCK).await?; count -= XLOG_BLCKSZ; diff --git a/safekeeper/tests/misc_test.rs b/safekeeper/tests/misc_test.rs new file mode 100644 index 0000000000..8e5b17a143 --- /dev/null +++ b/safekeeper/tests/misc_test.rs @@ -0,0 +1,155 @@ +use std::sync::Arc; + +use tracing::{info, warn}; +use utils::lsn::Lsn; + +use crate::walproposer_sim::{ + log::{init_logger, init_tracing_logger}, + simulation::{generate_network_opts, generate_schedule, Schedule, TestAction, TestConfig}, +}; + +pub mod walproposer_sim; + +// Test that simulation supports restarting (crashing) safekeepers. +#[test] +fn crash_safekeeper() { + let clock = init_logger(); + let config = TestConfig::new(Some(clock)); + let test = config.start(1337); + + let lsn = test.sync_safekeepers().unwrap(); + assert_eq!(lsn, Lsn(0)); + info!("Sucessfully synced empty safekeepers at 0/0"); + + let mut wp = test.launch_walproposer(lsn); + + // Write some WAL and crash safekeeper 0 without waiting for replication. + test.poll_for_duration(30); + wp.write_tx(3); + test.servers[0].restart(); + + // Wait some time, so that walproposer can reconnect. + test.poll_for_duration(2000); +} + +// Test that walproposer can be crashed (stopped). 
+#[test] +fn test_simple_restart() { + let clock = init_logger(); + let config = TestConfig::new(Some(clock)); + let test = config.start(1337); + + let lsn = test.sync_safekeepers().unwrap(); + assert_eq!(lsn, Lsn(0)); + info!("Sucessfully synced empty safekeepers at 0/0"); + + let mut wp = test.launch_walproposer(lsn); + + test.poll_for_duration(30); + wp.write_tx(3); + test.poll_for_duration(100); + + wp.stop(); + drop(wp); + + let lsn = test.sync_safekeepers().unwrap(); + info!("Sucessfully synced safekeepers at {}", lsn); +} + +// Test runnning a simple schedule, restarting everything a several times. +#[test] +fn test_simple_schedule() -> anyhow::Result<()> { + let clock = init_logger(); + let mut config = TestConfig::new(Some(clock)); + config.network.keepalive_timeout = Some(100); + let test = config.start(1337); + + let schedule: Schedule = vec![ + (0, TestAction::RestartWalProposer), + (50, TestAction::WriteTx(5)), + (100, TestAction::RestartSafekeeper(0)), + (100, TestAction::WriteTx(5)), + (110, TestAction::RestartSafekeeper(1)), + (110, TestAction::WriteTx(5)), + (120, TestAction::RestartSafekeeper(2)), + (120, TestAction::WriteTx(5)), + (201, TestAction::RestartWalProposer), + (251, TestAction::RestartSafekeeper(0)), + (251, TestAction::RestartSafekeeper(1)), + (251, TestAction::RestartSafekeeper(2)), + (251, TestAction::WriteTx(5)), + (255, TestAction::WriteTx(5)), + (1000, TestAction::WriteTx(5)), + ]; + + test.run_schedule(&schedule)?; + info!("Test finished, stopping all threads"); + test.world.deallocate(); + + Ok(()) +} + +// Test that simulation can process 10^4 transactions. 
+#[test] +fn test_many_tx() -> anyhow::Result<()> { + let clock = init_logger(); + let config = TestConfig::new(Some(clock)); + let test = config.start(1337); + + let mut schedule: Schedule = vec![]; + for i in 0..100 { + schedule.push((i * 10, TestAction::WriteTx(100))); + } + + test.run_schedule(&schedule)?; + info!("Test finished, stopping all threads"); + test.world.stop_all(); + + let events = test.world.take_events(); + info!("Events: {:?}", events); + let last_commit_lsn = events + .iter() + .filter_map(|event| { + if event.data.starts_with("commit_lsn;") { + let lsn: u64 = event.data.split(';').nth(1).unwrap().parse().unwrap(); + return Some(lsn); + } + None + }) + .last() + .unwrap(); + + let initdb_lsn = 21623024; + let diff = last_commit_lsn - initdb_lsn; + info!("Last commit lsn: {}, diff: {}", last_commit_lsn, diff); + // each tx is at least 8 bytes, it's written a 100 times for in a loop for 100 times + assert!(diff > 100 * 100 * 8); + Ok(()) +} + +// Checks that we don't have nasty circular dependencies, preventing Arc from deallocating. +// This test doesn't really assert anything, you need to run it manually to check if there +// is any issue. 
+#[test] +fn test_res_dealloc() -> anyhow::Result<()> { + let clock = init_tracing_logger(true); + let mut config = TestConfig::new(Some(clock)); + + let seed = 123456; + config.network = generate_network_opts(seed); + let test = config.start(seed); + warn!("Running test with seed {}", seed); + + let schedule = generate_schedule(seed); + info!("schedule: {:?}", schedule); + test.run_schedule(&schedule).unwrap(); + test.world.stop_all(); + + let world = test.world.clone(); + drop(test); + info!("world strong count: {}", Arc::strong_count(&world)); + world.deallocate(); + info!("world strong count: {}", Arc::strong_count(&world)); + + Ok(()) +} diff --git a/safekeeper/tests/random_test.rs b/safekeeper/tests/random_test.rs new file mode 100644 index 0000000000..6c6f6a8c96 --- /dev/null +++ b/safekeeper/tests/random_test.rs @@ -0,0 +1,56 @@ +use rand::Rng; +use tracing::{info, warn}; + +use crate::walproposer_sim::{ + log::{init_logger, init_tracing_logger}, + simulation::{generate_network_opts, generate_schedule, TestConfig}, + simulation_logs::validate_events, +}; + +pub mod walproposer_sim; + +// Generates 2000 random seeds and runs a schedule for each of them. +// If you seed this test fail, please report the last seed to the +// @safekeeper team. +#[test] +fn test_random_schedules() -> anyhow::Result<()> { + let clock = init_logger(); + let mut config = TestConfig::new(Some(clock)); + + for _ in 0..2000 { + let seed: u64 = rand::thread_rng().gen(); + config.network = generate_network_opts(seed); + + let test = config.start(seed); + warn!("Running test with seed {}", seed); + + let schedule = generate_schedule(seed); + test.run_schedule(&schedule).unwrap(); + validate_events(test.world.take_events()); + test.world.deallocate(); + } + + Ok(()) +} + +// After you found a seed that fails, you can insert this seed here +// and run the test to see the full debug output. 
+#[test] +fn test_one_schedule() -> anyhow::Result<()> { + let clock = init_tracing_logger(true); + let mut config = TestConfig::new(Some(clock)); + + let seed = 11047466935058776390; + config.network = generate_network_opts(seed); + info!("network: {:?}", config.network); + let test = config.start(seed); + warn!("Running test with seed {}", seed); + + let schedule = generate_schedule(seed); + info!("schedule: {:?}", schedule); + test.run_schedule(&schedule).unwrap(); + validate_events(test.world.take_events()); + test.world.deallocate(); + + Ok(()) +} diff --git a/safekeeper/tests/simple_test.rs b/safekeeper/tests/simple_test.rs new file mode 100644 index 0000000000..0be9d0deef --- /dev/null +++ b/safekeeper/tests/simple_test.rs @@ -0,0 +1,45 @@ +use tracing::info; +use utils::lsn::Lsn; + +use crate::walproposer_sim::{log::init_logger, simulation::TestConfig}; + +pub mod walproposer_sim; + +// Check that first start of sync_safekeepers() returns 0/0 on empty safekeepers. +#[test] +fn sync_empty_safekeepers() { + let clock = init_logger(); + let config = TestConfig::new(Some(clock)); + let test = config.start(1337); + + let lsn = test.sync_safekeepers().unwrap(); + assert_eq!(lsn, Lsn(0)); + info!("Sucessfully synced empty safekeepers at 0/0"); + + let lsn = test.sync_safekeepers().unwrap(); + assert_eq!(lsn, Lsn(0)); + info!("Sucessfully synced (again) empty safekeepers at 0/0"); +} + +// Check that there are no panics when we are writing and streaming WAL to safekeepers. 
+#[test] +fn run_walproposer_generate_wal() { + let clock = init_logger(); + let config = TestConfig::new(Some(clock)); + let test = config.start(1337); + + let lsn = test.sync_safekeepers().unwrap(); + assert_eq!(lsn, Lsn(0)); + info!("Sucessfully synced empty safekeepers at 0/0"); + + let mut wp = test.launch_walproposer(lsn); + + // wait for walproposer to start + test.poll_for_duration(30); + + // just write some WAL + for _ in 0..100 { + wp.write_tx(1); + test.poll_for_duration(5); + } +} diff --git a/safekeeper/tests/walproposer_sim/block_storage.rs b/safekeeper/tests/walproposer_sim/block_storage.rs new file mode 100644 index 0000000000..468c02ad2f --- /dev/null +++ b/safekeeper/tests/walproposer_sim/block_storage.rs @@ -0,0 +1,57 @@ +use std::collections::HashMap; + +const BLOCK_SIZE: usize = 8192; + +/// A simple in-memory implementation of a block storage. Can be used to implement external +/// storage in tests. +pub struct BlockStorage { + blocks: HashMap, +} + +impl Default for BlockStorage { + fn default() -> Self { + Self::new() + } +} + +impl BlockStorage { + pub fn new() -> Self { + BlockStorage { + blocks: HashMap::new(), + } + } + + pub fn read(&self, pos: u64, buf: &mut [u8]) { + let mut buf_offset = 0; + let mut storage_pos = pos; + while buf_offset < buf.len() { + let block_id = storage_pos / BLOCK_SIZE as u64; + let block = self.blocks.get(&block_id).unwrap_or(&[0; BLOCK_SIZE]); + let block_offset = storage_pos % BLOCK_SIZE as u64; + let block_len = BLOCK_SIZE as u64 - block_offset; + let buf_len = buf.len() - buf_offset; + let copy_len = std::cmp::min(block_len as usize, buf_len); + buf[buf_offset..buf_offset + copy_len] + .copy_from_slice(&block[block_offset as usize..block_offset as usize + copy_len]); + buf_offset += copy_len; + storage_pos += copy_len as u64; + } + } + + pub fn write(&mut self, pos: u64, buf: &[u8]) { + let mut buf_offset = 0; + let mut storage_pos = pos; + while buf_offset < buf.len() { + let block_id = storage_pos / 
BLOCK_SIZE as u64; + let block = self.blocks.entry(block_id).or_insert([0; BLOCK_SIZE]); + let block_offset = storage_pos % BLOCK_SIZE as u64; + let block_len = BLOCK_SIZE as u64 - block_offset; + let buf_len = buf.len() - buf_offset; + let copy_len = std::cmp::min(block_len as usize, buf_len); + block[block_offset as usize..block_offset as usize + copy_len] + .copy_from_slice(&buf[buf_offset..buf_offset + copy_len]); + buf_offset += copy_len; + storage_pos += copy_len as u64 + } + } +} diff --git a/safekeeper/tests/walproposer_sim/log.rs b/safekeeper/tests/walproposer_sim/log.rs new file mode 100644 index 0000000000..870f30de4f --- /dev/null +++ b/safekeeper/tests/walproposer_sim/log.rs @@ -0,0 +1,77 @@ +use std::{fmt, sync::Arc}; + +use desim::time::Timing; +use once_cell::sync::OnceCell; +use parking_lot::Mutex; +use tracing_subscriber::fmt::{format::Writer, time::FormatTime}; + +/// SimClock can be plugged into tracing logger to print simulation time. +#[derive(Clone)] +pub struct SimClock { + clock_ptr: Arc>>>, +} + +impl Default for SimClock { + fn default() -> Self { + SimClock { + clock_ptr: Arc::new(Mutex::new(None)), + } + } +} + +impl SimClock { + pub fn set_clock(&self, clock: Arc) { + *self.clock_ptr.lock() = Some(clock); + } +} + +impl FormatTime for SimClock { + fn format_time(&self, w: &mut Writer<'_>) -> fmt::Result { + let clock = self.clock_ptr.lock(); + + if let Some(clock) = clock.as_ref() { + let now = clock.now(); + write!(w, "[{}]", now) + } else { + write!(w, "[?]") + } + } +} + +static LOGGING_DONE: OnceCell = OnceCell::new(); + +/// Returns ptr to clocks attached to tracing logger to update them when the +/// world is (re)created. 
+pub fn init_tracing_logger(debug_enabled: bool) -> SimClock { + LOGGING_DONE + .get_or_init(|| { + let clock = SimClock::default(); + let base_logger = tracing_subscriber::fmt() + .with_target(false) + // prefix log lines with simulated time timestamp + .with_timer(clock.clone()) + // .with_ansi(true) TODO + .with_max_level(match debug_enabled { + true => tracing::Level::DEBUG, + false => tracing::Level::WARN, + }) + .with_writer(std::io::stdout); + base_logger.init(); + + // logging::replace_panic_hook_with_tracing_panic_hook().forget(); + + if !debug_enabled { + std::panic::set_hook(Box::new(|_| {})); + } + + clock + }) + .clone() +} + +pub fn init_logger() -> SimClock { + // RUST_TRACEBACK envvar controls whether we print all logs or only warnings. + let debug_enabled = std::env::var("RUST_TRACEBACK").is_ok(); + + init_tracing_logger(debug_enabled) +} diff --git a/safekeeper/tests/walproposer_sim/mod.rs b/safekeeper/tests/walproposer_sim/mod.rs new file mode 100644 index 0000000000..ec560dcb3b --- /dev/null +++ b/safekeeper/tests/walproposer_sim/mod.rs @@ -0,0 +1,8 @@ +pub mod block_storage; +pub mod log; +pub mod safekeeper; +pub mod safekeeper_disk; +pub mod simulation; +pub mod simulation_logs; +pub mod walproposer_api; +pub mod walproposer_disk; diff --git a/safekeeper/tests/walproposer_sim/safekeeper.rs b/safekeeper/tests/walproposer_sim/safekeeper.rs new file mode 100644 index 0000000000..e3aaf5d391 --- /dev/null +++ b/safekeeper/tests/walproposer_sim/safekeeper.rs @@ -0,0 +1,411 @@ +//! Safekeeper communication endpoint to WAL proposer (compute node). +//! Gets messages from the network, passes them down to consensus module and +//! sends replies back. 
+ +use std::{collections::HashMap, sync::Arc, time::Duration}; + +use anyhow::{bail, Result}; +use bytes::{Bytes, BytesMut}; +use camino::Utf8PathBuf; +use desim::{ + executor::{self, PollSome}, + network::TCP, + node_os::NodeOs, + proto::{AnyMessage, NetEvent, NodeEvent}, +}; +use hyper::Uri; +use safekeeper::{ + safekeeper::{ProposerAcceptorMessage, SafeKeeper, ServerInfo, UNKNOWN_SERVER_VERSION}, + state::TimelinePersistentState, + timeline::TimelineError, + wal_storage::Storage, + SafeKeeperConf, +}; +use tracing::{debug, info_span}; +use utils::{ + id::{NodeId, TenantId, TenantTimelineId, TimelineId}, + lsn::Lsn, +}; + +use super::safekeeper_disk::{DiskStateStorage, DiskWALStorage, SafekeeperDisk, TimelineDisk}; + +struct SharedState { + sk: SafeKeeper, + disk: Arc, +} + +struct GlobalMap { + timelines: HashMap, + conf: SafeKeeperConf, + disk: Arc, +} + +impl GlobalMap { + /// Restores global state from disk. + fn new(disk: Arc, conf: SafeKeeperConf) -> Result { + let mut timelines = HashMap::new(); + + for (&ttid, disk) in disk.timelines.lock().iter() { + debug!("loading timeline {}", ttid); + let state = disk.state.lock().clone(); + + if state.server.wal_seg_size == 0 { + bail!(TimelineError::UninitializedWalSegSize(ttid)); + } + + if state.server.pg_version == UNKNOWN_SERVER_VERSION { + bail!(TimelineError::UninitialinzedPgVersion(ttid)); + } + + if state.commit_lsn < state.local_start_lsn { + bail!( + "commit_lsn {} is higher than local_start_lsn {}", + state.commit_lsn, + state.local_start_lsn + ); + } + + let control_store = DiskStateStorage::new(disk.clone()); + let wal_store = DiskWALStorage::new(disk.clone(), &control_store)?; + + let sk = SafeKeeper::new(control_store, wal_store, conf.my_id)?; + timelines.insert( + ttid, + SharedState { + sk, + disk: disk.clone(), + }, + ); + } + + Ok(Self { + timelines, + conf, + disk, + }) + } + + fn create(&mut self, ttid: TenantTimelineId, server_info: ServerInfo) -> Result<()> { + if 
self.timelines.contains_key(&ttid) { + bail!("timeline {} already exists", ttid); + } + + debug!("creating new timeline {}", ttid); + + let commit_lsn = Lsn::INVALID; + let local_start_lsn = Lsn::INVALID; + + let state = + TimelinePersistentState::new(&ttid, server_info, vec![], commit_lsn, local_start_lsn); + + if state.server.wal_seg_size == 0 { + bail!(TimelineError::UninitializedWalSegSize(ttid)); + } + + if state.server.pg_version == UNKNOWN_SERVER_VERSION { + bail!(TimelineError::UninitialinzedPgVersion(ttid)); + } + + if state.commit_lsn < state.local_start_lsn { + bail!( + "commit_lsn {} is higher than local_start_lsn {}", + state.commit_lsn, + state.local_start_lsn + ); + } + + let disk_timeline = self.disk.put_state(&ttid, state); + let control_store = DiskStateStorage::new(disk_timeline.clone()); + let wal_store = DiskWALStorage::new(disk_timeline.clone(), &control_store)?; + + let sk = SafeKeeper::new(control_store, wal_store, self.conf.my_id)?; + + self.timelines.insert( + ttid, + SharedState { + sk, + disk: disk_timeline, + }, + ); + Ok(()) + } + + fn get(&mut self, ttid: &TenantTimelineId) -> &mut SharedState { + self.timelines.get_mut(ttid).expect("timeline must exist") + } + + fn has_tli(&self, ttid: &TenantTimelineId) -> bool { + self.timelines.contains_key(ttid) + } +} + +/// State of a single connection to walproposer. 
+struct ConnState { + tcp: TCP, + + greeting: bool, + ttid: TenantTimelineId, + flush_pending: bool, + + runtime: tokio::runtime::Runtime, +} + +pub fn run_server(os: NodeOs, disk: Arc) -> Result<()> { + let _enter = info_span!("safekeeper", id = os.id()).entered(); + debug!("started server"); + os.log_event("started;safekeeper".to_owned()); + let conf = SafeKeeperConf { + workdir: Utf8PathBuf::from("."), + my_id: NodeId(os.id() as u64), + listen_pg_addr: String::new(), + listen_http_addr: String::new(), + no_sync: false, + broker_endpoint: "/".parse::().unwrap(), + broker_keepalive_interval: Duration::from_secs(0), + heartbeat_timeout: Duration::from_secs(0), + remote_storage: None, + max_offloader_lag_bytes: 0, + wal_backup_enabled: false, + listen_pg_addr_tenant_only: None, + advertise_pg_addr: None, + availability_zone: None, + peer_recovery_enabled: false, + backup_parallel_jobs: 0, + pg_auth: None, + pg_tenant_only_auth: None, + http_auth: None, + current_thread_runtime: false, + walsenders_keep_horizon: false, + }; + + let mut global = GlobalMap::new(disk, conf.clone())?; + let mut conns: HashMap = HashMap::new(); + + for (&_ttid, shared_state) in global.timelines.iter_mut() { + let flush_lsn = shared_state.sk.wal_store.flush_lsn(); + let commit_lsn = shared_state.sk.state.commit_lsn; + os.log_event(format!("tli_loaded;{};{}", flush_lsn.0, commit_lsn.0)); + } + + let node_events = os.node_events(); + let mut epoll_vec: Vec> = vec![]; + let mut epoll_idx: Vec = vec![]; + + // TODO: batch events processing (multiple events per tick) + loop { + epoll_vec.clear(); + epoll_idx.clear(); + + // node events channel + epoll_vec.push(Box::new(node_events.clone())); + epoll_idx.push(0); + + // tcp connections + for conn in conns.values() { + epoll_vec.push(Box::new(conn.tcp.recv_chan())); + epoll_idx.push(conn.tcp.connection_id()); + } + + // waiting for the next message + let index = executor::epoll_chans(&epoll_vec, -1).unwrap(); + + if index == 0 { + // got a new 
connection + match node_events.must_recv() { + NodeEvent::Accept(tcp) => { + conns.insert( + tcp.connection_id(), + ConnState { + tcp, + greeting: false, + ttid: TenantTimelineId::empty(), + flush_pending: false, + runtime: tokio::runtime::Builder::new_current_thread().build()?, + }, + ); + } + NodeEvent::Internal(_) => unreachable!(), + } + continue; + } + + let connection_id = epoll_idx[index]; + let conn = conns.get_mut(&connection_id).unwrap(); + let mut next_event = Some(conn.tcp.recv_chan().must_recv()); + + loop { + let event = match next_event { + Some(event) => event, + None => break, + }; + + match event { + NetEvent::Message(msg) => { + let res = conn.process_any(msg, &mut global); + if res.is_err() { + debug!("conn {:?} error: {:#}", connection_id, res.unwrap_err()); + conns.remove(&connection_id); + break; + } + } + NetEvent::Closed => { + // TODO: remove from conns? + } + } + + next_event = conn.tcp.recv_chan().try_recv(); + } + + conns.retain(|_, conn| { + let res = conn.flush(&mut global); + if res.is_err() { + debug!("conn {:?} error: {:?}", conn.tcp, res); + } + res.is_ok() + }); + } +} + +impl ConnState { + /// Process a message from the network. It can be START_REPLICATION request or a valid ProposerAcceptorMessage message. + fn process_any(&mut self, any: AnyMessage, global: &mut GlobalMap) -> Result<()> { + if let AnyMessage::Bytes(copy_data) = any { + let repl_prefix = b"START_REPLICATION "; + if !self.greeting && copy_data.starts_with(repl_prefix) { + self.process_start_replication(copy_data.slice(repl_prefix.len()..), global)?; + bail!("finished processing START_REPLICATION") + } + + let msg = ProposerAcceptorMessage::parse(copy_data)?; + debug!("got msg: {:?}", msg); + self.process(msg, global) + } else { + bail!("unexpected message, expected AnyMessage::Bytes"); + } + } + + /// Process START_REPLICATION request. 
+ fn process_start_replication( + &mut self, + copy_data: Bytes, + global: &mut GlobalMap, + ) -> Result<()> { + // format is " " + let str = String::from_utf8(copy_data.to_vec())?; + + let mut parts = str.split(' '); + let tenant_id = parts.next().unwrap().parse::()?; + let timeline_id = parts.next().unwrap().parse::()?; + let start_lsn = parts.next().unwrap().parse::()?; + let end_lsn = parts.next().unwrap().parse::()?; + + let ttid = TenantTimelineId::new(tenant_id, timeline_id); + let shared_state = global.get(&ttid); + + // read bytes from start_lsn to end_lsn + let mut buf = vec![0; (end_lsn - start_lsn) as usize]; + shared_state.disk.wal.lock().read(start_lsn, &mut buf); + + // send bytes to the client + self.tcp.send(AnyMessage::Bytes(Bytes::from(buf))); + Ok(()) + } + + /// Get or create a timeline. + fn init_timeline( + &mut self, + ttid: TenantTimelineId, + server_info: ServerInfo, + global: &mut GlobalMap, + ) -> Result<()> { + self.ttid = ttid; + if global.has_tli(&ttid) { + return Ok(()); + } + + global.create(ttid, server_info) + } + + /// Process a ProposerAcceptorMessage. + fn process(&mut self, msg: ProposerAcceptorMessage, global: &mut GlobalMap) -> Result<()> { + if !self.greeting { + self.greeting = true; + + match msg { + ProposerAcceptorMessage::Greeting(ref greeting) => { + tracing::info!( + "start handshake with walproposer {:?} {:?}", + self.tcp, + greeting + ); + let server_info = ServerInfo { + pg_version: greeting.pg_version, + system_id: greeting.system_id, + wal_seg_size: greeting.wal_seg_size, + }; + let ttid = TenantTimelineId::new(greeting.tenant_id, greeting.timeline_id); + self.init_timeline(ttid, server_info, global)? 
+ } + _ => { + bail!("unexpected message {msg:?} instead of greeting"); + } + } + } + + let tli = global.get(&self.ttid); + + match msg { + ProposerAcceptorMessage::AppendRequest(append_request) => { + self.flush_pending = true; + self.process_sk_msg( + tli, + &ProposerAcceptorMessage::NoFlushAppendRequest(append_request), + )?; + } + other => { + self.process_sk_msg(tli, &other)?; + } + } + + Ok(()) + } + + /// Process FlushWAL if needed. + fn flush(&mut self, global: &mut GlobalMap) -> Result<()> { + // TODO: try to add extra flushes in simulation, to verify that extra flushes don't break anything + if !self.flush_pending { + return Ok(()); + } + self.flush_pending = false; + let shared_state = global.get(&self.ttid); + self.process_sk_msg(shared_state, &ProposerAcceptorMessage::FlushWAL) + } + + /// Make safekeeper process a message and send a reply to the TCP + fn process_sk_msg( + &mut self, + shared_state: &mut SharedState, + msg: &ProposerAcceptorMessage, + ) -> Result<()> { + let mut reply = self.runtime.block_on(shared_state.sk.process_msg(msg))?; + if let Some(reply) = &mut reply { + // TODO: if this is AppendResponse, fill in proper hot standby feedback and disk consistent lsn + + let mut buf = BytesMut::with_capacity(128); + reply.serialize(&mut buf)?; + + self.tcp.send(AnyMessage::Bytes(buf.into())); + } + Ok(()) + } +} + +impl Drop for ConnState { + fn drop(&mut self) { + debug!("dropping conn: {:?}", self.tcp); + if !std::thread::panicking() { + self.tcp.close(); + } + // TODO: clean up non-fsynced WAL + } +} diff --git a/safekeeper/tests/walproposer_sim/safekeeper_disk.rs b/safekeeper/tests/walproposer_sim/safekeeper_disk.rs new file mode 100644 index 0000000000..35bca325aa --- /dev/null +++ b/safekeeper/tests/walproposer_sim/safekeeper_disk.rs @@ -0,0 +1,278 @@ +use std::collections::HashMap; +use std::sync::Arc; + +use parking_lot::Mutex; +use safekeeper::state::TimelinePersistentState; +use utils::id::TenantTimelineId; + +use 
super::block_storage::BlockStorage; + +use std::{ops::Deref, time::Instant}; + +use anyhow::Result; +use bytes::{Buf, BytesMut}; +use futures::future::BoxFuture; +use postgres_ffi::{waldecoder::WalStreamDecoder, XLogSegNo}; +use safekeeper::{control_file, metrics::WalStorageMetrics, wal_storage}; +use tracing::{debug, info}; +use utils::lsn::Lsn; + +/// All safekeeper state that is usually saved to disk. +pub struct SafekeeperDisk { + pub timelines: Mutex>>, +} + +impl Default for SafekeeperDisk { + fn default() -> Self { + Self::new() + } +} + +impl SafekeeperDisk { + pub fn new() -> Self { + SafekeeperDisk { + timelines: Mutex::new(HashMap::new()), + } + } + + pub fn put_state( + &self, + ttid: &TenantTimelineId, + state: TimelinePersistentState, + ) -> Arc { + self.timelines + .lock() + .entry(*ttid) + .and_modify(|e| { + let mut mu = e.state.lock(); + *mu = state.clone(); + }) + .or_insert_with(|| { + Arc::new(TimelineDisk { + state: Mutex::new(state), + wal: Mutex::new(BlockStorage::new()), + }) + }) + .clone() + } +} + +/// Control file state and WAL storage. +pub struct TimelineDisk { + pub state: Mutex, + pub wal: Mutex, +} + +/// Implementation of `control_file::Storage` trait. +pub struct DiskStateStorage { + persisted_state: TimelinePersistentState, + disk: Arc, + last_persist_at: Instant, +} + +impl DiskStateStorage { + pub fn new(disk: Arc) -> Self { + let guard = disk.state.lock(); + let state = guard.clone(); + drop(guard); + DiskStateStorage { + persisted_state: state, + disk, + last_persist_at: Instant::now(), + } + } +} + +#[async_trait::async_trait] +impl control_file::Storage for DiskStateStorage { + /// Persist safekeeper state on disk and update internal state. + async fn persist(&mut self, s: &TimelinePersistentState) -> Result<()> { + self.persisted_state = s.clone(); + *self.disk.state.lock() = s.clone(); + Ok(()) + } + + /// Timestamp of last persist. 
+ fn last_persist_at(&self) -> Instant { + // TODO: don't rely on it in tests + self.last_persist_at + } +} + +impl Deref for DiskStateStorage { + type Target = TimelinePersistentState; + + fn deref(&self) -> &Self::Target { + &self.persisted_state + } +} + +/// Implementation of `wal_storage::Storage` trait. +pub struct DiskWALStorage { + /// Written to disk, but possibly still in the cache and not fully persisted. + /// Also can be ahead of record_lsn, if happen to be in the middle of a WAL record. + write_lsn: Lsn, + + /// The LSN of the last WAL record written to disk. Still can be not fully flushed. + write_record_lsn: Lsn, + + /// The LSN of the last WAL record flushed to disk. + flush_record_lsn: Lsn, + + /// Decoder is required for detecting boundaries of WAL records. + decoder: WalStreamDecoder, + + /// Bytes of WAL records that are not yet written to disk. + unflushed_bytes: BytesMut, + + /// Contains BlockStorage for WAL. + disk: Arc, +} + +impl DiskWALStorage { + pub fn new(disk: Arc, state: &TimelinePersistentState) -> Result { + let write_lsn = if state.commit_lsn == Lsn(0) { + Lsn(0) + } else { + Self::find_end_of_wal(disk.clone(), state.commit_lsn)? 
+ }; + + let flush_lsn = write_lsn; + Ok(DiskWALStorage { + write_lsn, + write_record_lsn: flush_lsn, + flush_record_lsn: flush_lsn, + decoder: WalStreamDecoder::new(flush_lsn, 16), + unflushed_bytes: BytesMut::new(), + disk, + }) + } + + fn find_end_of_wal(disk: Arc, start_lsn: Lsn) -> Result { + let mut buf = [0; 8192]; + let mut pos = start_lsn.0; + let mut decoder = WalStreamDecoder::new(start_lsn, 16); + let mut result = start_lsn; + loop { + disk.wal.lock().read(pos, &mut buf); + pos += buf.len() as u64; + decoder.feed_bytes(&buf); + + loop { + match decoder.poll_decode() { + Ok(Some(record)) => result = record.0, + Err(e) => { + debug!( + "find_end_of_wal reached end at {:?}, decode error: {:?}", + result, e + ); + return Ok(result); + } + Ok(None) => break, // need more data + } + } + } + } +} + +#[async_trait::async_trait] +impl wal_storage::Storage for DiskWALStorage { + /// LSN of last durably stored WAL record. + fn flush_lsn(&self) -> Lsn { + self.flush_record_lsn + } + + /// Write piece of WAL from buf to disk, but not necessarily sync it. + async fn write_wal(&mut self, startpos: Lsn, buf: &[u8]) -> Result<()> { + if self.write_lsn != startpos { + panic!("write_wal called with wrong startpos"); + } + + self.unflushed_bytes.extend_from_slice(buf); + self.write_lsn += buf.len() as u64; + + if self.decoder.available() != startpos { + info!( + "restart decoder from {} to {}", + self.decoder.available(), + startpos, + ); + self.decoder = WalStreamDecoder::new(startpos, 16); + } + self.decoder.feed_bytes(buf); + loop { + match self.decoder.poll_decode()? { + None => break, // no full record yet + Some((lsn, _rec)) => { + self.write_record_lsn = lsn; + } + } + } + + Ok(()) + } + + /// Truncate WAL at specified LSN, which must be the end of WAL record. 
+ async fn truncate_wal(&mut self, end_pos: Lsn) -> Result<()> { + if self.write_lsn != Lsn(0) && end_pos > self.write_lsn { + panic!( + "truncate_wal called on non-written WAL, write_lsn={}, end_pos={}", + self.write_lsn, end_pos + ); + } + + self.flush_wal().await?; + + // write zeroes to disk from end_pos until self.write_lsn + let buf = [0; 8192]; + let mut pos = end_pos.0; + while pos < self.write_lsn.0 { + self.disk.wal.lock().write(pos, &buf); + pos += buf.len() as u64; + } + + self.write_lsn = end_pos; + self.write_record_lsn = end_pos; + self.flush_record_lsn = end_pos; + self.unflushed_bytes.clear(); + self.decoder = WalStreamDecoder::new(end_pos, 16); + + Ok(()) + } + + /// Durably store WAL on disk, up to the last written WAL record. + async fn flush_wal(&mut self) -> Result<()> { + if self.flush_record_lsn == self.write_record_lsn { + // no need to do extra flush + return Ok(()); + } + + let num_bytes = self.write_record_lsn.0 - self.flush_record_lsn.0; + + self.disk.wal.lock().write( + self.flush_record_lsn.0, + &self.unflushed_bytes[..num_bytes as usize], + ); + self.unflushed_bytes.advance(num_bytes as usize); + self.flush_record_lsn = self.write_record_lsn; + + Ok(()) + } + + /// Remove all segments <= given segno. Returns function doing that as we + /// want to perform it without timeline lock. + fn remove_up_to(&self, _segno_up_to: XLogSegNo) -> BoxFuture<'static, anyhow::Result<()>> { + Box::pin(async move { Ok(()) }) + } + + /// Release resources associated with the storage -- technically, close FDs. + /// Currently we don't remove timelines until restart (#3146), so need to + /// spare descriptors. This would be useful for temporary tli detach as + /// well. + fn close(&mut self) {} + + /// Get metrics for this timeline. 
+ fn get_metrics(&self) -> WalStorageMetrics { + WalStorageMetrics::default() + } +} diff --git a/safekeeper/tests/walproposer_sim/simulation.rs b/safekeeper/tests/walproposer_sim/simulation.rs new file mode 100644 index 0000000000..0d7aaf517b --- /dev/null +++ b/safekeeper/tests/walproposer_sim/simulation.rs @@ -0,0 +1,436 @@ +use std::{cell::Cell, str::FromStr, sync::Arc}; + +use crate::walproposer_sim::{safekeeper::run_server, walproposer_api::SimulationApi}; +use desim::{ + executor::{self, ExternalHandle}, + node_os::NodeOs, + options::{Delay, NetworkOptions}, + proto::{AnyMessage, NodeEvent}, + world::Node, + world::World, +}; +use rand::{Rng, SeedableRng}; +use tracing::{debug, info_span, warn}; +use utils::{id::TenantTimelineId, lsn::Lsn}; +use walproposer::walproposer::{Config, Wrapper}; + +use super::{ + log::SimClock, safekeeper_disk::SafekeeperDisk, walproposer_api, + walproposer_disk::DiskWalProposer, +}; + +/// Simulated safekeeper node. +pub struct SafekeeperNode { + pub node: Arc, + pub id: u32, + pub disk: Arc, + pub thread: Cell, +} + +impl SafekeeperNode { + /// Create and start a safekeeper at the specified Node. + pub fn new(node: Arc) -> Self { + let disk = Arc::new(SafekeeperDisk::new()); + let thread = Cell::new(SafekeeperNode::launch(disk.clone(), node.clone())); + + Self { + id: node.id, + node, + disk, + thread, + } + } + + fn launch(disk: Arc, node: Arc) -> ExternalHandle { + // start the server thread + node.launch(move |os| { + run_server(os, disk).expect("server should finish without errors"); + }) + } + + /// Restart the safekeeper. + pub fn restart(&self) { + let new_thread = SafekeeperNode::launch(self.disk.clone(), self.node.clone()); + let old_thread = self.thread.replace(new_thread); + old_thread.crash_stop(); + } +} + +/// Simulated walproposer node. +pub struct WalProposer { + thread: ExternalHandle, + node: Arc, + disk: Arc, + sync_safekeepers: bool, +} + +impl WalProposer { + /// Generic start function for both modes. 
+ fn start( + os: NodeOs, + disk: Arc, + ttid: TenantTimelineId, + addrs: Vec, + lsn: Option, + ) { + let sync_safekeepers = lsn.is_none(); + + let _enter = if sync_safekeepers { + info_span!("sync", started = executor::now()).entered() + } else { + info_span!("walproposer", started = executor::now()).entered() + }; + + os.log_event(format!("started;walproposer;{}", sync_safekeepers as i32)); + + let config = Config { + ttid, + safekeepers_list: addrs, + safekeeper_reconnect_timeout: 1000, + safekeeper_connection_timeout: 5000, + sync_safekeepers, + }; + let args = walproposer_api::Args { + os, + config: config.clone(), + disk, + redo_start_lsn: lsn, + }; + let api = SimulationApi::new(args); + let wp = Wrapper::new(Box::new(api), config); + wp.start(); + } + + /// Start walproposer in a sync_safekeepers mode. + pub fn launch_sync(ttid: TenantTimelineId, addrs: Vec, node: Arc) -> Self { + debug!("sync_safekeepers started at node {}", node.id); + let disk = DiskWalProposer::new(); + let disk_wp = disk.clone(); + + // start the client thread + let handle = node.launch(move |os| { + WalProposer::start(os, disk_wp, ttid, addrs, None); + }); + + Self { + thread: handle, + node, + disk, + sync_safekeepers: true, + } + } + + /// Start walproposer in a normal mode. 
+ pub fn launch_walproposer( + ttid: TenantTimelineId, + addrs: Vec, + node: Arc, + lsn: Lsn, + ) -> Self { + debug!("walproposer started at node {}", node.id); + let disk = DiskWalProposer::new(); + disk.lock().reset_to(lsn); + let disk_wp = disk.clone(); + + // start the client thread + let handle = node.launch(move |os| { + WalProposer::start(os, disk_wp, ttid, addrs, Some(lsn)); + }); + + Self { + thread: handle, + node, + disk, + sync_safekeepers: false, + } + } + + pub fn write_tx(&mut self, cnt: usize) { + let start_lsn = self.disk.lock().flush_rec_ptr(); + + for _ in 0..cnt { + self.disk + .lock() + .insert_logical_message("prefix", b"message") + .expect("failed to generate logical message"); + } + + let end_lsn = self.disk.lock().flush_rec_ptr(); + + // log event + self.node + .log_event(format!("write_wal;{};{};{}", start_lsn.0, end_lsn.0, cnt)); + + // now we need to set "Latch" in walproposer + self.node + .node_events() + .send(NodeEvent::Internal(AnyMessage::Just32(0))); + } + + pub fn stop(&self) { + self.thread.crash_stop(); + } +} + +/// Holds basic simulation settings, such as network options. +pub struct TestConfig { + pub network: NetworkOptions, + pub timeout: u64, + pub clock: Option, +} + +impl TestConfig { + /// Create a new TestConfig with default settings. + pub fn new(clock: Option) -> Self { + Self { + network: NetworkOptions { + keepalive_timeout: Some(2000), + connect_delay: Delay { + min: 1, + max: 5, + fail_prob: 0.0, + }, + send_delay: Delay { + min: 1, + max: 5, + fail_prob: 0.0, + }, + }, + timeout: 1_000 * 10, + clock, + } + } + + /// Start a new simulation with the specified seed. 
+ pub fn start(&self, seed: u64) -> Test { + let world = Arc::new(World::new(seed, Arc::new(self.network.clone()))); + + if let Some(clock) = &self.clock { + clock.set_clock(world.clock()); + } + + let servers = [ + SafekeeperNode::new(world.new_node()), + SafekeeperNode::new(world.new_node()), + SafekeeperNode::new(world.new_node()), + ]; + + let server_ids = [servers[0].id, servers[1].id, servers[2].id]; + let safekeepers_addrs = server_ids.map(|id| format!("node:{}", id)).to_vec(); + + let ttid = TenantTimelineId::generate(); + + Test { + world, + servers, + sk_list: safekeepers_addrs, + ttid, + timeout: self.timeout, + } + } +} + +/// Holds simulation state. +pub struct Test { + pub world: Arc, + pub servers: [SafekeeperNode; 3], + pub sk_list: Vec, + pub ttid: TenantTimelineId, + pub timeout: u64, +} + +impl Test { + /// Start a sync_safekeepers thread and wait for it to finish. + pub fn sync_safekeepers(&self) -> anyhow::Result { + let wp = self.launch_sync_safekeepers(); + + // poll until exit or timeout + let time_limit = self.timeout; + while self.world.step() && self.world.now() < time_limit && !wp.thread.is_finished() {} + + if !wp.thread.is_finished() { + anyhow::bail!("timeout or idle stuck"); + } + + let res = wp.thread.result(); + if res.0 != 0 { + anyhow::bail!("non-zero exitcode: {:?}", res); + } + let lsn = Lsn::from_str(&res.1)?; + Ok(lsn) + } + + /// Spawn a new sync_safekeepers thread. + pub fn launch_sync_safekeepers(&self) -> WalProposer { + WalProposer::launch_sync(self.ttid, self.sk_list.clone(), self.world.new_node()) + } + + /// Spawn a new walproposer thread. + pub fn launch_walproposer(&self, lsn: Lsn) -> WalProposer { + let lsn = if lsn.0 == 0 { + // usual LSN after basebackup + Lsn(21623024) + } else { + lsn + }; + + WalProposer::launch_walproposer(self.ttid, self.sk_list.clone(), self.world.new_node(), lsn) + } + + /// Execute the simulation for the specified duration. 
+ pub fn poll_for_duration(&self, duration: u64) { + let time_limit = std::cmp::min(self.world.now() + duration, self.timeout); + while self.world.step() && self.world.now() < time_limit {} + } + + /// Execute the simulation together with events defined in some schedule. + pub fn run_schedule(&self, schedule: &Schedule) -> anyhow::Result<()> { + // scheduling empty events so that world will stop in those points + { + let clock = self.world.clock(); + + let now = self.world.now(); + for (time, _) in schedule { + if *time < now { + continue; + } + clock.schedule_fake(*time - now); + } + } + + let mut wp = self.launch_sync_safekeepers(); + + let mut skipped_tx = 0; + let mut started_tx = 0; + + let mut schedule_ptr = 0; + + loop { + if wp.sync_safekeepers && wp.thread.is_finished() { + let res = wp.thread.result(); + if res.0 != 0 { + warn!("sync non-zero exitcode: {:?}", res); + debug!("restarting sync_safekeepers"); + // restart the sync_safekeepers + wp = self.launch_sync_safekeepers(); + continue; + } + let lsn = Lsn::from_str(&res.1)?; + debug!("sync_safekeepers finished at LSN {}", lsn); + wp = self.launch_walproposer(lsn); + debug!("walproposer started at thread {}", wp.thread.id()); + } + + let now = self.world.now(); + while schedule_ptr < schedule.len() && schedule[schedule_ptr].0 <= now { + if now != schedule[schedule_ptr].0 { + warn!("skipped event {:?} at {}", schedule[schedule_ptr], now); + } + + let action = &schedule[schedule_ptr].1; + match action { + TestAction::WriteTx(size) => { + if !wp.sync_safekeepers && !wp.thread.is_finished() { + started_tx += *size; + wp.write_tx(*size); + debug!("written {} transactions", size); + } else { + skipped_tx += size; + debug!("skipped {} transactions", size); + } + } + TestAction::RestartSafekeeper(id) => { + debug!("restarting safekeeper {}", id); + self.servers[*id].restart(); + } + TestAction::RestartWalProposer => { + debug!("restarting sync_safekeepers"); + wp.stop(); + wp = self.launch_sync_safekeepers(); + 
} + } + schedule_ptr += 1; + } + + if schedule_ptr == schedule.len() { + break; + } + let next_event_time = schedule[schedule_ptr].0; + + // poll until the next event + if wp.thread.is_finished() { + while self.world.step() && self.world.now() < next_event_time {} + } else { + while self.world.step() + && self.world.now() < next_event_time + && !wp.thread.is_finished() + {} + } + } + + debug!( + "finished schedule, total steps: {}", + self.world.get_thread_step_count() + ); + debug!("skipped_tx: {}", skipped_tx); + debug!("started_tx: {}", started_tx); + + Ok(()) + } +} + +#[derive(Debug, Clone)] +pub enum TestAction { + WriteTx(usize), + RestartSafekeeper(usize), + RestartWalProposer, +} + +pub type Schedule = Vec<(u64, TestAction)>; + +pub fn generate_schedule(seed: u64) -> Schedule { + let mut rng = rand::rngs::StdRng::seed_from_u64(seed); + let mut schedule = Vec::new(); + let mut time = 0; + + let cnt = rng.gen_range(1..100); + + for _ in 0..cnt { + time += rng.gen_range(0..500); + let action = match rng.gen_range(0..3) { + 0 => TestAction::WriteTx(rng.gen_range(1..10)), + 1 => TestAction::RestartSafekeeper(rng.gen_range(0..3)), + 2 => TestAction::RestartWalProposer, + _ => unreachable!(), + }; + schedule.push((time, action)); + } + + schedule +} + +pub fn generate_network_opts(seed: u64) -> NetworkOptions { + let mut rng = rand::rngs::StdRng::seed_from_u64(seed); + + let timeout = rng.gen_range(100..2000); + let max_delay = rng.gen_range(1..2 * timeout); + let min_delay = rng.gen_range(1..=max_delay); + + let max_fail_prob = rng.gen_range(0.0..0.9); + let connect_fail_prob = rng.gen_range(0.0..max_fail_prob); + let send_fail_prob = rng.gen_range(0.0..connect_fail_prob); + + NetworkOptions { + keepalive_timeout: Some(timeout), + connect_delay: Delay { + min: min_delay, + max: max_delay, + fail_prob: connect_fail_prob, + }, + send_delay: Delay { + min: min_delay, + max: max_delay, + fail_prob: send_fail_prob, + }, + } +} diff --git 
a/safekeeper/tests/walproposer_sim/simulation_logs.rs b/safekeeper/tests/walproposer_sim/simulation_logs.rs new file mode 100644 index 0000000000..38885e5dd0 --- /dev/null +++ b/safekeeper/tests/walproposer_sim/simulation_logs.rs @@ -0,0 +1,187 @@ +use desim::proto::SimEvent; +use tracing::debug; + +#[derive(Debug, Clone, PartialEq, Eq)] +enum NodeKind { + Unknown, + Safekeeper, + WalProposer, +} + +impl Default for NodeKind { + fn default() -> Self { + Self::Unknown + } +} + +/// Simulation state of walproposer/safekeeper, derived from the simulation logs. +#[derive(Clone, Debug, Default)] +struct NodeInfo { + kind: NodeKind, + + // walproposer + is_sync: bool, + term: u64, + epoch_lsn: u64, + + // safekeeper + commit_lsn: u64, + flush_lsn: u64, +} + +impl NodeInfo { + fn init_kind(&mut self, kind: NodeKind) { + if self.kind == NodeKind::Unknown { + self.kind = kind; + } else { + assert!(self.kind == kind); + } + } + + fn started(&mut self, data: &str) { + let mut parts = data.split(';'); + assert!(parts.next().unwrap() == "started"); + match parts.next().unwrap() { + "safekeeper" => { + self.init_kind(NodeKind::Safekeeper); + } + "walproposer" => { + self.init_kind(NodeKind::WalProposer); + let is_sync: u8 = parts.next().unwrap().parse().unwrap(); + self.is_sync = is_sync != 0; + } + _ => unreachable!(), + } + } +} + +/// Global state of the simulation, derived from the simulation logs. +#[derive(Debug, Default)] +struct GlobalState { + nodes: Vec, + commit_lsn: u64, + write_lsn: u64, + max_write_lsn: u64, + + written_wal: u64, + written_records: u64, +} + +impl GlobalState { + fn new() -> Self { + Default::default() + } + + fn get(&mut self, id: u32) -> &mut NodeInfo { + let id = id as usize; + if id >= self.nodes.len() { + self.nodes.resize(id + 1, NodeInfo::default()); + } + &mut self.nodes[id] + } +} + +/// Try to find inconsistencies in the simulation log. 
+pub fn validate_events(events: Vec) { + const INITDB_LSN: u64 = 21623024; + + let hook = std::panic::take_hook(); + scopeguard::defer_on_success! { + std::panic::set_hook(hook); + }; + + let mut state = GlobalState::new(); + state.max_write_lsn = INITDB_LSN; + + for event in events { + debug!("{:?}", event); + + let node = state.get(event.node); + if event.data.starts_with("started;") { + node.started(&event.data); + continue; + } + assert!(node.kind != NodeKind::Unknown); + + // drop reference to unlock state + let mut node = node.clone(); + + let mut parts = event.data.split(';'); + match node.kind { + NodeKind::Safekeeper => match parts.next().unwrap() { + "tli_loaded" => { + let flush_lsn: u64 = parts.next().unwrap().parse().unwrap(); + let commit_lsn: u64 = parts.next().unwrap().parse().unwrap(); + node.flush_lsn = flush_lsn; + node.commit_lsn = commit_lsn; + } + _ => unreachable!(), + }, + NodeKind::WalProposer => { + match parts.next().unwrap() { + "prop_elected" => { + let prop_lsn: u64 = parts.next().unwrap().parse().unwrap(); + let prop_term: u64 = parts.next().unwrap().parse().unwrap(); + let prev_lsn: u64 = parts.next().unwrap().parse().unwrap(); + let prev_term: u64 = parts.next().unwrap().parse().unwrap(); + + assert!(prop_lsn >= prev_lsn); + assert!(prop_term >= prev_term); + + assert!(prop_lsn >= state.commit_lsn); + + if prop_lsn > state.write_lsn { + assert!(prop_lsn <= state.max_write_lsn); + debug!( + "moving write_lsn up from {} to {}", + state.write_lsn, prop_lsn + ); + state.write_lsn = prop_lsn; + } + if prop_lsn < state.write_lsn { + debug!( + "moving write_lsn down from {} to {}", + state.write_lsn, prop_lsn + ); + state.write_lsn = prop_lsn; + } + + node.epoch_lsn = prop_lsn; + node.term = prop_term; + } + "write_wal" => { + assert!(!node.is_sync); + let start_lsn: u64 = parts.next().unwrap().parse().unwrap(); + let end_lsn: u64 = parts.next().unwrap().parse().unwrap(); + let cnt: u64 = parts.next().unwrap().parse().unwrap(); + + let 
size = end_lsn - start_lsn; + state.written_wal += size; + state.written_records += cnt; + + // TODO: If we allow writing WAL before winning the election + + assert!(start_lsn >= state.commit_lsn); + assert!(end_lsn >= start_lsn); + // assert!(start_lsn == state.write_lsn); + state.write_lsn = end_lsn; + + if end_lsn > state.max_write_lsn { + state.max_write_lsn = end_lsn; + } + } + "commit_lsn" => { + let lsn: u64 = parts.next().unwrap().parse().unwrap(); + assert!(lsn >= state.commit_lsn); + state.commit_lsn = lsn; + } + _ => unreachable!(), + } + } + _ => unreachable!(), + } + + // update the node in the state struct + *state.get(event.node) = node; + } +} diff --git a/safekeeper/tests/walproposer_sim/walproposer_api.rs b/safekeeper/tests/walproposer_sim/walproposer_api.rs new file mode 100644 index 0000000000..5c79e9082b --- /dev/null +++ b/safekeeper/tests/walproposer_sim/walproposer_api.rs @@ -0,0 +1,675 @@ +use std::{ + cell::{RefCell, RefMut, UnsafeCell}, + ffi::CStr, + sync::Arc, +}; + +use bytes::Bytes; +use desim::{ + executor::{self, PollSome}, + network::TCP, + node_os::NodeOs, + proto::{AnyMessage, NetEvent, NodeEvent}, + world::NodeId, +}; +use tracing::debug; +use utils::lsn::Lsn; +use walproposer::{ + api_bindings::Level, + bindings::{ + pg_atomic_uint64, NeonWALReadResult, PageserverFeedback, SafekeeperStateDesiredEvents, + WL_SOCKET_READABLE, WL_SOCKET_WRITEABLE, + }, + walproposer::{ApiImpl, Config}, +}; + +use super::walproposer_disk::DiskWalProposer; + +/// Special state for each wp->sk connection. +struct SafekeeperConn { + host: String, + port: String, + node_id: NodeId, + // socket is Some(..) 
equals to connection is established + socket: Option, + // connection is in progress + is_connecting: bool, + // START_WAL_PUSH is in progress + is_start_wal_push: bool, + // pointer to Safekeeper in walproposer for callbacks + raw_ptr: *mut walproposer::bindings::Safekeeper, +} + +impl SafekeeperConn { + pub fn new(host: String, port: String) -> Self { + // port number is the same as NodeId + let port_num = port.parse::().unwrap(); + Self { + host, + port, + node_id: port_num, + socket: None, + is_connecting: false, + is_start_wal_push: false, + raw_ptr: std::ptr::null_mut(), + } + } +} + +/// Simulation version of a postgres WaitEventSet. At pos 0 there is always +/// a special NodeEvents channel, which is used as a latch. +struct EventSet { + os: NodeOs, + // all pollable channels, 0 is always NodeEvent channel + chans: Vec>, + // 0 is always nullptr + sk_ptrs: Vec<*mut walproposer::bindings::Safekeeper>, + // event mask for each channel + masks: Vec, +} + +impl EventSet { + pub fn new(os: NodeOs) -> Self { + let node_events = os.node_events(); + Self { + os, + chans: vec![Box::new(node_events)], + sk_ptrs: vec![std::ptr::null_mut()], + masks: vec![WL_SOCKET_READABLE], + } + } + + /// Leaves all readable channels at the beginning of the array. 
+ fn sort_readable(&mut self) -> usize { + let mut cnt = 1; + for i in 1..self.chans.len() { + if self.masks[i] & WL_SOCKET_READABLE != 0 { + self.chans.swap(i, cnt); + self.sk_ptrs.swap(i, cnt); + self.masks.swap(i, cnt); + cnt += 1; + } + } + cnt + } + + fn update_event_set(&mut self, conn: &SafekeeperConn, event_mask: u32) { + let index = self + .sk_ptrs + .iter() + .position(|&ptr| ptr == conn.raw_ptr) + .expect("safekeeper should exist in event set"); + self.masks[index] = event_mask; + } + + fn add_safekeeper(&mut self, sk: &SafekeeperConn, event_mask: u32) { + for ptr in self.sk_ptrs.iter() { + assert!(*ptr != sk.raw_ptr); + } + + self.chans.push(Box::new( + sk.socket + .as_ref() + .expect("socket should not be closed") + .recv_chan(), + )); + self.sk_ptrs.push(sk.raw_ptr); + self.masks.push(event_mask); + } + + fn remove_safekeeper(&mut self, sk: &SafekeeperConn) { + let index = self.sk_ptrs.iter().position(|&ptr| ptr == sk.raw_ptr); + if index.is_none() { + debug!("remove_safekeeper: sk={:?} not found", sk.raw_ptr); + return; + } + let index = index.unwrap(); + + self.chans.remove(index); + self.sk_ptrs.remove(index); + self.masks.remove(index); + + // to simulate the actual behaviour + self.refresh_event_set(); + } + + /// Updates all masks to match the result of a SafekeeperStateDesiredEvents. + fn refresh_event_set(&mut self) { + for (i, mask) in self.masks.iter_mut().enumerate() { + if i == 0 { + continue; + } + + let mut mask_sk: u32 = 0; + let mut mask_nwr: u32 = 0; + unsafe { SafekeeperStateDesiredEvents(self.sk_ptrs[i], &mut mask_sk, &mut mask_nwr) }; + + if mask_sk != *mask { + debug!( + "refresh_event_set: sk={:?}, old_mask={:#b}, new_mask={:#b}", + self.sk_ptrs[i], *mask, mask_sk + ); + *mask = mask_sk; + } + } + } + + /// Wait for events on all channels. 
+ fn wait(&mut self, timeout_millis: i64) -> walproposer::walproposer::WaitResult { + // all channels are always writeable + for (i, mask) in self.masks.iter().enumerate() { + if *mask & WL_SOCKET_WRITEABLE != 0 { + return walproposer::walproposer::WaitResult::Network( + self.sk_ptrs[i], + WL_SOCKET_WRITEABLE, + ); + } + } + + let cnt = self.sort_readable(); + + let slice = &self.chans[0..cnt]; + match executor::epoll_chans(slice, timeout_millis) { + None => walproposer::walproposer::WaitResult::Timeout, + Some(0) => { + let msg = self.os.node_events().must_recv(); + match msg { + NodeEvent::Internal(AnyMessage::Just32(0)) => { + // got a notification about new WAL available + } + NodeEvent::Internal(_) => unreachable!(), + NodeEvent::Accept(_) => unreachable!(), + } + walproposer::walproposer::WaitResult::Latch + } + Some(index) => walproposer::walproposer::WaitResult::Network( + self.sk_ptrs[index], + WL_SOCKET_READABLE, + ), + } + } +} + +/// This struct handles all calls from walproposer into walproposer_api. 
+pub struct SimulationApi { + os: NodeOs, + safekeepers: RefCell>, + disk: Arc, + redo_start_lsn: Option, + last_logged_commit_lsn: u64, + shmem: UnsafeCell, + config: Config, + event_set: RefCell>, +} + +pub struct Args { + pub os: NodeOs, + pub config: Config, + pub disk: Arc, + pub redo_start_lsn: Option, +} + +impl SimulationApi { + pub fn new(args: Args) -> Self { + // initialize connection state for each safekeeper + let sk_conns = args + .config + .safekeepers_list + .iter() + .map(|s| { + SafekeeperConn::new( + s.split(':').next().unwrap().to_string(), + s.split(':').nth(1).unwrap().to_string(), + ) + }) + .collect::>(); + + Self { + os: args.os, + safekeepers: RefCell::new(sk_conns), + disk: args.disk, + redo_start_lsn: args.redo_start_lsn, + last_logged_commit_lsn: 0, + shmem: UnsafeCell::new(walproposer::bindings::WalproposerShmemState { + mutex: 0, + feedback: PageserverFeedback { + currentClusterSize: 0, + last_received_lsn: 0, + disk_consistent_lsn: 0, + remote_consistent_lsn: 0, + replytime: 0, + }, + mineLastElectedTerm: 0, + backpressureThrottlingTime: pg_atomic_uint64 { value: 0 }, + }), + config: args.config, + event_set: RefCell::new(None), + } + } + + /// Get SafekeeperConn for the given Safekeeper. 
+ fn get_conn(&self, sk: &mut walproposer::bindings::Safekeeper) -> RefMut<'_, SafekeeperConn> { + let sk_port = unsafe { CStr::from_ptr(sk.port).to_str().unwrap() }; + let state = self.safekeepers.borrow_mut(); + RefMut::map(state, |v| { + v.iter_mut() + .find(|conn| conn.port == sk_port) + .expect("safekeeper conn not found by port") + }) + } +} + +impl ApiImpl for SimulationApi { + fn get_current_timestamp(&self) -> i64 { + debug!("get_current_timestamp"); + // PG TimestampTZ is microseconds, but simulation unit is assumed to be + // milliseconds, so add 10^3 + self.os.now() as i64 * 1000 + } + + fn conn_status( + &self, + _: &mut walproposer::bindings::Safekeeper, + ) -> walproposer::bindings::WalProposerConnStatusType { + debug!("conn_status"); + // break the connection with a 10% chance + if self.os.random(100) < 10 { + walproposer::bindings::WalProposerConnStatusType_WP_CONNECTION_BAD + } else { + walproposer::bindings::WalProposerConnStatusType_WP_CONNECTION_OK + } + } + + fn conn_connect_start(&self, sk: &mut walproposer::bindings::Safekeeper) { + debug!("conn_connect_start"); + let mut conn = self.get_conn(sk); + + assert!(conn.socket.is_none()); + let socket = self.os.open_tcp(conn.node_id); + conn.socket = Some(socket); + conn.raw_ptr = sk; + conn.is_connecting = true; + } + + fn conn_connect_poll( + &self, + _: &mut walproposer::bindings::Safekeeper, + ) -> walproposer::bindings::WalProposerConnectPollStatusType { + debug!("conn_connect_poll"); + // TODO: break the connection here + walproposer::bindings::WalProposerConnectPollStatusType_WP_CONN_POLLING_OK + } + + fn conn_send_query(&self, sk: &mut walproposer::bindings::Safekeeper, query: &str) -> bool { + debug!("conn_send_query: {}", query); + self.get_conn(sk).is_start_wal_push = true; + true + } + + fn conn_get_query_result( + &self, + _: &mut walproposer::bindings::Safekeeper, + ) -> walproposer::bindings::WalProposerExecStatusType { + debug!("conn_get_query_result"); + // TODO: break the 
connection here + walproposer::bindings::WalProposerExecStatusType_WP_EXEC_SUCCESS_COPYBOTH + } + + fn conn_async_read( + &self, + sk: &mut walproposer::bindings::Safekeeper, + vec: &mut Vec, + ) -> walproposer::bindings::PGAsyncReadResult { + debug!("conn_async_read"); + let mut conn = self.get_conn(sk); + + let socket = if let Some(socket) = conn.socket.as_mut() { + socket + } else { + // socket is already closed + return walproposer::bindings::PGAsyncReadResult_PG_ASYNC_READ_FAIL; + }; + + let msg = socket.recv_chan().try_recv(); + + match msg { + None => { + // no message is ready + walproposer::bindings::PGAsyncReadResult_PG_ASYNC_READ_TRY_AGAIN + } + Some(NetEvent::Closed) => { + // connection is closed + debug!("conn_async_read: connection is closed"); + conn.socket = None; + walproposer::bindings::PGAsyncReadResult_PG_ASYNC_READ_FAIL + } + Some(NetEvent::Message(msg)) => { + // got a message + let b = match msg { + desim::proto::AnyMessage::Bytes(b) => b, + _ => unreachable!(), + }; + vec.extend_from_slice(&b); + walproposer::bindings::PGAsyncReadResult_PG_ASYNC_READ_SUCCESS + } + } + } + + fn conn_blocking_write(&self, sk: &mut walproposer::bindings::Safekeeper, buf: &[u8]) -> bool { + let mut conn = self.get_conn(sk); + debug!("conn_blocking_write to {}: {:?}", conn.node_id, buf); + let socket = conn.socket.as_mut().unwrap(); + socket.send(desim::proto::AnyMessage::Bytes(Bytes::copy_from_slice(buf))); + true + } + + fn conn_async_write( + &self, + sk: &mut walproposer::bindings::Safekeeper, + buf: &[u8], + ) -> walproposer::bindings::PGAsyncWriteResult { + let mut conn = self.get_conn(sk); + debug!("conn_async_write to {}: {:?}", conn.node_id, buf); + if let Some(socket) = conn.socket.as_mut() { + socket.send(desim::proto::AnyMessage::Bytes(Bytes::copy_from_slice(buf))); + } else { + // connection is already closed + debug!("conn_async_write: writing to a closed socket!"); + // TODO: maybe we should return error here? 
+ } + walproposer::bindings::PGAsyncWriteResult_PG_ASYNC_WRITE_SUCCESS + } + + fn wal_reader_allocate(&self, _: &mut walproposer::bindings::Safekeeper) -> NeonWALReadResult { + debug!("wal_reader_allocate"); + walproposer::bindings::NeonWALReadResult_NEON_WALREAD_SUCCESS + } + + fn wal_read( + &self, + _sk: &mut walproposer::bindings::Safekeeper, + buf: &mut [u8], + startpos: u64, + ) -> NeonWALReadResult { + self.disk.lock().read(startpos, buf); + walproposer::bindings::NeonWALReadResult_NEON_WALREAD_SUCCESS + } + + fn init_event_set(&self, _: &mut walproposer::bindings::WalProposer) { + debug!("init_event_set"); + let new_event_set = EventSet::new(self.os.clone()); + let old_event_set = self.event_set.replace(Some(new_event_set)); + assert!(old_event_set.is_none()); + } + + fn update_event_set(&self, sk: &mut walproposer::bindings::Safekeeper, event_mask: u32) { + debug!( + "update_event_set, sk={:?}, events_mask={:#b}", + sk as *mut walproposer::bindings::Safekeeper, event_mask + ); + let conn = self.get_conn(sk); + + self.event_set + .borrow_mut() + .as_mut() + .unwrap() + .update_event_set(&conn, event_mask); + } + + fn add_safekeeper_event_set( + &self, + sk: &mut walproposer::bindings::Safekeeper, + event_mask: u32, + ) { + debug!( + "add_safekeeper_event_set, sk={:?}, events_mask={:#b}", + sk as *mut walproposer::bindings::Safekeeper, event_mask + ); + + self.event_set + .borrow_mut() + .as_mut() + .unwrap() + .add_safekeeper(&self.get_conn(sk), event_mask); + } + + fn rm_safekeeper_event_set(&self, sk: &mut walproposer::bindings::Safekeeper) { + debug!( + "rm_safekeeper_event_set, sk={:?}", + sk as *mut walproposer::bindings::Safekeeper, + ); + + self.event_set + .borrow_mut() + .as_mut() + .unwrap() + .remove_safekeeper(&self.get_conn(sk)); + } + + fn active_state_update_event_set(&self, sk: &mut walproposer::bindings::Safekeeper) { + debug!("active_state_update_event_set"); + + assert!(sk.state == walproposer::bindings::SafekeeperState_SS_ACTIVE); + 
self.event_set + .borrow_mut() + .as_mut() + .unwrap() + .refresh_event_set(); + } + + fn wal_reader_events(&self, _sk: &mut walproposer::bindings::Safekeeper) -> u32 { + 0 + } + + fn wait_event_set( + &self, + _: &mut walproposer::bindings::WalProposer, + timeout_millis: i64, + ) -> walproposer::walproposer::WaitResult { + // TODO: handle multiple stages as part of the simulation (e.g. connect, start_wal_push, etc) + let mut conns = self.safekeepers.borrow_mut(); + for conn in conns.iter_mut() { + if conn.socket.is_some() && conn.is_connecting { + conn.is_connecting = false; + debug!("wait_event_set, connecting to {}:{}", conn.host, conn.port); + return walproposer::walproposer::WaitResult::Network( + conn.raw_ptr, + WL_SOCKET_READABLE | WL_SOCKET_WRITEABLE, + ); + } + if conn.socket.is_some() && conn.is_start_wal_push { + conn.is_start_wal_push = false; + debug!( + "wait_event_set, start wal push to {}:{}", + conn.host, conn.port + ); + return walproposer::walproposer::WaitResult::Network( + conn.raw_ptr, + WL_SOCKET_READABLE, + ); + } + } + drop(conns); + + let res = self + .event_set + .borrow_mut() + .as_mut() + .unwrap() + .wait(timeout_millis); + + debug!( + "wait_event_set, timeout_millis={}, res={:?}", + timeout_millis, res, + ); + res + } + + fn strong_random(&self, buf: &mut [u8]) -> bool { + debug!("strong_random"); + buf.fill(0); + true + } + + fn finish_sync_safekeepers(&self, lsn: u64) { + debug!("finish_sync_safekeepers, lsn={}", lsn); + executor::exit(0, Lsn(lsn).to_string()); + } + + fn log_internal(&self, _wp: &mut walproposer::bindings::WalProposer, level: Level, msg: &str) { + debug!("wp_log[{}] {}", level, msg); + if level == Level::Fatal || level == Level::Panic { + if msg.contains("rejects our connection request with term") { + // collected quorum with lower term, then got rejected by next connected safekeeper + executor::exit(1, msg.to_owned()); + } + if msg.contains("collected propEpochStartLsn") && msg.contains(", but basebackup LSN ") + 
{ + // sync-safekeepers collected wrong quorum, walproposer collected another quorum + executor::exit(1, msg.to_owned()); + } + if msg.contains("failed to download WAL for logical replicaiton") { + // Recovery connection broken and recovery was failed + executor::exit(1, msg.to_owned()); + } + if msg.contains("missing majority of votes, collected") { + // Voting bug when safekeeper disconnects after voting + executor::exit(1, msg.to_owned()); + } + panic!("unknown FATAL error from walproposer: {}", msg); + } + } + + fn after_election(&self, wp: &mut walproposer::bindings::WalProposer) { + let prop_lsn = wp.propEpochStartLsn; + let prop_term = wp.propTerm; + + let mut prev_lsn: u64 = 0; + let mut prev_term: u64 = 0; + + unsafe { + let history = wp.propTermHistory.entries; + let len = wp.propTermHistory.n_entries as usize; + if len > 1 { + let entry = *history.wrapping_add(len - 2); + prev_lsn = entry.lsn; + prev_term = entry.term; + } + } + + let msg = format!( + "prop_elected;{};{};{};{}", + prop_lsn, prop_term, prev_lsn, prev_term + ); + + debug!(msg); + self.os.log_event(msg); + } + + fn get_redo_start_lsn(&self) -> u64 { + debug!("get_redo_start_lsn -> {:?}", self.redo_start_lsn); + self.redo_start_lsn.expect("redo_start_lsn is not set").0 + } + + fn get_shmem_state(&self) -> *mut walproposer::bindings::WalproposerShmemState { + self.shmem.get() + } + + fn start_streaming( + &self, + startpos: u64, + callback: &walproposer::walproposer::StreamingCallback, + ) { + let disk = &self.disk; + let disk_lsn = disk.lock().flush_rec_ptr().0; + debug!("start_streaming at {} (disk_lsn={})", startpos, disk_lsn); + if startpos < disk_lsn { + debug!("startpos < disk_lsn, it means we wrote some transaction even before streaming started"); + } + assert!(startpos <= disk_lsn); + let mut broadcasted = Lsn(startpos); + + loop { + let available = disk.lock().flush_rec_ptr(); + assert!(available >= broadcasted); + callback.broadcast(broadcasted, available); + broadcasted = 
available; + callback.poll(); + } + } + + fn process_safekeeper_feedback(&mut self, wp: &mut walproposer::bindings::WalProposer) { + debug!("process_safekeeper_feedback, commit_lsn={}", wp.commitLsn); + if wp.commitLsn > self.last_logged_commit_lsn { + self.os.log_event(format!("commit_lsn;{}", wp.commitLsn)); + self.last_logged_commit_lsn = wp.commitLsn; + } + } + + fn get_flush_rec_ptr(&self) -> u64 { + let lsn = self.disk.lock().flush_rec_ptr(); + debug!("get_flush_rec_ptr: {}", lsn); + lsn.0 + } + + fn recovery_download( + &self, + wp: &mut walproposer::bindings::WalProposer, + sk: &mut walproposer::bindings::Safekeeper, + ) -> bool { + let mut startpos = wp.truncateLsn; + let endpos = wp.propEpochStartLsn; + + if startpos == endpos { + debug!("recovery_download: nothing to download"); + return true; + } + + debug!("recovery_download from {} to {}", startpos, endpos,); + + let replication_prompt = format!( + "START_REPLICATION {} {} {} {}", + self.config.ttid.tenant_id, self.config.ttid.timeline_id, startpos, endpos, + ); + let async_conn = self.get_conn(sk); + + let conn = self.os.open_tcp(async_conn.node_id); + conn.send(desim::proto::AnyMessage::Bytes(replication_prompt.into())); + + let chan = conn.recv_chan(); + while startpos < endpos { + let event = chan.recv(); + match event { + NetEvent::Closed => { + debug!("connection closed in recovery"); + break; + } + NetEvent::Message(AnyMessage::Bytes(b)) => { + debug!("got recovery bytes from safekeeper"); + self.disk.lock().write(startpos, &b); + startpos += b.len() as u64; + } + NetEvent::Message(_) => unreachable!(), + } + } + + debug!("recovery finished at {}", startpos); + + startpos == endpos + } + + fn conn_finish(&self, sk: &mut walproposer::bindings::Safekeeper) { + let mut conn = self.get_conn(sk); + debug!("conn_finish to {}", conn.node_id); + if let Some(socket) = conn.socket.as_mut() { + socket.close(); + } else { + // connection is already closed + } + conn.socket = None; + } + + fn 
conn_error_message(&self, _sk: &mut walproposer::bindings::Safekeeper) -> String { + "connection is closed, probably".into() + } +} diff --git a/safekeeper/tests/walproposer_sim/walproposer_disk.rs b/safekeeper/tests/walproposer_sim/walproposer_disk.rs new file mode 100644 index 0000000000..aa329bd2f0 --- /dev/null +++ b/safekeeper/tests/walproposer_sim/walproposer_disk.rs @@ -0,0 +1,314 @@ +use std::{ffi::CString, sync::Arc}; + +use byteorder::{LittleEndian, WriteBytesExt}; +use crc32c::crc32c_append; +use parking_lot::{Mutex, MutexGuard}; +use postgres_ffi::{ + pg_constants::{ + RM_LOGICALMSG_ID, XLOG_LOGICAL_MESSAGE, XLP_LONG_HEADER, XLR_BLOCK_ID_DATA_LONG, + XLR_BLOCK_ID_DATA_SHORT, + }, + v16::{ + wal_craft_test_export::{XLogLongPageHeaderData, XLogPageHeaderData, XLOG_PAGE_MAGIC}, + xlog_utils::{ + XLogSegNoOffsetToRecPtr, XlLogicalMessage, XLOG_RECORD_CRC_OFFS, + XLOG_SIZE_OF_XLOG_LONG_PHD, XLOG_SIZE_OF_XLOG_RECORD, XLOG_SIZE_OF_XLOG_SHORT_PHD, + XLP_FIRST_IS_CONTRECORD, + }, + XLogRecord, + }, + WAL_SEGMENT_SIZE, XLOG_BLCKSZ, +}; +use utils::lsn::Lsn; + +use super::block_storage::BlockStorage; + +/// Simulation implementation of walproposer WAL storage. +pub struct DiskWalProposer { + state: Mutex, +} + +impl DiskWalProposer { + pub fn new() -> Arc { + Arc::new(DiskWalProposer { + state: Mutex::new(State { + internal_available_lsn: Lsn(0), + prev_lsn: Lsn(0), + disk: BlockStorage::new(), + }), + }) + } + + pub fn lock(&self) -> MutexGuard { + self.state.lock() + } +} + +pub struct State { + // flush_lsn + internal_available_lsn: Lsn, + // needed for WAL generation + prev_lsn: Lsn, + // actual WAL storage + disk: BlockStorage, +} + +impl State { + pub fn read(&self, pos: u64, buf: &mut [u8]) { + self.disk.read(pos, buf); + // TODO: fail on reading uninitialized data + } + + pub fn write(&mut self, pos: u64, buf: &[u8]) { + self.disk.write(pos, buf); + } + + /// Update the internal available LSN to the given value. 
+ pub fn reset_to(&mut self, lsn: Lsn) { + self.internal_available_lsn = lsn; + } + + /// Get current LSN. + pub fn flush_rec_ptr(&self) -> Lsn { + self.internal_available_lsn + } + + /// Generate a new WAL record at the current LSN. + pub fn insert_logical_message(&mut self, prefix: &str, msg: &[u8]) -> anyhow::Result<()> { + let prefix_cstr = CString::new(prefix)?; + let prefix_bytes = prefix_cstr.as_bytes_with_nul(); + + let lm = XlLogicalMessage { + db_id: 0, + transactional: 0, + prefix_size: prefix_bytes.len() as ::std::os::raw::c_ulong, + message_size: msg.len() as ::std::os::raw::c_ulong, + }; + + let record_bytes = lm.encode(); + let rdatas: Vec<&[u8]> = vec![&record_bytes, prefix_bytes, msg]; + insert_wal_record(self, rdatas, RM_LOGICALMSG_ID, XLOG_LOGICAL_MESSAGE) + } +} + +fn insert_wal_record( + state: &mut State, + rdatas: Vec<&[u8]>, + rmid: u8, + info: u8, +) -> anyhow::Result<()> { + // bytes right after the header, in the same rdata block + let mut scratch = Vec::new(); + let mainrdata_len: usize = rdatas.iter().map(|rdata| rdata.len()).sum(); + + if mainrdata_len > 0 { + if mainrdata_len > 255 { + scratch.push(XLR_BLOCK_ID_DATA_LONG); + // TODO: verify endiness + let _ = scratch.write_u32::(mainrdata_len as u32); + } else { + scratch.push(XLR_BLOCK_ID_DATA_SHORT); + scratch.push(mainrdata_len as u8); + } + } + + let total_len: u32 = (XLOG_SIZE_OF_XLOG_RECORD + scratch.len() + mainrdata_len) as u32; + let size = maxalign(total_len); + assert!(size as usize > XLOG_SIZE_OF_XLOG_RECORD); + + let start_bytepos = recptr_to_bytepos(state.internal_available_lsn); + let end_bytepos = start_bytepos + size as u64; + + let start_recptr = bytepos_to_recptr(start_bytepos); + let end_recptr = bytepos_to_recptr(end_bytepos); + + assert!(recptr_to_bytepos(start_recptr) == start_bytepos); + assert!(recptr_to_bytepos(end_recptr) == end_bytepos); + + let mut crc = crc32c_append(0, &scratch); + for rdata in &rdatas { + crc = crc32c_append(crc, rdata); + } + + let mut 
header = XLogRecord { + xl_tot_len: total_len, + xl_xid: 0, + xl_prev: state.prev_lsn.0, + xl_info: info, + xl_rmid: rmid, + __bindgen_padding_0: [0u8; 2usize], + xl_crc: crc, + }; + + // now we have the header and can finish the crc + let header_bytes = header.encode()?; + let crc = crc32c_append(crc, &header_bytes[0..XLOG_RECORD_CRC_OFFS]); + header.xl_crc = crc; + + let mut header_bytes = header.encode()?.to_vec(); + assert!(header_bytes.len() == XLOG_SIZE_OF_XLOG_RECORD); + + header_bytes.extend_from_slice(&scratch); + + // finish rdatas + let mut rdatas = rdatas; + rdatas.insert(0, &header_bytes); + + write_walrecord_to_disk(state, total_len as u64, rdatas, start_recptr, end_recptr)?; + + state.internal_available_lsn = end_recptr; + state.prev_lsn = start_recptr; + Ok(()) +} + +fn write_walrecord_to_disk( + state: &mut State, + total_len: u64, + rdatas: Vec<&[u8]>, + start: Lsn, + end: Lsn, +) -> anyhow::Result<()> { + let mut curr_ptr = start; + let mut freespace = insert_freespace(curr_ptr); + let mut written: usize = 0; + + assert!(freespace >= std::mem::size_of::()); + + for mut rdata in rdatas { + while rdata.len() >= freespace { + assert!( + curr_ptr.segment_offset(WAL_SEGMENT_SIZE) >= XLOG_SIZE_OF_XLOG_SHORT_PHD + || freespace == 0 + ); + + state.write(curr_ptr.0, &rdata[..freespace]); + rdata = &rdata[freespace..]; + written += freespace; + curr_ptr = Lsn(curr_ptr.0 + freespace as u64); + + let mut new_page = XLogPageHeaderData { + xlp_magic: XLOG_PAGE_MAGIC as u16, + xlp_info: XLP_BKP_REMOVABLE, + xlp_tli: 1, + xlp_pageaddr: curr_ptr.0, + xlp_rem_len: (total_len - written as u64) as u32, + ..Default::default() // Put 0 in padding fields. 
+ }; + if new_page.xlp_rem_len > 0 { + new_page.xlp_info |= XLP_FIRST_IS_CONTRECORD; + } + + if curr_ptr.segment_offset(WAL_SEGMENT_SIZE) == 0 { + new_page.xlp_info |= XLP_LONG_HEADER; + let long_page = XLogLongPageHeaderData { + std: new_page, + xlp_sysid: 0, + xlp_seg_size: WAL_SEGMENT_SIZE as u32, + xlp_xlog_blcksz: XLOG_BLCKSZ as u32, + }; + let header_bytes = long_page.encode()?; + assert!(header_bytes.len() == XLOG_SIZE_OF_XLOG_LONG_PHD); + state.write(curr_ptr.0, &header_bytes); + curr_ptr = Lsn(curr_ptr.0 + header_bytes.len() as u64); + } else { + let header_bytes = new_page.encode()?; + assert!(header_bytes.len() == XLOG_SIZE_OF_XLOG_SHORT_PHD); + state.write(curr_ptr.0, &header_bytes); + curr_ptr = Lsn(curr_ptr.0 + header_bytes.len() as u64); + } + freespace = insert_freespace(curr_ptr); + } + + assert!( + curr_ptr.segment_offset(WAL_SEGMENT_SIZE) >= XLOG_SIZE_OF_XLOG_SHORT_PHD + || rdata.is_empty() + ); + state.write(curr_ptr.0, rdata); + curr_ptr = Lsn(curr_ptr.0 + rdata.len() as u64); + written += rdata.len(); + freespace -= rdata.len(); + } + + assert!(written == total_len as usize); + curr_ptr.0 = maxalign(curr_ptr.0); + assert!(curr_ptr == end); + Ok(()) +} + +fn maxalign(size: T) -> T +where + T: std::ops::BitAnd + + std::ops::Add + + std::ops::Not + + From, +{ + (size + T::from(7)) & !T::from(7) +} + +fn insert_freespace(ptr: Lsn) -> usize { + if ptr.block_offset() == 0 { + 0 + } else { + (XLOG_BLCKSZ as u64 - ptr.block_offset()) as usize + } +} + +const XLP_BKP_REMOVABLE: u16 = 0x0004; +const USABLE_BYTES_IN_PAGE: u64 = (XLOG_BLCKSZ - XLOG_SIZE_OF_XLOG_SHORT_PHD) as u64; +const USABLE_BYTES_IN_SEGMENT: u64 = ((WAL_SEGMENT_SIZE / XLOG_BLCKSZ) as u64 + * USABLE_BYTES_IN_PAGE) + - (XLOG_SIZE_OF_XLOG_RECORD - XLOG_SIZE_OF_XLOG_SHORT_PHD) as u64; + +fn bytepos_to_recptr(bytepos: u64) -> Lsn { + let fullsegs = bytepos / USABLE_BYTES_IN_SEGMENT; + let mut bytesleft = bytepos % USABLE_BYTES_IN_SEGMENT; + + let seg_offset = if bytesleft < (XLOG_BLCKSZ - 
XLOG_SIZE_OF_XLOG_SHORT_PHD) as u64 { + // fits on first page of segment + bytesleft + XLOG_SIZE_OF_XLOG_SHORT_PHD as u64 + } else { + // account for the first page on segment with long header + bytesleft -= (XLOG_BLCKSZ - XLOG_SIZE_OF_XLOG_SHORT_PHD) as u64; + let fullpages = bytesleft / USABLE_BYTES_IN_PAGE; + bytesleft %= USABLE_BYTES_IN_PAGE; + + XLOG_BLCKSZ as u64 + + fullpages * XLOG_BLCKSZ as u64 + + bytesleft + + XLOG_SIZE_OF_XLOG_SHORT_PHD as u64 + }; + + Lsn(XLogSegNoOffsetToRecPtr( + fullsegs, + seg_offset as u32, + WAL_SEGMENT_SIZE, + )) +} + +fn recptr_to_bytepos(ptr: Lsn) -> u64 { + let fullsegs = ptr.segment_number(WAL_SEGMENT_SIZE); + let offset = ptr.segment_offset(WAL_SEGMENT_SIZE) as u64; + + let fullpages = offset / XLOG_BLCKSZ as u64; + let offset = offset % XLOG_BLCKSZ as u64; + + if fullpages == 0 { + fullsegs * USABLE_BYTES_IN_SEGMENT + + if offset > 0 { + assert!(offset >= XLOG_SIZE_OF_XLOG_SHORT_PHD as u64); + offset - XLOG_SIZE_OF_XLOG_SHORT_PHD as u64 + } else { + 0 + } + } else { + fullsegs * USABLE_BYTES_IN_SEGMENT + + (XLOG_BLCKSZ - XLOG_SIZE_OF_XLOG_SHORT_PHD) as u64 + + (fullpages - 1) * USABLE_BYTES_IN_PAGE + + if offset > 0 { + assert!(offset >= XLOG_SIZE_OF_XLOG_SHORT_PHD as u64); + offset - XLOG_SIZE_OF_XLOG_SHORT_PHD as u64 + } else { + 0 + } + } +} diff --git a/scripts/benchmark_durations.py b/scripts/benchmark_durations.py index 7f05d72a03..01f34a1b96 100755 --- a/scripts/benchmark_durations.py +++ b/scripts/benchmark_durations.py @@ -20,7 +20,7 @@ BENCHMARKS_DURATION_QUERY = """ FROM results WHERE started_at > CURRENT_DATE - INTERVAL '%s' day - AND parent_suite = 'test_runner.performance' + AND starts_with(parent_suite, 'test_runner.performance') AND status = 'passed' GROUP BY parent_suite, suite, name @@ -31,68 +31,75 @@ BENCHMARKS_DURATION_QUERY = """ # the total duration varies from 8 to 40 minutes. # We use some pre-collected durations as a fallback to have a better distribution. 
FALLBACK_DURATION = { - "test_runner/performance/test_branch_creation.py::test_branch_creation_heavy_write[20]": 62.144, - "test_runner/performance/test_branch_creation.py::test_branch_creation_many[1024]": 90.941, - "test_runner/performance/test_branch_creation.py::test_branch_creation_many_relations": 26.053, - "test_runner/performance/test_branching.py::test_compare_child_and_root_pgbench_perf": 25.67, - "test_runner/performance/test_branching.py::test_compare_child_and_root_read_perf": 14.497, - "test_runner/performance/test_branching.py::test_compare_child_and_root_write_perf": 18.852, - "test_runner/performance/test_bulk_insert.py::test_bulk_insert[neon]": 26.572, - "test_runner/performance/test_bulk_insert.py::test_bulk_insert[vanilla]": 6.259, - "test_runner/performance/test_bulk_tenant_create.py::test_bulk_tenant_create[10]": 21.206, - "test_runner/performance/test_bulk_tenant_create.py::test_bulk_tenant_create[1]": 3.474, - "test_runner/performance/test_bulk_tenant_create.py::test_bulk_tenant_create[5]": 11.262, - "test_runner/performance/test_bulk_update.py::test_bulk_update[100]": 94.225, - "test_runner/performance/test_bulk_update.py::test_bulk_update[10]": 68.159, - "test_runner/performance/test_bulk_update.py::test_bulk_update[50]": 76.719, - "test_runner/performance/test_compaction.py::test_compaction": 110.222, - "test_runner/performance/test_compare_pg_stats.py::test_compare_pg_stats_ro_with_pgbench_select_only[neon-5-10-100]": 10.743, - "test_runner/performance/test_compare_pg_stats.py::test_compare_pg_stats_ro_with_pgbench_select_only[vanilla-5-10-100]": 16.541, - "test_runner/performance/test_compare_pg_stats.py::test_compare_pg_stats_rw_with_pgbench_default[neon-5-10-100]": 11.109, - "test_runner/performance/test_compare_pg_stats.py::test_compare_pg_stats_rw_with_pgbench_default[vanilla-5-10-100]": 18.121, - "test_runner/performance/test_compare_pg_stats.py::test_compare_pg_stats_wal_with_pgbench_default[neon-5-10-100]": 11.3, - 
"test_runner/performance/test_compare_pg_stats.py::test_compare_pg_stats_wal_with_pgbench_default[vanilla-5-10-100]": 16.086, - "test_runner/performance/test_compare_pg_stats.py::test_compare_pg_stats_wo_with_heavy_write[neon-10-10]": 12.024, - "test_runner/performance/test_compare_pg_stats.py::test_compare_pg_stats_wo_with_heavy_write[neon-10-1]": 11.14, - "test_runner/performance/test_compare_pg_stats.py::test_compare_pg_stats_wo_with_heavy_write[vanilla-10-10]": 10.375, - "test_runner/performance/test_compare_pg_stats.py::test_compare_pg_stats_wo_with_heavy_write[vanilla-10-1]": 10.075, - "test_runner/performance/test_compare_pg_stats.py::test_compare_pg_stats_wo_with_pgbench_simple_update[neon-5-10-100]": 11.147, - "test_runner/performance/test_compare_pg_stats.py::test_compare_pg_stats_wo_with_pgbench_simple_update[vanilla-5-10-100]": 16.321, - "test_runner/performance/test_copy.py::test_copy[neon]": 16.579, - "test_runner/performance/test_copy.py::test_copy[vanilla]": 10.094, - "test_runner/performance/test_gc_feedback.py::test_gc_feedback": 590.157, - "test_runner/performance/test_gist_build.py::test_gist_buffering_build[neon]": 14.102, - "test_runner/performance/test_gist_build.py::test_gist_buffering_build[vanilla]": 8.677, - "test_runner/performance/test_latency.py::test_measure_read_latency_heavy_write_workload[neon-1]": 31.079, - "test_runner/performance/test_latency.py::test_measure_read_latency_heavy_write_workload[vanilla-1]": 38.119, - "test_runner/performance/test_layer_map.py::test_layer_map": 24.784, - "test_runner/performance/test_logical_replication.py::test_logical_replication": 117.707, - "test_runner/performance/test_parallel_copy_to.py::test_parallel_copy_different_tables[neon]": 21.194, - "test_runner/performance/test_parallel_copy_to.py::test_parallel_copy_different_tables[vanilla]": 59.068, - "test_runner/performance/test_parallel_copy_to.py::test_parallel_copy_same_table[neon]": 73.235, - 
"test_runner/performance/test_parallel_copy_to.py::test_parallel_copy_same_table[vanilla]": 82.586, - "test_runner/performance/test_perf_pgbench.py::test_pgbench[neon-45-10]": 106.536, - "test_runner/performance/test_perf_pgbench.py::test_pgbench[vanilla-45-10]": 98.753, - "test_runner/performance/test_random_writes.py::test_random_writes[neon]": 6.975, - "test_runner/performance/test_random_writes.py::test_random_writes[vanilla]": 3.69, - "test_runner/performance/test_seqscans.py::test_seqscans[neon-100000-100-0]": 3.529, - "test_runner/performance/test_seqscans.py::test_seqscans[neon-10000000-1-0]": 64.522, - "test_runner/performance/test_seqscans.py::test_seqscans[neon-10000000-1-4]": 40.964, - "test_runner/performance/test_seqscans.py::test_seqscans[vanilla-100000-100-0]": 0.55, - "test_runner/performance/test_seqscans.py::test_seqscans[vanilla-10000000-1-0]": 12.189, - "test_runner/performance/test_seqscans.py::test_seqscans[vanilla-10000000-1-4]": 13.899, - "test_runner/performance/test_startup.py::test_startup_simple": 2.51, - "test_runner/performance/test_wal_backpressure.py::test_heavy_write_workload[neon_off-10-5-5]": 527.245, - "test_runner/performance/test_wal_backpressure.py::test_heavy_write_workload[neon_on-10-5-5]": 583.46, - "test_runner/performance/test_wal_backpressure.py::test_heavy_write_workload[vanilla-10-5-5]": 113.653, - "test_runner/performance/test_wal_backpressure.py::test_pgbench_intensive_init_workload[neon_off-1000]": 233.728, - "test_runner/performance/test_wal_backpressure.py::test_pgbench_intensive_init_workload[neon_on-1000]": 419.093, - "test_runner/performance/test_wal_backpressure.py::test_pgbench_intensive_init_workload[vanilla-1000]": 982.461, - "test_runner/performance/test_wal_backpressure.py::test_pgbench_simple_update_workload[neon_off-45-100]": 116.522, - "test_runner/performance/test_wal_backpressure.py::test_pgbench_simple_update_workload[neon_on-45-100]": 115.583, - 
"test_runner/performance/test_wal_backpressure.py::test_pgbench_simple_update_workload[vanilla-45-100]": 155.282, - "test_runner/performance/test_write_amplification.py::test_write_amplification[neon]": 26.704, - "test_runner/performance/test_write_amplification.py::test_write_amplification[vanilla]": 16.088, + "test_runner/performance/pageserver/pagebench/test_pageserver_max_throughput_getpage_at_latest_lsn.py::test_pageserver_max_throughput_getpage_at_latest_lsn[1-13-30]": 400.15, + "test_runner/performance/pageserver/pagebench/test_pageserver_max_throughput_getpage_at_latest_lsn.py::test_pageserver_max_throughput_getpage_at_latest_lsn[1-6-30]": 372.521, + "test_runner/performance/pageserver/pagebench/test_pageserver_max_throughput_getpage_at_latest_lsn.py::test_pageserver_max_throughput_getpage_at_latest_lsn[10-13-30]": 420.017, + "test_runner/performance/pageserver/pagebench/test_pageserver_max_throughput_getpage_at_latest_lsn.py::test_pageserver_max_throughput_getpage_at_latest_lsn[10-6-30]": 373.769, + "test_runner/performance/pageserver/pagebench/test_pageserver_max_throughput_getpage_at_latest_lsn.py::test_pageserver_max_throughput_getpage_at_latest_lsn[100-13-30]": 678.742, + "test_runner/performance/pageserver/pagebench/test_pageserver_max_throughput_getpage_at_latest_lsn.py::test_pageserver_max_throughput_getpage_at_latest_lsn[100-6-30]": 512.135, + "test_runner/performance/test_branch_creation.py::test_branch_creation_heavy_write[20]": 58.036, + "test_runner/performance/test_branch_creation.py::test_branch_creation_many_relations": 22.104, + "test_runner/performance/test_branch_creation.py::test_branch_creation_many[1024]": 126.073, + "test_runner/performance/test_branching.py::test_compare_child_and_root_pgbench_perf": 25.759, + "test_runner/performance/test_branching.py::test_compare_child_and_root_read_perf": 6.885, + "test_runner/performance/test_branching.py::test_compare_child_and_root_write_perf": 8.758, + 
"test_runner/performance/test_bulk_insert.py::test_bulk_insert[neon]": 18.275, + "test_runner/performance/test_bulk_insert.py::test_bulk_insert[vanilla]": 9.533, + "test_runner/performance/test_bulk_tenant_create.py::test_bulk_tenant_create[1]": 12.09, + "test_runner/performance/test_bulk_tenant_create.py::test_bulk_tenant_create[10]": 35.145, + "test_runner/performance/test_bulk_tenant_create.py::test_bulk_tenant_create[5]": 22.28, + "test_runner/performance/test_bulk_update.py::test_bulk_update[10]": 66.353, + "test_runner/performance/test_bulk_update.py::test_bulk_update[100]": 75.487, + "test_runner/performance/test_bulk_update.py::test_bulk_update[50]": 54.142, + "test_runner/performance/test_compaction.py::test_compaction": 110.715, + "test_runner/performance/test_compare_pg_stats.py::test_compare_pg_stats_ro_with_pgbench_select_only[neon-5-10-100]": 11.68, + "test_runner/performance/test_compare_pg_stats.py::test_compare_pg_stats_ro_with_pgbench_select_only[vanilla-5-10-100]": 16.384, + "test_runner/performance/test_compare_pg_stats.py::test_compare_pg_stats_rw_with_pgbench_default[neon-5-10-100]": 11.315, + "test_runner/performance/test_compare_pg_stats.py::test_compare_pg_stats_rw_with_pgbench_default[vanilla-5-10-100]": 18.783, + "test_runner/performance/test_compare_pg_stats.py::test_compare_pg_stats_wal_with_pgbench_default[neon-5-10-100]": 11.647, + "test_runner/performance/test_compare_pg_stats.py::test_compare_pg_stats_wal_with_pgbench_default[vanilla-5-10-100]": 17.04, + "test_runner/performance/test_compare_pg_stats.py::test_compare_pg_stats_wo_with_heavy_write[neon-10-1]": 11.01, + "test_runner/performance/test_compare_pg_stats.py::test_compare_pg_stats_wo_with_heavy_write[neon-10-10]": 11.902, + "test_runner/performance/test_compare_pg_stats.py::test_compare_pg_stats_wo_with_heavy_write[vanilla-10-1]": 10.077, + "test_runner/performance/test_compare_pg_stats.py::test_compare_pg_stats_wo_with_heavy_write[vanilla-10-10]": 10.4, + 
"test_runner/performance/test_compare_pg_stats.py::test_compare_pg_stats_wo_with_pgbench_simple_update[neon-5-10-100]": 11.33, + "test_runner/performance/test_compare_pg_stats.py::test_compare_pg_stats_wo_with_pgbench_simple_update[vanilla-5-10-100]": 16.434, + "test_runner/performance/test_copy.py::test_copy[neon]": 13.817, + "test_runner/performance/test_copy.py::test_copy[vanilla]": 11.736, + "test_runner/performance/test_gc_feedback.py::test_gc_feedback": 575.735, + "test_runner/performance/test_gist_build.py::test_gist_buffering_build[neon]": 14.868, + "test_runner/performance/test_gist_build.py::test_gist_buffering_build[vanilla]": 14.393, + "test_runner/performance/test_latency.py::test_measure_read_latency_heavy_write_workload[neon-1]": 20.588, + "test_runner/performance/test_latency.py::test_measure_read_latency_heavy_write_workload[vanilla-1]": 30.849, + "test_runner/performance/test_layer_map.py::test_layer_map": 39.378, + "test_runner/performance/test_lazy_startup.py::test_lazy_startup": 2848.938, + "test_runner/performance/test_logical_replication.py::test_logical_replication": 120.952, + "test_runner/performance/test_parallel_copy_to.py::test_parallel_copy_different_tables[neon]": 35.552, + "test_runner/performance/test_parallel_copy_to.py::test_parallel_copy_different_tables[vanilla]": 66.762, + "test_runner/performance/test_parallel_copy_to.py::test_parallel_copy_same_table[neon]": 85.177, + "test_runner/performance/test_parallel_copy_to.py::test_parallel_copy_same_table[vanilla]": 92.12, + "test_runner/performance/test_perf_pgbench.py::test_pgbench[neon-45-10]": 107.009, + "test_runner/performance/test_perf_pgbench.py::test_pgbench[vanilla-45-10]": 99.582, + "test_runner/performance/test_random_writes.py::test_random_writes[neon]": 4.737, + "test_runner/performance/test_random_writes.py::test_random_writes[vanilla]": 2.686, + "test_runner/performance/test_seqscans.py::test_seqscans[neon-100000-100-0]": 3.271, + 
"test_runner/performance/test_seqscans.py::test_seqscans[neon-10000000-1-0]": 50.719, + "test_runner/performance/test_seqscans.py::test_seqscans[neon-10000000-1-4]": 15.992, + "test_runner/performance/test_seqscans.py::test_seqscans[vanilla-100000-100-0]": 0.566, + "test_runner/performance/test_seqscans.py::test_seqscans[vanilla-10000000-1-0]": 13.542, + "test_runner/performance/test_seqscans.py::test_seqscans[vanilla-10000000-1-4]": 13.35, + "test_runner/performance/test_startup.py::test_startup_simple": 13.043, + "test_runner/performance/test_wal_backpressure.py::test_heavy_write_workload[neon_off-10-5-5]": 194.841, + "test_runner/performance/test_wal_backpressure.py::test_heavy_write_workload[neon_on-10-5-5]": 286.667, + "test_runner/performance/test_wal_backpressure.py::test_heavy_write_workload[vanilla-10-5-5]": 85.577, + "test_runner/performance/test_wal_backpressure.py::test_pgbench_intensive_init_workload[neon_off-1000]": 297.626, + "test_runner/performance/test_wal_backpressure.py::test_pgbench_intensive_init_workload[neon_on-1000]": 646.187, + "test_runner/performance/test_wal_backpressure.py::test_pgbench_intensive_init_workload[vanilla-1000]": 989.776, + "test_runner/performance/test_wal_backpressure.py::test_pgbench_simple_update_workload[neon_off-45-100]": 125.638, + "test_runner/performance/test_wal_backpressure.py::test_pgbench_simple_update_workload[neon_on-45-100]": 123.554, + "test_runner/performance/test_wal_backpressure.py::test_pgbench_simple_update_workload[vanilla-45-100]": 190.083, + "test_runner/performance/test_write_amplification.py::test_write_amplification[neon]": 21.016, + "test_runner/performance/test_write_amplification.py::test_write_amplification[vanilla]": 23.028, } diff --git a/scripts/comment-test-report.js b/scripts/comment-test-report.js index 89befda71f..f42262cf48 100755 --- a/scripts/comment-test-report.js +++ b/scripts/comment-test-report.js @@ -188,7 +188,7 @@ const reportSummary = async (params) => { } const 
parseCoverageSummary = async ({ summaryJsonUrl, coverageUrl, fetch }) => { - let summary = `\n### Code coverage ([full report](${coverageUrl}))\n` + let summary = `\n### Code coverage* ([full report](${coverageUrl}))\n` const coverage = await (await fetch(summaryJsonUrl)).json() for (const covType of Object.keys(coverage).sort()) { @@ -198,7 +198,7 @@ const parseCoverageSummary = async ({ summaryJsonUrl, coverageUrl, fetch }) => { summary += `- \`${covType}s\`: \`${coverage[covType]["_summary"]}\`\n` } - + summary += "\n\\* collected from Rust tests only\n" summary += `\n___\n` return summary diff --git a/scripts/flaky_tests.py b/scripts/flaky_tests.py index 61a97f520d..853c67d218 100755 --- a/scripts/flaky_tests.py +++ b/scripts/flaky_tests.py @@ -15,7 +15,8 @@ FLAKY_TESTS_QUERY = """ DISTINCT parent_suite, suite, name FROM results WHERE - started_at > CURRENT_DATE - INTERVAL '%s' day + started_at > CURRENT_DATE - INTERVAL '10' day + AND started_at > '2024-03-11 14:50:11.845+00' -- we switched the default PAGESERVER_VIRTUAL_FILE_IO_ENGINE to `tokio-epoll-uring` from `std-fs` on this date, we want to ignore the flaky tests for `std-fs` AND ( (status IN ('failed', 'broken') AND reference = 'refs/heads/main') OR flaky @@ -46,11 +47,14 @@ def main(args: argparse.Namespace): logging.error("cannot fetch flaky tests from the DB due to an error", exc) rows = [] - # If a test run has non-default PAGESERVER_VIRTUAL_FILE_IO_ENGINE (i.e. not empty, not std-fs), + # If a test run has non-default PAGESERVER_VIRTUAL_FILE_IO_ENGINE (i.e. 
not empty, not tokio-epoll-uring), # use it to parametrize test name along with build_type and pg_version # # See test_runner/fixtures/parametrize.py for details - if (io_engine := os.getenv("PAGESERVER_VIRTUAL_FILE_IO_ENGINE", "")) not in ("", "std-fs"): + if (io_engine := os.getenv("PAGESERVER_VIRTUAL_FILE_IO_ENGINE", "")) not in ( + "", + "tokio-epoll-uring", + ): pageserver_virtual_file_io_engine_parameter = f"-{io_engine}" else: pageserver_virtual_file_io_engine_parameter = "" diff --git a/scripts/generate_and_push_perf_report.sh b/scripts/generate_and_push_perf_report.sh index 9e03302b0f..178c570b13 100755 --- a/scripts/generate_and_push_perf_report.sh +++ b/scripts/generate_and_push_perf_report.sh @@ -8,17 +8,3 @@ SCRIPT_DIR="$( cd -- "$( dirname -- "${BASH_SOURCE[0]}" )" &> /dev/null && pwd ) echo "Uploading perf report to neon pg" # ingest per test results data into neon backed postgres running in staging to build grafana reports on that data DATABASE_URL="$PERF_TEST_RESULT_CONNSTR" poetry run python "$SCRIPT_DIR"/ingest_perf_test_result.py --ingest "$REPORT_FROM" - -# Activate poetry's venv. Needed because git upload does not run in a project dir (it uses tmp to store the repository) -# so the problem occurs because poetry cannot find pyproject.toml in temp dir created by git upload -# shellcheck source=/dev/null -. 
"$(poetry env info --path)"/bin/activate - -echo "Uploading perf result to zenith-perf-data" -scripts/git-upload \ - --repo=https://"$VIP_VAP_ACCESS_TOKEN"@github.com/neondatabase/zenith-perf-data.git \ - --message="add performance test result for $GITHUB_SHA neon revision" \ - --branch=master \ - copy "$REPORT_FROM" "data/$REPORT_TO" `# COPY FROM TO_RELATIVE`\ - --merge \ - --run-cmd "python $SCRIPT_DIR/generate_perf_report_page.py --input-dir data/$REPORT_TO --out reports/$REPORT_TO.html" diff --git a/scripts/generate_perf_report_page.py b/scripts/generate_perf_report_page.py deleted file mode 100755 index b5b49bb600..0000000000 --- a/scripts/generate_perf_report_page.py +++ /dev/null @@ -1,219 +0,0 @@ -#!/usr/bin/env python3 -import argparse -import json -from dataclasses import dataclass -from pathlib import Path -from typing import Any, Dict, List, Optional, Tuple, cast - -from jinja2 import Template - -# skip 'input' columns. They are included in the header and just blow the table -EXCLUDE_COLUMNS = frozenset( - { - "scale", - "duration", - "number_of_clients", - "number_of_threads", - "init_start_timestamp", - "init_end_timestamp", - "run_start_timestamp", - "run_end_timestamp", - } -) - -KEY_EXCLUDE_FIELDS = frozenset( - { - "init_start_timestamp", - "init_end_timestamp", - "run_start_timestamp", - "run_end_timestamp", - } -) -NEGATIVE_COLOR = "negative" -POSITIVE_COLOR = "positive" -EPS = 1e-6 - - -@dataclass -class SuitRun: - revision: str - values: Dict[str, Any] - - -@dataclass -class SuitRuns: - platform: str - suit: str - common_columns: List[Tuple[str, str]] - value_columns: List[str] - runs: List[SuitRun] - - -@dataclass -class RowValue: - value: str - color: str - ratio: str - - -def get_columns(values: List[Dict[Any, Any]]) -> Tuple[List[Tuple[str, str]], List[str]]: - value_columns = [] - common_columns = [] - for item in values: - if item["name"] in KEY_EXCLUDE_FIELDS: - continue - if item["report"] != "test_param": - 
value_columns.append(cast(str, item["name"])) - else: - common_columns.append((cast(str, item["name"]), cast(str, item["value"]))) - value_columns.sort() - common_columns.sort(key=lambda x: x[0]) # sort by name - return common_columns, value_columns - - -def format_ratio(ratio: float, report: str) -> Tuple[str, str]: - color = "" - sign = "+" if ratio > 0 else "" - if abs(ratio) < 0.05: - return f" ({sign}{ratio:.2f})", color - - if report not in {"test_param", "higher_is_better", "lower_is_better"}: - raise ValueError(f"Unknown report type: {report}") - - if report == "test_param": - return f"{ratio:.2f}", color - - if ratio > 0: - if report == "higher_is_better": - color = POSITIVE_COLOR - elif report == "lower_is_better": - color = NEGATIVE_COLOR - elif ratio < 0: - if report == "higher_is_better": - color = NEGATIVE_COLOR - elif report == "lower_is_better": - color = POSITIVE_COLOR - - return f" ({sign}{ratio:.2f})", color - - -def extract_value(name: str, suit_run: SuitRun) -> Optional[Dict[str, Any]]: - for item in suit_run.values["data"]: - if item["name"] == name: - return cast(Dict[str, Any], item) - return None - - -def get_row_values( - columns: List[str], run_result: SuitRun, prev_result: Optional[SuitRun] -) -> List[RowValue]: - row_values = [] - for column in columns: - current_value = extract_value(column, run_result) - if current_value is None: - # should never happen - raise ValueError(f"{column} not found in {run_result.values}") - - value = current_value["value"] - if isinstance(value, float): - value = f"{value:.2f}" - - if prev_result is None: - row_values.append(RowValue(value, "", "")) - continue - - prev_value = extract_value(column, prev_result) - if prev_value is None: - # this might happen when new metric is added and there is no value for it in previous run - # let this be here, TODO add proper handling when this actually happens - raise ValueError(f"{column} not found in previous result") - # adding `EPS` to each term to avoid 
ZeroDivisionError when the denominator is zero - ratio = (float(value) + EPS) / (float(prev_value["value"]) + EPS) - 1 - ratio_display, color = format_ratio(ratio, current_value["report"]) - row_values.append(RowValue(value, color, ratio_display)) - return row_values - - -@dataclass -class SuiteRunTableRow: - revision: str - values: List[RowValue] - - -def prepare_rows_from_runs(value_columns: List[str], runs: List[SuitRun]) -> List[SuiteRunTableRow]: - rows = [] - prev_run = None - for run in runs: - rows.append( - SuiteRunTableRow( - revision=run.revision, values=get_row_values(value_columns, run, prev_run) - ) - ) - prev_run = run - - return rows - - -def main(args: argparse.Namespace) -> None: - input_dir = Path(args.input_dir) - grouped_runs: Dict[str, SuitRuns] = {} - # we have files in form: _.json - # fill them in the hashmap so we have grouped items for the - # same run configuration (scale, duration etc.) ordered by counter. - for item in sorted(input_dir.iterdir(), key=lambda x: int(x.name.split("_")[0])): - run_data = json.loads(item.read_text()) - revision = run_data["revision"] - - for suit_result in run_data["result"]: - key = "{}{}".format(run_data["platform"], suit_result["suit"]) - # pack total duration as a synthetic value - total_duration = suit_result["total_duration"] - suit_result["data"].append( - { - "name": "total_duration", - "value": total_duration, - "unit": "s", - "report": "lower_is_better", - } - ) - common_columns, value_columns = get_columns(suit_result["data"]) - - grouped_runs.setdefault( - key, - SuitRuns( - platform=run_data["platform"], - suit=suit_result["suit"], - common_columns=common_columns, - value_columns=value_columns, - runs=[], - ), - ) - - grouped_runs[key].runs.append(SuitRun(revision=revision, values=suit_result)) - context = {} - for result in grouped_runs.values(): - suit = result.suit - context[suit] = { - "common_columns": result.common_columns, - "value_columns": result.value_columns, - "platform": 
result.platform, - # reverse the order so newest results are on top of the table - "rows": reversed(prepare_rows_from_runs(result.value_columns, result.runs)), - } - - template = Template((Path(__file__).parent / "perf_report_template.html").read_text()) - - Path(args.out).write_text(template.render(context=context)) - - -if __name__ == "__main__": - parser = argparse.ArgumentParser() - parser.add_argument( - "--input-dir", - dest="input_dir", - required=True, - help="Directory with jsons generated by the test suite", - ) - parser.add_argument("--out", required=True, help="Output html file path") - args = parser.parse_args() - main(args) diff --git a/scripts/git-upload b/scripts/git-upload deleted file mode 100755 index d56c0f8e94..0000000000 --- a/scripts/git-upload +++ /dev/null @@ -1,170 +0,0 @@ -#!/usr/bin/env python3 - -import argparse -import os -import shlex -import shutil -import subprocess -import sys -import textwrap -from contextlib import contextmanager -from distutils.dir_util import copy_tree -from pathlib import Path -from tempfile import TemporaryDirectory -from typing import Optional - - -def absolute_path(path): - return Path(path).resolve() - - -def relative_path(path): - path = Path(path) - if path.is_absolute(): - raise Exception(f'path `{path}` must be relative!') - return path - - -@contextmanager -def chdir(cwd: Path): - old = os.getcwd() - os.chdir(cwd) - try: - yield cwd - finally: - os.chdir(old) - - -def run(cmd, *args, **kwargs): - print('$', ' '.join(cmd)) - subprocess.check_call(cmd, *args, **kwargs) - - -class GitRepo: - def __init__(self, url, branch: Optional[str] = None): - self.url = url - self.cwd = TemporaryDirectory() - self.branch = branch - - args = [ - 'git', - 'clone', - '--single-branch', - ] - if self.branch: - args.extend(['--branch', self.branch]) - - subprocess.check_call([ - *args, - str(url), - self.cwd.name, - ]) - - def is_dirty(self): - res = subprocess.check_output(['git', 'status', '--porcelain'], 
text=True).strip() - return bool(res) - - def update(self, message, action, branch=None): - with chdir(self.cwd.name): - if not branch: - cmd = ['git', 'branch', '--show-current'] - branch = subprocess.check_output(cmd, text=True).strip() - - # Run action in repo's directory - action() - - run(['git', 'add', '.']) - - if not self.is_dirty(): - print('No changes detected, quitting') - return - - git_with_user = [ - 'git', - '-c', - 'user.name=vipvap', - '-c', - 'user.email=vipvap@zenith.tech', - ] - run(git_with_user + [ - 'commit', - '--author="vipvap "', - f'--message={message}', - ]) - - for _ in range(5): - try: - run(['git', 'fetch', 'origin', branch]) - run(git_with_user + ['rebase', f'origin/{branch}']) - run(['git', 'push', 'origin', branch]) - return - - except subprocess.CalledProcessError as e: - print(f'failed to update branch `{branch}`: {e}', file=sys.stderr) - - raise Exception(f'failed to update branch `{branch}`') - - -def do_copy(args): - src = args.src - dst = args.dst - - if args.forbid_overwrite and dst.exists(): - raise FileExistsError(f"File exists: '{dst}'") - - if src.is_dir(): - if not args.merge: - shutil.rmtree(dst, ignore_errors=True) - # distutils is deprecated, but this is a temporary workaround before python version bump - # here we need dir_exists_ok=True from shutil.copytree which is available in python 3.8+ - copy_tree(str(src), str(dst)) - else: - shutil.copy(src, dst) - - if args.run_cmd: - run(shlex.split(args.run_cmd)) - - -def main(): - parser = argparse.ArgumentParser(description='Git upload tool') - parser.add_argument('--repo', type=str, metavar='URL', required=True, help='git repo url') - parser.add_argument('--message', type=str, metavar='TEXT', help='commit message') - parser.add_argument('--branch', type=str, metavar='TEXT', help='target git repo branch') - - commands = parser.add_subparsers(title='commands', dest='subparser_name') - - p_copy = commands.add_parser( - 'copy', - help='copy file into the repo', - 
formatter_class=argparse.RawTextHelpFormatter, - ) - p_copy.add_argument('src', type=absolute_path, help='source path') - p_copy.add_argument('dst', type=relative_path, help='relative dest path') - p_copy.add_argument('--forbid-overwrite', action='store_true', help='do not allow overwrites') - p_copy.add_argument( - '--merge', - action='store_true', - help='when copying a directory do not delete existing data, but add new files') - p_copy.add_argument('--run-cmd', - help=textwrap.dedent('''\ - run arbitrary cmd on top of copied files, - example usage is static content generation - based on current repository state\ - ''')) - - args = parser.parse_args() - - commands = { - 'copy': do_copy, - } - - action = commands.get(args.subparser_name) - if action: - message = args.message or 'update' - GitRepo(args.repo, args.branch).update(message, lambda: action(args)) - else: - parser.print_usage() - - -if __name__ == '__main__': - main() diff --git a/scripts/ingest_regress_test_result.py b/scripts/ingest_regress_test_result.py deleted file mode 100644 index 39c1c02941..0000000000 --- a/scripts/ingest_regress_test_result.py +++ /dev/null @@ -1,118 +0,0 @@ -#!/usr/bin/env python3 -import argparse -import logging -import os -import re -import sys -from contextlib import contextmanager -from pathlib import Path - -import backoff -import psycopg2 - -CREATE_TABLE = """ -CREATE TABLE IF NOT EXISTS regress_test_results ( - id SERIAL PRIMARY KEY, - reference CHAR(255), - revision CHAR(40), - build_type CHAR(16), - data JSONB -) -""" - - -def err(msg): - print(f"error: {msg}") - sys.exit(1) - - -@contextmanager -def get_connection_cursor(): - connstr = os.getenv("DATABASE_URL") - if not connstr: - err("DATABASE_URL environment variable is not set") - - @backoff.on_exception(backoff.expo, psycopg2.OperationalError, max_time=150) - def connect(connstr): - conn = psycopg2.connect(connstr, connect_timeout=30) - conn.autocommit = True - return conn - - conn = connect(connstr) - try: - 
with conn.cursor() as cur: - yield cur - finally: - if conn is not None: - conn.close() - - -def create_table(cur): - cur.execute(CREATE_TABLE) - - -def ingest_regress_test_result( - cursor, reference: str, revision: str, build_type: str, data_file: Path -): - data = data_file.read_text() - # In the JSON report we can have lines related to LazyFixture with escaped double-quote - # It's hard to insert them into jsonb field as is, so replace \" with ' to make it easier for us - # - # "" -> "" - data = re.sub(r'("")', r"\g<1>'\g<2>'\g<3>", data) - values = ( - reference, - revision, - build_type, - data, - ) - cursor.execute( - """ - INSERT INTO regress_test_results ( - reference, - revision, - build_type, - data - ) VALUES (%s, %s, %s, %s) - """, - values, - ) - - -def main(): - parser = argparse.ArgumentParser( - description="Regress test result uploader. \ - Database connection string should be provided via DATABASE_URL environment variable", - ) - parser.add_argument("--initdb", action="store_true", help="Initialuze database") - parser.add_argument( - "--reference", type=str, required=True, help="git reference, for example refs/heads/main" - ) - parser.add_argument("--revision", type=str, required=True, help="git revision") - parser.add_argument( - "--build-type", type=str, required=True, help="build type: release, debug or remote" - ) - parser.add_argument( - "--ingest", type=Path, required=True, help="Path to regress test result file" - ) - - args = parser.parse_args() - with get_connection_cursor() as cur: - if args.initdb: - create_table(cur) - - if not args.ingest.exists(): - err(f"ingest path {args.ingest} does not exist") - - ingest_regress_test_result( - cur, - reference=args.reference, - revision=args.revision, - build_type=args.build_type, - data_file=args.ingest, - ) - - -if __name__ == "__main__": - logging.getLogger("backoff").addHandler(logging.StreamHandler()) - main() diff --git a/test_runner/fixtures/compare_fixtures.py 
b/test_runner/fixtures/compare_fixtures.py index 6fbaa08512..429b6af548 100644 --- a/test_runner/fixtures/compare_fixtures.py +++ b/test_runner/fixtures/compare_fixtures.py @@ -155,12 +155,23 @@ class NeonCompare(PgCompare): "size", timeline_size / (1024 * 1024), "MB", report=MetricReport.LOWER_IS_BETTER ) - metric_filters = {"tenant_id": str(self.tenant), "timeline_id": str(self.timeline)} + metric_filters = { + "tenant_id": str(self.tenant), + "timeline_id": str(self.timeline), + "file_kind": "layer", + "op_kind": "upload", + } + # use `started` (not `finished`) counters here, because some callers + # don't wait for upload queue to drain total_files = self.zenbenchmark.get_int_counter_value( - self.env.pageserver, "pageserver_created_persistent_files_total", metric_filters + self.env.pageserver, + "pageserver_remote_timeline_client_calls_started_total", + metric_filters, ) total_bytes = self.zenbenchmark.get_int_counter_value( - self.env.pageserver, "pageserver_written_persistent_bytes_total", metric_filters + self.env.pageserver, + "pageserver_remote_timeline_client_bytes_started_total", + metric_filters, ) self.zenbenchmark.record( "data_uploaded", total_bytes / (1024 * 1024), "MB", report=MetricReport.LOWER_IS_BETTER diff --git a/test_runner/fixtures/metrics.py b/test_runner/fixtures/metrics.py index 7c489bda67..c615dd154f 100644 --- a/test_runner/fixtures/metrics.py +++ b/test_runner/fixtures/metrics.py @@ -4,6 +4,8 @@ from typing import Dict, List, Optional, Tuple from prometheus_client.parser import text_string_to_metric_families from prometheus_client.samples import Sample +from fixtures.log_helper import log + class Metrics: metrics: Dict[str, List[Sample]] @@ -31,6 +33,60 @@ class Metrics: return res[0] +class MetricsGetter: + """ + Mixin for types that implement a `get_metrics` function and would like associated + helpers for querying the metrics + """ + + def get_metrics(self) -> Metrics: + raise NotImplementedError() + + def get_metric_value( + self, 
name: str, filter: Optional[Dict[str, str]] = None + ) -> Optional[float]: + metrics = self.get_metrics() + results = metrics.query_all(name, filter=filter) + if not results: + log.info(f'could not find metric "{name}"') + return None + assert len(results) == 1, f"metric {name} with given filters is not unique, got: {results}" + return results[0].value + + def get_metrics_values( + self, names: list[str], filter: Optional[Dict[str, str]] = None, absence_ok=False + ) -> Dict[str, float]: + """ + When fetching multiple named metrics, it is more efficient to use this + than to call `get_metric_value` repeatedly. + + Throws RuntimeError if no metrics matching `names` are found, or if + not all of `names` are found: this method is intended for loading sets + of metrics whose existence is coupled. + + If it's expected that there may be no results for some of the metrics, + specify `absence_ok=True`. The returned dict will then not contain values + for these metrics. + """ + metrics = self.get_metrics() + samples = [] + for name in names: + samples.extend(metrics.query_all(name, filter=filter)) + + result = {} + for sample in samples: + if sample.name in result: + raise RuntimeError(f"Multiple values found for {sample.name}") + result[sample.name] = sample.value + + if not absence_ok: + if len(result) != len(names): + log.info(f"Metrics found: {metrics.metrics}") + raise RuntimeError(f"could not find all metrics {' '.join(names)}") + + return result + + def parse_metrics(text: str, name: str = "") -> Metrics: metrics = Metrics(name) gen = text_string_to_metric_families(text) @@ -47,7 +103,8 @@ def histogram(prefix_without_trailing_underscore: str) -> List[str]: PAGESERVER_PER_TENANT_REMOTE_TIMELINE_CLIENT_METRICS: Tuple[str, ...] 
= ( - "pageserver_remote_timeline_client_calls_unfinished", + "pageserver_remote_timeline_client_calls_started_total", + "pageserver_remote_timeline_client_calls_finished_total", "pageserver_remote_physical_size", "pageserver_remote_timeline_client_bytes_started_total", "pageserver_remote_timeline_client_bytes_finished_total", @@ -76,7 +133,6 @@ PAGESERVER_GLOBAL_METRICS: Tuple[str, ...] = ( *histogram("pageserver_getpage_get_reconstruct_data_seconds"), *histogram("pageserver_wait_lsn_seconds"), *histogram("pageserver_remote_operation_seconds"), - *histogram("pageserver_remote_timeline_client_calls_started"), *histogram("pageserver_io_operations_seconds"), "pageserver_tenant_states_count", ) @@ -91,10 +147,9 @@ PAGESERVER_PER_TENANT_METRICS: Tuple[str, ...] = ( "pageserver_smgr_query_seconds_sum", "pageserver_storage_operations_seconds_count_total", "pageserver_storage_operations_seconds_sum_total", - "pageserver_created_persistent_files_total", - "pageserver_written_persistent_bytes_total", "pageserver_evictions_total", "pageserver_evictions_with_low_residence_duration_total", *PAGESERVER_PER_TENANT_REMOTE_TIMELINE_CLIENT_METRICS, - # pageserver_broken_tenants_count is a leaked "metric" which is "cleared" on restart or reload + # "pageserver_directory_entries_count", -- only used if above a certain threshold + # "pageserver_broken_tenants_count" -- used only for broken ) diff --git a/test_runner/fixtures/neon_fixtures.py b/test_runner/fixtures/neon_fixtures.py index 142c97d5c3..b3f460c7fe 100644 --- a/test_runner/fixtures/neon_fixtures.py +++ b/test_runner/fixtures/neon_fixtures.py @@ -2,6 +2,7 @@ from __future__ import annotations import abc import asyncio +import concurrent.futures import filecmp import json import os @@ -14,18 +15,20 @@ import threading import time import uuid from contextlib import closing, contextmanager -from dataclasses import dataclass, field +from dataclasses import dataclass from datetime import datetime +from enum import Enum from fcntl 
import LOCK_EX, LOCK_UN, flock -from functools import cached_property +from functools import cached_property, partial from itertools import chain, product from pathlib import Path from types import TracebackType from typing import Any, Callable, Dict, Iterator, List, Optional, Tuple, Type, Union, cast -from urllib.parse import urlparse +from urllib.parse import quote, urlparse import asyncpg import backoff +import httpx import jwt import psycopg2 import pytest @@ -45,6 +48,7 @@ from urllib3.util.retry import Retry from fixtures import overlayfs from fixtures.broker import NeonBroker from fixtures.log_helper import log +from fixtures.metrics import Metrics, MetricsGetter, parse_metrics from fixtures.pageserver.allowed_errors import ( DEFAULT_PAGESERVER_ALLOWED_ERRORS, scan_pageserver_log_for_errors, @@ -66,6 +70,8 @@ from fixtures.remote_storage import ( default_remote_storage, remote_storage_to_toml_inline_table, ) +from fixtures.safekeeper.http import SafekeeperHttpClient +from fixtures.safekeeper.utils import are_walreceivers_absent from fixtures.types import Lsn, TenantId, TenantShardId, TimelineId from fixtures.utils import ( ATTACHMENT_NAME_REGEX, @@ -385,7 +391,8 @@ class PgProtocol: class AuthKeys: priv: str - def generate_token(self, *, scope: str, **token_data: str) -> str: + def generate_token(self, *, scope: TokenScope, **token_data: Any) -> str: + token_data = {key: str(val) for key, val in token_data.items()} token = jwt.encode({"scope": scope, **token_data}, self.priv, algorithm="EdDSA") # cast(Any, self.priv) @@ -398,14 +405,23 @@ class AuthKeys: return token def generate_pageserver_token(self) -> str: - return self.generate_token(scope="pageserverapi") + return self.generate_token(scope=TokenScope.PAGE_SERVER_API) def generate_safekeeper_token(self) -> str: - return self.generate_token(scope="safekeeperdata") + return self.generate_token(scope=TokenScope.SAFEKEEPER_DATA) # generate token giving access to only one tenant def 
generate_tenant_token(self, tenant_id: TenantId) -> str: - return self.generate_token(scope="tenant", tenant_id=str(tenant_id)) + return self.generate_token(scope=TokenScope.TENANT, tenant_id=str(tenant_id)) + + +# TODO: Replace with `StrEnum` when we upgrade to python 3.11 +class TokenScope(str, Enum): + ADMIN = "admin" + PAGE_SERVER_API = "pageserverapi" + GENERATIONS_API = "generations_api" + SAFEKEEPER_DATA = "safekeeperdata" + TENANT = "tenant" class NeonEnvBuilder: @@ -481,9 +497,15 @@ class NeonEnvBuilder: self.overlay_mounts_created_by_us: List[Tuple[str, Path]] = [] self.config_init_force: Optional[str] = None self.top_output_dir = top_output_dir + self.control_plane_compute_hook_api: Optional[str] = None self.pageserver_virtual_file_io_engine: Optional[str] = pageserver_virtual_file_io_engine + self.pageserver_get_vectored_impl: Optional[str] = None + if os.getenv("PAGESERVER_GET_VECTORED_IMPL", "") == "vectored": + self.pageserver_get_vectored_impl = "vectored" + log.debug('Overriding pageserver get_vectored_impl config to "vectored"') + assert test_name.startswith( "test_" ), "Unexpectedly instantiated from outside a test function" @@ -497,13 +519,13 @@ class NeonEnvBuilder: self.env = NeonEnv(self) return self.env - def start(self): + def start(self, register_pageservers=False): assert self.env is not None, "environment is not already initialized, call init() first" - self.env.start() + self.env.start(register_pageservers=register_pageservers) def init_start( self, - initial_tenant_conf: Optional[Dict[str, str]] = None, + initial_tenant_conf: Optional[Dict[str, Any]] = None, default_remote_storage_if_missing: bool = True, initial_tenant_shard_count: Optional[int] = None, initial_tenant_shard_stripe_size: Optional[int] = None, @@ -897,7 +919,7 @@ class NeonEnvBuilder: if self.scrub_on_exit: try: - S3Scrubber(self.test_output_dir, self).scan_metadata() + S3Scrubber(self).scan_metadata() except Exception as e: log.error(f"Error during remote storage 
scrub: {e}") cleanup_error = e @@ -992,9 +1014,24 @@ class NeonEnv: self.initial_tenant = config.initial_tenant self.initial_timeline = config.initial_timeline - attachment_service_port = self.port_distributor.get_port() - self.control_plane_api: str = f"http://127.0.0.1:{attachment_service_port}" - self.attachment_service: NeonAttachmentService = NeonAttachmentService( + # Find two adjacent ports for storage controller and its postgres DB. This + # loop would eventually throw from get_port() if we run out of ports (extremely + # unlikely): usually we find two adjacent free ports on the first iteration. + while True: + self.storage_controller_port = self.port_distributor.get_port() + storage_controller_pg_port = self.port_distributor.get_port() + if storage_controller_pg_port == self.storage_controller_port + 1: + break + + # The URL for the pageserver to use as its control_plane_api config + self.control_plane_api: str = f"http://127.0.0.1:{self.storage_controller_port}/upcall/v1" + # The base URL of the storage controller + self.storage_controller_api: str = f"http://127.0.0.1:{self.storage_controller_port}" + + # For testing this with a fake HTTP server, enable passing through a URL from config + self.control_plane_compute_hook_api = config.control_plane_compute_hook_api + + self.storage_controller: NeonStorageController = NeonStorageController( self, config.auth_enabled ) @@ -1013,6 +1050,9 @@ class NeonEnv: if self.control_plane_api is not None: cfg["control_plane_api"] = self.control_plane_api + if self.control_plane_compute_hook_api is not None: + cfg["control_plane_compute_hook_api"] = self.control_plane_compute_hook_api + # Create config for pageserver http_auth_type = "NeonJWT" if config.auth_enabled else "Trust" pg_auth_type = "NeonJWT" if config.auth_enabled else "Trust" @@ -1033,6 +1073,8 @@ class NeonEnv: } if self.pageserver_virtual_file_io_engine is not None: ps_cfg["virtual_file_io_engine"] = self.pageserver_virtual_file_io_engine + if 
config.pageserver_get_vectored_impl is not None: + ps_cfg["get_vectored_impl"] = config.pageserver_get_vectored_impl # Create a corresponding NeonPageserver object self.pageservers.append( @@ -1070,17 +1112,40 @@ class NeonEnv: log.info(f"Config: {cfg}") self.neon_cli.init(cfg, force=config.config_init_force) - def start(self): + def start(self, register_pageservers=False): + # storage controller starts first, so that pageserver /re-attach calls don't + # bounce through retries on startup + self.storage_controller.start() + + def storage_controller_ready(): + assert self.storage_controller.ready() is True + + # Wait for storage controller readiness to prevent unnecessary post start-up + # reconcile. + wait_until(30, 1, storage_controller_ready) + + if register_pageservers: + # Special case for forward compat tests, this can be removed later. + for pageserver in self.pageservers: + self.storage_controller.node_register(pageserver) + # Start up broker, pageserver and all safekeepers - self.broker.try_start() + futs = [] + with concurrent.futures.ThreadPoolExecutor( + max_workers=2 + len(self.pageservers) + len(self.safekeepers) + ) as executor: + futs.append( + executor.submit(lambda: self.broker.try_start() or None) + ) # The `or None` is for the linter - self.attachment_service.start() + for pageserver in self.pageservers: + futs.append(executor.submit(lambda ps=pageserver: ps.start())) - for pageserver in self.pageservers: - pageserver.start() + for safekeeper in self.safekeepers: + futs.append(executor.submit(lambda sk=safekeeper: sk.start())) - for safekeeper in self.safekeepers: - safekeeper.start() + for f in futs: + f.result() def stop(self, immediate=False, ps_assert_metric_no_errors=False): """ @@ -1093,7 +1158,7 @@ class NeonEnv: if ps_assert_metric_no_errors: pageserver.assert_no_metric_errors() pageserver.stop(immediate=immediate) - self.attachment_service.stop(immediate=immediate) + self.storage_controller.stop(immediate=immediate) 
self.broker.stop(immediate=immediate) @property @@ -1128,10 +1193,11 @@ class NeonEnv: def get_tenant_pageserver(self, tenant_id: Union[TenantId, TenantShardId]): """ Get the NeonPageserver where this tenant shard is currently attached, according - to the attachment service. + to the storage controller. """ - meta = self.attachment_service.inspect(tenant_id) - assert meta is not None, f"{tenant_id} attachment location not found" + meta = self.storage_controller.inspect(tenant_id) + if meta is None: + return None pageserver_id = meta[1] return self.get_pageserver(pageserver_id) @@ -1368,7 +1434,6 @@ class AbstractNeonCli(abc.ABC): args = [bin_neon] + arguments log.info('Running command "{}"'.format(" ".join(args))) - log.info(f'Running in "{self.env.repo_dir}"') env_vars = os.environ.copy() env_vars["NEON_REPO_DIR"] = str(self.env.repo_dir) @@ -1457,7 +1522,7 @@ class NeonCli(AbstractNeonCli): self, tenant_id: Optional[TenantId] = None, timeline_id: Optional[TimelineId] = None, - conf: Optional[Dict[str, str]] = None, + conf: Optional[Dict[str, Any]] = None, shard_count: Optional[int] = None, shard_stripe_size: Optional[int] = None, set_default: bool = False, @@ -1637,12 +1702,12 @@ class NeonCli(AbstractNeonCli): res.check_returncode() return res - def attachment_service_start(self): - cmd = ["attachment_service", "start"] + def storage_controller_start(self): + cmd = ["storage_controller", "start"] return self.raw_cli(cmd) - def attachment_service_stop(self, immediate: bool): - cmd = ["attachment_service", "stop"] + def storage_controller_stop(self, immediate: bool): + cmd = ["storage_controller", "stop"] if immediate: cmd.extend(["-m", "immediate"]) return self.raw_cli(cmd) @@ -1782,6 +1847,7 @@ class NeonCli(AbstractNeonCli): endpoint_id: str, destroy=False, check_return_code=True, + mode: Optional[str] = None, ) -> "subprocess.CompletedProcess[str]": args = [ "endpoint", @@ -1789,6 +1855,8 @@ class NeonCli(AbstractNeonCli): ] if destroy: 
args.append("--destroy") + if mode is not None: + args.append(f"--mode={mode}") if endpoint_id is not None: args.append(endpoint_id) @@ -1877,58 +1945,106 @@ class Pagectl(AbstractNeonCli): return IndexPartDump.from_json(parsed) -class NeonAttachmentService: - def __init__(self, env: NeonEnv, auth_enabled): +class StorageControllerApiException(Exception): + def __init__(self, message, status_code: int): + super().__init__(message) + self.message = message + self.status_code = status_code + + +class NeonStorageController(MetricsGetter): + def __init__(self, env: NeonEnv, auth_enabled: bool): self.env = env self.running = False self.auth_enabled = auth_enabled def start(self): assert not self.running - self.env.neon_cli.attachment_service_start() + self.env.neon_cli.storage_controller_start() self.running = True return self - def stop(self, immediate: bool = False) -> "NeonAttachmentService": + def stop(self, immediate: bool = False) -> "NeonStorageController": if self.running: - self.env.neon_cli.attachment_service_stop(immediate) + self.env.neon_cli.storage_controller_stop(immediate) self.running = False return self - def request(self, method, *args, **kwargs) -> requests.Response: - kwargs["headers"] = self.headers() - return requests.request(method, *args, **kwargs) + @staticmethod + def raise_api_exception(res: requests.Response): + try: + res.raise_for_status() + except requests.RequestException as e: + try: + msg = res.json()["msg"] + except: # noqa: E722 + msg = "" + raise StorageControllerApiException(msg, res.status_code) from e - def headers(self) -> Dict[str, str]: - headers = {} + def pageserver_api(self) -> PageserverHttpClient: + """ + The storage controller implements a subset of the pageserver REST API, for mapping + per-tenant actions into per-shard actions (e.g. timeline creation). Tests should invoke those + functions via the HttpClient, as an implicit check that these APIs remain compatible. 
+ """ + auth_token = None if self.auth_enabled: - jwt_token = self.env.auth_keys.generate_pageserver_token() + auth_token = self.env.auth_keys.generate_token(scope=TokenScope.PAGE_SERVER_API) + return PageserverHttpClient(self.env.storage_controller_port, lambda: True, auth_token) + + def request(self, method, *args, **kwargs) -> requests.Response: + resp = requests.request(method, *args, **kwargs) + NeonStorageController.raise_api_exception(resp) + + return resp + + def headers(self, scope: Optional[TokenScope]) -> Dict[str, str]: + headers = {} + if self.auth_enabled and scope is not None: + jwt_token = self.env.auth_keys.generate_token(scope=scope) headers["Authorization"] = f"Bearer {jwt_token}" return headers + def get_metrics(self) -> Metrics: + res = self.request("GET", f"{self.env.storage_controller_api}/metrics") + return parse_metrics(res.text) + + def ready(self) -> bool: + status = None + try: + resp = self.request("GET", f"{self.env.storage_controller_api}/ready") + status = resp.status_code + except StorageControllerApiException as e: + status = e.status_code + + if status == 503: + return False + elif status == 200: + return True + else: + raise RuntimeError(f"Unexpected status {status} from readiness endpoint") + def attach_hook_issue( self, tenant_shard_id: Union[TenantId, TenantShardId], pageserver_id: int ) -> int: response = self.request( "POST", - f"{self.env.control_plane_api}/attach-hook", + f"{self.env.storage_controller_api}/debug/v1/attach-hook", json={"tenant_shard_id": str(tenant_shard_id), "node_id": pageserver_id}, - headers=self.headers(), + headers=self.headers(TokenScope.ADMIN), ) - response.raise_for_status() gen = response.json()["gen"] assert isinstance(gen, int) return gen def attach_hook_drop(self, tenant_shard_id: Union[TenantId, TenantShardId]): - response = self.request( + self.request( "POST", - f"{self.env.control_plane_api}/attach-hook", + f"{self.env.storage_controller_api}/debug/v1/attach-hook", json={"tenant_shard_id": 
str(tenant_shard_id), "node_id": None}, - headers=self.headers(), + headers=self.headers(TokenScope.ADMIN), ) - response.raise_for_status() def inspect(self, tenant_shard_id: Union[TenantId, TenantShardId]) -> Optional[tuple[int, int]]: """ @@ -1936,11 +2052,10 @@ class NeonAttachmentService: """ response = self.request( "POST", - f"{self.env.control_plane_api}/inspect", + f"{self.env.storage_controller_api}/debug/v1/inspect", json={"tenant_shard_id": str(tenant_shard_id)}, - headers=self.headers(), + headers=self.headers(TokenScope.ADMIN), ) - response.raise_for_status() json = response.json() log.info(f"Response: {json}") if json["attachment"]: @@ -1954,11 +2069,34 @@ class NeonAttachmentService: "node_id": int(node.id), "listen_http_addr": "localhost", "listen_http_port": node.service_port.http, + "listen_pg_addr": "localhost", + "listen_pg_port": node.service_port.pg, } log.info(f"node_register({body})") self.request( - "POST", f"{self.env.control_plane_api}/node", json=body, headers=self.headers() - ).raise_for_status() + "POST", + f"{self.env.storage_controller_api}/control/v1/node", + json=body, + headers=self.headers(TokenScope.ADMIN), + ) + + def node_list(self): + response = self.request( + "GET", + f"{self.env.storage_controller_api}/control/v1/node", + headers=self.headers(TokenScope.ADMIN), + ) + return response.json() + + def node_configure(self, node_id, body: dict[str, Any]): + log.info(f"node_configure({node_id}, {body})") + body["node_id"] = node_id + self.request( + "PUT", + f"{self.env.storage_controller_api}/control/v1/node/{node_id}/config", + json=body, + headers=self.headers(TokenScope.ADMIN), + ) def tenant_create( self, @@ -1967,6 +2105,9 @@ class NeonAttachmentService: shard_stripe_size: Optional[int] = None, tenant_config: Optional[Dict[Any, Any]] = None, ): + """ + Use this rather than pageserver_api() when you need to include shard parameters + """ body: Dict[str, Any] = {"new_tenant_id": str(tenant_id)} if shard_count is not None: @@ 
-1980,49 +2121,63 @@ class NeonAttachmentService: for k, v in tenant_config.items(): body[k] = v - response = self.request("POST", f"{self.env.control_plane_api}/tenant", json=body) - response.raise_for_status() + response = self.request( + "POST", + f"{self.env.storage_controller_api}/v1/tenant", + json=body, + headers=self.headers(TokenScope.PAGE_SERVER_API), + ) log.info(f"tenant_create success: {response.json()}") - def tenant_timeline_create(self, tenant_id: TenantId, timeline_id: TimelineId): - body: Dict[str, Any] = {"new_timeline_id": str(timeline_id)} - - response = self.request( - "POST", f"{self.env.control_plane_api}/tenant/{tenant_id}/timeline", json=body - ) - response.raise_for_status() - log.info(f"tenant_timeline_create success: {response.json()}") - def locate(self, tenant_id: TenantId) -> list[dict[str, Any]]: - response = self.request("GET", f"{self.env.control_plane_api}/tenant/{tenant_id}/locate") - response.raise_for_status() + """ + :return: list of {"shard_id": "", "node_id": int, "listen_pg_addr": str, "listen_pg_port": int, "listen_http_addr: str, "listen_http_port: int} + """ + response = self.request( + "GET", + f"{self.env.storage_controller_api}/control/v1/tenant/{tenant_id}/locate", + headers=self.headers(TokenScope.ADMIN), + ) body = response.json() shards: list[dict[str, Any]] = body["shards"] return shards - def tenant_shard_split(self, tenant_id: TenantId, shard_count: int) -> list[TenantShardId]: + def tenant_shard_split( + self, tenant_id: TenantId, shard_count: int, shard_stripe_size: Optional[int] = None + ) -> list[TenantShardId]: response = self.request( "PUT", - f"{self.env.control_plane_api}/tenant/{tenant_id}/shard_split", - json={"new_shard_count": shard_count}, + f"{self.env.storage_controller_api}/control/v1/tenant/{tenant_id}/shard_split", + json={"new_shard_count": shard_count, "new_stripe_size": shard_stripe_size}, + headers=self.headers(TokenScope.ADMIN), ) - response.raise_for_status() body = response.json() 
log.info(f"tenant_shard_split success: {body}") shards: list[TenantShardId] = body["new_shards"] return shards def tenant_shard_migrate(self, tenant_shard_id: TenantShardId, dest_ps_id: int): - response = self.request( + self.request( "PUT", - f"{self.env.control_plane_api}/tenant/{tenant_shard_id}/migrate", + f"{self.env.storage_controller_api}/control/v1/tenant/{tenant_shard_id}/migrate", json={"tenant_shard_id": str(tenant_shard_id), "node_id": dest_ps_id}, + headers=self.headers(TokenScope.ADMIN), ) - response.raise_for_status() log.info(f"Migrated tenant {tenant_shard_id} to pageserver {dest_ps_id}") assert self.env.get_tenant_pageserver(tenant_shard_id).id == dest_ps_id - def __enter__(self) -> "NeonAttachmentService": + def consistency_check(self): + """ + Throw an exception if the service finds any inconsistencies in its state + """ + self.request( + "POST", + f"{self.env.storage_controller_api}/debug/v1/consistency_check", + headers=self.headers(TokenScope.ADMIN), + ) + log.info("storage controller passed consistency check") + + def __enter__(self) -> "NeonStorageController": return self def __exit__( @@ -2034,6 +2189,11 @@ class NeonAttachmentService: self.stop(immediate=True) +@dataclass +class LogCursor: + _line_no: int + + class NeonPageserver(PgProtocol): """ An object representing a running pageserver. 
@@ -2196,7 +2356,18 @@ class NeonPageserver(PgProtocol): value = self.http_client().get_metric_value(metric) assert value == 0, f"Nonzero {metric} == {value}" - def log_contains(self, pattern: str) -> Optional[str]: + def assert_log_contains( + self, pattern: str, offset: None | LogCursor = None + ) -> Tuple[str, LogCursor]: + """Convenient for use inside wait_until()""" + + res = self.log_contains(pattern, offset=offset) + assert res is not None + return res + + def log_contains( + self, pattern: str, offset: None | LogCursor = None + ) -> Optional[Tuple[str, LogCursor]]: """Check that the pageserver log contains a line that matches the given regex""" logfile = self.workdir / "pageserver.log" if not logfile.exists(): @@ -2210,12 +2381,17 @@ class NeonPageserver(PgProtocol): # no guarantee it is already present in the log file. This hasn't # been a problem in practice, our python tests are not fast enough # to hit that race condition. + skip_until_line_no = 0 if offset is None else offset._line_no + cur_line_no = 0 with logfile.open("r") as f: for line in f: + if cur_line_no < skip_until_line_no: + cur_line_no += 1 + continue if contains_re.search(line): # found it! 
- return line - + cur_line_no += 1 + return (line, LogCursor(cur_line_no)) return None def tenant_attach( @@ -2231,7 +2407,7 @@ class NeonPageserver(PgProtocol): """ client = self.http_client() if generation is None: - generation = self.env.attachment_service.attach_hook_issue(tenant_id, self.id) + generation = self.env.storage_controller.attach_hook_issue(tenant_id, self.id) return client.tenant_attach( tenant_id, config, @@ -2240,14 +2416,14 @@ class NeonPageserver(PgProtocol): ) def tenant_detach(self, tenant_id: TenantId): - self.env.attachment_service.attach_hook_drop(tenant_id) + self.env.storage_controller.attach_hook_drop(tenant_id) client = self.http_client() return client.tenant_detach(tenant_id) def tenant_location_configure(self, tenant_id: TenantId, config: dict[str, Any], **kwargs): if config["mode"].startswith("Attached") and "generation" not in config: - config["generation"] = self.env.attachment_service.attach_hook_issue(tenant_id, self.id) + config["generation"] = self.env.storage_controller.attach_hook_issue(tenant_id, self.id) client = self.http_client() return client.tenant_location_conf(tenant_id, config, **kwargs) @@ -2271,14 +2447,14 @@ class NeonPageserver(PgProtocol): generation: Optional[int] = None, ) -> TenantId: if generation is None: - generation = self.env.attachment_service.attach_hook_issue(tenant_id, self.id) + generation = self.env.storage_controller.attach_hook_issue(tenant_id, self.id) client = self.http_client(auth_token=auth_token) return client.tenant_create(tenant_id, conf, generation=generation) def tenant_load(self, tenant_id: TenantId): client = self.http_client() return client.tenant_load( - tenant_id, generation=self.env.attachment_service.attach_hook_issue(tenant_id, self.id) + tenant_id, generation=self.env.storage_controller.attach_hook_issue(tenant_id, self.id) ) @@ -2379,12 +2555,27 @@ class PgBin: ) return base_path + def get_pg_controldata_checkpoint_lsn(self, pgdata: str) -> Lsn: + """ + Run pg_controldata on 
given datadir and extract checkpoint lsn. + """ + + pg_controldata_path = os.path.join(self.pg_bin_path, "pg_controldata") + cmd = f"{pg_controldata_path} -D {pgdata}" + result = subprocess.run(cmd, capture_output=True, text=True, shell=True) + checkpoint_lsn = re.findall( + "Latest checkpoint location:\\s+([0-9A-F]+/[0-9A-F]+)", result.stdout + )[0] + log.info(f"last checkpoint at {checkpoint_lsn}") + return Lsn(checkpoint_lsn) + @pytest.fixture(scope="function") def pg_bin(test_output_dir: Path, pg_distrib_dir: Path, pg_version: PgVersion) -> PgBin: return PgBin(test_output_dir, pg_distrib_dir, pg_version) +# TODO make port an optional argument class VanillaPostgres(PgProtocol): def __init__(self, pgdatadir: Path, pg_bin: PgBin, port: int, init: bool = True): super().__init__(host="localhost", port=port, dbname="postgres") @@ -2674,6 +2865,7 @@ class NeonProxy(PgProtocol): self.auth_backend = auth_backend self.metric_collection_endpoint = metric_collection_endpoint self.metric_collection_interval = metric_collection_interval + self.http_timeout_seconds = 15 self._popen: Optional[subprocess.Popen[bytes]] = None def start(self) -> NeonProxy: @@ -2712,6 +2904,7 @@ class NeonProxy(PgProtocol): *["--proxy", f"{self.host}:{self.proxy_port}"], *["--mgmt", f"{self.host}:{self.mgmt_port}"], *["--wss", f"{self.host}:{self.external_http_port}"], + *["--sql-over-http-timeout", f"{self.http_timeout_seconds}s"], *["-c", str(crt_path)], *["-k", str(key_path)], *self.auth_backend.extra_args(), @@ -2748,10 +2941,12 @@ class NeonProxy(PgProtocol): def http_query(self, query, args, **kwargs): # TODO maybe use default values if not provided - user = kwargs["user"] - password = kwargs["password"] + user = quote(kwargs["user"]) + password = quote(kwargs["password"]) expected_code = kwargs.get("expected_code") + log.info(f"Executing http query: {query}") + connstr = f"postgresql://{user}:{password}@{self.domain}:{self.proxy_port}/postgres" response = requests.post( 
f"https://{self.domain}:{self.external_http_port}/sql", @@ -2765,12 +2960,38 @@ class NeonProxy(PgProtocol): ) if expected_code is not None: - assert response.status_code == kwargs["expected_code"], f"response: {response.json()}" + assert response.status_code == expected_code, f"response: {response.json()}" return response.json() + async def http2_query(self, query, args, **kwargs): + # TODO maybe use default values if not provided + user = kwargs["user"] + password = kwargs["password"] + expected_code = kwargs.get("expected_code") + + log.info(f"Executing http2 query: {query}") + + connstr = f"postgresql://{user}:{password}@{self.domain}:{self.proxy_port}/postgres" + async with httpx.AsyncClient( + http2=True, verify=str(self.test_output_dir / "proxy.crt") + ) as client: + response = await client.post( + f"https://{self.domain}:{self.external_http_port}/sql", + json={"query": query, "params": args}, + headers={ + "Content-Type": "application/sql", + "Neon-Connection-String": connstr, + "Neon-Pool-Opt-In": "true", + }, + ) + assert response.http_version == "HTTP/2" + + if expected_code is not None: + assert response.status_code == expected_code, f"response: {response.json()}" + return response.json() + def get_metrics(self) -> str: request_result = requests.get(f"http://{self.host}:{self.http_port}/metrics") - request_result.raise_for_status() return request_result.text @staticmethod @@ -2980,6 +3201,8 @@ class Endpoint(PgProtocol): # set small 'max_replication_write_lag' to enable backpressure # and make tests more stable. 
config_lines = ["max_replication_write_lag=15MB"] + config_lines + + config_lines = ["neon.primary_is_running=on"] + config_lines self.config(config_lines) return self @@ -3042,6 +3265,17 @@ class Endpoint(PgProtocol): return self + def edit_hba(self, hba: List[str]): + """Prepend hba lines into pg_hba.conf file.""" + with open(os.path.join(self.pg_data_dir_path(), "pg_hba.conf"), "r+") as conf_file: + data = conf_file.read() + conf_file.seek(0) + conf_file.write("\n".join(hba) + "\n") + conf_file.write(data) + + if self.running: + self.safe_psql("SELECT pg_reload_conf()") + def reconfigure(self, pageserver_id: Optional[int] = None): assert self.endpoint_id is not None self.env.neon_cli.endpoint_reconfigure(self.endpoint_id, self.tenant_id, pageserver_id) @@ -3058,6 +3292,17 @@ class Endpoint(PgProtocol): log.info(json.dumps(dict(data_dict, **kwargs))) json.dump(dict(data_dict, **kwargs), file, indent=4) + # Please note: Migrations only run if pg_skip_catalog_updates is false + def wait_for_migrations(self): + with self.cursor() as cur: + + def check_migrations_done(): + cur.execute("SELECT id FROM neon_migration.migration_id") + migration_id = cur.fetchall()[0][0] + assert migration_id != 0 + + wait_until(20, 0.5, check_migrations_done) + # Mock the extension part of spec passed from control plane for local testing # endpooint.rs adds content of this file as a part of the spec.json def create_remote_extension_spec(self, spec: dict[str, Any]): @@ -3069,7 +3314,7 @@ class Endpoint(PgProtocol): with open(remote_extensions_spec_path, "w") as file: json.dump(spec, file, indent=4) - def stop(self) -> "Endpoint": + def stop(self, mode: str = "fast") -> "Endpoint": """ Stop the Postgres instance if it's running. Returns self. 
@@ -3078,13 +3323,13 @@ class Endpoint(PgProtocol): if self.running: assert self.endpoint_id is not None self.env.neon_cli.endpoint_stop( - self.endpoint_id, check_return_code=self.check_stop_result + self.endpoint_id, check_return_code=self.check_stop_result, mode=mode ) self.running = False return self - def stop_and_destroy(self) -> "Endpoint": + def stop_and_destroy(self, mode: str = "immediate") -> "Endpoint": """ Stop the Postgres instance, then destroy the endpoint. Returns self. @@ -3092,7 +3337,7 @@ class Endpoint(PgProtocol): assert self.endpoint_id is not None self.env.neon_cli.endpoint_stop( - self.endpoint_id, True, check_return_code=self.check_stop_result + self.endpoint_id, True, check_return_code=self.check_stop_result, mode=mode ) self.endpoint_id = None self.running = False @@ -3348,206 +3593,10 @@ class Safekeeper: return segments -# Walreceiver as returned by sk's timeline status endpoint. -@dataclass -class Walreceiver: - conn_id: int - state: str - - -@dataclass -class SafekeeperTimelineStatus: - acceptor_epoch: int - pg_version: int # Not exactly a PgVersion, safekeeper returns version as int, for example 150002 for 15.2 - flush_lsn: Lsn - commit_lsn: Lsn - timeline_start_lsn: Lsn - backup_lsn: Lsn - peer_horizon_lsn: Lsn - remote_consistent_lsn: Lsn - walreceivers: List[Walreceiver] - - -@dataclass -class SafekeeperMetrics: - # These are metrics from Prometheus which uses float64 internally. - # As a consequence, values may differ from real original int64s. 
- flush_lsn_inexact: Dict[Tuple[TenantId, TimelineId], int] = field(default_factory=dict) - commit_lsn_inexact: Dict[Tuple[TenantId, TimelineId], int] = field(default_factory=dict) - - -class SafekeeperHttpClient(requests.Session): - HTTPError = requests.HTTPError - - def __init__(self, port: int, auth_token: Optional[str] = None, is_testing_enabled=False): - super().__init__() - self.port = port - self.auth_token = auth_token - self.is_testing_enabled = is_testing_enabled - - if auth_token is not None: - self.headers["Authorization"] = f"Bearer {auth_token}" - - def check_status(self): - self.get(f"http://localhost:{self.port}/v1/status").raise_for_status() - - def is_testing_enabled_or_skip(self): - if not self.is_testing_enabled: - pytest.skip("safekeeper was built without 'testing' feature") - - def configure_failpoints(self, config_strings: Tuple[str, str] | List[Tuple[str, str]]): - self.is_testing_enabled_or_skip() - - if isinstance(config_strings, tuple): - pairs = [config_strings] - else: - pairs = config_strings - - log.info(f"Requesting config failpoints: {repr(pairs)}") - - res = self.put( - f"http://localhost:{self.port}/v1/failpoints", - json=[{"name": name, "actions": actions} for name, actions in pairs], - ) - log.info(f"Got failpoints request response code {res.status_code}") - res.raise_for_status() - res_json = res.json() - assert res_json is None - return res_json - - def debug_dump(self, params: Optional[Dict[str, str]] = None) -> Dict[str, Any]: - params = params or {} - res = self.get(f"http://localhost:{self.port}/v1/debug_dump", params=params) - res.raise_for_status() - res_json = json.loads(res.text) - assert isinstance(res_json, dict) - return res_json - - def pull_timeline(self, body: Dict[str, Any]) -> Dict[str, Any]: - res = self.post(f"http://localhost:{self.port}/v1/pull_timeline", json=body) - res.raise_for_status() - res_json = res.json() - assert isinstance(res_json, dict) - return res_json - - def copy_timeline(self, tenant_id: 
TenantId, timeline_id: TimelineId, body: Dict[str, Any]): - res = self.post( - f"http://localhost:{self.port}/v1/tenant/{tenant_id}/timeline/{timeline_id}/copy", - json=body, - ) - res.raise_for_status() - - def timeline_digest( - self, tenant_id: TenantId, timeline_id: TimelineId, from_lsn: Lsn, until_lsn: Lsn - ) -> Dict[str, Any]: - res = self.get( - f"http://localhost:{self.port}/v1/tenant/{tenant_id}/timeline/{timeline_id}/digest", - params={ - "from_lsn": str(from_lsn), - "until_lsn": str(until_lsn), - }, - ) - res.raise_for_status() - res_json = res.json() - assert isinstance(res_json, dict) - return res_json - - def timeline_create( - self, - tenant_id: TenantId, - timeline_id: TimelineId, - pg_version: int, # Not exactly a PgVersion, safekeeper returns version as int, for example 150002 for 15.2 - commit_lsn: Lsn, - ): - body = { - "tenant_id": str(tenant_id), - "timeline_id": str(timeline_id), - "pg_version": pg_version, - "commit_lsn": str(commit_lsn), - } - res = self.post(f"http://localhost:{self.port}/v1/tenant/timeline", json=body) - res.raise_for_status() - - def timeline_status( - self, tenant_id: TenantId, timeline_id: TimelineId - ) -> SafekeeperTimelineStatus: - res = self.get(f"http://localhost:{self.port}/v1/tenant/{tenant_id}/timeline/{timeline_id}") - res.raise_for_status() - resj = res.json() - walreceivers = [Walreceiver(wr["conn_id"], wr["status"]) for wr in resj["walreceivers"]] - return SafekeeperTimelineStatus( - acceptor_epoch=resj["acceptor_state"]["epoch"], - pg_version=resj["pg_info"]["pg_version"], - flush_lsn=Lsn(resj["flush_lsn"]), - commit_lsn=Lsn(resj["commit_lsn"]), - timeline_start_lsn=Lsn(resj["timeline_start_lsn"]), - backup_lsn=Lsn(resj["backup_lsn"]), - peer_horizon_lsn=Lsn(resj["peer_horizon_lsn"]), - remote_consistent_lsn=Lsn(resj["remote_consistent_lsn"]), - walreceivers=walreceivers, - ) - - def record_safekeeper_info(self, tenant_id: TenantId, timeline_id: TimelineId, body): - res = self.post( - 
f"http://localhost:{self.port}/v1/record_safekeeper_info/{tenant_id}/{timeline_id}", - json=body, - ) - res.raise_for_status() - - # only_local doesn't remove segments in the remote storage. - def timeline_delete( - self, tenant_id: TenantId, timeline_id: TimelineId, only_local: bool = False - ) -> Dict[Any, Any]: - res = self.delete( - f"http://localhost:{self.port}/v1/tenant/{tenant_id}/timeline/{timeline_id}", - params={ - "only_local": str(only_local).lower(), - }, - ) - res.raise_for_status() - res_json = res.json() - assert isinstance(res_json, dict) - return res_json - - def tenant_delete_force(self, tenant_id: TenantId) -> Dict[Any, Any]: - res = self.delete(f"http://localhost:{self.port}/v1/tenant/{tenant_id}") - res.raise_for_status() - res_json = res.json() - assert isinstance(res_json, dict) - return res_json - - def get_metrics_str(self) -> str: - request_result = self.get(f"http://localhost:{self.port}/metrics") - request_result.raise_for_status() - return request_result.text - - def get_metrics(self) -> SafekeeperMetrics: - all_metrics_text = self.get_metrics_str() - - metrics = SafekeeperMetrics() - for match in re.finditer( - r'^safekeeper_flush_lsn{tenant_id="([0-9a-f]+)",timeline_id="([0-9a-f]+)"} (\S+)$', - all_metrics_text, - re.MULTILINE, - ): - metrics.flush_lsn_inexact[(TenantId(match.group(1)), TimelineId(match.group(2)))] = int( - match.group(3) - ) - for match in re.finditer( - r'^safekeeper_commit_lsn{tenant_id="([0-9a-f]+)",timeline_id="([0-9a-f]+)"} (\S+)$', - all_metrics_text, - re.MULTILINE, - ): - metrics.commit_lsn_inexact[ - (TenantId(match.group(1)), TimelineId(match.group(2))) - ] = int(match.group(3)) - return metrics - - class S3Scrubber: - def __init__(self, log_dir: Path, env: NeonEnvBuilder): + def __init__(self, env: NeonEnvBuilder, log_dir: Optional[Path] = None): self.env = env - self.log_dir = log_dir + self.log_dir = log_dir or env.test_output_dir def scrubber_cli(self, args: list[str], timeout) -> str: assert 
isinstance(self.env.pageserver_remote_storage, S3Storage) @@ -3568,7 +3617,7 @@ class S3Scrubber: args = base_args + args (output_path, stdout, status_code) = subprocess_capture( - self.log_dir, + self.env.test_output_dir, args, echo_stderr=True, echo_stdout=True, @@ -3646,7 +3695,7 @@ def pytest_addoption(parser: Parser): SMALL_DB_FILE_NAME_REGEX: re.Pattern = re.compile( # type: ignore[type-arg] - r"config|config-v1|heatmap-v1|metadata|.+\.(?:toml|pid|json|sql|conf)" + r"config-v1|heatmap-v1|metadata|.+\.(?:toml|pid|json|sql|conf)" ) @@ -3853,32 +3902,29 @@ def list_files_to_compare(pgdata_dir: Path) -> List[str]: # pg is the existing and running compute node, that we want to compare with a basebackup def check_restored_datadir_content(test_output_dir: Path, env: NeonEnv, endpoint: Endpoint): + pg_bin = PgBin(test_output_dir, env.pg_distrib_dir, env.pg_version) + # Get the timeline ID. We need it for the 'basebackup' command timeline_id = TimelineId(endpoint.safe_psql("SHOW neon.timeline_id")[0][0]) - # many tests already checkpoint, but do it just in case - with closing(endpoint.connect()) as conn: - with conn.cursor() as cur: - cur.execute("CHECKPOINT") - - # wait for pageserver to catch up - wait_for_last_flush_lsn(env, endpoint, endpoint.tenant_id, timeline_id) # stop postgres to ensure that files won't change endpoint.stop() + # Read the shutdown checkpoint's LSN + checkpoint_lsn = pg_bin.get_pg_controldata_checkpoint_lsn(endpoint.pg_data_dir_path()) + # Take a basebackup from pageserver restored_dir_path = env.repo_dir / f"{endpoint.endpoint_id}_restored_datadir" restored_dir_path.mkdir(exist_ok=True) - pg_bin = PgBin(test_output_dir, env.pg_distrib_dir, env.pg_version) psql_path = os.path.join(pg_bin.pg_bin_path, "psql") - pageserver_id = env.attachment_service.locate(endpoint.tenant_id)[0]["node_id"] + pageserver_id = env.storage_controller.locate(endpoint.tenant_id)[0]["node_id"] cmd = rf""" {psql_path} \ --no-psqlrc \ 
postgres://localhost:{env.get_pageserver(pageserver_id).service_port.pg} \ - -c 'basebackup {endpoint.tenant_id} {timeline_id}' \ + -c 'basebackup {endpoint.tenant_id} {timeline_id} {checkpoint_lsn}' \ | tar -x -C {restored_dir_path} """ @@ -3897,8 +3943,17 @@ def check_restored_datadir_content(test_output_dir: Path, env: NeonEnv, endpoint # list files we're going to compare assert endpoint.pgdata_dir pgdata_files = list_files_to_compare(Path(endpoint.pgdata_dir)) + restored_files = list_files_to_compare(restored_dir_path) + if pgdata_files != restored_files: + # filter pg_xact and multixact files which are downloaded on demand + pgdata_files = [ + f + for f in pgdata_files + if not f.startswith("pg_xact") and not f.startswith("pg_multixact") + ] + # check that file sets are equal assert pgdata_files == restored_files @@ -3943,7 +3998,7 @@ def logical_replication_sync(subscriber: VanillaPostgres, publisher: Endpoint) - def tenant_get_shards( - env: NeonEnv, tenant_id: TenantId, pageserver_id: Optional[int] + env: NeonEnv, tenant_id: TenantId, pageserver_id: Optional[int] = None ) -> list[tuple[TenantShardId, NeonPageserver]]: """ Helper for when you want to talk to one or more pageservers, and the @@ -3951,7 +4006,7 @@ def tenant_get_shards( us to figure out the shards for a tenant. If the caller provides `pageserver_id`, it will be used for all shards, even - if the shard is indicated by attachment service to be on some other pageserver. + if the shard is indicated by storage controller to be on some other pageserver. 
Caller should over the response to apply their per-pageserver action to each shard @@ -3967,13 +4022,28 @@ def tenant_get_shards( TenantShardId.parse(s["shard_id"]), override_pageserver or env.get_pageserver(s["node_id"]), ) - for s in env.attachment_service.locate(tenant_id) + for s in env.storage_controller.locate(tenant_id) ] else: # Assume an unsharded tenant return [(TenantShardId(tenant_id, 0, 0), override_pageserver or env.pageserver)] +def wait_replica_caughtup(primary: Endpoint, secondary: Endpoint): + primary_lsn = Lsn( + primary.safe_psql_scalar("SELECT pg_current_wal_flush_lsn()", log_query=False) + ) + while True: + secondary_lsn = Lsn( + secondary.safe_psql_scalar("SELECT pg_last_wal_replay_lsn()", log_query=False) + ) + caught_up = secondary_lsn >= primary_lsn + log.info(f"caughtup={caught_up}, primary_lsn={primary_lsn}, secondary_lsn={secondary_lsn}") + if caught_up: + return + time.sleep(1) + + def wait_for_last_flush_lsn( env: NeonEnv, endpoint: Endpoint, @@ -4003,6 +4073,49 @@ def wait_for_last_flush_lsn( return min(results) +def flush_ep_to_pageserver( + env: NeonEnv, + ep: Endpoint, + tenant: TenantId, + timeline: TimelineId, + pageserver_id: Optional[int] = None, +) -> Lsn: + """ + Stop endpoint and wait until all committed WAL reaches the pageserver + (last_record_lsn). This is for use by tests which want everything written so + far to reach pageserver *and* expecting that no more data will arrive until + endpoint starts again, so unlike wait_for_last_flush_lsn it polls + safekeepers instead of compute to learn LSN. + + Returns the catch up LSN. + """ + ep.stop() + + commit_lsn: Lsn = Lsn(0) + # In principle in the absense of failures polling single sk would be enough. 
+ for sk in env.safekeepers: + cli = sk.http_client() + # wait until compute connections are gone + wait_until(30, 0.5, partial(are_walreceivers_absent, cli, tenant, timeline)) + commit_lsn = max(cli.get_commit_lsn(tenant, timeline), commit_lsn) + + # Note: depending on WAL filtering implementation, probably most shards + # won't be able to reach commit_lsn (unless gaps are also ack'ed), so this + # is broken in sharded case. + shards = tenant_get_shards(env, tenant, pageserver_id) + for tenant_shard_id, pageserver in shards: + log.info( + f"flush_ep_to_pageserver: waiting for {commit_lsn} on shard {tenant_shard_id} on pageserver {pageserver.id})" + ) + waited = wait_for_last_record_lsn( + pageserver.http_client(), tenant_shard_id, timeline, commit_lsn + ) + + assert waited >= commit_lsn + + return commit_lsn + + def wait_for_wal_insert_lsn( env: NeonEnv, endpoint: Endpoint, diff --git a/test_runner/fixtures/pageserver/allowed_errors.py b/test_runner/fixtures/pageserver/allowed_errors.py index 74c6bddf23..8ff4341cc0 100755 --- a/test_runner/fixtures/pageserver/allowed_errors.py +++ b/test_runner/fixtures/pageserver/allowed_errors.py @@ -82,6 +82,11 @@ DEFAULT_PAGESERVER_ALLOWED_ERRORS = ( # During shutdown, DownloadError::Cancelled may be logged as an error. Cleaning this # up is tracked in https://github.com/neondatabase/neon/issues/6096 ".*Cancelled, shutting down.*", + # Open layers are only rolled at Lsn boundaries to avoid name clashses. + # Hence, we can overshoot the soft limit set by checkpoint distance. + # This is especially pronounced in tests that set small checkpoint + # distances. 
+ ".*Flushed oversized open layer with size.*", ) diff --git a/test_runner/fixtures/pageserver/http.py b/test_runner/fixtures/pageserver/http.py index 340cc9e9e3..6e082374d7 100644 --- a/test_runner/fixtures/pageserver/http.py +++ b/test_runner/fixtures/pageserver/http.py @@ -4,6 +4,7 @@ import json import time from collections import defaultdict from dataclasses import dataclass +from datetime import datetime from typing import Any, Dict, List, Optional, Set, Tuple, Union import requests @@ -11,7 +12,7 @@ from requests.adapters import HTTPAdapter from urllib3.util.retry import Retry from fixtures.log_helper import log -from fixtures.metrics import Metrics, parse_metrics +from fixtures.metrics import Metrics, MetricsGetter, parse_metrics from fixtures.pg_version import PgVersion from fixtures.types import Lsn, TenantId, TenantShardId, TimelineId from fixtures.utils import Fn @@ -124,7 +125,7 @@ class TenantConfig: ) -class PageserverHttpClient(requests.Session): +class PageserverHttpClient(requests.Session, MetricsGetter): def __init__( self, port: int, @@ -285,7 +286,11 @@ class PageserverHttpClient(requests.Session): self.verbose_error(res) def tenant_location_conf( - self, tenant_id: Union[TenantId, TenantShardId], location_conf=dict[str, Any], flush_ms=None + self, + tenant_id: Union[TenantId, TenantShardId], + location_conf=dict[str, Any], + flush_ms=None, + lazy: Optional[bool] = None, ): body = location_conf.copy() body["tenant_id"] = str(tenant_id) @@ -294,6 +299,9 @@ class PageserverHttpClient(requests.Session): if flush_ms is not None: params["flush_ms"] = str(flush_ms) + if lazy is not None: + params["lazy"] = "true" if lazy else "false" + res = self.put( f"http://localhost:{self.port}/v1/tenant/{tenant_id}/location_config", json=body, @@ -301,6 +309,22 @@ class PageserverHttpClient(requests.Session): ) self.verbose_error(res) + def tenant_list_locations(self): + res = self.get( + f"http://localhost:{self.port}/v1/location_config", + ) + 
self.verbose_error(res) + res_json = res.json() + assert isinstance(res_json["tenant_shards"], list) + return res_json + + def tenant_get_location(self, tenant_id: TenantShardId): + res = self.get( + f"http://localhost:{self.port}/v1/location_config/{tenant_id}", + ) + self.verbose_error(res) + return res.json() + def tenant_delete(self, tenant_id: Union[TenantId, TenantShardId]): res = self.delete(f"http://localhost:{self.port}/v1/tenant/{tenant_id}") self.verbose_error(res) @@ -389,6 +413,28 @@ class PageserverHttpClient(requests.Session): ) return res.text + def tenant_time_travel_remote_storage( + self, + tenant_id: Union[TenantId, TenantShardId], + timestamp: datetime, + done_if_after: datetime, + shard_counts: Optional[List[int]] = None, + ): + """ + Issues a request to perform time travel operations on the remote storage + """ + + if shard_counts is None: + shard_counts = [] + body: Dict[str, Any] = { + "shard_counts": shard_counts, + } + res = self.put( + f"http://localhost:{self.port}/v1/tenant/{tenant_id}/time_travel_remote_storage?travel_to={timestamp.isoformat()}Z&done_if_after={done_if_after.isoformat()}Z", + json=body, + ) + self.verbose_error(res) + def timeline_list( self, tenant_id: Union[TenantId, TenantShardId], @@ -517,11 +563,14 @@ class PageserverHttpClient(requests.Session): tenant_id: Union[TenantId, TenantShardId], timeline_id: TimelineId, force_repartition=False, + force_image_layer_creation=False, ): self.is_testing_enabled_or_skip() query = {} if force_repartition: query["force_repartition"] = "true" + if force_image_layer_creation: + query["force_image_layer_creation"] = "true" log.info(f"Requesting compact: tenant {tenant_id}, timeline {timeline_id}") res = self.put( @@ -548,18 +597,13 @@ class PageserverHttpClient(requests.Session): self, tenant_id: Union[TenantId, TenantShardId], timeline_id: TimelineId, - timestamp, - version: Optional[int] = None, + timestamp: datetime, ): log.info( f"Requesting lsn by timestamp {timestamp}, tenant 
{tenant_id}, timeline {timeline_id}" ) - if version is None: - version_str = "" - else: - version_str = f"&version={version}" res = self.get( - f"http://localhost:{self.port}/v1/tenant/{tenant_id}/timeline/{timeline_id}/get_lsn_by_timestamp?timestamp={timestamp}{version_str}", + f"http://localhost:{self.port}/v1/tenant/{tenant_id}/timeline/{timeline_id}/get_lsn_by_timestamp?timestamp={timestamp.isoformat()}Z", ) self.verbose_error(res) res_json = res.json() @@ -581,11 +625,14 @@ class PageserverHttpClient(requests.Session): tenant_id: Union[TenantId, TenantShardId], timeline_id: TimelineId, force_repartition=False, + force_image_layer_creation=False, ): self.is_testing_enabled_or_skip() query = {} if force_repartition: query["force_repartition"] = "true" + if force_image_layer_creation: + query["force_image_layer_creation"] = "true" log.info(f"Requesting checkpoint: tenant {tenant_id}, timeline {timeline_id}") res = self.put( @@ -684,71 +731,33 @@ class PageserverHttpClient(requests.Session): }, ).value - def get_remote_timeline_client_metric( + def get_remote_timeline_client_queue_count( self, - metric_name: str, tenant_id: TenantId, timeline_id: TimelineId, file_kind: str, op_kind: str, - ) -> Optional[float]: - metrics = self.get_metrics() - matches = metrics.query_all( - name=metric_name, + ) -> Optional[int]: + metrics = [ + "pageserver_remote_timeline_client_calls_started_total", + "pageserver_remote_timeline_client_calls_finished_total", + ] + res = self.get_metrics_values( + metrics, filter={ "tenant_id": str(tenant_id), "timeline_id": str(timeline_id), "file_kind": str(file_kind), "op_kind": str(op_kind), }, + absence_ok=True, ) - if len(matches) == 0: - value = None - elif len(matches) == 1: - value = matches[0].value - assert value is not None - else: - assert len(matches) < 2, "above filter should uniquely identify metric" - return value - - def get_metric_value( - self, name: str, filter: Optional[Dict[str, str]] = None - ) -> Optional[float]: - 
metrics = self.get_metrics() - results = metrics.query_all(name, filter=filter) - if not results: - log.info(f'could not find metric "{name}"') + if len(res) != 2: return None - assert len(results) == 1, f"metric {name} with given filters is not unique, got: {results}" - return results[0].value - - def get_metrics_values( - self, names: list[str], filter: Optional[Dict[str, str]] = None - ) -> Dict[str, float]: - """ - When fetching multiple named metrics, it is more efficient to use this - than to call `get_metric_value` repeatedly. - - Throws RuntimeError if no metrics matching `names` are found, or if - not all of `names` are found: this method is intended for loading sets - of metrics whose existence is coupled. - """ - metrics = self.get_metrics() - samples = [] - for name in names: - samples.extend(metrics.query_all(name, filter=filter)) - - result = {} - for sample in samples: - if sample.name in result: - raise RuntimeError(f"Multiple values found for {sample.name}") - result[sample.name] = sample.value - - if len(result) != len(names): - log.info(f"Metrics found: {metrics.metrics}") - raise RuntimeError(f"could not find all metrics {' '.join(names)}") - - return result + inc, dec = [res[metric] for metric in metrics] + queue_count = int(inc) - int(dec) + assert queue_count >= 0 + return queue_count def layer_map_info( self, @@ -821,3 +830,16 @@ class PageserverHttpClient(requests.Session): self.put( f"http://localhost:{self.port}/v1/deletion_queue/flush?execute={'true' if execute else 'false'}" ).raise_for_status() + + def timeline_wait_logical_size(self, tenant_id: TenantId, timeline_id: TimelineId) -> int: + detail = self.timeline_detail( + tenant_id, + timeline_id, + include_non_incremental_logical_size=True, + force_await_initial_logical_size=True, + ) + current_logical_size = detail["current_logical_size"] + non_incremental = detail["current_logical_size_non_incremental"] + assert current_logical_size == non_incremental + assert 
isinstance(current_logical_size, int) + return current_logical_size diff --git a/test_runner/fixtures/pageserver/many_tenants.py b/test_runner/fixtures/pageserver/many_tenants.py index bbb4ccee5b..f47a3ea043 100644 --- a/test_runner/fixtures/pageserver/many_tenants.py +++ b/test_runner/fixtures/pageserver/many_tenants.py @@ -43,7 +43,7 @@ def single_timeline( log.info("detach template tenant form pageserver") env.pageserver.tenant_detach(template_tenant) env.pageserver.allowed_errors.append( - # tenant detach causes this because the underlying attach-hook removes the tenant from attachment_service entirely + # tenant detach causes this because the underlying attach-hook removes the tenant from storage controller entirely ".*Dropped remote consistent LSN updates.*", ) diff --git a/test_runner/fixtures/pageserver/utils.py b/test_runner/fixtures/pageserver/utils.py index 6b2651e447..cf64c86821 100644 --- a/test_runner/fixtures/pageserver/utils.py +++ b/test_runner/fixtures/pageserver/utils.py @@ -1,7 +1,12 @@ import time -from typing import Any, Dict, List, Optional, Union +from typing import Any, Dict, List, Optional, Tuple, Union -from mypy_boto3_s3.type_defs import ListObjectsV2OutputTypeDef, ObjectTypeDef +from mypy_boto3_s3.type_defs import ( + DeleteObjectOutputTypeDef, + EmptyResponseMetadataTypeDef, + ListObjectsV2OutputTypeDef, + ObjectTypeDef, +) from fixtures.log_helper import log from fixtures.pageserver.http import PageserverApiException, PageserverHttpClient @@ -15,7 +20,7 @@ def assert_tenant_state( tenant: TenantId, expected_state: str, message: Optional[str] = None, -): +) -> None: tenant_status = pageserver_http.tenant_status(tenant) log.info(f"tenant_status: {tenant_status}") assert tenant_status["state"]["slug"] == expected_state, message or tenant_status @@ -201,8 +206,8 @@ def wait_for_last_record_lsn( return current_lsn if i % 10 == 0: log.info( - "waiting for last_record_lsn to reach {}, now {}, iteration {}".format( - lsn, current_lsn, i + 1 + 
"{}/{} waiting for last_record_lsn to reach {}, now {}, iteration {}".format( + tenant, timeline, lsn, current_lsn, i + 1 ) ) time.sleep(0.1) @@ -214,20 +219,45 @@ def wait_for_last_record_lsn( def wait_for_upload_queue_empty( pageserver_http: PageserverHttpClient, tenant_id: TenantId, timeline_id: TimelineId ): + wait_period_secs = 0.2 while True: all_metrics = pageserver_http.get_metrics() - tl = all_metrics.query_all( - "pageserver_remote_timeline_client_calls_unfinished", + started = all_metrics.query_all( + "pageserver_remote_timeline_client_calls_started_total", { "tenant_id": str(tenant_id), "timeline_id": str(timeline_id), }, ) - assert len(tl) > 0 - log.info(f"upload queue for {tenant_id}/{timeline_id}: {tl}") - if all(m.value == 0 for m in tl): + finished = all_metrics.query_all( + "pageserver_remote_timeline_client_calls_finished_total", + { + "tenant_id": str(tenant_id), + "timeline_id": str(timeline_id), + }, + ) + + # this is `started left join finished`; if match, subtracting start from finished, resulting in queue depth + remaining_labels = ["shard_id", "file_kind", "op_kind"] + tl: List[Tuple[Any, float]] = [] + for s in started: + found = False + for f in finished: + if all([s.labels[label] == f.labels[label] for label in remaining_labels]): + assert ( + not found + ), "duplicate match, remaining_labels don't uniquely identify sample" + tl.append((s.labels, int(s.value) - int(f.value))) + found = True + if not found: + tl.append((s.labels, int(s.value))) + assert len(tl) == len(started), "something broken with join logic" + log.info(f"upload queue for {tenant_id}/{timeline_id}:") + for labels, queue_count in tl: + log.info(f" {labels}: {queue_count}") + if all(queue_count == 0 for (_, queue_count) in tl): return - time.sleep(0.2) + time.sleep(wait_period_secs) def wait_timeline_detail_404( @@ -262,7 +292,7 @@ def timeline_delete_wait_completed( iterations: int = 20, interval: Optional[float] = None, **delete_args, -): +) -> None: 
pageserver_http.timeline_delete(tenant_id=tenant_id, timeline_id=timeline_id, **delete_args) wait_timeline_detail_404(pageserver_http, tenant_id, timeline_id, iterations, interval) @@ -272,7 +302,7 @@ def assert_prefix_empty( remote_storage: Optional[RemoteStorage], prefix: Optional[str] = None, allowed_postfix: Optional[str] = None, -): +) -> None: assert remote_storage is not None response = list_prefix(remote_storage, prefix) keys = response["KeyCount"] @@ -327,7 +357,6 @@ def list_prefix( """ # For local_fs we need to properly handle empty directories, which we currently dont, so for simplicity stick to s3 api. assert isinstance(remote, S3Storage), "localfs is currently not supported" - assert remote.client is not None prefix_in_bucket = remote.prefix_in_bucket or "" if not prefix: @@ -346,6 +375,65 @@ def list_prefix( return response +def remote_storage_delete_key( + remote: RemoteStorage, + key: str, +) -> DeleteObjectOutputTypeDef: + """ + Note that this function takes into account prefix_in_bucket. + """ + # For local_fs we need to use a different implementation. As we don't need local_fs, just don't support it for now. 
+ assert isinstance(remote, S3Storage), "localfs is currently not supported" + + prefix_in_bucket = remote.prefix_in_bucket or "" + + # real s3 tests have uniqie per test prefix + # mock_s3 tests use special pageserver prefix for pageserver stuff + key = "/".join((prefix_in_bucket, key)) + + response = remote.client.delete_object( + Bucket=remote.bucket_name, + Key=key, + ) + return response + + +def enable_remote_storage_versioning( + remote: RemoteStorage, +) -> EmptyResponseMetadataTypeDef: + """ + Enable S3 versioning for the remote storage + """ + # local_fs has no support for versioning + assert isinstance(remote, S3Storage), "localfs is currently not supported" + + # The SDK supports enabling versioning on normal S3 as well but we don't want to change + # these settings from a test in a live bucket (also, our access isn't enough nor should it be) + assert not remote.real, "Enabling storage versioning only supported on Mock S3" + + # Workaround to enable self-copy until upstream bug is fixed: https://github.com/getmoto/moto/issues/7300 + remote.client.put_bucket_encryption( + Bucket=remote.bucket_name, + ServerSideEncryptionConfiguration={ + "Rules": [ + { + "ApplyServerSideEncryptionByDefault": {"SSEAlgorithm": "AES256"}, + "BucketKeyEnabled": False, + }, + ] + }, + ) + # Note that this doesnt use pagination, so list is not guaranteed to be exhaustive. 
+ response = remote.client.put_bucket_versioning( + Bucket=remote.bucket_name, + VersioningConfiguration={ + "MFADelete": "Disabled", + "Status": "Enabled", + }, + ) + return response + + def wait_tenant_status_404( pageserver_http: PageserverHttpClient, tenant_id: TenantId, @@ -395,8 +483,8 @@ def tenant_delete_wait_completed( MANY_SMALL_LAYERS_TENANT_CONFIG = { "gc_period": "0s", "compaction_period": "0s", - "checkpoint_distance": f"{1024**2}", - "image_creation_threshold": "100", + "checkpoint_distance": 1024**2, + "image_creation_threshold": 100, } diff --git a/test_runner/fixtures/parametrize.py b/test_runner/fixtures/parametrize.py index d8ac92abb6..b28da83508 100644 --- a/test_runner/fixtures/parametrize.py +++ b/test_runner/fixtures/parametrize.py @@ -2,57 +2,61 @@ import os from typing import Optional import pytest -from _pytest.fixtures import FixtureRequest from _pytest.python import Metafunc from fixtures.pg_version import PgVersion """ -Dynamically parametrize tests by Postgres version, build type (debug/release/remote), and possibly by other parameters +Dynamically parametrize tests by different parameters """ @pytest.fixture(scope="function", autouse=True) -def pg_version(request: FixtureRequest) -> Optional[PgVersion]: - # Do not parametrize performance tests yet, we need to prepare grafana charts first - if "test_runner/performance" in str(request.node.path): - v = os.environ.get("DEFAULT_PG_VERSION") - return PgVersion(v) - +def pg_version() -> Optional[PgVersion]: return None @pytest.fixture(scope="function", autouse=True) -def build_type(request: FixtureRequest) -> Optional[str]: - # Do not parametrize performance tests yet, we need to prepare grafana charts first - if "test_runner/performance" in str(request.node.path): - return os.environ.get("BUILD_TYPE", "").lower() - +def build_type() -> Optional[str]: return None @pytest.fixture(scope="function", autouse=True) -def pageserver_virtual_file_io_engine(request: FixtureRequest) -> 
Optional[str]: +def platform() -> Optional[str]: + return None + + +@pytest.fixture(scope="function", autouse=True) +def pageserver_virtual_file_io_engine() -> Optional[str]: return None def pytest_generate_tests(metafunc: Metafunc): - if (v := os.environ.get("DEFAULT_PG_VERSION")) is None: - pg_versions = [version for version in PgVersion if version != PgVersion.NOT_SET] - else: - pg_versions = [PgVersion(v)] - - if (bt := os.environ.get("BUILD_TYPE")) is None: + if (bt := os.getenv("BUILD_TYPE")) is None: build_types = ["debug", "release"] else: build_types = [bt.lower()] - # Do not parametrize performance tests yet by Postgres version or build type, we need to prepare grafana charts first - if "test_runner/performance" not in metafunc.definition._nodeid: - metafunc.parametrize("build_type", build_types) - metafunc.parametrize("pg_version", pg_versions, ids=map(lambda v: f"pg{v}", pg_versions)) + metafunc.parametrize("build_type", build_types) - # A hacky way to parametrize tests only for `pageserver_virtual_file_io_engine=tokio-epoll-uring` - # And do not change test name for default `pageserver_virtual_file_io_engine=std-fs` to keep tests statistics - if (io_engine := os.environ.get("PAGESERVER_VIRTUAL_FILE_IO_ENGINE", "")) not in ("", "std-fs"): + if (v := os.getenv("DEFAULT_PG_VERSION")) is None: + pg_versions = [version for version in PgVersion if version != PgVersion.NOT_SET] + else: + pg_versions = [PgVersion(v)] + + metafunc.parametrize("pg_version", pg_versions, ids=map(lambda v: f"pg{v}", pg_versions)) + + # A hacky way to parametrize tests only for `pageserver_virtual_file_io_engine=std-fs` + # And do not change test name for default `pageserver_virtual_file_io_engine=tokio-epoll-uring` to keep tests statistics + if (io_engine := os.getenv("PAGESERVER_VIRTUAL_FILE_IO_ENGINE", "")) not in ( + "", + "tokio-epoll-uring", + ): metafunc.parametrize("pageserver_virtual_file_io_engine", [io_engine]) + + # For performance tests, parametrize also by platform + 
if ( + "test_runner/performance" in metafunc.definition._nodeid + and (platform := os.getenv("PLATFORM")) is not None + ): + metafunc.parametrize("platform", [platform.lower()]) diff --git a/test_runner/fixtures/pg_version.py b/test_runner/fixtures/pg_version.py index 657718da00..941889a2f5 100644 --- a/test_runner/fixtures/pg_version.py +++ b/test_runner/fixtures/pg_version.py @@ -52,7 +52,7 @@ class PgVersion(str, enum.Enum): return None -DEFAULT_VERSION: PgVersion = PgVersion.V14 +DEFAULT_VERSION: PgVersion = PgVersion.V15 def skip_on_postgres(version: PgVersion, reason: str): @@ -78,6 +78,13 @@ def pytest_addoption(parser: Parser): ) +def run_only_on_default_postgres(reason: str): + return pytest.mark.skipif( + PgVersion(os.environ.get("DEFAULT_PG_VERSION", DEFAULT_VERSION)) is not DEFAULT_VERSION, + reason=reason, + ) + + def pytest_configure(config: Config): if config.getoption("--pg-version"): raise Exception("--pg-version is deprecated, use DEFAULT_PG_VERSION env var instead") diff --git a/test_runner/fixtures/remote_storage.py b/test_runner/fixtures/remote_storage.py index c0c2383feb..60591d8d46 100644 --- a/test_runner/fixtures/remote_storage.py +++ b/test_runner/fixtures/remote_storage.py @@ -160,8 +160,9 @@ class LocalFsStorage: class S3Storage: bucket_name: str bucket_region: str - access_key: str - secret_key: str + access_key: Optional[str] + secret_key: Optional[str] + aws_profile: Optional[str] prefix_in_bucket: str client: S3Client cleanup: bool @@ -170,10 +171,18 @@ class S3Storage: endpoint: Optional[str] = None def access_env_vars(self) -> Dict[str, str]: - return { - "AWS_ACCESS_KEY_ID": self.access_key, - "AWS_SECRET_ACCESS_KEY": self.secret_key, - } + if self.aws_profile is not None: + return { + "AWS_PROFILE": self.aws_profile, + } + if self.access_key is not None and self.secret_key is not None: + return { + "AWS_ACCESS_KEY_ID": self.access_key, + "AWS_SECRET_ACCESS_KEY": self.secret_key, + } + raise RuntimeError( + "Either AWS_PROFILE or 
(AWS_ACCESS_KEY_ID and AWS_SECRET_ACCESS_KEY) have to be set for S3Storage" + ) def to_string(self) -> str: return json.dumps( @@ -243,6 +252,16 @@ class S3Storage: log.info(f"deleted {cnt} objects from remote storage") + def tenant_path(self, tenant_id: TenantId) -> str: + return f"{self.prefix_in_bucket}/tenants/{tenant_id}" + + def heatmap_key(self, tenant_id: TenantId) -> str: + return f"{self.tenant_path(tenant_id)}/{TENANT_HEATMAP_FILE_NAME}" + + def heatmap_content(self, tenant_id: TenantId): + r = self.client.get_object(Bucket=self.bucket_name, Key=self.heatmap_key(tenant_id)) + return json.loads(r["Body"].read().decode("utf-8")) + RemoteStorage = Union[LocalFsStorage, S3Storage] @@ -308,6 +327,7 @@ class RemoteStorageKind(str, enum.Enum): bucket_region=mock_region, access_key=access_key, secret_key=secret_key, + aws_profile=None, prefix_in_bucket="", client=client, cleanup=False, @@ -317,12 +337,11 @@ class RemoteStorageKind(str, enum.Enum): assert self == RemoteStorageKind.REAL_S3 env_access_key = os.getenv("AWS_ACCESS_KEY_ID") - assert env_access_key, "no aws access key provided" env_secret_key = os.getenv("AWS_SECRET_ACCESS_KEY") - assert env_secret_key, "no aws access key provided" - - # session token is needed for local runs with sso auth - session_token = os.getenv("AWS_SESSION_TOKEN") + env_profile = os.getenv("AWS_PROFILE") + assert ( + env_access_key and env_secret_key + ) or env_profile, "need to specify either access key and secret access key or profile" bucket_name = bucket_name or os.getenv("REMOTE_STORAGE_S3_BUCKET") assert bucket_name is not None, "no remote storage bucket name provided" @@ -334,9 +353,6 @@ class RemoteStorageKind(str, enum.Enum): client = boto3.client( "s3", region_name=bucket_region, - aws_access_key_id=env_access_key, - aws_secret_access_key=env_secret_key, - aws_session_token=session_token, ) return S3Storage( @@ -344,6 +360,7 @@ class RemoteStorageKind(str, enum.Enum): bucket_region=bucket_region, 
access_key=env_access_key, secret_key=env_secret_key, + aws_profile=env_profile, prefix_in_bucket=prefix_in_bucket, client=client, cleanup=True, diff --git a/test_runner/fixtures/safekeeper/__init__.py b/test_runner/fixtures/safekeeper/__init__.py new file mode 100644 index 0000000000..e69de29bb2 diff --git a/test_runner/fixtures/safekeeper/http.py b/test_runner/fixtures/safekeeper/http.py new file mode 100644 index 0000000000..b9c1986818 --- /dev/null +++ b/test_runner/fixtures/safekeeper/http.py @@ -0,0 +1,227 @@ +import json +import re +from dataclasses import dataclass, field +from typing import Any, Dict, List, Optional, Tuple, Union + +import pytest +import requests + +from fixtures.log_helper import log +from fixtures.types import Lsn, TenantId, TimelineId + + +# Walreceiver as returned by sk's timeline status endpoint. +@dataclass +class Walreceiver: + conn_id: int + state: str + + +@dataclass +class SafekeeperTimelineStatus: + acceptor_epoch: int + pg_version: int # Not exactly a PgVersion, safekeeper returns version as int, for example 150002 for 15.2 + flush_lsn: Lsn + commit_lsn: Lsn + timeline_start_lsn: Lsn + backup_lsn: Lsn + peer_horizon_lsn: Lsn + remote_consistent_lsn: Lsn + walreceivers: List[Walreceiver] + + +@dataclass +class SafekeeperMetrics: + # These are metrics from Prometheus which uses float64 internally. + # As a consequence, values may differ from real original int64s. 
+ flush_lsn_inexact: Dict[Tuple[TenantId, TimelineId], int] = field(default_factory=dict) + commit_lsn_inexact: Dict[Tuple[TenantId, TimelineId], int] = field(default_factory=dict) + + +class SafekeeperHttpClient(requests.Session): + HTTPError = requests.HTTPError + + def __init__(self, port: int, auth_token: Optional[str] = None, is_testing_enabled=False): + super().__init__() + self.port = port + self.auth_token = auth_token + self.is_testing_enabled = is_testing_enabled + + if auth_token is not None: + self.headers["Authorization"] = f"Bearer {auth_token}" + + def check_status(self): + self.get(f"http://localhost:{self.port}/v1/status").raise_for_status() + + def is_testing_enabled_or_skip(self): + if not self.is_testing_enabled: + pytest.skip("safekeeper was built without 'testing' feature") + + def configure_failpoints(self, config_strings: Union[Tuple[str, str], List[Tuple[str, str]]]): + self.is_testing_enabled_or_skip() + + if isinstance(config_strings, tuple): + pairs = [config_strings] + else: + pairs = config_strings + + log.info(f"Requesting config failpoints: {repr(pairs)}") + + res = self.put( + f"http://localhost:{self.port}/v1/failpoints", + json=[{"name": name, "actions": actions} for name, actions in pairs], + ) + log.info(f"Got failpoints request response code {res.status_code}") + res.raise_for_status() + res_json = res.json() + assert res_json is None + return res_json + + def debug_dump(self, params: Optional[Dict[str, str]] = None) -> Dict[str, Any]: + params = params or {} + res = self.get(f"http://localhost:{self.port}/v1/debug_dump", params=params) + res.raise_for_status() + res_json = json.loads(res.text) + assert isinstance(res_json, dict) + return res_json + + def patch_control_file( + self, + tenant_id: TenantId, + timeline_id: TimelineId, + patch: Dict[str, Any], + ) -> Dict[str, Any]: + res = self.patch( + f"http://localhost:{self.port}/v1/tenant/{tenant_id}/timeline/{timeline_id}/control_file", + json={ + "updates": patch, + 
"apply_fields": list(patch.keys()), + }, + ) + res.raise_for_status() + res_json = res.json() + assert isinstance(res_json, dict) + return res_json + + def pull_timeline(self, body: Dict[str, Any]) -> Dict[str, Any]: + res = self.post(f"http://localhost:{self.port}/v1/pull_timeline", json=body) + res.raise_for_status() + res_json = res.json() + assert isinstance(res_json, dict) + return res_json + + def copy_timeline(self, tenant_id: TenantId, timeline_id: TimelineId, body: Dict[str, Any]): + res = self.post( + f"http://localhost:{self.port}/v1/tenant/{tenant_id}/timeline/{timeline_id}/copy", + json=body, + ) + res.raise_for_status() + + def timeline_digest( + self, tenant_id: TenantId, timeline_id: TimelineId, from_lsn: Lsn, until_lsn: Lsn + ) -> Dict[str, Any]: + res = self.get( + f"http://localhost:{self.port}/v1/tenant/{tenant_id}/timeline/{timeline_id}/digest", + params={ + "from_lsn": str(from_lsn), + "until_lsn": str(until_lsn), + }, + ) + res.raise_for_status() + res_json = res.json() + assert isinstance(res_json, dict) + return res_json + + def timeline_create( + self, + tenant_id: TenantId, + timeline_id: TimelineId, + pg_version: int, # Not exactly a PgVersion, safekeeper returns version as int, for example 150002 for 15.2 + commit_lsn: Lsn, + ): + body = { + "tenant_id": str(tenant_id), + "timeline_id": str(timeline_id), + "pg_version": pg_version, + "commit_lsn": str(commit_lsn), + } + res = self.post(f"http://localhost:{self.port}/v1/tenant/timeline", json=body) + res.raise_for_status() + + def timeline_status( + self, tenant_id: TenantId, timeline_id: TimelineId + ) -> SafekeeperTimelineStatus: + res = self.get(f"http://localhost:{self.port}/v1/tenant/{tenant_id}/timeline/{timeline_id}") + res.raise_for_status() + resj = res.json() + walreceivers = [Walreceiver(wr["conn_id"], wr["status"]) for wr in resj["walreceivers"]] + return SafekeeperTimelineStatus( + acceptor_epoch=resj["acceptor_state"]["epoch"], + pg_version=resj["pg_info"]["pg_version"], + 
flush_lsn=Lsn(resj["flush_lsn"]), + commit_lsn=Lsn(resj["commit_lsn"]), + timeline_start_lsn=Lsn(resj["timeline_start_lsn"]), + backup_lsn=Lsn(resj["backup_lsn"]), + peer_horizon_lsn=Lsn(resj["peer_horizon_lsn"]), + remote_consistent_lsn=Lsn(resj["remote_consistent_lsn"]), + walreceivers=walreceivers, + ) + + def get_commit_lsn(self, tenant_id: TenantId, timeline_id: TimelineId) -> Lsn: + return self.timeline_status(tenant_id, timeline_id).commit_lsn + + def record_safekeeper_info(self, tenant_id: TenantId, timeline_id: TimelineId, body): + res = self.post( + f"http://localhost:{self.port}/v1/record_safekeeper_info/{tenant_id}/{timeline_id}", + json=body, + ) + res.raise_for_status() + + # only_local doesn't remove segments in the remote storage. + def timeline_delete( + self, tenant_id: TenantId, timeline_id: TimelineId, only_local: bool = False + ) -> Dict[Any, Any]: + res = self.delete( + f"http://localhost:{self.port}/v1/tenant/{tenant_id}/timeline/{timeline_id}", + params={ + "only_local": str(only_local).lower(), + }, + ) + res.raise_for_status() + res_json = res.json() + assert isinstance(res_json, dict) + return res_json + + def tenant_delete_force(self, tenant_id: TenantId) -> Dict[Any, Any]: + res = self.delete(f"http://localhost:{self.port}/v1/tenant/{tenant_id}") + res.raise_for_status() + res_json = res.json() + assert isinstance(res_json, dict) + return res_json + + def get_metrics_str(self) -> str: + request_result = self.get(f"http://localhost:{self.port}/metrics") + request_result.raise_for_status() + return request_result.text + + def get_metrics(self) -> SafekeeperMetrics: + all_metrics_text = self.get_metrics_str() + + metrics = SafekeeperMetrics() + for match in re.finditer( + r'^safekeeper_flush_lsn{tenant_id="([0-9a-f]+)",timeline_id="([0-9a-f]+)"} (\S+)$', + all_metrics_text, + re.MULTILINE, + ): + metrics.flush_lsn_inexact[(TenantId(match.group(1)), TimelineId(match.group(2)))] = int( + match.group(3) + ) + for match in re.finditer( + 
r'^safekeeper_commit_lsn{tenant_id="([0-9a-f]+)",timeline_id="([0-9a-f]+)"} (\S+)$', + all_metrics_text, + re.MULTILINE, + ): + metrics.commit_lsn_inexact[ + (TenantId(match.group(1)), TimelineId(match.group(2))) + ] = int(match.group(3)) + return metrics diff --git a/test_runner/fixtures/safekeeper/utils.py b/test_runner/fixtures/safekeeper/utils.py new file mode 100644 index 0000000000..2818a493d6 --- /dev/null +++ b/test_runner/fixtures/safekeeper/utils.py @@ -0,0 +1,11 @@ +from fixtures.log_helper import log +from fixtures.safekeeper.http import SafekeeperHttpClient +from fixtures.types import TenantId, TimelineId + + +def are_walreceivers_absent( + sk_http_cli: SafekeeperHttpClient, tenant_id: TenantId, timeline_id: TimelineId +): + status = sk_http_cli.timeline_status(tenant_id, timeline_id) + log.info(f"waiting for walreceivers to be gone, currently {status.walreceivers}") + return len(status.walreceivers) == 0 diff --git a/test_runner/fixtures/utils.py b/test_runner/fixtures/utils.py index 91f33e1196..7fc3bae3af 100644 --- a/test_runner/fixtures/utils.py +++ b/test_runner/fixtures/utils.py @@ -369,7 +369,12 @@ def start_in_background( return spawned_process -def wait_until(number_of_iterations: int, interval: float, func: Fn): +WaitUntilRet = TypeVar("WaitUntilRet") + + +def wait_until( + number_of_iterations: int, interval: float, func: Callable[[], WaitUntilRet] +) -> WaitUntilRet: """ Wait until 'func' returns successfully, without exception. Returns the last return value from the function. @@ -387,6 +392,18 @@ def wait_until(number_of_iterations: int, interval: float, func: Fn): raise Exception("timed out while waiting for %s" % func) from last_exception +def assert_eq(a, b) -> None: + assert a == b + + +def assert_gt(a, b) -> None: + assert a > b + + +def assert_ge(a, b) -> None: + assert a >= b + + def run_pg_bench_small(pg_bin: "PgBin", connstr: str): """ Fast way to populate data. 
diff --git a/test_runner/fixtures/workload.py b/test_runner/fixtures/workload.py index 30def1194d..1d5394dc1d 100644 --- a/test_runner/fixtures/workload.py +++ b/test_runner/fixtures/workload.py @@ -21,12 +21,21 @@ class Workload: - reads, checking we get the right data (`validate`) """ - def __init__(self, env: NeonEnv, tenant_id: TenantId, timeline_id: TimelineId): + def __init__( + self, + env: NeonEnv, + tenant_id: TenantId, + timeline_id: TimelineId, + branch_name: Optional[str] = None, + ): self.env = env self.tenant_id = tenant_id self.timeline_id = timeline_id self.table = "foo" + # By default, use the default branch name for initial tenant in NeonEnv + self.branch_name = branch_name or "main" + self.expect_rows = 0 self.churn_cursor = 0 @@ -35,7 +44,7 @@ class Workload: def endpoint(self, pageserver_id: Optional[int] = None) -> Endpoint: if self._endpoint is None: self._endpoint = self.env.endpoints.create( - "main", + self.branch_name, tenant_id=self.tenant_id, pageserver_id=pageserver_id, endpoint_id="ep-workload", @@ -64,7 +73,7 @@ class Workload: self.env, endpoint, self.tenant_id, self.timeline_id, pageserver_id=pageserver_id ) - def write_rows(self, n, pageserver_id: Optional[int] = None): + def write_rows(self, n, pageserver_id: Optional[int] = None, upload: bool = True): endpoint = self.endpoint(pageserver_id) start = self.expect_rows end = start + n - 1 @@ -78,9 +87,12 @@ class Workload: """ ) - return last_flush_lsn_upload( - self.env, endpoint, self.tenant_id, self.timeline_id, pageserver_id=pageserver_id - ) + if upload: + return last_flush_lsn_upload( + self.env, endpoint, self.tenant_id, self.timeline_id, pageserver_id=pageserver_id + ) + else: + return False def churn_rows(self, n, pageserver_id: Optional[int] = None, upload=True): assert self.expect_rows >= n diff --git a/test_runner/performance/pageserver/interactive/test_many_small_tenants.py b/test_runner/performance/pageserver/interactive/test_many_small_tenants.py index 
3fb28ace46..0ff9c8fdaa 100644 --- a/test_runner/performance/pageserver/interactive/test_many_small_tenants.py +++ b/test_runner/performance/pageserver/interactive/test_many_small_tenants.py @@ -56,7 +56,7 @@ def setup_env( template_tenant, template_timeline = env.neon_cli.create_tenant(set_default=True) env.pageserver.tenant_detach(template_tenant) env.pageserver.allowed_errors.append( - # tenant detach causes this because the underlying attach-hook removes the tenant from attachment_service entirely + # tenant detach causes this because the underlying attach-hook removes the tenant from storage controller entirely ".*Dropped remote consistent LSN updates.*", ) env.pageserver.tenant_attach(template_tenant, config) diff --git a/test_runner/performance/pageserver/pagebench/test_large_slru_basebackup.py b/test_runner/performance/pageserver/pagebench/test_large_slru_basebackup.py new file mode 100644 index 0000000000..c98fa44b1a --- /dev/null +++ b/test_runner/performance/pageserver/pagebench/test_large_slru_basebackup.py @@ -0,0 +1,200 @@ +import asyncio +import json +import os +from pathlib import Path +from typing import Any, Dict, Tuple + +import pytest +from fixtures.benchmark_fixture import MetricReport, NeonBenchmarker +from fixtures.log_helper import log +from fixtures.neon_fixtures import Endpoint, NeonEnv, NeonEnvBuilder, PgBin, wait_for_last_flush_lsn +from fixtures.utils import get_scale_for_db, humantime_to_ms + +from performance.pageserver.util import ( + setup_pageserver_with_tenants, +) + + +@pytest.mark.parametrize("duration", [30]) +@pytest.mark.parametrize("pgbench_scale", [get_scale_for_db(200)]) +@pytest.mark.parametrize("n_tenants", [10]) +@pytest.mark.parametrize("get_vectored_impl", ["sequential", "vectored"]) +@pytest.mark.timeout(1000) +@pytest.mark.skipif( + os.getenv("CI", "false") == "true", + reason="The test if flaky on CI: https://github.com/neondatabase/neon/issues/7006", +) +def test_basebackup_with_high_slru_count( + neon_env_builder: 
NeonEnvBuilder, + zenbenchmark: NeonBenchmarker, + pg_bin: PgBin, + get_vectored_impl: str, + n_tenants: int, + pgbench_scale: int, + duration: int, +): + def record(metric, **kwargs): + zenbenchmark.record(metric_name=f"pageserver_basebackup.{metric}", **kwargs) + + params: Dict[str, Tuple[Any, Dict[str, Any]]] = {} + + # params from fixtures + params.update( + { + "n_tenants": (n_tenants, {"unit": ""}), + "pgbench_scale": (pgbench_scale, {"unit": ""}), + "duration": (duration, {"unit": "s"}), + } + ) + + # configure cache sizes like in prod + page_cache_size = 16384 + max_file_descriptors = 500000 + neon_env_builder.pageserver_config_override = ( + f"page_cache_size={page_cache_size}; max_file_descriptors={max_file_descriptors}; " + f"get_vectored_impl='{get_vectored_impl}'; validate_vectored_get=false" + ) + params.update( + { + "pageserver_config_override.page_cache_size": ( + page_cache_size * 8192, + {"unit": "byte"}, + ), + "pageserver_config_override.max_file_descriptors": (max_file_descriptors, {"unit": ""}), + } + ) + + for param, (value, kwargs) in params.items(): + record(param, metric_value=value, report=MetricReport.TEST_PARAM, **kwargs) + + n_txns = 500000 + + def setup_wrapper(env: NeonEnv): + return setup_tenant_template(env, n_txns) + + env = setup_pageserver_with_tenants( + neon_env_builder, f"large_slru_count-{n_tenants}-{n_txns}", n_tenants, setup_wrapper + ) + run_benchmark(env, pg_bin, record, duration) + + +def setup_tenant_template(env: NeonEnv, n_txns: int): + config = { + "gc_period": "0s", # disable periodic gc + "checkpoint_timeout": "10 years", + "compaction_period": "0s", # disable periodic compaction + "compaction_threshold": 10, + "compaction_target_size": 134217728, + "checkpoint_distance": 268435456, + "image_creation_threshold": 3, + } + + template_tenant, template_timeline = env.neon_cli.create_tenant(set_default=True) + env.pageserver.tenant_detach(template_tenant) + env.pageserver.allowed_errors.append( + # tenant detach 
causes this because the underlying attach-hook removes the tenant from storage controller entirely + ".*Dropped remote consistent LSN updates.*", + ) + env.pageserver.tenant_attach(template_tenant, config) + + ps_http = env.pageserver.http_client() + + with env.endpoints.create_start( + "main", tenant_id=template_tenant, config_lines=["shared_buffers=1MB"] + ) as ep: + rels = 10 + + asyncio.run(run_updates(ep, n_txns, rels)) + + wait_for_last_flush_lsn(env, ep, template_tenant, template_timeline) + ps_http.timeline_checkpoint(template_tenant, template_timeline) + ps_http.timeline_compact(template_tenant, template_timeline) + + return (template_tenant, template_timeline, config) + + +# Takes about 5 minutes and produces tenants with around 300 SLRU blocks +# of 8 KiB each. +async def run_updates(ep: Endpoint, n_txns: int, workers_count: int): + workers = [] + for i in range(workers_count): + workers.append(asyncio.create_task(run_update_loop_worker(ep, n_txns, i))) + + await asyncio.gather(*workers) + + +async def run_update_loop_worker(ep: Endpoint, n_txns: int, idx: int): + table = f"t_{idx}" + conn = await ep.connect_async() + await conn.execute(f"CREATE TABLE {table} (pk integer PRIMARY KEY, x integer)") + await conn.execute(f"ALTER TABLE {table} SET (autovacuum_enabled = false)") + await conn.execute(f"INSERT INTO {table} VALUES (1, 0)") + await conn.execute( + """ + CREATE PROCEDURE updating{0}() as + $$ + DECLARE + i integer; + BEGIN + FOR i IN 1..{1} LOOP + UPDATE {0} SET x = x + 1 WHERE pk=1; + COMMIT; + END LOOP; + END + $$ LANGUAGE plpgsql + """.format(table, n_txns) + ) + await conn.execute("SET statement_timeout=0") + await conn.execute(f"call updating{table}()") + + +def run_benchmark(env: NeonEnv, pg_bin: PgBin, record, duration_secs: int): + ps_http = env.pageserver.http_client() + cmd = [ + str(env.neon_binpath / "pagebench"), + "basebackup", + "--mgmt-api-endpoint", + ps_http.base_url, + "--page-service-connstring", + 
env.pageserver.connstr(password=None), + "--gzip-probability", + "1", + "--runtime", + f"{duration_secs}s", + # don't specify the targets explicitly, let pagebench auto-discover them + ] + + log.info(f"command: {' '.join(cmd)}") + basepath = pg_bin.run_capture(cmd, with_command_header=False) + results_path = Path(basepath + ".stdout") + log.info(f"Benchmark results at: {results_path}") + + with open(results_path, "r") as f: + results = json.load(f) + log.info(f"Results:\n{json.dumps(results, sort_keys=True, indent=2)}") + + total = results["total"] + metric = "request_count" + record( + metric, + metric_value=total[metric], + unit="", + report=MetricReport.HIGHER_IS_BETTER, + ) + + metric = "latency_mean" + record( + metric, + metric_value=humantime_to_ms(total[metric]), + unit="ms", + report=MetricReport.LOWER_IS_BETTER, + ) + + metric = "latency_percentiles" + for k, v in total[metric].items(): + record( + f"{metric}.{k}", + metric_value=humantime_to_ms(v), + unit="ms", + report=MetricReport.LOWER_IS_BETTER, + ) diff --git a/test_runner/performance/pageserver/pagebench/test_pageserver_max_throughput_getpage_at_latest_lsn.py b/test_runner/performance/pageserver/pagebench/test_pageserver_max_throughput_getpage_at_latest_lsn.py index 1ed7e577b9..1a0012397c 100644 --- a/test_runner/performance/pageserver/pagebench/test_pageserver_max_throughput_getpage_at_latest_lsn.py +++ b/test_runner/performance/pageserver/pagebench/test_pageserver_max_throughput_getpage_at_latest_lsn.py @@ -1,8 +1,8 @@ import json +import os from pathlib import Path from typing import Any, Dict, Tuple -import fixtures.pageserver.many_tenants as many_tenants import pytest from fixtures.benchmark_fixture import MetricReport, NeonBenchmarker from fixtures.log_helper import log @@ -14,7 +14,9 @@ from fixtures.neon_fixtures import ( ) from fixtures.utils import get_scale_for_db, humantime_to_ms -from performance.pageserver.util import ensure_pageserver_ready_for_benchmarking +from 
performance.pageserver.util import ( + setup_pageserver_with_tenants, +) # For reference, the space usage of the snapshots: @@ -33,6 +35,10 @@ from performance.pageserver.util import ensure_pageserver_ready_for_benchmarking @pytest.mark.timeout( 10000 ) # TODO: this value is just "a really high number"; have this per instance type +@pytest.mark.skipif( + os.getenv("CI", "false") == "true", + reason="The test if flaky on CI: https://github.com/neondatabase/neon/issues/6724", +) def test_pageserver_max_throughput_getpage_at_latest_lsn( neon_env_builder: NeonEnvBuilder, zenbenchmark: NeonBenchmarker, @@ -75,10 +81,77 @@ def test_pageserver_max_throughput_getpage_at_latest_lsn( for param, (value, kwargs) in params.items(): record(param, metric_value=value, report=MetricReport.TEST_PARAM, **kwargs) - env = setup_pageserver_with_pgbench_tenants(neon_env_builder, pg_bin, n_tenants, pgbench_scale) + + def setup_wrapper(env: NeonEnv): + return setup_tenant_template(env, pg_bin, pgbench_scale) + + env = setup_pageserver_with_tenants( + neon_env_builder, + f"max_throughput_latest_lsn-{n_tenants}-{pgbench_scale}", + n_tenants, + setup_wrapper, + ) run_benchmark_max_throughput_latest_lsn(env, pg_bin, record, duration) +def setup_tenant_template(env: NeonEnv, pg_bin: PgBin, scale: int): + """ + Set up a template tenant which will be replicated by the test infra. + It's a pgbench tenant, initialized to a certain scale, and treated afterwards + with a repeat application of (pgbench simple-update workload, checkpoint, compact). + """ + # use a config that makes production of on-disk state timing-insensitive + # as we ingest data into the tenant. 
+ config = { + "gc_period": "0s", # disable periodic gc + "checkpoint_timeout": "10 years", + "compaction_period": "0s", # disable periodic compaction + "compaction_threshold": 10, + "compaction_target_size": 134217728, + "checkpoint_distance": 268435456, + "image_creation_threshold": 3, + } + template_tenant, template_timeline = env.neon_cli.create_tenant(set_default=True) + env.pageserver.tenant_detach(template_tenant) + env.pageserver.allowed_errors.append( + # tenant detach causes this because the underlying attach-hook removes the tenant from storage controller entirely + ".*Dropped remote consistent LSN updates.*", + ) + env.pageserver.tenant_attach(template_tenant, config) + ps_http = env.pageserver.http_client() + with env.endpoints.create_start("main", tenant_id=template_tenant) as ep: + pg_bin.run_capture(["pgbench", "-i", f"-s{scale}", "-I", "dtGvp", ep.connstr()]) + wait_for_last_flush_lsn(env, ep, template_tenant, template_timeline) + ps_http.timeline_checkpoint(template_tenant, template_timeline) + ps_http.timeline_compact(template_tenant, template_timeline) + for _ in range( + 0, 17 + ): # some prime number to avoid potential resonances with the "_threshold" variables from the config + # the L0s produced by this appear to have size ~5MiB + num_txns = 10_000 + pg_bin.run_capture( + ["pgbench", "-N", "-c1", "--transactions", f"{num_txns}", ep.connstr()] + ) + wait_for_last_flush_lsn(env, ep, template_tenant, template_timeline) + ps_http.timeline_checkpoint(template_tenant, template_timeline) + ps_http.timeline_compact(template_tenant, template_timeline) + # for reference, the output at scale=6 looked like so (306M total) + # ls -sh test_output/shared-snapshots/max_throughput_latest_lsn-2-6/snapshot/pageserver_1/tenants/35c30b88ea16a7a09f82d9c6a115551b/timelines/da902b378eebe83dc8a4e81cd3dc1c59 + # total 306M + # 188M 000000000000000000000000000000000000-030000000000000000000000000000000003__000000000149F060-0000000009E75829 + # 4.5M 
000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__0000000009E75829-000000000A21E919 + # 33M 000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__000000000A21E919-000000000C20CB71 + # 36M 000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__000000000C20CB71-000000000E470791 + # 16M 000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__000000000E470791-000000000F34AEF1 + # 8.2M 000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__000000000F34AEF1-000000000FABA8A9 + # 6.0M 000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__000000000FABA8A9-000000000FFE0639 + # 6.1M 000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__000000000FFE0639-000000001051D799 + # 4.7M 000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__000000001051D799-0000000010908F19 + # 4.6M 000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__0000000010908F19-0000000010CD3021 + + return (template_tenant, template_timeline, config) + + def run_benchmark_max_throughput_latest_lsn( env: NeonEnv, pg_bin: PgBin, record, duration_secs: int ): @@ -133,78 +206,3 @@ def run_benchmark_max_throughput_latest_lsn( unit="ms", report=MetricReport.LOWER_IS_BETTER, ) - - -def setup_pageserver_with_pgbench_tenants( - neon_env_builder: NeonEnvBuilder, - pg_bin: PgBin, - n_tenants: int, - scale: int, -) -> NeonEnv: - """ - Utility function to set up a pageserver with a given number of identical tenants. - Each tenant is a pgbench tenant, initialize to a certain scale, and treated afterwards - with a repeat application of (pgbench simple-update workload, checkpoint, compact). - """ - - def setup_template(env: NeonEnv): - # use a config that makes production of on-disk state timing-insensitive - # as we ingest data into the tenant. 
- config = { - "gc_period": "0s", # disable periodic gc - "checkpoint_timeout": "10 years", - "compaction_period": "0s", # disable periodic compaction - "compaction_threshold": 10, - "compaction_target_size": 134217728, - "checkpoint_distance": 268435456, - "image_creation_threshold": 3, - } - template_tenant, template_timeline = env.neon_cli.create_tenant(set_default=True) - env.pageserver.tenant_detach(template_tenant) - env.pageserver.allowed_errors.append( - # tenant detach causes this because the underlying attach-hook removes the tenant from attachment_service entirely - ".*Dropped remote consistent LSN updates.*", - ) - env.pageserver.tenant_attach(template_tenant, config) - ps_http = env.pageserver.http_client() - with env.endpoints.create_start("main", tenant_id=template_tenant) as ep: - pg_bin.run_capture(["pgbench", "-i", f"-s{scale}", "-I", "dtGvp", ep.connstr()]) - wait_for_last_flush_lsn(env, ep, template_tenant, template_timeline) - ps_http.timeline_checkpoint(template_tenant, template_timeline) - ps_http.timeline_compact(template_tenant, template_timeline) - for _ in range( - 0, 17 - ): # some prime number to avoid potential resonances with the "_threshold" variables from the config - # the L0s produced by this appear to have size ~5MiB - num_txns = 10_000 - pg_bin.run_capture( - ["pgbench", "-N", "-c1", "--transactions", f"{num_txns}", ep.connstr()] - ) - wait_for_last_flush_lsn(env, ep, template_tenant, template_timeline) - ps_http.timeline_checkpoint(template_tenant, template_timeline) - ps_http.timeline_compact(template_tenant, template_timeline) - # for reference, the output at scale=6 looked like so (306M total) - # ls -sh test_output/shared-snapshots/max_throughput_latest_lsn-2-6/snapshot/pageserver_1/tenants/35c30b88ea16a7a09f82d9c6a115551b/timelines/da902b378eebe83dc8a4e81cd3dc1c59 - # total 306M - # 188M 000000000000000000000000000000000000-030000000000000000000000000000000003__000000000149F060-0000000009E75829 - # 4.5M 
000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__0000000009E75829-000000000A21E919 - # 33M 000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__000000000A21E919-000000000C20CB71 - # 36M 000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__000000000C20CB71-000000000E470791 - # 16M 000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__000000000E470791-000000000F34AEF1 - # 8.2M 000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__000000000F34AEF1-000000000FABA8A9 - # 6.0M 000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__000000000FABA8A9-000000000FFE0639 - # 6.1M 000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__000000000FFE0639-000000001051D799 - # 4.7M 000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__000000001051D799-0000000010908F19 - # 4.6M 000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__0000000010908F19-0000000010CD3021 - - return (template_tenant, template_timeline, config) - - def doit(neon_env_builder: NeonEnvBuilder) -> NeonEnv: - return many_tenants.single_timeline(neon_env_builder, setup_template, n_tenants) - - env = neon_env_builder.build_and_use_snapshot( - f"max_throughput_latest_lsn-{n_tenants}-{scale}", doit - ) - env.start() - ensure_pageserver_ready_for_benchmarking(env, n_tenants) - return env diff --git a/test_runner/performance/pageserver/util.py b/test_runner/performance/pageserver/util.py index 45eb652362..009d62c9ba 100644 --- a/test_runner/performance/pageserver/util.py +++ b/test_runner/performance/pageserver/util.py @@ -2,9 +2,16 @@ Utilities used by all code in this sub-directory """ +from typing import Any, Callable, Dict, Tuple + +import fixtures.pageserver.many_tenants as many_tenants from fixtures.log_helper import log -from fixtures.neon_fixtures import NeonEnv +from fixtures.neon_fixtures import ( + NeonEnv, + 
NeonEnvBuilder, +) from fixtures.pageserver.utils import wait_until_all_tenants_state +from fixtures.types import TenantId, TimelineId def ensure_pageserver_ready_for_benchmarking(env: NeonEnv, n_tenants: int): @@ -27,3 +34,22 @@ def ensure_pageserver_ready_for_benchmarking(env: NeonEnv, n_tenants: int): assert not layer.remote log.info("ready") + + +def setup_pageserver_with_tenants( + neon_env_builder: NeonEnvBuilder, + name: str, + n_tenants: int, + setup: Callable[[NeonEnv], Tuple[TenantId, TimelineId, Dict[str, Any]]], +) -> NeonEnv: + """ + Utility function to set up a pageserver with a given number of identical tenants. + """ + + def doit(neon_env_builder: NeonEnvBuilder) -> NeonEnv: + return many_tenants.single_timeline(neon_env_builder, setup, n_tenants) + + env = neon_env_builder.build_and_use_snapshot(name, doit) + env.start() + ensure_pageserver_ready_for_benchmarking(env, n_tenants) + return env diff --git a/test_runner/performance/test_branch_creation.py b/test_runner/performance/test_branch_creation.py index 6edcb8f1f2..9777bf6748 100644 --- a/test_runner/performance/test_branch_creation.py +++ b/test_runner/performance/test_branch_creation.py @@ -1,4 +1,5 @@ import random +import re import statistics import threading import time @@ -7,11 +8,14 @@ from contextlib import closing from typing import List import pytest -from fixtures.benchmark_fixture import MetricReport +from fixtures.benchmark_fixture import MetricReport, NeonBenchmarker from fixtures.compare_fixtures import NeonCompare from fixtures.log_helper import log +from fixtures.neon_fixtures import NeonPageserver from fixtures.pageserver.utils import wait_for_last_record_lsn from fixtures.types import Lsn +from fixtures.utils import wait_until +from prometheus_client.samples import Sample def _record_branch_creation_durations(neon_compare: NeonCompare, durs: List[float]): @@ -89,11 +93,17 @@ def test_branch_creation_heavy_write(neon_compare: NeonCompare, n_branches: int) 
_record_branch_creation_durations(neon_compare, branch_creation_durations) -@pytest.mark.parametrize("n_branches", [1024]) -# Test measures the latency of branch creation when creating a lot of branches. -def test_branch_creation_many(neon_compare: NeonCompare, n_branches: int): +@pytest.mark.parametrize("n_branches", [500, 1024]) +@pytest.mark.parametrize("shape", ["one_ancestor", "random"]) +def test_branch_creation_many(neon_compare: NeonCompare, n_branches: int, shape: str): + """ + Test measures the latency of branch creation when creating a lot of branches. + """ env = neon_compare.env + # seed the prng so we will measure the same structure every time + rng = random.Random("2024-02-29") + env.neon_cli.create_branch("b0") endpoint = env.endpoints.create_start("b0") @@ -102,15 +112,101 @@ def test_branch_creation_many(neon_compare: NeonCompare, n_branches: int): branch_creation_durations = [] for i in range(n_branches): - # random a source branch - p = random.randint(0, i) + if shape == "random": + parent = f"b{rng.randint(0, i)}" + elif shape == "one_ancestor": + parent = "b0" + else: + raise RuntimeError(f"unimplemented shape: {shape}") + timer = timeit.default_timer() - env.neon_cli.create_branch("b{}".format(i + 1), "b{}".format(p)) + # each of these uploads to remote storage before completion + env.neon_cli.create_branch(f"b{i + 1}", parent) dur = timeit.default_timer() - timer branch_creation_durations.append(dur) _record_branch_creation_durations(neon_compare, branch_creation_durations) + endpoint.stop_and_destroy() + + with neon_compare.record_duration("shutdown"): + # this sleeps 100ms between polls + env.pageserver.stop() + + startup_line = "INFO version: git(-env)?:" + + # find the first line of the log file so we can find the next start later + _, first_start = wait_until(5, 1, lambda: env.pageserver.assert_log_contains(startup_line)) + + # start without gc so we can time compaction with less noise; use shorter + # period for compaction so it starts 
earlier + env.pageserver.start( + overrides=( + "--pageserver-config-override=tenant_config={ compaction_period = '3s', gc_period = '0s' }", + ), + # this does print more than we want, but the number should be comparable between runs + extra_env_vars={ + "RUST_LOG": f"[compaction_loop{{tenant_id={env.initial_tenant}}}]=debug,info" + }, + ) + + _, second_start = wait_until( + 5, 1, lambda: env.pageserver.assert_log_contains(startup_line, first_start) + ) + env.pageserver.quiesce_tenants() + + wait_and_record_startup_metrics(env.pageserver, neon_compare.zenbenchmark, "restart_after") + + # wait for compaction to complete, which most likely has already done so multiple times + msg, _ = wait_until( + 30, + 1, + lambda: env.pageserver.assert_log_contains( + f".*tenant_id={env.initial_tenant}.*: compaction iteration complete.*", second_start + ), + ) + needle = re.search(" elapsed_ms=([0-9]+)", msg) + assert needle is not None, "failed to find the elapsed time" + duration = int(needle.group(1)) / 1000.0 + neon_compare.zenbenchmark.record("compaction", duration, "s", MetricReport.LOWER_IS_BETTER) + + +def wait_and_record_startup_metrics( + pageserver: NeonPageserver, target: NeonBenchmarker, prefix: str +): + """ + Waits until all startup metrics have non-zero values on the pageserver, then records them on the target + """ + + client = pageserver.http_client() + + expected_labels = set( + [ + "background_jobs_can_start", + "complete", + "initial", + "initial_tenant_load", + "initial_tenant_load_remote", + ] + ) + + def metrics_are_filled() -> List[Sample]: + m = client.get_metrics() + samples = m.query_all("pageserver_startup_duration_seconds") + # we should not have duplicate labels + matching = [ + x for x in samples if x.labels.get("phase") in expected_labels and x.value > 0.0 + ] + assert len(matching) == len(expected_labels) + return matching + + samples = wait_until(10, 1, metrics_are_filled) + + for sample in samples: + phase = sample.labels["phase"] + name = 
f"{prefix}.{phase}" + target.record(name, sample.value, "s", MetricReport.LOWER_IS_BETTER) + # Test measures the branch creation time when branching from a timeline with a lot of relations. # diff --git a/test_runner/performance/test_bulk_insert.py b/test_runner/performance/test_bulk_insert.py index 72173dc2a7..9e3f602237 100644 --- a/test_runner/performance/test_bulk_insert.py +++ b/test_runner/performance/test_bulk_insert.py @@ -56,12 +56,12 @@ def measure_recovery_time(env: NeonCompare): # Delete the Tenant in the pageserver: this will drop local and remote layers, such that # when we "create" the Tenant again, we will replay the WAL from the beginning. # - # This is a "weird" thing to do, and can confuse the attachment service as we're re-using + # This is a "weird" thing to do, and can confuse the storage controller as we're re-using # the same tenant ID for a tenant that is logically different from the pageserver's point # of view, but the same as far as the safekeeper/WAL is concerned. To work around that, # we will explicitly create the tenant in the same generation that it was previously # attached in. - attach_status = env.env.attachment_service.inspect(tenant_shard_id=env.tenant) + attach_status = env.env.storage_controller.inspect(tenant_shard_id=env.tenant) assert attach_status is not None (attach_gen, _) = attach_status diff --git a/test_runner/performance/test_gc_feedback.py b/test_runner/performance/test_gc_feedback.py index cf9e4808fc..48dd84fb06 100644 --- a/test_runner/performance/test_gc_feedback.py +++ b/test_runner/performance/test_gc_feedback.py @@ -13,6 +13,11 @@ def test_gc_feedback(neon_env_builder: NeonEnvBuilder, zenbenchmark: NeonBenchma Information about image layers needed to collect old layers should be propagated by GC to compaction task which should take in in account when make a decision which new image layers needs to be created. + + NB: this test demonstrates the problem. 
The source tree contained the + `gc_feedback` mechanism for about 9 months, but, there were problems + with it and it wasn't enabled at runtime. + This PR removed the code: https://github.com/neondatabase/neon/pull/6863 """ env = neon_env_builder.init_start() client = env.pageserver.http_client() diff --git a/test_runner/performance/test_layer_map.py b/test_runner/performance/test_layer_map.py index 6bd0d85fa2..9b20954d45 100644 --- a/test_runner/performance/test_layer_map.py +++ b/test_runner/performance/test_layer_map.py @@ -17,10 +17,10 @@ def test_layer_map(neon_env_builder: NeonEnvBuilder, zenbenchmark): tenant, _ = env.neon_cli.create_tenant( conf={ "gc_period": "0s", - "checkpoint_distance": "8192", + "checkpoint_distance": "16384", "compaction_period": "1 s", "compaction_threshold": "1", - "compaction_target_size": "8192", + "compaction_target_size": "16384", } ) diff --git a/test_runner/performance/test_lazy_startup.py b/test_runner/performance/test_lazy_startup.py new file mode 100644 index 0000000000..e929bd4d05 --- /dev/null +++ b/test_runner/performance/test_lazy_startup.py @@ -0,0 +1,106 @@ +import pytest +import requests +from fixtures.benchmark_fixture import MetricReport, NeonBenchmarker +from fixtures.neon_fixtures import NeonEnvBuilder + + +# Start and measure duration with huge SLRU segments. +# This test is similar to test_startup_simple, but it creates a huge number of transactions +# and records containing these XIDs. Autovacuum is disabled for the table to prevent CLOG truncation. +# +# This test runs pretty quickly and can be informative when used in combination +# with emulated network delay. Some useful delay commands: +# +# 1. Add 2msec delay to all localhost traffic +# `sudo tc qdisc add dev lo root handle 1:0 netem delay 2msec` +# +# 2. Test that it works (you should see 4ms ping) +# `ping localhost` +# +# 3. 
Revert back to normal +# `sudo tc qdisc del dev lo root netem` +# +# NOTE this test might not represent the real startup time because the basebackup +# for a large database might be larger if there's a lot of transaction metadata, +# or safekeepers might need more syncing, or there might be more operations to +# apply during config step, like more users, databases, or extensions. By default +# we load extensions 'neon,pg_stat_statements,timescaledb,pg_cron', but in this +# test we only load neon. +@pytest.mark.timeout(1800) +@pytest.mark.parametrize("slru", ["lazy", "eager"]) +def test_lazy_startup(slru: str, neon_env_builder: NeonEnvBuilder, zenbenchmark: NeonBenchmarker): + neon_env_builder.num_safekeepers = 3 + env = neon_env_builder.init_start() + + lazy_slru_download = "true" if slru == "lazy" else "false" + tenant, _ = env.neon_cli.create_tenant( + conf={ + "lazy_slru_download": lazy_slru_download, + } + ) + + endpoint = env.endpoints.create_start("main", tenant_id=tenant) + with endpoint.cursor() as cur: + cur.execute("CREATE TABLE t (pk integer PRIMARY KEY, x integer)") + cur.execute("ALTER TABLE t SET (autovacuum_enabled = false)") + cur.execute("INSERT INTO t VALUES (1, 0)") + cur.execute( + """ + CREATE PROCEDURE updating() as + $$ + DECLARE + i integer; + BEGIN + FOR i IN 1..1000000 LOOP + UPDATE t SET x = x + 1 WHERE pk=1; + COMMIT; + END LOOP; + END + $$ LANGUAGE plpgsql + """ + ) + cur.execute("SET statement_timeout=0") + cur.execute("call updating()") + + endpoint.stop() + + # We do two iterations so we can see if the second startup is faster. It should + # be because the compute node should already be configured with roles, databases, + # extensions, etc from the first run. 
+ for i in range(2): + # Start + with zenbenchmark.record_duration(f"{slru}_{i}_start"): + endpoint.start() + + with zenbenchmark.record_duration(f"{slru}_{i}_select"): + sum = endpoint.safe_psql("select sum(x) from t")[0][0] + assert sum == 1000000 + + # Get metrics + metrics = requests.get(f"http://localhost:{endpoint.http_port}/metrics.json").json() + durations = { + "wait_for_spec_ms": f"{slru}_{i}_wait_for_spec", + "sync_safekeepers_ms": f"{slru}_{i}_sync_safekeepers", + "sync_sk_check_ms": f"{slru}_{i}_sync_sk_check", + "basebackup_ms": f"{slru}_{i}_basebackup", + "start_postgres_ms": f"{slru}_{i}_start_postgres", + "config_ms": f"{slru}_{i}_config", + "total_startup_ms": f"{slru}_{i}_total_startup", + } + for key, name in durations.items(): + value = metrics[key] + zenbenchmark.record(name, value, "ms", report=MetricReport.LOWER_IS_BETTER) + + basebackup_bytes = metrics["basebackup_bytes"] + zenbenchmark.record( + f"{slru}_{i}_basebackup_bytes", + basebackup_bytes, + "bytes", + report=MetricReport.LOWER_IS_BETTER, + ) + + # Stop so we can restart + endpoint.stop() + + # Imitate optimizations that console would do for the second start + endpoint.respec(skip_pg_catalog_updates=True) diff --git a/test_runner/pg_clients/csharp/npgsql/Dockerfile b/test_runner/pg_clients/csharp/npgsql/Dockerfile index b23eb2e5eb..71717a6006 100644 --- a/test_runner/pg_clients/csharp/npgsql/Dockerfile +++ b/test_runner/pg_clients/csharp/npgsql/Dockerfile @@ -1,4 +1,4 @@ -FROM mcr.microsoft.com/dotnet/sdk:7.0 AS build +FROM mcr.microsoft.com/dotnet/sdk:8.0 AS build WORKDIR /source COPY *.csproj . @@ -7,7 +7,7 @@ RUN dotnet restore COPY . . RUN dotnet publish -c release -o /app --no-restore -FROM mcr.microsoft.com/dotnet/runtime:7.0 +FROM mcr.microsoft.com/dotnet/runtime:8.0 WORKDIR /app COPY --from=build /app . 
diff --git a/test_runner/pg_clients/csharp/npgsql/csharp-npgsql.csproj b/test_runner/pg_clients/csharp/npgsql/csharp-npgsql.csproj index bb4427f2c4..50243e3ea7 100644 --- a/test_runner/pg_clients/csharp/npgsql/csharp-npgsql.csproj +++ b/test_runner/pg_clients/csharp/npgsql/csharp-npgsql.csproj @@ -2,13 +2,13 @@ Exe - net7.0 + net8.0 enable enable - + diff --git a/test_runner/pg_clients/java/jdbc/Dockerfile b/test_runner/pg_clients/java/jdbc/Dockerfile index 74eb9bdc32..7e074e07b8 100644 --- a/test_runner/pg_clients/java/jdbc/Dockerfile +++ b/test_runner/pg_clients/java/jdbc/Dockerfile @@ -1,10 +1,10 @@ -FROM openjdk:20 +FROM openjdk:21 WORKDIR /source COPY . . WORKDIR /app -RUN curl --output postgresql.jar https://jdbc.postgresql.org/download/postgresql-42.6.0.jar && \ +RUN curl --output postgresql.jar https://jdbc.postgresql.org/download/postgresql-42.7.2.jar && \ javac -d /app /source/Example.java CMD ["java", "-cp", "/app/postgresql.jar:.", "Example"] diff --git a/test_runner/pg_clients/python/asyncpg/Dockerfile b/test_runner/pg_clients/python/asyncpg/Dockerfile index 8b6d56b8fb..f2cc37a7bb 100644 --- a/test_runner/pg_clients/python/asyncpg/Dockerfile +++ b/test_runner/pg_clients/python/asyncpg/Dockerfile @@ -1,4 +1,4 @@ -FROM python:3.11 +FROM python:3.12 WORKDIR /source COPY . . diff --git a/test_runner/pg_clients/python/asyncpg/requirements.txt b/test_runner/pg_clients/python/asyncpg/requirements.txt index b33c21474c..61972959a9 100644 --- a/test_runner/pg_clients/python/asyncpg/requirements.txt +++ b/test_runner/pg_clients/python/asyncpg/requirements.txt @@ -1 +1 @@ -asyncpg==0.27.0 +asyncpg==0.29.0 diff --git a/test_runner/pg_clients/python/pg8000/Dockerfile b/test_runner/pg_clients/python/pg8000/Dockerfile index ebef1f9059..ee1de20da5 100644 --- a/test_runner/pg_clients/python/pg8000/Dockerfile +++ b/test_runner/pg_clients/python/pg8000/Dockerfile @@ -1,4 +1,4 @@ -FROM python:3.11 +FROM python:3.12 WORKDIR /source COPY . . 
diff --git a/test_runner/pg_clients/python/pg8000/requirements.txt b/test_runner/pg_clients/python/pg8000/requirements.txt index a8407c3cb0..e086a937e6 100644 --- a/test_runner/pg_clients/python/pg8000/requirements.txt +++ b/test_runner/pg_clients/python/pg8000/requirements.txt @@ -1,2 +1,2 @@ -pg8000==1.29.8 +pg8000==1.30.5 scramp>=1.4.3 diff --git a/test_runner/pg_clients/rust/tokio-postgres/Cargo.lock b/test_runner/pg_clients/rust/tokio-postgres/Cargo.lock index 3ac0f16e4b..a4a2426b97 100644 --- a/test_runner/pg_clients/rust/tokio-postgres/Cargo.lock +++ b/test_runner/pg_clients/rust/tokio-postgres/Cargo.lock @@ -19,9 +19,9 @@ checksum = "f26201604c87b1e01bd3d98f8d5d9a8fcbb815e8cedb41ffccbeb4bf593a35fe" [[package]] name = "async-trait" -version = "0.1.74" +version = "0.1.77" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "a66537f1bb974b254c98ed142ff995236e81b9d0fe4db0575f46612cb15eb0f9" +checksum = "c980ee35e870bd1a4d2c8294d4c04d0499e67bca1e4b5cefcc693c2fa00caea9" dependencies = [ "proc-macro2", "quote", @@ -51,9 +51,9 @@ dependencies = [ [[package]] name = "base64" -version = "0.21.4" +version = "0.21.7" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "9ba43ea6f343b788c8764558649e08df62f86c6ef251fdaeb1ffd010a9ae50a2" +checksum = "9d297deb1925b89f2ccc13d7635fa0714f12c87adce1c75356b39ca9b7178567" [[package]] name = "bitflags" @@ -63,9 +63,9 @@ checksum = "bef38d45163c2f1dde094a7dfd33ccf595c92905c8f8f4fdc18d06fb1037718a" [[package]] name = "bitflags" -version = "2.4.1" +version = "2.4.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "327762f6e5a765692301e5bb513e0d9fef63be86bbc14528052b1cd3e6f03e07" +checksum = "ed570934406eb16438a4e976b1b4500774099c13b8cb96eec99f620f05090ddf" [[package]] name = "block-buffer" @@ -78,9 +78,9 @@ dependencies = [ [[package]] name = "bumpalo" -version = "3.14.0" +version = "3.15.3" source = "registry+https://github.com/rust-lang/crates.io-index" 
-checksum = "7f30e7476521f6f8af1a1c4c0b8cc94f0bee37d91763d0ca2665f299b6cd8aec" +checksum = "8ea184aa71bb362a1157c896979544cc23974e08fd265f29ea96b59f0b4a555b" [[package]] name = "byteorder" @@ -96,12 +96,9 @@ checksum = "a2bd12c1caf447e69cd4528f47f94d203fd2582878ecb9e9465484c4148a8223" [[package]] name = "cc" -version = "1.0.83" +version = "1.0.89" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "f1174fb0b6ec23863f8b971027804a42614e347eafb0a95bf0b12cdae21fc4d0" -dependencies = [ - "libc", -] +checksum = "a0ba8f7aaa012f30d5b2861462f6708eccd49c3c39863fe083a308035f63d723" [[package]] name = "cfg-if" @@ -111,9 +108,9 @@ checksum = "baf1de4339761588bc0619e3cbc0120ee582ebb74b53b4efbf79117bd2da40fd" [[package]] name = "core-foundation" -version = "0.9.3" +version = "0.9.4" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "194a7a9e6de53fa55116934067c844d9d749312f75c6f6d0980e8c252f8c2146" +checksum = "91e195e091a93c46f7102ec7818a2aa394e1e1771c3ab4825963fa03e45afb8f" dependencies = [ "core-foundation-sys", "libc", @@ -121,15 +118,15 @@ dependencies = [ [[package]] name = "core-foundation-sys" -version = "0.8.4" +version = "0.8.6" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e496a50fda8aacccc86d7529e2c1e0892dbd0f898a6b5645b5561b89c3210efa" +checksum = "06ea2b9bc92be3c2baa9334a323ebca2d6f074ff852cd1d7b11064035cd3868f" [[package]] name = "cpufeatures" -version = "0.2.9" +version = "0.2.12" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "a17b76ff3a4162b0b27f354a0c87015ddad39d35f9c0c36607a3bdd175dde1f1" +checksum = "53fe5e26ff1b7aef8bca9c6080520cfb8d9333c7568e1829cef191a9723e5504" dependencies = [ "libc", ] @@ -157,12 +154,12 @@ dependencies = [ [[package]] name = "errno" -version = "0.3.5" +version = "0.3.8" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "ac3e13f66a2f95e32a39eaa81f6b95d42878ca0e1db0c7543723dfe12557e860" +checksum = 
"a258e46cdc063eb8519c00b9fc845fc47bcfca4130e2f08e88665ceda8474245" dependencies = [ "libc", - "windows-sys", + "windows-sys 0.52.0", ] [[package]] @@ -200,9 +197,9 @@ checksum = "00b0228411908ca8685dba7fc2cdd70ec9990a6e753e89b6ac91a84c40fbaf4b" [[package]] name = "futures" -version = "0.3.28" +version = "0.3.30" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "23342abe12aba583913b2e62f22225ff9c950774065e4bfb61a19cd9770fec40" +checksum = "645c6916888f6cb6350d2550b80fb63e734897a8498abe35cfb732b6487804b0" dependencies = [ "futures-channel", "futures-core", @@ -215,9 +212,9 @@ dependencies = [ [[package]] name = "futures-channel" -version = "0.3.28" +version = "0.3.30" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "955518d47e09b25bbebc7a18df10b81f0c766eaf4c4f1cccef2fca5f2a4fb5f2" +checksum = "eac8f7d7865dcb88bd4373ab671c8cf4508703796caa2b1985a9ca867b3fcb78" dependencies = [ "futures-core", "futures-sink", @@ -225,15 +222,15 @@ dependencies = [ [[package]] name = "futures-core" -version = "0.3.28" +version = "0.3.30" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "4bca583b7e26f571124fe5b7561d49cb2868d79116cfa0eefce955557c6fee8c" +checksum = "dfc6580bb841c5a68e9ef15c77ccc837b40a7504914d52e47b8b0e9bbda25a1d" [[package]] name = "futures-executor" -version = "0.3.28" +version = "0.3.30" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "ccecee823288125bd88b4d7f565c9e58e41858e47ab72e8ea2d64e93624386e0" +checksum = "a576fc72ae164fca6b9db127eaa9a9dda0d61316034f33a0a0d4eda41f02b01d" dependencies = [ "futures-core", "futures-task", @@ -242,15 +239,15 @@ dependencies = [ [[package]] name = "futures-io" -version = "0.3.28" +version = "0.3.30" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "4fff74096e71ed47f8e023204cfd0aa1289cd54ae5430a9523be060cdb849964" +checksum = "a44623e20b9681a318efdd71c299b6b222ed6f231972bfe2f224ebad6311f0c1" 
[[package]] name = "futures-macro" -version = "0.3.28" +version = "0.3.30" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "89ca545a94061b6365f2c7355b4b32bd20df3ff95f02da9329b34ccc3bd6ee72" +checksum = "87750cf4b7a4c0625b1529e4c543c2182106e4dedc60a2a6455e00d212c489ac" dependencies = [ "proc-macro2", "quote", @@ -259,21 +256,21 @@ dependencies = [ [[package]] name = "futures-sink" -version = "0.3.28" +version = "0.3.30" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "f43be4fe21a13b9781a69afa4985b0f6ee0e1afab2c6f454a8cf30e2b2237b6e" +checksum = "9fb8e00e87438d937621c1c6269e53f536c14d3fbd6a042bb24879e57d474fb5" [[package]] name = "futures-task" -version = "0.3.28" +version = "0.3.30" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "76d3d132be6c0e6aa1534069c705a74a5997a356c0dc2f86a47765e5617c5b65" +checksum = "38d84fa142264698cdce1a9f9172cf383a0c82de1bddcf3092901442c4097004" [[package]] name = "futures-util" -version = "0.3.28" +version = "0.3.30" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "26b01e40b772d54cf6c6d721c1d1abd0647a0106a12ecaa1c186273392a69533" +checksum = "3d6401deb83407ab3da39eba7e33987a73c3df0c82b4bb5813ee871c19c41d48" dependencies = [ "futures-channel", "futures-core", @@ -299,9 +296,9 @@ dependencies = [ [[package]] name = "getrandom" -version = "0.2.10" +version = "0.2.12" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "be4136b2a15dd319360be1c07d9933517ccf0be8f16bf62a3bee4f0d618df427" +checksum = "190092ea657667030ac6a35e305e62fc4dd69fd98ac98631e5d3a2b1575a12b5" dependencies = [ "cfg-if", "libc", @@ -310,9 +307,9 @@ dependencies = [ [[package]] name = "gimli" -version = "0.28.0" +version = "0.28.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "6fb8d784f27acf97159b40fc4db5ecd8aa23b9ad5ef69cdd136d3bc80665f0c0" +checksum = 
"4271d37baee1b8c7e4b708028c57d816cf9d2434acb33a549475f78c181f6253" [[package]] name = "hmac" @@ -325,9 +322,9 @@ dependencies = [ [[package]] name = "js-sys" -version = "0.3.64" +version = "0.3.69" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "c5f195fe497f702db0f318b07fdd68edb16955aed830df8363d837542f8f935a" +checksum = "29c15563dc2726973df627357ce0c9ddddbea194836909d655df6a75d2cf296d" dependencies = [ "wasm-bindgen", ] @@ -340,15 +337,15 @@ checksum = "e2abad23fbc42b3700f2f279844dc832adb2b2eb069b2df918f455c4e18cc646" [[package]] name = "libc" -version = "0.2.149" +version = "0.2.153" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "a08173bc88b7955d1b3145aa561539096c421ac8debde8cbc3612ec635fee29b" +checksum = "9c198f91728a82281a64e1f4f9eeb25d82cb32a5de251c6bd1b5154d63a8e7bd" [[package]] name = "linux-raw-sys" -version = "0.4.10" +version = "0.4.13" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "da2479e8c062e40bf0066ffa0bc823de0a9368974af99c9f6df941d2c231e03f" +checksum = "01cda141df6706de531b6c46c3a33ecca755538219bd484262fa09410c13539c" [[package]] name = "lock_api" @@ -362,9 +359,9 @@ dependencies = [ [[package]] name = "log" -version = "0.4.20" +version = "0.4.21" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b5e6163cb8c49088c2c36f57875e58ccd8c87c7427f7fbd50ea6710b2f3f2e8f" +checksum = "90ed8c1e510134f979dbc4f070f87d4313098b704861a105fe34231c70a3901c" [[package]] name = "md-5" @@ -378,28 +375,28 @@ dependencies = [ [[package]] name = "memchr" -version = "2.6.4" +version = "2.7.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "f665ee40bc4a3c5590afb1e9677db74a508659dfd71e126420da8274909a0167" +checksum = "523dc4f511e55ab87b694dc30d0f820d60906ef06413f93d4d7a1385599cc149" [[package]] name = "miniz_oxide" -version = "0.7.1" +version = "0.7.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum 
= "e7810e0be55b428ada41041c41f32c9f1a42817901b4ccf45fa3d4b6561e74c7" +checksum = "9d811f3e15f28568be3407c8e7fdb6514c1cda3cb30683f15b6a1a1dc4ea14a7" dependencies = [ "adler", ] [[package]] name = "mio" -version = "0.8.8" +version = "0.8.11" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "927a765cd3fc26206e66b296465fa9d3e5ab003e651c1b3c060e7956d96b19d2" +checksum = "a4a650543ca06a924e8b371db273b2756685faae30f8487da1b56505a8f78b0c" dependencies = [ "libc", "wasi", - "windows-sys", + "windows-sys 0.48.0", ] [[package]] @@ -422,26 +419,26 @@ dependencies = [ [[package]] name = "object" -version = "0.32.1" +version = "0.32.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "9cf5f9dd3933bd50a9e1f149ec995f39ae2c496d31fd772c1fd45ebc27e902b0" +checksum = "a6a622008b6e321afc04970976f62ee297fdbaa6f95318ca343e3eebb9648441" dependencies = [ "memchr", ] [[package]] name = "once_cell" -version = "1.18.0" +version = "1.19.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "dd8b5dd2ae5ed71462c540258bedcb51965123ad7e7ccf4b9a8cafaa4a63576d" +checksum = "3fdb12b2476b595f9358c5161aa467c2438859caa136dec86c26fdd2efe17b92" [[package]] name = "openssl" -version = "0.10.60" +version = "0.10.64" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "79a4c6c3a2b158f7f8f2a2fc5a969fa3a068df6fc9dbb4a43845436e3af7c800" +checksum = "95a0481286a310808298130d22dd1fef0fa571e05a8f44ec801801e84b216b1f" dependencies = [ - "bitflags 2.4.1", + "bitflags 2.4.2", "cfg-if", "foreign-types", "libc", @@ -469,9 +466,9 @@ checksum = "ff011a302c396a5197692431fc1948019154afc178baf7d8e37367442a4601cf" [[package]] name = "openssl-sys" -version = "0.9.96" +version = "0.9.101" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "3812c071ba60da8b5677cc12bcb1d42989a65553772897a7e0355545a819838f" +checksum = "dda2b0f344e78efc2facf7d195d098df0dd72151b26ab98da807afc26c198dff" dependencies = 
[ "cc", "libc", @@ -497,16 +494,16 @@ checksum = "4c42a9226546d68acdd9c0a280d17ce19bfe27a46bf68784e4066115788d008e" dependencies = [ "cfg-if", "libc", - "redox_syscall 0.4.1", + "redox_syscall", "smallvec", - "windows-targets", + "windows-targets 0.48.5", ] [[package]] name = "percent-encoding" -version = "2.3.0" +version = "2.3.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "9b2a4787296e9989611394c33f193f676704af1686e70b8f8033ab5ba9a35a94" +checksum = "e3148f5046208a5d56bcfc03053e3ca6334e51da8dfb19b6cdc8b306fae3283e" [[package]] name = "phf" @@ -540,9 +537,9 @@ checksum = "8b870d8c151b6f2fb93e84a13146138f05d02ed11c7e7c54f8826aaaf7c9f184" [[package]] name = "pkg-config" -version = "0.3.27" +version = "0.3.30" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "26072860ba924cbfa98ea39c8c19b4dd6a4a25423dbdf219c1eca91aa0cf6964" +checksum = "d231b230927b5e4ad203db57bbcbee2802f6bce620b1e4a9024a07d94e2907ec" [[package]] name = "postgres-native-tls" @@ -594,18 +591,18 @@ checksum = "5b40af805b3121feab8a3c29f04d8ad262fa8e0561883e7653e024ae4479e6de" [[package]] name = "proc-macro2" -version = "1.0.69" +version = "1.0.78" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "134c189feb4956b20f6f547d2cf727d4c0fe06722b20a0eec87ed445a97f92da" +checksum = "e2422ad645d89c99f8f3e6b88a9fdeca7fabeac836b1002371c4367c8f984aae" dependencies = [ "unicode-ident", ] [[package]] name = "quote" -version = "1.0.33" +version = "1.0.35" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "5267fca4496028628a95160fc423a33e8b2e6af8a5302579e322e4b520293cae" +checksum = "291ec9ab5efd934aaf503a6466c5d5251535d108ee747472c3977cc5acc868ef" dependencies = [ "proc-macro2", ] @@ -640,15 +637,6 @@ dependencies = [ "getrandom", ] -[[package]] -name = "redox_syscall" -version = "0.3.5" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = 
"567664f262709473930a4bf9e51bf2ebf3348f2e748ccc50dea20646858f8f29" -dependencies = [ - "bitflags 1.3.2", -] - [[package]] name = "redox_syscall" version = "0.4.1" @@ -676,24 +664,24 @@ checksum = "d626bb9dae77e28219937af045c257c28bfd3f69333c512553507f5f9798cb76" [[package]] name = "rustix" -version = "0.38.19" +version = "0.38.31" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "745ecfa778e66b2b63c88a61cb36e0eea109e803b0b86bf9879fbc77c70e86ed" +checksum = "6ea3e1a662af26cd7a3ba09c0297a31af215563ecf42817c98df621387f4e949" dependencies = [ - "bitflags 2.4.1", + "bitflags 2.4.2", "errno", "libc", "linux-raw-sys", - "windows-sys", + "windows-sys 0.52.0", ] [[package]] name = "schannel" -version = "0.1.22" +version = "0.1.23" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "0c3733bf4cf7ea0880754e19cb5a462007c4a8c1914bff372ccc95b464f1df88" +checksum = "fbc91545643bcf3a0bbb6569265615222618bdf33ce4ffbbd13c4bbd4c093534" dependencies = [ - "windows-sys", + "windows-sys 0.52.0", ] [[package]] @@ -753,18 +741,18 @@ dependencies = [ [[package]] name = "smallvec" -version = "1.11.1" +version = "1.13.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "942b4a808e05215192e39f4ab80813e599068285906cc91aa64f923db842bd5a" +checksum = "e6ecd384b10a64542d77071bd64bd7b231f4ed5940fba55e98c3de13824cf3d7" [[package]] name = "socket2" -version = "0.5.4" +version = "0.5.6" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "4031e820eb552adee9295814c0ced9e5cf38ddf1e8b7d566d6de8e2538ea989e" +checksum = "05ffd9c0a93b7543e062e759284fcf5f5e3b098501104bfbdde4d404db792871" dependencies = [ "libc", - "windows-sys", + "windows-sys 0.52.0", ] [[package]] @@ -786,9 +774,9 @@ checksum = "81cdd64d312baedb58e21336b31bc043b77e01cc99033ce76ef539f78e965ebc" [[package]] name = "syn" -version = "2.0.38" +version = "2.0.52" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = 
"e96b79aaa137db8f61e26363a0c9b47d8b4ec75da28b7d1d614c2303e232408b" +checksum = "b699d15b36d1f02c3e7c69f8ffef53de37aefae075d8488d4ba1a7788d574a07" dependencies = [ "proc-macro2", "quote", @@ -797,15 +785,14 @@ dependencies = [ [[package]] name = "tempfile" -version = "3.8.0" +version = "3.10.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "cb94d2f3cc536af71caac6b6fcebf65860b347e7ce0cc9ebe8f70d3e521054ef" +checksum = "85b77fafb263dd9d05cbeac119526425676db3784113aa9295c88498cbf8bff1" dependencies = [ "cfg-if", "fastrand", - "redox_syscall 0.3.5", "rustix", - "windows-sys", + "windows-sys 0.52.0", ] [[package]] @@ -825,9 +812,9 @@ checksum = "1f3ccbac311fea05f86f61904b462b55fb3df8837a366dfc601a0161d0532f20" [[package]] name = "tokio" -version = "1.33.0" +version = "1.36.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "4f38200e3ef7995e5ef13baec2f432a6da0aa9ac495b2c0e8f3b7eec2c92d653" +checksum = "61285f6515fa018fb2d1e46eb21223fff441ee8db5d0f1435e8ab4f5cdb80931" dependencies = [ "backtrace", "bytes", @@ -836,14 +823,14 @@ dependencies = [ "pin-project-lite", "socket2", "tokio-macros", - "windows-sys", + "windows-sys 0.48.0", ] [[package]] name = "tokio-macros" -version = "2.1.0" +version = "2.2.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "630bdcf245f78637c13ec01ffae6187cca34625e8c63150d424b59e55af2675e" +checksum = "5b8a1e28f2deaa14e508979454cb3a223b10b938b45af148bc0986de36f1923b" dependencies = [ "proc-macro2", "quote", @@ -888,9 +875,9 @@ dependencies = [ [[package]] name = "tokio-util" -version = "0.7.9" +version = "0.7.10" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "1d68074620f57a0b21594d9735eb2e98ab38b17f80d3fcb189fca266771ca60d" +checksum = "5419f34732d9eb6ee4c3578b7989078579b7f039cbbb9ca2c4da015749371e15" dependencies = [ "bytes", "futures-core", @@ -927,9 +914,9 @@ checksum = 
"42ff0bf0c66b8238c6f3b578df37d0b7848e55df8577b3f74f92a69acceeb825" [[package]] name = "unicode-bidi" -version = "0.3.13" +version = "0.3.15" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "92888ba5573ff080736b3648696b70cafad7d250551175acbaa4e0385b3e1460" +checksum = "08f95100a766bf4f8f28f90d77e0a5461bbdb219042e7679bebe79004fed8d75" [[package]] name = "unicode-ident" @@ -939,9 +926,9 @@ checksum = "3354b9ac3fae1ff6755cb6db53683adb661634f67557942dea4facebec0fee4b" [[package]] name = "unicode-normalization" -version = "0.1.22" +version = "0.1.23" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "5c5713f0fc4b5db668a2ac63cdb7bb4469d8c9fed047b1d0292cc7b0ce2ba921" +checksum = "a56d1686db2308d901306f92a263857ef59ea39678a5458e7cb17f01415101f5" dependencies = [ "tinyvec", ] @@ -965,10 +952,16 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "9c8d87e72b64a3b4db28d11ce29237c246188f4f51057d65a7eab63b7987e423" [[package]] -name = "wasm-bindgen" -version = "0.2.87" +name = "wasite" +version = "0.1.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "7706a72ab36d8cb1f80ffbf0e071533974a60d0a308d01a5d0375bf60499a342" +checksum = "b8dad83b4f25e74f184f64c43b150b91efe7647395b42289f38e50566d82855b" + +[[package]] +name = "wasm-bindgen" +version = "0.2.92" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "4be2531df63900aeb2bca0daaaddec08491ee64ceecbee5076636a3b026795a8" dependencies = [ "cfg-if", "wasm-bindgen-macro", @@ -976,9 +969,9 @@ dependencies = [ [[package]] name = "wasm-bindgen-backend" -version = "0.2.87" +version = "0.2.92" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "5ef2b6d3c510e9625e5fe6f509ab07d66a760f0885d858736483c32ed7809abd" +checksum = "614d787b966d3989fa7bb98a654e369c762374fd3213d212cfc0251257e747da" dependencies = [ "bumpalo", "log", @@ -991,9 +984,9 @@ dependencies = [ [[package]] name = 
"wasm-bindgen-macro" -version = "0.2.87" +version = "0.2.92" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "dee495e55982a3bd48105a7b947fd2a9b4a8ae3010041b9e0faab3f9cd028f1d" +checksum = "a1f8823de937b71b9460c0c34e25f3da88250760bec0ebac694b49997550d726" dependencies = [ "quote", "wasm-bindgen-macro-support", @@ -1001,9 +994,9 @@ dependencies = [ [[package]] name = "wasm-bindgen-macro-support" -version = "0.2.87" +version = "0.2.92" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "54681b18a46765f095758388f2d0cf16eb8d4169b639ab575a8f5693af210c7b" +checksum = "e94f17b526d0a461a191c78ea52bbce64071ed5c04c9ffe424dcb38f74171bb7" dependencies = [ "proc-macro2", "quote", @@ -1014,15 +1007,15 @@ dependencies = [ [[package]] name = "wasm-bindgen-shared" -version = "0.2.87" +version = "0.2.92" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "ca6ad05a4870b2bf5fe995117d3728437bd27d7cd5f06f13c17443ef369775a1" +checksum = "af190c94f2773fdb3729c55b007a722abb5384da03bc0986df4c289bf5567e96" [[package]] name = "web-sys" -version = "0.3.64" +version = "0.3.69" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "9b85cbef8c220a6abc02aefd892dfc0fc23afb1c6a426316ec33253a3877249b" +checksum = "77afa9a11836342370f4817622a2f0f418b134426d91a82dfb48f532d2ec13ef" dependencies = [ "js-sys", "wasm-bindgen", @@ -1030,11 +1023,12 @@ dependencies = [ [[package]] name = "whoami" -version = "1.4.1" +version = "1.5.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "22fc3756b8a9133049b26c7f61ab35416c130e8c09b660f5b3958b446f52cc50" +checksum = "0fec781d48b41f8163426ed18e8fc2864c12937df9ce54c88ede7bd47270893e" dependencies = [ - "wasm-bindgen", + "redox_syscall", + "wasite", "web-sys", ] @@ -1044,7 +1038,16 @@ version = "0.48.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = 
"677d2418bec65e3338edb076e806bc1ec15693c5d0104683f2efe857f61056a9" dependencies = [ - "windows-targets", + "windows-targets 0.48.5", +] + +[[package]] +name = "windows-sys" +version = "0.52.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "282be5f36a8ce781fad8c8ae18fa3f9beff57ec1b52cb3de0789201425d9a33d" +dependencies = [ + "windows-targets 0.52.4", ] [[package]] @@ -1053,13 +1056,28 @@ version = "0.48.5" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "9a2fa6e2155d7247be68c096456083145c183cbbbc2764150dda45a87197940c" dependencies = [ - "windows_aarch64_gnullvm", - "windows_aarch64_msvc", - "windows_i686_gnu", - "windows_i686_msvc", - "windows_x86_64_gnu", - "windows_x86_64_gnullvm", - "windows_x86_64_msvc", + "windows_aarch64_gnullvm 0.48.5", + "windows_aarch64_msvc 0.48.5", + "windows_i686_gnu 0.48.5", + "windows_i686_msvc 0.48.5", + "windows_x86_64_gnu 0.48.5", + "windows_x86_64_gnullvm 0.48.5", + "windows_x86_64_msvc 0.48.5", +] + +[[package]] +name = "windows-targets" +version = "0.52.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7dd37b7e5ab9018759f893a1952c9420d060016fc19a472b4bb20d1bdd694d1b" +dependencies = [ + "windows_aarch64_gnullvm 0.52.4", + "windows_aarch64_msvc 0.52.4", + "windows_i686_gnu 0.52.4", + "windows_i686_msvc 0.52.4", + "windows_x86_64_gnu 0.52.4", + "windows_x86_64_gnullvm 0.52.4", + "windows_x86_64_msvc 0.52.4", ] [[package]] @@ -1068,38 +1086,80 @@ version = "0.48.5" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "2b38e32f0abccf9987a4e3079dfb67dcd799fb61361e53e2882c3cbaf0d905d8" +[[package]] +name = "windows_aarch64_gnullvm" +version = "0.52.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "bcf46cf4c365c6f2d1cc93ce535f2c8b244591df96ceee75d8e83deb70a9cac9" + [[package]] name = "windows_aarch64_msvc" version = "0.48.5" source = "registry+https://github.com/rust-lang/crates.io-index" 
checksum = "dc35310971f3b2dbbf3f0690a219f40e2d9afcf64f9ab7cc1be722937c26b4bc" +[[package]] +name = "windows_aarch64_msvc" +version = "0.52.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "da9f259dd3bcf6990b55bffd094c4f7235817ba4ceebde8e6d11cd0c5633b675" + [[package]] name = "windows_i686_gnu" version = "0.48.5" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "a75915e7def60c94dcef72200b9a8e58e5091744960da64ec734a6c6e9b3743e" +[[package]] +name = "windows_i686_gnu" +version = "0.52.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b474d8268f99e0995f25b9f095bc7434632601028cf86590aea5c8a5cb7801d3" + [[package]] name = "windows_i686_msvc" version = "0.48.5" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "8f55c233f70c4b27f66c523580f78f1004e8b5a8b659e05a4eb49d4166cca406" +[[package]] +name = "windows_i686_msvc" +version = "0.52.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1515e9a29e5bed743cb4415a9ecf5dfca648ce85ee42e15873c3cd8610ff8e02" + [[package]] name = "windows_x86_64_gnu" version = "0.48.5" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "53d40abd2583d23e4718fddf1ebec84dbff8381c07cae67ff7768bbf19c6718e" +[[package]] +name = "windows_x86_64_gnu" +version = "0.52.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5eee091590e89cc02ad514ffe3ead9eb6b660aedca2183455434b93546371a03" + [[package]] name = "windows_x86_64_gnullvm" version = "0.48.5" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "0b7b52767868a23d5bab768e390dc5f5c55825b6d30b86c844ff2dc7414044cc" +[[package]] +name = "windows_x86_64_gnullvm" +version = "0.52.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "77ca79f2451b49fa9e2af39f0747fe999fcda4f5e241b2898624dca97a1f2177" + [[package]] name = "windows_x86_64_msvc" version = "0.48.5" 
source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "ed94fce61571a4006852b7389a063ab983c02eb1bb37b47f8272ce92d06d9538" + +[[package]] +name = "windows_x86_64_msvc" +version = "0.52.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "32b752e52a2da0ddfbdbcc6fceadfeede4c939ed16d13e648833a61dfb611ed8" diff --git a/test_runner/pg_clients/rust/tokio-postgres/Cargo.toml b/test_runner/pg_clients/rust/tokio-postgres/Cargo.toml index 6f100aafd5..0f420e5b06 100644 --- a/test_runner/pg_clients/rust/tokio-postgres/Cargo.toml +++ b/test_runner/pg_clients/rust/tokio-postgres/Cargo.toml @@ -9,7 +9,7 @@ publish = false [dependencies] native-tls = "0.2.11" postgres-native-tls = "0.5.0" -tokio = { version = "1.33", features=["rt", "macros"] } +tokio = { version = "1.36", features=["rt", "macros"] } tokio-postgres = "0.7.10" diff --git a/test_runner/pg_clients/rust/tokio-postgres/Dockerfile b/test_runner/pg_clients/rust/tokio-postgres/Dockerfile index 1d3709803e..8611e66cbb 100644 --- a/test_runner/pg_clients/rust/tokio-postgres/Dockerfile +++ b/test_runner/pg_clients/rust/tokio-postgres/Dockerfile @@ -1,4 +1,4 @@ -FROM rust:1.73 +FROM rust:1.76 WORKDIR /source COPY . . diff --git a/test_runner/pg_clients/swift/PostgresClientKitExample/Dockerfile b/test_runner/pg_clients/swift/PostgresClientKitExample/Dockerfile index 9538cf4ed4..0402838820 100644 --- a/test_runner/pg_clients/swift/PostgresClientKitExample/Dockerfile +++ b/test_runner/pg_clients/swift/PostgresClientKitExample/Dockerfile @@ -1,11 +1,11 @@ -FROM swift:5.8 AS build +FROM swift:5.9 AS build RUN apt-get -q update && apt-get -q install -y libssl-dev WORKDIR /source COPY . . RUN swift build --configuration release -FROM swift:5.8 +FROM swift:5.9 WORKDIR /app COPY --from=build /source/.build/release . 
CMD ["/app/PostgresClientKitExample"] diff --git a/test_runner/pg_clients/swift/PostgresNIOExample/Dockerfile b/test_runner/pg_clients/swift/PostgresNIOExample/Dockerfile index 61e1d1bba6..9130e0973f 100644 --- a/test_runner/pg_clients/swift/PostgresNIOExample/Dockerfile +++ b/test_runner/pg_clients/swift/PostgresNIOExample/Dockerfile @@ -1,10 +1,10 @@ -FROM swift:5.8 AS build +FROM swift:5.9 AS build WORKDIR /source COPY . . RUN swift build --configuration release -FROM swift:5.8 +FROM swift:5.9 WORKDIR /app COPY --from=build /source/.build/release . CMD ["/app/PostgresNIOExample"] diff --git a/test_runner/pg_clients/swift/PostgresNIOExample/Package.resolved b/test_runner/pg_clients/swift/PostgresNIOExample/Package.resolved index 9f13106011..023e03a7b1 100644 --- a/test_runner/pg_clients/swift/PostgresNIOExample/Package.resolved +++ b/test_runner/pg_clients/swift/PostgresNIOExample/Package.resolved @@ -5,8 +5,8 @@ "kind" : "remoteSourceControl", "location" : "https://github.com/vapor/postgres-nio.git", "state" : { - "revision" : "061a0836d7c1887e04a975d1d2eaa2ef5fd7dfab", - "version" : "1.16.0" + "revision" : "69ccfdf4c80144d845e3b439961b7ec6cd7ae33f", + "version" : "1.20.2" } }, { @@ -14,8 +14,8 @@ "kind" : "remoteSourceControl", "location" : "https://github.com/apple/swift-atomics.git", "state" : { - "revision" : "6c89474e62719ddcc1e9614989fff2f68208fe10", - "version" : "1.1.0" + "revision" : "cd142fd2f64be2100422d658e7411e39489da985", + "version" : "1.2.0" } }, { @@ -41,8 +41,8 @@ "kind" : "remoteSourceControl", "location" : "https://github.com/apple/swift-log.git", "state" : { - "revision" : "32e8d724467f8fe623624570367e3d50c5638e46", - "version" : "1.5.2" + "revision" : "e97a6fcb1ab07462881ac165fdbb37f067e205d5", + "version" : "1.5.4" } }, { @@ -50,8 +50,8 @@ "kind" : "remoteSourceControl", "location" : "https://github.com/apple/swift-metrics.git", "state" : { - "revision" : "9b39d811a83cf18b79d7d5513b06f8b290198b10", - "version" : "2.3.3" + "revision" : 
"971ba26378ab69c43737ee7ba967a896cb74c0d1", + "version" : "2.4.1" } }, { @@ -59,8 +59,8 @@ "kind" : "remoteSourceControl", "location" : "https://github.com/apple/swift-nio.git", "state" : { - "revision" : "6213ba7a06febe8fef60563a4a7d26a4085783cf", - "version" : "2.54.0" + "revision" : "635b2589494c97e48c62514bc8b37ced762e0a62", + "version" : "2.63.0" } }, { @@ -68,8 +68,8 @@ "kind" : "remoteSourceControl", "location" : "https://github.com/apple/swift-nio-ssl.git", "state" : { - "revision" : "e866a626e105042a6a72a870c88b4c531ba05f83", - "version" : "2.24.0" + "revision" : "7c381eb6083542b124a6c18fae742f55001dc2b5", + "version" : "2.26.0" } }, { @@ -77,8 +77,17 @@ "kind" : "remoteSourceControl", "location" : "https://github.com/apple/swift-nio-transport-services.git", "state" : { - "revision" : "41f4098903878418537020075a4d8a6e20a0b182", - "version" : "1.17.0" + "revision" : "6cbe0ed2b394f21ab0d46b9f0c50c6be964968ce", + "version" : "1.20.1" + } + }, + { + "identity" : "swift-system", + "kind" : "remoteSourceControl", + "location" : "https://github.com/apple/swift-system.git", + "state" : { + "revision" : "025bcb1165deab2e20d4eaba79967ce73013f496", + "version" : "1.2.1" } } ], diff --git a/test_runner/pg_clients/swift/PostgresNIOExample/Package.swift b/test_runner/pg_clients/swift/PostgresNIOExample/Package.swift index a80590daa2..637eb4bc9d 100644 --- a/test_runner/pg_clients/swift/PostgresNIOExample/Package.swift +++ b/test_runner/pg_clients/swift/PostgresNIOExample/Package.swift @@ -1,10 +1,10 @@ -// swift-tools-version:5.8 +// swift-tools-version:5.9 import PackageDescription let package = Package( name: "PostgresNIOExample", dependencies: [ - .package(url: "https://github.com/vapor/postgres-nio.git", from: "1.16.0") + .package(url: "https://github.com/vapor/postgres-nio.git", from: "1.20.2") ], targets: [ .executableTarget( diff --git a/test_runner/pg_clients/typescript/postgresql-client/Dockerfile b/test_runner/pg_clients/typescript/postgresql-client/Dockerfile 
index 07e98c586b..004b383749 100644 --- a/test_runner/pg_clients/typescript/postgresql-client/Dockerfile +++ b/test_runner/pg_clients/typescript/postgresql-client/Dockerfile @@ -1,4 +1,4 @@ -FROM node:20 +FROM node:21 WORKDIR /source COPY . . diff --git a/test_runner/pg_clients/typescript/postgresql-client/package-lock.json b/test_runner/pg_clients/typescript/postgresql-client/package-lock.json index 4cedf56acd..b4f8587eac 100644 --- a/test_runner/pg_clients/typescript/postgresql-client/package-lock.json +++ b/test_runner/pg_clients/typescript/postgresql-client/package-lock.json @@ -5,24 +5,24 @@ "packages": { "": { "dependencies": { - "postgresql-client": "2.5.9" + "postgresql-client": "2.10.5" } }, "node_modules/doublylinked": { - "version": "2.5.2", - "resolved": "https://registry.npmjs.org/doublylinked/-/doublylinked-2.5.2.tgz", - "integrity": "sha512-TDh0XfQWWDrfvGdAN0hLNIdkTXlw04nVCO5B/37ie4dV0yw1iT9ZrZ6tD+q/0SwXxeI/u6TF9Mxgd7s5/XYV6A==", + "version": "2.5.4", + "resolved": "https://registry.npmjs.org/doublylinked/-/doublylinked-2.5.4.tgz", + "integrity": "sha512-jBCKDnFkEHJRjQvYEl5N9VngRV8ypHgw6a52OK4VN57eV2r2rYvgOx9uABdY78INNoW7S6auULp+KBVm/jfYqw==", "engines": { "node": ">= 10.0" } }, "node_modules/lightning-pool": { - "version": "4.2.1", - "resolved": "https://registry.npmjs.org/lightning-pool/-/lightning-pool-4.2.1.tgz", - "integrity": "sha512-/pUIoGD3nzTH/wI4TYiJM3cLPeUOzGMTfFeBRuxaOAnwL0LZfwvqn5YFqsfyF98M0C3UXxWgfTz+Lu6okkno+g==", + "version": "4.2.2", + "resolved": "https://registry.npmjs.org/lightning-pool/-/lightning-pool-4.2.2.tgz", + "integrity": "sha512-KW0Df0IbjNLxy5wAsdErTKYtHGwefLRQseHNksEctyaL7gtRwJT0nqLa2uiRdNYDwKSnZtqOjSjUNtfxmfH1qw==", "dependencies": { - "doublylinked": "^2.5.2", - "putil-promisify": "^1.8.6" + "doublylinked": "^2.5.3", + "putil-promisify": "^1.10.1" } }, "node_modules/obuf": { @@ -42,16 +42,16 @@ } }, "node_modules/postgresql-client": { - "version": "2.5.9", - "resolved": 
"https://registry.npmjs.org/postgresql-client/-/postgresql-client-2.5.9.tgz", - "integrity": "sha512-s+kgTN6TfWLzehEyxw4Im4odnxVRCbZ0DEJzWS6SLowPAmB2m1/DOiOvZC0+ZVoi5AfbGE6SBqFxKguSyVAXZg==", + "version": "2.10.5", + "resolved": "https://registry.npmjs.org/postgresql-client/-/postgresql-client-2.10.5.tgz", + "integrity": "sha512-R3EC16pUdbgrzk1J2MQLj7jY2TepWurJHoK90nOeLZj1XTpL/+wL1VCneTmclRVKDuKVjFHr+FASV47KrLpAbw==", "dependencies": { - "doublylinked": "^2.5.2", - "lightning-pool": "^4.2.1", + "doublylinked": "^2.5.4", + "lightning-pool": "^4.2.2", "postgres-bytea": "^3.0.0", - "power-tasks": "^1.7.0", - "putil-merge": "^3.10.3", - "putil-promisify": "^1.10.0", + "power-tasks": "^1.7.3", + "putil-merge": "^3.12.1", + "putil-promisify": "^1.10.1", "putil-varhelpers": "^1.6.5" }, "engines": { @@ -60,30 +60,29 @@ } }, "node_modules/power-tasks": { - "version": "1.7.0", - "resolved": "https://registry.npmjs.org/power-tasks/-/power-tasks-1.7.0.tgz", - "integrity": "sha512-rndZXCDxhuIDjPUJJvQwBDHaYagCkjvbPF/NA+omh/Ef4rAI9KtnvdA0k98dyiGpn1zXOpc6c2c0JWzg/xAhJg==", + "version": "1.7.3", + "resolved": "https://registry.npmjs.org/power-tasks/-/power-tasks-1.7.3.tgz", + "integrity": "sha512-EnkjLfaX4PxFYHbUWyWzlE4I8SgctaW9jx4qQXrVRoELlqBXrxIMtuhHzRwsHv2qs1tO7efOcZa6/wDCdCjRfA==", "dependencies": { - "doublylinked": "^2.5.2", - "strict-typed-events": "^2.3.1" + "doublylinked": "^2.5.4", + "strict-typed-events": "^2.3.3" }, "engines": { - "node": ">=14.0", - "npm": ">=7.0.0" + "node": ">=16.0" } }, "node_modules/putil-merge": { - "version": "3.10.3", - "resolved": "https://registry.npmjs.org/putil-merge/-/putil-merge-3.10.3.tgz", - "integrity": "sha512-B18CYi0/SmBYl9+fgowYWkgzJM/8XcLSeafHrFrGzwySQuOzLW0sOGx0CdFVp9zqaxgLctexUdGoSPpm6CPM6A==", + "version": "3.12.1", + "resolved": "https://registry.npmjs.org/putil-merge/-/putil-merge-3.12.1.tgz", + "integrity": "sha512-4clPyRkJPrd5zl98AP7I3JamyXbx0ixe2CnfvGwoTyWSr7Kslcv8weoKjfU4BMBifkWIRL54l4OrNe97pYcDwQ==", "engines": { "node": 
">= 10.0" } }, "node_modules/putil-promisify": { - "version": "1.10.0", - "resolved": "https://registry.npmjs.org/putil-promisify/-/putil-promisify-1.10.0.tgz", - "integrity": "sha512-zYPoAoMxmf8pC+I75kRkYkVMwU4ZbZl82aTGema175bmhQ06BEJuuOlzOy1buQK9G+hCyQ+BFpzMTKAJhD8rZw==", + "version": "1.10.1", + "resolved": "https://registry.npmjs.org/putil-promisify/-/putil-promisify-1.10.1.tgz", + "integrity": "sha512-1jm0egJNrj5eBDRj15Cg08RNHDV91OVEHeeYjAFRcs663PXxFokndxcJAGbaO6CSErCTp8eTgC8vuOF+fvXIAA==", "engines": { "node": ">= 14.0" } @@ -97,21 +96,21 @@ } }, "node_modules/strict-typed-events": { - "version": "2.3.1", - "resolved": "https://registry.npmjs.org/strict-typed-events/-/strict-typed-events-2.3.1.tgz", - "integrity": "sha512-Z1h8KpVbrVg34Vwy/VwTD/tS9tFebH2h1Kvw4xnPkKpkISMwUpnqwU44rMfkKMpXbFCybIgDt7ARoCGTzURZhQ==", + "version": "2.3.3", + "resolved": "https://registry.npmjs.org/strict-typed-events/-/strict-typed-events-2.3.3.tgz", + "integrity": "sha512-Vc8/N5giCVpO2n5BCskqDD9ns7RkdEq0pFd4yQk1ROULusJDbjORNvbtyEPxxK7Xqn9/NdW8XHLxv/PvUTgFsA==", "dependencies": { - "putil-promisify": "^1.8.5", - "ts-gems": "^2.2.0" + "putil-promisify": "^1.10.1", + "ts-gems": "^3.1.0" }, "engines": { "node": ">=16.0" } }, "node_modules/ts-gems": { - "version": "2.4.0", - "resolved": "https://registry.npmjs.org/ts-gems/-/ts-gems-2.4.0.tgz", - "integrity": "sha512-SdugYAXoWvbqrxLodIObzxhEKacDxh5LfAJIiIkiH7q5thvuuCzdmkdTVQYf7uEDrEpPhfx4tokDMamdO3be9A==" + "version": "3.1.1", + "resolved": "https://registry.npmjs.org/ts-gems/-/ts-gems-3.1.1.tgz", + "integrity": "sha512-Li1Z44FnxN06c1lBwFepb932jPYT+4eOvOmoiC30lOTkvOJOERr9xZFg3UA9y19OYO9CrW3ZSqNL66DUSuwFTw==" } } } diff --git a/test_runner/pg_clients/typescript/postgresql-client/package.json b/test_runner/pg_clients/typescript/postgresql-client/package.json index 12703ce89f..07ec100d0d 100644 --- a/test_runner/pg_clients/typescript/postgresql-client/package.json +++ b/test_runner/pg_clients/typescript/postgresql-client/package.json @@ 
-1,6 +1,6 @@ { "type": "module", "dependencies": { - "postgresql-client": "2.5.9" + "postgresql-client": "2.10.5" } } diff --git a/test_runner/pg_clients/typescript/serverless-driver/Dockerfile b/test_runner/pg_clients/typescript/serverless-driver/Dockerfile index 07e98c586b..004b383749 100644 --- a/test_runner/pg_clients/typescript/serverless-driver/Dockerfile +++ b/test_runner/pg_clients/typescript/serverless-driver/Dockerfile @@ -1,4 +1,4 @@ -FROM node:20 +FROM node:21 WORKDIR /source COPY . . diff --git a/test_runner/pg_clients/typescript/serverless-driver/package-lock.json b/test_runner/pg_clients/typescript/serverless-driver/package-lock.json index 72cc452817..5a3ad3c238 100644 --- a/test_runner/pg_clients/typescript/serverless-driver/package-lock.json +++ b/test_runner/pg_clients/typescript/serverless-driver/package-lock.json @@ -5,14 +5,14 @@ "packages": { "": { "dependencies": { - "@neondatabase/serverless": "0.4.18", - "ws": "8.13.0" + "@neondatabase/serverless": "0.9.0", + "ws": "8.16.0" } }, "node_modules/@neondatabase/serverless": { - "version": "0.4.18", - "resolved": "https://registry.npmjs.org/@neondatabase/serverless/-/serverless-0.4.18.tgz", - "integrity": "sha512-2TZnIyRGC/+0fjZ8TKCzaSTPUD94PM7NBGuantGZbUrbWyqBwGnUoRtdZAQ95qBKVHqORLVfymlv2NE+HQMFeA==", + "version": "0.9.0", + "resolved": "https://registry.npmjs.org/@neondatabase/serverless/-/serverless-0.9.0.tgz", + "integrity": "sha512-mmJnUAzlzvxNSZuuhI6kgJjH+JgFdBMYUWxihtq/nj0Tjt+Y5UU3W+SvRFoucnd5NObYkuLYQzk+zV5DGFKGJg==", "dependencies": { "@types/pg": "8.6.6" } @@ -96,9 +96,9 @@ } }, "node_modules/ws": { - "version": "8.13.0", - "resolved": "https://registry.npmjs.org/ws/-/ws-8.13.0.tgz", - "integrity": "sha512-x9vcZYTrFPC7aSIbj7sRCYo7L/Xb8Iy+pW0ng0wt2vCJv7M9HOMy0UoN3rr+IFC7hb7vXoqS+P9ktyLLLhO+LA==", + "version": "8.16.0", + "resolved": "https://registry.npmjs.org/ws/-/ws-8.16.0.tgz", + "integrity": 
"sha512-HS0c//TP7Ina87TfiPUz1rQzMhHrl/SG2guqRcTOIUYD2q8uhUdNHZYJUaQ8aTGPzCh+c6oawMKW35nFl1dxyQ==", "engines": { "node": ">=10.0.0" }, diff --git a/test_runner/pg_clients/typescript/serverless-driver/package.json b/test_runner/pg_clients/typescript/serverless-driver/package.json index 840c7a5c4c..9d9da0f42c 100644 --- a/test_runner/pg_clients/typescript/serverless-driver/package.json +++ b/test_runner/pg_clients/typescript/serverless-driver/package.json @@ -1,7 +1,7 @@ { "type": "module", "dependencies": { - "@neondatabase/serverless": "0.4.18", - "ws": "8.13.0" + "@neondatabase/serverless": "0.9.0", + "ws": "8.16.0" } } diff --git a/test_runner/regress/test_ancestor_branch.py b/test_runner/regress/test_ancestor_branch.py index 0e390ba9e5..d16d2d6a24 100644 --- a/test_runner/regress/test_ancestor_branch.py +++ b/test_runner/regress/test_ancestor_branch.py @@ -45,7 +45,6 @@ def test_ancestor_branch(neon_env_builder: NeonEnvBuilder): # Create branch1. env.neon_cli.create_branch("branch1", "main", tenant_id=tenant, ancestor_start_lsn=lsn_100) endpoint_branch1 = env.endpoints.create_start("branch1", tenant_id=tenant) - log.info("postgres is running on 'branch1' branch") branch1_cur = endpoint_branch1.connect().cursor() branch1_timeline = TimelineId(query_scalar(branch1_cur, "SHOW neon.timeline_id")) @@ -68,7 +67,6 @@ def test_ancestor_branch(neon_env_builder: NeonEnvBuilder): # Create branch2. 
env.neon_cli.create_branch("branch2", "branch1", tenant_id=tenant, ancestor_start_lsn=lsn_200) endpoint_branch2 = env.endpoints.create_start("branch2", tenant_id=tenant) - log.info("postgres is running on 'branch2' branch") branch2_cur = endpoint_branch2.connect().cursor() branch2_timeline = TimelineId(query_scalar(branch2_cur, "SHOW neon.timeline_id")) diff --git a/test_runner/regress/test_attach_tenant_config.py b/test_runner/regress/test_attach_tenant_config.py index ed389b1aa2..3058926b25 100644 --- a/test_runner/regress/test_attach_tenant_config.py +++ b/test_runner/regress/test_attach_tenant_config.py @@ -63,10 +63,11 @@ def negative_env(neon_env_builder: NeonEnvBuilder) -> Generator[NegativeTests, N ] ) - def log_contains_bad_request(): - env.pageserver.log_contains(".*Error processing HTTP request: Bad request") - - wait_until(50, 0.1, log_contains_bad_request) + wait_until( + 50, + 0.1, + lambda: env.pageserver.assert_log_contains(".*Error processing HTTP request: Bad request"), + ) def test_null_body(negative_env: NegativeTests): @@ -136,7 +137,7 @@ def test_no_config(positive_env: NeonEnv, content_type: Optional[str]): ps_http.tenant_detach(tenant_id) assert tenant_id not in [TenantId(t["id"]) for t in ps_http.tenant_list()] - body = {"generation": env.attachment_service.attach_hook_issue(tenant_id, env.pageserver.id)} + body = {"generation": env.storage_controller.attach_hook_issue(tenant_id, env.pageserver.id)} ps_http.post( f"{ps_http.base_url}/v1/tenant/{tenant_id}/attach", @@ -160,21 +161,32 @@ def test_fully_custom_config(positive_env: NeonEnv): "compaction_target_size": 1048576, "checkpoint_distance": 10000, "checkpoint_timeout": "13m", + "compaction_algorithm": { + "kind": "Tiered", + }, "eviction_policy": { "kind": "LayerAccessThreshold", "period": "20s", "threshold": "23h", }, "evictions_low_residence_duration_metric_threshold": "2days", - "gc_feedback": True, "gc_horizon": 23 * (1024 * 1024), "gc_period": "2h 13m", "heatmap_period": "10m", 
"image_creation_threshold": 7, "pitr_interval": "1m", "lagging_wal_timeout": "23m", + "lazy_slru_download": True, "max_lsn_wal_lag": 230000, "min_resident_size_override": 23, + "timeline_get_throttle": { + "task_kinds": ["PageRequestHandler"], + "fair": True, + "initial": 0, + "refill_interval": "1s", + "refill_amount": 1000, + "max": 1000, + }, "trace_read_requests": True, "walreceiver_connect_timeout": "13m", } diff --git a/test_runner/regress/test_auth.py b/test_runner/regress/test_auth.py index bd87ff3efd..ea88b5d8e9 100644 --- a/test_runner/regress/test_auth.py +++ b/test_runner/regress/test_auth.py @@ -225,9 +225,7 @@ def test_auth_failures(neon_env_builder: NeonEnvBuilder, auth_enabled: bool): check_pageserver(True, password=pageserver_token) - env.pageserver.allowed_errors.append( - ".*SafekeeperData scope makes no sense for Pageserver.*" - ) + env.pageserver.allowed_errors.append(".*JWT scope '.+' is ineligible for Pageserver auth.*") check_pageserver(False, password=safekeeper_token) def check_safekeeper(expect_success: bool, **conn_kwargs): diff --git a/test_runner/regress/test_backpressure.py b/test_runner/regress/test_backpressure.py index bc3faf9271..819912dd05 100644 --- a/test_runner/regress/test_backpressure.py +++ b/test_runner/regress/test_backpressure.py @@ -107,7 +107,6 @@ def test_backpressure_received_lsn_lag(neon_env_builder: NeonEnvBuilder): # which is needed for backpressure_lsns() to work endpoint.respec(skip_pg_catalog_updates=False) endpoint.start() - log.info("postgres is running on 'test_backpressure' branch") # setup check thread check_stop_event = threading.Event() diff --git a/test_runner/regress/test_bad_connection.py b/test_runner/regress/test_bad_connection.py index ba0624c730..c808fa0f54 100644 --- a/test_runner/regress/test_bad_connection.py +++ b/test_runner/regress/test_bad_connection.py @@ -9,14 +9,14 @@ def test_compute_pageserver_connection_stress(neon_env_builder: NeonEnvBuilder): env = neon_env_builder.init_start() 
env.pageserver.allowed_errors.append(".*simulated connection error.*") + # Enable failpoint before starting everything else up so that we exercise the retry + # on fetching basebackup pageserver_http = env.pageserver.http_client() + pageserver_http.configure_failpoints(("simulated-bad-compute-connection", "50%return(15)")) + env.neon_cli.create_branch("test_compute_pageserver_connection_stress") endpoint = env.endpoints.create_start("test_compute_pageserver_connection_stress") - # Enable failpoint after starting everything else up so that loading initial - # basebackup doesn't fail - pageserver_http.configure_failpoints(("simulated-bad-compute-connection", "50%return(15)")) - pg_conn = endpoint.connect() cur = pg_conn.cursor() diff --git a/test_runner/regress/test_branch_behind.py b/test_runner/regress/test_branch_behind.py index 9879254897..46c74a26b8 100644 --- a/test_runner/regress/test_branch_behind.py +++ b/test_runner/regress/test_branch_behind.py @@ -21,7 +21,6 @@ def test_branch_behind(neon_env_builder: NeonEnvBuilder): # Branch at the point where only 100 rows were inserted branch_behind_timeline_id = env.neon_cli.create_branch("test_branch_behind") endpoint_main = env.endpoints.create_start("test_branch_behind") - log.info("postgres is running on 'test_branch_behind' branch") main_cur = endpoint_main.connect().cursor() diff --git a/test_runner/regress/test_change_pageserver.py b/test_runner/regress/test_change_pageserver.py index adb67a579e..97ab69049d 100644 --- a/test_runner/regress/test_change_pageserver.py +++ b/test_runner/regress/test_change_pageserver.py @@ -85,9 +85,9 @@ def test_change_pageserver(neon_env_builder: NeonEnvBuilder): # the endpoint. Whereas the previous reconfiguration was like a healthy migration, this # is more like what happens in an unexpected pageserver failure. 
# - # Since we're dual-attached, need to tip-off attachment service to treat the one we're + # Since we're dual-attached, need to tip-off storage controller to treat the one we're # about to start as the attached pageserver - env.attachment_service.attach_hook_issue(env.initial_tenant, env.pageservers[0].id) + env.storage_controller.attach_hook_issue(env.initial_tenant, env.pageservers[0].id) env.pageservers[0].start() env.pageservers[1].stop() @@ -97,9 +97,9 @@ def test_change_pageserver(neon_env_builder: NeonEnvBuilder): assert fetchone() == (100000,) env.pageservers[0].stop() - # Since we're dual-attached, need to tip-off attachment service to treat the one we're + # Since we're dual-attached, need to tip-off storage controller to treat the one we're # about to start as the attached pageserver - env.attachment_service.attach_hook_issue(env.initial_tenant, env.pageservers[1].id) + env.storage_controller.attach_hook_issue(env.initial_tenant, env.pageservers[1].id) env.pageservers[1].start() # Test a (former) bug where a child process spins without updating its connection string diff --git a/test_runner/regress/test_clog_truncate.py b/test_runner/regress/test_clog_truncate.py index f22eca02cc..26e6e336b9 100644 --- a/test_runner/regress/test_clog_truncate.py +++ b/test_runner/regress/test_clog_truncate.py @@ -25,7 +25,6 @@ def test_clog_truncate(neon_simple_env: NeonEnv): ] endpoint = env.endpoints.create_start("test_clog_truncate", config_lines=config) - log.info("postgres is running on test_clog_truncate branch") # Install extension containing function needed for test endpoint.safe_psql("CREATE EXTENSION neon_test_utils") @@ -62,7 +61,6 @@ def test_clog_truncate(neon_simple_env: NeonEnv): "test_clog_truncate_new", "test_clog_truncate", ancestor_start_lsn=lsn_after_truncation ) endpoint2 = env.endpoints.create_start("test_clog_truncate_new") - log.info("postgres is running on test_clog_truncate_new branch") # check that new node doesn't contain truncated segment 
pg_xact_0000_path_new = os.path.join(endpoint2.pg_xact_dir_path(), "0000") diff --git a/test_runner/regress/test_compatibility.py b/test_runner/regress/test_compatibility.py index 1a1425f069..5f815d3e6c 100644 --- a/test_runner/regress/test_compatibility.py +++ b/test_runner/regress/test_compatibility.py @@ -112,11 +112,6 @@ def test_create_snapshot( env = neon_env_builder.init_start() endpoint = env.endpoints.create_start("main") - # FIXME: Is this expected? - env.pageserver.allowed_errors.append( - ".*init_tenant_mgr: marking .* as locally complete, while it doesnt exist in remote index.*" - ) - pg_bin.run_capture(["pgbench", "--initialize", "--scale=10", endpoint.connstr()]) pg_bin.run_capture(["pgbench", "--time=60", "--progress=2", endpoint.connstr()]) pg_bin.run_capture( @@ -138,6 +133,7 @@ def test_create_snapshot( for sk in env.safekeepers: sk.stop() env.pageserver.stop() + env.storage_controller.stop() # Directory `compatibility_snapshot_dir` is uploaded to S3 in a workflow, keep the name in sync with it compatibility_snapshot_dir = ( @@ -145,7 +141,12 @@ def test_create_snapshot( ) if compatibility_snapshot_dir.exists(): shutil.rmtree(compatibility_snapshot_dir) - shutil.copytree(test_output_dir, compatibility_snapshot_dir) + + shutil.copytree( + test_output_dir, + compatibility_snapshot_dir, + ignore=shutil.ignore_patterns("pg_dynshmem"), + ) @check_ondisk_data_compatibility_if_enabled @@ -225,13 +226,23 @@ def test_forward_compatibility( ) try: + # TODO: remove this once the previous pageserrver version understands + # the 'get_vectored_impl' config + neon_env_builder.pageserver_get_vectored_impl = None + neon_env_builder.num_safekeepers = 3 + neon_local_binpath = neon_env_builder.neon_binpath env = neon_env_builder.from_repo_dir( compatibility_snapshot_dir / "repo", neon_binpath=compatibility_neon_bin, pg_distrib_dir=compatibility_postgres_distrib_dir, ) - neon_env_builder.start() + + # Use current neon_local even though we're using old binaries for + 
# everything else: our test code is written for latest CLI args. + env.neon_local_binpath = neon_local_binpath + + neon_env_builder.start(register_pageservers=True) check_neon_works( env, diff --git a/test_runner/regress/test_config.py b/test_runner/regress/test_config.py index 0ea5784b67..4bb7df1e6a 100644 --- a/test_runner/regress/test_config.py +++ b/test_runner/regress/test_config.py @@ -1,6 +1,5 @@ from contextlib import closing -from fixtures.log_helper import log from fixtures.neon_fixtures import NeonEnv @@ -13,7 +12,6 @@ def test_config(neon_simple_env: NeonEnv): # change config endpoint = env.endpoints.create_start("test_config", config_lines=["log_min_messages=debug1"]) - log.info("postgres is running on test_config branch") with closing(endpoint.connect()) as conn: with conn.cursor() as cur: diff --git a/test_runner/regress/test_createdropdb.py b/test_runner/regress/test_createdropdb.py index 500d19cf31..f741a9fc87 100644 --- a/test_runner/regress/test_createdropdb.py +++ b/test_runner/regress/test_createdropdb.py @@ -20,7 +20,6 @@ def test_createdb(neon_simple_env: NeonEnv, strategy: str): env.neon_cli.create_branch("test_createdb", "empty") endpoint = env.endpoints.create_start("test_createdb") - log.info("postgres is running on 'test_createdb' branch") with endpoint.cursor() as cur: # Cause a 'relmapper' change in the original branch @@ -65,7 +64,6 @@ def test_dropdb(neon_simple_env: NeonEnv, test_output_dir): env = neon_simple_env env.neon_cli.create_branch("test_dropdb", "empty") endpoint = env.endpoints.create_start("test_dropdb") - log.info("postgres is running on 'test_dropdb' branch") with endpoint.cursor() as cur: cur.execute("CREATE DATABASE foodb") diff --git a/test_runner/regress/test_createuser.py b/test_runner/regress/test_createuser.py index f1bc405287..17d9824f52 100644 --- a/test_runner/regress/test_createuser.py +++ b/test_runner/regress/test_createuser.py @@ -1,4 +1,3 @@ -from fixtures.log_helper import log from 
fixtures.neon_fixtures import NeonEnv from fixtures.utils import query_scalar @@ -10,7 +9,6 @@ def test_createuser(neon_simple_env: NeonEnv): env = neon_simple_env env.neon_cli.create_branch("test_createuser", "empty") endpoint = env.endpoints.create_start("test_createuser") - log.info("postgres is running on 'test_createuser' branch") with endpoint.cursor() as cur: # Cause a 'relmapper' change in the original branch diff --git a/test_runner/regress/test_ddl_forwarding.py b/test_runner/regress/test_ddl_forwarding.py index 7174487e68..50da673d87 100644 --- a/test_runner/regress/test_ddl_forwarding.py +++ b/test_runner/regress/test_ddl_forwarding.py @@ -296,7 +296,6 @@ def test_ddl_forwarding_invalid_db(neon_simple_env: NeonEnv): # Some non-existent url config_lines=["neon.console_url=http://localhost:9999/unknown/api/v0/roles_and_databases"], ) - log.info("postgres is running on 'test_ddl_forwarding_invalid_db' branch") with endpoint.cursor() as cur: cur.execute("SET neon.forward_ddl = false") diff --git a/test_runner/regress/test_disk_usage_eviction.py b/test_runner/regress/test_disk_usage_eviction.py index 6a4f0edbea..b83545216d 100644 --- a/test_runner/regress/test_disk_usage_eviction.py +++ b/test_runner/regress/test_disk_usage_eviction.py @@ -17,7 +17,7 @@ from fixtures.neon_fixtures import ( from fixtures.pageserver.http import PageserverHttpClient from fixtures.pageserver.utils import wait_for_upload_queue_empty from fixtures.remote_storage import RemoteStorageKind -from fixtures.types import Lsn, TenantId, TenantShardId, TimelineId +from fixtures.types import Lsn, TenantId, TimelineId from fixtures.utils import wait_until GLOBAL_LRU_LOG_LINE = "tenant_min_resident_size-respecting LRU would not relieve pressure, evicting more following global LRU policy" @@ -155,6 +155,15 @@ class EvictionEnv: mock_behavior, eviction_order: EvictionOrder, ): + """ + Starts pageserver up with mocked statvfs setup. 
The startup is + problematic because of dueling initial logical size calculations + requiring layers and disk usage based task evicting. + + Returns after initial logical sizes are complete, but the phase of disk + usage eviction task is unknown; it might need to run one more iteration + before assertions can be made. + """ disk_usage_config = { "period": period, "max_usage_pct": max_usage_pct, @@ -183,9 +192,17 @@ class EvictionEnv: ), ) - def statvfs_called(): - assert pageserver.log_contains(".*running mocked statvfs.*") + # we now do initial logical size calculation on startup, which on debug builds can fight with disk usage based eviction + for tenant_id, timeline_id in self.timelines: + tenant_ps = self.neon_env.get_tenant_pageserver(tenant_id) + # Pageserver may be none if we are currently not attached anywhere, e.g. during secondary eviction test + if tenant_ps is not None: + tenant_ps.http_client().timeline_wait_logical_size(tenant_id, timeline_id) + def statvfs_called(): + pageserver.assert_log_contains(".*running mocked statvfs.*") + + # we most likely have already completed multiple runs wait_until(10, 1, statvfs_called) @@ -516,7 +533,7 @@ def test_pageserver_falls_back_to_global_lru(eviction_env: EvictionEnv, order: E assert actual_change >= target, "eviction must always evict more than target" time.sleep(1) # give log time to flush - assert env.neon_env.pageserver.log_contains(GLOBAL_LRU_LOG_LINE) + env.neon_env.pageserver.assert_log_contains(GLOBAL_LRU_LOG_LINE) env.neon_env.pageserver.allowed_errors.append(".*" + GLOBAL_LRU_LOG_LINE) @@ -750,7 +767,7 @@ def test_statvfs_error_handling(eviction_env: EvictionEnv): eviction_order=EvictionOrder.ABSOLUTE_ORDER, ) - assert env.neon_env.pageserver.log_contains(".*statvfs failed.*EIO") + env.neon_env.pageserver.assert_log_contains(".*statvfs failed.*EIO") env.neon_env.pageserver.allowed_errors.append(".*statvfs failed.*EIO") @@ -784,14 +801,15 @@ def test_statvfs_pressure_usage(eviction_env: EvictionEnv): 
eviction_order=EvictionOrder.ABSOLUTE_ORDER, ) - def relieved_log_message(): - assert env.neon_env.pageserver.log_contains(".*disk usage pressure relieved") + wait_until( + 10, 1, lambda: env.neon_env.pageserver.assert_log_contains(".*disk usage pressure relieved") + ) - wait_until(10, 1, relieved_log_message) + def less_than_max_usage_pct(): + post_eviction_total_size, _, _ = env.timelines_du(env.pageserver) + assert post_eviction_total_size < 0.33 * total_size, "we requested max 33% usage" - post_eviction_total_size, _, _ = env.timelines_du(env.pageserver) - - assert post_eviction_total_size <= 0.33 * total_size, "we requested max 33% usage" + wait_until(2, 2, less_than_max_usage_pct) def test_statvfs_pressure_min_avail_bytes(eviction_env: EvictionEnv): @@ -826,16 +844,17 @@ def test_statvfs_pressure_min_avail_bytes(eviction_env: EvictionEnv): eviction_order=EvictionOrder.ABSOLUTE_ORDER, ) - def relieved_log_message(): - assert env.neon_env.pageserver.log_contains(".*disk usage pressure relieved") + wait_until( + 10, 1, lambda: env.neon_env.pageserver.assert_log_contains(".*disk usage pressure relieved") + ) - wait_until(10, 1, relieved_log_message) + def more_than_min_avail_bytes_freed(): + post_eviction_total_size, _, _ = env.timelines_du(env.pageserver) + assert ( + total_size - post_eviction_total_size >= min_avail_bytes + ), f"we requested at least {min_avail_bytes} worth of free space" - post_eviction_total_size, _, _ = env.timelines_du(env.pageserver) - - assert ( - total_size - post_eviction_total_size >= min_avail_bytes - ), "we requested at least min_avail_bytes worth of free space" + wait_until(2, 2, more_than_min_avail_bytes_freed) def test_secondary_mode_eviction(eviction_env_ha: EvictionEnv): @@ -845,18 +864,18 @@ def test_secondary_mode_eviction(eviction_env_ha: EvictionEnv): # Set up a situation where one pageserver _only_ has secondary locations on it, # so that when we release space we are sure it is via secondary locations. 
- - log.info("Setting up secondary location...") - ps_attached = env.neon_env.pageservers[0] + log.info("Setting up secondary locations...") ps_secondary = env.neon_env.pageservers[1] for tenant_id in tenant_ids: - # Migrate all attached tenants to the same pageserver, so that all the secondaries - # will run on the other pageserver. This is necessary because when we create tenants, - # they are spread over pageservers by default. - env.neon_env.attachment_service.tenant_shard_migrate( - TenantShardId(tenant_id, 0, 0), ps_attached.id - ) + # Find where it is attached + pageserver = env.neon_env.get_tenant_pageserver(tenant_id) + pageserver.http_client().tenant_heatmap_upload(tenant_id) + # Detach it + pageserver.tenant_detach(tenant_id) + + # Create a secondary mode location for the tenant, all tenants on one pageserver that will only + # contain secondary locations: this is the one where we will exercise disk usage eviction ps_secondary.tenant_location_configure( tenant_id, { @@ -868,41 +887,18 @@ def test_secondary_mode_eviction(eviction_env_ha: EvictionEnv): readback_conf = ps_secondary.read_tenant_location_conf(tenant_id) log.info(f"Read back conf: {readback_conf}") - # Request secondary location to download all layers that the attached location has - ps_attached.http_client().tenant_heatmap_upload(tenant_id) + # Request secondary location to download all layers that the attached location indicated + # in its heatmap ps_secondary.http_client().tenant_secondary_download(tenant_id) - # Configure the secondary pageserver to have a phony small disk size - ps_secondary.stop() total_size, _, _ = env.timelines_du(ps_secondary) - blocksize = 512 - total_blocks = (total_size + (blocksize - 1)) // blocksize + evict_bytes = total_size // 3 - min_avail_bytes = total_size // 3 - - env.pageserver_start_with_disk_usage_eviction( - ps_secondary, - period="1s", - max_usage_pct=100, - min_avail_bytes=min_avail_bytes, - mock_behavior={ - "type": "Success", - "blocksize": 
blocksize, - "total_blocks": total_blocks, - # Only count layer files towards used bytes in the mock_statvfs. - # This avoids accounting for metadata files & tenant conf in the tests. - "name_filter": ".*__.*", - }, - eviction_order=EvictionOrder.ABSOLUTE_ORDER, - ) - - def relieved_log_message(): - assert ps_secondary.log_contains(".*disk usage pressure relieved") - - wait_until(10, 1, relieved_log_message) + response = ps_secondary.http_client().disk_usage_eviction_run({"evict_bytes": evict_bytes}) + log.info(f"{response}") post_eviction_total_size, _, _ = env.timelines_du(ps_secondary) assert ( - total_size - post_eviction_total_size >= min_avail_bytes - ), "we requested at least min_avail_bytes worth of free space" + total_size - post_eviction_total_size >= evict_bytes + ), "we requested at least evict_bytes worth of free space" diff --git a/test_runner/regress/test_duplicate_layers.py b/test_runner/regress/test_duplicate_layers.py index 224e6f50c7..cb4fa43be7 100644 --- a/test_runner/regress/test_duplicate_layers.py +++ b/test_runner/regress/test_duplicate_layers.py @@ -36,7 +36,7 @@ def test_duplicate_layers(neon_env_builder: NeonEnvBuilder, pg_bin: PgBin): pg_bin.run_capture(["pgbench", "-i", "-s1", connstr]) time.sleep(10) # let compaction to be performed - assert env.pageserver.log_contains("compact-level0-phase1-return-same") + env.pageserver.assert_log_contains("compact-level0-phase1-return-same") def test_actually_duplicated_l1(neon_env_builder: NeonEnvBuilder, pg_bin: PgBin): diff --git a/test_runner/regress/test_explain_with_lfc_stats.py b/test_runner/regress/test_explain_with_lfc_stats.py new file mode 100644 index 0000000000..5231dedcda --- /dev/null +++ b/test_runner/regress/test_explain_with_lfc_stats.py @@ -0,0 +1,84 @@ +from pathlib import Path + +from fixtures.log_helper import log +from fixtures.neon_fixtures import NeonEnv + + +def test_explain_with_lfc_stats(neon_simple_env: NeonEnv): + env = neon_simple_env + + cache_dir = 
Path(env.repo_dir) / "file_cache" + cache_dir.mkdir(exist_ok=True) + + branchname = "test_explain_with_lfc_stats" + env.neon_cli.create_branch(branchname, "empty") + log.info(f"Creating endopint with 1MB shared_buffers and 64 MB LFC for branch {branchname}") + endpoint = env.endpoints.create_start( + branchname, + config_lines=[ + "shared_buffers='1MB'", + f"neon.file_cache_path='{cache_dir}/file.cache'", + "neon.max_file_cache_size='128MB'", + "neon.file_cache_size_limit='64MB'", + ], + ) + + cur = endpoint.connect().cursor() + + log.info(f"preparing some data in {endpoint.connstr()}") + + ddl = """ +CREATE TABLE pgbench_accounts ( + aid bigint NOT NULL, + bid integer, + abalance integer, + filler character(84), + -- more web-app like columns + text_column_plain TEXT DEFAULT repeat('NeonIsCool', 5), + jsonb_column_extended JSONB DEFAULT ('{ "tell everyone": [' || repeat('{"Neon": "IsCool"},',9) || ' {"Neon": "IsCool"}]}')::jsonb +) +WITH (fillfactor='100'); +""" + + cur.execute(ddl) + cur.execute( + "insert into pgbench_accounts(aid,bid,abalance,filler) select aid, (aid - 1) / 100000 + 1, 0, '' from generate_series(1, 100000) as aid;" + ) + + log.info(f"warming up caches with sequential scan in {endpoint.connstr()}") + cur.execute("SELECT * FROM pgbench_accounts WHERE abalance > 0") + + log.info("running explain analyze without LFC values to verify they do not show up in the plan") + cur.execute("EXPLAIN (ANALYZE, BUFFERS) SELECT * FROM pgbench_accounts WHERE abalance > 0") + rows = cur.fetchall() + plan = "\n".join(r[0] for r in rows) + log.debug(plan) + assert "Seq Scan on pgbench_accounts" in plan + assert "Buffers: shared hit" in plan + assert "File cache: hits=" not in plan + log.info("running explain analyze WITH LFC values to verify they do now show up") + cur.execute( + "EXPLAIN (ANALYZE, BUFFERS,FILECACHE) SELECT * FROM pgbench_accounts WHERE abalance > 0" + ) + rows = cur.fetchall() + plan = "\n".join(r[0] for r in rows) + log.debug(plan) + assert "Seq 
Scan on pgbench_accounts" in plan + assert "Buffers: shared hit" in plan + assert "File cache: hits=" in plan + log.info("running explain analyze WITH LFC values to verify json output") + cur.execute( + "EXPLAIN (ANALYZE, BUFFERS,FILECACHE, FORMAT JSON) SELECT * FROM pgbench_accounts WHERE abalance > 0" + ) + jsonplan = cur.fetchall()[0][0] + log.debug(jsonplan) + # Directly access the 'Plan' part of the first element of the JSON array + plan_details = jsonplan[0]["Plan"] + + # Extract "File Cache Hits" and "File Cache Misses" + file_cache_hits = plan_details.get("File Cache Hits") + file_cache_misses = plan_details.get("File Cache Misses") + + # Now you can assert the values + assert file_cache_hits >= 5000, f"Expected File Cache Hits to be > 5000, got {file_cache_hits}" + assert file_cache_misses == 0, f"Expected File Cache Misses to be 0, got {file_cache_misses}" diff --git a/test_runner/regress/test_fullbackup.py b/test_runner/regress/test_fullbackup.py index a456c06862..d5f898492b 100644 --- a/test_runner/regress/test_fullbackup.py +++ b/test_runner/regress/test_fullbackup.py @@ -26,7 +26,6 @@ def test_fullbackup( env.neon_cli.create_branch("test_fullbackup") endpoint_main = env.endpoints.create_start("test_fullbackup") - log.info("postgres is running on 'test_fullbackup' branch") with endpoint_main.cursor() as cur: timeline = TimelineId(query_scalar(cur, "SHOW neon.timeline_id")) @@ -67,12 +66,6 @@ def test_fullbackup( # Restore from the backup and find the data we inserted port = port_distributor.get_port() with VanillaPostgres(restored_dir_path, pg_bin, port, init=False) as vanilla_pg: - # TODO make port an optional argument - vanilla_pg.configure( - [ - f"port={port}", - ] - ) vanilla_pg.start() num_rows_found = vanilla_pg.safe_psql("select count(*) from tbl;", user="cloud_admin")[0][0] assert num_rows == num_rows_found diff --git a/test_runner/regress/test_gc_aggressive.py b/test_runner/regress/test_gc_aggressive.py index ef68049ee7..c5070ee815 100644 --- 
a/test_runner/regress/test_gc_aggressive.py +++ b/test_runner/regress/test_gc_aggressive.py @@ -71,7 +71,6 @@ def test_gc_aggressive(neon_env_builder: NeonEnvBuilder): env = neon_env_builder.init_start() timeline = env.neon_cli.create_branch("test_gc_aggressive", "main") endpoint = env.endpoints.create_start("test_gc_aggressive") - log.info("postgres is running on test_gc_aggressive branch") with endpoint.cursor() as cur: # Create table, and insert the first 100 rows diff --git a/test_runner/regress/test_gc_cutoff.py b/test_runner/regress/test_gc_cutoff.py deleted file mode 100644 index 284a8c3563..0000000000 --- a/test_runner/regress/test_gc_cutoff.py +++ /dev/null @@ -1,47 +0,0 @@ -import subprocess - -import pytest -from fixtures.neon_fixtures import NeonEnvBuilder, PgBin - - -# Test gc_cutoff -# -# This test sets fail point at the end of GC, and checks that pageserver -# normally restarts after it. Also, there should be GC ERRORs in the log, -# but the fixture checks the log for any unexpected ERRORs after every -# test anyway, so it doesn't need any special attention here. 
-@pytest.mark.timeout(600) -def test_gc_cutoff(neon_env_builder: NeonEnvBuilder, pg_bin: PgBin): - env = neon_env_builder.init_start( - initial_tenant_conf={ - "gc_period": "10 s", - "gc_horizon": f"{1024 ** 2}", - "checkpoint_distance": f"{1024 ** 2}", - "compaction_period": "5 s", - # set PITR interval to be small, so we can do GC - "pitr_interval": "1 s", - "compaction_threshold": "3", - "image_creation_threshold": "2", - } - ) - - pageserver_http = env.pageserver.http_client() - - # Use aggressive GC and checkpoint settings, so that we also exercise GC during the test - tenant_id = env.initial_tenant - endpoint = env.endpoints.create_start("main", tenant_id=tenant_id) - connstr = endpoint.connstr(options="-csynchronous_commit=off") - pg_bin.run_capture(["pgbench", "-i", "-s10", connstr]) - - pageserver_http.configure_failpoints(("after-timeline-gc-removed-layers", "exit")) - - # Because this test does a rapid series of restarts of the same node, it's possible that - # we are restarted again before we can clean up deletion lists form the previous generation, - # resulting in a subsequent startup logging a warning. 
- env.pageserver.allowed_errors.append(".*Dropping stale deletions for tenant.*") - - for _ in range(5): - with pytest.raises(subprocess.SubprocessError): - pg_bin.run_capture(["pgbench", "-P1", "-N", "-c5", "-T500", "-Mprepared", connstr]) - env.pageserver.stop() - env.pageserver.start(extra_env_vars={"FAILPOINTS": "after-timeline-gc-removed-layers=exit"}) diff --git a/test_runner/regress/test_hot_standby.py b/test_runner/regress/test_hot_standby.py index 7822e29ed9..0497e1965c 100644 --- a/test_runner/regress/test_hot_standby.py +++ b/test_runner/regress/test_hot_standby.py @@ -3,22 +3,7 @@ import re import time from fixtures.log_helper import log -from fixtures.neon_fixtures import Endpoint, NeonEnv - - -def wait_caughtup(primary: Endpoint, secondary: Endpoint): - primary_lsn = primary.safe_psql_scalar( - "SELECT pg_current_wal_insert_lsn()::text", log_query=False - ) - while True: - secondary_lsn = secondary.safe_psql_scalar( - "SELECT pg_last_wal_replay_lsn()", log_query=False - ) - caught_up = secondary_lsn >= primary_lsn - log.info(f"caughtup={caught_up}, primary_lsn={primary_lsn}, secondary_lsn={secondary_lsn}") - if caught_up: - return - time.sleep(1) +from fixtures.neon_fixtures import NeonEnv, wait_replica_caughtup # Check for corrupted WAL messages which might otherwise go unnoticed if @@ -79,7 +64,7 @@ def test_hot_standby(neon_simple_env: NeonEnv): primary.safe_psql("create table t(key int, value text)") primary.safe_psql("insert into t select generate_series(1, 100000), 'payload'") - wait_caughtup(primary, secondary) + wait_replica_caughtup(primary, secondary) with secondary.connect() as s_con: with s_con.cursor() as s_cur: diff --git a/test_runner/regress/test_import.py b/test_runner/regress/test_import.py index faedf5d944..ec57860033 100644 --- a/test_runner/regress/test_import.py +++ b/test_runner/regress/test_import.py @@ -95,16 +95,6 @@ def test_import_from_vanilla(test_output_dir, pg_bin, vanilla_pg, neon_env_build 
".*InternalServerError.*Tenant .* not found.*", ".*InternalServerError.*Timeline .* not found.*", ".*InternalServerError.*Cannot delete timeline which has child timelines.*", - ".*ignored .* unexpected bytes after the tar archive.*", - ] - ) - - env.pageserver.allowed_errors.extend( - [ - # FIXME: we should clean up pageserver to not print this - ".*exited with error: unexpected message type: CopyData.*", - # FIXME: Is this expected? - ".*init_tenant_mgr: marking .* as locally complete, while it doesnt exist in remote index.*", ] ) @@ -142,12 +132,9 @@ def test_import_from_vanilla(test_output_dir, pg_bin, vanilla_pg, neon_env_build with pytest.raises(RuntimeError): import_tar(corrupt_base_tar, wal_tar) - # A tar with trailing garbage is currently accepted. It prints a warnings - # to the pageserver log, however. Check that. - import_tar(base_plus_garbage_tar, wal_tar) - assert env.pageserver.log_contains( - ".*WARN.*ignored .* unexpected bytes after the tar archive.*" - ) + # Importing a tar with trailing garbage fails + with pytest.raises(RuntimeError): + import_tar(base_plus_garbage_tar, wal_tar) client = env.pageserver.http_client() timeline_delete_wait_completed(client, tenant, timeline) @@ -163,6 +150,8 @@ def test_import_from_vanilla(test_output_dir, pg_bin, vanilla_pg, neon_env_build endpoint = env.endpoints.create_start(endpoint_id, tenant_id=tenant) assert endpoint.safe_psql("select count(*) from t") == [(300000,)] + vanilla_pg.stop() + def test_import_from_pageserver_small( pg_bin: PgBin, neon_env_builder: NeonEnvBuilder, test_output_dir: Path @@ -170,11 +159,6 @@ def test_import_from_pageserver_small( neon_env_builder.enable_pageserver_remote_storage(RemoteStorageKind.LOCAL_FS) env = neon_env_builder.init_start() - # FIXME: Is this expected? 
- env.pageserver.allowed_errors.append( - ".*init_tenant_mgr: marking .* as locally complete, while it doesnt exist in remote index.*" - ) - timeline = env.neon_cli.create_branch("test_import_from_pageserver_small") endpoint = env.endpoints.create_start("test_import_from_pageserver_small") diff --git a/test_runner/regress/test_layer_bloating.py b/test_runner/regress/test_layer_bloating.py new file mode 100644 index 0000000000..2fdee89389 --- /dev/null +++ b/test_runner/regress/test_layer_bloating.py @@ -0,0 +1,67 @@ +import os +import time + +import pytest +from fixtures.log_helper import log +from fixtures.neon_fixtures import ( + NeonEnv, + logical_replication_sync, + wait_for_last_flush_lsn, +) +from fixtures.pg_version import PgVersion + + +def test_layer_bloating(neon_simple_env: NeonEnv, vanilla_pg): + env = neon_simple_env + + if env.pg_version != PgVersion.V16: + pytest.skip("pg_log_standby_snapshot() function is available only in PG16") + + timeline = env.neon_cli.create_branch("test_logical_replication", "empty") + endpoint = env.endpoints.create_start( + "test_logical_replication", config_lines=["log_statement=all"] + ) + + pg_conn = endpoint.connect() + cur = pg_conn.cursor() + + # create table... 
+ cur.execute("create table t(pk integer primary key)") + cur.execute("create publication pub1 for table t") + # Create slot to hold WAL + cur.execute("select pg_create_logical_replication_slot('my_slot', 'pgoutput')") + + # now start subscriber + vanilla_pg.start() + vanilla_pg.safe_psql("create table t(pk integer primary key)") + + connstr = endpoint.connstr().replace("'", "''") + log.info(f"ep connstr is {endpoint.connstr()}, subscriber connstr {vanilla_pg.connstr()}") + vanilla_pg.safe_psql(f"create subscription sub1 connection '{connstr}' publication pub1") + + cur.execute( + """create or replace function create_snapshots(n integer) returns void as $$ + declare + i integer; + begin + for i in 1..n loop + perform pg_log_standby_snapshot(); + end loop; + end; $$ language plpgsql""" + ) + cur.execute("set statement_timeout=0") + cur.execute("select create_snapshots(10000)") + # Wait logical replication to sync + logical_replication_sync(vanilla_pg, endpoint) + wait_for_last_flush_lsn(env, endpoint, env.initial_tenant, timeline) + time.sleep(10) + + # Check layer file sizes + timeline_path = "{}/tenants/{}/timelines/{}/".format( + env.pageserver.workdir, env.initial_tenant, timeline + ) + log.info(f"Check {timeline_path}") + for filename in os.listdir(timeline_path): + if filename.startswith("00000"): + log.info(f"layer {filename} size is {os.path.getsize(timeline_path + filename)}") + assert os.path.getsize(timeline_path + filename) < 512_000_000 diff --git a/test_runner/regress/test_layer_eviction.py b/test_runner/regress/test_layer_eviction.py index efba2033fb..7bbc0cc160 100644 --- a/test_runner/regress/test_layer_eviction.py +++ b/test_runner/regress/test_layer_eviction.py @@ -4,12 +4,11 @@ import pytest from fixtures.log_helper import log from fixtures.neon_fixtures import ( NeonEnvBuilder, + flush_ep_to_pageserver, wait_for_last_flush_lsn, ) -from fixtures.pageserver.utils import wait_for_last_record_lsn, wait_for_upload +from fixtures.pageserver.utils 
import wait_for_upload from fixtures.remote_storage import RemoteStorageKind -from fixtures.types import Lsn -from fixtures.utils import query_scalar # Crates a few layers, ensures that we can evict them (removing locally but keeping track of them anyway) @@ -46,14 +45,15 @@ def test_basic_eviction( FROM generate_series(1, 5000000) g """ ) - current_lsn = Lsn(query_scalar(cur, "SELECT pg_current_wal_flush_lsn()")) - wait_for_last_record_lsn(client, tenant_id, timeline_id, current_lsn) + # stops the endpoint + current_lsn = flush_ep_to_pageserver(env, endpoint, tenant_id, timeline_id) + client.timeline_checkpoint(tenant_id, timeline_id) wait_for_upload(client, tenant_id, timeline_id, current_lsn) - # disable compute & sks to avoid on-demand downloads by walreceiver / getpage - endpoint.stop() + # stop sks to avoid on-demand downloads by walreceiver / getpage; endpoint + # has already been stopped by flush_ep_to_pageserver for sk in env.safekeepers: sk.stop() diff --git a/test_runner/regress/test_layers_from_future.py b/test_runner/regress/test_layers_from_future.py index 999e077e45..ca4295c5cb 100644 --- a/test_runner/regress/test_layers_from_future.py +++ b/test_runner/regress/test_layers_from_future.py @@ -1,7 +1,7 @@ import time from fixtures.log_helper import log -from fixtures.neon_fixtures import NeonEnvBuilder +from fixtures.neon_fixtures import NeonEnvBuilder, flush_ep_to_pageserver from fixtures.pageserver.types import ( DeltaLayerFileName, ImageLayerFileName, @@ -115,8 +115,7 @@ def test_issue_5878(neon_env_builder: NeonEnvBuilder): ) == 0 ) - - endpoint.stop() + last_record_lsn = flush_ep_to_pageserver(env, endpoint, tenant_id, timeline_id) wait_for_upload_queue_empty(ps_http, tenant_id, timeline_id) @@ -160,7 +159,7 @@ def test_issue_5878(neon_env_builder: NeonEnvBuilder): time.sleep(1.1) # so that we can use change in pre_stat.st_mtime to detect overwrites def get_generation_number(): - attachment = env.attachment_service.inspect(tenant_id) + attachment 
= env.storage_controller.inspect(tenant_id) assert attachment is not None return attachment[0] @@ -184,10 +183,13 @@ def test_issue_5878(neon_env_builder: NeonEnvBuilder): # NB: the layer file is unlinked index part now, but, because we made the delete # operation stuck, the layer file itself is still in the remote_storage - def delete_at_pause_point(): - assert env.pageserver.log_contains(f".*{tenant_id}.*at failpoint.*{failpoint_name}") - - wait_until(10, 0.5, delete_at_pause_point) + wait_until( + 10, + 0.5, + lambda: env.pageserver.assert_log_contains( + f".*{tenant_id}.*at failpoint.*{failpoint_name}" + ), + ) future_layer_path = env.pageserver_remote_storage.remote_layer_path( tenant_id, timeline_id, future_layer.to_str(), generation=generation_before_detach ) diff --git a/test_runner/regress/test_lfc_resize.py b/test_runner/regress/test_lfc_resize.py index 5c68a63d06..2a3442448a 100644 --- a/test_runner/regress/test_lfc_resize.py +++ b/test_runner/regress/test_lfc_resize.py @@ -23,7 +23,6 @@ def test_lfc_resize(neon_simple_env: NeonEnv, pg_bin: PgBin): ) n_resize = 10 scale = 10 - log.info("postgres is running on 'test_lfc_resize' branch") def run_pgbench(connstr: str): log.info(f"Start a pgbench workload on pg {connstr}") diff --git a/test_runner/regress/test_lfc_working_set_approximation.py b/test_runner/regress/test_lfc_working_set_approximation.py new file mode 100644 index 0000000000..a6f05fe0f7 --- /dev/null +++ b/test_runner/regress/test_lfc_working_set_approximation.py @@ -0,0 +1,74 @@ +from pathlib import Path + +from fixtures.log_helper import log +from fixtures.neon_fixtures import NeonEnv +from fixtures.utils import query_scalar + + +def test_lfc_working_set_approximation(neon_simple_env: NeonEnv): + env = neon_simple_env + + cache_dir = Path(env.repo_dir) / "file_cache" + cache_dir.mkdir(exist_ok=True) + + branchname = "test_approximate_working_set_size" + env.neon_cli.create_branch(branchname, "empty") + log.info(f"Creating endopint with 1MB 
shared_buffers and 64 MB LFC for branch {branchname}") + endpoint = env.endpoints.create_start( + branchname, + config_lines=[ + "shared_buffers='1MB'", + f"neon.file_cache_path='{cache_dir}/file.cache'", + "neon.max_file_cache_size='128MB'", + "neon.file_cache_size_limit='64MB'", + ], + ) + + cur = endpoint.connect().cursor() + cur.execute("create extension neon") + + log.info(f"preparing some data in {endpoint.connstr()}") + + ddl = """ +CREATE TABLE pgbench_accounts ( + aid bigint NOT NULL, + bid integer, + abalance integer, + filler character(84), + -- more web-app like columns + text_column_plain TEXT DEFAULT repeat('NeonIsCool', 5), + jsonb_column_extended JSONB DEFAULT ('{ "tell everyone": [' || repeat('{"Neon": "IsCool"},',9) || ' {"Neon": "IsCool"}]}')::jsonb +) +WITH (fillfactor='100'); +""" + + cur.execute(ddl) + # prepare index access below + cur.execute( + "ALTER TABLE ONLY pgbench_accounts ADD CONSTRAINT pgbench_accounts_pkey PRIMARY KEY (aid)" + ) + cur.execute( + "insert into pgbench_accounts(aid,bid,abalance,filler) select aid, (aid - 1) / 100000 + 1, 0, '' from generate_series(1, 100000) as aid;" + ) + # ensure correct query plans and stats + cur.execute("vacuum ANALYZE pgbench_accounts") + # determine table size - working set should approximate table size after sequential scan + pages = query_scalar(cur, "SELECT relpages FROM pg_class WHERE relname = 'pgbench_accounts'") + log.info(f"pgbench_accounts has {pages} pages, resetting working set to zero") + cur.execute("select approximate_working_set_size(true)") + cur.execute( + 'SELECT count(*) FROM pgbench_accounts WHERE abalance > 0 or jsonb_column_extended @> \'{"tell everyone": [{"Neon": "IsCool"}]}\'::jsonb' + ) + # verify working set size after sequential scan matches table size and reset working set for next test + blocks = query_scalar(cur, "select approximate_working_set_size(true)") + log.info(f"working set size after sequential scan on pgbench_accounts {blocks}") + assert pages * 0.8 < 
blocks < pages * 1.2 + # run a few point queries with index lookup + cur.execute("SELECT abalance FROM pgbench_accounts WHERE aid = 4242") + cur.execute("SELECT abalance FROM pgbench_accounts WHERE aid = 54242") + cur.execute("SELECT abalance FROM pgbench_accounts WHERE aid = 104242") + cur.execute("SELECT abalance FROM pgbench_accounts WHERE aid = 204242") + # verify working set size after some index access of a few select pages only + blocks = query_scalar(cur, "select approximate_working_set_size(true)") + log.info(f"working set size after some index access of a few select pages only {blocks}") + assert blocks < 10 diff --git a/test_runner/regress/test_logging.py b/test_runner/regress/test_logging.py index d559be0a8f..bfffad7572 100644 --- a/test_runner/regress/test_logging.py +++ b/test_runner/regress/test_logging.py @@ -3,10 +3,12 @@ import uuid import pytest from fixtures.log_helper import log from fixtures.neon_fixtures import NeonEnvBuilder +from fixtures.pg_version import run_only_on_default_postgres from fixtures.utils import wait_until @pytest.mark.parametrize("level", ["trace", "debug", "info", "warn", "error"]) +@run_only_on_default_postgres("it does not use any postgres functionality") def test_logging_event_count(neon_env_builder: NeonEnvBuilder, level: str): # self-test: make sure the event is logged (i.e., our testing endpoint works) log_expected = { @@ -32,7 +34,7 @@ def test_logging_event_count(neon_env_builder: NeonEnvBuilder, level: str): def assert_logged(): if not log_expected: return - assert env.pageserver.log_contains(f".*{msg_id}.*") + env.pageserver.assert_log_contains(f".*{msg_id}.*") wait_until(10, 0.5, assert_logged) diff --git a/test_runner/regress/test_logical_replication.py b/test_runner/regress/test_logical_replication.py index 51e358e60d..3f4ca8070d 100644 --- a/test_runner/regress/test_logical_replication.py +++ b/test_runner/regress/test_logical_replication.py @@ -1,4 +1,7 @@ import time +from functools import partial +from 
random import choice +from string import ascii_lowercase import pytest from fixtures.log_helper import log @@ -8,7 +11,11 @@ from fixtures.neon_fixtures import ( wait_for_last_flush_lsn, ) from fixtures.types import Lsn -from fixtures.utils import query_scalar +from fixtures.utils import query_scalar, wait_until + + +def random_string(n: int): + return "".join([choice(ascii_lowercase) for _ in range(n)]) def test_logical_replication(neon_simple_env: NeonEnv, vanilla_pg): @@ -20,7 +27,6 @@ def test_logical_replication(neon_simple_env: NeonEnv, vanilla_pg): "test_logical_replication", config_lines=["log_statement=all"] ) - log.info("postgres is running on 'test_logical_replication' branch") pg_conn = endpoint.connect() cur = pg_conn.cursor() @@ -152,6 +158,51 @@ COMMIT; assert endpoint.safe_psql("select count(*) from pg_replication_slots")[0][0] == 1 +# Test that neon.logical_replication_max_snap_files works +def test_obsolete_slot_drop(neon_simple_env: NeonEnv, vanilla_pg): + def slot_removed(ep): + assert ( + endpoint.safe_psql( + "select count(*) from pg_replication_slots where slot_name = 'stale_slot'" + )[0][0] + == 0 + ) + + env = neon_simple_env + + env.neon_cli.create_branch("test_logical_replication", "empty") + # set low neon.logical_replication_max_snap_files + endpoint = env.endpoints.create_start( + "test_logical_replication", + config_lines=["log_statement=all", "neon.logical_replication_max_snap_files=1"], + ) + + pg_conn = endpoint.connect() + cur = pg_conn.cursor() + + # create obsolete slot + cur.execute("select pg_create_logical_replication_slot('stale_slot', 'pgoutput');") + assert ( + endpoint.safe_psql( + "select count(*) from pg_replication_slots where slot_name = 'stale_slot'" + )[0][0] + == 1 + ) + + # now insert some data and create and start live subscriber to create more .snap files + # (in most cases this is not needed as stale_slot snap will have higher LSN than restart_lsn anyway) + cur.execute("create table t(pk integer primary key, 
payload integer)") + cur.execute("create publication pub1 for table t") + + vanilla_pg.start() + vanilla_pg.safe_psql("create table t(pk integer primary key, payload integer)") + connstr = endpoint.connstr().replace("'", "''") + log.info(f"ep connstr is {endpoint.connstr()}, subscriber connstr {vanilla_pg.connstr()}") + vanilla_pg.safe_psql(f"create subscription sub1 connection '{connstr}' publication pub1") + + wait_until(number_of_iterations=10, interval=2, func=partial(slot_removed, endpoint)) + + # Test compute start at LSN page of which starts with contrecord # https://github.com/neondatabase/neon/issues/5749 def test_wal_page_boundary_start(neon_simple_env: NeonEnv, vanilla_pg): @@ -238,6 +289,57 @@ def test_wal_page_boundary_start(neon_simple_env: NeonEnv, vanilla_pg): ) == endpoint.safe_psql("select sum(somedata) from replication_example") +# Test that WAL redo works for fairly large records. +# +# See https://github.com/neondatabase/neon/pull/6534. That wasn't a +# logical replication bug as such, but without logical replication, +# records passed ot the WAL redo process are never large enough to hit +# the bug. +def test_large_records(neon_simple_env: NeonEnv, vanilla_pg): + env = neon_simple_env + + env.neon_cli.create_branch("init") + endpoint = env.endpoints.create_start("init") + + cur = endpoint.connect().cursor() + cur.execute("CREATE TABLE reptbl(id int, largeval text);") + cur.execute("alter table reptbl replica identity full") + cur.execute("create publication pub1 for table reptbl") + + # now start subscriber + vanilla_pg.start() + vanilla_pg.safe_psql("CREATE TABLE reptbl(id int, largeval text);") + + log.info(f"ep connstr is {endpoint.connstr()}, subscriber connstr {vanilla_pg.connstr()}") + connstr = endpoint.connstr().replace("'", "''") + vanilla_pg.safe_psql(f"create subscription sub1 connection '{connstr}' publication pub1") + + # Test simple insert, update, delete. 
But with very large values + value = random_string(10_000_000) + cur.execute(f"INSERT INTO reptbl VALUES (1, '{value}')") + logical_replication_sync(vanilla_pg, endpoint) + assert vanilla_pg.safe_psql("select id, largeval from reptbl") == [(1, value)] + + # Test delete, and reinsert another value + cur.execute("DELETE FROM reptbl WHERE id = 1") + cur.execute(f"INSERT INTO reptbl VALUES (2, '{value}')") + logical_replication_sync(vanilla_pg, endpoint) + assert vanilla_pg.safe_psql("select id, largeval from reptbl") == [(2, value)] + + value = random_string(10_000_000) + cur.execute(f"UPDATE reptbl SET largeval='{value}'") + logical_replication_sync(vanilla_pg, endpoint) + assert vanilla_pg.safe_psql("select id, largeval from reptbl") == [(2, value)] + + endpoint.stop() + endpoint.start() + cur = endpoint.connect().cursor() + value = random_string(10_000_000) + cur.execute(f"UPDATE reptbl SET largeval='{value}'") + logical_replication_sync(vanilla_pg, endpoint) + assert vanilla_pg.safe_psql("select id, largeval from reptbl") == [(2, value)] + + # # Check that slots are not inherited in brnach # @@ -258,7 +360,6 @@ def test_slots_and_branching(neon_simple_env: NeonEnv): # Create branch ws. 
env.neon_cli.create_branch("ws", "main", tenant_id=tenant) ws_branch = env.endpoints.create_start("ws", tenant_id=tenant) - log.info("postgres is running on 'ws' branch") # Check that we can create slot with the same name ws_cur = ws_branch.connect().cursor() diff --git a/test_runner/regress/test_lsn_mapping.py b/test_runner/regress/test_lsn_mapping.py index 65d6d7a9fd..5813231aab 100644 --- a/test_runner/regress/test_lsn_mapping.py +++ b/test_runner/regress/test_lsn_mapping.py @@ -28,7 +28,6 @@ def test_lsn_mapping(neon_env_builder: NeonEnvBuilder): timeline_id = env.neon_cli.create_branch("test_lsn_mapping", tenant_id=tenant_id) endpoint_main = env.endpoints.create_start("test_lsn_mapping", tenant_id=tenant_id) timeline_id = endpoint_main.safe_psql("show neon.timeline_id")[0][0] - log.info("postgres is running on 'main' branch") cur = endpoint_main.connect().cursor() @@ -64,18 +63,14 @@ def test_lsn_mapping(neon_env_builder: NeonEnvBuilder): # Check edge cases # Timestamp is in the future probe_timestamp = tbl[-1][1] + timedelta(hours=1) - result = client.timeline_get_lsn_by_timestamp( - tenant_id, timeline_id, f"{probe_timestamp.isoformat()}Z" - ) + result = client.timeline_get_lsn_by_timestamp(tenant_id, timeline_id, probe_timestamp) assert result["kind"] == "future" # make sure that we return a well advanced lsn here assert Lsn(result["lsn"]) > start_lsn # Timestamp is in the unreachable past probe_timestamp = tbl[0][1] - timedelta(hours=10) - result = client.timeline_get_lsn_by_timestamp( - tenant_id, timeline_id, f"{probe_timestamp.isoformat()}Z" - ) + result = client.timeline_get_lsn_by_timestamp(tenant_id, timeline_id, probe_timestamp) assert result["kind"] == "past" # make sure that we return the minimum lsn here at the start of the range assert Lsn(result["lsn"]) < start_lsn @@ -83,9 +78,7 @@ def test_lsn_mapping(neon_env_builder: NeonEnvBuilder): # Probe a bunch of timestamps in the valid range for i in range(1, len(tbl), 100): probe_timestamp = 
tbl[i][1] - result = client.timeline_get_lsn_by_timestamp( - tenant_id, timeline_id, f"{probe_timestamp.isoformat()}Z" - ) + result = client.timeline_get_lsn_by_timestamp(tenant_id, timeline_id, probe_timestamp) assert result["kind"] not in ["past", "nodata"] lsn = result["lsn"] # Call get_lsn_by_timestamp to get the LSN @@ -108,9 +101,7 @@ def test_lsn_mapping(neon_env_builder: NeonEnvBuilder): # Timestamp is in the unreachable past probe_timestamp = tbl[0][1] - timedelta(hours=10) - result = client.timeline_get_lsn_by_timestamp( - tenant_id, timeline_id_child, f"{probe_timestamp.isoformat()}Z", 2 - ) + result = client.timeline_get_lsn_by_timestamp(tenant_id, timeline_id_child, probe_timestamp) assert result["kind"] == "past" # make sure that we return the minimum lsn here at the start of the range assert Lsn(result["lsn"]) >= last_flush_lsn @@ -122,7 +113,6 @@ def test_ts_of_lsn_api(neon_env_builder: NeonEnvBuilder): new_timeline_id = env.neon_cli.create_branch("test_ts_of_lsn_api") endpoint_main = env.endpoints.create_start("test_ts_of_lsn_api") - log.info("postgres is running on 'test_ts_of_lsn_api' branch") cur = endpoint_main.connect().cursor() # Create table, and insert rows, each in a separate transaction diff --git a/test_runner/regress/test_migrations.py b/test_runner/regress/test_migrations.py index 121fa91f66..526ae14b87 100644 --- a/test_runner/regress/test_migrations.py +++ b/test_runner/regress/test_migrations.py @@ -10,27 +10,31 @@ def test_migrations(neon_simple_env: NeonEnv): endpoint = env.endpoints.create("test_migrations") log_path = endpoint.endpoint_path() / "compute.log" - endpoint.respec(skip_pg_catalog_updates=False, features=["migrations"]) + endpoint.respec(skip_pg_catalog_updates=False) endpoint.start() - time.sleep(1) # Sleep to let migrations run + endpoint.wait_for_migrations() + + num_migrations = 9 with endpoint.cursor() as cur: cur.execute("SELECT id FROM neon_migration.migration_id") migration_id = cur.fetchall() - assert 
migration_id[0][0] == 2 + assert migration_id[0][0] == num_migrations with open(log_path, "r") as log_file: logs = log_file.read() - assert "INFO handle_migrations: Ran 2 migrations" in logs + assert f"INFO handle_migrations: Ran {num_migrations} migrations" in logs endpoint.stop() endpoint.start() - time.sleep(1) # Sleep to let migrations run + # We don't have a good way of knowing that the migrations code path finished executing + # in compute_ctl in the case that no migrations are being run + time.sleep(1) with endpoint.cursor() as cur: cur.execute("SELECT id FROM neon_migration.migration_id") migration_id = cur.fetchall() - assert migration_id[0][0] == 2 + assert migration_id[0][0] == num_migrations with open(log_path, "r") as log_file: logs = log_file.read() diff --git a/test_runner/regress/test_multixact.py b/test_runner/regress/test_multixact.py index 9db463dc4a..88f7a5db59 100644 --- a/test_runner/regress/test_multixact.py +++ b/test_runner/regress/test_multixact.py @@ -1,4 +1,3 @@ -from fixtures.log_helper import log from fixtures.neon_fixtures import NeonEnv, check_restored_datadir_content from fixtures.utils import query_scalar @@ -18,7 +17,6 @@ def test_multixact(neon_simple_env: NeonEnv, test_output_dir): env.neon_cli.create_branch("test_multixact", "empty") endpoint = env.endpoints.create_start("test_multixact") - log.info("postgres is running on 'test_multixact' branch") cur = endpoint.connect().cursor() cur.execute( """ @@ -78,7 +76,6 @@ def test_multixact(neon_simple_env: NeonEnv, test_output_dir): env.neon_cli.create_branch("test_multixact_new", "test_multixact", ancestor_start_lsn=lsn) endpoint_new = env.endpoints.create_start("test_multixact_new") - log.info("postgres is running on 'test_multixact_new' branch") next_multixact_id_new = endpoint_new.safe_psql( "SELECT next_multixact_id FROM pg_control_checkpoint()" )[0][0] diff --git a/test_runner/regress/test_neon_cli.py b/test_runner/regress/test_neon_cli.py index 16d120e24a..cb69f0ef39 100644 
--- a/test_runner/regress/test_neon_cli.py +++ b/test_runner/regress/test_neon_cli.py @@ -133,7 +133,7 @@ def test_cli_start_stop(neon_env_builder: NeonEnvBuilder): # Stop default ps/sk env.neon_cli.pageserver_stop(env.pageserver.id) env.neon_cli.safekeeper_stop() - env.neon_cli.attachment_service_stop(False) + env.neon_cli.storage_controller_stop(False) # Keep NeonEnv state up to date, it usually owns starting/stopping services env.pageserver.running = False @@ -175,7 +175,7 @@ def test_cli_start_stop_multi(neon_env_builder: NeonEnvBuilder): env.neon_cli.safekeeper_stop(neon_env_builder.safekeepers_id_start + 2) # Stop this to get out of the way of the following `start` - env.neon_cli.attachment_service_stop(False) + env.neon_cli.storage_controller_stop(False) # Default start res = env.neon_cli.raw_cli(["start"]) diff --git a/test_runner/regress/test_neon_extension.py b/test_runner/regress/test_neon_extension.py index 998f84f968..e31e1cab51 100644 --- a/test_runner/regress/test_neon_extension.py +++ b/test_runner/regress/test_neon_extension.py @@ -14,8 +14,6 @@ def test_neon_extension(neon_env_builder: NeonEnvBuilder): endpoint_main.respec(skip_pg_catalog_updates=False) endpoint_main.start() - log.info("postgres is running on 'test_create_extension_neon' branch") - with closing(endpoint_main.connect()) as conn: with conn.cursor() as cur: cur.execute("SELECT extversion from pg_extension where extname='neon'") @@ -25,4 +23,40 @@ def test_neon_extension(neon_env_builder: NeonEnvBuilder): # IMPORTANT: # If the version has changed, the test should be updated. # Ensure that the default version is also updated in the neon.control file - assert cur.fetchone() == ("1.1",) + assert cur.fetchone() == ("1.3",) + cur.execute("SELECT * from neon.NEON_STAT_FILE_CACHE") + res = cur.fetchall() + log.info(res) + assert len(res) == 1 + assert len(res[0]) == 5 + + +# Verify that the neon extension can be upgraded/downgraded. 
+def test_neon_extension_compatibility(neon_env_builder: NeonEnvBuilder): + env = neon_env_builder.init_start() + env.neon_cli.create_branch("test_neon_extension_compatibility") + + endpoint_main = env.endpoints.create("test_neon_extension_compatibility") + # don't skip pg_catalog updates - it runs CREATE EXTENSION neon + endpoint_main.respec(skip_pg_catalog_updates=False) + endpoint_main.start() + + with closing(endpoint_main.connect()) as conn: + with conn.cursor() as cur: + all_versions = ["1.3", "1.2", "1.1", "1.0"] + current_version = "1.3" + for idx, begin_version in enumerate(all_versions): + for target_version in all_versions[idx + 1 :]: + if current_version != begin_version: + cur.execute( + f"ALTER EXTENSION neon UPDATE TO '{begin_version}'; -- {current_version}->{begin_version}" + ) + current_version = begin_version + # downgrade + cur.execute( + f"ALTER EXTENSION neon UPDATE TO '{target_version}'; -- {begin_version}->{target_version}" + ) + # upgrade + cur.execute( + f"ALTER EXTENSION neon UPDATE TO '{begin_version}'; -- {target_version}->{begin_version}" + ) diff --git a/test_runner/regress/test_neon_local_cli.py b/test_runner/regress/test_neon_local_cli.py index 46b72fbca5..8edba49b8a 100644 --- a/test_runner/regress/test_neon_local_cli.py +++ b/test_runner/regress/test_neon_local_cli.py @@ -59,3 +59,5 @@ def test_neon_two_primary_endpoints_fail( env.neon_cli.endpoint_stop("ep1") # ep1 is stopped so create ep2 will succeed env.neon_cli.endpoint_start("ep2") + # cleanup + env.neon_cli.endpoint_stop("ep2") diff --git a/test_runner/regress/test_neon_superuser.py b/test_runner/regress/test_neon_superuser.py index 6be7c114cb..fd31df84da 100644 --- a/test_runner/regress/test_neon_superuser.py +++ b/test_runner/regress/test_neon_superuser.py @@ -1,26 +1,44 @@ -import time - +from fixtures.log_helper import log from fixtures.neon_fixtures import NeonEnv from fixtures.pg_version import PgVersion +from fixtures.utils import wait_until def 
test_neon_superuser(neon_simple_env: NeonEnv, pg_version: PgVersion): env = neon_simple_env - env.neon_cli.create_branch("test_neon_superuser", "empty") - endpoint = env.endpoints.create("test_neon_superuser") - endpoint.respec(skip_pg_catalog_updates=False, features=["migrations"]) - endpoint.start() + env.neon_cli.create_branch("test_neon_superuser_publisher", "empty") + pub = env.endpoints.create("test_neon_superuser_publisher") - time.sleep(1) # Sleep to let migrations run + env.neon_cli.create_branch("test_neon_superuser_subscriber") + sub = env.endpoints.create("test_neon_superuser_subscriber") - with endpoint.cursor() as cur: + pub.respec(skip_pg_catalog_updates=False) + pub.start() + + sub.respec(skip_pg_catalog_updates=False) + sub.start() + + pub.wait_for_migrations() + sub.wait_for_migrations() + + with pub.cursor() as cur: cur.execute( "CREATE ROLE mr_whiskers WITH PASSWORD 'cat' LOGIN INHERIT CREATEROLE CREATEDB BYPASSRLS REPLICATION IN ROLE neon_superuser" ) cur.execute("CREATE DATABASE neondb WITH OWNER mr_whiskers") cur.execute("GRANT ALL PRIVILEGES ON DATABASE neondb TO neon_superuser") - with endpoint.cursor(dbname="neondb", user="mr_whiskers", password="cat") as cur: + # If we don't do this, creating the subscription will fail later on PG16 + pub.edit_hba(["host all mr_whiskers 0.0.0.0/0 md5"]) + + with sub.cursor() as cur: + cur.execute( + "CREATE ROLE mr_whiskers WITH PASSWORD 'cat' LOGIN INHERIT CREATEROLE CREATEDB BYPASSRLS REPLICATION IN ROLE neon_superuser" + ) + cur.execute("CREATE DATABASE neondb WITH OWNER mr_whiskers") + cur.execute("GRANT ALL PRIVILEGES ON DATABASE neondb TO neon_superuser") + + with pub.cursor(dbname="neondb", user="mr_whiskers", password="cat") as cur: cur.execute("SELECT pg_has_role('mr_whiskers', 'neon_superuser', 'member')") assert cur.fetchall()[0][0] cur.execute("SELECT pg_has_role('mr_whiskers', 'neon_superuser', 'usage')") @@ -32,3 +50,50 @@ def test_neon_superuser(neon_simple_env: NeonEnv, pg_version: 
PgVersion): cur.execute("CREATE PUBLICATION pub FOR ALL TABLES") cur.execute("CREATE ROLE definitely_not_a_superuser WITH PASSWORD 'nope'") + cur.execute("CREATE DATABASE definitely_a_database") + cur.execute("CREATE TABLE t (a int)") + cur.execute("INSERT INTO t VALUES (10), (20)") + cur.execute("SELECT * from t") + res = cur.fetchall() + assert [r[0] for r in res] == [10, 20] + + with sub.cursor(dbname="neondb", user="mr_whiskers", password="cat") as cur: + cur.execute("CREATE TABLE t (a int)") + + pub_conn = f"host=localhost port={pub.pg_port} dbname=neondb user=mr_whiskers password=cat" + query = f"CREATE SUBSCRIPTION sub CONNECTION '{pub_conn}' PUBLICATION pub" + log.info(f"Creating subscription: {query}") + cur.execute(query) + + with pub.cursor(dbname="neondb", user="mr_whiskers", password="cat") as pcur: + pcur.execute("INSERT INTO t VALUES (30), (40)") + + def check_that_changes_propagated(): + cur.execute("SELECT * FROM t") + res = cur.fetchall() + log.info(res) + assert len(res) == 4 + assert [r[0] for r in res] == [10, 20, 30, 40] + + wait_until(10, 0.5, check_that_changes_propagated) + + # Test that pg_monitor is working for neon_superuser role + cur.execute("SELECT query from pg_stat_activity LIMIT 1") + assert cur.fetchall()[0][0] != "" + # Test that pg_monitor is not working for non neon_superuser role without grant + cur.execute("CREATE ROLE not_a_superuser LOGIN PASSWORD 'Password42!'") + cur.execute("GRANT not_a_superuser TO neon_superuser WITH ADMIN OPTION") + cur.execute("SET ROLE not_a_superuser") + cur.execute("SELECT query from pg_stat_activity LIMIT 1") + assert cur.fetchall()[0][0] == "" + cur.execute("RESET ROLE") + # Test that pg_monitor is working for non neon_superuser role with grant + cur.execute("GRANT pg_monitor TO not_a_superuser") + cur.execute("SET ROLE not_a_superuser") + cur.execute("SELECT query from pg_stat_activity LIMIT 1") + assert cur.fetchall()[0][0] != "" + cur.execute("RESET ROLE") + cur.execute("DROP ROLE 
not_a_superuser") + query = "DROP SUBSCRIPTION sub CASCADE" + log.info(f"Dropping subscription: {query}") + cur.execute(query) diff --git a/test_runner/regress/test_next_xid.py b/test_runner/regress/test_next_xid.py index da2580dbf9..e880445c4d 100644 --- a/test_runner/regress/test_next_xid.py +++ b/test_runner/regress/test_next_xid.py @@ -203,6 +203,16 @@ def test_import_at_2bil( $$; """ ) + + # Also create a multi-XID with members past the 2 billion mark + conn2 = endpoint.connect() + cur2 = conn2.cursor() + cur.execute("INSERT INTO t VALUES ('x')") + cur.execute("BEGIN; select * from t WHERE t = 'x' FOR SHARE;") + cur2.execute("BEGIN; select * from t WHERE t = 'x' FOR SHARE;") + cur.execute("COMMIT") + cur2.execute("COMMIT") + # A checkpoint writes a WAL record with xl_xid=0. Many other WAL # records would have the same effect. cur.execute("checkpoint") @@ -217,4 +227,4 @@ def test_import_at_2bil( conn = endpoint.connect() cur = conn.cursor() cur.execute("SELECT count(*) from t") - assert cur.fetchone() == (10000 + 1,) + assert cur.fetchone() == (10000 + 1 + 1,) diff --git a/test_runner/regress/test_old_request_lsn.py b/test_runner/regress/test_old_request_lsn.py index 9b0bab5125..391305c58a 100644 --- a/test_runner/regress/test_old_request_lsn.py +++ b/test_runner/regress/test_old_request_lsn.py @@ -20,7 +20,6 @@ def test_old_request_lsn(neon_env_builder: NeonEnvBuilder): env = neon_env_builder.init_start() env.neon_cli.create_branch("test_old_request_lsn", "main") endpoint = env.endpoints.create_start("test_old_request_lsn") - log.info("postgres is running on test_old_request_lsn branch") pg_conn = endpoint.connect() cur = pg_conn.cursor() diff --git a/test_runner/regress/test_ondemand_download.py b/test_runner/regress/test_ondemand_download.py index af2d7aae88..914f068afb 100644 --- a/test_runner/regress/test_ondemand_download.py +++ b/test_runner/regress/test_ondemand_download.py @@ -8,6 +8,7 @@ from typing import Any, DefaultDict, Dict, Tuple from 
fixtures.log_helper import log from fixtures.neon_fixtures import ( NeonEnvBuilder, + flush_ep_to_pageserver, last_flush_lsn_upload, wait_for_last_flush_lsn, ) @@ -17,6 +18,7 @@ from fixtures.pageserver.utils import ( wait_for_last_record_lsn, wait_for_upload, wait_for_upload_queue_empty, + wait_until_tenant_active, ) from fixtures.remote_storage import RemoteStorageKind from fixtures.types import Lsn @@ -165,6 +167,10 @@ def test_ondemand_download_timetravel(neon_env_builder: NeonEnvBuilder): tenant_id = env.initial_tenant timeline_id = env.initial_timeline + #### + # Produce layers + #### + lsns = [] table_len = 10000 @@ -194,11 +200,28 @@ def test_ondemand_download_timetravel(neon_env_builder: NeonEnvBuilder): # run checkpoint manually to be sure that data landed in remote storage client.timeline_checkpoint(tenant_id, timeline_id) - ##### Stop the first pageserver instance, erase all its data + # prevent new WAL from being produced, wait for layers to reach remote storage env.endpoints.stop_all() - - # wait until pageserver has successfully uploaded all the data to remote storage + for sk in env.safekeepers: + sk.stop() + # NB: the wait_for_upload returns as soon as remote_consistent_lsn == current_lsn. + # But the checkpoint also triggers a compaction + # => image layer generation => + # => doesn't advance LSN + # => but we want the remote state to deterministic, so additionally, wait for upload queue to drain wait_for_upload(client, tenant_id, timeline_id, current_lsn) + wait_for_upload_queue_empty(pageserver_http, env.initial_tenant, timeline_id) + client.deletion_queue_flush(execute=True) + env.pageserver.stop() + env.pageserver.start() + # We've shut down the SKs, then restarted the PSes to sever all walreceiver connections; + # This means pageserver's remote_consistent_lsn is now frozen to whatever it was after the pageserver.stop() call. + wait_until_tenant_active(client, tenant_id) + + ### + # Produce layers complete; + # Start the actual testing. 
+ ### def get_api_current_physical_size(): d = client.timeline_detail(tenant_id, timeline_id) @@ -215,9 +238,7 @@ def test_ondemand_download_timetravel(neon_env_builder: NeonEnvBuilder): log.info(filled_size) assert filled_current_physical == filled_size, "we don't yet do layer eviction" - # Wait until generated image layers are uploaded to S3 - wait_for_upload_queue_empty(pageserver_http, env.initial_tenant, timeline_id) - + # Stop the first pageserver instance, erase all its data env.pageserver.stop() # remove all the layer files @@ -497,7 +518,7 @@ def test_compaction_downloads_on_demand_without_image_creation(neon_env_builder: with endpoint.cursor() as cur: cur.execute("update a set id = -id") - wait_for_last_flush_lsn(env, endpoint, tenant_id, timeline_id) + flush_ep_to_pageserver(env, endpoint, tenant_id, timeline_id) pageserver_http.timeline_checkpoint(tenant_id, timeline_id) layers = pageserver_http.layer_map_info(tenant_id, timeline_id) diff --git a/test_runner/regress/test_pageserver_api.py b/test_runner/regress/test_pageserver_api.py index e29db1e252..877deee08f 100644 --- a/test_runner/regress/test_pageserver_api.py +++ b/test_runner/regress/test_pageserver_api.py @@ -73,7 +73,7 @@ def check_client(env: NeonEnv, client: PageserverHttpClient): # create new tenant and check it is also there tenant_id = TenantId.generate() client.tenant_create( - tenant_id, generation=env.attachment_service.attach_hook_issue(tenant_id, env.pageserver.id) + tenant_id, generation=env.storage_controller.attach_hook_issue(tenant_id, env.pageserver.id) ) assert tenant_id in {TenantId(t["id"]) for t in client.tenant_list()} diff --git a/test_runner/regress/test_pageserver_generations.py b/test_runner/regress/test_pageserver_generations.py index 63f6130af5..3ca13a904d 100644 --- a/test_runner/regress/test_pageserver_generations.py +++ b/test_runner/regress/test_pageserver_generations.py @@ -20,6 +20,7 @@ from fixtures.log_helper import log from fixtures.neon_fixtures import ( 
NeonEnv, NeonEnvBuilder, + NeonPageserver, PgBin, S3Scrubber, last_flush_lsn_upload, @@ -62,7 +63,7 @@ def generate_uploads_and_deletions( tenant_id: Optional[TenantId] = None, timeline_id: Optional[TimelineId] = None, data: Optional[str] = None, - pageserver_id: Optional[int] = None, + pageserver: NeonPageserver, ): """ Using the environment's default tenant + timeline, generate a load pattern @@ -77,14 +78,16 @@ def generate_uploads_and_deletions( timeline_id = env.initial_timeline assert timeline_id is not None - ps_http = env.pageserver.http_client() + ps_http = pageserver.http_client() with env.endpoints.create_start( - "main", tenant_id=tenant_id, pageserver_id=pageserver_id + "main", tenant_id=tenant_id, pageserver_id=pageserver.id ) as endpoint: if init: endpoint.safe_psql("CREATE TABLE foo (id INTEGER PRIMARY KEY, val text)") - last_flush_lsn_upload(env, endpoint, tenant_id, timeline_id) + last_flush_lsn_upload( + env, endpoint, tenant_id, timeline_id, pageserver_id=pageserver.id + ) def churn(data): endpoint.safe_psql_many( @@ -105,7 +108,9 @@ def generate_uploads_and_deletions( # We are waiting for uploads as well as local flush, in order to avoid leaving the system # in a state where there are "future layers" in remote storage that will generate deletions # after a restart. 
- last_flush_lsn_upload(env, endpoint, tenant_id, timeline_id) + last_flush_lsn_upload( + env, endpoint, tenant_id, timeline_id, pageserver_id=pageserver.id + ) ps_http.timeline_checkpoint(tenant_id, timeline_id) # Compaction should generate some GC-elegible layers @@ -198,14 +203,17 @@ def test_generations_upgrade(neon_env_builder: NeonEnvBuilder): env.broker.try_start() for sk in env.safekeepers: sk.start() - env.attachment_service.start() + env.storage_controller.start() + + # We will start a pageserver with no control_plane_api set, so it won't be able to self-register + env.storage_controller.node_register(env.pageserver) env.pageserver.start(overrides=('--pageserver-config-override=control_plane_api=""',)) env.neon_cli.create_tenant( tenant_id=env.initial_tenant, conf=TENANT_CONF, timeline_id=env.initial_timeline ) - generate_uploads_and_deletions(env, pageserver_id=env.pageserver.id) + generate_uploads_and_deletions(env, pageserver=env.pageserver) def parse_generation_suffix(key): m = re.match(".+-([0-9a-zA-Z]{8})$", key) @@ -233,7 +241,7 @@ def test_generations_upgrade(neon_env_builder: NeonEnvBuilder): # Starting without the override that disabled control_plane_api env.pageserver.start() - generate_uploads_and_deletions(env, pageserver_id=env.pageserver.id, init=False) + generate_uploads_and_deletions(env, pageserver=env.pageserver, init=False) legacy_objects: list[str] = [] suffixed_objects = [] @@ -265,9 +273,7 @@ def test_generations_upgrade(neon_env_builder: NeonEnvBuilder): # Having written a mixture of generation-aware and legacy index_part.json, # ensure the scrubber handles the situation as expected. 
- metadata_summary = S3Scrubber( - neon_env_builder.test_output_dir, neon_env_builder - ).scan_metadata() + metadata_summary = S3Scrubber(neon_env_builder).scan_metadata() assert metadata_summary["tenant_count"] == 1 # Scrubber should have seen our timeline assert metadata_summary["timeline_count"] == 1 assert metadata_summary["timeline_shard_count"] == 1 @@ -279,13 +285,16 @@ def test_deferred_deletion(neon_env_builder: NeonEnvBuilder): neon_env_builder.enable_pageserver_remote_storage( RemoteStorageKind.MOCK_S3, ) + neon_env_builder.num_pageservers = 2 env = neon_env_builder.init_start(initial_tenant_conf=TENANT_CONF) - some_other_pageserver = 1234 + attached_to_id = env.storage_controller.locate(env.initial_tenant)[0]["node_id"] + main_pageserver = env.get_pageserver(attached_to_id) + other_pageserver = [p for p in env.pageservers if p.id != attached_to_id][0] - ps_http = env.pageserver.http_client() + ps_http = main_pageserver.http_client() - generate_uploads_and_deletions(env) + generate_uploads_and_deletions(env, pageserver=main_pageserver) # Flush: pending deletions should all complete assert_deletion_queue(ps_http, lambda n: n > 0) @@ -298,14 +307,14 @@ def test_deferred_deletion(neon_env_builder: NeonEnvBuilder): assert timeline["remote_consistent_lsn"] == timeline["remote_consistent_lsn_visible"] assert get_deletion_queue_dropped_lsn_updates(ps_http) == 0 - env.pageserver.allowed_errors.extend( + main_pageserver.allowed_errors.extend( [".*Dropped remote consistent LSN updates.*", ".*Dropping stale deletions.*"] ) # Now advance the generation in the control plane: subsequent validations # from the running pageserver will fail. No more deletions should happen. 
- env.attachment_service.attach_hook_issue(env.initial_tenant, some_other_pageserver) - generate_uploads_and_deletions(env, init=False, pageserver_id=env.pageserver.id) + env.storage_controller.attach_hook_issue(env.initial_tenant, other_pageserver.id) + generate_uploads_and_deletions(env, init=False, pageserver=main_pageserver) assert_deletion_queue(ps_http, lambda n: n > 0) queue_depth_before = get_deletion_queue_depth(ps_http) @@ -357,9 +366,14 @@ def test_deletion_queue_recovery( neon_env_builder.enable_pageserver_remote_storage( RemoteStorageKind.MOCK_S3, ) + neon_env_builder.num_pageservers = 2 env = neon_env_builder.init_start(initial_tenant_conf=TENANT_CONF) - ps_http = env.pageserver.http_client() + attached_to_id = env.storage_controller.locate(env.initial_tenant)[0]["node_id"] + main_pageserver = env.get_pageserver(attached_to_id) + other_pageserver = [p for p in env.pageservers if p.id != attached_to_id][0] + + ps_http = main_pageserver.http_client() failpoints = [ # Prevent deletion lists from being executed, to build up some backlog of deletions @@ -376,7 +390,7 @@ def test_deletion_queue_recovery( ps_http.configure_failpoints(failpoints) - generate_uploads_and_deletions(env) + generate_uploads_and_deletions(env, pageserver=main_pageserver) # There should be entries in the deletion queue assert_deletion_queue(ps_http, lambda n: n > 0) @@ -403,7 +417,7 @@ def test_deletion_queue_recovery( # also wait to see the header hit the disk: this seems paranoid but the race # can really happen on a heavily overloaded test machine. 
def assert_header_written(): - assert (env.pageserver.workdir / "deletion" / "header-01").exists() + assert (main_pageserver.workdir / "deletion" / "header-01").exists() wait_until(20, 1, assert_header_written) @@ -413,15 +427,15 @@ def test_deletion_queue_recovery( before_restart_depth = get_deletion_queue_validated(ps_http) log.info(f"Restarting pageserver with {before_restart_depth} deletions enqueued") - env.pageserver.stop(immediate=True) + main_pageserver.stop(immediate=True) if keep_attachment == KeepAttachment.LOSE: - some_other_pageserver = 101010 - env.attachment_service.attach_hook_issue(env.initial_tenant, some_other_pageserver) + some_other_pageserver = other_pageserver.id + env.storage_controller.attach_hook_issue(env.initial_tenant, some_other_pageserver) - env.pageserver.start() + main_pageserver.start() - def assert_deletions_submitted(n: int): + def assert_deletions_submitted(n: int) -> None: assert ps_http.get_metric_value("pageserver_deletion_queue_submitted_total") == n # After restart, issue a flush to kick the deletion frontend to do recovery. @@ -442,7 +456,7 @@ def test_deletion_queue_recovery( # validated before restart. assert get_deletion_queue_executed(ps_http) == before_restart_depth else: - env.pageserver.allowed_errors.extend([".*Dropping stale deletions.*"]) + main_pageserver.allowed_errors.extend([".*Dropping stale deletions.*"]) # If we lost the attachment, we should have dropped our pre-restart deletions. assert get_deletion_queue_dropped(ps_http) == before_restart_depth @@ -451,8 +465,8 @@ def test_deletion_queue_recovery( assert get_deletion_queue_dropped_lsn_updates(ps_http) == 0 # Restart again - env.pageserver.stop(immediate=True) - env.pageserver.start() + main_pageserver.stop(immediate=True) + main_pageserver.start() # No deletion lists should be recovered: this demonstrates that deletion lists # were cleaned up after being executed or dropped in the previous process lifetime. 
@@ -471,7 +485,7 @@ def test_emergency_mode(neon_env_builder: NeonEnvBuilder, pg_bin: PgBin): ps_http = env.pageserver.http_client() - generate_uploads_and_deletions(env, pageserver_id=env.pageserver.id) + generate_uploads_and_deletions(env, pageserver=env.pageserver) env.pageserver.allowed_errors.extend( [ @@ -483,12 +497,12 @@ def test_emergency_mode(neon_env_builder: NeonEnvBuilder, pg_bin: PgBin): ) # Simulate a major incident: the control plane goes offline - env.attachment_service.stop() + env.storage_controller.stop() # Remember how many validations had happened before the control plane went offline validated = get_deletion_queue_validated(ps_http) - generate_uploads_and_deletions(env, init=False, pageserver_id=env.pageserver.id) + generate_uploads_and_deletions(env, init=False, pageserver=env.pageserver) # The running pageserver should stop progressing deletions time.sleep(10) @@ -499,11 +513,11 @@ def test_emergency_mode(neon_env_builder: NeonEnvBuilder, pg_bin: PgBin): # and serve clients. 
env.pageserver.stop() # Non-immediate: implicitly checking that shutdown doesn't hang waiting for CP env.pageserver.start( - overrides=("--pageserver-config-override=control_plane_emergency_mode=true",) + overrides=("--pageserver-config-override=control_plane_emergency_mode=true",), ) # The pageserver should provide service to clients - generate_uploads_and_deletions(env, init=False, pageserver_id=env.pageserver.id) + generate_uploads_and_deletions(env, init=False, pageserver=env.pageserver) # The pageserver should neither validate nor execute any deletions, it should have # loaded the DeletionLists from before though @@ -513,7 +527,7 @@ def test_emergency_mode(neon_env_builder: NeonEnvBuilder, pg_bin: PgBin): assert get_deletion_queue_executed(ps_http) == 0 # When the control plane comes back up, normal service should resume - env.attachment_service.start() + env.storage_controller.start() ps_http.deletion_queue_flush(execute=True) assert get_deletion_queue_depth(ps_http) == 0 @@ -524,7 +538,7 @@ def test_emergency_mode(neon_env_builder: NeonEnvBuilder, pg_bin: PgBin): env.pageserver.stop() # Non-immediate: implicitly checking that shutdown doesn't hang waiting for CP env.pageserver.start() - generate_uploads_and_deletions(env, init=False, pageserver_id=env.pageserver.id) + generate_uploads_and_deletions(env, init=False, pageserver=env.pageserver) ps_http.deletion_queue_flush(execute=True) assert get_deletion_queue_depth(ps_http) == 0 assert get_deletion_queue_validated(ps_http) > 0 @@ -562,7 +576,7 @@ def test_eviction_across_generations(neon_env_builder: NeonEnvBuilder): tenant_id = env.initial_tenant timeline_id = env.initial_timeline - generate_uploads_and_deletions(env) + generate_uploads_and_deletions(env, pageserver=env.pageserver) read_all(env, tenant_id, timeline_id) evict_all_layers(env, tenant_id, timeline_id) diff --git a/test_runner/regress/test_pageserver_getpage_throttle.py b/test_runner/regress/test_pageserver_getpage_throttle.py new file mode 
100644 index 0000000000..42cc28efee --- /dev/null +++ b/test_runner/regress/test_pageserver_getpage_throttle.py @@ -0,0 +1,118 @@ +import json +import uuid + +from anyio import Path +from fixtures.log_helper import log +from fixtures.neon_fixtures import NeonEnvBuilder, PgBin +from fixtures.pg_version import PgVersion +from fixtures.types import TenantId, TimelineId +from fixtures.utils import wait_until + + +def test_pageserver_getpage_throttle(neon_env_builder: NeonEnvBuilder, pg_bin: PgBin): + env = neon_env_builder.init_start() + + env.pageserver.tenant_detach(env.initial_tenant) + + env.pageserver.allowed_errors.append( + # https://github.com/neondatabase/neon/issues/6925 + r".*query handler for.*pagestream.*failed: unexpected message: CopyFail during COPY.*" + ) + + tenant_id = TenantId.generate() + timeline_id = TimelineId.generate() + + rate_limit_rps = 100 + compaction_period = 5 + env.pageserver.tenant_create( + tenant_id, + conf={ + "compaction_period": f"{compaction_period}s", + "timeline_get_throttle": { + "task_kinds": ["PageRequestHandler"], + "initial": 0, + "refill_interval": "100ms", + "refill_amount": int(rate_limit_rps / 10), + "max": int(rate_limit_rps / 10), + "fair": True, + }, + }, + ) + + ps_http = env.pageserver.http_client() + + ps_http.timeline_create(PgVersion.V16, tenant_id, timeline_id) + + def run_pagebench_at_max_speed_and_get_total_requests_completed(duration_secs: int): + cmd = [ + str(env.neon_binpath / "pagebench"), + "get-page-latest-lsn", + "--mgmt-api-endpoint", + ps_http.base_url, + "--page-service-connstring", + env.pageserver.connstr(password=None), + "--runtime", + f"{duration_secs}s", + f"{tenant_id}/{timeline_id}", + ] + + basepath = pg_bin.run_capture(cmd, with_command_header=False) + results_path = Path(basepath + ".stdout") + log.info(f"Benchmark results at: {results_path}") + + with open(results_path, "r") as f: + results = json.load(f) + log.info(f"Results:\n{json.dumps(results, sort_keys=True, indent=2)}") + 
return int(results["total"]["request_count"]) + + log.info("warmup / make sure metrics are present") + run_pagebench_at_max_speed_and_get_total_requests_completed(2) + metrics_query = { + "tenant_id": str(tenant_id), + "timeline_id": str(timeline_id), + "smgr_query_type": "get_page_at_lsn", + } + metric_name = "pageserver_smgr_query_seconds_sum" + smgr_query_seconds_pre = ps_http.get_metric_value(metric_name, metrics_query) + assert smgr_query_seconds_pre is not None + + marker = uuid.uuid4().hex + ps_http.post_tracing_event("info", marker) + _, marker_offset = wait_until( + 10, 0.5, lambda: env.pageserver.assert_log_contains(marker, offset=None) + ) + + log.info("run pagebench") + duration_secs = 10 + actual_ncompleted = run_pagebench_at_max_speed_and_get_total_requests_completed(duration_secs) + + log.info("validate the client is capped at the configured rps limit") + expect_ncompleted = duration_secs * rate_limit_rps + delta_abs = abs(expect_ncompleted - actual_ncompleted) + threshold = 0.05 * expect_ncompleted + assert ( + threshold / rate_limit_rps < 0.1 * duration_secs + ), "test self-test: unrealistic expecations regarding precision in this test" + assert ( + delta_abs < 0.05 * expect_ncompleted + ), "the throttling deviates more than 5percent from the expectation" + + log.info("validate that we logged the throttling") + + wait_until( + 10, + compaction_period / 10, + lambda: env.pageserver.assert_log_contains( + f".*{tenant_id}.*shard was throttled in the last n_seconds.*", + offset=marker_offset, + ), + ) + + log.info("validate that the metric doesn't include throttle wait time") + smgr_query_seconds_post = ps_http.get_metric_value(metric_name, metrics_query) + assert smgr_query_seconds_post is not None + actual_smgr_query_seconds = smgr_query_seconds_post - smgr_query_seconds_pre + + assert ( + duration_secs >= 10 * actual_smgr_query_seconds + ), "smgr metrics should not include throttle wait time" diff --git 
a/test_runner/regress/test_pageserver_restart.py b/test_runner/regress/test_pageserver_restart.py index c4499196b5..753898f747 100644 --- a/test_runner/regress/test_pageserver_restart.py +++ b/test_runner/regress/test_pageserver_restart.py @@ -1,4 +1,6 @@ +import random from contextlib import closing +from typing import Optional import pytest from fixtures.log_helper import log @@ -141,18 +143,24 @@ def test_pageserver_restart(neon_env_builder: NeonEnvBuilder): # Test that repeatedly kills and restarts the page server, while the # safekeeper and compute node keep running. @pytest.mark.timeout(540) -def test_pageserver_chaos(neon_env_builder: NeonEnvBuilder, build_type: str): +@pytest.mark.parametrize("shard_count", [None, 4]) +def test_pageserver_chaos( + neon_env_builder: NeonEnvBuilder, build_type: str, shard_count: Optional[int] +): if build_type == "debug": pytest.skip("times out in debug builds") neon_env_builder.enable_pageserver_remote_storage(s3_storage()) neon_env_builder.enable_scrub_on_exit() + if shard_count is not None: + neon_env_builder.num_pageservers = shard_count - env = neon_env_builder.init_start() + env = neon_env_builder.init_start(initial_tenant_shard_count=shard_count) # these can happen, if we shutdown at a good time. to be fixed as part of #5172. message = ".*duplicated L1 layer layer=.*" - env.pageserver.allowed_errors.append(message) + for ps in env.pageservers: + ps.allowed_errors.append(message) # Use a tiny checkpoint distance, to create a lot of layers quickly. # That allows us to stress the compaction and layer flushing logic more. @@ -192,13 +200,19 @@ def test_pageserver_chaos(neon_env_builder: NeonEnvBuilder, build_type: str): log.info(f"shared_buffers is {row[0]}, table size {row[1]}") assert int(row[0]) < int(row[1]) + # We run "random" kills using a fixed seed, to improve reproducibility if a test + # failure is related to a particular order of operations. 
+ seed = 0xDEADBEEF + rng = random.Random(seed) + # Update the whole table, then immediately kill and restart the pageserver for i in range(1, 15): endpoint.safe_psql("UPDATE foo set updates = updates + 1") # This kills the pageserver immediately, to simulate a crash - env.pageserver.stop(immediate=True) - env.pageserver.start() + to_kill = rng.choice(env.pageservers) + to_kill.stop(immediate=True) + to_kill.start() # Check that all the updates are visible num_updates = endpoint.safe_psql("SELECT sum(updates) FROM foo")[0][0] diff --git a/test_runner/regress/test_pageserver_secondary.py b/test_runner/regress/test_pageserver_secondary.py index 293152dd62..79145f61b3 100644 --- a/test_runner/regress/test_pageserver_secondary.py +++ b/test_runner/regress/test_pageserver_secondary.py @@ -1,3 +1,4 @@ +import json import random from pathlib import Path from typing import Any, Dict, Optional @@ -7,9 +8,10 @@ from fixtures.log_helper import log from fixtures.neon_fixtures import NeonEnvBuilder, NeonPageserver, S3Scrubber from fixtures.pageserver.utils import ( assert_prefix_empty, + poll_for_remote_storage_iterations, tenant_delete_wait_completed, ) -from fixtures.remote_storage import LocalFsStorage, RemoteStorageKind +from fixtures.remote_storage import LocalFsStorage, RemoteStorageKind, S3Storage from fixtures.types import TenantId, TimelineId from fixtures.utils import wait_until from fixtures.workload import Workload @@ -73,16 +75,19 @@ def test_location_conf_churn(neon_env_builder: NeonEnvBuilder, seed: int): tenant_id = env.initial_tenant timeline_id = env.initial_timeline - # We will make no effort to avoid stale attachments for ps in env.pageservers: ps.allowed_errors.extend( [ + # We will make no effort to avoid stale attachments ".*Dropped remote consistent LSN updates.*", ".*Dropping stale deletions.*", # page_service_conn_main{peer_addr=[::1]:41176}: query handler for 'pagestream 3b19aec5038c796f64b430b30a555121 d07776761d44050b8aab511df1657d83' failed: Tenant 
3b19aec5038c796f64b430b30a555121 not found ".*query handler.*Tenant.*not found.*", # page_service_conn_main{peer_addr=[::1]:45552}: query handler for 'pagestream 414ede7ad50f775a8e7d9ba0e43b9efc a43884be16f44b3626482b6981b2c745' failed: Tenant 414ede7ad50f775a8e7d9ba0e43b9efc is not active ".*query handler.*Tenant.*not active.*", + # this shutdown case is logged at WARN severity by the time it bubbles up to logical size calculation code + # WARN ...: initial size calculation failed: downloading failed, possibly for shutdown + ".*downloading failed, possibly for shutdown", ] ) @@ -152,7 +157,7 @@ def test_location_conf_churn(neon_env_builder: NeonEnvBuilder, seed: int): workload.churn_rows(rng.randint(128, 256), pageserver.id) workload.validate(pageserver.id) elif last_state_ps[0].startswith("Attached"): - # The `attachment_service` will only re-attach on startup when a pageserver was the + # The `storage_controller` will only re-attach on startup when a pageserver was the # holder of the latest generation: otherwise the pageserver will revert to detached # state if it was running attached with a stale generation last_state[pageserver.id] = ("Detached", None) @@ -177,12 +182,12 @@ def test_location_conf_churn(neon_env_builder: NeonEnvBuilder, seed: int): generation = last_state_ps[1] else: # Switch generations, while also jumping between attached states - generation = env.attachment_service.attach_hook_issue( + generation = env.storage_controller.attach_hook_issue( tenant_id, pageserver.id ) latest_attached = pageserver.id else: - generation = env.attachment_service.attach_hook_issue(tenant_id, pageserver.id) + generation = env.storage_controller.attach_hook_issue(tenant_id, pageserver.id) latest_attached = pageserver.id else: generation = None @@ -224,9 +229,8 @@ def test_live_migration(neon_env_builder: NeonEnvBuilder): Test the sequence of location states that are used in a live migration. 
""" neon_env_builder.num_pageservers = 2 - neon_env_builder.enable_pageserver_remote_storage( - remote_storage_kind=RemoteStorageKind.MOCK_S3, - ) + remote_storage_kind = RemoteStorageKind.MOCK_S3 + neon_env_builder.enable_pageserver_remote_storage(remote_storage_kind=remote_storage_kind) env = neon_env_builder.init_start(initial_tenant_conf=TENANT_CONF) tenant_id = env.initial_tenant @@ -269,7 +273,7 @@ def test_live_migration(neon_env_builder: NeonEnvBuilder): # Encourage the new location to download while still in secondary mode pageserver_b.http_client().tenant_secondary_download(tenant_id) - migrated_generation = env.attachment_service.attach_hook_issue(tenant_id, pageserver_b.id) + migrated_generation = env.storage_controller.attach_hook_issue(tenant_id, pageserver_b.id) log.info(f"Acquired generation {migrated_generation} for destination pageserver") assert migrated_generation == initial_generation + 1 @@ -342,6 +346,12 @@ def test_live_migration(neon_env_builder: NeonEnvBuilder): workload.churn_rows(64, pageserver_b.id) workload.validate(pageserver_b.id) + del workload + + # Check that deletion works properly on a tenant that was live-migrated + # (reproduce https://github.com/neondatabase/neon/issues/6802) + iterations = poll_for_remote_storage_iterations(remote_storage_kind) + tenant_delete_wait_completed(pageserver_b.http_client(), tenant_id, iterations) def test_heatmap_uploads(neon_env_builder: NeonEnvBuilder): @@ -426,7 +436,8 @@ def test_secondary_downloads(neon_env_builder: NeonEnvBuilder): remote_storage_kind=RemoteStorageKind.MOCK_S3, ) env = neon_env_builder.init_start(initial_tenant_conf=TENANT_CONF) - assert env.attachment_service is not None + assert env.storage_controller is not None + assert isinstance(env.pageserver_remote_storage, S3Storage) # Satisfy linter tenant_id = env.initial_tenant timeline_id = env.initial_timeline @@ -482,23 +493,40 @@ def test_secondary_downloads(neon_env_builder: NeonEnvBuilder): # Do evictions on attached 
pageserver, check secondary follows along # ================================================================== - log.info("Evicting a layer...") - layer_to_evict = list_layers(ps_attached, tenant_id, timeline_id)[0] - ps_attached.http_client().evict_layer(tenant_id, timeline_id, layer_name=layer_to_evict.name) + try: + log.info("Evicting a layer...") + layer_to_evict = list_layers(ps_attached, tenant_id, timeline_id)[0] + some_other_layer = list_layers(ps_attached, tenant_id, timeline_id)[1] + log.info(f"Victim layer: {layer_to_evict.name}") + ps_attached.http_client().evict_layer( + tenant_id, timeline_id, layer_name=layer_to_evict.name + ) - log.info("Synchronizing after eviction...") - ps_attached.http_client().tenant_heatmap_upload(tenant_id) - ps_secondary.http_client().tenant_secondary_download(tenant_id) + log.info("Synchronizing after eviction...") + ps_attached.http_client().tenant_heatmap_upload(tenant_id) + heatmap_after_eviction = env.pageserver_remote_storage.heatmap_content(tenant_id) + heatmap_layers = set( + layer["name"] for layer in heatmap_after_eviction["timelines"][0]["layers"] + ) + assert layer_to_evict.name not in heatmap_layers + assert some_other_layer.name in heatmap_layers - assert layer_to_evict not in list_layers(ps_attached, tenant_id, timeline_id) - assert list_layers(ps_attached, tenant_id, timeline_id) == list_layers( - ps_secondary, tenant_id, timeline_id - ) + ps_secondary.http_client().tenant_secondary_download(tenant_id) + + assert layer_to_evict not in list_layers(ps_attached, tenant_id, timeline_id) + assert list_layers(ps_attached, tenant_id, timeline_id) == list_layers( + ps_secondary, tenant_id, timeline_id + ) + except: + # On assertion failures, log some details to help with debugging + heatmap = env.pageserver_remote_storage.heatmap_content(tenant_id) + log.warn(f"heatmap contents: {json.dumps(heatmap,indent=2)}") + raise # Scrub the remote storage # ======================== # This confirms that the scrubber isn't upset 
by the presence of the heatmap - S3Scrubber(neon_env_builder.test_output_dir, neon_env_builder).scan_metadata() + S3Scrubber(neon_env_builder).scan_metadata() # Detach secondary and delete tenant # =================================== diff --git a/test_runner/regress/test_pageserver_small_inmemory_layers.py b/test_runner/regress/test_pageserver_small_inmemory_layers.py new file mode 100644 index 0000000000..5d55020e3c --- /dev/null +++ b/test_runner/regress/test_pageserver_small_inmemory_layers.py @@ -0,0 +1,110 @@ +import asyncio +import time +from typing import Tuple + +import pytest +from fixtures.log_helper import log +from fixtures.neon_fixtures import ( + NeonEnv, + NeonEnvBuilder, + tenant_get_shards, +) +from fixtures.pageserver.http import PageserverHttpClient +from fixtures.pageserver.utils import wait_for_last_record_lsn +from fixtures.types import Lsn, TenantId, TimelineId +from fixtures.utils import wait_until + +TIMELINE_COUNT = 10 +ENTRIES_PER_TIMELINE = 10_000 +CHECKPOINT_TIMEOUT_SECONDS = 60 + +TENANT_CONF = { + # Large `checkpoint_distance` effectively disables size + # based checkpointing. 
+ "checkpoint_distance": f"{2 * 1024 ** 3}", + "checkpoint_timeout": f"{CHECKPOINT_TIMEOUT_SECONDS}s", +} + + +async def run_worker(env: NeonEnv, entries: int) -> Tuple[TenantId, TimelineId, Lsn]: + tenant, timeline = env.neon_cli.create_tenant(conf=TENANT_CONF) + with env.endpoints.create_start("main", tenant_id=tenant) as ep: + conn = await ep.connect_async() + try: + await conn.execute("CREATE TABLE IF NOT EXISTS t(key serial primary key, value text)") + await conn.execute( + f"INSERT INTO t SELECT i, CONCAT('payload_', i) FROM generate_series(0,{entries}) as i" + ) + finally: + await conn.close(timeout=10) + + last_flush_lsn = Lsn(ep.safe_psql("SELECT pg_current_wal_flush_lsn()")[0][0]) + return tenant, timeline, last_flush_lsn + + +async def workload( + env: NeonEnv, timelines: int, entries: int +) -> list[Tuple[TenantId, TimelineId, Lsn]]: + workers = [asyncio.create_task(run_worker(env, entries)) for _ in range(timelines)] + return await asyncio.gather(*workers) + + +def wait_until_pageserver_is_caught_up( + env: NeonEnv, last_flush_lsns: list[Tuple[TenantId, TimelineId, Lsn]] +): + for tenant, timeline, last_flush_lsn in last_flush_lsns: + shards = tenant_get_shards(env, tenant) + for tenant_shard_id, pageserver in shards: + waited = wait_for_last_record_lsn( + pageserver.http_client(), tenant_shard_id, timeline, last_flush_lsn + ) + assert waited >= last_flush_lsn + + +def wait_for_wal_ingest_metric(pageserver_http: PageserverHttpClient) -> float: + def query(): + value = pageserver_http.get_metric_value("pageserver_wal_ingest_records_received_total") + assert value is not None + return value + + # The metric gets initialised on the first update. + # Retry a few times, but return 0 if it's stable. 
+ try: + return float(wait_until(3, 0.5, query)) + except Exception: + return 0 + + +@pytest.mark.parametrize("immediate_shutdown", [True, False]) +def test_pageserver_small_inmemory_layers( + neon_env_builder: NeonEnvBuilder, immediate_shutdown: bool +): + """ + Test that open layers get flushed after the `checkpoint_timeout` config + and do not require WAL reingest upon restart. + + The workload creates a number of timelines and writes some data to each, + but not enough to trigger flushes via the `checkpoint_distance` config. + """ + env = neon_env_builder.init_configs() + env.start() + + last_flush_lsns = asyncio.run(workload(env, TIMELINE_COUNT, ENTRIES_PER_TIMELINE)) + wait_until_pageserver_is_caught_up(env, last_flush_lsns) + + ps_http_client = env.pageserver.http_client() + total_wal_ingested_before_restart = wait_for_wal_ingest_metric(ps_http_client) + + log.info("Sleeping for checkpoint timeout ...") + time.sleep(CHECKPOINT_TIMEOUT_SECONDS + 5) + + env.pageserver.restart(immediate=immediate_shutdown) + wait_until_pageserver_is_caught_up(env, last_flush_lsns) + + total_wal_ingested_after_restart = wait_for_wal_ingest_metric(ps_http_client) + + log.info(f"WAL ingested before restart: {total_wal_ingested_before_restart}") + log.info(f"WAL ingested after restart: {total_wal_ingested_after_restart}") + + leeway = total_wal_ingested_before_restart * 5 / 100 + assert total_wal_ingested_after_restart <= leeway diff --git a/test_runner/regress/test_parallel_copy.py b/test_runner/regress/test_parallel_copy.py index 6f74d50b92..b33e387a66 100644 --- a/test_runner/regress/test_parallel_copy.py +++ b/test_runner/regress/test_parallel_copy.py @@ -1,7 +1,6 @@ import asyncio from io import BytesIO -from fixtures.log_helper import log from fixtures.neon_fixtures import Endpoint, NeonEnv @@ -44,7 +43,6 @@ def test_parallel_copy(neon_simple_env: NeonEnv, n_parallel=5): env = neon_simple_env env.neon_cli.create_branch("test_parallel_copy", "empty") endpoint = 
env.endpoints.create_start("test_parallel_copy") - log.info("postgres is running on 'test_parallel_copy' branch") # Create test table conn = endpoint.connect() diff --git a/test_runner/regress/test_pg_regress.py b/test_runner/regress/test_pg_regress.py index f26d04e2f3..e4219ec7a6 100644 --- a/test_runner/regress/test_pg_regress.py +++ b/test_runner/regress/test_pg_regress.py @@ -2,25 +2,40 @@ # This file runs pg_regress-based tests. # from pathlib import Path +from typing import Optional -from fixtures.neon_fixtures import NeonEnv, check_restored_datadir_content +import pytest +from fixtures.neon_fixtures import ( + NeonEnvBuilder, + check_restored_datadir_content, +) +from fixtures.remote_storage import s3_storage # Run the main PostgreSQL regression tests, in src/test/regress. # +@pytest.mark.parametrize("shard_count", [None, 4]) def test_pg_regress( - neon_simple_env: NeonEnv, + neon_env_builder: NeonEnvBuilder, test_output_dir: Path, pg_bin, capsys, base_dir: Path, pg_distrib_dir: Path, + shard_count: Optional[int], ): - env = neon_simple_env + """ + :param shard_count: if None, create an unsharded tenant. Otherwise create a tenant with this + many shards. + """ + if shard_count is not None: + neon_env_builder.num_pageservers = shard_count + neon_env_builder.enable_pageserver_remote_storage(s3_storage()) + neon_env_builder.enable_scrub_on_exit() + env = neon_env_builder.init_start(initial_tenant_shard_count=shard_count) - env.neon_cli.create_branch("test_pg_regress", "empty") # Connect to postgres and create a database called "regression". - endpoint = env.endpoints.create_start("test_pg_regress") + endpoint = env.endpoints.create_start("main") endpoint.safe_psql("CREATE DATABASE regression") # Create some local directories for pg_regress to run in. @@ -61,22 +76,25 @@ def test_pg_regress( # Run the PostgreSQL "isolation" tests, in src/test/isolation. 
# +@pytest.mark.parametrize("shard_count", [None, 4]) def test_isolation( - neon_simple_env: NeonEnv, + neon_env_builder: NeonEnvBuilder, test_output_dir: Path, pg_bin, capsys, base_dir: Path, pg_distrib_dir: Path, + shard_count: Optional[int], ): - env = neon_simple_env + if shard_count is not None: + neon_env_builder.num_pageservers = shard_count + neon_env_builder.enable_pageserver_remote_storage(s3_storage()) + neon_env_builder.enable_scrub_on_exit() + env = neon_env_builder.init_start(initial_tenant_shard_count=shard_count) - env.neon_cli.create_branch("test_isolation", "empty") # Connect to postgres and create a database called "regression". # isolation tests use prepared transactions, so enable them - endpoint = env.endpoints.create_start( - "test_isolation", config_lines=["max_prepared_transactions=100"] - ) + endpoint = env.endpoints.create_start("main", config_lines=["max_prepared_transactions=100"]) endpoint.safe_psql("CREATE DATABASE isolation_regression") # Create some local directories for pg_isolation_regress to run in. @@ -114,19 +132,24 @@ def test_isolation( # Run extra Neon-specific pg_regress-based tests. The tests and their # schedule file are in the sql_regress/ directory. +@pytest.mark.parametrize("shard_count", [None, 4]) def test_sql_regress( - neon_simple_env: NeonEnv, + neon_env_builder: NeonEnvBuilder, test_output_dir: Path, pg_bin, capsys, base_dir: Path, pg_distrib_dir: Path, + shard_count: Optional[int], ): - env = neon_simple_env + if shard_count is not None: + neon_env_builder.num_pageservers = shard_count + neon_env_builder.enable_pageserver_remote_storage(s3_storage()) + neon_env_builder.enable_scrub_on_exit() + env = neon_env_builder.init_start(initial_tenant_shard_count=shard_count) - env.neon_cli.create_branch("test_sql_regress", "empty") # Connect to postgres and create a database called "regression". 
- endpoint = env.endpoints.create_start("test_sql_regress") + endpoint = env.endpoints.create_start("main") endpoint.safe_psql("CREATE DATABASE regression") # Create some local directories for pg_regress to run in. diff --git a/test_runner/regress/test_pitr_gc.py b/test_runner/regress/test_pitr_gc.py index c2ea5b332a..539ef3eda7 100644 --- a/test_runner/regress/test_pitr_gc.py +++ b/test_runner/regress/test_pitr_gc.py @@ -16,7 +16,6 @@ def test_pitr_gc(neon_env_builder: NeonEnvBuilder): env = neon_env_builder.init_start() endpoint_main = env.endpoints.create_start("main") - log.info("postgres is running on 'main' branch") main_pg_conn = endpoint_main.connect() main_cur = main_pg_conn.cursor() diff --git a/test_runner/regress/test_proxy.py b/test_runner/regress/test_proxy.py index 1d62f09840..078589d8eb 100644 --- a/test_runner/regress/test_proxy.py +++ b/test_runner/regress/test_proxy.py @@ -390,14 +390,47 @@ def test_sql_over_http_batch(static_proxy: NeonProxy): assert result[0]["rows"] == [{"answer": 42}] +def test_sql_over_http_batch_output_options(static_proxy: NeonProxy): + static_proxy.safe_psql("create role http with login password 'http' superuser") + + connstr = f"postgresql://http:http@{static_proxy.domain}:{static_proxy.proxy_port}/postgres" + response = requests.post( + f"https://{static_proxy.domain}:{static_proxy.external_http_port}/sql", + data=json.dumps( + { + "queries": [ + {"query": "select $1 as answer", "params": [42], "arrayMode": True}, + {"query": "select $1 as answer", "params": [42], "arrayMode": False}, + ] + } + ), + headers={ + "Content-Type": "application/sql", + "Neon-Connection-String": connstr, + "Neon-Batch-Isolation-Level": "Serializable", + "Neon-Batch-Read-Only": "false", + "Neon-Batch-Deferrable": "false", + }, + verify=str(static_proxy.test_output_dir / "proxy.crt"), + ) + assert response.status_code == 200 + results = response.json()["results"] + + assert results[0]["rowAsArray"] + assert results[0]["rows"] == [["42"]] + + 
assert not results[1]["rowAsArray"] + assert results[1]["rows"] == [{"answer": "42"}] + + def test_sql_over_http_pool(static_proxy: NeonProxy): static_proxy.safe_psql("create user http_auth with password 'http' superuser") - def get_pid(status: int, pw: str) -> Any: + def get_pid(status: int, pw: str, user="http_auth") -> Any: return static_proxy.http_query( GET_CONNECTION_PID_QUERY, [], - user="http_auth", + user=user, password=pw, expected_code=status, ) @@ -418,23 +451,29 @@ def test_sql_over_http_pool(static_proxy: NeonProxy): static_proxy.safe_psql("alter user http_auth with password 'http2'") - # after password change, should open a new connection to verify it - pid2 = get_pid(200, "http2")["rows"][0]["pid"] - assert pid1 != pid2 + # after password change, shouldn't open a new connection because it checks password in proxy. + rows = get_pid(200, "http2")["rows"] + assert rows == [{"pid": pid1}] time.sleep(0.02) - # query should be on an existing connection - pid = get_pid(200, "http2")["rows"][0]["pid"] - assert pid in [pid1, pid2] - - time.sleep(0.02) - - # old password should not work - res = get_pid(400, "http") + # incorrect user shouldn't reveal that the user doesn't exists + res = get_pid(400, "http", user="http_auth2") assert "password authentication failed for user" in res["message"] +def test_sql_over_http_urlencoding(static_proxy: NeonProxy): + static_proxy.safe_psql("create user \"http+auth$$\" with password '%+$^&*@!' superuser") + + static_proxy.http_query( + "select 1", + [], + user="http+auth$$", + password="%+$^&*@!", + expected_code=200, + ) + + # Beginning a transaction should not impact the next query, # which might come from a completely different client. 
def test_http_pool_begin(static_proxy: NeonProxy): @@ -515,3 +554,45 @@ def test_sql_over_http_pool_custom_types(static_proxy: NeonProxy): "select array['foo'::foo, 'bar'::foo, 'baz'::foo] as data", ) assert response["rows"][0]["data"] == ["foo", "bar", "baz"] + + +@pytest.mark.asyncio +async def test_sql_over_http2(static_proxy: NeonProxy): + static_proxy.safe_psql("create role http with login password 'http' superuser") + + resp = await static_proxy.http2_query( + "select 42 as answer", [], user="http", password="http", expected_code=200 + ) + assert resp["rows"] == [{"answer": 42}] + + +def test_sql_over_http_timeout_cancel(static_proxy: NeonProxy): + static_proxy.safe_psql("create role http with login password 'http' superuser") + + static_proxy.safe_psql("create table test_table ( id int primary key )") + + # insert into a table, with a unique constraint, after sleeping for n seconds + query = "WITH temp AS ( \ + SELECT pg_sleep($1) as sleep, $2::int as id \ + ) INSERT INTO test_table (id) SELECT id FROM temp" + + # expect to fail with timeout + res = static_proxy.http_query( + query, + [static_proxy.http_timeout_seconds + 1, 1], + user="http", + password="http", + expected_code=400, + ) + assert "Query cancelled, runtime exceeded" in res["message"], "HTTP query should time out" + + time.sleep(2) + + res = static_proxy.http_query(query, [1, 1], user="http", password="http", expected_code=200) + assert res["command"] == "INSERT", "HTTP query should insert" + assert res["rowCount"] == 1, "HTTP query should insert" + + res = static_proxy.http_query(query, [0, 1], user="http", password="http", expected_code=400) + assert ( + "duplicate key value violates unique constraint" in res["message"] + ), "HTTP query should conflict" diff --git a/test_runner/regress/test_proxy_allowed_ips.py b/test_runner/regress/test_proxy_allowed_ips.py index f533579811..7a804114ba 100644 --- a/test_runner/regress/test_proxy_allowed_ips.py +++ 
b/test_runner/regress/test_proxy_allowed_ips.py @@ -24,7 +24,7 @@ async def test_proxy_psql_allowed_ips(static_proxy: NeonProxy, vanilla_pg: Vanil with pytest.raises(psycopg2.Error) as exprinfo: static_proxy.safe_psql(**kwargs) text = str(exprinfo.value).strip() - assert "This IP address is not allowed to connect" in text + assert "not allowed to connect" in text # no SNI, deprecated `options=project` syntax (before we had several endpoint in project) check_cannot_connect(query="select 1", sslsni=0, options="project=private-project") diff --git a/test_runner/regress/test_read_validation.py b/test_runner/regress/test_read_validation.py index d695410efc..effb7e83f9 100644 --- a/test_runner/regress/test_read_validation.py +++ b/test_runner/regress/test_read_validation.py @@ -18,7 +18,6 @@ def test_read_validation(neon_simple_env: NeonEnv): env.neon_cli.create_branch("test_read_validation", "empty") endpoint = env.endpoints.create_start("test_read_validation") - log.info("postgres is running on 'test_read_validation' branch") with closing(endpoint.connect()) as con: with con.cursor() as c: @@ -145,7 +144,6 @@ def test_read_validation_neg(neon_simple_env: NeonEnv): env.pageserver.allowed_errors.append(".*invalid LSN\\(0\\) in request.*") endpoint = env.endpoints.create_start("test_read_validation_neg") - log.info("postgres is running on 'test_read_validation_neg' branch") with closing(endpoint.connect()) as con: with con.cursor() as c: diff --git a/test_runner/regress/test_readonly_node.py b/test_runner/regress/test_readonly_node.py index 2d641e36a7..b7c8f36107 100644 --- a/test_runner/regress/test_readonly_node.py +++ b/test_runner/regress/test_readonly_node.py @@ -16,7 +16,6 @@ def test_readonly_node(neon_simple_env: NeonEnv): env = neon_simple_env env.neon_cli.create_branch("test_readonly_node", "empty") endpoint_main = env.endpoints.create_start("test_readonly_node") - log.info("postgres is running on 'test_readonly_node' branch") 
env.pageserver.allowed_errors.append(".*basebackup .* failed: invalid basebackup lsn.*") diff --git a/test_runner/regress/test_recovery.py b/test_runner/regress/test_recovery.py index 9d7a4a8fd6..6aac1e1d84 100644 --- a/test_runner/regress/test_recovery.py +++ b/test_runner/regress/test_recovery.py @@ -19,7 +19,6 @@ def test_pageserver_recovery(neon_env_builder: NeonEnvBuilder): env.neon_cli.create_branch("test_pageserver_recovery", "main") endpoint = env.endpoints.create_start("test_pageserver_recovery") - log.info("postgres is running on 'test_pageserver_recovery' branch") with closing(endpoint.connect()) as conn: with conn.cursor() as cur: diff --git a/test_runner/regress/test_remote_storage.py b/test_runner/regress/test_remote_storage.py index 98b2e856ec..05f769b0e3 100644 --- a/test_runner/regress/test_remote_storage.py +++ b/test_runner/regress/test_remote_storage.py @@ -28,7 +28,14 @@ from fixtures.remote_storage import ( available_remote_storages, ) from fixtures.types import Lsn, TenantId, TimelineId -from fixtures.utils import print_gc_result, query_scalar, wait_until +from fixtures.utils import ( + assert_eq, + assert_ge, + assert_gt, + print_gc_result, + query_scalar, + wait_until, +) from requests import ReadTimeout @@ -73,9 +80,6 @@ def test_remote_storage_backup_and_restore( env.pageserver.allowed_errors.extend( [ - # FIXME: Is this expected? 
- ".*marking .* as locally complete, while it doesnt exist in remote index.*", - ".*No timelines to attach received.*", ".*Failed to get local tenant state.*", # FIXME retry downloads without throwing errors ".*failed to load remote timeline.*", @@ -123,10 +127,10 @@ def test_remote_storage_backup_and_restore( log.info(f"upload of checkpoint {checkpoint_number} is done") # Check that we had to retry the uploads - assert env.pageserver.log_contains( + env.pageserver.assert_log_contains( ".*failed to perform remote task UploadLayer.*, will retry.*" ) - assert env.pageserver.log_contains( + env.pageserver.assert_log_contains( ".*failed to perform remote task UploadMetadata.*, will retry.*" ) @@ -165,7 +169,7 @@ def test_remote_storage_backup_and_restore( # Ensure that even though the tenant is broken, retrying the attachment fails with pytest.raises(Exception, match="Tenant state is Broken"): # Use same generation as in previous attempt - gen_state = env.attachment_service.inspect(tenant_id) + gen_state = env.storage_controller.inspect(tenant_id) assert gen_state is not None generation = gen_state[0] env.pageserver.tenant_attach(tenant_id, generation=generation) @@ -231,9 +235,9 @@ def test_remote_storage_upload_queue_retries( tenant_id, timeline_id = env.neon_cli.create_tenant( conf={ # small checkpointing and compaction targets to ensure we generate many upload operations - "checkpoint_distance": f"{128 * 1024}", + "checkpoint_distance": f"{64 * 1024}", "compaction_threshold": "1", - "compaction_target_size": f"{128 * 1024}", + "compaction_target_size": f"{64 * 1024}", # no PITR horizon, we specify the horizon when we request on-demand GC "pitr_interval": "0s", # disable background compaction and GC. We invoke it manually when we want it to happen. 
@@ -259,33 +263,30 @@ def test_remote_storage_upload_queue_retries( ] ) + FOO_ROWS_COUNT = 4000 + def overwrite_data_and_wait_for_it_to_arrive_at_pageserver(data): # create initial set of layers & upload them with failpoints configured - endpoint.safe_psql_many( - [ - f""" - INSERT INTO foo (id, val) - SELECT g, '{data}' - FROM generate_series(1, 20000) g - ON CONFLICT (id) DO UPDATE - SET val = EXCLUDED.val - """, - # to ensure that GC can actually remove some layers - "VACUUM foo", - ] - ) + for _v in range(2): + endpoint.safe_psql_many( + [ + f""" + INSERT INTO foo (id, val) + SELECT g, '{data}' + FROM generate_series(1, {FOO_ROWS_COUNT}) g + ON CONFLICT (id) DO UPDATE + SET val = EXCLUDED.val + """, + # to ensure that GC can actually remove some layers + "VACUUM foo", + ] + ) wait_for_last_flush_lsn(env, endpoint, tenant_id, timeline_id) def get_queued_count(file_kind, op_kind): - val = client.get_remote_timeline_client_metric( - "pageserver_remote_timeline_client_calls_unfinished", - tenant_id, - timeline_id, - file_kind, - op_kind, + return client.get_remote_timeline_client_queue_count( + tenant_id, timeline_id, file_kind, op_kind ) - assert val is not None, "expecting metric to be present" - return int(val) # create some layers & wait for uploads to finish overwrite_data_and_wait_for_it_to_arrive_at_pageserver("a") @@ -298,9 +299,9 @@ def test_remote_storage_upload_queue_retries( print_gc_result(gc_result) assert gc_result["layers_removed"] > 0 - wait_until(2, 1, lambda: get_queued_count(file_kind="layer", op_kind="upload") == 0) - wait_until(2, 1, lambda: get_queued_count(file_kind="index", op_kind="upload") == 0) - wait_until(2, 1, lambda: get_queued_count(file_kind="layer", op_kind="delete") == 0) + wait_until(2, 1, lambda: assert_eq(get_queued_count(file_kind="layer", op_kind="upload"), 0)) + wait_until(2, 1, lambda: assert_eq(get_queued_count(file_kind="index", op_kind="upload"), 0)) + wait_until(2, 1, lambda: 
assert_eq(get_queued_count(file_kind="layer", op_kind="delete"), 0)) # let all future operations queue up configure_storage_sync_failpoints("return") @@ -328,21 +329,22 @@ def test_remote_storage_upload_queue_retries( churn_while_failpoints_active_thread.start() # wait for churn thread's data to get stuck in the upload queue - wait_until(10, 0.1, lambda: get_queued_count(file_kind="layer", op_kind="upload") > 0) - wait_until(10, 0.1, lambda: get_queued_count(file_kind="index", op_kind="upload") >= 2) - wait_until(10, 0.1, lambda: get_queued_count(file_kind="layer", op_kind="delete") > 0) + # Exponential back-off in upload queue, so, gracious timeouts. + + wait_until(30, 1, lambda: assert_gt(get_queued_count(file_kind="layer", op_kind="upload"), 0)) + wait_until(30, 1, lambda: assert_ge(get_queued_count(file_kind="index", op_kind="upload"), 2)) + wait_until(30, 1, lambda: assert_gt(get_queued_count(file_kind="layer", op_kind="delete"), 0)) # unblock churn operations configure_storage_sync_failpoints("off") - # ... and wait for them to finish. Exponential back-off in upload queue, so, gracious timeouts. - wait_until(30, 1, lambda: get_queued_count(file_kind="layer", op_kind="upload") == 0) - wait_until(30, 1, lambda: get_queued_count(file_kind="index", op_kind="upload") == 0) - wait_until(30, 1, lambda: get_queued_count(file_kind="layer", op_kind="delete") == 0) + wait_until(30, 1, lambda: assert_eq(get_queued_count(file_kind="layer", op_kind="upload"), 0)) + wait_until(30, 1, lambda: assert_eq(get_queued_count(file_kind="index", op_kind="upload"), 0)) + wait_until(30, 1, lambda: assert_eq(get_queued_count(file_kind="layer", op_kind="delete"), 0)) # The churn thread doesn't make progress once it blocks on the first wait_completion() call, # so, give it some time to wrap up. 
- churn_while_failpoints_active_thread.join(30) + churn_while_failpoints_active_thread.join(60) assert not churn_while_failpoints_active_thread.is_alive() assert churn_thread_result[0] @@ -353,7 +355,7 @@ def test_remote_storage_upload_queue_retries( env.pageserver.stop(immediate=True) env.endpoints.stop_all() - # We are about to forcibly drop local dirs. Attachment service will increment generation in re-attach before + # We are about to forcibly drop local dirs. Storage controller will increment generation in re-attach before # we later increment when actually attaching it again, leading to skipping a generation and potentially getting # these warnings if there was a durable but un-executed deletion list at time of restart. env.pageserver.allowed_errors.extend( @@ -374,7 +376,7 @@ def test_remote_storage_upload_queue_retries( log.info("restarting postgres to validate") endpoint = env.endpoints.create_start("main", tenant_id=tenant_id) with endpoint.cursor() as cur: - assert query_scalar(cur, "SELECT COUNT(*) FROM foo WHERE val = 'd'") == 20000 + assert query_scalar(cur, "SELECT COUNT(*) FROM foo WHERE val = 'd'") == FOO_ROWS_COUNT def test_remote_timeline_client_calls_started_metric( @@ -388,6 +390,7 @@ def test_remote_timeline_client_calls_started_metric( initial_tenant_conf={ # small checkpointing and compaction targets to ensure we generate many upload operations "checkpoint_distance": f"{128 * 1024}", + # ensure each timeline_checkpoint() calls creates L1s "compaction_threshold": "1", "compaction_target_size": f"{128 * 1024}", # no PITR horizon, we specify the horizon when we request on-demand GC @@ -395,8 +398,6 @@ def test_remote_timeline_client_calls_started_metric( # disable background compaction and GC. We invoke it manually when we want it to happen. 
"gc_period": "0s", "compaction_period": "0s", - # create image layers eagerly, so that GC can remove some layers - "image_creation_threshold": "1", } ) @@ -437,7 +438,7 @@ def test_remote_timeline_client_calls_started_metric( assert timeline_id is not None for (file_kind, op_kind), observations in calls_started.items(): val = client.get_metric_value( - name="pageserver_remote_timeline_client_calls_started_count", + name="pageserver_remote_timeline_client_calls_started_total", filter={ "file_kind": str(file_kind), "op_kind": str(op_kind), @@ -455,12 +456,17 @@ def test_remote_timeline_client_calls_started_metric( ), f"observations for {file_kind} {op_kind} did not grow monotonically: {observations}" def churn(data_pass1, data_pass2): + # overwrite the same data in place, vacuum inbetween, and + # and create image layers; then run a gc(). + # this should + # - create new layers + # - delete some layers overwrite_data_and_wait_for_it_to_arrive_at_pageserver(data_pass1) - client.timeline_checkpoint(tenant_id, timeline_id) - client.timeline_compact(tenant_id, timeline_id) overwrite_data_and_wait_for_it_to_arrive_at_pageserver(data_pass2) - client.timeline_checkpoint(tenant_id, timeline_id) - client.timeline_compact(tenant_id, timeline_id) + client.timeline_checkpoint(tenant_id, timeline_id, force_image_layer_creation=True) + overwrite_data_and_wait_for_it_to_arrive_at_pageserver(data_pass1) + overwrite_data_and_wait_for_it_to_arrive_at_pageserver(data_pass2) + client.timeline_checkpoint(tenant_id, timeline_id, force_image_layer_creation=True) gc_result = client.timeline_gc(tenant_id, timeline_id, 0) print_gc_result(gc_result) assert gc_result["layers_removed"] > 0 @@ -540,16 +546,6 @@ def test_timeline_deletion_with_files_stuck_in_upload_queue( client = env.pageserver.http_client() - def get_queued_count(file_kind, op_kind): - val = client.get_remote_timeline_client_metric( - "pageserver_remote_timeline_client_calls_unfinished", - tenant_id, - timeline_id, - file_kind, 
- op_kind, - ) - return int(val) if val is not None else val - endpoint = env.endpoints.create_start("main", tenant_id=tenant_id) client.configure_failpoints(("before-upload-layer", "return")) @@ -583,7 +579,10 @@ def test_timeline_deletion_with_files_stuck_in_upload_queue( def assert_compacted_and_uploads_queued(): assert timeline_path.exists() assert len(list(timeline_path.glob("*"))) >= 8 - assert get_queued_count(file_kind="index", op_kind="upload") > 0 + assert ( + get_queued_count(client, tenant_id, timeline_id, file_kind="index", op_kind="upload") + > 0 + ) wait_until(20, 0.1, assert_compacted_and_uploads_queued) @@ -621,7 +620,10 @@ def test_timeline_deletion_with_files_stuck_in_upload_queue( assert len(filtered) == 0 # timeline deletion should kill ongoing uploads, so, the metric will be gone - assert get_queued_count(file_kind="index", op_kind="upload") is None + assert ( + get_queued_count(client, tenant_id, timeline_id, file_kind="index", op_kind="upload") + is None + ) # timeline deletion should be unblocking checkpoint ops checkpoint_thread.join(2.0) @@ -707,10 +709,8 @@ def test_empty_branch_remote_storage_upload_on_restart(neon_env_builder: NeonEnv # index upload is now hitting the failpoint, it should block the shutdown env.pageserver.stop(immediate=True) - local_metadata = ( - env.pageserver.timeline_dir(env.initial_tenant, new_branch_timeline_id) / "metadata" - ) - assert local_metadata.is_file() + timeline_dir = env.pageserver.timeline_dir(env.initial_tenant, new_branch_timeline_id) + assert timeline_dir.is_dir() assert isinstance(env.pageserver_remote_storage, LocalFsStorage) @@ -892,26 +892,23 @@ def wait_upload_queue_empty( wait_until( 2, 1, - lambda: get_queued_count( - client, tenant_id, timeline_id, file_kind="layer", op_kind="upload" - ) - == 0, + lambda: assert_eq( + get_queued_count(client, tenant_id, timeline_id, file_kind="layer", op_kind="upload"), 0 + ), ) wait_until( 2, 1, - lambda: get_queued_count( - client, tenant_id, 
timeline_id, file_kind="index", op_kind="upload" - ) - == 0, + lambda: assert_eq( + get_queued_count(client, tenant_id, timeline_id, file_kind="index", op_kind="upload"), 0 + ), ) wait_until( 2, 1, - lambda: get_queued_count( - client, tenant_id, timeline_id, file_kind="layer", op_kind="delete" - ) - == 0, + lambda: assert_eq( + get_queued_count(client, tenant_id, timeline_id, file_kind="layer", op_kind="delete"), 0 + ), ) @@ -922,16 +919,8 @@ def get_queued_count( file_kind: str, op_kind: str, ): - val = client.get_remote_timeline_client_metric( - "pageserver_remote_timeline_client_calls_unfinished", - tenant_id, - timeline_id, - file_kind, - op_kind, - ) - if val is None: - return val - return int(val) + """The most important aspect of this function is shorter name & no return type so asserts are more concise.""" + return client.get_remote_timeline_client_queue_count(tenant_id, timeline_id, file_kind, op_kind) def assert_nothing_to_upload( diff --git a/test_runner/regress/test_replication_start.py b/test_runner/regress/test_replication_start.py new file mode 100644 index 0000000000..b4699c7be8 --- /dev/null +++ b/test_runner/regress/test_replication_start.py @@ -0,0 +1,30 @@ +from fixtures.log_helper import log +from fixtures.neon_fixtures import NeonEnv, wait_replica_caughtup + + +def test_replication_start(neon_simple_env: NeonEnv): + env = neon_simple_env + + with env.endpoints.create_start(branch_name="main", endpoint_id="primary") as primary: + with primary.connect() as p_con: + with p_con.cursor() as p_cur: + p_cur.execute("begin") + p_cur.execute("create table t(pk integer primary key, payload integer)") + p_cur.execute("insert into t values (generate_series(1,100000), 0)") + p_cur.execute("select txid_current()") + xid = p_cur.fetchall()[0][0] + log.info(f"Master transaction {xid}") + with env.endpoints.new_replica_start( + origin=primary, endpoint_id="secondary" + ) as secondary: + wait_replica_caughtup(primary, secondary) + with secondary.connect() as 
s_con: + with s_con.cursor() as s_cur: + # Enforce setting hint bits for pg_class tuples. + # If master's transaction is not marked as in-progress in MVCC snapshot, + # then XMIN_INVALID hint bit will be set for table's 't' tuple makeing it invisible. + s_cur.execute("select * from pg_class") + p_cur.execute("commit") + wait_replica_caughtup(primary, secondary) + s_cur.execute("select * from t where pk = 1") + assert s_cur.fetchone() == (1, 0) diff --git a/test_runner/regress/test_s3_restore.py b/test_runner/regress/test_s3_restore.py new file mode 100644 index 0000000000..611bd1c2a2 --- /dev/null +++ b/test_runner/regress/test_s3_restore.py @@ -0,0 +1,119 @@ +import time +from datetime import datetime, timezone + +from fixtures.neon_fixtures import ( + NeonEnvBuilder, + PgBin, +) +from fixtures.pageserver.utils import ( + MANY_SMALL_LAYERS_TENANT_CONFIG, + assert_prefix_empty, + enable_remote_storage_versioning, + poll_for_remote_storage_iterations, + tenant_delete_wait_completed, + wait_for_upload, +) +from fixtures.remote_storage import RemoteStorageKind, s3_storage +from fixtures.types import Lsn +from fixtures.utils import run_pg_bench_small + + +def test_tenant_s3_restore( + neon_env_builder: NeonEnvBuilder, + pg_bin: PgBin, +): + remote_storage_kind = s3_storage() + neon_env_builder.enable_pageserver_remote_storage(remote_storage_kind) + + # Mock S3 doesn't have versioning enabled by default, enable it + # (also do it before there is any writes to the bucket) + if remote_storage_kind == RemoteStorageKind.MOCK_S3: + remote_storage = neon_env_builder.pageserver_remote_storage + assert remote_storage, "remote storage not configured" + enable_remote_storage_versioning(remote_storage) + + env = neon_env_builder.init_start(initial_tenant_conf=MANY_SMALL_LAYERS_TENANT_CONFIG) + env.pageserver.allowed_errors.extend( + [ + # The deletion queue will complain when it encounters simulated S3 errors + ".*deletion executor: DeleteObjects request failed.*", + # lucky race 
with stopping from flushing a layer we fail to schedule any uploads + ".*layer flush task.+: could not flush frozen layer: update_metadata_file", + ] + ) + + ps_http = env.pageserver.http_client() + + tenant_id = env.initial_tenant + + # Default tenant and the one we created + assert ps_http.get_metric_value("pageserver_tenant_manager_slots") == 1 + + # create two timelines one being the parent of another, both with non-trivial data + parent = None + last_flush_lsns = [] + + for timeline in ["first", "second"]: + timeline_id = env.neon_cli.create_branch( + timeline, tenant_id=tenant_id, ancestor_branch_name=parent + ) + with env.endpoints.create_start(timeline, tenant_id=tenant_id) as endpoint: + run_pg_bench_small(pg_bin, endpoint.connstr()) + endpoint.safe_psql(f"CREATE TABLE created_{timeline}(id integer);") + last_flush_lsn = Lsn(endpoint.safe_psql("SELECT pg_current_wal_flush_lsn()")[0][0]) + last_flush_lsns.append(last_flush_lsn) + ps_http.timeline_checkpoint(tenant_id, timeline_id) + wait_for_upload(ps_http, tenant_id, timeline_id, last_flush_lsn) + parent = timeline + + # These sleeps are important because they fend off differences in clocks between us and S3 + time.sleep(4) + ts_before_deletion = datetime.now(tz=timezone.utc).replace(tzinfo=None) + time.sleep(4) + + assert ( + ps_http.get_metric_value("pageserver_tenant_manager_slots") == 1 + ), "tenant removed before we deletion was issued" + iterations = poll_for_remote_storage_iterations(remote_storage_kind) + tenant_delete_wait_completed(ps_http, tenant_id, iterations) + ps_http.deletion_queue_flush(execute=True) + assert ( + ps_http.get_metric_value("pageserver_tenant_manager_slots") == 0 + ), "tenant removed before we deletion was issued" + env.storage_controller.attach_hook_drop(tenant_id) + + tenant_path = env.pageserver.tenant_dir(tenant_id) + assert not tenant_path.exists() + + assert_prefix_empty( + neon_env_builder.pageserver_remote_storage, + prefix="/".join( + ( + "tenants", + str(tenant_id), 
+ ) + ), + ) + + time.sleep(4) + ts_after_deletion = datetime.now(tz=timezone.utc).replace(tzinfo=None) + time.sleep(4) + + ps_http.tenant_time_travel_remote_storage( + tenant_id, timestamp=ts_before_deletion, done_if_after=ts_after_deletion + ) + + generation = env.storage_controller.attach_hook_issue(tenant_id, env.pageserver.id) + + ps_http.tenant_attach(tenant_id, generation=generation) + env.pageserver.quiesce_tenants() + + for i, timeline in enumerate(["first", "second"]): + with env.endpoints.create_start(timeline, tenant_id=tenant_id) as endpoint: + endpoint.safe_psql(f"SELECT * FROM created_{timeline};") + last_flush_lsn = Lsn(endpoint.safe_psql("SELECT pg_current_wal_flush_lsn()")[0][0]) + expected_last_flush_lsn = last_flush_lsns[i] + # There might be some activity that advances the lsn so we can't use a strict equality check + assert last_flush_lsn >= expected_last_flush_lsn, "last_flush_lsn too old" + + assert ps_http.get_metric_value("pageserver_tenant_manager_slots") == 1 diff --git a/test_runner/regress/test_sharding.py b/test_runner/regress/test_sharding.py new file mode 100644 index 0000000000..9309af066b --- /dev/null +++ b/test_runner/regress/test_sharding.py @@ -0,0 +1,497 @@ +import os +from typing import Dict, List, Union + +import pytest +from fixtures.log_helper import log +from fixtures.neon_fixtures import ( + NeonEnvBuilder, + tenant_get_shards, +) +from fixtures.remote_storage import s3_storage +from fixtures.types import Lsn, TenantShardId, TimelineId +from fixtures.utils import wait_until +from fixtures.workload import Workload +from pytest_httpserver import HTTPServer +from werkzeug.wrappers.request import Request +from werkzeug.wrappers.response import Response + + +def test_sharding_smoke( + neon_env_builder: NeonEnvBuilder, +): + """ + Test the basic lifecycle of a sharded tenant: + - ingested data gets split up + - page service reads + - timeline creation and deletion + - splits + """ + + shard_count = 4 + 
neon_env_builder.num_pageservers = shard_count + + # 1MiB stripes: enable getting some meaningful data distribution without + # writing large quantities of data in this test. The stripe size is given + # in number of 8KiB pages. + stripe_size = 128 + + # Use S3-compatible remote storage so that we can scrub: this test validates + # that the scrubber doesn't barf when it sees a sharded tenant. + neon_env_builder.enable_pageserver_remote_storage(s3_storage()) + neon_env_builder.enable_scrub_on_exit() + + neon_env_builder.preserve_database_files = True + + env = neon_env_builder.init_start( + initial_tenant_shard_count=shard_count, initial_tenant_shard_stripe_size=stripe_size + ) + tenant_id = env.initial_tenant + + pageservers = dict((int(p.id), p) for p in env.pageservers) + shards = env.storage_controller.locate(tenant_id) + + def get_sizes(): + sizes = {} + for shard in shards: + node_id = int(shard["node_id"]) + pageserver = pageservers[node_id] + sizes[node_id] = pageserver.http_client().tenant_status(shard["shard_id"])[ + "current_physical_size" + ] + log.info(f"sizes = {sizes}") + return sizes + + # Test that timeline creation works on a sharded tenant + timeline_b = env.neon_cli.create_branch("branch_b", tenant_id=tenant_id) + + # Test that we can write data to a sharded tenant + workload = Workload(env, tenant_id, timeline_b, branch_name="branch_b") + workload.init() + + sizes_before = get_sizes() + workload.write_rows(256) + + # Test that we can read data back from a sharded tenant + workload.validate() + + # Validate that the data is spread across pageservers + sizes_after = get_sizes() + # Our sizes increased when we wrote data + assert sum(sizes_after.values()) > sum(sizes_before.values()) + # That increase is present on all shards + assert all(sizes_after[ps.id] > sizes_before[ps.id] for ps in env.pageservers) + + # Validate that timeline list API works properly on all shards + for shard in shards: + node_id = int(shard["node_id"]) + pageserver = 
pageservers[node_id] + timelines = set( + TimelineId(tl["timeline_id"]) + for tl in pageserver.http_client().timeline_list(shard["shard_id"]) + ) + assert timelines == {env.initial_timeline, timeline_b} + + env.storage_controller.consistency_check() + + +def test_sharding_split_unsharded( + neon_env_builder: NeonEnvBuilder, +): + """ + Test that shard splitting works on a tenant created as unsharded (i.e. with + ShardCount(0)). + """ + env = neon_env_builder.init_start() + tenant_id = env.initial_tenant + timeline_id = env.initial_timeline + + # Check that we created with an unsharded TenantShardId: this is the default, + # but check it in case we change the default in future + assert env.storage_controller.inspect(TenantShardId(tenant_id, 0, 0)) is not None + + workload = Workload(env, tenant_id, timeline_id, branch_name="main") + workload.init() + workload.write_rows(256) + workload.validate() + + # Split one shard into two + env.storage_controller.tenant_shard_split(tenant_id, shard_count=2) + + # Check we got the shard IDs we expected + assert env.storage_controller.inspect(TenantShardId(tenant_id, 0, 2)) is not None + assert env.storage_controller.inspect(TenantShardId(tenant_id, 1, 2)) is not None + + workload.validate() + + env.storage_controller.consistency_check() + + +def test_sharding_split_smoke( + neon_env_builder: NeonEnvBuilder, +): + """ + Test the basics of shard splitting: + - The API results in more shards than we started with + - The tenant's data remains readable + + """ + + # We will start with 4 shards and split into 8, then migrate all those + # 8 shards onto separate pageservers + shard_count = 4 + split_shard_count = 8 + neon_env_builder.num_pageservers = split_shard_count + + # 1MiB stripes: enable getting some meaningful data distribution without + # writing large quantities of data in this test. The stripe size is given + # in number of 8KiB pages. 
+ stripe_size = 128 + + # Use S3-compatible remote storage so that we can scrub: this test validates + # that the scrubber doesn't barf when it sees a sharded tenant. + neon_env_builder.enable_pageserver_remote_storage(s3_storage()) + neon_env_builder.enable_scrub_on_exit() + + neon_env_builder.preserve_database_files = True + + env = neon_env_builder.init_start( + initial_tenant_shard_count=shard_count, initial_tenant_shard_stripe_size=stripe_size + ) + tenant_id = env.initial_tenant + timeline_id = env.initial_timeline + workload = Workload(env, tenant_id, timeline_id, branch_name="main") + workload.init() + + # Initial data + workload.write_rows(256) + + # Note which pageservers initially hold a shard after tenant creation + pre_split_pageserver_ids = [loc["node_id"] for loc in env.storage_controller.locate(tenant_id)] + + # For pageservers holding a shard, validate their ingest statistics + # reflect a proper splitting of the WAL. + for pageserver in env.pageservers: + if pageserver.id not in pre_split_pageserver_ids: + continue + + metrics = pageserver.http_client().get_metrics_values( + [ + "pageserver_wal_ingest_records_received_total", + "pageserver_wal_ingest_records_committed_total", + "pageserver_wal_ingest_records_filtered_total", + ] + ) + + log.info(f"Pageserver {pageserver.id} metrics: {metrics}") + + # Not everything received was committed + assert ( + metrics["pageserver_wal_ingest_records_received_total"] + > metrics["pageserver_wal_ingest_records_committed_total"] + ) + + # Something was committed + assert metrics["pageserver_wal_ingest_records_committed_total"] > 0 + + # Counts are self consistent + assert ( + metrics["pageserver_wal_ingest_records_received_total"] + == metrics["pageserver_wal_ingest_records_committed_total"] + + metrics["pageserver_wal_ingest_records_filtered_total"] + ) + + # TODO: validate that shards have different sizes + + workload.validate() + + assert len(pre_split_pageserver_ids) == 4 + + def shards_on_disk(shard_ids): 
+ for pageserver in env.pageservers: + for shard_id in shard_ids: + if pageserver.tenant_dir(shard_id).exists(): + return True + + return False + + old_shard_ids = [TenantShardId(tenant_id, i, shard_count) for i in range(0, shard_count)] + # Before split, old shards exist + assert shards_on_disk(old_shard_ids) + + env.storage_controller.tenant_shard_split(tenant_id, shard_count=split_shard_count) + + post_split_pageserver_ids = [loc["node_id"] for loc in env.storage_controller.locate(tenant_id)] + # We should have split into 8 shards, on the same 4 pageservers we started on. + assert len(post_split_pageserver_ids) == split_shard_count + assert len(set(post_split_pageserver_ids)) == shard_count + assert set(post_split_pageserver_ids) == set(pre_split_pageserver_ids) + + # The old parent shards should no longer exist on disk + assert not shards_on_disk(old_shard_ids) + + workload.validate() + + workload.churn_rows(256) + + workload.validate() + + # Run GC on all new shards, to check they don't barf or delete anything that breaks reads + # (compaction was already run as part of churn_rows) + all_shards = tenant_get_shards(env, tenant_id) + for tenant_shard_id, pageserver in all_shards: + pageserver.http_client().timeline_gc(tenant_shard_id, timeline_id, None) + workload.validate() + + migrate_to_pageserver_ids = list( + set(p.id for p in env.pageservers) - set(pre_split_pageserver_ids) + ) + assert len(migrate_to_pageserver_ids) == split_shard_count - shard_count + + # Migrate shards away from the node where the split happened + for ps_id in pre_split_pageserver_ids: + shards_here = [ + tenant_shard_id + for (tenant_shard_id, pageserver) in all_shards + if pageserver.id == ps_id + ] + assert len(shards_here) == 2 + migrate_shard = shards_here[0] + destination = migrate_to_pageserver_ids.pop() + + log.info(f"Migrating shard {migrate_shard} from {ps_id} to {destination}") + env.neon_cli.tenant_migrate(migrate_shard, destination, timeout_secs=10) + + workload.validate() 
+ + # Check that we didn't do any spurious reconciliations. + # Total number of reconciles should have been one per original shard, plus + # one for each shard that was migrated. + reconcile_ok = env.storage_controller.get_metric_value( + "storage_controller_reconcile_complete_total", filter={"status": "ok"} + ) + assert reconcile_ok == shard_count + split_shard_count // 2 + + # Check that no cancelled or errored reconciliations occurred: this test does no + # failure injection and should run clean. + assert ( + env.storage_controller.get_metric_value( + "storage_controller_reconcile_complete_total", filter={"status": "cancel"} + ) + is None + ) + assert ( + env.storage_controller.get_metric_value( + "storage_controller_reconcile_complete_total", filter={"status": "error"} + ) + is None + ) + + env.storage_controller.consistency_check() + + # Validate pageserver state + shards_exist: list[TenantShardId] = [] + for pageserver in env.pageservers: + locations = pageserver.http_client().tenant_list_locations() + shards_exist.extend(TenantShardId.parse(s[0]) for s in locations["tenant_shards"]) + + log.info(f"Shards after split: {shards_exist}") + assert len(shards_exist) == split_shard_count + + # Ensure post-split pageserver locations survive a restart (i.e.
the child shards + # correctly wrote config to disk, and the storage controller responds correctly + # to /re-attach) + for pageserver in env.pageservers: + pageserver.stop() + pageserver.start() + + shards_exist = [] + for pageserver in env.pageservers: + locations = pageserver.http_client().tenant_list_locations() + shards_exist.extend(TenantShardId.parse(s[0]) for s in locations["tenant_shards"]) + + log.info(f"Shards after restart: {shards_exist}") + assert len(shards_exist) == split_shard_count + + workload.validate() + + +@pytest.mark.parametrize("initial_stripe_size", [None, 65536]) +def test_sharding_split_stripe_size( + neon_env_builder: NeonEnvBuilder, + httpserver: HTTPServer, + httpserver_listen_address, + initial_stripe_size: int, +): + """ + Check that modifying stripe size inline with a shard split works as expected + """ + (host, port) = httpserver_listen_address + neon_env_builder.control_plane_compute_hook_api = f"http://{host}:{port}/notify" + neon_env_builder.num_pageservers = 1 + + # Set up fake HTTP notify endpoint: we will use this to validate that we receive + # the correct stripe size after split.
+ notifications = [] + + def handler(request: Request): + log.info(f"Notify request: {request}") + notifications.append(request.json) + return Response(status=200) + + httpserver.expect_request("/notify", method="PUT").respond_with_handler(handler) + + env = neon_env_builder.init_start( + initial_tenant_shard_count=1, initial_tenant_shard_stripe_size=initial_stripe_size + ) + tenant_id = env.initial_tenant + + assert len(notifications) == 1 + expect: Dict[str, Union[List[Dict[str, int]], str, None, int]] = { + "tenant_id": str(env.initial_tenant), + "stripe_size": None, + "shards": [{"node_id": int(env.pageservers[0].id), "shard_number": 0}], + } + assert notifications[0] == expect + + new_stripe_size = 2048 + env.storage_controller.tenant_shard_split( + tenant_id, shard_count=2, shard_stripe_size=new_stripe_size + ) + + # Check that we ended up with the stripe size that we expected, both on the pageserver + # and in the notifications to compute + assert len(notifications) == 2 + expect_after: Dict[str, Union[List[Dict[str, int]], str, None, int]] = { + "tenant_id": str(env.initial_tenant), + "stripe_size": new_stripe_size, + "shards": [ + {"node_id": int(env.pageservers[0].id), "shard_number": 0}, + {"node_id": int(env.pageservers[0].id), "shard_number": 1}, + ], + } + log.info(f"Got notification: {notifications[1]}") + assert notifications[1] == expect_after + + # Inspect the stripe size on the pageserver + shard_0_loc = ( + env.pageservers[0].http_client().tenant_get_location(TenantShardId(tenant_id, 0, 2)) + ) + assert shard_0_loc["shard_stripe_size"] == new_stripe_size + shard_1_loc = ( + env.pageservers[0].http_client().tenant_get_location(TenantShardId(tenant_id, 1, 2)) + ) + assert shard_1_loc["shard_stripe_size"] == new_stripe_size + + # Ensure stripe size survives a pageserver restart + env.pageservers[0].stop() + env.pageservers[0].start() + shard_0_loc = ( + env.pageservers[0].http_client().tenant_get_location(TenantShardId(tenant_id, 0, 2)) + ) + 
assert shard_0_loc["shard_stripe_size"] == new_stripe_size + shard_1_loc = ( + env.pageservers[0].http_client().tenant_get_location(TenantShardId(tenant_id, 1, 2)) + ) + assert shard_1_loc["shard_stripe_size"] == new_stripe_size + + # Ensure stripe size survives a storage controller restart + env.storage_controller.stop() + env.storage_controller.start() + + def assert_restart_notification(): + assert len(notifications) == 3 + assert notifications[2] == expect_after + + wait_until(10, 1, assert_restart_notification) + + +@pytest.mark.skipif( + # The quantity of data isn't huge, but debug can be _very_ slow, and the things we're + # validating in this test don't benefit much from debug assertions. + os.getenv("BUILD_TYPE") == "debug", + reason="Avoid running bulkier ingest tests in debug mode", +) +def test_sharding_ingest( + neon_env_builder: NeonEnvBuilder, +): + """ + Check behaviors related to ingest: + - That we generate properly sized layers + - TODO: that updates to remote_consistent_lsn are made correctly via safekeepers + """ + + # Set a small stripe size and checkpoint distance, so that we can exercise rolling logic + # without writing a lot of data. + expect_layer_size = 131072 + TENANT_CONF = { + # small checkpointing and compaction targets to ensure we generate many upload operations + "checkpoint_distance": f"{expect_layer_size}", + "compaction_target_size": f"{expect_layer_size}", + } + shard_count = 4 + neon_env_builder.num_pageservers = shard_count + env = neon_env_builder.init_start( + initial_tenant_conf=TENANT_CONF, + initial_tenant_shard_count=shard_count, + # A stripe size the same order of magnitude as layer size: this ensures that + # within checkpoint_distance some shards will have no data to ingest, if LSN + # contains sequential page writes. This test checks that this kind of + # scenario doesn't result in some shards emitting empty/tiny layers. 
+ initial_tenant_shard_stripe_size=expect_layer_size // 8192, + ) + tenant_id = env.initial_tenant + timeline_id = env.initial_timeline + + workload = Workload(env, tenant_id, timeline_id) + workload.init() + workload.write_rows(4096, upload=False) + workload.write_rows(4096, upload=False) + workload.write_rows(4096, upload=False) + workload.write_rows(4096, upload=False) + workload.validate() + + small_layer_count = 0 + ok_layer_count = 0 + huge_layer_count = 0 + + # Inspect the resulting layer map, count how many layers are undersized. + for shard in env.storage_controller.locate(tenant_id): + pageserver = env.get_pageserver(shard["node_id"]) + shard_id = shard["shard_id"] + layer_map = pageserver.http_client().layer_map_info(shard_id, timeline_id) + + for layer in layer_map.historic_layers: + assert layer.layer_file_size is not None + if layer.layer_file_size < expect_layer_size // 2: + classification = "Small" + small_layer_count += 1 + elif layer.layer_file_size > expect_layer_size * 2: + classification = "Huge " + huge_layer_count += 1 + else: + classification = "OK " + ok_layer_count += 1 + + if layer.kind == "Delta": + assert layer.lsn_end is not None + lsn_size = Lsn(layer.lsn_end) - Lsn(layer.lsn_start) + else: + lsn_size = 0 + + log.info( + f"{classification} layer[{pageserver.id}]: {layer.layer_file_name} (size {layer.layer_file_size}, LSN distance {lsn_size})" + ) + + # Why an inexact check? + # - Because we roll layers on checkpoint_distance * shard_count, we expect to obey the target + # layer size on average, but it is still possible to write some tiny layers. + log.info(f"Totals: {small_layer_count} small layers, {ok_layer_count} ok layers") + if small_layer_count <= shard_count: + # If each shard has <= 1 small layer + pass + else: + # General case: + assert float(small_layer_count) / float(ok_layer_count) < 0.25 + + # Each shard may emit up to one huge layer, because initdb ingest doesn't respect checkpoint_distance. 
+ assert huge_layer_count <= shard_count diff --git a/test_runner/regress/test_sharding_service.py b/test_runner/regress/test_sharding_service.py new file mode 100644 index 0000000000..7a0707b564 --- /dev/null +++ b/test_runner/regress/test_sharding_service.py @@ -0,0 +1,771 @@ +import time +from collections import defaultdict +from datetime import datetime, timezone +from typing import Any, Dict, List, Union + +import pytest +from fixtures.log_helper import log +from fixtures.neon_fixtures import ( + NeonEnv, + NeonEnvBuilder, + PgBin, + StorageControllerApiException, + TokenScope, +) +from fixtures.pageserver.http import PageserverHttpClient +from fixtures.pageserver.utils import ( + MANY_SMALL_LAYERS_TENANT_CONFIG, + enable_remote_storage_versioning, + list_prefix, + remote_storage_delete_key, + tenant_delete_wait_completed, + timeline_delete_wait_completed, +) +from fixtures.pg_version import PgVersion +from fixtures.remote_storage import RemoteStorageKind, s3_storage +from fixtures.types import TenantId, TimelineId +from fixtures.utils import run_pg_bench_small, wait_until +from mypy_boto3_s3.type_defs import ( + ObjectTypeDef, +) +from pytest_httpserver import HTTPServer +from werkzeug.wrappers.request import Request +from werkzeug.wrappers.response import Response + + +def get_node_shard_counts(env: NeonEnv, tenant_ids): + counts: defaultdict[str, int] = defaultdict(int) + for tid in tenant_ids: + for shard in env.storage_controller.locate(tid): + counts[shard["node_id"]] += 1 + return counts + + +def test_sharding_service_smoke( + neon_env_builder: NeonEnvBuilder, +): + """ + Test the basic lifecycle of a sharding service: + - Restarting + - Restarting a pageserver + - Creating and deleting tenants and timelines + - Marking a pageserver offline + """ + + neon_env_builder.num_pageservers = 3 + env = neon_env_builder.init_configs() + + for pageserver in env.pageservers: + # This test detaches tenants during migration, which can race with deletion queue 
operations, + # during detach we only do an advisory flush, we don't wait for it. + pageserver.allowed_errors.extend([".*Dropped remote consistent LSN updates.*"]) + + # Start services by hand so that we can skip a pageserver (this will start + register later) + env.broker.try_start() + env.storage_controller.start() + env.pageservers[0].start() + env.pageservers[1].start() + for sk in env.safekeepers: + sk.start() + + # The pageservers we started should have registered with the sharding service on startup + nodes = env.storage_controller.node_list() + assert len(nodes) == 2 + assert set(n["id"] for n in nodes) == {env.pageservers[0].id, env.pageservers[1].id} + + # Starting an additional pageserver should register successfully + env.pageservers[2].start() + nodes = env.storage_controller.node_list() + assert len(nodes) == 3 + assert set(n["id"] for n in nodes) == {ps.id for ps in env.pageservers} + + # Use a multiple of pageservers to get nice even number of shards on each one + tenant_shard_count = len(env.pageservers) * 4 + tenant_count = len(env.pageservers) * 2 + shards_per_tenant = tenant_shard_count // tenant_count + tenant_ids = set(TenantId.generate() for i in range(0, tenant_count)) + + # Creating several tenants should spread out across the pageservers + for tid in tenant_ids: + env.neon_cli.create_tenant(tid, shard_count=shards_per_tenant) + + for node_id, count in get_node_shard_counts(env, tenant_ids).items(): + # we used a multiple of pageservers for the total shard count, + # so expect equal number on all pageservers + assert count == tenant_shard_count / len( + env.pageservers + ), f"Node {node_id} has bad count {count}" + + # Creating and deleting timelines should work, using identical API to pageserver + timeline_crud_tenant = next(iter(tenant_ids)) + timeline_id = TimelineId.generate() + env.storage_controller.pageserver_api().timeline_create( + pg_version=PgVersion.NOT_SET, tenant_id=timeline_crud_tenant, new_timeline_id=timeline_id + ) +
timelines = env.storage_controller.pageserver_api().timeline_list(timeline_crud_tenant) + assert len(timelines) == 2 + assert timeline_id in set(TimelineId(t["timeline_id"]) for t in timelines) + # virtual_ps_http.timeline_delete(tenant_id=timeline_crud_tenant, timeline_id=timeline_id) + timeline_delete_wait_completed( + env.storage_controller.pageserver_api(), timeline_crud_tenant, timeline_id + ) + timelines = env.storage_controller.pageserver_api().timeline_list(timeline_crud_tenant) + assert len(timelines) == 1 + assert timeline_id not in set(TimelineId(t["timeline_id"]) for t in timelines) + + # Marking a pageserver offline should migrate tenants away from it. + env.storage_controller.node_configure(env.pageservers[0].id, {"availability": "Offline"}) + + def node_evacuated(node_id: int) -> None: + counts = get_node_shard_counts(env, tenant_ids) + assert counts[node_id] == 0 + + wait_until(10, 1, lambda: node_evacuated(env.pageservers[0].id)) + + # Marking pageserver active should not migrate anything to it + # immediately + env.storage_controller.node_configure(env.pageservers[0].id, {"availability": "Active"}) + time.sleep(1) + assert get_node_shard_counts(env, tenant_ids)[env.pageservers[0].id] == 0 + + # Restarting a pageserver should not detach any tenants (i.e. 
/re-attach works) + before_restart = env.pageservers[1].http_client().tenant_list_locations() + env.pageservers[1].stop() + env.pageservers[1].start() + after_restart = env.pageservers[1].http_client().tenant_list_locations() + assert len(after_restart) == len(before_restart) + + # Locations should be the same before & after restart, apart from generations + for _shard_id, tenant in after_restart["tenant_shards"]: + del tenant["generation"] + for _shard_id, tenant in before_restart["tenant_shards"]: + del tenant["generation"] + assert before_restart == after_restart + + # Delete all the tenants + for tid in tenant_ids: + tenant_delete_wait_completed(env.storage_controller.pageserver_api(), tid, 10) + + env.storage_controller.consistency_check() + + # Set a scheduling policy on one node, create all the tenants, observe + # that the scheduling policy is respected. + env.storage_controller.node_configure(env.pageservers[1].id, {"scheduling": "Draining"}) + + # Create some fresh tenants + tenant_ids = set(TenantId.generate() for i in range(0, tenant_count)) + for tid in tenant_ids: + env.neon_cli.create_tenant(tid, shard_count=shards_per_tenant) + + counts = get_node_shard_counts(env, tenant_ids) + # Nothing should have been scheduled on the node in Draining + assert counts[env.pageservers[1].id] == 0 + assert counts[env.pageservers[0].id] == tenant_shard_count // 2 + assert counts[env.pageservers[2].id] == tenant_shard_count // 2 + + env.storage_controller.consistency_check() + + +def test_node_status_after_restart( + neon_env_builder: NeonEnvBuilder, +): + neon_env_builder.num_pageservers = 2 + env = neon_env_builder.init_start() + + # Initially we have two online pageservers + nodes = env.storage_controller.node_list() + assert len(nodes) == 2 + + env.pageservers[1].stop() + + env.storage_controller.stop() + env.storage_controller.start() + + def is_ready(): + assert env.storage_controller.ready() is True + + wait_until(30, 1, is_ready) + + # We loaded nodes from 
database on restart + nodes = env.storage_controller.node_list() + assert len(nodes) == 2 + + # We should still be able to create a tenant, because the pageserver which is still online + # should have had its availability state set to Active. + env.storage_controller.tenant_create(TenantId.generate()) + + env.storage_controller.consistency_check() + + +def test_sharding_service_passthrough( + neon_env_builder: NeonEnvBuilder, +): + """ + For simple timeline/tenant GET APIs that don't require coordination across + shards, the sharding service implements a proxy to shard zero. This test + calls those APIs. + """ + neon_env_builder.num_pageservers = 2 + env = neon_env_builder.init_start() + + # We will talk to storage controller as if it was a pageserver, using the pageserver + # HTTP client + client = PageserverHttpClient(env.storage_controller_port, lambda: True) + timelines = client.timeline_list(tenant_id=env.initial_tenant) + assert len(timelines) == 1 + + status = client.tenant_status(env.initial_tenant) + assert TenantId(status["id"]) == env.initial_tenant + assert set(TimelineId(t) for t in status["timelines"]) == { + env.initial_timeline, + } + assert status["state"]["slug"] == "Active" + + env.storage_controller.consistency_check() + + +def test_sharding_service_restart(neon_env_builder: NeonEnvBuilder): + env = neon_env_builder.init_start() + tenant_a = env.initial_tenant + tenant_b = TenantId.generate() + env.storage_controller.tenant_create(tenant_b) + env.pageserver.tenant_detach(tenant_a) + + # TODO: extend this test to use multiple pageservers, and check that locations don't move around + # on restart.
+ + # Storage controller restart + env.storage_controller.stop() + env.storage_controller.start() + + observed = set(TenantId(tenant["id"]) for tenant in env.pageserver.http_client().tenant_list()) + + # Tenant A should remain detached + assert tenant_a not in observed + + # Tenant B should still be attached + assert tenant_b in observed + + # Pageserver restart + env.pageserver.stop() + env.pageserver.start() + + # Same assertions as above: restarting either service should not perturb things + observed = set(TenantId(tenant["id"]) for tenant in env.pageserver.http_client().tenant_list()) + assert tenant_a not in observed + assert tenant_b in observed + + env.storage_controller.consistency_check() + + +@pytest.mark.parametrize("warm_up", [True, False]) +def test_sharding_service_onboarding(neon_env_builder: NeonEnvBuilder, warm_up: bool): + """ + We onboard tenants to the sharding service by treating it as a 'virtual pageserver' + which provides the /location_config API. This is similar to creating a tenant, + but imports the generation number. + """ + + neon_env_builder.num_pageservers = 2 + + # Start services by hand so that we can skip registration on one of the pageservers + env = neon_env_builder.init_configs() + env.broker.try_start() + env.storage_controller.start() + + # This is the pageserver where we'll initially create the tenant. Run it in emergency + # mode so that it doesn't talk to storage controller, and do not register it.
+ env.pageservers[0].allowed_errors.append(".*Emergency mode!.*") + env.pageservers[0].start( + overrides=("--pageserver-config-override=control_plane_emergency_mode=true",), + ) + origin_ps = env.pageservers[0] + + # This is the pageserver managed by the sharding service, where the tenant + # will be attached after onboarding + env.pageservers[1].start() + dest_ps = env.pageservers[1] + virtual_ps_http = PageserverHttpClient(env.storage_controller_port, lambda: True) + + for sk in env.safekeepers: + sk.start() + + # Create a tenant directly via pageserver HTTP API, skipping the storage controller + tenant_id = TenantId.generate() + generation = 123 + origin_ps.http_client().tenant_create(tenant_id, generation=generation) + + # As if doing a live migration, first configure origin into stale mode + origin_ps.http_client().tenant_location_conf( + tenant_id, + { + "mode": "AttachedStale", + "secondary_conf": None, + "tenant_conf": {}, + "generation": generation, + }, + ) + + if warm_up: + origin_ps.http_client().tenant_heatmap_upload(tenant_id) + + # We expect to be called via live migration code, which may try to configure the tenant into secondary + # mode before attaching it. 
+ virtual_ps_http.tenant_location_conf( + tenant_id, + { + "mode": "Secondary", + "secondary_conf": {"warm": True}, + "tenant_conf": {}, + "generation": None, + }, + ) + + virtual_ps_http.tenant_secondary_download(tenant_id) + + # Call into storage controller to onboard the tenant + generation += 1 + virtual_ps_http.tenant_location_conf( + tenant_id, + { + "mode": "AttachedMulti", + "secondary_conf": None, + "tenant_conf": {}, + "generation": generation, + }, + ) + + # As if doing a live migration, detach the original pageserver + origin_ps.http_client().tenant_location_conf( + tenant_id, + { + "mode": "Detached", + "secondary_conf": None, + "tenant_conf": {}, + "generation": None, + }, + ) + + # As if doing a live migration, call into the storage controller to + # set it to AttachedSingle: this is a no-op, but we test it because the + # cloud control plane may call this for symmetry with live migration to + # an individual pageserver + virtual_ps_http.tenant_location_conf( + tenant_id, + { + "mode": "AttachedSingle", + "secondary_conf": None, + "tenant_conf": {}, + "generation": generation, + }, + ) + + # We should see the tenant is now attached to the pageserver managed + # by the sharding service + origin_tenants = origin_ps.http_client().tenant_list() + assert len(origin_tenants) == 0 + dest_tenants = dest_ps.http_client().tenant_list() + assert len(dest_tenants) == 1 + assert TenantId(dest_tenants[0]["id"]) == tenant_id + + # sharding service advances generation by 1 when it first attaches. We started + # with a nonzero generation so this equality also proves that the generation + # was properly carried over during onboarding. 
+ assert dest_tenants[0]["generation"] == generation + 1 + + # The onboarded tenant should survive a restart of sharding service + env.storage_controller.stop() + env.storage_controller.start() + + # The onboarded tenant should survive a restart of pageserver + dest_ps.stop() + dest_ps.start() + + # Having onboarded via /location_config, we should also be able to update the + # TenantConf part of LocationConf, without inadvertently resetting the generation + modified_tenant_conf = {"max_lsn_wal_lag": 1024 * 1024 * 1024 * 100} + dest_tenant_before_conf_change = dest_ps.http_client().tenant_status(tenant_id) + + # The generation has moved on since we onboarded + assert generation != dest_tenant_before_conf_change["generation"] + + virtual_ps_http.tenant_location_conf( + tenant_id, + { + "mode": "AttachedSingle", + "secondary_conf": None, + "tenant_conf": modified_tenant_conf, + # This is intentionally a stale generation + "generation": generation, + }, + ) + dest_tenant_after_conf_change = dest_ps.http_client().tenant_status(tenant_id) + assert ( + dest_tenant_after_conf_change["generation"] == dest_tenant_before_conf_change["generation"] + ) + dest_tenant_conf_after = dest_ps.http_client().tenant_config(tenant_id) + assert dest_tenant_conf_after.tenant_specific_overrides == modified_tenant_conf + + env.storage_controller.consistency_check() + + +def test_sharding_service_compute_hook( + httpserver: HTTPServer, + neon_env_builder: NeonEnvBuilder, + httpserver_listen_address, +): + """ + Test that the sharding service calls out to the configured HTTP endpoint on attachment changes + """ + + # We will run two pageservers to migrate and check that the storage controller sends notifications + # when migrating.
+ neon_env_builder.num_pageservers = 2 + (host, port) = httpserver_listen_address + neon_env_builder.control_plane_compute_hook_api = f"http://{host}:{port}/notify" + + # Set up fake HTTP notify endpoint + notifications = [] + + def handler(request: Request): + log.info(f"Notify request: {request}") + notifications.append(request.json) + return Response(status=200) + + httpserver.expect_request("/notify", method="PUT").respond_with_handler(handler) + + # Start running + env = neon_env_builder.init_start() + + # We will do an unclean migration, which will result in deletion queue warnings + env.pageservers[0].allowed_errors.append(".*Dropped remote consistent LSN updates for tenant.*") + + # Initial notification from tenant creation + assert len(notifications) == 1 + expect: Dict[str, Union[List[Dict[str, int]], str, None, int]] = { + "tenant_id": str(env.initial_tenant), + "stripe_size": None, + "shards": [{"node_id": int(env.pageservers[0].id), "shard_number": 0}], + } + assert notifications[0] == expect + + env.storage_controller.node_configure(env.pageservers[0].id, {"availability": "Offline"}) + + def node_evacuated(node_id: int) -> None: + counts = get_node_shard_counts(env, [env.initial_tenant]) + assert counts[node_id] == 0 + + wait_until(10, 1, lambda: node_evacuated(env.pageservers[0].id)) + + # Additional notification from migration + log.info(f"notifications: {notifications}") + expect = { + "tenant_id": str(env.initial_tenant), + "stripe_size": None, + "shards": [{"node_id": int(env.pageservers[1].id), "shard_number": 0}], + } + + def received_migration_notification(): + assert len(notifications) == 2 + assert notifications[1] == expect + + wait_until(20, 0.25, received_migration_notification) + + # When we restart, we should re-emit notifications for all tenants + env.storage_controller.stop() + env.storage_controller.start() + + def received_restart_notification(): + assert len(notifications) == 3 + assert notifications[2] == expect + + wait_until(10,
1, received_restart_notification) + + # Splitting a tenant should cause its stripe size to become visible in the compute notification + env.storage_controller.tenant_shard_split(env.initial_tenant, shard_count=2) + expect = { + "tenant_id": str(env.initial_tenant), + "stripe_size": 32768, + "shards": [ + {"node_id": int(env.pageservers[1].id), "shard_number": 0}, + {"node_id": int(env.pageservers[1].id), "shard_number": 1}, + ], + } + + def received_split_notification(): + assert len(notifications) == 4 + assert notifications[3] == expect + + wait_until(10, 1, received_split_notification) + + env.storage_controller.consistency_check() + + +def test_sharding_service_debug_apis(neon_env_builder: NeonEnvBuilder): + """ + Verify that occasional-use debug APIs work as expected. This is a lightweight test + that just hits the endpoints to check that they don't bitrot. + """ + + neon_env_builder.num_pageservers = 2 + env = neon_env_builder.init_start() + + tenant_id = TenantId.generate() + env.storage_controller.tenant_create(tenant_id, shard_count=2, shard_stripe_size=8192) + + # Check that the consistency check passes on a freshly setup system + env.storage_controller.consistency_check() + + # These APIs are intentionally not implemented as methods on NeonStorageController, as + # they're just for use in unanticipated circumstances. 
+ + # Initial tenant (1 shard) and the one we just created (2 shards) should be visible + response = env.storage_controller.request( + "GET", + f"{env.storage_controller_api}/debug/v1/tenant", + headers=env.storage_controller.headers(TokenScope.ADMIN), + ) + assert len(response.json()) == 3 + + # Scheduler should report the expected nodes and shard counts + response = env.storage_controller.request( + "GET", f"{env.storage_controller_api}/debug/v1/scheduler" + ) + # Two nodes, in a dict of node_id->node + assert len(response.json()["nodes"]) == 2 + assert sum(v["shard_count"] for v in response.json()["nodes"].values()) == 3 + assert all(v["may_schedule"] for v in response.json()["nodes"].values()) + + response = env.storage_controller.request( + "POST", + f"{env.storage_controller_api}/debug/v1/node/{env.pageservers[1].id}/drop", + headers=env.storage_controller.headers(TokenScope.ADMIN), + ) + assert len(env.storage_controller.node_list()) == 1 + + response = env.storage_controller.request( + "POST", + f"{env.storage_controller_api}/debug/v1/tenant/{tenant_id}/drop", + headers=env.storage_controller.headers(TokenScope.ADMIN), + ) + + # Tenant drop should be reflected in dump output + response = env.storage_controller.request( + "GET", + f"{env.storage_controller_api}/debug/v1/tenant", + headers=env.storage_controller.headers(TokenScope.ADMIN), + ) + assert len(response.json()) == 1 + + # Check that the 'drop' APIs didn't leave things in a state that would fail a consistency check: they're + # meant to be unclean wrt the pageserver state, but not leave a broken storage controller behind. 
+ env.storage_controller.consistency_check() + + +def test_sharding_service_s3_time_travel_recovery( + neon_env_builder: NeonEnvBuilder, + pg_bin: PgBin, +): + """ + Test for S3 time travel + """ + + remote_storage_kind = s3_storage() + neon_env_builder.enable_pageserver_remote_storage(remote_storage_kind) + + # Mock S3 doesn't have versioning enabled by default, enable it + # (also do it before there is any writes to the bucket) + if remote_storage_kind == RemoteStorageKind.MOCK_S3: + remote_storage = neon_env_builder.pageserver_remote_storage + assert remote_storage, "remote storage not configured" + enable_remote_storage_versioning(remote_storage) + + neon_env_builder.num_pageservers = 1 + + env = neon_env_builder.init_start() + virtual_ps_http = PageserverHttpClient(env.storage_controller_port, lambda: True) + + tenant_id = TenantId.generate() + env.storage_controller.tenant_create( + tenant_id, + shard_count=2, + shard_stripe_size=8192, + tenant_config=MANY_SMALL_LAYERS_TENANT_CONFIG, + ) + + # Check that the consistency check passes + env.storage_controller.consistency_check() + + branch_name = "main" + timeline_id = env.neon_cli.create_timeline( + branch_name, + tenant_id=tenant_id, + ) + # Write some nontrivial amount of data into the endpoint and wait until it is uploaded + with env.endpoints.create_start("main", tenant_id=tenant_id) as endpoint: + run_pg_bench_small(pg_bin, endpoint.connstr()) + endpoint.safe_psql("CREATE TABLE created_foo(id integer);") + # last_flush_lsn_upload(env, endpoint, tenant_id, timeline_id) + + # Give the data time to be uploaded + time.sleep(4) + + # Detach the tenant + virtual_ps_http.tenant_location_conf( + tenant_id, + { + "mode": "Detached", + "secondary_conf": None, + "tenant_conf": {}, + "generation": None, + }, + ) + + time.sleep(4) + ts_before_disaster = datetime.now(tz=timezone.utc).replace(tzinfo=None) + time.sleep(4) + + # Simulate a "disaster": delete some random files from remote storage for one of the shards + 
assert env.pageserver_remote_storage + shard_id_for_list = "0002" + objects: List[ObjectTypeDef] = list_prefix( + env.pageserver_remote_storage, + f"tenants/{tenant_id}-{shard_id_for_list}/timelines/{timeline_id}/", + ).get("Contents", []) + assert len(objects) > 1 + log.info(f"Found {len(objects)} objects in remote storage") + should_delete = False + for obj in objects: + obj_key = obj["Key"] + should_delete = not should_delete + if not should_delete: + log.info(f"Keeping key on remote storage: {obj_key}") + continue + log.info(f"Deleting key from remote storage: {obj_key}") + remote_storage_delete_key(env.pageserver_remote_storage, obj_key) + pass + + time.sleep(4) + ts_after_disaster = datetime.now(tz=timezone.utc).replace(tzinfo=None) + time.sleep(4) + + # Do time travel recovery + virtual_ps_http.tenant_time_travel_remote_storage( + tenant_id, ts_before_disaster, ts_after_disaster, shard_counts=[2] + ) + time.sleep(4) + + # Attach the tenant again + virtual_ps_http.tenant_location_conf( + tenant_id, + { + "mode": "AttachedSingle", + "secondary_conf": None, + "tenant_conf": {}, + "generation": 100, + }, + ) + + with env.endpoints.create_start("main", tenant_id=tenant_id) as endpoint: + endpoint.safe_psql("SELECT * FROM created_foo;") + + env.storage_controller.consistency_check() + + +def test_sharding_service_auth(neon_env_builder: NeonEnvBuilder): + neon_env_builder.auth_enabled = True + env = neon_env_builder.init_start() + svc = env.storage_controller + api = env.storage_controller_api + + tenant_id = TenantId.generate() + body: Dict[str, Any] = {"new_tenant_id": str(tenant_id)} + + # No token + with pytest.raises( + StorageControllerApiException, + match="Unauthorized: missing authorization header", + ): + svc.request("POST", f"{env.storage_controller_api}/v1/tenant", json=body) + + # Token with incorrect scope + with pytest.raises( + StorageControllerApiException, + match="Forbidden: JWT authentication error", + ): + svc.request("POST", 
f"{api}/v1/tenant", json=body, headers=svc.headers(TokenScope.ADMIN)) + + # Token with correct scope + svc.request( + "POST", f"{api}/v1/tenant", json=body, headers=svc.headers(TokenScope.PAGE_SERVER_API) + ) + + # No token + with pytest.raises( + StorageControllerApiException, + match="Unauthorized: missing authorization header", + ): + svc.request("GET", f"{api}/debug/v1/tenant") + + # Token with incorrect scope + with pytest.raises( + StorageControllerApiException, + match="Forbidden: JWT authentication error", + ): + svc.request( + "GET", f"{api}/debug/v1/tenant", headers=svc.headers(TokenScope.GENERATIONS_API) + ) + + # No token + with pytest.raises( + StorageControllerApiException, + match="Unauthorized: missing authorization header", + ): + svc.request("POST", f"{api}/upcall/v1/re-attach") + + # Token with incorrect scope + with pytest.raises( + StorageControllerApiException, + match="Forbidden: JWT authentication error", + ): + svc.request( + "POST", f"{api}/upcall/v1/re-attach", headers=svc.headers(TokenScope.PAGE_SERVER_API) + ) + + +def test_sharding_service_tenant_conf(neon_env_builder: NeonEnvBuilder): + """ + Validate the pageserver-compatible API endpoints for setting and getting tenant conf, without + supplying the whole LocationConf. 
+ """ + + env = neon_env_builder.init_start() + tenant_id = env.initial_tenant + + http = env.storage_controller.pageserver_api() + + default_value = "7days" + new_value = "1h" + http.set_tenant_config(tenant_id, {"pitr_interval": new_value}) + + # Ensure the change landed on the storage controller + readback_controller = http.tenant_config(tenant_id) + assert readback_controller.effective_config["pitr_interval"] == new_value + assert readback_controller.tenant_specific_overrides["pitr_interval"] == new_value + + # Ensure the change made it down to the pageserver + readback_ps = env.pageservers[0].http_client().tenant_config(tenant_id) + assert readback_ps.effective_config["pitr_interval"] == new_value + assert readback_ps.tenant_specific_overrides["pitr_interval"] == new_value + + # Omitting a value clears it. This looks different in storage controller + # vs. pageserver API calls, because pageserver has defaults. + http.set_tenant_config(tenant_id, {}) + readback_controller = http.tenant_config(tenant_id) + assert readback_controller.effective_config["pitr_interval"] is None + assert readback_controller.tenant_specific_overrides["pitr_interval"] is None + readback_ps = env.pageservers[0].http_client().tenant_config(tenant_id) + assert readback_ps.effective_config["pitr_interval"] == default_value + assert "pitr_interval" not in readback_ps.tenant_specific_overrides + + env.storage_controller.consistency_check() diff --git a/test_runner/regress/test_subxacts.py b/test_runner/regress/test_subxacts.py index eb96a8faa4..10cb00c780 100644 --- a/test_runner/regress/test_subxacts.py +++ b/test_runner/regress/test_subxacts.py @@ -1,4 +1,3 @@ -from fixtures.log_helper import log from fixtures.neon_fixtures import NeonEnv, check_restored_datadir_content @@ -13,15 +12,10 @@ def test_subxacts(neon_simple_env: NeonEnv, test_output_dir): env.neon_cli.create_branch("test_subxacts", "empty") endpoint = env.endpoints.create_start("test_subxacts") - log.info("postgres is running 
on 'test_subxacts' branch") pg_conn = endpoint.connect() cur = pg_conn.cursor() - cur.execute( - """ - CREATE TABLE t1(i int, j int); - """ - ) + cur.execute("CREATE TABLE t1(i int, j int);") cur.execute("select pg_switch_wal();") diff --git a/test_runner/regress/test_tenant_conf.py b/test_runner/regress/test_tenant_conf.py index 2ed22cabc4..fc099297e1 100644 --- a/test_runner/regress/test_tenant_conf.py +++ b/test_runner/regress/test_tenant_conf.py @@ -270,7 +270,7 @@ eviction_policy = { "kind" = "LayerAccessThreshold", period = "20s", threshold = "period": "20s", "threshold": "23h", } - assert final_effective_config["max_lsn_wal_lag"] == 10 * 1024 * 1024 + assert final_effective_config["max_lsn_wal_lag"] == 1024 * 1024 * 1024 # restart the pageserver and ensure that the config is still correct env.pageserver.stop() @@ -299,8 +299,7 @@ def test_creating_tenant_conf_after_attach(neon_env_builder: NeonEnvBuilder): # tenant is created with defaults, as in without config file (tenant_id, timeline_id) = env.neon_cli.create_tenant() - config_path = env.pageserver.tenant_dir(tenant_id) / "config" - assert config_path.exists(), "config file is always initially created" + config_path = env.pageserver.tenant_dir(tenant_id) / "config-v1" http_client = env.pageserver.http_client() diff --git a/test_runner/regress/test_tenant_delete.py b/test_runner/regress/test_tenant_delete.py index b4e5a550f3..52de889084 100644 --- a/test_runner/regress/test_tenant_delete.py +++ b/test_runner/regress/test_tenant_delete.py @@ -9,6 +9,7 @@ from fixtures.log_helper import log from fixtures.neon_fixtures import ( NeonEnvBuilder, PgBin, + S3Scrubber, last_flush_lsn_upload, wait_for_last_flush_lsn, ) @@ -19,12 +20,13 @@ from fixtures.pageserver.utils import ( assert_prefix_not_empty, poll_for_remote_storage_iterations, tenant_delete_wait_completed, + wait_for_upload, wait_tenant_status_404, wait_until_tenant_active, wait_until_tenant_state, ) from fixtures.remote_storage import RemoteStorageKind, 
available_s3_storages, s3_storage -from fixtures.types import TenantId, TimelineId +from fixtures.types import Lsn, TenantId, TimelineId from fixtures.utils import run_pg_bench_small, wait_until from requests.exceptions import ReadTimeout @@ -128,7 +130,6 @@ FAILPOINTS = [ "timeline-delete-before-index-deleted-at", "timeline-delete-before-rm", "timeline-delete-before-index-delete", - "timeline-delete-after-rm-dir", ] FAILPOINTS_BEFORE_BACKGROUND = [ @@ -189,6 +190,8 @@ def test_delete_tenant_exercise_crash_safety_failpoints( # So by ignoring these instead of waiting for empty upload queue # we execute more distinct code paths. '.*stopping left-over name="remote upload".*', + # an on-demand is cancelled by shutdown + ".*initial size calculation failed: downloading failed, possibly for shutdown", ] ) @@ -504,10 +507,10 @@ def test_tenant_delete_concurrent( return ps_http.tenant_delete(tenant_id) def hit_remove_failpoint(): - assert env.pageserver.log_contains(f"at failpoint {BEFORE_REMOVE_FAILPOINT}") + env.pageserver.assert_log_contains(f"at failpoint {BEFORE_REMOVE_FAILPOINT}") def hit_run_failpoint(): - assert env.pageserver.log_contains(f"at failpoint {BEFORE_RUN_FAILPOINT}") + env.pageserver.assert_log_contains(f"at failpoint {BEFORE_RUN_FAILPOINT}") with concurrent.futures.ThreadPoolExecutor() as executor: background_200_req = executor.submit(delete_tenant) @@ -611,12 +614,12 @@ def test_tenant_delete_races_timeline_creation( Thread(target=timeline_create).start() def hit_initdb_upload_failpoint(): - assert env.pageserver.log_contains(f"at failpoint {BEFORE_INITDB_UPLOAD_FAILPOINT}") + env.pageserver.assert_log_contains(f"at failpoint {BEFORE_INITDB_UPLOAD_FAILPOINT}") wait_until(100, 0.1, hit_initdb_upload_failpoint) def creation_connection_timed_out(): - assert env.pageserver.log_contains( + env.pageserver.assert_log_contains( "POST.*/timeline.* request was dropped before completing" ) @@ -635,7 +638,7 @@ def test_tenant_delete_races_timeline_creation( 
Thread(target=tenant_delete).start() def deletion_arrived(): - assert env.pageserver.log_contains( + env.pageserver.assert_log_contains( f"cfg failpoint: {DELETE_BEFORE_CLEANUP_FAILPOINT} pause" ) @@ -662,10 +665,46 @@ def test_tenant_delete_races_timeline_creation( ) # Ensure that creation cancelled and deletion didn't end up in broken state or encountered the leftover temp file - assert env.pageserver.log_contains(CANCELLED_ERROR) + env.pageserver.assert_log_contains(CANCELLED_ERROR) assert not env.pageserver.log_contains( ".*ERROR.*delete_tenant.*Timelines directory is not empty after all timelines deletion" ) # Zero tenants remain (we deleted the default tenant) assert ps_http.get_metric_value("pageserver_tenant_manager_slots") == 0 + + +def test_tenant_delete_scrubber(pg_bin: PgBin, neon_env_builder: NeonEnvBuilder): + """ + Validate that creating and then deleting the tenant both survives the scrubber, + and that one can run the scrubber without problems. + """ + + remote_storage_kind = RemoteStorageKind.MOCK_S3 + neon_env_builder.enable_pageserver_remote_storage(remote_storage_kind) + scrubber = S3Scrubber(neon_env_builder) + env = neon_env_builder.init_start(initial_tenant_conf=MANY_SMALL_LAYERS_TENANT_CONFIG) + + ps_http = env.pageserver.http_client() + # create a tenant separate from the main tenant so that we have one remaining + # after we deleted it, as the scrubber treats empty buckets as an error. 
+ (tenant_id, timeline_id) = env.neon_cli.create_tenant() + + with env.endpoints.create_start("main", tenant_id=tenant_id) as endpoint: + run_pg_bench_small(pg_bin, endpoint.connstr()) + last_flush_lsn = Lsn(endpoint.safe_psql("SELECT pg_current_wal_flush_lsn()")[0][0]) + ps_http.timeline_checkpoint(tenant_id, timeline_id) + wait_for_upload(ps_http, tenant_id, timeline_id, last_flush_lsn) + env.stop() + + result = scrubber.scan_metadata() + assert result["with_warnings"] == [] + + env.start() + ps_http = env.pageserver.http_client() + iterations = poll_for_remote_storage_iterations(remote_storage_kind) + tenant_delete_wait_completed(ps_http, tenant_id, iterations) + env.stop() + + scrubber.scan_metadata() + assert result["with_warnings"] == [] diff --git a/test_runner/regress/test_tenant_detach.py b/test_runner/regress/test_tenant_detach.py index 8d5ef4e3c4..d3f24cb06e 100644 --- a/test_runner/regress/test_tenant_detach.py +++ b/test_runner/regress/test_tenant_detach.py @@ -92,10 +92,10 @@ def test_tenant_reattach(neon_env_builder: NeonEnvBuilder, mode: str): wait_for_upload(pageserver_http, tenant_id, timeline_id, current_lsn) # Check that we had to retry the uploads - assert env.pageserver.log_contains( + env.pageserver.assert_log_contains( ".*failed to perform remote task UploadLayer.*, will retry.*" ) - assert env.pageserver.log_contains( + env.pageserver.assert_log_contains( ".*failed to perform remote task UploadMetadata.*, will retry.*" ) @@ -742,8 +742,6 @@ def ensure_test_data(data_id: int, data: str, endpoint: Endpoint): def test_metrics_while_ignoring_broken_tenant_and_reloading( neon_env_builder: NeonEnvBuilder, ): - neon_env_builder.enable_pageserver_remote_storage(RemoteStorageKind.LOCAL_FS) - env = neon_env_builder.init_start() client = env.pageserver.http_client() @@ -761,56 +759,37 @@ def test_metrics_while_ignoring_broken_tenant_and_reloading( client.tenant_break(env.initial_tenant) - found_broken = False - active, broken, broken_set = ([], [], 
[]) - for _ in range(10): + def found_broken(): m = client.get_metrics() active = m.query_all("pageserver_tenant_states_count", {"state": "Active"}) broken = m.query_all("pageserver_tenant_states_count", {"state": "Broken"}) broken_set = m.query_all( "pageserver_broken_tenants_count", {"tenant_id": str(env.initial_tenant)} ) - found_broken = only_int(active) == 0 and only_int(broken) == 1 and only_int(broken_set) == 1 + assert only_int(active) == 0 and only_int(broken) == 1 and only_int(broken_set) == 1 - if found_broken: - break - log.info(f"active: {active}, broken: {broken}, broken_set: {broken_set}") - time.sleep(0.5) - assert ( - found_broken - ), f"tenant shows up as broken; active={active}, broken={broken}, broken_set={broken_set}" + wait_until(10, 0.5, found_broken) client.tenant_ignore(env.initial_tenant) - found_broken = False - broken, broken_set = ([], []) - for _ in range(10): + def found_cleaned_up(): m = client.get_metrics() broken = m.query_all("pageserver_tenant_states_count", {"state": "Broken"}) broken_set = m.query_all( "pageserver_broken_tenants_count", {"tenant_id": str(env.initial_tenant)} ) - found_broken = only_int(broken) == 0 and only_int(broken_set) == 1 + assert only_int(broken) == 0 and len(broken_set) == 0 - if found_broken: - break - time.sleep(0.5) - assert found_broken, f"broken should still be in set, but it is not in the tenant state count: broken={broken}, broken_set={broken_set}" + wait_until(10, 0.5, found_cleaned_up) env.pageserver.tenant_load(env.initial_tenant) - found_active = False - active, broken_set = ([], []) - for _ in range(10): + def found_active(): m = client.get_metrics() active = m.query_all("pageserver_tenant_states_count", {"state": "Active"}) broken_set = m.query_all( "pageserver_broken_tenants_count", {"tenant_id": str(env.initial_tenant)} ) - found_active = only_int(active) == 1 and len(broken_set) == 0 + assert only_int(active) == 1 and len(broken_set) == 0 - if found_active: - break - time.sleep(0.5) - - 
assert found_active, f"reloaded tenant should be active, and broken tenant set item removed: active={active}, broken_set={broken_set}" + wait_until(10, 0.5, found_active) diff --git a/test_runner/regress/test_tenant_relocation.py b/test_runner/regress/test_tenant_relocation.py index 80b4fab1d3..9def3ad1c2 100644 --- a/test_runner/regress/test_tenant_relocation.py +++ b/test_runner/regress/test_tenant_relocation.py @@ -157,10 +157,7 @@ def switch_pg_to_new_pageserver( timeline_to_detach_local_path = origin_ps.timeline_dir(tenant_id, timeline_id) files_before_detach = os.listdir(timeline_to_detach_local_path) assert ( - "metadata" in files_before_detach - ), f"Regular timeline {timeline_to_detach_local_path} should have the metadata file, but got: {files_before_detach}" - assert ( - len(files_before_detach) >= 2 + len(files_before_detach) >= 1 ), f"Regular timeline {timeline_to_detach_local_path} should have at least one layer file, but got {files_before_detach}" return timeline_to_detach_local_path @@ -213,8 +210,6 @@ def test_tenant_relocation( env.pageservers[0].allowed_errors.extend( [ - # FIXME: Is this expected? 
- ".*init_tenant_mgr: marking .* as locally complete, while it doesnt exist in remote index.*", # Needed for detach polling on the original pageserver f".*NotFound: tenant {tenant_id}.*", # We will dual-attach in this test, so stale generations are expected @@ -500,7 +495,7 @@ def test_emergency_relocate_with_branches_slow_replay( assert cur.fetchall() == [("before pause",), ("after pause",)] # Sanity check that the failpoint was reached - assert env.pageserver.log_contains('failpoint "wal-ingest-logical-message-sleep": sleep done') + env.pageserver.assert_log_contains('failpoint "wal-ingest-logical-message-sleep": sleep done') assert time.time() - before_attach_time > 5 # Clean up @@ -637,7 +632,7 @@ def test_emergency_relocate_with_branches_createdb( assert query_scalar(cur, "SELECT count(*) FROM test_migrate_one") == 200 # Sanity check that the failpoint was reached - assert env.pageserver.log_contains('failpoint "wal-ingest-logical-message-sleep": sleep done') + env.pageserver.assert_log_contains('failpoint "wal-ingest-logical-message-sleep": sleep done') assert time.time() - before_attach_time > 5 # Clean up diff --git a/test_runner/regress/test_tenants.py b/test_runner/regress/test_tenants.py index 5164bda470..1e13a2f20f 100644 --- a/test_runner/regress/test_tenants.py +++ b/test_runner/regress/test_tenants.py @@ -18,6 +18,7 @@ from fixtures.metrics import ( from fixtures.neon_fixtures import ( NeonEnv, NeonEnvBuilder, + wait_for_last_flush_lsn, ) from fixtures.pageserver.http import PageserverApiException from fixtures.pageserver.utils import timeline_delete_wait_completed, wait_until_tenant_active @@ -285,7 +286,6 @@ def test_pageserver_with_empty_tenants(neon_env_builder: NeonEnvBuilder): env.pageserver.allowed_errors.extend( [ - ".*marking .* as locally complete, while it doesnt exist in remote index.*", ".*load failed.*list timelines directory.*", ] ) @@ -376,11 +376,6 @@ def test_create_churn_during_restart(neon_env_builder: NeonEnvBuilder): # so we 
allow it to log at WARN, even if it is occasionally a false positive. env.pageserver.allowed_errors.append(".*failed to freeze and flush.*") - # When we shut down a tenant during a timeline creation, initdb is not cancelled, we wait - # for it to complete (since https://github.com/neondatabase/neon/pull/6451). This means - # that shutdown can be delayed by >=1s on debug builds where initdb takes a long time to run. - env.pageserver.allowed_errors.append(".*still waiting, taking longer than expected... gate.*") - def create_bg(delay_ms): time.sleep(delay_ms / 1000.0) try: @@ -420,3 +415,50 @@ def test_create_churn_during_restart(neon_env_builder: NeonEnvBuilder): # The tenant should end up active wait_until_tenant_active(env.pageserver.http_client(), tenant_id, iterations=10, period=1) + + +def test_pageserver_metrics_many_relations(neon_env_builder: NeonEnvBuilder): + """Test for the directory_entries_count metric""" + + neon_env_builder.enable_pageserver_remote_storage(RemoteStorageKind.MOCK_S3) + + env = neon_env_builder.init_start() + ps_http = env.pageserver.http_client() + + endpoint_tenant = env.endpoints.create_start("main", tenant_id=env.initial_tenant) + + # Not sure why but this many tables creates more relations than our limit + TABLE_COUNT = 1600 + COUNT_AT_LEAST_EXPECTED = 5500 + + with endpoint_tenant.connect() as conn: + with conn.cursor() as cur: + # Wrapping begin; commit; around this and the loop below keeps the reproduction + # but it also doesn't have a performance benefit + cur.execute("CREATE TABLE template_tbl(key int primary key, value text);") + for i in range(TABLE_COUNT): + cur.execute(f"CREATE TABLE tbl_{i}(like template_tbl INCLUDING ALL);") + wait_for_last_flush_lsn(env, endpoint_tenant, env.initial_tenant, env.initial_timeline) + endpoint_tenant.stop() + + m = ps_http.get_metrics() + directory_entries_count_metric = m.query_all( + "pageserver_directory_entries_count", {"tenant_id": str(env.initial_tenant)} + ) + + def 
only_int(samples: List[Sample]) -> int: + assert len(samples) == 1 + return int(samples[0].value) + + directory_entries_count = only_int(directory_entries_count_metric) + + log.info(f"pageserver_directory_entries_count metric value: {directory_entries_count}") + + assert directory_entries_count > COUNT_AT_LEAST_EXPECTED + + timeline_detail = ps_http.timeline_detail(env.initial_tenant, env.initial_timeline) + + counts = timeline_detail["directory_entries_counts"] + assert counts + log.info(f"directory counts: {counts}") + assert counts[2] > COUNT_AT_LEAST_EXPECTED diff --git a/test_runner/regress/test_tenants_with_remote_storage.py b/test_runner/regress/test_tenants_with_remote_storage.py index 6f05d7f7cb..d16978d02a 100644 --- a/test_runner/regress/test_tenants_with_remote_storage.py +++ b/test_runner/regress/test_tenants_with_remote_storage.py @@ -61,11 +61,6 @@ async def all_tenants_workload(env: NeonEnv, tenants_endpoints): def test_tenants_many(neon_env_builder: NeonEnvBuilder): env = neon_env_builder.init_start() - # FIXME: Is this expected? - env.pageserver.allowed_errors.append( - ".*init_tenant_mgr: marking .* as locally complete, while it doesnt exist in remote index.*" - ) - tenants_endpoints: List[Tuple[TenantId, Endpoint]] = [] for _ in range(1, 5): @@ -117,14 +112,6 @@ def test_tenants_attached_after_download(neon_env_builder: NeonEnvBuilder): ##### First start, insert secret data and upload it to the remote storage env = neon_env_builder.init_start() - env.pageserver.allowed_errors.extend( - [ - # FIXME: Are these expected? 
- ".*No timelines to attach received.*", - ".*marking .* as locally complete, while it doesnt exist in remote index.*", - ] - ) - pageserver_http = env.pageserver.http_client() endpoint = env.endpoints.create_start("main") @@ -160,10 +147,10 @@ def test_tenants_attached_after_download(neon_env_builder: NeonEnvBuilder): log.info(f"upload of checkpoint {checkpoint_number} is done") # Check that we had to retry the uploads - assert env.pageserver.log_contains( + env.pageserver.assert_log_contains( ".*failed to perform remote task UploadLayer.*, will retry.*" ) - assert env.pageserver.log_contains( + env.pageserver.assert_log_contains( ".*failed to perform remote task UploadMetadata.*, will retry.*" ) @@ -223,9 +210,6 @@ def test_tenant_redownloads_truncated_file_on_startup( env.pageserver.allowed_errors.extend( [ ".*removing local file .* because .*", - # FIXME: Are these expected? - ".*init_tenant_mgr: marking .* as locally complete, while it doesnt exist in remote index.*", - ".*No timelines to attach received.*", ] ) diff --git a/test_runner/regress/test_threshold_based_eviction.py b/test_runner/regress/test_threshold_based_eviction.py index 5f72cfd747..7bf49a0874 100644 --- a/test_runner/regress/test_threshold_based_eviction.py +++ b/test_runner/regress/test_threshold_based_eviction.py @@ -179,6 +179,6 @@ def test_threshold_based_eviction( assert len(post.remote_layers) > 0, "some layers should be evicted once it's stabilized" assert len(post.local_layers) > 0, "the imitate accesses should keep some layers resident" - assert env.pageserver.log_contains( - metrics_refused_log_line + assert ( + env.pageserver.log_contains(metrics_refused_log_line) is not None ), "ensure the metrics collection worker ran" diff --git a/test_runner/regress/test_timeline_delete.py b/test_runner/regress/test_timeline_delete.py index 352b82d525..96a5cc491a 100644 --- a/test_runner/regress/test_timeline_delete.py +++ b/test_runner/regress/test_timeline_delete.py @@ -89,6 +89,7 @@ def 
test_timeline_delete(neon_simple_env: NeonEnv): assert timeline_path.exists() # retry deletes when compaction or gc is running in pageserver + # TODO: review whether this wait_until is actually necessary, we do an await() internally wait_until( number_of_iterations=3, interval=0.2, @@ -136,12 +137,9 @@ DELETE_FAILPOINTS = [ "timeline-delete-before-index-deleted-at", "timeline-delete-before-schedule", "timeline-delete-before-rm", - "timeline-delete-during-rm", "timeline-delete-after-rm", "timeline-delete-before-index-delete", "timeline-delete-after-index-delete", - "timeline-delete-after-rm-metadata", - "timeline-delete-after-rm-dir", ] @@ -215,7 +213,9 @@ def test_delete_timeline_exercise_crash_safety_failpoints( # This happens when timeline remains are cleaned up during loading ".*Timeline dir entry become invalid.*", # In one of the branches we poll for tenant to become active. Polls can generate this log message: - f".*Tenant {env.initial_tenant} is not active*", + f".*Tenant {env.initial_tenant} is not active.*", + # an on-demand is cancelled by shutdown + ".*initial size calculation failed: downloading failed, possibly for shutdown", ] ) @@ -534,7 +534,7 @@ def test_concurrent_timeline_delete_stuck_on( try: def first_call_hit_failpoint(): - assert env.pageserver.log_contains( + env.pageserver.assert_log_contains( f".*{child_timeline_id}.*at failpoint {stuck_failpoint}" ) @@ -605,7 +605,7 @@ def test_delete_timeline_client_hangup(neon_env_builder: NeonEnvBuilder): at_failpoint_log_message = f".*{child_timeline_id}.*at failpoint {failpoint_name}.*" def hit_failpoint(): - assert env.pageserver.log_contains(at_failpoint_log_message) + env.pageserver.assert_log_contains(at_failpoint_log_message) wait_until(50, 0.1, hit_failpoint) @@ -615,7 +615,7 @@ def test_delete_timeline_client_hangup(neon_env_builder: NeonEnvBuilder): env.pageserver.allowed_errors.append(hangup_log_message) def got_hangup_log_message(): - assert env.pageserver.log_contains(hangup_log_message) + 
env.pageserver.assert_log_contains(hangup_log_message) wait_until(50, 0.1, got_hangup_log_message) @@ -627,7 +627,7 @@ def test_delete_timeline_client_hangup(neon_env_builder: NeonEnvBuilder): def first_request_finished(): message = f".*DELETE.*{child_timeline_id}.*Cancelled request finished" - assert env.pageserver.log_contains(message) + env.pageserver.assert_log_contains(message) wait_until(50, 0.1, first_request_finished) @@ -651,9 +651,7 @@ def test_timeline_delete_works_for_remote_smoke( timeline_ids = [env.initial_timeline] for i in range(2): branch_timeline_id = env.neon_cli.create_branch(f"new{i}", "main") - pg = env.endpoints.create_start(f"new{i}") - - with pg.cursor() as cur: + with env.endpoints.create_start(f"new{i}") as pg, pg.cursor() as cur: cur.execute("CREATE TABLE f (i integer);") cur.execute("INSERT INTO f VALUES (generate_series(1,1000));") current_lsn = Lsn(query_scalar(cur, "SELECT pg_current_wal_flush_lsn()")) @@ -764,7 +762,7 @@ def test_delete_orphaned_objects( for orphan in orphans: assert not orphan.exists() - assert env.pageserver.log_contains( + env.pageserver.assert_log_contains( f"deleting a file not referenced from index_part.json name={orphan.stem}" ) @@ -803,7 +801,7 @@ def test_timeline_delete_resumed_on_attach( ) # failpoint before we remove index_part from s3 - failpoint = "timeline-delete-during-rm" + failpoint = "timeline-delete-after-rm" ps_http.configure_failpoints((failpoint, "return")) env.pageserver.allowed_errors.extend( diff --git a/test_runner/regress/test_timeline_size.py b/test_runner/regress/test_timeline_size.py index 4c5cb32caa..205ca18050 100644 --- a/test_runner/regress/test_timeline_size.py +++ b/test_runner/regress/test_timeline_size.py @@ -1,9 +1,8 @@ import concurrent.futures import math -import queue import random -import threading import time +from collections import defaultdict from contextlib import closing from pathlib import Path from typing import Optional @@ -16,11 +15,11 @@ from 
fixtures.neon_fixtures import ( Endpoint, NeonEnv, NeonEnvBuilder, + NeonPageserver, PgBin, VanillaPostgres, wait_for_last_flush_lsn, ) -from fixtures.pageserver.http import PageserverApiException, PageserverHttpClient from fixtures.pageserver.utils import ( assert_tenant_state, timeline_delete_wait_completed, @@ -40,10 +39,9 @@ def test_timeline_size(neon_simple_env: NeonEnv): new_timeline_id = env.neon_cli.create_branch("test_timeline_size", "empty") client = env.pageserver.http_client() - wait_for_timeline_size_init(client, tenant=env.initial_tenant, timeline=new_timeline_id) + client.timeline_wait_logical_size(env.initial_tenant, new_timeline_id) endpoint_main = env.endpoints.create_start("test_timeline_size") - log.info("postgres is running on 'test_timeline_size' branch") with closing(endpoint_main.connect()) as conn: with conn.cursor() as cur: @@ -73,13 +71,12 @@ def test_timeline_size_createdropdb(neon_simple_env: NeonEnv): new_timeline_id = env.neon_cli.create_branch("test_timeline_size_createdropdb", "empty") client = env.pageserver.http_client() - wait_for_timeline_size_init(client, tenant=env.initial_tenant, timeline=new_timeline_id) + client.timeline_wait_logical_size(env.initial_tenant, new_timeline_id) timeline_details = client.timeline_detail( env.initial_tenant, new_timeline_id, include_non_incremental_logical_size=True ) endpoint_main = env.endpoints.create_start("test_timeline_size_createdropdb") - log.info("postgres is running on 'test_timeline_size_createdropdb' branch") with closing(endpoint_main.connect()) as conn: with conn.cursor() as cur: @@ -153,7 +150,7 @@ def test_timeline_size_quota_on_startup(neon_env_builder: NeonEnvBuilder): client = env.pageserver.http_client() new_timeline_id = env.neon_cli.create_branch("test_timeline_size_quota_on_startup") - wait_for_timeline_size_init(client, tenant=env.initial_tenant, timeline=new_timeline_id) + client.timeline_wait_logical_size(env.initial_tenant, new_timeline_id) endpoint_main = 
env.endpoints.create( "test_timeline_size_quota_on_startup", @@ -162,8 +159,6 @@ def test_timeline_size_quota_on_startup(neon_env_builder: NeonEnvBuilder): ) endpoint_main.start() - log.info("postgres is running on 'test_timeline_size_quota_on_startup' branch") - with closing(endpoint_main.connect()) as conn: with conn.cursor() as cur: cur.execute("CREATE TABLE foo (t text)") @@ -219,7 +214,7 @@ def test_timeline_size_quota(neon_env_builder: NeonEnvBuilder): client = env.pageserver.http_client() new_timeline_id = env.neon_cli.create_branch("test_timeline_size_quota") - wait_for_timeline_size_init(client, tenant=env.initial_tenant, timeline=new_timeline_id) + client.timeline_wait_logical_size(env.initial_tenant, new_timeline_id) endpoint_main = env.endpoints.create( "test_timeline_size_quota", @@ -231,8 +226,6 @@ def test_timeline_size_quota(neon_env_builder: NeonEnvBuilder): endpoint_main.respec(skip_pg_catalog_updates=False) endpoint_main.start() - log.info("postgres is running on 'test_timeline_size_quota' branch") - with closing(endpoint_main.connect()) as conn: with conn.cursor() as cur: cur.execute("CREATE TABLE foo (t text)") @@ -337,41 +330,18 @@ def test_timeline_initial_logical_size_calculation_cancellation( assert_size_calculation_not_done() log.info( - f"try to delete the timeline using {deletion_method}, this should cancel size computation tasks and wait for them to finish" + f"delete the timeline using {deletion_method}, this should cancel size computation tasks and wait for them to finish" ) - delete_timeline_success: queue.Queue[bool] = queue.Queue(maxsize=1) - def delete_timeline_thread_fn(): - try: - if deletion_method == "tenant_detach": - client.tenant_detach(tenant_id) - elif deletion_method == "timeline_delete": - timeline_delete_wait_completed(client, tenant_id, timeline_id) - delete_timeline_success.put(True) - except PageserverApiException: - delete_timeline_success.put(False) - raise + if deletion_method == "tenant_detach": + 
client.tenant_detach(tenant_id) + elif deletion_method == "timeline_delete": + timeline_delete_wait_completed(client, tenant_id, timeline_id) + else: + raise RuntimeError(deletion_method) - delete_timeline_thread = threading.Thread(target=delete_timeline_thread_fn) - delete_timeline_thread.start() - # give it some time to settle in the state where it waits for size computation task - time.sleep(5) - if not delete_timeline_success.empty(): - raise AssertionError( - f"test is broken, the {deletion_method} should be stuck waiting for size computation task, got result {delete_timeline_success.get()}" - ) - - log.info( - "resume the size calculation. The failpoint checks that the timeline directory still exists." - ) - client.configure_failpoints(("timeline-calculate-logical-size-check-dir-exists", "return")) - client.configure_failpoints(("timeline-calculate-logical-size-pause", "off")) - - log.info("wait for delete timeline thread to finish and assert that it succeeded") - assert delete_timeline_success.get() - - # if the implementation is incorrect, the teardown would complain about an error log - # message emitted by the code behind failpoint "timeline-calculate-logical-size-check-dir-exists" + # timeline-calculate-logical-size-pause is still paused, but it doesn't + # matter because it's a pausable_failpoint, which can be cancelled by drop. 
def test_timeline_physical_size_init(neon_env_builder: NeonEnvBuilder): @@ -585,7 +555,6 @@ def test_timeline_size_metrics( pg_bin = PgBin(test_output_dir, pg_distrib_dir, pg_version) port = port_distributor.get_port() with VanillaPostgres(pgdatadir, pg_bin, port) as vanilla_pg: - vanilla_pg.configure([f"port={port}"]) vanilla_pg.start() # Create database based on template0 because we can't connect to template0 @@ -715,28 +684,6 @@ def assert_physical_size_invariants(sizes: TimelinePhysicalSizeValues): # XXX would be nice to assert layer file physical storage utilization here as well, but we can only do that for LocalFS -# Timeline logical size initialization is an asynchronous background task that runs once, -# try a few times to ensure it's activated properly -def wait_for_timeline_size_init( - client: PageserverHttpClient, tenant: TenantId, timeline: TimelineId -): - for i in range(10): - timeline_details = client.timeline_detail( - tenant, timeline, include_non_incremental_logical_size=True - ) - current_logical_size = timeline_details["current_logical_size"] - non_incremental = timeline_details["current_logical_size_non_incremental"] - if current_logical_size == non_incremental: - return - log.info( - f"waiting for current_logical_size of a timeline to be calculated, iteration {i}: {current_logical_size} vs {non_incremental}" - ) - time.sleep(1) - raise Exception( - f"timed out while waiting for current_logical_size of a timeline to reach its non-incremental value, details: {timeline_details}" - ) - - def test_ondemand_activation(neon_env_builder: NeonEnvBuilder): """ Tenants warmuping up opportunistically will wait for one another's logical size calculations to complete @@ -894,22 +841,40 @@ def test_ondemand_activation(neon_env_builder: NeonEnvBuilder): ) # Deleting a stuck tenant should prompt it to go active + # in some cases, it has already been activated because it's behind the detach + delete_lazy_activating(delete_tenant_id, env.pageserver, 
expect_attaching=False) + tenant_ids.remove(delete_tenant_id) + + # Check that all the stuck tenants proceed to active (apart from the one that deletes, and the one + # we detached) + wait_until(10, 1, all_active) + assert len(get_tenant_states()) == n_tenants - 2 + + +def delete_lazy_activating( + delete_tenant_id: TenantId, pageserver: NeonPageserver, expect_attaching: bool +): + pageserver_http = pageserver.http_client() + + # Deletion itself won't complete due to our failpoint: Tenant::shutdown can't complete while calculating + # logical size is paused in a failpoint. So instead we will use a log observation to check that + # on-demand activation was triggered by the tenant deletion + log_match = f".*attach{{tenant_id={delete_tenant_id} shard_id=0000 gen=[0-9a-f]+}}: Activating tenant \\(on-demand\\).*" + + if expect_attaching: + assert pageserver_http.tenant_status(delete_tenant_id)["state"]["slug"] == "Attaching" + with concurrent.futures.ThreadPoolExecutor() as executor: log.info("Starting background delete") + def activated_on_demand(): + assert pageserver.log_contains(log_match) is not None + def delete_tenant(): - env.pageserver.http_client().tenant_delete(delete_tenant_id) + pageserver_http.tenant_delete(delete_tenant_id) background_delete = executor.submit(delete_tenant) - # Deletion itself won't complete due to our failpoint: Tenant::shutdown can't complete while calculating - # logical size is paused in a failpoint. 
So instead we will use a log observation to check that - # on-demand activation was triggered by the tenant deletion - log_match = f".*attach{{tenant_id={delete_tenant_id} shard_id=0000}}: Activating tenant \\(on-demand\\).*" - - def activated_on_demand(): - assert env.pageserver.log_contains(log_match) is not None - log.info(f"Waiting for activation message '{log_match}'") try: wait_until(10, 1, activated_on_demand) @@ -923,12 +888,6 @@ def test_ondemand_activation(neon_env_builder: NeonEnvBuilder): # Poll for deletion to complete wait_tenant_status_404(pageserver_http, tenant_id=delete_tenant_id, iterations=40) - tenant_ids.remove(delete_tenant_id) - - # Check that all the stuck tenants proceed to active (apart from the one that deletes, and the one - # we detached) - wait_until(10, 1, all_active) - assert len(get_tenant_states()) == n_tenants - 2 def test_timeline_logical_size_task_priority(neon_env_builder: NeonEnvBuilder): @@ -994,3 +953,159 @@ def test_timeline_logical_size_task_priority(neon_env_builder: NeonEnvBuilder): client.configure_failpoints( [("initial-size-calculation-permit-pause", "off"), ("walreceiver-after-ingest", "off")] ) + + +def test_eager_attach_does_not_queue_up(neon_env_builder: NeonEnvBuilder): + neon_env_builder.pageserver_config_override = "concurrent_tenant_warmup = '1'" + + env = neon_env_builder.init_start() + + # the supporting_second does nothing except queue behind env.initial_tenant + # for purposes of showing that eager_tenant breezes past the queue + supporting_second, _ = env.neon_cli.create_tenant() + eager_tenant, _ = env.neon_cli.create_tenant() + + client = env.pageserver.http_client() + client.tenant_location_conf( + eager_tenant, + { + "mode": "Detached", + "secondary_conf": None, + "tenant_conf": {}, + "generation": None, + }, + ) + + env.pageserver.stop() + + # pause at logical size calculation, also pause before walreceiver can give feedback so it will give priority to logical size calculation + 
env.pageserver.start( + extra_env_vars={ + "FAILPOINTS": "timeline-calculate-logical-size-pause=pause;walreceiver-after-ingest=pause" + } + ) + + tenant_ids = [env.initial_tenant, supporting_second] + + def get_tenant_states() -> dict[str, list[TenantId]]: + states = defaultdict(list) + for id in tenant_ids: + state = client.tenant_status(id)["state"]["slug"] + states[state].append(id) + return dict(states) + + def one_is_active(): + states = get_tenant_states() + log.info(f"{states}") + assert len(states["Active"]) == 1 + + wait_until(10, 1, one_is_active) + + def other_is_attaching(): + states = get_tenant_states() + assert len(states["Attaching"]) == 1 + + wait_until(10, 1, other_is_attaching) + + def eager_tenant_is_active(): + resp = client.tenant_status(eager_tenant) + assert resp["state"]["slug"] == "Active" + + gen = env.storage_controller.attach_hook_issue(eager_tenant, env.pageserver.id) + client.tenant_location_conf( + eager_tenant, + { + "mode": "AttachedSingle", + "secondary_conf": None, + "tenant_conf": {}, + "generation": gen, + }, + lazy=False, + ) + wait_until(10, 1, eager_tenant_is_active) + + other_is_attaching() + + client.configure_failpoints( + [("timeline-calculate-logical-size-pause", "off"), ("walreceiver-after-ingest", "off")] + ) + + +@pytest.mark.parametrize("activation_method", ["endpoint", "branch", "delete"]) +def test_lazy_attach_activation(neon_env_builder: NeonEnvBuilder, activation_method: str): + # env.initial_tenant will take up this permit when attaching with lazy because of a failpoint activated after restart + neon_env_builder.pageserver_config_override = "concurrent_tenant_warmup = '1'" + + env = neon_env_builder.init_start() + + # because this returns (also elsewhere in this file), we know that SpawnMode::Create skips the queue + lazy_tenant, _ = env.neon_cli.create_tenant() + + client = env.pageserver.http_client() + client.tenant_location_conf( + lazy_tenant, + { + "mode": "Detached", + "secondary_conf": None, + 
"tenant_conf": {}, + "generation": None, + }, + ) + + env.pageserver.stop() + + # pause at logical size calculation, also pause before walreceiver can give feedback so it will give priority to logical size calculation + env.pageserver.start( + extra_env_vars={ + "FAILPOINTS": "timeline-calculate-logical-size-pause=pause;walreceiver-after-ingest=pause" + } + ) + + def initial_tenant_is_active(): + resp = client.tenant_status(env.initial_tenant) + assert resp["state"]["slug"] == "Active" + + wait_until(10, 1, initial_tenant_is_active) + + # even though the initial tenant is now active, because it was startup time + # attach, it will consume the only permit because logical size calculation + # is paused. + + gen = env.storage_controller.attach_hook_issue(lazy_tenant, env.pageserver.id) + client.tenant_location_conf( + lazy_tenant, + { + "mode": "AttachedSingle", + "secondary_conf": None, + "tenant_conf": {}, + "generation": gen, + }, + lazy=True, + ) + + def lazy_tenant_is_attaching(): + resp = client.tenant_status(lazy_tenant) + assert resp["state"]["slug"] == "Attaching" + + # paused logical size calculation of env.initial_tenant is keeping it attaching + wait_until(10, 1, lazy_tenant_is_attaching) + + for _ in range(5): + lazy_tenant_is_attaching() + time.sleep(0.5) + + def lazy_tenant_is_active(): + resp = client.tenant_status(lazy_tenant) + assert resp["state"]["slug"] == "Active" + + if activation_method == "endpoint": + with env.endpoints.create_start("main", tenant_id=lazy_tenant): + # starting up the endpoint should make it jump the queue + wait_until(10, 1, lazy_tenant_is_active) + elif activation_method == "branch": + env.neon_cli.create_timeline("second_branch", lazy_tenant) + wait_until(10, 1, lazy_tenant_is_active) + elif activation_method == "delete": + delete_lazy_activating(lazy_tenant, env.pageserver, expect_attaching=True) + else: + raise RuntimeError(activation_method) diff --git a/test_runner/regress/test_twophase.py 
b/test_runner/regress/test_twophase.py index 305271c715..dd76689008 100644 --- a/test_runner/regress/test_twophase.py +++ b/test_runner/regress/test_twophase.py @@ -13,7 +13,6 @@ def test_twophase(neon_simple_env: NeonEnv): endpoint = env.endpoints.create_start( "test_twophase", config_lines=["max_prepared_transactions=5"] ) - log.info("postgres is running on 'test_twophase' branch") conn = endpoint.connect() cur = conn.cursor() diff --git a/test_runner/regress/test_vm_bits.py b/test_runner/regress/test_vm_bits.py index 415f086bd3..eff103ca09 100644 --- a/test_runner/regress/test_vm_bits.py +++ b/test_runner/regress/test_vm_bits.py @@ -1,6 +1,7 @@ -import pytest +import time + from fixtures.log_helper import log -from fixtures.neon_fixtures import NeonEnv, fork_at_current_lsn +from fixtures.neon_fixtures import NeonEnv, NeonEnvBuilder, fork_at_current_lsn # @@ -13,7 +14,6 @@ def test_vm_bit_clear(neon_simple_env: NeonEnv): env.neon_cli.create_branch("test_vm_bit_clear", "empty") endpoint = env.endpoints.create_start("test_vm_bit_clear") - log.info("postgres is running on 'test_vm_bit_clear' branch") pg_conn = endpoint.connect() cur = pg_conn.cursor() @@ -92,7 +92,6 @@ def test_vm_bit_clear(neon_simple_env: NeonEnv): # server at the right point-in-time avoids that full-page image. endpoint_new = env.endpoints.create_start("test_vm_bit_clear_new") - log.info("postgres is running on 'test_vm_bit_clear_new' branch") pg_new_conn = endpoint_new.connect() cur_new = pg_new_conn.cursor() @@ -118,12 +117,20 @@ def test_vm_bit_clear(neon_simple_env: NeonEnv): # Test that the ALL_FROZEN VM bit is cleared correctly at a HEAP_LOCK # record. 
# -# FIXME: This test is broken -@pytest.mark.skip("See https://github.com/neondatabase/neon/pull/6412#issuecomment-1902072541") -def test_vm_bit_clear_on_heap_lock(neon_simple_env: NeonEnv): - env = neon_simple_env +def test_vm_bit_clear_on_heap_lock(neon_env_builder: NeonEnvBuilder): + tenant_conf = { + "checkpoint_distance": f"{128 * 1024}", + "compaction_target_size": f"{128 * 1024}", + "compaction_threshold": "1", + # create image layers eagerly, so that GC can remove some layers + "image_creation_threshold": "1", + # set PITR interval to be small, so we can do GC + "pitr_interval": "0 s", + } + env = neon_env_builder.init_start(initial_tenant_conf=tenant_conf) - env.neon_cli.create_branch("test_vm_bit_clear_on_heap_lock", "empty") + tenant_id = env.initial_tenant + timeline_id = env.neon_cli.create_branch("test_vm_bit_clear_on_heap_lock") endpoint = env.endpoints.create_start( "test_vm_bit_clear_on_heap_lock", config_lines=[ @@ -139,72 +146,82 @@ def test_vm_bit_clear_on_heap_lock(neon_simple_env: NeonEnv): # Install extension containing function needed for test cur.execute("CREATE EXTENSION neon_test_utils") - - cur.execute("SELECT pg_switch_wal()") + cur.execute("CREATE EXTENSION pageinspect") # Create a test table and freeze it to set the all-frozen VM bit on all pages. cur.execute("CREATE TABLE vmtest_lock (id integer PRIMARY KEY)") cur.execute("INSERT INTO vmtest_lock SELECT g FROM generate_series(1, 50000) g") - cur.execute("VACUUM FREEZE vmtest_lock") + + cur.execute("VACUUM (FREEZE, DISABLE_PAGE_SKIPPING true) vmtest_lock") # Lock a row. This clears the all-frozen VM bit for that page. + cur.execute("BEGIN") cur.execute("SELECT * FROM vmtest_lock WHERE id = 40000 FOR UPDATE") # Remember the XID. We will use it later to verify that we have consumed a lot of # XIDs after this. cur.execute("select pg_current_xact_id()") - locking_xid = cur.fetchall()[0][0] + locking_xid = int(cur.fetchall()[0][0]) - # Stop and restart postgres, to clear the buffer cache. 
+ cur.execute("COMMIT") + + # The VM page in shared buffer cache, and the same page as reconstructed + # by the pageserver, should be equal. + # + # Ignore the LSN on the page though (first 8 bytes). If the dirty + # VM page is flushed from the cache for some reason, it gets WAL-logged, + # which changes the LSN on the page. + cur.execute("select get_raw_page( 'vmtest_lock', 'vm', 0 )") + vm_page_in_cache = (cur.fetchall()[0][0])[8:100].hex() + cur.execute("select get_raw_page_at_lsn( 'vmtest_lock', 'vm', 0, pg_current_wal_insert_lsn() )") + vm_page_at_pageserver = (cur.fetchall()[0][0])[8:100].hex() + + assert vm_page_at_pageserver == vm_page_in_cache + + # The above assert is enough to verify the bug that was fixed in + # commit 66fa176cc8. But for good measure, we also reproduce the + # original problem that the missing VM page update caused. The + # rest of the test does that. + + # Kill and restart postgres, to clear the buffer cache. # # NOTE: clear_buffer_cache() will not do, because it evicts the dirty pages # in a "clean" way. Our neon extension will write a full-page image of the VM - # page, and we want to avoid that. - endpoint.stop() + # page, and we want to avoid that. A clean shutdown will also not do, for the + # same reason. + endpoint.stop(mode="immediate") + endpoint.start() pg_conn = endpoint.connect() cur = pg_conn.cursor() - cur.execute("select xmin, xmax, * from vmtest_lock where id = 40000 ") - tup = cur.fetchall() - xmax_before = tup[0][1] - # Consume a lot of XIDs, so that anti-wraparound autovacuum kicks # in and the clog gets truncated. We set autovacuum_freeze_max_age to a very # low value, so it doesn't take all that many XIDs for autovacuum to kick in. 
- for i in range(1000): - cur.execute( - """ - CREATE TEMP TABLE othertable (i int) ON COMMIT DROP; - do $$ - begin - for i in 1..100000 loop - -- Use a begin-exception block to generate a new subtransaction on each iteration - begin - insert into othertable values (i); - exception when others then - raise 'not expected %', sqlerrm; - end; - end loop; - end; - $$; - """ - ) - cur.execute("select xmin, xmax, * from vmtest_lock where id = 40000 ") - tup = cur.fetchall() - log.info(f"tuple = {tup}") - xmax = tup[0][1] - assert xmax == xmax_before - - if i % 50 == 0: - cur.execute("select datfrozenxid from pg_database where datname='postgres'") - datfrozenxid = cur.fetchall()[0][0] - if datfrozenxid > locking_xid: - break + # + # We could use test_consume_xids() to consume XIDs much faster, + # but it wouldn't speed up the overall test, because we'd still + # need to wait for autovacuum to run. + for _ in range(1000): + cur.execute("select test_consume_xids(10000);") + for _ in range(1000): + cur.execute("select min(datfrozenxid::text::int) from pg_database") + datfrozenxid = int(cur.fetchall()[0][0]) + log.info(f"datfrozenxid {datfrozenxid} locking_xid: {locking_xid}") + if datfrozenxid > locking_xid + 3000000: + break + time.sleep(0.5) cur.execute("select pg_current_xact_id()") - curr_xid = cur.fetchall()[0][0] - assert int(curr_xid) - int(locking_xid) >= 100000 + curr_xid = int(cur.fetchall()[0][0]) + assert curr_xid - locking_xid >= 100000 + + # Perform GC in the pageserver. Otherwise the compute might still + # be able to download the already-deleted SLRU segment from the + # pageserver. That masks the original bug. 
+ env.pageserver.http_client().timeline_checkpoint(tenant_id, timeline_id) + env.pageserver.http_client().timeline_compact(tenant_id, timeline_id) + env.pageserver.http_client().timeline_gc(tenant_id, timeline_id, 0) # Now, if the VM all-frozen bit was not correctly cleared on # replay, we will try to fetch the status of the XID that was @@ -214,3 +231,4 @@ def test_vm_bit_clear_on_heap_lock(neon_simple_env: NeonEnv): cur.execute("select xmin, xmax, * from vmtest_lock where id = 40000 for update") tup = cur.fetchall() log.info(f"tuple = {tup}") + cur.execute("commit transaction") diff --git a/test_runner/regress/test_wal_acceptor.py b/test_runner/regress/test_wal_acceptor.py index 2f8e69165e..2cac58dc1a 100644 --- a/test_runner/regress/test_wal_acceptor.py +++ b/test_runner/regress/test_wal_acceptor.py @@ -28,7 +28,6 @@ from fixtures.neon_fixtures import ( PgBin, PgProtocol, Safekeeper, - SafekeeperHttpClient, SafekeeperPort, last_flush_lsn_upload, ) @@ -46,6 +45,8 @@ from fixtures.remote_storage import ( default_remote_storage, s3_storage, ) +from fixtures.safekeeper.http import SafekeeperHttpClient +from fixtures.safekeeper.utils import are_walreceivers_absent from fixtures.types import Lsn, TenantId, TimelineId from fixtures.utils import get_dir_size, query_scalar, start_in_background @@ -280,11 +281,6 @@ def test_broker(neon_env_builder: NeonEnvBuilder): tenant_id = env.initial_tenant timeline_id = env.neon_cli.create_branch("test_broker", "main") - # FIXME: Is this expected? - env.pageserver.allowed_errors.append( - ".*init_tenant_mgr: marking .* as locally complete, while it doesnt exist in remote index.*" - ) - endpoint = env.endpoints.create_start("test_broker") endpoint.safe_psql("CREATE TABLE t(key int primary key, value text)") @@ -342,11 +338,6 @@ def test_wal_removal(neon_env_builder: NeonEnvBuilder, auth_enabled: bool): neon_env_builder.auth_enabled = auth_enabled env = neon_env_builder.init_start() - # FIXME: Is this expected? 
- env.pageserver.allowed_errors.append( - ".*init_tenant_mgr: marking .* as locally complete, while it doesnt exist in remote index.*" - ) - tenant_id = env.initial_tenant timeline_id = env.neon_cli.create_branch("test_safekeepers_wal_removal") endpoint = env.endpoints.create_start("test_safekeepers_wal_removal") @@ -1107,12 +1098,6 @@ def is_flush_lsn_aligned(sk_http_clis, tenant_id, timeline_id): return all([flush_lsns[0] == flsn for flsn in flush_lsns]) -def are_walreceivers_absent(sk_http_cli, tenant_id: TenantId, timeline_id: TimelineId): - status = sk_http_cli.timeline_status(tenant_id, timeline_id) - log.info(f"waiting for walreceivers to be gone, currently {status.walreceivers}") - return len(status.walreceivers) == 0 - - # Assert by xxd that WAL on given safekeepers is identical. No compute must be # running for this to be reliable. def cmp_sk_wal(sks: List[Safekeeper], tenant_id: TenantId, timeline_id: TimelineId): @@ -1357,6 +1342,36 @@ def test_peer_recovery(neon_env_builder: NeonEnvBuilder): endpoint.safe_psql("insert into t select generate_series(1,100), 'payload'") +# Test that when compute is terminated in fast (or smart) mode, walproposer is +# allowed to run and self terminate after shutdown checkpoint is written, so it +# commits it to safekeepers before exiting. This not required for correctness, +# but needed for tests using check_restored_datadir_content. 
+def test_wp_graceful_shutdown(neon_env_builder: NeonEnvBuilder, pg_bin: PgBin): + neon_env_builder.num_safekeepers = 1 + env = neon_env_builder.init_start() + + tenant_id = env.initial_tenant + timeline_id = env.neon_cli.create_branch("test_wp_graceful_shutdown") + ep = env.endpoints.create_start("test_wp_graceful_shutdown") + ep.safe_psql("create table t(key int, value text)") + ep.stop() + + # figure out checkpoint lsn + ckpt_lsn = pg_bin.get_pg_controldata_checkpoint_lsn(ep.pg_data_dir_path()) + + sk_http_cli = env.safekeepers[0].http_client() + commit_lsn = sk_http_cli.timeline_status(tenant_id, timeline_id).commit_lsn + # Note: this is in memory value. Graceful shutdown of walproposer currently + # doesn't guarantee persisted value, which is ok as we need it only for + # tests. Persisting it without risking too many cf flushes needs a wp -> sk + # protocol change. (though in reality shutdown sync-safekeepers does flush + # of cf, so most of the time persisted value wouldn't lag) + log.info(f"sk commit_lsn {commit_lsn}") + # note that ckpt_lsn is the *beginning* of checkpoint record, so commit_lsn + # must be actually higher + assert commit_lsn > ckpt_lsn, "safekeeper must have checkpoint record" + + class SafekeeperEnv: def __init__( self, @@ -1946,3 +1961,51 @@ def test_timeline_copy(neon_env_builder: NeonEnvBuilder, insert_rows: int): assert orig_digest == new_digest # TODO: test timelines can start after copy + + +def test_patch_control_file(neon_env_builder: NeonEnvBuilder): + neon_env_builder.num_safekeepers = 1 + env = neon_env_builder.init_start() + + tenant_id = env.initial_tenant + timeline_id = env.initial_timeline + + endpoint = env.endpoints.create_start("main") + # initialize safekeeper + endpoint.safe_psql("create table t(key int, value text)") + + # update control file + res = ( + env.safekeepers[0] + .http_client() + .patch_control_file( + tenant_id, + timeline_id, + { + "timeline_start_lsn": "0/1", + }, + ) + ) + + timeline_start_lsn_before = 
res["old_control_file"]["timeline_start_lsn"] + timeline_start_lsn_after = res["new_control_file"]["timeline_start_lsn"] + + log.info(f"patch_control_file response: {res}") + log.info( + f"updated control file timeline_start_lsn, before {timeline_start_lsn_before}, after {timeline_start_lsn_after}" + ) + + assert timeline_start_lsn_after == "0/1" + env.safekeepers[0].stop().start() + + # wait/check that safekeeper is alive + endpoint.safe_psql("insert into t values (1, 'payload')") + + # check that timeline_start_lsn is updated + res = ( + env.safekeepers[0] + .http_client() + .debug_dump({"dump_control_file": "true", "timeline_id": str(timeline_id)}) + ) + log.info(f"dump_control_file response: {res}") + assert res["timelines"][0]["control_file"]["timeline_start_lsn"] == "0/1" diff --git a/test_runner/regress/test_wal_acceptor_async.py b/test_runner/regress/test_wal_acceptor_async.py index 77d67cd63a..720633189e 100644 --- a/test_runner/regress/test_wal_acceptor_async.py +++ b/test_runner/regress/test_wal_acceptor_async.py @@ -515,6 +515,42 @@ def test_recovery_uncommitted(neon_env_builder: NeonEnvBuilder): asyncio.run(run_recovery_uncommitted(env)) +async def run_segment_init_failure(env: NeonEnv): + env.neon_cli.create_branch("test_segment_init_failure") + ep = env.endpoints.create_start("test_segment_init_failure") + ep.safe_psql("create table t(key int, value text)") + ep.safe_psql("insert into t select generate_series(1, 100), 'payload'") + + sk = env.safekeepers[0] + sk_http = sk.http_client() + sk_http.configure_failpoints([("sk-write-zeroes", "return")]) + conn = await ep.connect_async() + ep.safe_psql("select pg_switch_wal()") # jump to the segment boundary + # next insertion should hang until failpoint is disabled. 
+ asyncio.create_task(conn.execute("insert into t select generate_series(1,1), 'payload'")) + sleep_sec = 2 + await asyncio.sleep(sleep_sec) + # also restart ep at segment boundary to make test more interesting + ep.stop() + # it must still be not finished + # assert not bg_query.done() + # Without segment rename during init (#6402) previous statement created + # partially initialized 16MB segment, so sk restart also triggers #6401. + sk.stop().start() + ep = env.endpoints.create_start("test_segment_init_failure") + ep.safe_psql("insert into t select generate_series(1,1), 'payload'") # should be ok now + + +# Test (injected) failure during WAL segment init. +# https://github.com/neondatabase/neon/issues/6401 +# https://github.com/neondatabase/neon/issues/6402 +def test_segment_init_failure(neon_env_builder: NeonEnvBuilder): + neon_env_builder.num_safekeepers = 1 + env = neon_env_builder.init_start() + + asyncio.run(run_segment_init_failure(env)) + + @dataclass class RaceConditionTest: iteration: int diff --git a/test_runner/regress/test_wal_restore.py b/test_runner/regress/test_wal_restore.py index 97db857c74..083a259d85 100644 --- a/test_runner/regress/test_wal_restore.py +++ b/test_runner/regress/test_wal_restore.py @@ -2,6 +2,7 @@ import sys import tarfile import tempfile from pathlib import Path +from typing import List import pytest import zstandard @@ -11,10 +12,17 @@ from fixtures.neon_fixtures import ( PgBin, VanillaPostgres, ) -from fixtures.pageserver.utils import timeline_delete_wait_completed +from fixtures.pageserver.utils import ( + list_prefix, + remote_storage_delete_key, + timeline_delete_wait_completed, +) from fixtures.port_distributor import PortDistributor -from fixtures.remote_storage import LocalFsStorage +from fixtures.remote_storage import LocalFsStorage, S3Storage, s3_storage from fixtures.types import Lsn, TenantId, TimelineId +from mypy_boto3_s3.type_defs import ( + ObjectTypeDef, +) @pytest.mark.skipif( @@ -128,7 +136,11 @@ def 
test_wal_restore_initdb( assert restored.safe_psql("select count(*) from t", user="cloud_admin") == [(300000,)] -def test_wal_restore_http(neon_env_builder: NeonEnvBuilder): +@pytest.mark.parametrize("broken_tenant", [True, False]) +def test_wal_restore_http(neon_env_builder: NeonEnvBuilder, broken_tenant: bool): + remote_storage_kind = s3_storage() + neon_env_builder.enable_pageserver_remote_storage(remote_storage_kind) + env = neon_env_builder.init_start() endpoint = env.endpoints.create_start("main") endpoint.safe_psql("create table t as select generate_series(1,300000)") @@ -137,15 +149,36 @@ def test_wal_restore_http(neon_env_builder: NeonEnvBuilder): ps_client = env.pageserver.http_client() + if broken_tenant: + env.pageserver.allowed_errors.append( + r".* Changing Active tenant to Broken state, reason: broken from test" + ) + ps_client.tenant_break(tenant_id) + # Mark the initdb archive for preservation ps_client.timeline_preserve_initdb_archive(tenant_id, timeline_id) # shut down the endpoint and delete the timeline from the pageserver endpoint.stop() - assert isinstance(env.pageserver_remote_storage, LocalFsStorage) + assert isinstance(env.pageserver_remote_storage, S3Storage) - timeline_delete_wait_completed(ps_client, tenant_id, timeline_id) + if broken_tenant: + ps_client.tenant_detach(tenant_id) + objects: List[ObjectTypeDef] = list_prefix( + env.pageserver_remote_storage, f"tenants/{tenant_id}/timelines/{timeline_id}/" + ).get("Contents", []) + for obj in objects: + obj_key = obj["Key"] + if "initdb-preserved.tar.zst" in obj_key: + continue + log.info(f"Deleting key from remote storage: {obj_key}") + remote_storage_delete_key(env.pageserver_remote_storage, obj_key) + pass + + ps_client.tenant_attach(tenant_id, generation=10) + else: + timeline_delete_wait_completed(ps_client, tenant_id, timeline_id) # issue the restoration command ps_client.timeline_create( diff --git a/test_runner/sql_regress/expected/neon-test-utils.out 
b/test_runner/sql_regress/expected/neon-test-utils.out new file mode 100644 index 0000000000..7d1634a6b8 --- /dev/null +++ b/test_runner/sql_regress/expected/neon-test-utils.out @@ -0,0 +1,28 @@ +-- Test the test utils in pgxn/neon_test_utils. We don't test that +-- these actually consume resources like they should - that would be +-- tricky - but at least we check that they don't crash. +CREATE EXTENSION neon_test_utils; +select test_consume_cpu(1); + test_consume_cpu +------------------ + +(1 row) + +select test_consume_memory(20); -- Allocate 20 MB + test_consume_memory +--------------------- + +(1 row) + +select test_release_memory(5); -- Release 5 MB + test_release_memory +--------------------- + +(1 row) + +select test_release_memory(); -- Release the remaining 15 MB + test_release_memory +--------------------- + +(1 row) + diff --git a/test_runner/sql_regress/parallel_schedule b/test_runner/sql_regress/parallel_schedule index 569c7b5066..d9508d1c90 100644 --- a/test_runner/sql_regress/parallel_schedule +++ b/test_runner/sql_regress/parallel_schedule @@ -7,4 +7,5 @@ test: neon-cid test: neon-rel-truncate test: neon-clog +test: neon-test-utils test: neon-vacuum-full diff --git a/test_runner/sql_regress/sql/neon-test-utils.sql b/test_runner/sql_regress/sql/neon-test-utils.sql new file mode 100644 index 0000000000..c5ca6c624b --- /dev/null +++ b/test_runner/sql_regress/sql/neon-test-utils.sql @@ -0,0 +1,11 @@ +-- Test the test utils in pgxn/neon_test_utils. We don't test that +-- these actually consume resources like they should - that would be +-- tricky - but at least we check that they don't crash. 
+ +CREATE EXTENSION neon_test_utils; + +select test_consume_cpu(1); + +select test_consume_memory(20); -- Allocate 20 MB +select test_release_memory(5); -- Release 5 MB +select test_release_memory(); -- Release the remaining 15 MB diff --git a/trace/src/main.rs b/trace/src/main.rs index ddd970e95d..4605c124e9 100644 --- a/trace/src/main.rs +++ b/trace/src/main.rs @@ -60,6 +60,7 @@ fn analyze_trace(mut reader: R) { match msg { PagestreamFeMessage::Exists(_) => {} PagestreamFeMessage::Nblocks(_) => {} + PagestreamFeMessage::GetSlruSegment(_) => {} PagestreamFeMessage::GetPage(req) => { total += 1; diff --git a/vendor/postgres-v14 b/vendor/postgres-v14 index 11e970fe2b..b980d6f090 160000 --- a/vendor/postgres-v14 +++ b/vendor/postgres-v14 @@ -1 +1 @@ -Subproject commit 11e970fe2be56804f0a786ec5fc8141ffefa4ca7 +Subproject commit b980d6f090c676e55fb2c830fb2434f532f635c0 diff --git a/vendor/postgres-v15 b/vendor/postgres-v15 index 731b4d1609..56f32c0e73 160000 --- a/vendor/postgres-v15 +++ b/vendor/postgres-v15 @@ -1 +1 @@ -Subproject commit 731b4d1609d6db1c953755810a41e0e67ea3db7b +Subproject commit 56f32c0e7330d17aaeee8bf211a73995180bd133 diff --git a/vendor/postgres-v16 b/vendor/postgres-v16 index cf302768b2..9007894722 160000 --- a/vendor/postgres-v16 +++ b/vendor/postgres-v16 @@ -1 +1 @@ -Subproject commit cf302768b2890569956641e0e5ba112ae1445351 +Subproject commit 90078947229aa7f9ac5f7ed4527b2c7386d5332b diff --git a/vendor/revisions.json b/vendor/revisions.json index c7b33f8c8a..1941c235ee 100644 --- a/vendor/revisions.json +++ b/vendor/revisions.json @@ -1,5 +1,5 @@ { - "postgres-v16": "cf302768b2890569956641e0e5ba112ae1445351", - "postgres-v15": "731b4d1609d6db1c953755810a41e0e67ea3db7b", - "postgres-v14": "11e970fe2be56804f0a786ec5fc8141ffefa4ca7" + "postgres-v16": "90078947229aa7f9ac5f7ed4527b2c7386d5332b", + "postgres-v15": "56f32c0e7330d17aaeee8bf211a73995180bd133", + "postgres-v14": "b980d6f090c676e55fb2c830fb2434f532f635c0" } diff --git 
a/vm-image-spec.yaml b/vm-image-spec.yaml index bbe80ceeb1..5b93088303 100644 --- a/vm-image-spec.yaml +++ b/vm-image-spec.yaml @@ -102,7 +102,7 @@ files: - metric_name: lfc_used type: gauge - help: 'lfc_used' + help: 'LFC chunks used (chunk = 1MB)' key_labels: values: [lfc_used] query: | @@ -124,6 +124,69 @@ files: query: | select lfc_value as lfc_writes from neon.neon_lfc_stats where lfc_key='file_cache_writes'; + - metric_name: lfc_cache_size_limit + type: gauge + help: 'LFC cache size limit in bytes' + key_labels: + values: [lfc_cache_size_limit] + query: | + select pg_size_bytes(current_setting('neon.file_cache_size_limit')) as lfc_cache_size_limit; + + - metric_name: connection_counts + type: gauge + help: 'Connection counts' + key_labels: + - datname + - state + values: [count] + query: | + select datname, state, count(*) as count from pg_stat_activity where state <> '' group by datname, state; + + - metric_name: pg_stats_userdb + type: gauge + help: 'Stats for the oldest non-system db' + key_labels: + - datname + value_label: kind + values: + - db_size + - deadlocks + # Rows + - inserted + - updated + - deleted + # We export stats for only one non-system database. Without this limit + # it is too easy to abuse the system by creating lots of databases. + # We can try lifting this limit in the future after we understand the needs better. 
+ query: | + select pg_database_size(datname) as db_size, deadlocks, + tup_inserted as inserted, tup_updated as updated, tup_deleted as deleted, + datname + from pg_stat_database + where datname IN ( + select datname + from pg_database + where datname <> 'postgres' and not datistemplate + order by oid + limit 1 + ); + + - metric_name: max_cluster_size + type: gauge + help: 'neon.max_cluster_size setting' + key_labels: + values: [max_cluster_size] + query: | + select setting::int as max_cluster_size from pg_settings where name = 'neon.max_cluster_size'; + + - metric_name: db_total_size + type: gauge + help: 'Size of all databases' + key_labels: + values: [total] + query: | + select sum(pg_database_size(datname)) as total from pg_database; + build: | # Build cgroup-tools # @@ -158,7 +221,7 @@ build: | # actually build the thing... && make install - FROM quay.io/prometheuscommunity/postgres-exporter:v0.12.0 AS postgres-exporter + FROM quay.io/prometheuscommunity/postgres-exporter:v0.12.1 AS postgres-exporter FROM burningalchemist/sql_exporter:0.13 AS sql-exporter @@ -174,11 +237,10 @@ build: | libtool \ pkg-config - # Note, we use pgbouncer from neondatabase/pgbouncer fork, which could contain extra commits. 
# Use `dist_man_MANS=` to skip manpage generation (which requires python3/pandoc) - ENV PGBOUNCER_TAG pgbouncer_1_21_0-neon-1 + ENV PGBOUNCER_TAG pgbouncer_1_22_1 RUN set -e \ - && git clone --recurse-submodules --depth 1 --branch ${PGBOUNCER_TAG} https://github.com/neondatabase/pgbouncer.git pgbouncer \ + && git clone --recurse-submodules --depth 1 --branch ${PGBOUNCER_TAG} https://github.com/pgbouncer/pgbouncer.git pgbouncer \ && cd pgbouncer \ && ./autogen.sh \ && LDFLAGS=-static ./configure --prefix=/usr/local/pgbouncer --without-openssl \ diff --git a/workspace_hack/Cargo.toml b/workspace_hack/Cargo.toml index b72e0f3c26..8593b752c2 100644 --- a/workspace_hack/Cargo.toml +++ b/workspace_hack/Cargo.toml @@ -15,7 +15,7 @@ publish = false [dependencies] anyhow = { version = "1", features = ["backtrace"] } aws-config = { version = "1", default-features = false, features = ["rustls", "sso"] } -aws-runtime = { version = "1", default-features = false, features = ["event-stream", "sigv4a"] } +aws-runtime = { version = "1", default-features = false, features = ["event-stream", "http-02x", "sigv4a"] } aws-sigv4 = { version = "1", features = ["http0-compat", "sign-eventstream", "sigv4a"] } aws-smithy-async = { version = "1", default-features = false, features = ["rt-tokio"] } aws-smithy-http = { version = "0.60", default-features = false, features = ["event-stream"] } @@ -38,19 +38,20 @@ futures-io = { version = "0.3" } futures-sink = { version = "0.3" } futures-util = { version = "0.3", features = ["channel", "io", "sink"] } getrandom = { version = "0.2", default-features = false, features = ["std"] } -hashbrown = { version = "0.14", default-features = false, features = ["raw"] } +hashbrown-582f2526e08bb6a0 = { package = "hashbrown", version = "0.14", features = ["raw"] } +hashbrown-594e8ee84c453af0 = { package = "hashbrown", version = "0.13", features = ["raw"] } hex = { version = "0.4", features = ["serde"] } hmac = { version = "0.12", default-features = false, 
features = ["reset"] } hyper = { version = "0.14", features = ["full"] } indexmap = { version = "1", default-features = false, features = ["std"] } itertools = { version = "0.10" } -libc = { version = "0.2", features = ["extra_traits"] } +libc = { version = "0.2", features = ["extra_traits", "use_std"] } log = { version = "0.4", default-features = false, features = ["std"] } memchr = { version = "2" } nom = { version = "7" } num-bigint = { version = "0.4" } num-integer = { version = "0.1", features = ["i128"] } -num-traits = { version = "0.2", features = ["i128"] } +num-traits = { version = "0.2", features = ["i128", "libm"] } once_cell = { version = "1" } parquet = { git = "https://github.com/neondatabase/arrow-rs", branch = "neon-fix-bugs", default-features = false, features = ["zstd"] } prost = { version = "0.11" } @@ -59,7 +60,6 @@ regex = { version = "1" } regex-automata = { version = "0.4", default-features = false, features = ["dfa-onepass", "hybrid", "meta", "nfa-backtrack", "perf-inline", "perf-literal", "unicode"] } regex-syntax = { version = "0.8" } reqwest = { version = "0.11", default-features = false, features = ["blocking", "default-tls", "json", "multipart", "rustls-tls", "stream"] } -ring = { version = "0.16" } rustls = { version = "0.21", features = ["dangerous_configuration"] } scopeguard = { version = "1" } serde = { version = "1", features = ["alloc", "derive"] } @@ -79,6 +79,7 @@ tracing-core = { version = "0.1" } tungstenite = { version = "0.20" } url = { version = "2", features = ["serde"] } uuid = { version = "1", features = ["serde", "v4", "v7"] } +zeroize = { version = "1", features = ["derive"] } zstd = { version = "0.13" } zstd-safe = { version = "7", default-features = false, features = ["arrays", "legacy", "std", "zdict_builder"] } zstd-sys = { version = "2", default-features = false, features = ["legacy", "std", "zdict_builder"] } @@ -90,16 +91,16 @@ cc = { version = "1", default-features = false, features = ["parallel"] } chrono = { 
version = "0.4", default-features = false, features = ["clock", "serde", "wasmbind"] } either = { version = "1" } getrandom = { version = "0.2", default-features = false, features = ["std"] } -hashbrown = { version = "0.14", default-features = false, features = ["raw"] } +hashbrown-582f2526e08bb6a0 = { package = "hashbrown", version = "0.14", features = ["raw"] } indexmap = { version = "1", default-features = false, features = ["std"] } itertools = { version = "0.10" } -libc = { version = "0.2", features = ["extra_traits"] } +libc = { version = "0.2", features = ["extra_traits", "use_std"] } log = { version = "0.4", default-features = false, features = ["std"] } memchr = { version = "2" } nom = { version = "7" } num-bigint = { version = "0.4" } num-integer = { version = "0.1", features = ["i128"] } -num-traits = { version = "0.2", features = ["i128"] } +num-traits = { version = "0.2", features = ["i128", "libm"] } once_cell = { version = "1" } parquet = { git = "https://github.com/neondatabase/arrow-rs", branch = "neon-fix-bugs", default-features = false, features = ["zstd"] } prost = { version = "0.11" } @@ -108,8 +109,10 @@ regex-automata = { version = "0.4", default-features = false, features = ["dfa-o regex-syntax = { version = "0.8" } serde = { version = "1", features = ["alloc", "derive"] } syn-dff4ba8e3ae991db = { package = "syn", version = "1", features = ["extra-traits", "full", "visit"] } -syn-f595c2ba2a3f28df = { package = "syn", version = "2", features = ["extra-traits", "full", "visit", "visit-mut"] } +syn-f595c2ba2a3f28df = { package = "syn", version = "2", features = ["extra-traits", "fold", "full", "visit", "visit-mut"] } time-macros = { version = "0.2", default-features = false, features = ["formatting", "parsing", "serde"] } +toml_datetime = { version = "0.6", default-features = false, features = ["serde"] } +toml_edit = { version = "0.19", features = ["serde"] } zstd = { version = "0.13" } zstd-safe = { version = "7", default-features = false, 
features = ["arrays", "legacy", "std", "zdict_builder"] } zstd-sys = { version = "2", default-features = false, features = ["legacy", "std", "zdict_builder"] }