diff --git a/.config/nextest.toml b/.config/nextest.toml new file mode 100644 index 0000000000..8bccd51c6d --- /dev/null +++ b/.config/nextest.toml @@ -0,0 +1,2 @@ +[profile.default] +slow-timeout = "1m" diff --git a/.github/workflows/build_and_push_docker_image.yml b/.github/workflows/build_and_push_docker_image.yml new file mode 100644 index 0000000000..e401b2f418 --- /dev/null +++ b/.github/workflows/build_and_push_docker_image.yml @@ -0,0 +1,105 @@ +name: Build and Push Docker Image + +on: + workflow_call: + inputs: + dockerfile-path: + required: true + type: string + image-name: + required: true + type: string + outputs: + build-tools-tag: + description: "tag generated for build tools" + value: ${{ jobs.tag.outputs.build-tools-tag }} + +jobs: + check-if-build-tools-dockerfile-changed: + runs-on: ubuntu-latest + outputs: + docker_file_changed: ${{ steps.dockerfile.outputs.docker_file_changed }} + steps: + - name: Check if Dockerfile.buildtools has changed + id: dockerfile + run: | + if [[ "$GITHUB_EVENT_NAME" != "pull_request" ]]; then + echo "docker_file_changed=false" >> $GITHUB_OUTPUT + exit + fi + updated_files=$(gh pr --repo neondatabase/neon diff ${{ github.event.pull_request.number }} --name-only) + if [[ $updated_files == *"Dockerfile.buildtools"* ]]; then + echo "docker_file_changed=true" >> $GITHUB_OUTPUT + fi + env: + GH_TOKEN: ${{ secrets.GITHUB_TOKEN }} + + tag: + runs-on: ubuntu-latest + needs: [ check-if-build-tools-dockerfile-changed ] + outputs: + build-tools-tag: ${{steps.buildtools-tag.outputs.image_tag}} + + steps: + - name: Get buildtools tag + env: + DOCKERFILE_CHANGED: ${{ needs.check-if-build-tools-dockerfile-changed.outputs.docker_file_changed }} + run: | + if [[ "$GITHUB_EVENT_NAME" == "pull_request" ]] && [[ "${DOCKERFILE_CHANGED}" == "true" ]]; then + IMAGE_TAG=$GITHUB_RUN_ID + else + IMAGE_TAG=pinned + fi + + echo "image_tag=${IMAGE_TAG}" >> $GITHUB_OUTPUT + shell: bash + id: buildtools-tag + + kaniko: + if: 
needs.check-if-build-tools-dockerfile-changed.outputs.docker_file_changed == 'true' + needs: [ tag, check-if-build-tools-dockerfile-changed ] + runs-on: [ self-hosted, dev, x64 ] + container: gcr.io/kaniko-project/executor:v1.7.0-debug + + steps: + - name: Checkout + uses: actions/checkout@v1 + + - name: Configure ECR login + run: echo "{\"credsStore\":\"ecr-login\"}" > /kaniko/.docker/config.json + + - name: Kaniko build + run: /kaniko/executor --reproducible --snapshotMode=redo --skip-unused-stages --dockerfile ${{ inputs.dockerfile-path }} --cache=true --cache-repo 369495373322.dkr.ecr.eu-central-1.amazonaws.com/cache --destination 369495373322.dkr.ecr.eu-central-1.amazonaws.com/${{ inputs.image-name }}:${{ needs.tag.outputs.build-tools-tag }}-amd64 + + kaniko-arm: + if: needs.check-if-build-tools-dockerfile-changed.outputs.docker_file_changed == 'true' + needs: [ tag, check-if-build-tools-dockerfile-changed ] + runs-on: [ self-hosted, dev, arm64 ] + container: gcr.io/kaniko-project/executor:v1.7.0-debug + + steps: + - name: Checkout + uses: actions/checkout@v1 + + - name: Configure ECR login + run: echo "{\"credsStore\":\"ecr-login\"}" > /kaniko/.docker/config.json + + - name: Kaniko build + run: /kaniko/executor --reproducible --snapshotMode=redo --skip-unused-stages --dockerfile ${{ inputs.dockerfile-path }} --cache=true --cache-repo 369495373322.dkr.ecr.eu-central-1.amazonaws.com/cache --destination 369495373322.dkr.ecr.eu-central-1.amazonaws.com/${{ inputs.image-name }}:${{ needs.tag.outputs.build-tools-tag }}-arm64 + + manifest: + if: needs.check-if-build-tools-dockerfile-changed.outputs.docker_file_changed == 'true' + name: 'manifest' + runs-on: [ self-hosted, dev, x64 ] + needs: + - tag + - kaniko + - kaniko-arm + - check-if-build-tools-dockerfile-changed + + steps: + - name: Create manifest + run: docker manifest create 369495373322.dkr.ecr.eu-central-1.amazonaws.com/${{ inputs.image-name }}:${{ needs.tag.outputs.build-tools-tag }} --amend 
369495373322.dkr.ecr.eu-central-1.amazonaws.com/${{ inputs.image-name }}:${{ needs.tag.outputs.build-tools-tag }}-amd64 --amend 369495373322.dkr.ecr.eu-central-1.amazonaws.com/${{ inputs.image-name }}:${{ needs.tag.outputs.build-tools-tag }}-arm64 + + - name: Push manifest + run: docker manifest push 369495373322.dkr.ecr.eu-central-1.amazonaws.com/${{ inputs.image-name }}:${{ needs.tag.outputs.build-tools-tag }} diff --git a/.github/workflows/build_and_test.yml b/.github/workflows/build_and_test.yml index 96cc2997fa..b6d5b598d4 100644 --- a/.github/workflows/build_and_test.yml +++ b/.github/workflows/build_and_test.yml @@ -44,7 +44,6 @@ jobs: exit 1 - tag: needs: [ check-permissions ] runs-on: [ self-hosted, gen3, small ] @@ -74,11 +73,19 @@ jobs: shell: bash id: build-tag - check-codestyle-python: + build-buildtools-image: needs: [ check-permissions ] + uses: ./.github/workflows/build_and_push_docker_image.yml + with: + dockerfile-path: Dockerfile.buildtools + image-name: build-tools + secrets: inherit + + check-codestyle-python: + needs: [ check-permissions, build-buildtools-image ] runs-on: [ self-hosted, gen3, small ] container: - image: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/rust:pinned + image: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/build-tools:${{ needs.build-buildtools-image.outputs.build-tools-tag }} options: --init steps: @@ -98,20 +105,20 @@ jobs: - name: Install Python deps run: ./scripts/pysync - - name: Run ruff to ensure code format - run: poetry run ruff . + - name: Run `ruff check` to ensure code format + run: poetry run ruff check . - - name: Run black to ensure code format - run: poetry run black --diff --check . + - name: Run `ruff format` to ensure code format + run: poetry run ruff format --check . - name: Run mypy to check types run: poetry run mypy . 
check-codestyle-rust: - needs: [ check-permissions ] + needs: [ check-permissions, build-buildtools-image ] runs-on: [ self-hosted, gen3, large ] container: - image: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/rust:pinned + image: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/build-tools:${{ needs.build-buildtools-image.outputs.build-tools-tag }} options: --init steps: @@ -175,10 +182,10 @@ jobs: run: cargo deny check --hide-inclusion-graph build-neon: - needs: [ check-permissions, tag ] + needs: [ check-permissions, tag, build-buildtools-image ] runs-on: [ self-hosted, gen3, large ] container: - image: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/rust:pinned + image: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/build-tools:${{ needs.build-buildtools-image.outputs.build-tools-tag }} options: --init strategy: fail-fast: false @@ -332,18 +339,18 @@ jobs: run: | ${cov_prefix} mold -run cargo build $CARGO_FLAGS $CARGO_FEATURES --bins --tests - - name: Run cargo test + - name: Run rust tests run: | for io_engine in std-fs tokio-epoll-uring ; do - NEON_PAGESERVER_UNIT_TEST_VIRTUAL_FILE_IOENGINE=$io_engine ${cov_prefix} cargo test $CARGO_FLAGS $CARGO_FEATURES + NEON_PAGESERVER_UNIT_TEST_VIRTUAL_FILE_IOENGINE=$io_engine ${cov_prefix} cargo nextest run $CARGO_FLAGS $CARGO_FEATURES done # Run separate tests for real S3 export ENABLE_REAL_S3_REMOTE_STORAGE=nonempty - export REMOTE_STORAGE_S3_BUCKET=neon-github-public-dev + export REMOTE_STORAGE_S3_BUCKET=neon-github-ci-tests export REMOTE_STORAGE_S3_REGION=eu-central-1 # Avoid `$CARGO_FEATURES` since there's no `testing` feature in the e2e tests now - ${cov_prefix} cargo test $CARGO_FLAGS --package remote_storage --test test_real_s3 + ${cov_prefix} cargo nextest run $CARGO_FLAGS -E 'package(remote_storage) & binary(test_real_s3)' # Run separate tests for real Azure Blob Storage # XXX: replace region with `eu-central-1`-like region @@ -353,7 +360,7 @@ jobs: export REMOTE_STORAGE_AZURE_CONTAINER="${{ 
vars.REMOTE_STORAGE_AZURE_CONTAINER }}" export REMOTE_STORAGE_AZURE_REGION="${{ vars.REMOTE_STORAGE_AZURE_REGION }}" # Avoid `$CARGO_FEATURES` since there's no `testing` feature in the e2e tests now - ${cov_prefix} cargo test $CARGO_FLAGS --package remote_storage --test test_real_azure + ${cov_prefix} cargo nextest run $CARGO_FLAGS -E 'package(remote_storage) & binary(test_real_azure)' - name: Install rust binaries run: | @@ -410,10 +417,10 @@ jobs: uses: ./.github/actions/save-coverage-data regress-tests: - needs: [ check-permissions, build-neon, tag ] + needs: [ check-permissions, build-neon, build-buildtools-image, tag ] runs-on: [ self-hosted, gen3, large ] container: - image: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/rust:pinned + image: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/build-tools:${{ needs.build-buildtools-image.outputs.build-tools-tag }} # Default shared memory is 64mb options: --init --shm-size=512mb strategy: @@ -451,10 +458,10 @@ jobs: uses: ./.github/actions/save-coverage-data benchmarks: - needs: [ check-permissions, build-neon ] + needs: [ check-permissions, build-neon, build-buildtools-image ] runs-on: [ self-hosted, gen3, small ] container: - image: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/rust:pinned + image: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/build-tools:${{ needs.build-buildtools-image.outputs.build-tools-tag }} # Default shared memory is 64mb options: --init --shm-size=512mb if: github.ref_name == 'main' || contains(github.event.pull_request.labels.*.name, 'run-benchmarks') @@ -485,12 +492,12 @@ jobs: # while coverage is currently collected for the debug ones create-test-report: - needs: [ check-permissions, regress-tests, coverage-report, benchmarks ] + needs: [ check-permissions, regress-tests, coverage-report, benchmarks, build-buildtools-image ] if: ${{ !cancelled() && contains(fromJSON('["skipped", "success"]'), needs.check-permissions.result) }} runs-on: [ self-hosted, gen3, small ] container: - 
image: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/rust:pinned + image: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/build-tools:${{ needs.build-buildtools-image.outputs.build-tools-tag }} options: --init steps: @@ -532,11 +539,10 @@ jobs: }) coverage-report: - needs: [ check-permissions, regress-tests ] - + needs: [ check-permissions, regress-tests, build-buildtools-image ] runs-on: [ self-hosted, gen3, small ] container: - image: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/rust:pinned + image: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/build-tools:${{ needs.build-buildtools-image.outputs.build-tools-tag }} options: --init strategy: fail-fast: false @@ -700,7 +706,7 @@ jobs: }" neon-image: - needs: [ check-permissions, tag ] + needs: [ check-permissions, build-buildtools-image, tag ] runs-on: [ self-hosted, gen3, large ] container: gcr.io/kaniko-project/executor:v1.9.2-debug defaults: @@ -739,6 +745,7 @@ jobs: --context . --build-arg GIT_VERSION=${{ github.event.pull_request.head.sha || github.sha }} --build-arg BUILD_TAG=${{ needs.tag.outputs.build-tag }} + --build-arg TAG=${{ needs.build-buildtools-image.outputs.build-tools-tag }} --build-arg REPOSITORY=369495373322.dkr.ecr.eu-central-1.amazonaws.com --destination 369495373322.dkr.ecr.eu-central-1.amazonaws.com/neon:${{needs.tag.outputs.build-tag}} --destination neondatabase/neon:${{needs.tag.outputs.build-tag}} @@ -749,7 +756,7 @@ jobs: compute-tools-image: runs-on: [ self-hosted, gen3, large ] - needs: [ check-permissions, tag ] + needs: [ check-permissions, build-buildtools-image, tag ] container: gcr.io/kaniko-project/executor:v1.9.2-debug defaults: run: @@ -784,6 +791,7 @@ jobs: --context . 
--build-arg GIT_VERSION=${{ github.event.pull_request.head.sha || github.sha }} --build-arg BUILD_TAG=${{needs.tag.outputs.build-tag}} + --build-arg TAG=${{needs.build-buildtools-image.outputs.build-tools-tag}} --build-arg REPOSITORY=369495373322.dkr.ecr.eu-central-1.amazonaws.com --dockerfile Dockerfile.compute-tools --destination 369495373322.dkr.ecr.eu-central-1.amazonaws.com/compute-tools:${{needs.tag.outputs.build-tag}} @@ -794,7 +802,7 @@ jobs: run: rm -rf ~/.ecr compute-node-image: - needs: [ check-permissions, tag ] + needs: [ check-permissions, build-buildtools-image, tag ] runs-on: [ self-hosted, gen3, large ] container: image: gcr.io/kaniko-project/executor:v1.9.2-debug @@ -842,6 +850,7 @@ jobs: --build-arg GIT_VERSION=${{ github.event.pull_request.head.sha || github.sha }} --build-arg PG_VERSION=${{ matrix.version }} --build-arg BUILD_TAG=${{needs.tag.outputs.build-tag}} + --build-arg TAG=${{needs.build-buildtools-image.outputs.build-tools-tag}} --build-arg REPOSITORY=369495373322.dkr.ecr.eu-central-1.amazonaws.com --dockerfile Dockerfile.compute-node --destination 369495373322.dkr.ecr.eu-central-1.amazonaws.com/compute-node-${{ matrix.version }}:${{needs.tag.outputs.build-tag}} diff --git a/.github/workflows/neon_extra_builds.yml b/.github/workflows/neon_extra_builds.yml index b1ea5e4f74..c6c2b7386a 100644 --- a/.github/workflows/neon_extra_builds.yml +++ b/.github/workflows/neon_extra_builds.yml @@ -218,7 +218,7 @@ jobs: # Run separate tests for real S3 export ENABLE_REAL_S3_REMOTE_STORAGE=nonempty - export REMOTE_STORAGE_S3_BUCKET=neon-github-public-dev + export REMOTE_STORAGE_S3_BUCKET=neon-github-ci-tests export REMOTE_STORAGE_S3_REGION=eu-central-1 # Avoid `$CARGO_FEATURES` since there's no `testing` feature in the e2e tests now cargo test $CARGO_FLAGS --package remote_storage --test test_real_s3 diff --git a/.github/workflows/update_build_tools_image.yml b/.github/workflows/update_build_tools_image.yml new file mode 100644 index 
0000000000..88bab797b7 --- /dev/null +++ b/.github/workflows/update_build_tools_image.yml @@ -0,0 +1,130 @@ +name: 'Update build tools image tag' + +# This workflow is used to update the tag of the build tools image in ECR. +# The most common use case is adding/moving the `pinned` tag to the `${GITHUB_RUN_ID}` image. + +on: + workflow_dispatch: + inputs: + from-tag: + description: 'Source tag' + required: true + type: string + to-tag: + description: 'Destination tag' + required: true + type: string + default: 'pinned' + +defaults: + run: + shell: bash -euo pipefail {0} + +env: + AWS_ACCESS_KEY_ID: ${{ secrets.AWS_ACCESS_KEY_DEV }} + AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_SECRET_KEY_DEV }} + +permissions: {} + +jobs: + tag-image: + runs-on: [ self-hosted, gen3, small ] + container: golang:1.19-bullseye + + env: + IMAGE: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/build-tools + FROM_TAG: ${{ inputs.from-tag }} + TO_TAG: ${{ inputs.to-tag }} + outputs: + next-digest-buildtools: ${{ steps.next-digest.outputs.next-digest-buildtools }} + prev-digest-buildtools: ${{ steps.prev-digest.outputs.prev-digest-buildtools }} + + steps: + - name: Install Crane & ECR helper + run: | + go install github.com/google/go-containerregistry/cmd/crane@a54d64203cffcbf94146e04069aae4a97f228ee2 # v0.16.1 + go install github.com/awslabs/amazon-ecr-credential-helper/ecr-login/cli/docker-credential-ecr-login@adf1bafd791ae7d4ff098108b1e91f36a4da5404 # v0.7.1 + + - name: Configure ECR login + run: | + mkdir /github/home/.docker/ + echo "{\"credsStore\":\"ecr-login\"}" > /github/home/.docker/config.json + + - name: Get source image digest + id: next-digest + run: | + NEXT_DIGEST=$(crane digest ${IMAGE}:${FROM_TAG} || true) + if [ -z "${NEXT_DIGEST}" ]; then + echo >&2 "Image ${IMAGE}:${FROM_TAG} does not exist" + exit 1 + fi + + echo "Current ${IMAGE}@${FROM_TAG} image is ${IMAGE}@${NEXT_DIGEST}" + echo "next-digest-buildtools=$NEXT_DIGEST" >> $GITHUB_OUTPUT + + - name: Get destination image digest (if already 
exists) + id: prev-digest + run: | + PREV_DIGEST=$(crane digest ${IMAGE}:${TO_TAG} || true) + if [ -z "${PREV_DIGEST}" ]; then + echo >&2 "Image ${IMAGE}:${TO_TAG} does not exist (it's ok)" + else + echo >&2 "Current ${IMAGE}@${TO_TAG} image is ${IMAGE}@${PREV_DIGEST}" + + echo "prev-digest-buildtools=$PREV_DIGEST" >> $GITHUB_OUTPUT + fi + + - name: Tag image + run: | + crane tag "${IMAGE}:${FROM_TAG}" "${TO_TAG}" + + rollback-tag-image: + needs: tag-image + if: ${{ !success() }} + + runs-on: [ self-hosted, gen3, small ] + container: golang:1.19-bullseye + + env: + IMAGE: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/build-tools + FROM_TAG: ${{ inputs.from-tag }} + TO_TAG: ${{ inputs.to-tag }} + + steps: + - name: Install Crane & ECR helper + run: | + go install github.com/google/go-containerregistry/cmd/crane@a54d64203cffcbf94146e04069aae4a97f228ee2 # v0.16.1 + go install github.com/awslabs/amazon-ecr-credential-helper/ecr-login/cli/docker-credential-ecr-login@adf1bafd791ae7d4ff098108b1e91f36a4da5404 # v0.7.1 + + - name: Configure ECR login + run: | + mkdir /github/home/.docker/ + echo "{\"credsStore\":\"ecr-login\"}" > /github/home/.docker/config.json + + - name: Restore previous tag if needed + run: | + NEXT_DIGEST="${{ needs.tag-image.outputs.next-digest-buildtools }}" + PREV_DIGEST="${{ needs.tag-image.outputs.prev-digest-buildtools }}" + + if [ -z "${NEXT_DIGEST}" ]; then + echo >&2 "Image ${IMAGE}:${FROM_TAG} does not exist, nothing to rollback" + exit 0 + fi + + if [ -z "${PREV_DIGEST}" ]; then + # I guess we should delete the tag here/untag the image, but crane does not support it + # - https://github.com/google/go-containerregistry/issues/999 + + echo >&2 "Image ${IMAGE}:${TO_TAG} did not exist, but it was created by the job, no need to rollback" + + exit 0 + fi + + CURRENT_DIGEST=$(crane digest "${IMAGE}:${TO_TAG}") + if [ "${CURRENT_DIGEST}" == "${NEXT_DIGEST}" ]; then + crane tag "${IMAGE}@${PREV_DIGEST}" "${TO_TAG}" + + echo >&2 "Successfully 
restored ${TO_TAG} tag from ${IMAGE}@${CURRENT_DIGEST} to ${IMAGE}@${PREV_DIGEST}" + else + echo >&2 "Image ${IMAGE}:${TO_TAG}@${CURRENT_DIGEST} is not required to be restored" + fi diff --git a/.gitignore b/.gitignore index c5fc121ac2..3f4495c9e7 100644 --- a/.gitignore +++ b/.gitignore @@ -6,6 +6,7 @@ __pycache__/ test_output/ .vscode .idea +neon.iml /.neon /integration_tests/.neon diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md index 2692684006..b318c295a3 100644 --- a/CONTRIBUTING.md +++ b/CONTRIBUTING.md @@ -70,3 +70,17 @@ We're using the following approach to make it work: - The label gets removed automatically, so to run CI again with new changes, the label should be added again (after the review) For details see [`approved-for-ci-run.yml`](.github/workflows/approved-for-ci-run.yml) + +## How do I add the "pinned" tag to a buildtools image? +We use the `pinned` tag for `Dockerfile.buildtools` build images in our CI/CD setup. Currently, adding the `pinned` tag is a manual operation. + +You can trigger the workflow from the GitHub UI: https://github.com/neondatabase/neon/actions/workflows/update_build_tools_image.yml, +or using GitHub CLI: + +```bash +gh workflow -R neondatabase/neon run update_build_tools_image.yml \ + -f from-tag=6254913013 \ + -f to-tag=pinned + +# Default `-f to-tag` is `pinned`, so the parameter can be omitted. 
+``` \ No newline at end of file diff --git a/Cargo.lock b/Cargo.lock index 6ebc2389c5..fc8c652031 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -30,6 +30,8 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "cd7d5a2cecb58716e47d67d5703a249964b14c7be1ec3cad3affc295b2d1c35d" dependencies = [ "cfg-if", + "const-random", + "getrandom 0.2.11", "once_cell", "version_check", "zerocopy", @@ -50,6 +52,12 @@ version = "0.2.16" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "0942ffc6dcaadf03badf6e6a2d0228460359d5e34b57ccdc720b7382dfbd5ec5" +[[package]] +name = "android-tzdata" +version = "0.1.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e999941b234f3131b00bc13c22d06e8c5ff726d1b6318ac7eb276997bbb4fef0" + [[package]] name = "android_system_properties" version = "0.1.5" @@ -247,6 +255,12 @@ dependencies = [ "syn 2.0.32", ] +[[package]] +name = "atomic" +version = "0.5.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c59bdb34bc650a32731b31bd8f0829cc15d24a708ee31559e0bb34f2bc320cba" + [[package]] name = "atomic-polyfill" version = "1.0.2" @@ -1011,17 +1025,17 @@ dependencies = [ [[package]] name = "chrono" -version = "0.4.24" +version = "0.4.31" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "4e3c5919066adf22df73762e50cffcde3a758f2a848b113b586d1f86728b673b" +checksum = "7f2c685bad3eb3d45a01354cedb7d5faa66194d1d58ba6e267a8de788f79db38" dependencies = [ + "android-tzdata", "iana-time-zone", "js-sys", - "num-integer", "num-traits", "serde", "wasm-bindgen", - "winapi", + "windows-targets 0.48.0", ] [[package]] @@ -1120,6 +1134,20 @@ version = "1.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "acbf1af155f9b9ef647e42cdc158db4b64a1b61f743629225fde6f3e0be2a7c7" +[[package]] +name = "combine" +version = "4.6.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = 
"35ed6e9d84f0b51a7f52daf1c7d71dd136fd7a3f41a8462b8cdb8c78d920fad4" +dependencies = [ + "bytes", + "futures-core", + "memchr", + "pin-project-lite", + "tokio", + "tokio-util", +] + [[package]] name = "comfy-table" version = "6.1.4" @@ -1161,6 +1189,7 @@ dependencies = [ "flate2", "futures", "hyper", + "nix 0.26.2", "notify", "num_cpus", "opentelemetry", @@ -1168,8 +1197,10 @@ dependencies = [ "regex", "remote_storage", "reqwest", + "rust-ini", "serde", "serde_json", + "signal-hook", "tar", "tokio", "tokio-postgres", @@ -1201,6 +1232,26 @@ version = "0.9.5" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "28c122c3980598d243d63d9a704629a2d748d101f278052ff068be5a4423ab6f" +[[package]] +name = "const-random" +version = "0.1.17" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5aaf16c9c2c612020bcfd042e170f6e32de9b9d75adb5277cdbbd2e2c8c8299a" +dependencies = [ + "const-random-macro", +] + +[[package]] +name = "const-random-macro" +version = "0.1.16" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f9d839f2a20b0aee515dc581a6172f2321f96cab76c1a38a4c584a194955390e" +dependencies = [ + "getrandom 0.2.11", + "once_cell", + "tiny-keccak", +] + [[package]] name = "const_fn" version = "0.4.9" @@ -1433,6 +1484,12 @@ dependencies = [ "winapi", ] +[[package]] +name = "crunchy" +version = "0.2.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7a81dae078cea95a014a339291cec439d2f232ebe854a9d672b796c6afafa9b7" + [[package]] name = "crypto-bigint" version = "0.4.9" @@ -1575,6 +1632,15 @@ dependencies = [ "syn 2.0.32", ] +[[package]] +name = "dlv-list" +version = "0.5.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "442039f5147480ba31067cb00ada1adae6892028e40e45fc5de7b7df6dcc1b5f" +dependencies = [ + "const-random", +] + [[package]] name = "dyn-clone" version = "1.0.14" @@ -2106,6 +2172,20 @@ dependencies = [ "hashbrown 0.13.2", ] 
+[[package]] +name = "hdrhistogram" +version = "7.5.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "765c9198f173dd59ce26ff9f95ef0aafd0a0fe01fb9d72841bc5066a4c06511d" +dependencies = [ + "base64 0.21.1", + "byteorder", + "crossbeam-channel", + "flate2", + "nom", + "num-traits", +] + [[package]] name = "heapless" version = "0.8.0" @@ -2423,6 +2503,12 @@ dependencies = [ "web-sys", ] +[[package]] +name = "integer-encoding" +version = "3.0.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8bb03732005da905c88227371639bf1ad885cc712789c011c31c5fb3ab3ccf02" + [[package]] name = "io-lifetimes" version = "1.0.11" @@ -2796,6 +2882,19 @@ dependencies = [ "winapi", ] +[[package]] +name = "num" +version = "0.4.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b05180d69e3da0e530ba2a1dae5110317e49e3b7f3d41be227dc5f92e49ee7af" +dependencies = [ + "num-complex", + "num-integer", + "num-iter", + "num-rational", + "num-traits", +] + [[package]] name = "num-bigint" version = "0.4.3" @@ -2807,6 +2906,15 @@ dependencies = [ "num-traits", ] +[[package]] +name = "num-complex" +version = "0.4.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1ba157ca0885411de85d6ca030ba7e2a83a28636056c7c699b07c8b6f7383214" +dependencies = [ + "num-traits", +] + [[package]] name = "num-integer" version = "0.1.45" @@ -2817,6 +2925,28 @@ dependencies = [ "num-traits", ] +[[package]] +name = "num-iter" +version = "0.1.43" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7d03e6c028c5dc5cac6e2dec0efda81fc887605bb3d884578bb6d6bf7514e252" +dependencies = [ + "autocfg", + "num-integer", + "num-traits", +] + +[[package]] +name = "num-rational" +version = "0.4.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0638a1c9d0a3c0914158145bc76cff373a75a627e6ecbfb71cbe6f453a5a19b0" +dependencies = [ + "autocfg", + "num-integer", + 
"num-traits", +] + [[package]] name = "num-traits" version = "0.2.15" @@ -3039,6 +3169,25 @@ dependencies = [ "tokio-stream", ] +[[package]] +name = "ordered-float" +version = "2.10.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "68f19d67e5a2795c94e73e0bb1cc1a7edeb2e28efd39e2e1c9b7a40c1108b11c" +dependencies = [ + "num-traits", +] + +[[package]] +name = "ordered-multimap" +version = "0.7.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a4d6a8c22fc714f0c2373e6091bf6f5e9b37b1bc0b1184874b7e0a4e303d318f" +dependencies = [ + "dlv-list", + "hashbrown 0.14.0", +] + [[package]] name = "os_info" version = "3.7.0" @@ -3067,6 +3216,28 @@ dependencies = [ "sha2", ] +[[package]] +name = "pagebench" +version = "0.1.0" +dependencies = [ + "anyhow", + "clap", + "futures", + "hdrhistogram", + "humantime", + "humantime-serde", + "pageserver", + "pageserver_api", + "pageserver_client", + "rand 0.8.5", + "serde", + "serde_json", + "tokio", + "tracing", + "utils", + "workspace_hack", +] + [[package]] name = "pagectl" version = "0.1.0" @@ -3266,6 +3437,35 @@ dependencies = [ "windows-targets 0.48.0", ] +[[package]] +name = "parquet" +version = "49.0.0" +source = "git+https://github.com/neondatabase/arrow-rs?branch=neon-fix-bugs#8a0bc58aa67b98aabbd8eee7c6ca4281967ff9e9" +dependencies = [ + "ahash", + "bytes", + "chrono", + "hashbrown 0.14.0", + "num", + "num-bigint", + "paste", + "seq-macro", + "thrift", + "twox-hash", + "zstd", +] + +[[package]] +name = "parquet_derive" +version = "49.0.0" +source = "git+https://github.com/neondatabase/arrow-rs?branch=neon-fix-bugs#8a0bc58aa67b98aabbd8eee7c6ca4281967ff9e9" +dependencies = [ + "parquet", + "proc-macro2", + "quote", + "syn 2.0.32", +] + [[package]] name = "password-hash" version = "0.5.0" @@ -3689,6 +3889,8 @@ dependencies = [ "base64 0.13.1", "bstr", "bytes", + "camino", + "camino-tempfile", "chrono", "clap", "consumption_metrics", @@ -3711,6 +3913,8 @@ dependencies = [ 
"once_cell", "opentelemetry", "parking_lot 0.12.1", + "parquet", + "parquet_derive", "pbkdf2", "pin-project-lite", "postgres-native-tls", @@ -3720,7 +3924,9 @@ dependencies = [ "prometheus", "rand 0.8.5", "rcgen", + "redis", "regex", + "remote_storage", "reqwest", "reqwest-middleware", "reqwest-retry", @@ -3881,6 +4087,32 @@ dependencies = [ "yasna", ] +[[package]] +name = "redis" +version = "0.24.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c580d9cbbe1d1b479e8d67cf9daf6a62c957e6846048408b80b43ac3f6af84cd" +dependencies = [ + "async-trait", + "bytes", + "combine", + "futures-util", + "itoa", + "percent-encoding", + "pin-project-lite", + "rustls", + "rustls-native-certs", + "rustls-pemfile", + "rustls-webpki 0.101.7", + "ryu", + "sha1_smol", + "socket2 0.4.9", + "tokio", + "tokio-rustls", + "tokio-util", + "url", +] + [[package]] name = "redox_syscall" version = "0.2.16" @@ -4191,6 +4423,16 @@ dependencies = [ "unicode-ident", ] +[[package]] +name = "rust-ini" +version = "0.20.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "3e0698206bcb8882bf2a9ecb4c1e7785db57ff052297085a6efd4fe42302068a" +dependencies = [ + "cfg-if", + "ordered-multimap", +] + [[package]] name = "rustc-demangle" version = "0.1.23" @@ -4322,12 +4564,14 @@ dependencies = [ "async-stream", "aws-config", "aws-sdk-s3", + "aws-smithy-async", "bincode", "bytes", "chrono", "clap", "crc32c", "either", + "futures", "futures-util", "hex", "histogram", @@ -4366,6 +4610,7 @@ dependencies = [ "clap", "const_format", "crc32c", + "fail", "fs2", "futures", "git-version", @@ -4389,6 +4634,7 @@ dependencies = [ "serde", "serde_json", "serde_with", + "sha2", "signal-hook", "storage_broker", "thiserror", @@ -4595,6 +4841,12 @@ dependencies = [ "uuid", ] +[[package]] +name = "seq-macro" +version = "0.3.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a3f0bf26fd526d2a95683cd0f87bf103b8539e2ca1ef48ce002d67aad59aa0b4" + 
[[package]] name = "serde" version = "1.0.183" @@ -4717,6 +4969,12 @@ dependencies = [ "digest", ] +[[package]] +name = "sha1_smol" +version = "1.0.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ae1a47186c03a32177042e55dbc5fd5aee900b8e0069a8d70fba96a9375cd012" + [[package]] name = "sha2" version = "0.10.6" @@ -5115,6 +5373,17 @@ dependencies = [ "once_cell", ] +[[package]] +name = "thrift" +version = "0.17.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7e54bc85fc7faa8bc175c4bab5b92ba8d9a3ce893d0e9f42cc455c8ab16a9e09" +dependencies = [ + "byteorder", + "integer-encoding", + "ordered-float", +] + [[package]] name = "time" version = "0.3.21" @@ -5145,6 +5414,15 @@ dependencies = [ "time-core", ] +[[package]] +name = "tiny-keccak" +version = "2.0.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "2c9d3793400a45f954c52e73d068316d76b6f4e36977e3fcebb13a2721e80237" +dependencies = [ + "crunchy", +] + [[package]] name = "tinytemplate" version = "1.2.1" @@ -5665,6 +5943,16 @@ dependencies = [ "utf-8", ] +[[package]] +name = "twox-hash" +version = "1.6.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "97fee6b57c6a41524a810daee9286c02d7752c4253064d0b05472833a438f675" +dependencies = [ + "cfg-if", + "static_assertions", +] + [[package]] name = "typenum" version = "1.16.0" @@ -5812,6 +6100,7 @@ dependencies = [ "chrono", "const_format", "criterion", + "fail", "futures", "heapless", "hex", @@ -5850,10 +6139,11 @@ dependencies = [ [[package]] name = "uuid" -version = "1.3.3" +version = "1.6.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "345444e32442451b267fc254ae85a209c64be56d2890e601a0c37ff0c3c5ecd2" +checksum = "5e395fcf16a7a3d8127ec99782007af141946b4795001f876d54fb0d55978560" dependencies = [ + "atomic", "getrandom 0.2.11", "serde", ] @@ -6336,6 +6626,7 @@ dependencies = [ "futures-io", "futures-sink", 
"futures-util", + "getrandom 0.2.11", "hex", "hmac", "hyper", @@ -6347,6 +6638,8 @@ dependencies = [ "num-bigint", "num-integer", "num-traits", + "once_cell", + "parquet", "prost", "rand 0.8.5", "regex", diff --git a/Cargo.toml b/Cargo.toml index aaccb405be..ddcdad91a6 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -6,6 +6,7 @@ members = [ "pageserver", "pageserver/ctl", "pageserver/client", + "pageserver/pagebench", "proxy", "safekeeper", "storage_broker", @@ -79,6 +80,7 @@ futures-util = "0.3" git-version = "0.3" hashbrown = "0.13" hashlink = "0.8.1" +hdrhistogram = "7.5.2" hex = "0.4" hex-literal = "0.4" hmac = "0.12.1" @@ -105,11 +107,14 @@ opentelemetry = "0.19.0" opentelemetry-otlp = { version = "0.12.0", default_features=false, features = ["http-proto", "trace", "http", "reqwest-client"] } opentelemetry-semantic-conventions = "0.11.0" parking_lot = "0.12" +parquet = { version = "49.0.0", default-features = false, features = ["zstd"] } +parquet_derive = "49.0.0" pbkdf2 = { version = "0.12.1", features = ["simple", "std"] } pin-project-lite = "0.2" prometheus = {version = "0.13", default_features=false, features = ["process"]} # removes protobuf dependency prost = "0.11" rand = "0.8" +redis = { version = "0.24.0", features = ["tokio-rustls-comp", "keep-alive"] } regex = "1.10.2" reqwest = { version = "0.11", default-features = false, features = ["rustls-tls"] } reqwest-tracing = { version = "0.4.0", features = ["opentelemetry_0_19"] } @@ -161,7 +166,7 @@ tracing-error = "0.2.0" tracing-opentelemetry = "0.19.0" tracing-subscriber = { version = "0.3", default_features = false, features = ["smallvec", "fmt", "tracing-log", "std", "env-filter", "json"] } url = "2.2" -uuid = { version = "1.2", features = ["v4", "serde"] } +uuid = { version = "1.6.1", features = ["v4", "v7", "serde"] } walkdir = "2.3.2" webpki-roots = "0.25" x509-parser = "0.15" @@ -215,6 +220,10 @@ tonic-build = "0.9" # TODO: we should probably fork `tokio-postgres-rustls` instead. 
tokio-postgres = { git = "https://github.com/neondatabase/rust-postgres.git", branch="neon" } +# bug fixes for UUID +parquet = { git = "https://github.com/neondatabase/arrow-rs", branch = "neon-fix-bugs" } +parquet_derive = { git = "https://github.com/neondatabase/arrow-rs", branch = "neon-fix-bugs" } + ################# Binary contents sections [profile.release] diff --git a/Dockerfile b/Dockerfile index 60de9cfa3e..5d5fde4f14 100644 --- a/Dockerfile +++ b/Dockerfile @@ -3,7 +3,7 @@ ### By default, the binaries inside the image have some mock parameters and can start, but are not intended to be used ### inside this image in the real deployments. ARG REPOSITORY=neondatabase -ARG IMAGE=rust +ARG IMAGE=build-tools ARG TAG=pinned # Build Postgres diff --git a/Dockerfile.buildtools b/Dockerfile.buildtools new file mode 100644 index 0000000000..213aed1679 --- /dev/null +++ b/Dockerfile.buildtools @@ -0,0 +1,166 @@ +FROM debian:bullseye-slim + +# Add nonroot user +RUN useradd -ms /bin/bash nonroot -b /home +SHELL ["/bin/bash", "-c"] + +# System deps +RUN set -e \ + && apt update \ + && apt install -y \ + autoconf \ + automake \ + bison \ + build-essential \ + ca-certificates \ + cmake \ + curl \ + flex \ + git \ + gnupg \ + gzip \ + jq \ + libcurl4-openssl-dev \ + libbz2-dev \ + libffi-dev \ + liblzma-dev \ + libncurses5-dev \ + libncursesw5-dev \ + libpq-dev \ + libreadline-dev \ + libseccomp-dev \ + libsqlite3-dev \ + libssl-dev \ + libstdc++-10-dev \ + libtool \ + libxml2-dev \ + libxmlsec1-dev \ + libxxhash-dev \ + lsof \ + make \ + netcat \ + net-tools \ + openssh-client \ + parallel \ + pkg-config \ + unzip \ + wget \ + xz-utils \ + zlib1g-dev \ + zstd \ + && rm -rf /var/lib/apt/lists/* /tmp/* /var/tmp/* + +# protobuf-compiler (protoc) +ENV PROTOC_VERSION 25.1 +RUN curl -fsSL "https://github.com/protocolbuffers/protobuf/releases/download/v${PROTOC_VERSION}/protoc-${PROTOC_VERSION}-linux-$(uname -m | sed 's/aarch64/aarch_64/g').zip" -o "protoc.zip" \ + && unzip -q 
protoc.zip -d protoc \ + && mv protoc/bin/protoc /usr/local/bin/protoc \ + && mv protoc/include/google /usr/local/include/google \ + && rm -rf protoc.zip protoc + +# LLVM +ENV LLVM_VERSION=17 +RUN curl -fsSL 'https://apt.llvm.org/llvm-snapshot.gpg.key' | apt-key add - \ + && echo "deb http://apt.llvm.org/bullseye/ llvm-toolchain-bullseye-${LLVM_VERSION} main" > /etc/apt/sources.list.d/llvm.stable.list \ + && apt update \ + && apt install -y clang-${LLVM_VERSION} llvm-${LLVM_VERSION} \ + && bash -c 'for f in /usr/bin/clang*-${LLVM_VERSION} /usr/bin/llvm*-${LLVM_VERSION}; do ln -s "${f}" "${f%-${LLVM_VERSION}}"; done' \ + && rm -rf /var/lib/apt/lists/* /tmp/* /var/tmp/* + +# PostgreSQL 14 +RUN curl -fsSL 'https://www.postgresql.org/media/keys/ACCC4CF8.asc' | apt-key add - \ + && echo 'deb http://apt.postgresql.org/pub/repos/apt bullseye-pgdg main' > /etc/apt/sources.list.d/pgdg.list \ + && apt update \ + && apt install -y postgresql-client-14 \ + && rm -rf /var/lib/apt/lists/* /tmp/* /var/tmp/* + +# AWS CLI +RUN curl "https://awscli.amazonaws.com/awscli-exe-linux-$(uname -m).zip" -o "awscliv2.zip" \ + && unzip -q awscliv2.zip \ + && ./aws/install \ + && rm awscliv2.zip + +# Mold: A Modern Linker +ENV MOLD_VERSION v2.4.0 +RUN set -e \ + && git clone https://github.com/rui314/mold.git \ + && mkdir mold/build \ + && cd mold/build \ + && git checkout ${MOLD_VERSION} \ + && cmake -DCMAKE_BUILD_TYPE=Release -DCMAKE_CXX_COMPILER=clang++ .. \ + && cmake --build . -j $(nproc) \ + && cmake --install . \ + && cd .. 
\ + && rm -rf mold + +# LCOV +# Build lcov from a fork: +# It includes several bug fixes on top on v2.0 release (https://github.com/linux-test-project/lcov/compare/v2.0...master) +# And patches from us: +# - Generates json file with code coverage summary (https://github.com/neondatabase/lcov/commit/426e7e7a22f669da54278e9b55e6d8caabd00af0.tar.gz) +RUN for package in Capture::Tiny DateTime Devel::Cover Digest::MD5 File::Spec JSON::XS Memory::Process Time::HiRes JSON; do yes | perl -MCPAN -e "CPAN::Shell->notest('install', '$package')"; done \ + && wget https://github.com/neondatabase/lcov/archive/426e7e7a22f669da54278e9b55e6d8caabd00af0.tar.gz -O lcov.tar.gz \ + && echo "61a22a62e20908b8b9e27d890bd0ea31f567a7b9668065589266371dcbca0992 lcov.tar.gz" | sha256sum --check \ + && mkdir -p lcov && tar -xzf lcov.tar.gz -C lcov --strip-components=1 \ + && cd lcov \ + && make install \ + && rm -rf ../lcov.tar.gz + +# Switch to nonroot user +USER nonroot:nonroot +WORKDIR /home/nonroot + +# Python +ENV PYTHON_VERSION=3.9.2 \ + PYENV_ROOT=/home/nonroot/.pyenv \ + PATH=/home/nonroot/.pyenv/shims:/home/nonroot/.pyenv/bin:/home/nonroot/.poetry/bin:$PATH +RUN set -e \ + && cd $HOME \ + && curl -sSO https://raw.githubusercontent.com/pyenv/pyenv-installer/master/bin/pyenv-installer \ + && chmod +x pyenv-installer \ + && ./pyenv-installer \ + && export PYENV_ROOT=/home/nonroot/.pyenv \ + && export PATH="$PYENV_ROOT/bin:$PATH" \ + && export PATH="$PYENV_ROOT/shims:$PATH" \ + && pyenv install ${PYTHON_VERSION} \ + && pyenv global ${PYTHON_VERSION} \ + && python --version \ + && pip install --upgrade pip \ + && pip --version \ + && pip install pipenv wheel poetry + +# Switch to nonroot user (again) +USER nonroot:nonroot +WORKDIR /home/nonroot + +# Rust +# Please keep the version of llvm (installed above) in sync with rust llvm (`rustc --version --verbose | grep LLVM`) +ENV RUSTC_VERSION=1.75.0 +ENV RUSTUP_HOME="/home/nonroot/.rustup" +ENV PATH="/home/nonroot/.cargo/bin:${PATH}" +RUN curl 
-sSO https://static.rust-lang.org/rustup/dist/$(uname -m)-unknown-linux-gnu/rustup-init && whoami && \ + chmod +x rustup-init && \ + ./rustup-init -y --default-toolchain ${RUSTC_VERSION} && \ + rm rustup-init && \ + export PATH="$HOME/.cargo/bin:$PATH" && \ + . "$HOME/.cargo/env" && \ + cargo --version && rustup --version && \ + rustup component add llvm-tools-preview rustfmt clippy && \ + cargo install --git https://github.com/paritytech/cachepot && \ + cargo install rustfilt && \ + cargo install cargo-hakari && \ + cargo install cargo-deny && \ + cargo install cargo-hack && \ + cargo install cargo-nextest && \ + rm -rf /home/nonroot/.cargo/registry && \ + rm -rf /home/nonroot/.cargo/git +ENV RUSTC_WRAPPER=cachepot + +# Show versions +RUN whoami \ + && python --version \ + && pip --version \ + && cargo --version --verbose \ + && rustup --version --verbose \ + && rustc --version --verbose \ + && clang --version diff --git a/Dockerfile.compute-node b/Dockerfile.compute-node index a23e930c48..14ba1b5b9a 100644 --- a/Dockerfile.compute-node +++ b/Dockerfile.compute-node @@ -1,6 +1,6 @@ ARG PG_VERSION ARG REPOSITORY=neondatabase -ARG IMAGE=rust +ARG IMAGE=build-tools ARG TAG=pinned ARG BUILD_TAG @@ -48,7 +48,29 @@ RUN cd postgres && \ echo 'trusted = true' >> /usr/local/pgsql/share/extension/pgrowlocks.control && \ echo 'trusted = true' >> /usr/local/pgsql/share/extension/pgstattuple.control && \ echo 'trusted = true' >> /usr/local/pgsql/share/extension/refint.control && \ - echo 'trusted = true' >> /usr/local/pgsql/share/extension/xml2.control + echo 'trusted = true' >> /usr/local/pgsql/share/extension/xml2.control && \ + # We need to grant EXECUTE on pg_stat_statements_reset() to neon_superuser. + # In vanilla postgres this function is limited to Postgres role superuser. + # In neon we have neon_superuser role that is not a superuser but replaces superuser in some cases. 
+ # We could add the additional grant statements to the postgres repository but it would be hard to maintain, + # whenever we need to pick up a new postgres version and we want to limit the changes in our postgres fork, + # so we do it here. + old_list="pg_stat_statements--1.0--1.1.sql pg_stat_statements--1.1--1.2.sql pg_stat_statements--1.2--1.3.sql pg_stat_statements--1.3--1.4.sql pg_stat_statements--1.4--1.5.sql pg_stat_statements--1.4.sql pg_stat_statements--1.5--1.6.sql"; \ + # the first loop is for pg_stat_statement extension version <= 1.6 + for file in /usr/local/pgsql/share/extension/pg_stat_statements--*.sql; do \ + filename=$(basename "$file"); \ + if echo "$old_list" | grep -q -F "$filename"; then \ + echo 'GRANT EXECUTE ON FUNCTION pg_stat_statements_reset() TO neon_superuser;' >> $file; \ + fi; \ + done; \ + # the second loop is for pg_stat_statement extension versions >= 1.7, + # where pg_stat_statement_reset() got 3 additional arguments + for file in /usr/local/pgsql/share/extension/pg_stat_statements--*.sql; do \ + filename=$(basename "$file"); \ + if ! 
echo "$old_list" | grep -q -F "$filename"; then \ + echo 'GRANT EXECUTE ON FUNCTION pg_stat_statements_reset(Oid, Oid, bigint) TO neon_superuser;' >> $file; \ + fi; \ + done ######################################################################################### # diff --git a/Dockerfile.compute-tools b/Dockerfile.compute-tools index 3066e3f7ca..cc305cc556 100644 --- a/Dockerfile.compute-tools +++ b/Dockerfile.compute-tools @@ -1,7 +1,7 @@ # First transient image to build compute_tools binaries # NB: keep in sync with rust image version in .github/workflows/build_and_test.yml ARG REPOSITORY=neondatabase -ARG IMAGE=rust +ARG IMAGE=build-tools ARG TAG=pinned ARG BUILD_TAG diff --git a/compute_tools/Cargo.toml b/compute_tools/Cargo.toml index 18b30810b0..759a117ee9 100644 --- a/compute_tools/Cargo.toml +++ b/compute_tools/Cargo.toml @@ -13,6 +13,7 @@ clap.workspace = true flate2.workspace = true futures.workspace = true hyper = { workspace = true, features = ["full"] } +nix.workspace = true notify.workspace = true num_cpus.workspace = true opentelemetry.workspace = true @@ -20,6 +21,7 @@ postgres.workspace = true regex.workspace = true serde.workspace = true serde_json.workspace = true +signal-hook.workspace = true tar.workspace = true reqwest = { workspace = true, features = ["json"] } tokio = { workspace = true, features = ["rt", "rt-multi-thread"] } @@ -39,3 +41,4 @@ remote_storage = { version = "0.1", path = "../libs/remote_storage/" } vm_monitor = { version = "0.1", path = "../libs/vm_monitor/" } zstd = "0.13" bytes = "1.0" +rust-ini = "0.20.0" diff --git a/compute_tools/src/bin/compute_ctl.rs b/compute_tools/src/bin/compute_ctl.rs index ce7345d5be..2eaad5c3c0 100644 --- a/compute_tools/src/bin/compute_ctl.rs +++ b/compute_tools/src/bin/compute_ctl.rs @@ -31,25 +31,31 @@ //! -C 'postgresql://cloud_admin@localhost/postgres' \ //! -S /var/db/postgres/specs/current.json \ //! -b /usr/local/bin/postgres \ -//! -r http://pg-ext-s3-gateway +//! 
-r http://pg-ext-s3-gateway \ +//! --pgbouncer-connstr 'host=localhost port=6432 dbname=pgbouncer user=cloud_admin sslmode=disable' \ +//! --pgbouncer-ini-path /etc/pgbouncer.ini //! ``` //! use std::collections::HashMap; use std::fs::File; use std::path::Path; use std::process::exit; +use std::sync::atomic::Ordering; use std::sync::{mpsc, Arc, Condvar, Mutex, RwLock}; use std::{thread, time::Duration}; use anyhow::{Context, Result}; use chrono::Utc; use clap::Arg; +use nix::sys::signal::{kill, Signal}; +use signal_hook::consts::{SIGQUIT, SIGTERM}; +use signal_hook::{consts::SIGINT, iterator::Signals}; use tracing::{error, info}; use url::Url; use compute_api::responses::ComputeStatus; -use compute_tools::compute::{ComputeNode, ComputeState, ParsedSpec}; +use compute_tools::compute::{ComputeNode, ComputeState, ParsedSpec, PG_PID, SYNC_SAFEKEEPERS_PID}; use compute_tools::configurator::launch_configurator; use compute_tools::extension_server::get_pg_version; use compute_tools::http::api::launch_http_server; @@ -65,6 +71,13 @@ const BUILD_TAG_DEFAULT: &str = "latest"; fn main() -> Result<()> { init_tracing_and_logging(DEFAULT_LOG_LEVEL)?; + let mut signals = Signals::new([SIGINT, SIGTERM, SIGQUIT])?; + thread::spawn(move || { + for sig in signals.forever() { + handle_exit_signal(sig); + } + }); + let build_tag = option_env!("BUILD_TAG") .unwrap_or(BUILD_TAG_DEFAULT) .to_string(); @@ -99,6 +112,9 @@ fn main() -> Result<()> { let spec_json = matches.get_one::("spec"); let spec_path = matches.get_one::("spec-path"); + let pgbouncer_connstr = matches.get_one::("pgbouncer-connstr"); + let pgbouncer_ini_path = matches.get_one::("pgbouncer-ini-path"); + // Extract OpenTelemetry context for the startup actions from the // TRACEPARENT and TRACESTATE env variables, and attach it to the current // tracing context.
@@ -209,6 +225,8 @@ fn main() -> Result<()> { ext_remote_storage: ext_remote_storage.map(|s| s.to_string()), ext_download_progress: RwLock::new(HashMap::new()), build_tag, + pgbouncer_connstr: pgbouncer_connstr.map(|s| s.to_string()), + pgbouncer_ini_path: pgbouncer_ini_path.map(|s| s.to_string()), }; let compute = Arc::new(compute_node); @@ -332,13 +350,20 @@ fn main() -> Result<()> { // Wait for the child Postgres process forever. In this state Ctrl+C will // propagate to Postgres and it will be shut down as well. - if let Some(mut pg) = pg { + if let Some((mut pg, logs_handle)) = pg { // Startup is finished, exit the startup tracing span drop(startup_context_guard); let ecode = pg .wait() .expect("failed to start waiting on Postgres process"); + PG_PID.store(0, Ordering::SeqCst); + + // Process has exited, so we can join the logs thread. + let _ = logs_handle + .join() + .map_err(|e| tracing::error!("log thread panicked: {:?}", e)); + info!("Postgres exited with code {}, shutting down", ecode); exit_code = ecode.code() } @@ -493,6 +518,41 @@ fn cli() -> clap::Command { ) .value_name("FILECACHE_CONNSTR"), ) + .arg( + Arg::new("pgbouncer-connstr") + .long("pgbouncer-connstr") + .default_value( + "host=localhost port=6432 dbname=pgbouncer user=cloud_admin sslmode=disable", + ) + .value_name("PGBOUNCER_CONNSTR"), + ) + .arg( + Arg::new("pgbouncer-ini-path") + .long("pgbouncer-ini-path") + // Note: this doesn't match current path for pgbouncer.ini. + // Until we fix it, we need to pass the path explicitly + // or this will be effectively no-op. + .default_value("/etc/pgbouncer.ini") + .value_name("PGBOUNCER_INI_PATH"), + ) +} + +/// When compute_ctl is killed, send also termination signal to sync-safekeepers +/// to prevent leakage. TODO: it is better to convert compute_ctl to async and +/// wait for termination which would be easy then. 
+fn handle_exit_signal(sig: i32) { + info!("received {sig} termination signal"); + let ss_pid = SYNC_SAFEKEEPERS_PID.load(Ordering::SeqCst); + if ss_pid != 0 { + let ss_pid = nix::unistd::Pid::from_raw(ss_pid as i32); + kill(ss_pid, Signal::SIGTERM).ok(); + } + let pg_pid = PG_PID.load(Ordering::SeqCst); + if pg_pid != 0 { + let pg_pid = nix::unistd::Pid::from_raw(pg_pid as i32); + kill(pg_pid, Signal::SIGTERM).ok(); + } + exit(1); } #[test] diff --git a/compute_tools/src/compute.rs b/compute_tools/src/compute.rs index b39a800f14..c2c1952521 100644 --- a/compute_tools/src/compute.rs +++ b/compute_tools/src/compute.rs @@ -6,7 +6,10 @@ use std::os::unix::fs::PermissionsExt; use std::path::Path; use std::process::{Command, Stdio}; use std::str::FromStr; +use std::sync::atomic::AtomicU32; +use std::sync::atomic::Ordering; use std::sync::{Condvar, Mutex, RwLock}; +use std::thread; use std::time::Instant; use anyhow::{Context, Result}; @@ -28,11 +31,15 @@ use utils::measured_stream::MeasuredReader; use remote_storage::{DownloadError, RemotePath}; use crate::checker::create_availability_check_data; +use crate::logger::inlinify; use crate::pg_helpers::*; use crate::spec::*; use crate::sync_sk::{check_if_synced, ping_safekeeper}; use crate::{config, extension_server}; +pub static SYNC_SAFEKEEPERS_PID: AtomicU32 = AtomicU32::new(0); +pub static PG_PID: AtomicU32 = AtomicU32::new(0); + /// Compute node info shared across several `compute_ctl` threads. pub struct ComputeNode { // Url type maintains proper escaping @@ -64,6 +71,10 @@ pub struct ComputeNode { // key: ext_archive_name, value: started download time, download_completed? 
pub ext_download_progress: RwLock, bool)>>, pub build_tag: String, + // connection string to pgbouncer to change settings + pub pgbouncer_connstr: Option, + // path to pgbouncer.ini to change settings + pub pgbouncer_ini_path: Option, } // store some metrics about download size that might impact startup time @@ -269,7 +280,7 @@ fn create_neon_superuser(spec: &ComputeSpec, client: &mut Client) -> Result<()> $$;"#, roles_decl, database_decl, ); - info!("Neon superuser created:\n{}", &query); + info!("Neon superuser created:\n{}", inlinify(&query)); client .simple_query(&query) .map_err(|e| anyhow::anyhow!(e).context(query))?; @@ -485,7 +496,7 @@ impl ComputeNode { pub fn sync_safekeepers(&self, storage_auth_token: Option) -> Result { let start_time = Utc::now(); - let sync_handle = maybe_cgexec(&self.pgbin) + let mut sync_handle = maybe_cgexec(&self.pgbin) .args(["--sync-safekeepers"]) .env("PGDATA", &self.pgdata) // we cannot use -D in this mode .envs(if let Some(storage_auth_token) = &storage_auth_token { @@ -494,15 +505,29 @@ impl ComputeNode { vec![] }) .stdout(Stdio::piped()) + .stderr(Stdio::piped()) .spawn() .expect("postgres --sync-safekeepers failed to start"); + SYNC_SAFEKEEPERS_PID.store(sync_handle.id(), Ordering::SeqCst); // `postgres --sync-safekeepers` will print all log output to stderr and - // final LSN to stdout. So we pipe only stdout, while stderr will be automatically - // redirected to the caller output. + // final LSN to stdout. So we leave stdout to collect LSN, while stderr logs + // will be collected in a child thread. + let stderr = sync_handle + .stderr + .take() + .expect("stderr should be captured"); + let logs_handle = handle_postgres_logs(stderr); + let sync_output = sync_handle .wait_with_output() .expect("postgres --sync-safekeepers failed"); + SYNC_SAFEKEEPERS_PID.store(0, Ordering::SeqCst); + + // Process has exited, so we can join the logs thread. 
+ let _ = logs_handle + .join() + .map_err(|e| tracing::error!("log thread panicked: {:?}", e)); if !sync_output.status.success() { anyhow::bail!( @@ -640,11 +665,12 @@ impl ComputeNode { /// Start Postgres as a child process and manage DBs/roles. /// After that this will hang waiting on the postmaster process to exit. + /// Returns a handle to the child process and a handle to the logs thread. #[instrument(skip_all)] pub fn start_postgres( &self, storage_auth_token: Option, - ) -> Result { + ) -> Result<(std::process::Child, std::thread::JoinHandle<()>)> { let pgdata_path = Path::new(&self.pgdata); // Run postgres as a child process. @@ -655,12 +681,18 @@ impl ComputeNode { } else { vec![] }) + .stderr(Stdio::piped()) .spawn() .expect("cannot start postgres process"); + PG_PID.store(pg.id(), Ordering::SeqCst); + + // Start a thread to collect logs from stderr. + let stderr = pg.stderr.take().expect("stderr should be captured"); + let logs_handle = handle_postgres_logs(stderr); wait_for_postgres(&mut pg, pgdata_path)?; - Ok(pg) + Ok((pg, logs_handle)) } /// Do initial configuration of the already started Postgres. @@ -737,6 +769,31 @@ impl ComputeNode { pub fn reconfigure(&self) -> Result<()> { let spec = self.state.lock().unwrap().pspec.clone().unwrap().spec; + if let Some(connstr) = &self.pgbouncer_connstr { + info!("tuning pgbouncer with connstr: {:?}", connstr); + + let rt = tokio::runtime::Builder::new_current_thread() + .enable_all() + .build() + .expect("failed to create rt"); + + // Spawn a thread to do the tuning, + // so that we don't block the main thread that starts Postgres. 
+ let pgbouncer_settings = spec.pgbouncer_settings.clone(); + let connstr_clone = connstr.clone(); + let pgbouncer_ini_path = self.pgbouncer_ini_path.clone(); + let _handle = thread::spawn(move || { + let res = rt.block_on(tune_pgbouncer( + pgbouncer_settings, + &connstr_clone, + pgbouncer_ini_path, + )); + if let Err(err) = res { + error!("error while tuning pgbouncer: {err:?}"); + } + }); + } + // Write new config let pgdata_path = Path::new(&self.pgdata); let postgresql_conf_path = pgdata_path.join("postgresql.conf"); @@ -780,7 +837,10 @@ impl ComputeNode { } #[instrument(skip_all)] - pub fn start_compute(&self, extension_server_port: u16) -> Result { + pub fn start_compute( + &self, + extension_server_port: u16, + ) -> Result<(std::process::Child, std::thread::JoinHandle<()>)> { let compute_state = self.state.lock().unwrap().clone(); let pspec = compute_state.pspec.as_ref().expect("spec must be set"); info!( @@ -791,6 +851,32 @@ impl ComputeNode { pspec.timeline_id, ); + // tune pgbouncer + if let Some(connstr) = &self.pgbouncer_connstr { + info!("tuning pgbouncer with connstr: {:?}", connstr); + + let rt = tokio::runtime::Builder::new_current_thread() + .enable_all() + .build() + .expect("failed to create rt"); + + // Spawn a thread to do the tuning, + // so that we don't block the main thread that starts Postgres. 
+ let pgbouncer_settings = pspec.spec.pgbouncer_settings.clone(); + let connstr_clone = connstr.clone(); + let pgbouncer_ini_path = self.pgbouncer_ini_path.clone(); + let _handle = thread::spawn(move || { + let res = rt.block_on(tune_pgbouncer( + pgbouncer_settings, + &connstr_clone, + pgbouncer_ini_path, + )); + if let Err(err) = res { + error!("error while tuning pgbouncer: {err:?}"); + } + }); + } + info!( "start_compute spec.remote_extensions {:?}", pspec.spec.remote_extensions @@ -825,7 +911,7 @@ impl ComputeNode { self.prepare_pgdata(&compute_state, extension_server_port)?; let start_time = Utc::now(); - let pg = self.start_postgres(pspec.storage_auth_token.clone())?; + let pg_process = self.start_postgres(pspec.storage_auth_token.clone())?; let config_time = Utc::now(); if pspec.spec.mode == ComputeMode::Primary && !pspec.spec.skip_pg_catalog_updates { @@ -875,7 +961,7 @@ impl ComputeNode { }; info!(?metrics, "compute start finished"); - Ok(pg) + Ok(pg_process) } // Look for core dumps and collect backtraces. diff --git a/compute_tools/src/logger.rs b/compute_tools/src/logger.rs index 3ae68de8ef..84be5b0809 100644 --- a/compute_tools/src/logger.rs +++ b/compute_tools/src/logger.rs @@ -38,3 +38,9 @@ pub fn init_tracing_and_logging(default_log_level: &str) -> anyhow::Result<()> { Ok(()) } + +/// Replace all newline characters with a special character to make it +/// easier to grep for log messages. 
+pub fn inlinify(s: &str) -> String { + s.replace('\n', "\u{200B}") +} diff --git a/compute_tools/src/monitor.rs b/compute_tools/src/monitor.rs index f974d6023d..fd19b7e53f 100644 --- a/compute_tools/src/monitor.rs +++ b/compute_tools/src/monitor.rs @@ -3,7 +3,7 @@ use std::{thread, time::Duration}; use chrono::{DateTime, Utc}; use postgres::{Client, NoTls}; -use tracing::{debug, info}; +use tracing::{debug, info, warn}; use crate::compute::ComputeNode; @@ -84,6 +84,29 @@ fn watch_compute_activity(compute: &ComputeNode) { } } + // If there are existing (logical) walsenders, do not suspend. + // + // walproposer doesn't currently show up in pg_stat_replication, + // but protect if it will be + let ws_count_query = "select count(*) from pg_stat_replication where application_name != 'walproposer';"; + match cli.query_one(ws_count_query, &[]) { + Ok(r) => match r.try_get::<&str, i64>("count") { + Ok(num_ws) => { + if num_ws > 0 { + last_active = Some(Utc::now()); + } + } + Err(e) => { + warn!("failed to parse ws count: {:?}", e); + continue; + } + }, + Err(e) => { + warn!("failed to get list of walsenders: {:?}", e); + continue; + } + } + // Update the last activity in the shared state if we got a more recent one. let mut state = compute.state.lock().unwrap(); // NB: `Some()` is always greater than `None`. 
diff --git a/compute_tools/src/pg_helpers.rs b/compute_tools/src/pg_helpers.rs index b79e516650..bde1ba0a88 100644 --- a/compute_tools/src/pg_helpers.rs +++ b/compute_tools/src/pg_helpers.rs @@ -6,12 +6,17 @@ use std::io::{BufRead, BufReader}; use std::os::unix::fs::PermissionsExt; use std::path::Path; use std::process::Child; +use std::thread::JoinHandle; use std::time::{Duration, Instant}; use anyhow::{bail, Result}; +use ini::Ini; use notify::{RecursiveMode, Watcher}; use postgres::{Client, Transaction}; -use tracing::{debug, instrument}; +use tokio::io::AsyncBufReadExt; +use tokio::time::timeout; +use tokio_postgres::NoTls; +use tracing::{debug, error, info, instrument}; use compute_api::spec::{Database, GenericOption, GenericOptions, PgIdent, Role}; @@ -359,3 +364,137 @@ pub fn create_pgdata(pgdata: &str) -> Result<()> { Ok(()) } + +/// Update pgbouncer.ini with provided options +pub fn update_pgbouncer_ini( + pgbouncer_config: HashMap, + pgbouncer_ini_path: &str, +) -> Result<()> { + let mut conf = Ini::load_from_file(pgbouncer_ini_path)?; + let section = conf.section_mut(Some("pgbouncer")).unwrap(); + + for (option_name, value) in pgbouncer_config.iter() { + section.insert(option_name, value); + } + + conf.write_to_file(pgbouncer_ini_path)?; + Ok(()) +} + +/// Tune pgbouncer. +/// 1. Apply new config using pgbouncer admin console +/// 2. 
Add new values to pgbouncer.ini to preserve them after restart +pub async fn tune_pgbouncer( + pgbouncer_settings: Option>, + pgbouncer_connstr: &str, + pgbouncer_ini_path: Option, +) -> Result<()> { + if let Some(pgbouncer_config) = pgbouncer_settings { + // Apply new config + let connect_result = tokio_postgres::connect(pgbouncer_connstr, NoTls).await; + let (client, connection) = connect_result.unwrap(); + tokio::spawn(async move { + if let Err(e) = connection.await { + eprintln!("connection error: {}", e); + } + }); + + for (option_name, value) in pgbouncer_config.iter() { + info!( + "Applying pgbouncer setting change: {} = {}", + option_name, value + ); + let query = format!("SET {} = {}", option_name, value); + + let result = client.simple_query(&query).await; + + info!("Applying pgbouncer setting change: {}", query); + info!("pgbouncer setting change result: {:?}", result); + + if let Err(err) = result { + // Don't fail on error, just print it into log + error!( + "Failed to apply pgbouncer setting change: {}, {}", + query, err + ); + }; + } + + // save values to pgbouncer.ini + // so that they are preserved after pgbouncer restart + if let Some(pgbouncer_ini_path) = pgbouncer_ini_path { + update_pgbouncer_ini(pgbouncer_config, &pgbouncer_ini_path)?; + } + } + + Ok(()) +} + +/// Spawn a thread that will read Postgres logs from `stderr`, join multiline logs +/// and send them to the logger. In the future we may also want to add context to +/// these logs. 
+pub fn handle_postgres_logs(stderr: std::process::ChildStderr) -> JoinHandle<()> { + std::thread::spawn(move || { + let runtime = tokio::runtime::Builder::new_current_thread() + .enable_all() + .build() + .expect("failed to build tokio runtime"); + + let res = runtime.block_on(async move { + let stderr = tokio::process::ChildStderr::from_std(stderr)?; + handle_postgres_logs_async(stderr).await + }); + if let Err(e) = res { + tracing::error!("error while processing postgres logs: {}", e); + } + }) +} + +/// Read Postgres logs from `stderr` until EOF. Buffer is flushed on one of the following conditions: +/// - next line starts with timestamp +/// - EOF +/// - no new lines were written for the last second +async fn handle_postgres_logs_async(stderr: tokio::process::ChildStderr) -> Result<()> { + let mut lines = tokio::io::BufReader::new(stderr).lines(); + let timeout_duration = Duration::from_secs(1); + let ts_regex = + regex::Regex::new(r"^\d+-\d{2}-\d{2} \d{2}:\d{2}:\d{2}").expect("regex is valid"); + + let mut buf = vec![]; + loop { + let next_line = timeout(timeout_duration, lines.next_line()).await; + + // we should flush lines from the buffer if we cannot continue reading multiline message + let should_flush_buf = match next_line { + // Flushing if new line starts with timestamp + Ok(Ok(Some(ref line))) => ts_regex.is_match(line), + // Flushing on EOF, timeout or error + _ => true, + }; + + if !buf.is_empty() && should_flush_buf { + // join multiline message into a single line, separated by unicode Zero Width Space. + // "PG:" prefix is used to distinguish postgres logs from other logs.
+ let combined = format!("PG:{}\n", buf.join("\u{200B}")); + buf.clear(); + + // sync write to stderr to avoid interleaving with other logs + use std::io::Write; + let res = std::io::stderr().lock().write_all(combined.as_bytes()); + if let Err(e) = res { + tracing::error!("error while writing to stderr: {}", e); + } + } + + // if not timeout, append line to the buffer + if next_line.is_ok() { + match next_line?? { + Some(line) => buf.push(line), + // EOF + None => break, + }; + } + } + + Ok(()) +} diff --git a/compute_tools/src/spec.rs b/compute_tools/src/spec.rs index d545858dc2..1789df7b79 100644 --- a/compute_tools/src/spec.rs +++ b/compute_tools/src/spec.rs @@ -9,6 +9,7 @@ use reqwest::StatusCode; use tracing::{error, info, info_span, instrument, span_enabled, warn, Level}; use crate::config; +use crate::logger::inlinify; use crate::params::PG_HBA_ALL_MD5; use crate::pg_helpers::*; @@ -662,7 +663,11 @@ pub fn handle_grants(spec: &ComputeSpec, client: &mut Client, connstr: &str) -> $$;" .to_string(); - info!("grant query for db {} : {}", &db.name, &grant_query); + info!( + "grant query for db {} : {}", + &db.name, + inlinify(&grant_query) + ); db_client.simple_query(&grant_query)?; } diff --git a/control_plane/src/endpoint.rs b/control_plane/src/endpoint.rs index 071f22dc2b..3d5dfd6311 100644 --- a/control_plane/src/endpoint.rs +++ b/control_plane/src/endpoint.rs @@ -46,6 +46,8 @@ use std::time::Duration; use anyhow::{anyhow, bail, Context, Result}; use compute_api::spec::RemoteExtSpec; +use nix::sys::signal::kill; +use nix::sys::signal::Signal; use serde::{Deserialize, Serialize}; use utils::id::{NodeId, TenantId, TimelineId}; @@ -439,11 +441,14 @@ impl Endpoint { Ok(()) } - fn wait_for_compute_ctl_to_exit(&self) -> Result<()> { + fn wait_for_compute_ctl_to_exit(&self, send_sigterm: bool) -> Result<()> { // TODO use background_process::stop_process instead let pidfile_path = self.endpoint_path().join("compute_ctl.pid"); let pid: u32 = 
std::fs::read_to_string(pidfile_path)?.parse()?; let pid = nix::unistd::Pid::from_raw(pid as i32); + if send_sigterm { + kill(pid, Signal::SIGTERM).ok(); + } crate::background_process::wait_until_stopped("compute_ctl", pid)?; Ok(()) } @@ -537,6 +542,7 @@ impl Endpoint { safekeeper_connstrings, storage_auth_token: auth_token.clone(), remote_extensions, + pgbouncer_settings: None, }; let spec_path = self.endpoint_path().join("spec.json"); std::fs::write(spec_path, serde_json::to_string_pretty(&spec)?)?; @@ -732,10 +738,15 @@ impl Endpoint { &None, )?; - // Also wait for the compute_ctl process to die. It might have some cleanup - // work to do after postgres stops, like syncing safekeepers, etc. + // Also wait for the compute_ctl process to die. It might have some + // cleanup work to do after postgres stops, like syncing safekeepers, + // etc. // - self.wait_for_compute_ctl_to_exit()?; + // If destroying, send it SIGTERM before waiting. Sometimes we do *not* + // want this cleanup: tests intentionally do stop when majority of + // safekeepers is down, so sync-safekeepers would hang otherwise. This + // could be a separate flag though. + self.wait_for_compute_ctl_to_exit(destroy)?; if destroy { println!( "Destroying postgres data directory '{}'", diff --git a/control_plane/src/pageserver.rs b/control_plane/src/pageserver.rs index 7d490016bf..fb0d251722 100644 --- a/control_plane/src/pageserver.rs +++ b/control_plane/src/pageserver.rs @@ -485,6 +485,13 @@ impl PageServerNode { Ok(self.http_client.list_timelines(*tenant_id).await?) } + pub async fn tenant_secondary_download(&self, tenant_id: &TenantShardId) -> anyhow::Result<()> { + Ok(self + .http_client + .tenant_secondary_download(*tenant_id) + .await?) 
+ } + pub async fn timeline_create( &self, tenant_id: TenantId, diff --git a/control_plane/src/tenant_migration.rs b/control_plane/src/tenant_migration.rs index 79df108896..23ea8f4060 100644 --- a/control_plane/src/tenant_migration.rs +++ b/control_plane/src/tenant_migration.rs @@ -11,6 +11,7 @@ use crate::{ use pageserver_api::models::{ LocationConfig, LocationConfigMode, LocationConfigSecondary, TenantConfig, }; +use pageserver_api::shard::TenantShardId; use std::collections::HashMap; use std::time::Duration; use utils::{ @@ -40,9 +41,9 @@ async fn await_lsn( loop { let latest = match get_lsns(tenant_id, pageserver).await { Ok(l) => l, - Err(e) => { + Err(_e) => { println!( - "🕑 Can't get LSNs on pageserver {} yet, waiting ({e})", + "🕑 Waiting for pageserver {} to activate...", pageserver.conf.id ); std::thread::sleep(Duration::from_millis(500)); @@ -89,7 +90,7 @@ pub async fn migrate_tenant( tenant_id: TenantId, dest_ps: PageServerNode, ) -> anyhow::Result<()> { - // Get a new generation + println!("🤔 Checking existing status..."); let attachment_service = AttachmentService::from_env(env); fn build_location_config( @@ -135,6 +136,20 @@ pub async fn migrate_tenant( baseline_lsns = Some(get_lsns(tenant_id, &origin_ps).await?); } + println!( + "🔁 Downloading latest layers to destination pageserver {}", + dest_ps.conf.id + ); + match dest_ps + .tenant_secondary_download(&TenantShardId::unsharded(tenant_id)) + .await + { + Ok(()) => {} + Err(_) => { + println!(" (skipping, destination wasn't in secondary mode)") + } + } + let gen = attachment_service .attach_hook(tenant_id, dest_ps.conf.id) .await?; diff --git a/deny.toml b/deny.toml index 079dcac679..22e39a2ca3 100644 --- a/deny.toml +++ b/deny.toml @@ -35,6 +35,7 @@ allow = [ "Artistic-2.0", "BSD-2-Clause", "BSD-3-Clause", + "CC0-1.0", "ISC", "MIT", "MPL-2.0", diff --git a/docs/rfcs/030-vectored-timeline-get.md b/docs/rfcs/030-vectored-timeline-get.md new file mode 100644 index 0000000000..d4017471b7 --- /dev/null 
+++ b/docs/rfcs/030-vectored-timeline-get.md @@ -0,0 +1,142 @@ +# Vectored Timeline Get + +Created on: 2024-01-02 +Author: Christian Schwarz + +# Summary + +A brief RFC / GitHub Epic describing a vectored version of the `Timeline::get` method that is at the heart of Pageserver. + +# Motivation + +During basebackup, we issue many `Timeline::get` calls for SLRU pages that are *adjacent* in key space. +For an example, see +https://github.com/neondatabase/neon/blob/5c88213eaf1b1e29c610a078d0b380f69ed49a7e/pageserver/src/basebackup.rs#L281-L302. + +Each of these `Timeline::get` calls must traverse the layer map to gather reconstruct data (`Timeline::get_reconstruct_data`) for the requested page number (`blknum` in the example). +For each layer visited by layer map traversal, we do a `DiskBtree` point lookup. +If it's negative (no entry), we resume layer map traversal. +If it's positive, we collect the result in our reconstruct data bag. +If the reconstruct data bag contents suffice to reconstruct the page, we're done with `get_reconstruct_data` and move on to walredo. +Otherwise, we resume layer map traversal. + +Doing this many `Timeline::get` calls is quite inefficient because: + +1. We do the layer map traversal repeatedly, even if, e.g., all the data sits in the same image layer at the bottom of the stack. +2. We may visit many DiskBtree inner pages multiple times for point lookup of different keys. + This is likely particularly bad for L0s which span the whole key space and hence must be visited by layer map traversal, but + may not contain the data we're looking for. +3. Anecdotally, keys adjacent in keyspace and written simultaneously also end up physically adjacent in the layer files [^1]. + So, to provide the reconstruct data for N adjacent keys, we would actually only _need_ to issue a single large read to the filesystem, instead of the N reads we currently do. 
+ The filesystem, in turn, ideally stores the layer file physically contiguously, so our large read will turn into one IOP toward the disk. + +[^1]: https://www.notion.so/neondatabase/Christian-Investigation-Slow-Basebackups-Early-2023-12-34ea5c7dcdc1485d9ac3731da4d2a6fc?pvs=4#15ee4e143392461fa64590679c8f54c9 + +# Solution + +We should have a vectored aka batched aka scatter-gather style alternative API for `Timeline::get`. Having such an API unlocks: + +* more efficient basebackup +* batched IO during compaction (useful for strides of unchanged pages) +* page_service: expose vectored get_page_at_lsn for compute (=> good for seqscan / prefetch) + * if [on-demand SLRU downloads](https://github.com/neondatabase/neon/pull/6151) land before vectored Timeline::get, on-demand SLRU downloads will still benefit from this API + +# DoD + +There is a new variant of `Timeline::get`, called `Timeline::get_vectored`. +It takes as arguments an `lsn: Lsn` and a `src: &[KeyVec]` where `struct KeyVec { base: Key, count: usize }`. + +It is up to the implementor to figure out a suitable and efficient way to return the reconstructed page images. +It is sufficient to simply return a `Vec`, but, likely more efficient solutions can be found after studying all the callers of `Timeline::get`. + +Functionally, the behavior of `Timeline::get_vectored` is equivalent to + +```rust +let mut keys_iter: impl Iterator + = src.map(|KeyVec{ base, count }| (base..base+count)).flatten(); +let mut out = Vec::new(); +for key in keys_iter { + let data = Timeline::get(key, lsn)?; + out.push(data); +} +return out; +``` + +However, unlike above, an ideal solution will + +* Visit each `struct Layer` at most once. +* For each visited layer, call `Layer::get_value_reconstruct_data` at most once. + * This means, read each `DiskBtree` page at most once. +* Facilitate merging of the reads we issue to the OS and eventually NVMe. + +Each of these items above represents a significant amount of work. 
+ +## Performance + +Ideally, the **base performance** of a vectored get of a single page should be identical to the current `Timeline::get`. +A reasonable constant overhead over current `Timeline::get` is acceptable. + +The performance improvement for the vectored use case is demonstrated in some way, e.g., using the `pagebench` basebackup benchmark against a tenant with a lot of SLRU segments. + +# Implementation + +High-level set of tasks / changes to be made: + +- **Get clarity on API**: + - Define naive `Timeline::get_vectored` implementation & adopt it across pageserver. + - The tricky thing here will be the return type (e.g. `Vec` vs `impl Stream`). + - Start with something simple to explore the different usages of the API. + Then iterate with peers until we have something that is good enough. +- **Vectored Layer Map traversal** + - Vectored `LayerMap::search` (take 1 LSN and N `Key`s instead of just 1 LSN and 1 `Key`) + - Refactor `Timeline::get_reconstruct_data` to hold & return state for N `Key`s instead of 1 + - The slightly tricky part here is what to do about `cont_lsn` [after we've found some reconstruct data for some keys](https://github.com/neondatabase/neon/blob/d066dad84b076daf3781cdf9a692098889d3974e/pageserver/src/tenant/timeline.rs#L2378-L2385) + but need more. + Likely we'll need to keep track of `cont_lsn` per key and continue next iteration at `max(cont_lsn)` of all keys that still need data. +- **Vectored `Layer::get_value_reconstruct_data` / `DiskBtree`** + - Current code calls it [here](https://github.com/neondatabase/neon/blob/d066dad84b076daf3781cdf9a692098889d3974e/pageserver/src/tenant/timeline.rs#L2378-L2384). + - Delta layers use `DiskBtreeReader::visit()` to collect the `(offset,len)` pairs for delta record blobs to load. + - Image layers use `DiskBtreeReader::get` to get the offset of the image blob to load. Underneath, that's just a `::visit()` call. + - What needs to happen to `DiskBtree::visit()`? 
+ * Minimally + * take a single `KeyVec` instead of a single `Key` as argument, i.e., take a single contiguous key range to visit. + * Change the visit code to invoke the callback for all values in the `KeyVec`'s key range + * This should be good enough for what we've seen when investigating basebackup slowness, because there, the key ranges are contiguous. + * Ideally: + * Take a `&[KeyVec]`, sort it; + * during Btree traversal, peek at the next `KeyVec` range to determine whether we need to descend or back out. + * NB: this should be a straight-forward extension of the minimal solution above, as we'll already be checking for "is there more key range in the requested `KeyVec`". +- **Facilitate merging of the reads we issue to the OS and eventually NVMe.** + - The `DiskBtree::visit` produces a set of offsets which we then read from a `VirtualFile` [here](https://github.com/neondatabase/neon/blob/292281c9dfb24152b728b1a846cc45105dac7fe0/pageserver/src/tenant/storage_layer/delta_layer.rs#L772-L804) + - [Delta layer reads](https://github.com/neondatabase/neon/blob/292281c9dfb24152b728b1a846cc45105dac7fe0/pageserver/src/tenant/storage_layer/delta_layer.rs#L772-L804) + - We hit (and rely) on `PageCache` and `VirtualFile` here (not great under pressure) + - [Image layer reads](https://github.com/neondatabase/neon/blob/292281c9dfb24152b728b1a846cc45105dac7fe0/pageserver/src/tenant/storage_layer/image_layer.rs#L429-L435) + - What needs to happen is the **vectorization of the `blob_io` interface and then the `VirtualFile` API**. + - That is tricky because + - the `VirtualFile` API, which sits underneath `blob_io`, is being touched by ongoing [io_uring work](https://github.com/neondatabase/neon/pull/5824) + - there's the question how IO buffers will be managed; currently this area relies heavily on `PageCache`, but there's controversy around the future of `PageCache`. + - The guiding principle here should be to avoid coupling this work to the `PageCache`. 
+ - I.e., treat `PageCache` as an extra hop in the I/O chain, rather than as an integral part of buffer management. + + +Let's see how we can improve by doing the first three items in above list first, then revisit. + +## Rollout / Feature Flags + +No feature flags are required for this epic. + +At the end of this epic, `Timeline::get` forwards to `Timeline::get_vectored`, i.e., it's an all-or-nothing type of change. + +It is encouraged to deliver this feature incrementally, i.e., do many small PRs over multiple weeks. +That will help isolate performance regressions across weekly releases. + +# Interaction With Sharding + +[Sharding](https://github.com/neondatabase/neon/pull/5432) splits up the key space, see functions `is_key_local` / `key_to_shard_number`. + +Just as with `Timeline::get`, callers of `Timeline::get_vectored` are responsible for ensuring that they only ask for blocks of the given `struct Timeline`'s shard. + +Given that this is already the case, there shouldn't be significant interaction/interference with sharding. + +However, let's have a safety check for this constraint (error or assertion) because there are currently few affordances at the higher layers of Pageserver for sharding<=>keyspace interaction. +For example, `KeySpace` is not broken up by shard stripe, so if someone naively converted the compaction code to issue a vectored get for a keyspace range it would violate this constraint. diff --git a/docs/sourcetree.md b/docs/sourcetree.md index 95bed83ae5..12fa80349e 100644 --- a/docs/sourcetree.md +++ b/docs/sourcetree.md @@ -129,13 +129,13 @@ Run `poetry shell` to activate the virtual environment. Alternatively, use `poetry run` to run a single command in the venv, e.g. `poetry run pytest`. ### Obligatory checks -We force code formatting via `black`, `ruff`, and type hints via `mypy`. +We force code formatting via `ruff`, and type hints via `mypy`. 
Run the following commands in the repository's root (next to `pyproject.toml`): ```bash -poetry run black . # All code is reformatted -poetry run ruff . # Python linter -poetry run mypy . # Ensure there are no typing errors +poetry run ruff format . # All code is reformatted +poetry run ruff check . # Python linter +poetry run mypy . # Ensure there are no typing errors ``` **WARNING**: do not run `mypy` from a directory other than the root of the repository. diff --git a/libs/compute_api/src/spec.rs b/libs/compute_api/src/spec.rs index 2a483188e4..4ff6831272 100644 --- a/libs/compute_api/src/spec.rs +++ b/libs/compute_api/src/spec.rs @@ -73,6 +73,8 @@ pub struct ComputeSpec { // information about available remote extensions pub remote_extensions: Option, + + pub pgbouncer_settings: Option>, } /// Feature flag to signal `compute_ctl` to enable certain experimental functionality. diff --git a/libs/compute_api/tests/cluster_spec.json b/libs/compute_api/tests/cluster_spec.json index e2afa17ef0..ccd015ad19 100644 --- a/libs/compute_api/tests/cluster_spec.json +++ b/libs/compute_api/tests/cluster_spec.json @@ -243,5 +243,9 @@ "public_extensions": [ "postgis" ] + }, + "pgbouncer_settings": { + "default_pool_size": "42", + "pool_mode": "session" } } diff --git a/libs/pageserver_api/src/key.rs b/libs/pageserver_api/src/key.rs index d680a5600e..e00d15e494 100644 --- a/libs/pageserver_api/src/key.rs +++ b/libs/pageserver_api/src/key.rs @@ -141,8 +141,9 @@ impl Key { } } +#[inline(always)] pub fn is_rel_block_key(key: &Key) -> bool { - key.field1 == 0x00 && key.field4 != 0 + key.field1 == 0x00 && key.field4 != 0 && key.field6 != 0xffffffff } impl std::str::FromStr for Key { diff --git a/libs/pageserver_api/src/keyspace.rs b/libs/pageserver_api/src/keyspace.rs index 16651c322e..cab7b3d860 100644 --- a/libs/pageserver_api/src/keyspace.rs +++ b/libs/pageserver_api/src/keyspace.rs @@ -114,16 +114,21 @@ impl KeySpaceAccum { } } + #[inline(always)] pub fn add_key(&mut self, key: 
Key) { self.add_range(singleton_range(key)) } + #[inline(always)] pub fn add_range(&mut self, range: Range) { match self.accum.as_mut() { Some(accum) => { if range.start == accum.end { accum.end = range.end; } else { + // TODO: to efficiently support small sharding stripe sizes, we should avoid starting + // a new range here if the skipped region was all keys that don't belong on this shard. + // (https://github.com/neondatabase/neon/issues/6247) assert!(range.start > accum.end); self.ranges.push(accum.clone()); *accum = range; diff --git a/libs/pageserver_api/src/models.rs b/libs/pageserver_api/src/models.rs index be41b610b8..316d79b634 100644 --- a/libs/pageserver_api/src/models.rs +++ b/libs/pageserver_api/src/models.rs @@ -2,7 +2,7 @@ pub mod partitioning; use std::{ collections::HashMap, - io::Read, + io::{BufRead, Read}, num::{NonZeroU64, NonZeroUsize}, time::SystemTime, }; @@ -557,19 +557,6 @@ pub enum DownloadRemoteLayersTaskState { ShutDown, } -pub type ConfigureFailpointsRequest = Vec; - -/// Information for configuring a single fail point -#[derive(Debug, Serialize, Deserialize)] -pub struct FailpointConfig { - /// Name of the fail point - pub name: String, - /// List of actions to take, using the format described in `fail::cfg` - /// - /// We also support `actions = "exit"` to cause the fail point to immediately exit. 
- pub actions: String, -} - #[derive(Debug, Serialize, Deserialize)] pub struct TimelineGcRequest { pub gc_horizon: Option, @@ -826,9 +813,10 @@ impl PagestreamBeMessage { PagestreamBeMessage::GetPage(PagestreamGetPageResponse { page: page.into() }) } Tag::Error => { - let buf = buf.get_ref(); - let cstr = std::ffi::CStr::from_bytes_until_nul(buf)?; - let rust_str = cstr.to_str()?; + let mut msg = Vec::new(); + buf.read_until(0, &mut msg)?; + let cstring = std::ffi::CString::from_vec_with_nul(msg)?; + let rust_str = cstring.to_str()?; PagestreamBeMessage::Error(PagestreamErrorResponse { message: rust_str.to_owned(), }) diff --git a/libs/pageserver_api/src/shard.rs b/libs/pageserver_api/src/shard.rs index 3668f7939d..18ef2be523 100644 --- a/libs/pageserver_api/src/shard.rs +++ b/libs/pageserver_api/src/shard.rs @@ -81,6 +81,10 @@ impl TenantShardId { pub fn is_zero(&self) -> bool { self.shard_number == ShardNumber(0) } + + pub fn is_unsharded(&self) -> bool { + self.shard_number == ShardNumber(0) && self.shard_count == ShardCount(0) + } } /// Formatting helper @@ -418,6 +422,21 @@ impl ShardIdentity { } } + /// Return true if the key should be discarded if found in this shard's + /// data store, e.g. during compaction after a split + pub fn is_key_disposable(&self, key: &Key) -> bool { + if key_is_shard0(key) { + // Q: Why can't we dispose of shard0 content if we're not shard 0? + // A: because the WAL ingestion logic currently ingests some shard 0 + // content on all shards, even though it's only read on shard 0. If we + // dropped it, then subsequent WAL ingest to these keys would encounter + // an error. + false + } else { + !self.is_key_local(key) + } + } + pub fn shard_slug(&self) -> String { if self.count > ShardCount(0) { format!("-{:02x}{:02x}", self.number.0, self.count.0) @@ -511,12 +530,7 @@ fn key_is_shard0(key: &Key) -> bool { // relation pages are distributed to shards other than shard zero. Everything else gets // stored on shard 0. 
This guarantees that shard 0 can independently serve basebackup // requests, and any request other than those for particular blocks in relations. - // - // In this condition: - // - is_rel_block_key includes only relations, i.e. excludes SLRU data and - // all metadata. - // - field6 is set to -1 for relation size pages. - !(is_rel_block_key(key) && key.field6 != 0xffffffff) + !is_rel_block_key(key) } /// Provide the same result as the function in postgres `hashfn.h` with the same name diff --git a/libs/postgres_backend/src/lib.rs b/libs/postgres_backend/src/lib.rs index 1dae008a4f..73d25619c3 100644 --- a/libs/postgres_backend/src/lib.rs +++ b/libs/postgres_backend/src/lib.rs @@ -35,6 +35,12 @@ pub enum QueryError { /// We were instructed to shutdown while processing the query #[error("Shutting down")] Shutdown, + /// Query handler indicated that client should reconnect + #[error("Server requested reconnect")] + Reconnect, + /// Query named an entity that was not found + #[error("Not found: {0}")] + NotFound(std::borrow::Cow<'static, str>), /// Authentication failure #[error("Unauthorized: {0}")] Unauthorized(std::borrow::Cow<'static, str>), @@ -54,9 +60,9 @@ impl From for QueryError { impl QueryError { pub fn pg_error_code(&self) -> &'static [u8; 5] { match self { - Self::Disconnected(_) | Self::SimulatedConnectionError => b"08006", // connection failure + Self::Disconnected(_) | Self::SimulatedConnectionError | Self::Reconnect => b"08006", // connection failure Self::Shutdown => SQLSTATE_ADMIN_SHUTDOWN, - Self::Unauthorized(_) => SQLSTATE_INTERNAL_ERROR, + Self::Unauthorized(_) | Self::NotFound(_) => SQLSTATE_INTERNAL_ERROR, Self::Other(_) => SQLSTATE_INTERNAL_ERROR, // internal error } } @@ -425,6 +431,11 @@ impl PostgresBackend { info!("Stopped due to shutdown"); Ok(()) } + Err(QueryError::Reconnect) => { + // Dropping out of this loop implicitly disconnects + info!("Stopped due to handler reconnect request"); + Ok(()) + } Err(QueryError::Disconnected(e)) => { 
info!("Disconnected ({e:#})"); // Disconnection is not an error: we just use it that way internally to drop @@ -974,7 +985,9 @@ impl<'a, IO: AsyncRead + AsyncWrite + Unpin> AsyncWrite for CopyDataWriter<'a, I pub fn short_error(e: &QueryError) -> String { match e { QueryError::Disconnected(connection_error) => connection_error.to_string(), + QueryError::Reconnect => "reconnect".to_string(), QueryError::Shutdown => "shutdown".to_string(), + QueryError::NotFound(_) => "not found".to_string(), QueryError::Unauthorized(_e) => "JWT authentication error".to_string(), QueryError::SimulatedConnectionError => "simulated connection error".to_string(), QueryError::Other(e) => format!("{e:#}"), @@ -996,9 +1009,15 @@ fn log_query_error(query: &str, e: &QueryError) { QueryError::SimulatedConnectionError => { error!("query handler for query '{query}' failed due to a simulated connection error") } + QueryError::Reconnect => { + info!("query handler for '{query}' requested client to reconnect") + } QueryError::Shutdown => { info!("query handler for '{query}' cancelled during tenant shutdown") } + QueryError::NotFound(reason) => { + info!("query handler for '{query}' entity not found: {reason}") + } QueryError::Unauthorized(e) => { warn!("query handler for '{query}' failed with authentication error: {e}"); } diff --git a/libs/remote_storage/src/azure_blob.rs b/libs/remote_storage/src/azure_blob.rs index 548bde02f6..18cf5d97ba 100644 --- a/libs/remote_storage/src/azure_blob.rs +++ b/libs/remote_storage/src/azure_blob.rs @@ -117,6 +117,8 @@ impl AzureBlobStorage { ) -> Result { let mut response = builder.into_stream(); + let mut etag = None; + let mut last_modified = None; let mut metadata = HashMap::new(); // TODO give proper streaming response instead of buffering into RAM // https://github.com/neondatabase/neon/issues/5563 @@ -124,6 +126,13 @@ impl AzureBlobStorage { let mut bufs = Vec::new(); while let Some(part) = response.next().await { let part = 
part.map_err(to_download_error)?; + let etag_str: &str = part.blob.properties.etag.as_ref(); + if etag.is_none() { + etag = Some(etag.unwrap_or_else(|| etag_str.to_owned())); + } + if last_modified.is_none() { + last_modified = Some(part.blob.properties.last_modified.into()); + } if let Some(blob_meta) = part.blob.metadata { metadata.extend(blob_meta.iter().map(|(k, v)| (k.to_owned(), v.to_owned()))); } @@ -136,6 +145,8 @@ impl AzureBlobStorage { } Ok(Download { download_stream: Box::pin(futures::stream::iter(bufs.into_iter().map(Ok))), + etag, + last_modified, metadata: Some(StorageMetadata(metadata)), }) } @@ -311,6 +322,12 @@ impl RemoteStorage for AzureBlobStorage { } Ok(()) } + + async fn copy(&self, _from: &RemotePath, _to: &RemotePath) -> anyhow::Result<()> { + Err(anyhow::anyhow!( + "copy for azure blob storage is not implemented" + )) + } } pin_project_lite::pin_project! { diff --git a/libs/remote_storage/src/lib.rs b/libs/remote_storage/src/lib.rs index e77c54e1e7..942d0016b0 100644 --- a/libs/remote_storage/src/lib.rs +++ b/libs/remote_storage/src/lib.rs @@ -14,7 +14,9 @@ mod local_fs; mod s3_bucket; mod simulate_failures; -use std::{collections::HashMap, fmt::Debug, num::NonZeroUsize, pin::Pin, sync::Arc}; +use std::{ + collections::HashMap, fmt::Debug, num::NonZeroUsize, pin::Pin, sync::Arc, time::SystemTime, +}; use anyhow::{bail, Context}; use camino::{Utf8Path, Utf8PathBuf}; @@ -205,10 +207,18 @@ pub trait RemoteStorage: Send + Sync + 'static { async fn delete(&self, path: &RemotePath) -> anyhow::Result<()>; async fn delete_objects<'a>(&self, paths: &'a [RemotePath]) -> anyhow::Result<()>; + + /// Copy a remote object inside a bucket from one path to another. 
+ async fn copy(&self, from: &RemotePath, to: &RemotePath) -> anyhow::Result<()>; } +pub type DownloadStream = Pin> + Unpin + Send + Sync>>; pub struct Download { - pub download_stream: Pin> + Unpin + Send + Sync>>, + pub download_stream: DownloadStream, + /// The last time the file was modified (`last-modified` HTTP header) + pub last_modified: Option, + /// A way to identify this specific version of the resource (`etag` HTTP header) + pub etag: Option, /// Extra key-value data, associated with the current remote file. pub metadata: Option, } @@ -367,6 +377,15 @@ impl GenericRemoteStorage { Self::Unreliable(s) => s.delete_objects(paths).await, } } + + pub async fn copy_object(&self, from: &RemotePath, to: &RemotePath) -> anyhow::Result<()> { + match self { + Self::LocalFs(s) => s.copy(from, to).await, + Self::AwsS3(s) => s.copy(from, to).await, + Self::AzureBlob(s) => s.copy(from, to).await, + Self::Unreliable(s) => s.copy(from, to).await, + } + } } impl GenericRemoteStorage { @@ -653,6 +672,7 @@ impl ConcurrencyLimiter { RequestKind::Put => &self.write, RequestKind::List => &self.read, RequestKind::Delete => &self.write, + RequestKind::Copy => &self.write, } } diff --git a/libs/remote_storage/src/local_fs.rs b/libs/remote_storage/src/local_fs.rs index 03b98e5ea2..bf8b6b5dde 100644 --- a/libs/remote_storage/src/local_fs.rs +++ b/libs/remote_storage/src/local_fs.rs @@ -18,7 +18,7 @@ use tokio_util::io::ReaderStream; use tracing::*; use utils::{crashsafe::path_with_suffix_extension, fs_ext::is_directory_empty}; -use crate::{Download, DownloadError, Listing, ListingMode, RemotePath}; +use crate::{Download, DownloadError, DownloadStream, Listing, ListingMode, RemotePath}; use super::{RemoteStorage, StorageMetadata}; @@ -331,6 +331,8 @@ impl RemoteStorage for LocalFs { .map_err(DownloadError::Other)?; Ok(Download { metadata, + last_modified: None, + etag: None, download_stream: Box::pin(source), }) } else { @@ -372,17 +374,17 @@ impl RemoteStorage for LocalFs { .await 
.map_err(DownloadError::Other)?; - Ok(match end_exclusive { - Some(end_exclusive) => Download { - metadata, - download_stream: Box::pin(ReaderStream::new( - source.take(end_exclusive - start_inclusive), - )), - }, - None => Download { - metadata, - download_stream: Box::pin(ReaderStream::new(source)), - }, + let download_stream: DownloadStream = match end_exclusive { + Some(end_exclusive) => Box::pin(ReaderStream::new( + source.take(end_exclusive - start_inclusive), + )), + None => Box::pin(ReaderStream::new(source)), + }; + Ok(Download { + metadata, + last_modified: None, + etag: None, + download_stream, }) } else { Err(DownloadError::NotFound) @@ -407,6 +409,20 @@ impl RemoteStorage for LocalFs { } Ok(()) } + + async fn copy(&self, from: &RemotePath, to: &RemotePath) -> anyhow::Result<()> { + let from_path = from.with_base(&self.storage_root); + let to_path = to.with_base(&self.storage_root); + create_target_directory(&to_path).await?; + fs::copy(&from_path, &to_path).await.with_context(|| { + format!( + "Failed to copy file from '{from_path}' to '{to_path}'", + from_path = from_path, + to_path = to_path + ) + })?; + Ok(()) + } } fn storage_metadata_path(original_path: &Utf8Path) -> Utf8PathBuf { diff --git a/libs/remote_storage/src/s3_bucket.rs b/libs/remote_storage/src/s3_bucket.rs index d63a5ed99b..d7b41edaaf 100644 --- a/libs/remote_storage/src/s3_bucket.rs +++ b/libs/remote_storage/src/s3_bucket.rs @@ -16,6 +16,7 @@ use aws_config::{ environment::credentials::EnvironmentVariableCredentialsProvider, imds::credentials::ImdsCredentialsProvider, meta::credentials::CredentialsProviderChain, + profile::ProfileFileCredentialsProvider, provider_config::ProviderConfig, retry::{RetryConfigBuilder, RetryMode}, web_identity_token::WebIdentityTokenCredentialsProvider, @@ -74,20 +75,29 @@ impl S3Bucket { let region = Some(Region::new(aws_config.bucket_region.clone())); + let provider_conf = ProviderConfig::without_region().with_region(region.clone()); + let 
credentials_provider = { // uses "AWS_ACCESS_KEY_ID", "AWS_SECRET_ACCESS_KEY" CredentialsProviderChain::first_try( "env", EnvironmentVariableCredentialsProvider::new(), ) + // uses "AWS_PROFILE" / `aws sso login --profile ` + .or_else( + "profile-sso", + ProfileFileCredentialsProvider::builder() + .configure(&provider_conf) + .build(), + ) // uses "AWS_WEB_IDENTITY_TOKEN_FILE", "AWS_ROLE_ARN", "AWS_ROLE_SESSION_NAME" // needed to access remote extensions bucket - .or_else("token", { - let provider_conf = ProviderConfig::without_region().with_region(region.clone()); + .or_else( + "token", WebIdentityTokenCredentialsProvider::builder() .configure(&provider_conf) - .build() - }) + .build(), + ) // uses imds v2 .or_else("imds", ImdsCredentialsProvider::builder().build()) }; @@ -221,6 +231,8 @@ impl S3Bucket { match get_object { Ok(object_output) => { let metadata = object_output.metadata().cloned().map(StorageMetadata); + let etag = object_output.e_tag.clone(); + let last_modified = object_output.last_modified.and_then(|t| t.try_into().ok()); let body = object_output.body; let body = ByteStreamAsStream::from(body); @@ -229,6 +241,8 @@ impl S3Bucket { Ok(Download { metadata, + etag, + last_modified, download_stream: Box::pin(body), }) } @@ -479,6 +493,38 @@ impl RemoteStorage for S3Bucket { Ok(()) } + async fn copy(&self, from: &RemotePath, to: &RemotePath) -> anyhow::Result<()> { + let kind = RequestKind::Copy; + let _guard = self.permit(kind).await; + + let started_at = start_measuring_requests(kind); + + // we need to specify bucket_name as a prefix + let copy_source = format!( + "{}/{}", + self.bucket_name, + self.relative_path_to_s3_object(from) + ); + + let res = self + .client + .copy_object() + .bucket(self.bucket_name.clone()) + .key(self.relative_path_to_s3_object(to)) + .copy_source(copy_source) + .send() + .await; + + let started_at = ScopeGuard::into_inner(started_at); + metrics::BUCKET_METRICS + .req_seconds + .observe_elapsed(kind, &res, started_at); + + 
res?; + + Ok(()) + } + async fn download(&self, from: &RemotePath) -> Result { // if prefix is not none then download file `prefix/from` // if prefix is none then download file `from` diff --git a/libs/remote_storage/src/s3_bucket/metrics.rs b/libs/remote_storage/src/s3_bucket/metrics.rs index ea11edafa5..21dde14906 100644 --- a/libs/remote_storage/src/s3_bucket/metrics.rs +++ b/libs/remote_storage/src/s3_bucket/metrics.rs @@ -11,6 +11,7 @@ pub(crate) enum RequestKind { Put = 1, Delete = 2, List = 3, + Copy = 4, } use RequestKind::*; @@ -22,6 +23,7 @@ impl RequestKind { Put => "put_object", Delete => "delete_object", List => "list_objects", + Copy => "copy_object", } } const fn as_index(&self) -> usize { @@ -29,7 +31,7 @@ impl RequestKind { } } -pub(super) struct RequestTyped([C; 4]); +pub(super) struct RequestTyped([C; 5]); impl RequestTyped { pub(super) fn get(&self, kind: RequestKind) -> &C { @@ -38,8 +40,8 @@ impl RequestTyped { fn build_with(mut f: impl FnMut(RequestKind) -> C) -> Self { use RequestKind::*; - let mut it = [Get, Put, Delete, List].into_iter(); - let arr = std::array::from_fn::(|index| { + let mut it = [Get, Put, Delete, List, Copy].into_iter(); + let arr = std::array::from_fn::(|index| { let next = it.next().unwrap(); assert_eq!(index, next.as_index()); f(next) diff --git a/libs/remote_storage/src/simulate_failures.rs b/libs/remote_storage/src/simulate_failures.rs index 802b0db7f5..7f5adcea30 100644 --- a/libs/remote_storage/src/simulate_failures.rs +++ b/libs/remote_storage/src/simulate_failures.rs @@ -162,4 +162,11 @@ impl RemoteStorage for UnreliableWrapper { } Ok(()) } + + async fn copy(&self, from: &RemotePath, to: &RemotePath) -> anyhow::Result<()> { + // copy is equivalent to download + upload + self.attempt(RemoteOp::Download(from.clone()))?; + self.attempt(RemoteOp::Upload(to.clone()))?; + self.inner.copy_object(from, to).await + } } diff --git a/libs/safekeeper_api/src/models.rs b/libs/safekeeper_api/src/models.rs index 
786712deb1..ce5a1e411e 100644 --- a/libs/safekeeper_api/src/models.rs +++ b/libs/safekeeper_api/src/models.rs @@ -51,3 +51,9 @@ pub struct SkTimelineInfo { #[serde(default)] pub http_connstr: Option, } + +#[derive(Debug, Clone, Deserialize, Serialize)] +pub struct TimelineCopyRequest { + pub target_timeline_id: TimelineId, + pub until_lsn: Lsn, +} diff --git a/libs/utils/Cargo.toml b/libs/utils/Cargo.toml index af0414daa2..706b7a3187 100644 --- a/libs/utils/Cargo.toml +++ b/libs/utils/Cargo.toml @@ -4,6 +4,12 @@ version = "0.1.0" edition.workspace = true license.workspace = true +[features] +default = [] +# Enables test-only APIs, including failpoints. In particular, enables the `fail_point!` macro, +# which adds some runtime cost to run tests on outage conditions +testing = ["fail/failpoints"] + [dependencies] arc-swap.workspace = true sentry.workspace = true @@ -16,6 +22,7 @@ chrono.workspace = true heapless.workspace = true hex = { workspace = true, features = ["serde"] } hyper = { workspace = true, features = ["full"] } +fail.workspace = true futures = { workspace = true} jsonwebtoken.workspace = true nix.workspace = true diff --git a/libs/utils/src/failpoint_support.rs b/libs/utils/src/failpoint_support.rs new file mode 100644 index 0000000000..8704b72921 --- /dev/null +++ b/libs/utils/src/failpoint_support.rs @@ -0,0 +1,177 @@ +//! Failpoint support code shared between pageserver and safekeepers. + +use crate::http::{ + error::ApiError, + json::{json_request, json_response}, +}; +use hyper::{Body, Request, Response, StatusCode}; +use serde::{Deserialize, Serialize}; +use tokio_util::sync::CancellationToken; +use tracing::*; + +/// use with fail::cfg("$name", "return(2000)") +/// +/// The effect is similar to a "sleep(2000)" action, i.e. we sleep for the +/// specified time (in milliseconds). The main difference is that we use async +/// tokio sleep function. 
Another difference is that we print lines to the log, +/// which can be useful in tests to check that the failpoint was hit. +/// +/// Optionally pass a cancellation token, and this failpoint will drop out of +/// its sleep when the cancellation token fires. This is useful for testing +/// cases where we would like to block something, but test its clean shutdown behavior. +#[macro_export] +macro_rules! __failpoint_sleep_millis_async { + ($name:literal) => {{ + // If the failpoint is used with a "return" action, set should_sleep to the + // returned value (as string). Otherwise it's set to None. + let should_sleep = (|| { + ::fail::fail_point!($name, |x| x); + ::std::option::Option::None + })(); + + // Sleep if the action was a returned value + if let ::std::option::Option::Some(duration_str) = should_sleep { + $crate::failpoint_support::failpoint_sleep_helper($name, duration_str).await + } + }}; + ($name:literal, $cancel:expr) => {{ + // If the failpoint is used with a "return" action, set should_sleep to the + // returned value (as string). Otherwise it's set to None. + let should_sleep = (|| { + ::fail::fail_point!($name, |x| x); + ::std::option::Option::None + })(); + + // Sleep if the action was a returned value + if let ::std::option::Option::Some(duration_str) = should_sleep { + $crate::failpoint_support::failpoint_sleep_cancellable_helper( + $name, + duration_str, + $cancel, + ) + .await + } + }}; +} +pub use __failpoint_sleep_millis_async as sleep_millis_async; + +// Helper function used by the macro. 
(A function has nicer scoping so we +// don't need to decorate everything with "::") +#[doc(hidden)] +pub async fn failpoint_sleep_helper(name: &'static str, duration_str: String) { + let millis = duration_str.parse::().unwrap(); + let d = std::time::Duration::from_millis(millis); + + tracing::info!("failpoint {:?}: sleeping for {:?}", name, d); + tokio::time::sleep(d).await; + tracing::info!("failpoint {:?}: sleep done", name); +} + +// Helper function used by the macro. (A function has nicer scoping so we +// don't need to decorate everything with "::") +#[doc(hidden)] +pub async fn failpoint_sleep_cancellable_helper( + name: &'static str, + duration_str: String, + cancel: &CancellationToken, +) { + let millis = duration_str.parse::().unwrap(); + let d = std::time::Duration::from_millis(millis); + + tracing::info!("failpoint {:?}: sleeping for {:?}", name, d); + tokio::time::timeout(d, cancel.cancelled()).await.ok(); + tracing::info!("failpoint {:?}: sleep done", name); +} + +pub fn init() -> fail::FailScenario<'static> { + // The failpoints lib provides support for parsing the `FAILPOINTS` env var. + // We want non-default behavior for `exit`, though, so, we handle it separately. + // + // Format for FAILPOINTS is "name=actions" separated by ";". 
+ let actions = std::env::var("FAILPOINTS"); + if actions.is_ok() { + std::env::remove_var("FAILPOINTS"); + } else { + // let the library handle non-utf8, or nothing for not present + } + + let scenario = fail::FailScenario::setup(); + + if let Ok(val) = actions { + val.split(';') + .enumerate() + .map(|(i, s)| s.split_once('=').ok_or((i, s))) + .for_each(|res| { + let (name, actions) = match res { + Ok(t) => t, + Err((i, s)) => { + panic!( + "startup failpoints: missing action on the {}th failpoint; try `{s}=return`", + i + 1, + ); + } + }; + if let Err(e) = apply_failpoint(name, actions) { + panic!("startup failpoints: failed to apply failpoint {name}={actions}: {e}"); + } + }); + } + + scenario +} + +pub fn apply_failpoint(name: &str, actions: &str) -> Result<(), String> { + if actions == "exit" { + fail::cfg_callback(name, exit_failpoint) + } else { + fail::cfg(name, actions) + } +} + +#[inline(never)] +fn exit_failpoint() { + tracing::info!("Exit requested by failpoint"); + std::process::exit(1); +} + +pub type ConfigureFailpointsRequest = Vec; + +/// Information for configuring a single fail point +#[derive(Debug, Serialize, Deserialize)] +pub struct FailpointConfig { + /// Name of the fail point + pub name: String, + /// List of actions to take, using the format described in `fail::cfg` + /// + /// We also support `actions = "exit"` to cause the fail point to immediately exit. + pub actions: String, +} + +/// Configure failpoints through http. 
+pub async fn failpoints_handler( + mut request: Request, + _cancel: CancellationToken, +) -> Result, ApiError> { + if !fail::has_failpoints() { + return Err(ApiError::BadRequest(anyhow::anyhow!( + "Cannot manage failpoints because storage was compiled without failpoints support" + ))); + } + + let failpoints: ConfigureFailpointsRequest = json_request(&mut request).await?; + for fp in failpoints { + info!("cfg failpoint: {} {}", fp.name, fp.actions); + + // We recognize one extra "action" that's not natively recognized + // by the failpoints crate: exit, to immediately kill the process + let cfg_result = apply_failpoint(&fp.name, &fp.actions); + + if let Err(err_msg) = cfg_result { + return Err(ApiError::BadRequest(anyhow::anyhow!( + "Failed to configure failpoints: {err_msg}" + ))); + } + } + + json_response(StatusCode::OK, ()) +} diff --git a/libs/utils/src/http/error.rs b/libs/utils/src/http/error.rs index ac68b04888..3e9281ac81 100644 --- a/libs/utils/src/http/error.rs +++ b/libs/utils/src/http/error.rs @@ -31,6 +31,9 @@ pub enum ApiError { #[error("Shutting down")] ShuttingDown, + #[error("Timeout")] + Timeout(Cow<'static, str>), + #[error(transparent)] InternalServerError(anyhow::Error), } @@ -67,6 +70,10 @@ impl ApiError { err.to_string(), StatusCode::SERVICE_UNAVAILABLE, ), + ApiError::Timeout(err) => HttpErrorBody::response_from_msg_and_status( + err.to_string(), + StatusCode::REQUEST_TIMEOUT, + ), ApiError::InternalServerError(err) => HttpErrorBody::response_from_msg_and_status( err.to_string(), StatusCode::INTERNAL_SERVER_ERROR, diff --git a/libs/utils/src/lib.rs b/libs/utils/src/lib.rs index bb6c848bf4..890061dc59 100644 --- a/libs/utils/src/lib.rs +++ b/libs/utils/src/lib.rs @@ -83,6 +83,10 @@ pub mod timeout; pub mod sync; +pub mod failpoint_support; + +pub mod yielding_loop; + /// This is a shortcut to embed git sha into binaries and avoid copying the same build script to all packages /// /// we have several cases: diff --git a/libs/utils/src/lsn.rs 
b/libs/utils/src/lsn.rs index 262dcb8a8a..b3269ae049 100644 --- a/libs/utils/src/lsn.rs +++ b/libs/utils/src/lsn.rs @@ -366,6 +366,49 @@ impl MonotonicCounter for RecordLsn { } } +/// Implements [`rand::distributions::uniform::UniformSampler`] so we can sample [`Lsn`]s. +/// +/// This is used by the `pagebench` pageserver benchmarking tool. +pub struct LsnSampler(::Sampler); + +impl rand::distributions::uniform::SampleUniform for Lsn { + type Sampler = LsnSampler; +} + +impl rand::distributions::uniform::UniformSampler for LsnSampler { + type X = Lsn; + + fn new(low: B1, high: B2) -> Self + where + B1: rand::distributions::uniform::SampleBorrow + Sized, + B2: rand::distributions::uniform::SampleBorrow + Sized, + { + Self( + ::Sampler::new( + low.borrow().0, + high.borrow().0, + ), + ) + } + + fn new_inclusive(low: B1, high: B2) -> Self + where + B1: rand::distributions::uniform::SampleBorrow + Sized, + B2: rand::distributions::uniform::SampleBorrow + Sized, + { + Self( + ::Sampler::new_inclusive( + low.borrow().0, + high.borrow().0, + ), + ) + } + + fn sample(&self, rng: &mut R) -> Self::X { + Lsn(self.0.sample(rng)) + } +} + #[cfg(test)] mod tests { use crate::bin_ser::BeSer; diff --git a/libs/utils/src/sync/gate.rs b/libs/utils/src/sync/gate.rs index 31c76d2f74..abc3842da8 100644 --- a/libs/utils/src/sync/gate.rs +++ b/libs/utils/src/sync/gate.rs @@ -15,6 +15,12 @@ pub struct Gate { name: String, } +impl std::fmt::Debug for Gate { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + write!(f, "Gate<{}>", self.name) + } +} + /// RAII guard for a [`Gate`]: as long as this exists, calls to [`Gate::close`] will /// not complete. 
#[derive(Debug)] diff --git a/libs/utils/src/yielding_loop.rs b/libs/utils/src/yielding_loop.rs new file mode 100644 index 0000000000..963279eb4c --- /dev/null +++ b/libs/utils/src/yielding_loop.rs @@ -0,0 +1,35 @@ +use tokio_util::sync::CancellationToken; + +#[derive(thiserror::Error, Debug)] +pub enum YieldingLoopError { + #[error("Cancelled")] + Cancelled, +} + +/// Helper for long synchronous loops, e.g. over all tenants in the system. Calls `visitor` on +/// each item and yields to the executor after every `interval` items (`interval` must be non-zero); +/// after resuming, returns `Err(Cancelled)` if the provided cancellation token has fired. +#[inline(always)] +pub async fn yielding_loop<I, T, F>( + interval: usize, + cancel: &CancellationToken, + iter: I, + mut visitor: F, +) -> Result<(), YieldingLoopError> +where + I: Iterator<Item = T>, + F: FnMut(T), +{ + for (i, item) in iter.enumerate() { + visitor(item); + + if (i + 1) % interval == 0 { + tokio::task::yield_now().await; + if cancel.is_cancelled() { + return Err(YieldingLoopError::Cancelled); + } + } + } + + Ok(()) +} diff --git a/libs/vm_monitor/src/runner.rs b/libs/vm_monitor/src/runner.rs index f162f53d24..ba37966476 100644 --- a/libs/vm_monitor/src/runner.rs +++ b/libs/vm_monitor/src/runner.rs @@ -446,12 +446,11 @@ impl Runner { if let Some(t) = self.last_upscale_request_at { let elapsed = t.elapsed(); if elapsed < Duration::from_secs(1) { - info!( - elapsed_millis = elapsed.as_millis(), - avg_non_reclaimable = bytes_to_mebibytes(cgroup_mem_stat.avg_non_reclaimable), - threshold = bytes_to_mebibytes(cgroup.threshold), - "cgroup memory stats are high enough to upscale but too soon to forward the request, ignoring", - ); + // *Ideally* we'd like to log here that we're ignoring the fact the + // memory stats are too high, but in practice this can result in + // spamming the logs with repetitive messages about ignoring the signal + // + // See https://github.com/neondatabase/neon/issues/5865 for more.
continue; } } diff --git a/libs/walproposer/src/api_bindings.rs b/libs/walproposer/src/api_bindings.rs index 77afe1e686..1f7bf952dc 100644 --- a/libs/walproposer/src/api_bindings.rs +++ b/libs/walproposer/src/api_bindings.rs @@ -8,12 +8,12 @@ use std::ffi::CString; use crate::bindings::uint32; use crate::bindings::walproposer_api; +use crate::bindings::NeonWALReadResult; use crate::bindings::PGAsyncReadResult; use crate::bindings::PGAsyncWriteResult; use crate::bindings::Safekeeper; use crate::bindings::Size; use crate::bindings::StringInfoData; -use crate::bindings::TimeLineID; use crate::bindings::TimestampTz; use crate::bindings::WalProposer; use crate::bindings::WalProposerConnStatusType; @@ -178,31 +178,11 @@ extern "C" fn conn_blocking_write( } } -extern "C" fn recovery_download( - sk: *mut Safekeeper, - _timeline: TimeLineID, - startpos: XLogRecPtr, - endpos: XLogRecPtr, -) -> bool { +extern "C" fn recovery_download(wp: *mut WalProposer, sk: *mut Safekeeper) -> bool { unsafe { let callback_data = (*(*(*sk).wp).config).callback_data; let api = callback_data as *mut Box; - (*api).recovery_download(&mut (*sk), startpos, endpos) - } -} - -#[allow(clippy::unnecessary_cast)] -extern "C" fn wal_read( - sk: *mut Safekeeper, - buf: *mut ::std::os::raw::c_char, - startptr: XLogRecPtr, - count: Size, -) { - unsafe { - let buf = std::slice::from_raw_parts_mut(buf as *mut u8, count); - let callback_data = (*(*(*sk).wp).config).callback_data; - let api = callback_data as *mut Box; - (*api).wal_read(&mut (*sk), buf, startptr) + (*api).recovery_download(&mut (*wp), &mut (*sk)) } } @@ -214,11 +194,28 @@ extern "C" fn wal_reader_allocate(sk: *mut Safekeeper) { } } -extern "C" fn free_event_set(wp: *mut WalProposer) { +#[allow(clippy::unnecessary_cast)] +extern "C" fn wal_read( + sk: *mut Safekeeper, + buf: *mut ::std::os::raw::c_char, + startptr: XLogRecPtr, + count: Size, + _errmsg: *mut *mut ::std::os::raw::c_char, +) -> NeonWALReadResult { unsafe { - let callback_data = 
(*(*wp).config).callback_data; + let buf = std::slice::from_raw_parts_mut(buf as *mut u8, count); + let callback_data = (*(*(*sk).wp).config).callback_data; let api = callback_data as *mut Box; - (*api).free_event_set(&mut (*wp)); + // TODO: errmsg is not forwarded + (*api).wal_read(&mut (*sk), buf, startptr) + } +} + +extern "C" fn wal_reader_events(sk: *mut Safekeeper) -> uint32 { + unsafe { + let callback_data = (*(*(*sk).wp).config).callback_data; + let api = callback_data as *mut Box; + (*api).wal_reader_events(&mut (*sk)) } } @@ -238,6 +235,14 @@ extern "C" fn update_event_set(sk: *mut Safekeeper, events: uint32) { } } +extern "C" fn active_state_update_event_set(sk: *mut Safekeeper) { + unsafe { + let callback_data = (*(*(*sk).wp).config).callback_data; + let api = callback_data as *mut Box; + (*api).active_state_update_event_set(&mut (*sk)); + } +} + extern "C" fn add_safekeeper_event_set(sk: *mut Safekeeper, events: uint32) { unsafe { let callback_data = (*(*(*sk).wp).config).callback_data; @@ -246,6 +251,14 @@ extern "C" fn add_safekeeper_event_set(sk: *mut Safekeeper, events: uint32) { } } +extern "C" fn rm_safekeeper_event_set(sk: *mut Safekeeper) { + unsafe { + let callback_data = (*(*(*sk).wp).config).callback_data; + let api = callback_data as *mut Box; + (*api).rm_safekeeper_event_set(&mut (*sk)); + } +} + extern "C" fn wait_event_set( wp: *mut WalProposer, timeout: ::std::os::raw::c_long, @@ -313,14 +326,6 @@ extern "C" fn process_safekeeper_feedback(wp: *mut WalProposer, commit_lsn: XLog } } -extern "C" fn confirm_wal_streamed(wp: *mut WalProposer, lsn: XLogRecPtr) { - unsafe { - let callback_data = (*(*wp).config).callback_data; - let api = callback_data as *mut Box; - (*api).confirm_wal_streamed(&mut (*wp), lsn) - } -} - extern "C" fn log_internal( wp: *mut WalProposer, level: ::std::os::raw::c_int, @@ -335,14 +340,6 @@ extern "C" fn log_internal( } } -extern "C" fn after_election(wp: *mut WalProposer) { - unsafe { - let callback_data = 
(*(*wp).config).callback_data; - let api = callback_data as *mut Box; - (*api).after_election(&mut (*wp)) - } -} - #[derive(Debug)] pub enum Level { Debug5, @@ -401,20 +398,20 @@ pub(crate) fn create_api() -> walproposer_api { conn_async_write: Some(conn_async_write), conn_blocking_write: Some(conn_blocking_write), recovery_download: Some(recovery_download), - wal_read: Some(wal_read), wal_reader_allocate: Some(wal_reader_allocate), - free_event_set: Some(free_event_set), + wal_read: Some(wal_read), + wal_reader_events: Some(wal_reader_events), init_event_set: Some(init_event_set), update_event_set: Some(update_event_set), + active_state_update_event_set: Some(active_state_update_event_set), add_safekeeper_event_set: Some(add_safekeeper_event_set), + rm_safekeeper_event_set: Some(rm_safekeeper_event_set), wait_event_set: Some(wait_event_set), strong_random: Some(strong_random), get_redo_start_lsn: Some(get_redo_start_lsn), finish_sync_safekeepers: Some(finish_sync_safekeepers), process_safekeeper_feedback: Some(process_safekeeper_feedback), - confirm_wal_streamed: Some(confirm_wal_streamed), log_internal: Some(log_internal), - after_election: Some(after_election), } } diff --git a/libs/walproposer/src/walproposer.rs b/libs/walproposer/src/walproposer.rs index f5723018d7..7251545792 100644 --- a/libs/walproposer/src/walproposer.rs +++ b/libs/walproposer/src/walproposer.rs @@ -6,8 +6,8 @@ use utils::id::TenantTimelineId; use crate::{ api_bindings::{create_api, take_vec_u8, Level}, bindings::{ - Safekeeper, WalProposer, WalProposerConfig, WalProposerCreate, WalProposerFree, - WalProposerStart, + NeonWALReadResult, Safekeeper, WalProposer, WalProposerConfig, WalProposerCreate, + WalProposerFree, WalProposerStart, }, }; @@ -86,19 +86,19 @@ pub trait ApiImpl { todo!() } - fn recovery_download(&self, _sk: &mut Safekeeper, _startpos: u64, _endpos: u64) -> bool { + fn recovery_download(&self, _wp: &mut WalProposer, _sk: &mut Safekeeper) -> bool { todo!() } - fn 
wal_read(&self, _sk: &mut Safekeeper, _buf: &mut [u8], _startpos: u64) { + fn wal_reader_allocate(&self, _sk: &mut Safekeeper) -> NeonWALReadResult { todo!() } - fn wal_reader_allocate(&self, _sk: &mut Safekeeper) { + fn wal_read(&self, _sk: &mut Safekeeper, _buf: &mut [u8], _startpos: u64) -> NeonWALReadResult { todo!() } - fn free_event_set(&self, _wp: &mut WalProposer) { + fn wal_reader_events(&self, _sk: &mut Safekeeper) -> u32 { todo!() } @@ -110,10 +110,18 @@ pub trait ApiImpl { todo!() } + fn active_state_update_event_set(&self, _sk: &mut Safekeeper) { + todo!() + } + fn add_safekeeper_event_set(&self, _sk: &mut Safekeeper, _events_mask: u32) { todo!() } + fn rm_safekeeper_event_set(&self, _sk: &mut Safekeeper) { + todo!() + } + fn wait_event_set(&self, _wp: &mut WalProposer, _timeout_millis: i64) -> WaitResult { todo!() } @@ -134,10 +142,6 @@ pub trait ApiImpl { todo!() } - fn confirm_wal_streamed(&self, _wp: &mut WalProposer, _lsn: u64) { - todo!() - } - fn log_internal(&self, _wp: &mut WalProposer, _level: Level, _msg: &str) { todo!() } @@ -240,6 +244,7 @@ impl Drop for Wrapper { #[cfg(test)] mod tests { + use core::panic; use std::{ cell::Cell, sync::{atomic::AtomicUsize, mpsc::sync_channel}, @@ -247,7 +252,7 @@ mod tests { use utils::id::TenantTimelineId; - use crate::{api_bindings::Level, walproposer::Wrapper}; + use crate::{api_bindings::Level, bindings::NeonWALReadResult, walproposer::Wrapper}; use super::ApiImpl; @@ -355,12 +360,17 @@ mod tests { true } - fn wal_reader_allocate(&self, _: &mut crate::bindings::Safekeeper) { - println!("wal_reader_allocate") + fn recovery_download( + &self, + _wp: &mut crate::bindings::WalProposer, + _sk: &mut crate::bindings::Safekeeper, + ) -> bool { + true } - fn free_event_set(&self, _: &mut crate::bindings::WalProposer) { - println!("free_event_set") + fn wal_reader_allocate(&self, _: &mut crate::bindings::Safekeeper) -> NeonWALReadResult { + println!("wal_reader_allocate"); + 
crate::bindings::NeonWALReadResult_NEON_WALREAD_SUCCESS } fn init_event_set(&self, _: &mut crate::bindings::WalProposer) { @@ -383,6 +393,13 @@ mod tests { self.wait_events.set(WaitEventsData { sk, event_mask }); } + fn rm_safekeeper_event_set(&self, sk: &mut crate::bindings::Safekeeper) { + println!( + "rm_safekeeper_event_set, sk={:?}", + sk as *mut crate::bindings::Safekeeper + ); + } + fn wait_event_set( &self, _: &mut crate::bindings::WalProposer, @@ -408,7 +425,7 @@ mod tests { } fn log_internal(&self, _wp: &mut crate::bindings::WalProposer, level: Level, msg: &str) { - println!("walprop_log[{}] {}", level, msg); + println!("wp_log[{}] {}", level, msg); } fn after_election(&self, _wp: &mut crate::bindings::WalProposer) { diff --git a/pageserver/benches/bench_walredo.rs b/pageserver/benches/bench_walredo.rs index ba41866935..4837626086 100644 --- a/pageserver/benches/bench_walredo.rs +++ b/pageserver/benches/bench_walredo.rs @@ -13,6 +13,7 @@ use bytes::{Buf, Bytes}; use pageserver::{ config::PageServerConf, repository::Key, walrecord::NeonWalRecord, walredo::PostgresRedoManager, }; +use pageserver_api::shard::TenantShardId; use utils::{id::TenantId, lsn::Lsn}; use criterion::{criterion_group, criterion_main, BenchmarkId, Criterion}; @@ -26,9 +27,9 @@ fn redo_scenarios(c: &mut Criterion) { let conf = PageServerConf::dummy_conf(repo_dir.path().to_path_buf()); let conf = Box::leak(Box::new(conf)); - let tenant_id = TenantId::generate(); + let tenant_shard_id = TenantShardId::unsharded(TenantId::generate()); - let manager = PostgresRedoManager::new(conf, tenant_id); + let manager = PostgresRedoManager::new(conf, tenant_shard_id); let manager = Arc::new(manager); diff --git a/pageserver/client/src/mgmt_api.rs b/pageserver/client/src/mgmt_api.rs index 0ad4e1551e..0415ed05bd 100644 --- a/pageserver/client/src/mgmt_api.rs +++ b/pageserver/client/src/mgmt_api.rs @@ -1,10 +1,12 @@ -use pageserver_api::models::*; +use pageserver_api::{models::*, shard::TenantShardId}; 
use reqwest::{IntoUrl, Method}; use utils::{ http::error::HttpErrorBody, id::{TenantId, TimelineId}, }; +pub mod util; + #[derive(Debug)] pub struct Client { mgmt_api_endpoint: String, @@ -26,14 +28,12 @@ pub enum Error { pub type Result = std::result::Result; -#[async_trait::async_trait] -pub trait ResponseErrorMessageExt: Sized { +pub(crate) trait ResponseErrorMessageExt: Sized { async fn error_from_body(self) -> Result; } -#[async_trait::async_trait] impl ResponseErrorMessageExt for reqwest::Response { - async fn error_from_body(mut self) -> Result { + async fn error_from_body(self) -> Result { let status = self.status(); if !(status.is_client_error() || status.is_server_error()) { return Ok(self); @@ -49,6 +49,11 @@ impl ResponseErrorMessageExt for reqwest::Response { } } +pub enum ForceAwaitLogicalSize { + Yes, + No, +} + impl Client { pub fn new(mgmt_api_endpoint: String, jwt: Option<&str>) -> Self { Self { @@ -92,11 +97,18 @@ impl Client { &self, tenant_id: TenantId, timeline_id: TimelineId, + force_await_logical_size: ForceAwaitLogicalSize, ) -> Result { let uri = format!( "{}/v1/tenant/{tenant_id}/timeline/{timeline_id}", self.mgmt_api_endpoint ); + + let uri = match force_await_logical_size { + ForceAwaitLogicalSize::Yes => format!("{}?force-await-logical-size={}", uri, true), + ForceAwaitLogicalSize::No => uri, + }; + self.get(&uri) .await? .json() @@ -162,6 +174,18 @@ impl Client { Ok(()) } + pub async fn tenant_secondary_download(&self, tenant_id: TenantShardId) -> Result<()> { + let uri = format!( + "{}/v1/tenant/{}/secondary/download", + self.mgmt_api_endpoint, tenant_id + ); + self.request(Method::POST, &uri, ()) + .await? 
+ .error_for_status() + .map(|_| ()) + .map_err(|e| Error::ApiError(format!("{}", e))) + } + pub async fn location_config( &self, tenant_id: TenantId, diff --git a/pageserver/client/src/mgmt_api/util.rs b/pageserver/client/src/mgmt_api/util.rs new file mode 100644 index 0000000000..048a3bb7cd --- /dev/null +++ b/pageserver/client/src/mgmt_api/util.rs @@ -0,0 +1,49 @@ +//! Helpers to do common higher-level tasks with the [`Client`]. + +use std::sync::Arc; + +use tokio::task::JoinSet; +use utils::id::{TenantId, TenantTimelineId}; + +use super::Client; + +/// Retrieve a list of all of the pageserver's timelines. +/// +/// Fails if there are sharded tenants present on the pageserver. +pub async fn get_pageserver_tenant_timelines_unsharded( + api_client: &Arc, +) -> anyhow::Result> { + let mut timelines: Vec = Vec::new(); + let mut tenants: Vec = Vec::new(); + for ti in api_client.list_tenants().await? { + if !ti.id.is_unsharded() { + anyhow::bail!( + "only unsharded tenants are supported at this time: {}", + ti.id + ); + } + tenants.push(ti.id.tenant_id) + } + let mut js = JoinSet::new(); + for tenant_id in tenants { + js.spawn({ + let mgmt_api_client = Arc::clone(api_client); + async move { + ( + tenant_id, + mgmt_api_client.tenant_details(tenant_id).await.unwrap(), + ) + } + }); + } + while let Some(res) = js.join_next().await { + let (tenant_id, details) = res.unwrap(); + for timeline_id in details.timelines { + timelines.push(TenantTimelineId { + tenant_id, + timeline_id, + }); + } + } + Ok(timelines) +} diff --git a/pageserver/client/src/page_service.rs b/pageserver/client/src/page_service.rs index fc0d2311f7..231461267a 100644 --- a/pageserver/client/src/page_service.rs +++ b/pageserver/client/src/page_service.rs @@ -115,15 +115,8 @@ impl PagestreamClient { pub async fn getpage( &mut self, - key: RelTagBlockNo, - lsn: Lsn, + req: PagestreamGetPageRequest, ) -> anyhow::Result { - let req = PagestreamGetPageRequest { - latest: false, - rel: key.rel_tag, - blkno: 
key.block_no, - lsn, - }; let req = PagestreamFeMessage::GetPage(req); let req: bytes::Bytes = req.serialize(); // let mut req = tokio_util::io::ReaderStream::new(&req); diff --git a/pageserver/pagebench/Cargo.toml b/pageserver/pagebench/Cargo.toml new file mode 100644 index 0000000000..169d9b7f8e --- /dev/null +++ b/pageserver/pagebench/Cargo.toml @@ -0,0 +1,26 @@ +[package] +name = "pagebench" +version = "0.1.0" +edition.workspace = true +license.workspace = true + +# See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html + +[dependencies] +anyhow.workspace = true +clap.workspace = true +futures.workspace = true +hdrhistogram.workspace = true +humantime.workspace = true +humantime-serde.workspace = true +rand.workspace = true +serde.workspace = true +serde_json.workspace = true +tracing.workspace = true +tokio.workspace = true + +pageserver = { path = ".." } +pageserver_client.workspace = true +pageserver_api.workspace = true +utils = { path = "../../libs/utils/" } +workspace_hack = { version = "0.1", path = "../../workspace_hack" } diff --git a/pageserver/pagebench/src/cmd/basebackup.rs b/pageserver/pagebench/src/cmd/basebackup.rs new file mode 100644 index 0000000000..2d61b0e252 --- /dev/null +++ b/pageserver/pagebench/src/cmd/basebackup.rs @@ -0,0 +1,275 @@ +use anyhow::Context; +use pageserver_client::mgmt_api::ForceAwaitLogicalSize; +use pageserver_client::page_service::BasebackupRequest; + +use utils::id::TenantTimelineId; +use utils::lsn::Lsn; + +use rand::prelude::*; +use tokio::sync::Barrier; +use tokio::task::JoinSet; +use tracing::{debug, info, instrument}; + +use std::collections::HashMap; +use std::num::NonZeroUsize; +use std::ops::Range; +use std::sync::atomic::{AtomicU64, AtomicUsize, Ordering}; +use std::sync::{Arc, Mutex}; +use std::time::Instant; + +use crate::util::tokio_thread_local_stats::AllThreadLocalStats; +use crate::util::{request_stats, tokio_thread_local_stats}; + +/// basebackup@LatestLSN 
+#[derive(clap::Parser)] +pub(crate) struct Args { + #[clap(long, default_value = "http://localhost:9898")] + mgmt_api_endpoint: String, + #[clap(long, default_value = "localhost:64000")] + page_service_host_port: String, + #[clap(long)] + pageserver_jwt: Option, + #[clap(long, default_value = "1")] + num_clients: NonZeroUsize, + #[clap(long, default_value = "1.0")] + gzip_probability: f64, + #[clap(long)] + runtime: Option, + #[clap(long)] + limit_to_first_n_targets: Option, + targets: Option>, +} + +#[derive(Debug, Default)] +struct LiveStats { + completed_requests: AtomicU64, +} + +impl LiveStats { + fn inc(&self) { + self.completed_requests.fetch_add(1, Ordering::Relaxed); + } +} + +struct Target { + timeline: TenantTimelineId, + lsn_range: Option>, +} + +#[derive(serde::Serialize)] +struct Output { + total: request_stats::Output, +} + +tokio_thread_local_stats::declare!(STATS: request_stats::Stats); + +pub(crate) fn main(args: Args) -> anyhow::Result<()> { + tokio_thread_local_stats::main!(STATS, move |thread_local_stats| { + main_impl(args, thread_local_stats) + }) +} + +async fn main_impl( + args: Args, + all_thread_local_stats: AllThreadLocalStats, +) -> anyhow::Result<()> { + let args: &'static Args = Box::leak(Box::new(args)); + + let mgmt_api_client = Arc::new(pageserver_client::mgmt_api::Client::new( + args.mgmt_api_endpoint.clone(), + args.pageserver_jwt.as_deref(), + )); + + // discover targets + let timelines: Vec = crate::util::cli::targets::discover( + &mgmt_api_client, + crate::util::cli::targets::Spec { + limit_to_first_n_targets: args.limit_to_first_n_targets, + targets: args.targets.clone(), + }, + ) + .await?; + let mut js = JoinSet::new(); + for timeline in &timelines { + js.spawn({ + let timeline = *timeline; + let info = mgmt_api_client + .timeline_info( + timeline.tenant_id, + timeline.timeline_id, + ForceAwaitLogicalSize::No, + ) + .await + .unwrap(); + async move { + anyhow::Ok(Target { + timeline, + // TODO: support lsn_range != latest 
LSN + lsn_range: Some(info.last_record_lsn..(info.last_record_lsn + 1)), + }) + } + }); + } + let mut all_targets: Vec = Vec::new(); + while let Some(res) = js.join_next().await { + all_targets.push(res.unwrap().unwrap()); + } + + let live_stats = Arc::new(LiveStats::default()); + + let num_client_tasks = timelines.len(); + let num_live_stats_dump = 1; + let num_work_sender_tasks = 1; + + let start_work_barrier = Arc::new(tokio::sync::Barrier::new( + num_client_tasks + num_live_stats_dump + num_work_sender_tasks, + )); + let all_work_done_barrier = Arc::new(tokio::sync::Barrier::new(num_client_tasks)); + + tokio::spawn({ + let stats = Arc::clone(&live_stats); + let start_work_barrier = Arc::clone(&start_work_barrier); + async move { + start_work_barrier.wait().await; + loop { + let start = std::time::Instant::now(); + tokio::time::sleep(std::time::Duration::from_secs(1)).await; + let completed_requests = stats.completed_requests.swap(0, Ordering::Relaxed); + let elapsed = start.elapsed(); + info!( + "RPS: {:.0}", + completed_requests as f64 / elapsed.as_secs_f64() + ); + } + } + }); + + let mut work_senders = HashMap::new(); + let mut tasks = Vec::new(); + for tl in &timelines { + let (sender, receiver) = tokio::sync::mpsc::channel(1); // TODO: not sure what the implications of this are + work_senders.insert(tl, sender); + tasks.push(tokio::spawn(client( + args, + *tl, + Arc::clone(&start_work_barrier), + receiver, + Arc::clone(&all_work_done_barrier), + Arc::clone(&live_stats), + ))); + } + + let work_sender = async move { + start_work_barrier.wait().await; + loop { + let (timeline, work) = { + let mut rng = rand::thread_rng(); + let target = all_targets.choose(&mut rng).unwrap(); + let lsn = target.lsn_range.clone().map(|r| rng.gen_range(r)); + ( + target.timeline, + Work { + lsn, + gzip: rng.gen_bool(args.gzip_probability), + }, + ) + }; + let sender = work_senders.get(&timeline).unwrap(); + // TODO: what if this blocks? 
+ sender.send(work).await.ok().unwrap(); + } + }; + + if let Some(runtime) = args.runtime { + match tokio::time::timeout(runtime.into(), work_sender).await { + Ok(()) => unreachable!("work sender never terminates"), + Err(_timeout) => { + // this implicitly drops the work_senders, making all the clients exit + } + } + } else { + work_sender.await; + unreachable!("work sender never terminates"); + } + + for t in tasks { + t.await.unwrap(); + } + + let output = Output { + total: { + let mut agg_stats = request_stats::Stats::new(); + for stats in all_thread_local_stats.lock().unwrap().iter() { + let stats = stats.lock().unwrap(); + agg_stats.add(&stats); + } + agg_stats.output() + }, + }; + + let output = serde_json::to_string_pretty(&output).unwrap(); + println!("{output}"); + + anyhow::Ok(()) +} + +#[derive(Copy, Clone)] +struct Work { + lsn: Option, + gzip: bool, +} + +#[instrument(skip_all)] +async fn client( + args: &'static Args, + timeline: TenantTimelineId, + start_work_barrier: Arc, + mut work: tokio::sync::mpsc::Receiver, + all_work_done_barrier: Arc, + live_stats: Arc, +) { + start_work_barrier.wait().await; + + let client = pageserver_client::page_service::Client::new(crate::util::connstring::connstring( + &args.page_service_host_port, + args.pageserver_jwt.as_deref(), + )) + .await + .unwrap(); + + while let Some(Work { lsn, gzip }) = work.recv().await { + let start = Instant::now(); + let copy_out_stream = client + .basebackup(&BasebackupRequest { + tenant_id: timeline.tenant_id, + timeline_id: timeline.timeline_id, + lsn, + gzip, + }) + .await + .with_context(|| format!("start basebackup for {timeline}")) + .unwrap(); + + use futures::StreamExt; + let size = Arc::new(AtomicUsize::new(0)); + copy_out_stream + .for_each({ + |r| { + let size = Arc::clone(&size); + async move { + let size = Arc::clone(&size); + size.fetch_add(r.unwrap().len(), Ordering::Relaxed); + } + } + }) + .await; + debug!("basebackup size is {} bytes", size.load(Ordering::Relaxed)); + 
let elapsed = start.elapsed(); + live_stats.inc(); + STATS.with(|stats| { + stats.borrow().lock().unwrap().observe(elapsed).unwrap(); + }); + } + + all_work_done_barrier.wait().await; +} diff --git a/pageserver/pagebench/src/cmd/getpage_latest_lsn.rs b/pageserver/pagebench/src/cmd/getpage_latest_lsn.rs new file mode 100644 index 0000000000..b134ed895d --- /dev/null +++ b/pageserver/pagebench/src/cmd/getpage_latest_lsn.rs @@ -0,0 +1,357 @@ +use anyhow::Context; +use futures::future::join_all; +use pageserver::pgdatadir_mapping::key_to_rel_block; +use pageserver::repository; +use pageserver_api::key::is_rel_block_key; +use pageserver_api::keyspace::KeySpaceAccum; +use pageserver_api::models::PagestreamGetPageRequest; + +use utils::id::TenantTimelineId; +use utils::lsn::Lsn; + +use rand::prelude::*; +use tokio::sync::Barrier; +use tokio::task::JoinSet; +use tracing::{info, instrument}; + +use std::collections::HashMap; +use std::future::Future; +use std::num::NonZeroUsize; +use std::pin::Pin; +use std::sync::atomic::{AtomicU64, Ordering}; +use std::sync::{Arc, Mutex}; +use std::time::{Duration, Instant}; + +use crate::util::tokio_thread_local_stats::AllThreadLocalStats; +use crate::util::{request_stats, tokio_thread_local_stats}; + +/// GetPage@LatestLSN, uniformly distributed across the compute-accessible keyspace. +#[derive(clap::Parser)] +pub(crate) struct Args { + #[clap(long, default_value = "http://localhost:9898")] + mgmt_api_endpoint: String, + #[clap(long, default_value = "postgres://postgres@localhost:64000")] + page_service_connstring: String, + #[clap(long)] + pageserver_jwt: Option, + #[clap(long, default_value = "1")] + num_clients: NonZeroUsize, + #[clap(long)] + runtime: Option, + #[clap(long)] + per_target_rate_limit: Option, + /// Probability for sending `latest=true` in the request (uniform distribution). 
+ #[clap(long, default_value = "1")] + req_latest_probability: f64, + #[clap(long)] + limit_to_first_n_targets: Option, + targets: Option>, +} + +#[derive(Debug, Default)] +struct LiveStats { + completed_requests: AtomicU64, +} + +impl LiveStats { + fn inc(&self) { + self.completed_requests.fetch_add(1, Ordering::Relaxed); + } +} + +#[derive(Clone)] +struct KeyRange { + timeline: TenantTimelineId, + timeline_lsn: Lsn, + start: i128, + end: i128, +} + +impl KeyRange { + fn len(&self) -> i128 { + self.end - self.start + } +} + +#[derive(serde::Serialize)] +struct Output { + total: request_stats::Output, +} + +tokio_thread_local_stats::declare!(STATS: request_stats::Stats); + +pub(crate) fn main(args: Args) -> anyhow::Result<()> { + tokio_thread_local_stats::main!(STATS, move |thread_local_stats| { + main_impl(args, thread_local_stats) + }) +} + +async fn main_impl( + args: Args, + all_thread_local_stats: AllThreadLocalStats, +) -> anyhow::Result<()> { + let args: &'static Args = Box::leak(Box::new(args)); + + let mgmt_api_client = Arc::new(pageserver_client::mgmt_api::Client::new( + args.mgmt_api_endpoint.clone(), + args.pageserver_jwt.as_deref(), + )); + + // discover targets + let timelines: Vec = crate::util::cli::targets::discover( + &mgmt_api_client, + crate::util::cli::targets::Spec { + limit_to_first_n_targets: args.limit_to_first_n_targets, + targets: args.targets.clone(), + }, + ) + .await?; + + let mut js = JoinSet::new(); + for timeline in &timelines { + js.spawn({ + let mgmt_api_client = Arc::clone(&mgmt_api_client); + let timeline = *timeline; + async move { + let partitioning = mgmt_api_client + .keyspace(timeline.tenant_id, timeline.timeline_id) + .await?; + let lsn = partitioning.at_lsn; + let start = Instant::now(); + let mut filtered = KeySpaceAccum::new(); + // let's hope this is inlined and vectorized... + // TODO: turn this loop into a is_rel_block_range() function. 
+ for r in partitioning.keys.ranges.iter() { + let mut i = r.start; + while i != r.end { + if is_rel_block_key(&i) { + filtered.add_key(i); + } + i = i.next(); + } + } + let filtered = filtered.to_keyspace(); + let filter_duration = start.elapsed(); + + anyhow::Ok(( + filter_duration, + filtered.ranges.into_iter().map(move |r| KeyRange { + timeline, + timeline_lsn: lsn, + start: r.start.to_i128(), + end: r.end.to_i128(), + }), + )) + } + }); + } + let mut total_filter_duration = Duration::from_secs(0); + let mut all_ranges: Vec = Vec::new(); + while let Some(res) = js.join_next().await { + let (filter_duration, range) = res.unwrap().unwrap(); + all_ranges.extend(range); + total_filter_duration += filter_duration; + } + info!("filter duration: {}", total_filter_duration.as_secs_f64()); + + let live_stats = Arc::new(LiveStats::default()); + + let num_client_tasks = timelines.len(); + let num_live_stats_dump = 1; + let num_work_sender_tasks = 1; + + let start_work_barrier = Arc::new(tokio::sync::Barrier::new( + num_client_tasks + num_live_stats_dump + num_work_sender_tasks, + )); + let all_work_done_barrier = Arc::new(tokio::sync::Barrier::new(num_client_tasks)); + + tokio::spawn({ + let stats = Arc::clone(&live_stats); + let start_work_barrier = Arc::clone(&start_work_barrier); + async move { + start_work_barrier.wait().await; + loop { + let start = std::time::Instant::now(); + tokio::time::sleep(std::time::Duration::from_secs(1)).await; + let completed_requests = stats.completed_requests.swap(0, Ordering::Relaxed); + let elapsed = start.elapsed(); + info!( + "RPS: {:.0}", + completed_requests as f64 / elapsed.as_secs_f64() + ); + } + } + }); + + let mut work_senders = HashMap::new(); + let mut tasks = Vec::new(); + for tl in &timelines { + let (sender, receiver) = tokio::sync::mpsc::channel(10); // TODO: not sure what the implications of this are + work_senders.insert(tl, sender); + tasks.push(tokio::spawn(client( + args, + *tl, + Arc::clone(&start_work_barrier), + 
receiver, + Arc::clone(&all_work_done_barrier), + Arc::clone(&live_stats), + ))); + } + + let work_sender: Pin>> = match args.per_target_rate_limit { + None => Box::pin(async move { + let weights = rand::distributions::weighted::WeightedIndex::new( + all_ranges.iter().map(|v| v.len()), + ) + .unwrap(); + + start_work_barrier.wait().await; + + loop { + let (timeline, req) = { + let mut rng = rand::thread_rng(); + let r = &all_ranges[weights.sample(&mut rng)]; + let key: i128 = rng.gen_range(r.start..r.end); + let key = repository::Key::from_i128(key); + let (rel_tag, block_no) = + key_to_rel_block(key).expect("we filter non-rel-block keys out above"); + ( + r.timeline, + PagestreamGetPageRequest { + latest: rng.gen_bool(args.req_latest_probability), + lsn: r.timeline_lsn, + rel: rel_tag, + blkno: block_no, + }, + ) + }; + let sender = work_senders.get(&timeline).unwrap(); + // TODO: what if this blocks? + sender.send(req).await.ok().unwrap(); + } + }), + Some(rps_limit) => Box::pin(async move { + let period = Duration::from_secs_f64(1.0 / (rps_limit as f64)); + + let make_timeline_task: &dyn Fn( + TenantTimelineId, + ) + -> Pin>> = &|timeline| { + let sender = work_senders.get(&timeline).unwrap(); + let ranges: Vec = all_ranges + .iter() + .filter(|r| r.timeline == timeline) + .cloned() + .collect(); + let weights = rand::distributions::weighted::WeightedIndex::new( + ranges.iter().map(|v| v.len()), + ) + .unwrap(); + + Box::pin(async move { + let mut ticker = tokio::time::interval(period); + ticker.set_missed_tick_behavior( + /* TODO review this choice */ + tokio::time::MissedTickBehavior::Burst, + ); + loop { + ticker.tick().await; + let req = { + let mut rng = rand::thread_rng(); + let r = &ranges[weights.sample(&mut rng)]; + let key: i128 = rng.gen_range(r.start..r.end); + let key = repository::Key::from_i128(key); + assert!(is_rel_block_key(&key)); + let (rel_tag, block_no) = key_to_rel_block(key) + .expect("we filter non-rel-block keys out above"); + 
PagestreamGetPageRequest { + latest: rng.gen_bool(args.req_latest_probability), + lsn: r.timeline_lsn, + rel: rel_tag, + blkno: block_no, + } + }; + sender.send(req).await.ok().unwrap(); + } + }) + }; + + let tasks: Vec<_> = work_senders + .keys() + .map(|tl| make_timeline_task(**tl)) + .collect(); + + start_work_barrier.wait().await; + + join_all(tasks).await; + }), + }; + + if let Some(runtime) = args.runtime { + match tokio::time::timeout(runtime.into(), work_sender).await { + Ok(()) => unreachable!("work sender never terminates"), + Err(_timeout) => { + // this implicitly drops the work_senders, making all the clients exit + } + } + } else { + work_sender.await; + unreachable!("work sender never terminates"); + } + + for t in tasks { + t.await.unwrap(); + } + + let output = Output { + total: { + let mut agg_stats = request_stats::Stats::new(); + for stats in all_thread_local_stats.lock().unwrap().iter() { + let stats = stats.lock().unwrap(); + agg_stats.add(&stats); + } + agg_stats.output() + }, + }; + + let output = serde_json::to_string_pretty(&output).unwrap(); + println!("{output}"); + + anyhow::Ok(()) +} + +#[instrument(skip_all)] +async fn client( + args: &'static Args, + timeline: TenantTimelineId, + start_work_barrier: Arc, + mut work: tokio::sync::mpsc::Receiver, + all_work_done_barrier: Arc, + live_stats: Arc, +) { + start_work_barrier.wait().await; + + let client = pageserver_client::page_service::Client::new(args.page_service_connstring.clone()) + .await + .unwrap(); + let mut client = client + .pagestream(timeline.tenant_id, timeline.timeline_id) + .await + .unwrap(); + + while let Some(req) = work.recv().await { + let start = Instant::now(); + client + .getpage(req) + .await + .with_context(|| format!("getpage for {timeline}")) + .unwrap(); + let elapsed = start.elapsed(); + live_stats.inc(); + STATS.with(|stats| { + stats.borrow().lock().unwrap().observe(elapsed).unwrap(); + }); + } + + all_work_done_barrier.wait().await; +} diff --git 
a/pageserver/pagebench/src/cmd/trigger_initial_size_calculation.rs b/pageserver/pagebench/src/cmd/trigger_initial_size_calculation.rs new file mode 100644 index 0000000000..98938d780a --- /dev/null +++ b/pageserver/pagebench/src/cmd/trigger_initial_size_calculation.rs @@ -0,0 +1,88 @@ +use std::sync::Arc; + +use humantime::Duration; +use tokio::task::JoinSet; +use utils::id::TenantTimelineId; + +use pageserver_client::mgmt_api::ForceAwaitLogicalSize; + +#[derive(clap::Parser)] +pub(crate) struct Args { + #[clap(long, default_value = "http://localhost:9898")] + mgmt_api_endpoint: String, + #[clap(long, default_value = "localhost:64000")] + page_service_host_port: String, + #[clap(long)] + pageserver_jwt: Option, + #[clap( + long, + help = "if specified, poll mgmt api to check whether init logical size calculation has completed" + )] + poll_for_completion: Option, + #[clap(long)] + limit_to_first_n_targets: Option, + targets: Option>, +} + +pub(crate) fn main(args: Args) -> anyhow::Result<()> { + let rt = tokio::runtime::Builder::new_multi_thread() + .enable_all() + .build() + .unwrap(); + + let main_task = rt.spawn(main_impl(args)); + rt.block_on(main_task).unwrap() +} + +async fn main_impl(args: Args) -> anyhow::Result<()> { + let args: &'static Args = Box::leak(Box::new(args)); + + let mgmt_api_client = Arc::new(pageserver_client::mgmt_api::Client::new( + args.mgmt_api_endpoint.clone(), + args.pageserver_jwt.as_deref(), + )); + + // discover targets + let timelines: Vec = crate::util::cli::targets::discover( + &mgmt_api_client, + crate::util::cli::targets::Spec { + limit_to_first_n_targets: args.limit_to_first_n_targets, + targets: args.targets.clone(), + }, + ) + .await?; + + // kick it off + + let mut js = JoinSet::new(); + for tl in timelines { + let mgmt_api_client = Arc::clone(&mgmt_api_client); + js.spawn(async move { + let info = mgmt_api_client + .timeline_info(tl.tenant_id, tl.timeline_id, ForceAwaitLogicalSize::Yes) + .await + .unwrap(); + + // Polling 
should not be strictly required here since we await + // for the initial logical size, however it's possible for the request + // to land before the timeline is initialised. This results in an approximate + // logical size. + if let Some(period) = args.poll_for_completion { + let mut ticker = tokio::time::interval(period.into()); + ticker.set_missed_tick_behavior(tokio::time::MissedTickBehavior::Delay); + let mut info = info; + while !info.current_logical_size_is_accurate { + ticker.tick().await; + info = mgmt_api_client + .timeline_info(tl.tenant_id, tl.timeline_id, ForceAwaitLogicalSize::Yes) + .await + .unwrap(); + } + } + }); + } + while let Some(res) = js.join_next().await { + let _: () = res.unwrap(); + } + Ok(()) +} diff --git a/pageserver/pagebench/src/main.rs b/pageserver/pagebench/src/main.rs new file mode 100644 index 0000000000..e0120c9212 --- /dev/null +++ b/pageserver/pagebench/src/main.rs @@ -0,0 +1,48 @@ +use clap::Parser; +use utils::logging; + +/// Re-usable pieces of code that aren't CLI-specific. +mod util { + pub(crate) mod connstring; + pub(crate) mod request_stats; + #[macro_use] + pub(crate) mod tokio_thread_local_stats; + /// Re-usable pieces of CLI-specific code. + pub(crate) mod cli { + pub(crate) mod targets; + } +} + +/// The pagebench CLI sub-commands, dispatched in [`main`] below. +mod cmd { + pub(super) mod basebackup; + pub(super) mod getpage_latest_lsn; + pub(super) mod trigger_initial_size_calculation; +} + +/// Component-level performance test for pageserver. 
+#[derive(clap::Parser)] +enum Args { + Basebackup(cmd::basebackup::Args), + GetPageLatestLsn(cmd::getpage_latest_lsn::Args), + TriggerInitialSizeCalculation(cmd::trigger_initial_size_calculation::Args), +} + +fn main() { + logging::init( + logging::LogFormat::Plain, + logging::TracingErrorLayerEnablement::Disabled, + logging::Output::Stderr, + ) + .unwrap(); + + let args = Args::parse(); + match args { + Args::Basebackup(args) => cmd::basebackup::main(args), + Args::GetPageLatestLsn(args) => cmd::getpage_latest_lsn::main(args), + Args::TriggerInitialSizeCalculation(args) => { + cmd::trigger_initial_size_calculation::main(args) + } + } + .unwrap() +} diff --git a/pageserver/pagebench/src/util/cli/targets.rs b/pageserver/pagebench/src/util/cli/targets.rs new file mode 100644 index 0000000000..848eae27cf --- /dev/null +++ b/pageserver/pagebench/src/util/cli/targets.rs @@ -0,0 +1,34 @@ +use std::sync::Arc; + +use pageserver_client::mgmt_api; +use tracing::info; +use utils::id::TenantTimelineId; + +pub(crate) struct Spec { + pub(crate) limit_to_first_n_targets: Option, + pub(crate) targets: Option>, +} + +pub(crate) async fn discover( + api_client: &Arc, + spec: Spec, +) -> anyhow::Result> { + let mut timelines = if let Some(targets) = spec.targets { + targets + } else { + mgmt_api::util::get_pageserver_tenant_timelines_unsharded(api_client).await? 
+ }; + + if let Some(limit) = spec.limit_to_first_n_targets { + timelines.sort(); // for determinism + timelines.truncate(limit); + if timelines.len() < limit { + anyhow::bail!("pageserver has less than limit_to_first_n_targets={limit} tenants"); + } + } + + info!("timelines:\n{:?}", timelines); + info!("number of timelines:\n{:?}", timelines.len()); + + Ok(timelines) +} diff --git a/pageserver/pagebench/src/util/connstring.rs b/pageserver/pagebench/src/util/connstring.rs new file mode 100644 index 0000000000..07a0ff042d --- /dev/null +++ b/pageserver/pagebench/src/util/connstring.rs @@ -0,0 +1,8 @@ +pub(crate) fn connstring(host_port: &str, jwt: Option<&str>) -> String { + let colon_and_jwt = if let Some(jwt) = jwt { + format!(":{jwt}") // TODO: urlescape + } else { + String::new() + }; + format!("postgres://postgres{colon_and_jwt}@{host_port}") +} diff --git a/pageserver/pagebench/src/util/request_stats.rs b/pageserver/pagebench/src/util/request_stats.rs new file mode 100644 index 0000000000..5ecf1cbf24 --- /dev/null +++ b/pageserver/pagebench/src/util/request_stats.rs @@ -0,0 +1,88 @@ +use std::time::Duration; + +use anyhow::Context; + +pub(crate) struct Stats { + latency_histo: hdrhistogram::Histogram, +} + +impl Stats { + pub(crate) fn new() -> Self { + Self { + // Initialize with fixed bounds so that we panic at runtime instead of resizing the histogram, + // which would skew the benchmark results. 
+ latency_histo: hdrhistogram::Histogram::new_with_bounds(1, 1_000_000_000, 3).unwrap(), + } + } + pub(crate) fn observe(&mut self, latency: Duration) -> anyhow::Result<()> { + let micros: u64 = latency + .as_micros() + .try_into() + .context("latency greater than u64")?; + self.latency_histo + .record(micros) + .context("add to histogram")?; + Ok(()) + } + pub(crate) fn output(&self) -> Output { + let latency_percentiles = std::array::from_fn(|idx| { + let micros = self + .latency_histo + .value_at_percentile(LATENCY_PERCENTILES[idx]); + Duration::from_micros(micros) + }); + Output { + request_count: self.latency_histo.len(), + latency_mean: Duration::from_micros(self.latency_histo.mean() as u64), + latency_percentiles: LatencyPercentiles { + latency_percentiles, + }, + } + } + pub(crate) fn add(&mut self, other: &Self) { + let Self { + ref mut latency_histo, + } = self; + latency_histo.add(&other.latency_histo).unwrap(); + } +} + +impl Default for Stats { + fn default() -> Self { + Self::new() + } +} + +const LATENCY_PERCENTILES: [f64; 4] = [95.0, 99.00, 99.90, 99.99]; + +struct LatencyPercentiles { + latency_percentiles: [Duration; 4], +} + +impl serde::Serialize for LatencyPercentiles { + fn serialize<S>(&self, serializer: S) -> Result<S::Ok, S::Error> + where + S: serde::Serializer, + { + use serde::ser::SerializeMap; + let mut ser = serializer.serialize_map(Some(LATENCY_PERCENTILES.len()))?; + for (idx, p) in LATENCY_PERCENTILES.iter().enumerate() { /* idx pairs each percentile label with its own latency; output() fills the array in LATENCY_PERCENTILES order */ + ser.serialize_entry( + &format!("p{p}"), + &format!( + "{}", + &humantime::format_duration(self.latency_percentiles[idx]) + ), + )?; + } + ser.end() + } +} + +#[derive(serde::Serialize)] +pub(crate) struct Output { + request_count: u64, + #[serde(with = "humantime_serde")] + latency_mean: Duration, + latency_percentiles: LatencyPercentiles, +} diff --git a/pageserver/pagebench/src/util/tokio_thread_local_stats.rs b/pageserver/pagebench/src/util/tokio_thread_local_stats.rs new file mode 100644 index 0000000000..82526213b6 --- /dev/null +++ 
b/pageserver/pagebench/src/util/tokio_thread_local_stats.rs @@ -0,0 +1,45 @@ +pub(crate) type ThreadLocalStats = Arc>; +pub(crate) type AllThreadLocalStats = Arc>>>; + +macro_rules! declare { + ($THREAD_LOCAL_NAME:ident: $T:ty) => { + thread_local! { + pub static $THREAD_LOCAL_NAME: std::cell::RefCell> = std::cell::RefCell::new( + std::sync::Arc::new(std::sync::Mutex::new(Default::default())) + ); + } + }; +} + +use std::sync::{Arc, Mutex}; + +pub(crate) use declare; + +macro_rules! main { + ($THREAD_LOCAL_NAME:ident, $main_impl:expr) => {{ + let main_impl = $main_impl; + let all = Arc::new(Mutex::new(Vec::new())); + + let rt = tokio::runtime::Builder::new_multi_thread() + .on_thread_start({ + let all = Arc::clone(&all); + move || { + // pre-initialize the thread local stats by accessing them + // (some stats like request_stats::Stats are quite costly to initialize, + // we don't want to pay that cost during the measurement period) + $THREAD_LOCAL_NAME.with(|stats| { + let stats: Arc<_> = Arc::clone(&*stats.borrow()); + all.lock().unwrap().push(stats); + }); + } + }) + .enable_all() + .build() + .unwrap(); + + let main_task = rt.spawn(main_impl(all)); + rt.block_on(main_task).unwrap() + }}; +} + +pub(crate) use main; diff --git a/pageserver/src/basebackup.rs b/pageserver/src/basebackup.rs index ed452eae7d..7e5ae892ad 100644 --- a/pageserver/src/basebackup.rs +++ b/pageserver/src/basebackup.rs @@ -23,6 +23,7 @@ use tracing::*; use tokio_tar::{Builder, EntryType, Header}; use crate::context::RequestContext; +use crate::pgdatadir_mapping::Version; use crate::tenant::Timeline; use pageserver_api::reltag::{RelTag, SlruKind}; @@ -174,7 +175,7 @@ where ] { for segno in self .timeline - .list_slru_segments(kind, self.lsn, self.ctx) + .list_slru_segments(kind, Version::Lsn(self.lsn), self.ctx) .await? { self.add_slru_segment(kind, segno).await?; @@ -192,7 +193,7 @@ where // Otherwise only include init forks of unlogged relations. 
let rels = self .timeline - .list_rels(spcnode, dbnode, self.lsn, self.ctx) + .list_rels(spcnode, dbnode, Version::Lsn(self.lsn), self.ctx) .await?; for &rel in rels.iter() { // Send init fork as main fork to provide well formed empty @@ -267,7 +268,7 @@ where async fn add_rel(&mut self, src: RelTag, dst: RelTag) -> anyhow::Result<()> { let nblocks = self .timeline - .get_rel_size(src, self.lsn, false, self.ctx) + .get_rel_size(src, Version::Lsn(self.lsn), false, self.ctx) .await?; // If the relation is empty, create an empty file @@ -288,7 +289,7 @@ where for blknum in startblk..endblk { let img = self .timeline - .get_rel_page_at_lsn(src, blknum, self.lsn, false, self.ctx) + .get_rel_page_at_lsn(src, blknum, Version::Lsn(self.lsn), false, self.ctx) .await?; segment_data.extend_from_slice(&img[..]); } @@ -310,7 +311,7 @@ where async fn add_slru_segment(&mut self, slru: SlruKind, segno: u32) -> anyhow::Result<()> { let nblocks = self .timeline - .get_slru_segment_size(slru, segno, self.lsn, self.ctx) + .get_slru_segment_size(slru, segno, Version::Lsn(self.lsn), self.ctx) .await?; let mut slru_buf: Vec = Vec::with_capacity(nblocks as usize * BLCKSZ as usize); @@ -352,7 +353,7 @@ where let relmap_img = if has_relmap_file { let img = self .timeline - .get_relmap_file(spcnode, dbnode, self.lsn, self.ctx) + .get_relmap_file(spcnode, dbnode, Version::Lsn(self.lsn), self.ctx) .await?; ensure!( @@ -399,7 +400,7 @@ where if !has_relmap_file && self .timeline - .list_rels(spcnode, dbnode, self.lsn, self.ctx) + .list_rels(spcnode, dbnode, Version::Lsn(self.lsn), self.ctx) .await? 
.is_empty() { diff --git a/pageserver/src/bin/pageserver.rs b/pageserver/src/bin/pageserver.rs index 4531b9d989..49e646dd71 100644 --- a/pageserver/src/bin/pageserver.rs +++ b/pageserver/src/bin/pageserver.rs @@ -31,6 +31,7 @@ use pageserver::{ virtual_file, }; use postgres_backend::AuthType; +use utils::failpoint_support; use utils::logging::TracingErrorLayerEnablement; use utils::signals::ShutdownSignals; use utils::{ @@ -126,7 +127,7 @@ fn main() -> anyhow::Result<()> { } // Initialize up failpoints support - let scenario = pageserver::failpoint_support::init(); + let scenario = failpoint_support::init(); // Basic initialization of things that don't change after startup virtual_file::init(conf.max_file_descriptors, conf.virtual_file_io_engine); diff --git a/pageserver/src/config.rs b/pageserver/src/config.rs index b91f137cdb..7f68492f72 100644 --- a/pageserver/src/config.rs +++ b/pageserver/src/config.rs @@ -38,8 +38,8 @@ use crate::tenant::{ }; use crate::virtual_file; use crate::{ - IGNORED_TENANT_FILE_NAME, METADATA_FILE_NAME, TENANT_CONFIG_NAME, TENANT_LOCATION_CONFIG_NAME, - TIMELINE_DELETE_MARK_SUFFIX, TIMELINE_UNINIT_MARK_SUFFIX, + IGNORED_TENANT_FILE_NAME, METADATA_FILE_NAME, TENANT_CONFIG_NAME, TENANT_HEATMAP_BASENAME, + TENANT_LOCATION_CONFIG_NAME, TIMELINE_DELETE_MARK_SUFFIX, TIMELINE_UNINIT_MARK_SUFFIX, }; use self::defaults::DEFAULT_CONCURRENT_TENANT_WARMUP; @@ -78,6 +78,9 @@ pub mod defaults { pub const DEFAULT_BACKGROUND_TASK_MAXIMUM_DELAY: &str = "10s"; pub const DEFAULT_HEATMAP_UPLOAD_CONCURRENCY: usize = 8; + pub const DEFAULT_SECONDARY_DOWNLOAD_CONCURRENCY: usize = 1; + + pub const DEFAULT_INGEST_BATCH_SIZE: u64 = 100; pub const DEFAULT_VIRTUAL_FILE_IO_ENGINE: &str = "std-fs"; @@ -93,6 +96,7 @@ pub mod defaults { #wait_lsn_timeout = '{DEFAULT_WAIT_LSN_TIMEOUT}' #wal_redo_timeout = '{DEFAULT_WAL_REDO_TIMEOUT}' +#page_cache_size = {DEFAULT_PAGE_CACHE_SIZE} #max_file_descriptors = {DEFAULT_MAX_FILE_DESCRIPTORS} # initial superuser role name to 
use when creating a new tenant @@ -113,6 +117,8 @@ pub mod defaults { #background_task_maximum_delay = '{DEFAULT_BACKGROUND_TASK_MAXIMUM_DELAY}' +#ingest_batch_size = {DEFAULT_INGEST_BATCH_SIZE} + #virtual_file_io_engine = '{DEFAULT_VIRTUAL_FILE_IO_ENGINE}' [tenant_config] @@ -132,6 +138,7 @@ pub mod defaults { #gc_feedback = false #heatmap_upload_concurrency = {DEFAULT_HEATMAP_UPLOAD_CONCURRENCY} +#secondary_download_concurrency = {DEFAULT_SECONDARY_DOWNLOAD_CONCURRENCY} [remote_storage] @@ -241,6 +248,13 @@ pub struct PageServerConf { /// heatmap uploads vs. other remote storage operations. pub heatmap_upload_concurrency: usize, + /// How many remote storage downloads may be done for secondary tenants concurrently. Implicitly + /// deprioritises secondary downloads vs. remote storage operations for attached tenants. + pub secondary_download_concurrency: usize, + + /// Maximum number of WAL records to be ingested and committed at the same time + pub ingest_batch_size: u64, + pub virtual_file_io_engine: virtual_file::IoEngineKind, } @@ -323,6 +337,9 @@ struct PageServerConfigBuilder { control_plane_emergency_mode: BuilderValue, heatmap_upload_concurrency: BuilderValue, + secondary_download_concurrency: BuilderValue, + + ingest_batch_size: BuilderValue, virtual_file_io_engine: BuilderValue, } @@ -397,6 +414,9 @@ impl Default for PageServerConfigBuilder { control_plane_emergency_mode: Set(false), heatmap_upload_concurrency: Set(DEFAULT_HEATMAP_UPLOAD_CONCURRENCY), + secondary_download_concurrency: Set(DEFAULT_SECONDARY_DOWNLOAD_CONCURRENCY), + + ingest_batch_size: Set(DEFAULT_INGEST_BATCH_SIZE), virtual_file_io_engine: Set(DEFAULT_VIRTUAL_FILE_IO_ENGINE.parse().unwrap()), } @@ -547,6 +567,14 @@ impl PageServerConfigBuilder { self.heatmap_upload_concurrency = BuilderValue::Set(value) } + pub fn secondary_download_concurrency(&mut self, value: usize) { + self.secondary_download_concurrency = BuilderValue::Set(value) + } + + pub fn ingest_batch_size(&mut self, 
ingest_batch_size: u64) { + self.ingest_batch_size = BuilderValue::Set(ingest_batch_size) + } + pub fn virtual_file_io_engine(&mut self, value: virtual_file::IoEngineKind) { self.virtual_file_io_engine = BuilderValue::Set(value); } @@ -649,10 +677,15 @@ impl PageServerConfigBuilder { control_plane_emergency_mode: self .control_plane_emergency_mode .ok_or(anyhow!("missing control_plane_emergency_mode"))?, - heatmap_upload_concurrency: self .heatmap_upload_concurrency .ok_or(anyhow!("missing heatmap_upload_concurrency"))?, + secondary_download_concurrency: self + .secondary_download_concurrency + .ok_or(anyhow!("missing secondary_download_concurrency"))?, + ingest_batch_size: self + .ingest_batch_size + .ok_or(anyhow!("missing ingest_batch_size"))?, virtual_file_io_engine: self .virtual_file_io_engine .ok_or(anyhow!("missing virtual_file_io_engine"))?, @@ -713,6 +746,11 @@ impl PageServerConf { .join(TENANT_LOCATION_CONFIG_NAME) } + pub(crate) fn tenant_heatmap_path(&self, tenant_shard_id: &TenantShardId) -> Utf8PathBuf { + self.tenant_path(tenant_shard_id) + .join(TENANT_HEATMAP_BASENAME) + } + pub fn timelines_path(&self, tenant_shard_id: &TenantShardId) -> Utf8PathBuf { self.tenant_path(tenant_shard_id) .join(TIMELINES_SEGMENT_NAME) @@ -898,6 +936,10 @@ impl PageServerConf { "heatmap_upload_concurrency" => { builder.heatmap_upload_concurrency(parse_toml_u64(key, item)? as usize) }, + "secondary_download_concurrency" => { + builder.secondary_download_concurrency(parse_toml_u64(key, item)? as usize) + }, + "ingest_batch_size" => builder.ingest_batch_size(parse_toml_u64(key, item)?), "virtual_file_io_engine" => { builder.virtual_file_io_engine(parse_toml_from_str("virtual_file_io_engine", item)?) 
} @@ -972,6 +1014,8 @@ impl PageServerConf { control_plane_api_token: None, control_plane_emergency_mode: false, heatmap_upload_concurrency: defaults::DEFAULT_HEATMAP_UPLOAD_CONCURRENCY, + secondary_download_concurrency: defaults::DEFAULT_SECONDARY_DOWNLOAD_CONCURRENCY, + ingest_batch_size: defaults::DEFAULT_INGEST_BATCH_SIZE, virtual_file_io_engine: DEFAULT_VIRTUAL_FILE_IO_ENGINE.parse().unwrap(), } } @@ -1202,6 +1246,8 @@ background_task_maximum_delay = '334 s' control_plane_api_token: None, control_plane_emergency_mode: false, heatmap_upload_concurrency: defaults::DEFAULT_HEATMAP_UPLOAD_CONCURRENCY, + secondary_download_concurrency: defaults::DEFAULT_SECONDARY_DOWNLOAD_CONCURRENCY, + ingest_batch_size: defaults::DEFAULT_INGEST_BATCH_SIZE, virtual_file_io_engine: DEFAULT_VIRTUAL_FILE_IO_ENGINE.parse().unwrap(), }, "Correct defaults should be used when no config values are provided" @@ -1264,6 +1310,8 @@ background_task_maximum_delay = '334 s' control_plane_api_token: None, control_plane_emergency_mode: false, heatmap_upload_concurrency: defaults::DEFAULT_HEATMAP_UPLOAD_CONCURRENCY, + secondary_download_concurrency: defaults::DEFAULT_SECONDARY_DOWNLOAD_CONCURRENCY, + ingest_batch_size: 100, virtual_file_io_engine: DEFAULT_VIRTUAL_FILE_IO_ENGINE.parse().unwrap(), }, "Should be able to parse all basic config values correctly" @@ -1494,6 +1542,7 @@ threshold = "20m" period: Duration::from_secs(10), #[cfg(feature = "testing")] mock_statvfs: None, + eviction_order: crate::disk_usage_eviction_task::EvictionOrder::AbsoluteAccessed, }) ); match &conf.default_tenant_conf.eviction_policy { diff --git a/pageserver/src/control_plane_client.rs b/pageserver/src/control_plane_client.rs index 25ae3d1b01..950791ea48 100644 --- a/pageserver/src/control_plane_client.rs +++ b/pageserver/src/control_plane_client.rs @@ -1,5 +1,6 @@ use std::collections::HashMap; +use futures::Future; use pageserver_api::{ control_api::{ ReAttachRequest, ReAttachResponse, ValidateRequest, 
ValidateRequestTenant, ValidateResponse, @@ -28,13 +29,14 @@ pub enum RetryForeverError { ShuttingDown, } -#[async_trait::async_trait] pub trait ControlPlaneGenerationsApi { - async fn re_attach(&self) -> Result, RetryForeverError>; - async fn validate( + fn re_attach( + &self, + ) -> impl Future, RetryForeverError>> + Send; + fn validate( &self, tenants: Vec<(TenantShardId, Generation)>, - ) -> Result, RetryForeverError>; + ) -> impl Future, RetryForeverError>> + Send; } impl ControlPlaneClient { @@ -123,7 +125,6 @@ impl ControlPlaneClient { } } -#[async_trait::async_trait] impl ControlPlaneGenerationsApi for ControlPlaneClient { /// Block until we get a successful response, or error out if we are shut down async fn re_attach(&self) -> Result, RetryForeverError> { diff --git a/pageserver/src/deletion_queue.rs b/pageserver/src/deletion_queue.rs index 7b05745483..6a820e1bdc 100644 --- a/pageserver/src/deletion_queue.rs +++ b/pageserver/src/deletion_queue.rs @@ -831,7 +831,6 @@ mod test { } } - #[async_trait::async_trait] impl ControlPlaneGenerationsApi for MockControlPlane { #[allow(clippy::diverging_sub_expression)] // False positive via async_trait async fn re_attach(&self) -> Result, RetryForeverError> { diff --git a/pageserver/src/disk_usage_eviction_task.rs b/pageserver/src/disk_usage_eviction_task.rs index 76906cfaf7..23b9b573b6 100644 --- a/pageserver/src/disk_usage_eviction_task.rs +++ b/pageserver/src/disk_usage_eviction_task.rs @@ -74,6 +74,45 @@ pub struct DiskUsageEvictionTaskConfig { pub period: Duration, #[cfg(feature = "testing")] pub mock_statvfs: Option, + /// Select sorting for evicted layers + #[serde(default)] + pub eviction_order: EvictionOrder, +} + +/// Selects the sort order for eviction candidates *after* per tenant `min_resident_size` +/// partitioning. 
+#[derive(Default, Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize)] +#[serde(tag = "type", content = "args")] +pub enum EvictionOrder { + /// Order the layers to be evicted by how recently they have been accessed in absolute + /// time. + /// + /// This strategy is unfair when some tenants grow faster than others towards the slower + /// growing. + #[default] + AbsoluteAccessed, + + /// Order the layers to be evicted by how recently they have been accessed relatively within + /// the set of resident layers of a tenant. + /// + /// This strategy will evict layers more fairly but is untested. + RelativeAccessed { + #[serde(default)] + highest_layer_count_loses_first: bool, + }, +} + +impl EvictionOrder { + /// Return true, if with [`Self::RelativeAccessed`] order the tenants with the highest layer + /// counts should be the first ones to have their layers evicted. + fn highest_layer_count_loses_first(&self) -> bool { + match self { + EvictionOrder::AbsoluteAccessed => false, + EvictionOrder::RelativeAccessed { + highest_layer_count_loses_first, + } => *highest_layer_count_loses_first, + } + } } #[derive(Default)] @@ -192,7 +231,14 @@ async fn disk_usage_eviction_task_iteration( ) -> anyhow::Result<()> { let usage_pre = filesystem_level_usage::get(tenants_dir, task_config) .context("get filesystem-level disk usage before evictions")?; - let res = disk_usage_eviction_task_iteration_impl(state, storage, usage_pre, cancel).await; + let res = disk_usage_eviction_task_iteration_impl( + state, + storage, + usage_pre, + task_config.eviction_order, + cancel, + ) + .await; match res { Ok(outcome) => { debug!(?outcome, "disk_usage_eviction_iteration finished"); @@ -278,6 +324,7 @@ pub(crate) async fn disk_usage_eviction_task_iteration_impl( state: &State, _storage: &GenericRemoteStorage, usage_pre: U, + eviction_order: EvictionOrder, cancel: &CancellationToken, ) -> anyhow::Result> { // use tokio's mutex to get a Sync guard (instead of std::sync::Mutex) @@ -297,7 
+344,7 @@ pub(crate) async fn disk_usage_eviction_task_iteration_impl( "running disk usage based eviction due to pressure" ); - let candidates = match collect_eviction_candidates(cancel).await? { + let candidates = match collect_eviction_candidates(eviction_order, cancel).await? { EvictionCandidates::Cancelled => { return Ok(IterationOutcome::Cancelled); } @@ -307,16 +354,16 @@ pub(crate) async fn disk_usage_eviction_task_iteration_impl( // Debug-log the list of candidates let now = SystemTime::now(); for (i, (partition, candidate)) in candidates.iter().enumerate() { + let nth = i + 1; let desc = candidate.layer.layer_desc(); + let total_candidates = candidates.len(); + let size = desc.file_size; + let rel = candidate.relative_last_activity; debug!( - "cand {}/{}: size={}, no_access_for={}us, partition={:?}, {}/{}/{}", - i + 1, - candidates.len(), - desc.file_size, + "cand {nth}/{total_candidates}: size={size}, rel_last_activity={rel}, no_access_for={}us, partition={partition:?}, {}/{}/{}", now.duration_since(candidate.last_activity_ts) .unwrap() .as_micros(), - partition, desc.tenant_shard_id, desc.timeline_id, candidate.layer, @@ -459,6 +506,7 @@ struct EvictionCandidate { timeline: Arc, layer: Layer, last_activity_ts: SystemTime, + relative_last_activity: finite_f32::FiniteF32, } #[derive(Debug, Clone, Copy, PartialEq, Eq, PartialOrd, Ord)] @@ -478,24 +526,24 @@ enum EvictionCandidates { /// order. A caller that evicts in that order, until pressure is relieved, implements /// the eviction policy outlined in the module comment. /// -/// # Example +/// # Example with EvictionOrder::AbsoluteAccessed /// /// Imagine that there are two tenants, A and B, with five layers each, a-e. /// Each layer has size 100, and both tenant's min_resident_size is 150. 
/// The eviction order would be /// /// ```text -/// partition last_activity_ts tenant/layer -/// Above 18:30 A/c -/// Above 19:00 A/b -/// Above 18:29 B/c -/// Above 19:05 B/b -/// Above 20:00 B/a -/// Above 20:03 A/a -/// Below 20:30 A/d -/// Below 20:40 B/d -/// Below 20:45 B/e -/// Below 20:58 A/e +/// partition last_activity_ts tenant/layer +/// Above 18:30 A/c +/// Above 19:00 A/b +/// Above 18:29 B/c +/// Above 19:05 B/b +/// Above 20:00 B/a +/// Above 20:03 A/a +/// Below 20:30 A/d +/// Below 20:40 B/d +/// Below 20:45 B/e +/// Below 20:58 A/e /// ``` /// /// Now, if we need to evict 300 bytes to relieve pressure, we'd evict `A/c, A/b, B/c`. @@ -505,7 +553,77 @@ enum EvictionCandidates { /// `A/c, A/b, B/c, B/b, B/a, A/a, A/d, B/d, B/e`, reaching into the `Below` partition /// after exhauting the `Above` partition. /// So, we did not respect each tenant's min_resident_size. +/// +/// # Example with EvictionOrder::RelativeAccessed +/// +/// ```text +/// partition relative_age last_activity_ts tenant/layer +/// Above 0/4 18:30 A/c +/// Above 0/4 18:29 B/c +/// Above 1/4 19:00 A/b +/// Above 1/4 19:05 B/b +/// Above 2/4 20:00 B/a +/// Above 2/4 20:03 A/a +/// Below 3/4 20:30 A/d +/// Below 3/4 20:40 B/d +/// Below 4/4 20:45 B/e +/// Below 4/4 20:58 A/e +/// ``` +/// +/// With tenants having the same number of layers the picture does not change much. The same with +/// A having many more layers **resident** (not all of them listed): +/// +/// ```text +/// Above 0/100 18:30 A/c +/// Above 0/4 18:29 B/c +/// Above 1/100 19:00 A/b +/// Above 2/100 20:03 A/a +/// Above 3/100 20:03 A/nth_3 +/// Above 4/100 20:03 A/nth_4 +/// ... +/// Above 1/4 19:05 B/b +/// Above 25/100 20:04 A/nth_25 +/// ... +/// Above 2/4 20:00 B/a +/// Above 50/100 20:10 A/nth_50 +/// ... 
+/// Below 3/4 20:40 B/d +/// Below 99/100 20:30 A/nth_99 +/// Below 4/4 20:45 B/e +/// Below 100/100 20:58 A/nth_100 +/// ``` +/// +/// Now it's easier to see that because A has grown fast it has more layers to get evicted. What is +/// difficult to see is what happens on the next round assuming the evicting 23 from the above list +/// relieves the pressure (22 A layers gone, 1 B layers gone) but a new fast growing tenant C has +/// appeared: +/// +/// ```text +/// Above 0/87 20:04 A/nth_23 +/// Above 0/3 19:05 B/b +/// Above 0/50 20:59 C/nth_0 +/// Above 1/87 20:04 A/nth_24 +/// Above 1/50 21:00 C/nth_1 +/// Above 2/87 20:04 A/nth_25 +/// ... +/// Above 16/50 21:02 C/nth_16 +/// Above 1/3 20:00 B/a +/// Above 27/87 20:10 A/nth_50 +/// ... +/// Below 2/3 20:40 B/d +/// Below 49/50 21:05 C/nth_49 +/// Below 86/87 20:30 A/nth_99 +/// Below 3/3 20:45 B/e +/// Below 50/50 21:05 C/nth_50 +/// Below 87/87 20:58 A/nth_100 +/// ``` +/// +/// Now relieving pressure with 23 layers would cost: +/// - tenant A 14 layers +/// - tenant B 1 layer +/// - tenant C 8 layers async fn collect_eviction_candidates( + eviction_order: EvictionOrder, cancel: &CancellationToken, ) -> anyhow::Result { // get a snapshot of the list of tenants @@ -591,12 +709,63 @@ async fn collect_eviction_candidates( tenant_candidates .sort_unstable_by_key(|(_, layer_info)| std::cmp::Reverse(layer_info.last_activity_ts)); let mut cumsum: i128 = 0; - for (timeline, layer_info) in tenant_candidates.into_iter() { + + // keeping the -1 or not decides if every tenant should lose their least recently accessed + // layer OR if this should happen in the order of having highest layer count: + let fudge = if eviction_order.highest_layer_count_loses_first() { + // relative_age vs. tenant layer count: + // - 0.1..=1.0 (10 layers) + // - 0.01..=1.0 (100 layers) + // - 0.001..=1.0 (1000 layers) + // + // leading to evicting less of the smallest tenants. 
+ 0 + } else { + // use full 0.0..=1.0 range, which means even the smallest tenants could always lose a + // layer. the actual ordering is unspecified: for 10k tenants on a pageserver it could + // be that less than 10k layer evictions is enough, so we would not need to evict from + // all tenants. + // + // as the tenant ordering is now deterministic this could hit the same tenants + // disproportionetly on multiple invocations. alternative could be to remember how many + // layers did we evict last time from this tenant, and inject that as an additional + // fudge here. + 1 + }; + + let total = tenant_candidates + .len() + .checked_sub(fudge) + .filter(|&x| x > 0) + // support 0 or 1 resident layer tenants as well + .unwrap_or(1); + let divider = total as f32; + + for (i, (timeline, layer_info)) in tenant_candidates.into_iter().enumerate() { let file_size = layer_info.file_size(); + + // as we iterate this reverse sorted list, the most recently accessed layer will always + // be 1.0; this is for us to evict it last. + let relative_last_activity = if matches!( + eviction_order, + EvictionOrder::RelativeAccessed { .. } + ) { + // another possibility: use buckets, like (256.0 * relative_last_activity) as u8 or + // similarly for u16. unsure how it would help. 
+ finite_f32::FiniteF32::try_from_normalized((total - i) as f32 / divider) + .unwrap_or_else(|val| { + tracing::warn!(%fudge, "calculated invalid relative_last_activity for i={i}, total={total}: {val}"); + finite_f32::FiniteF32::ZERO + }) + } else { + finite_f32::FiniteF32::ZERO + }; + let candidate = EvictionCandidate { timeline, last_activity_ts: layer_info.last_activity_ts, layer: layer_info.layer, + relative_last_activity, }; let partition = if cumsum > min_resident_size as i128 { MinResidentSizePartition::Above @@ -610,8 +779,19 @@ async fn collect_eviction_candidates( debug_assert!(MinResidentSizePartition::Above < MinResidentSizePartition::Below, "as explained in the function's doc comment, layers that aren't in the tenant's min_resident_size are evicted first"); - candidates - .sort_unstable_by_key(|(partition, candidate)| (*partition, candidate.last_activity_ts)); + + match eviction_order { + EvictionOrder::AbsoluteAccessed => { + candidates.sort_unstable_by_key(|(partition, candidate)| { + (*partition, candidate.last_activity_ts) + }); + } + EvictionOrder::RelativeAccessed { .. } => { + candidates.sort_unstable_by_key(|(partition, candidate)| { + (*partition, candidate.relative_last_activity) + }); + } + } Ok(EvictionCandidates::Finished(candidates)) } @@ -640,6 +820,66 @@ impl std::ops::Deref for TimelineKey { } } +/// A totally ordered f32 subset we can use with sorting functions. +mod finite_f32 { + + /// A totally ordered f32 subset we can use with sorting functions. 
+ #[derive(Clone, Copy, PartialEq)] + pub struct FiniteF32(f32); + + impl std::fmt::Debug for FiniteF32 { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + std::fmt::Debug::fmt(&self.0, f) + } + } + + impl std::fmt::Display for FiniteF32 { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + std::fmt::Display::fmt(&self.0, f) + } + } + + impl std::cmp::Eq for FiniteF32 {} + + impl std::cmp::PartialOrd for FiniteF32 { + fn partial_cmp(&self, other: &Self) -> Option { + Some(self.cmp(other)) + } + } + + impl std::cmp::Ord for FiniteF32 { + fn cmp(&self, other: &Self) -> std::cmp::Ordering { + self.0.total_cmp(&other.0) + } + } + + impl TryFrom for FiniteF32 { + type Error = f32; + + fn try_from(value: f32) -> Result { + if value.is_finite() { + Ok(FiniteF32(value)) + } else { + Err(value) + } + } + } + + impl FiniteF32 { + pub const ZERO: FiniteF32 = FiniteF32(0.0); + + pub fn try_from_normalized(value: f32) -> Result { + if (0.0..=1.0).contains(&value) { + // -0.0 is within the range, make sure it is assumed 0.0..=1.0 + let value = value.abs(); + Ok(FiniteF32(value)) + } else { + Err(value) + } + } + } +} + mod filesystem_level_usage { use anyhow::Context; use camino::Utf8Path; @@ -721,6 +961,7 @@ mod filesystem_level_usage { #[test] fn max_usage_pct_pressure() { + use super::EvictionOrder; use super::Usage as _; use std::time::Duration; use utils::serde_percent::Percent; @@ -732,6 +973,7 @@ mod filesystem_level_usage { period: Duration::MAX, #[cfg(feature = "testing")] mock_statvfs: None, + eviction_order: EvictionOrder::default(), }, total_bytes: 100_000, avail_bytes: 0, diff --git a/pageserver/src/failpoint_support.rs b/pageserver/src/failpoint_support.rs deleted file mode 100644 index 2190eba18a..0000000000 --- a/pageserver/src/failpoint_support.rs +++ /dev/null @@ -1,86 +0,0 @@ -/// use with fail::cfg("$name", "return(2000)") -/// -/// The effect is similar to a "sleep(2000)" action, i.e. 
we sleep for the -/// specified time (in milliseconds). The main difference is that we use async -/// tokio sleep function. Another difference is that we print lines to the log, -/// which can be useful in tests to check that the failpoint was hit. -#[macro_export] -macro_rules! __failpoint_sleep_millis_async { - ($name:literal) => {{ - // If the failpoint is used with a "return" action, set should_sleep to the - // returned value (as string). Otherwise it's set to None. - let should_sleep = (|| { - ::fail::fail_point!($name, |x| x); - ::std::option::Option::None - })(); - - // Sleep if the action was a returned value - if let ::std::option::Option::Some(duration_str) = should_sleep { - $crate::failpoint_support::failpoint_sleep_helper($name, duration_str).await - } - }}; -} -pub use __failpoint_sleep_millis_async as sleep_millis_async; - -// Helper function used by the macro. (A function has nicer scoping so we -// don't need to decorate everything with "::") -#[doc(hidden)] -pub(crate) async fn failpoint_sleep_helper(name: &'static str, duration_str: String) { - let millis = duration_str.parse::().unwrap(); - let d = std::time::Duration::from_millis(millis); - - tracing::info!("failpoint {:?}: sleeping for {:?}", name, d); - tokio::time::sleep(d).await; - tracing::info!("failpoint {:?}: sleep done", name); -} - -pub fn init() -> fail::FailScenario<'static> { - // The failpoints lib provides support for parsing the `FAILPOINTS` env var. - // We want non-default behavior for `exit`, though, so, we handle it separately. - // - // Format for FAILPOINTS is "name=actions" separated by ";". 
- let actions = std::env::var("FAILPOINTS"); - if actions.is_ok() { - std::env::remove_var("FAILPOINTS"); - } else { - // let the library handle non-utf8, or nothing for not present - } - - let scenario = fail::FailScenario::setup(); - - if let Ok(val) = actions { - val.split(';') - .enumerate() - .map(|(i, s)| s.split_once('=').ok_or((i, s))) - .for_each(|res| { - let (name, actions) = match res { - Ok(t) => t, - Err((i, s)) => { - panic!( - "startup failpoints: missing action on the {}th failpoint; try `{s}=return`", - i + 1, - ); - } - }; - if let Err(e) = apply_failpoint(name, actions) { - panic!("startup failpoints: failed to apply failpoint {name}={actions}: {e}"); - } - }); - } - - scenario -} - -pub(crate) fn apply_failpoint(name: &str, actions: &str) -> Result<(), String> { - if actions == "exit" { - fail::cfg_callback(name, exit_failpoint) - } else { - fail::cfg(name, actions) - } -} - -#[inline(never)] -fn exit_failpoint() { - tracing::info!("Exit requested by failpoint"); - std::process::exit(1); -} diff --git a/pageserver/src/http/openapi_spec.yml b/pageserver/src/http/openapi_spec.yml index b79c5ada9a..1fbca1086f 100644 --- a/pageserver/src/http/openapi_spec.yml +++ b/pageserver/src/http/openapi_spec.yml @@ -159,6 +159,12 @@ paths: application/json: schema: $ref: "#/components/schemas/ConflictError" + "412": + description: Deletion may not proceed, tenant is not in Active state + content: + application/json: + schema: + $ref: "#/components/schemas/PreconditionFailedError" "500": description: Generic operation error content: diff --git a/pageserver/src/http/routes.rs b/pageserver/src/http/routes.rs index e641e44b08..af56a1b455 100644 --- a/pageserver/src/http/routes.rs +++ b/pageserver/src/http/routes.rs @@ -15,6 +15,7 @@ use hyper::StatusCode; use hyper::{Body, Request, Response, Uri}; use metrics::launch_timestamp::LaunchTimestamp; use pageserver_api::models::TenantDetails; +use pageserver_api::models::TenantState; use pageserver_api::models::{ 
DownloadRemoteLayersTaskSpawnRequest, LocationConfigMode, TenantAttachRequest, TenantLoadRequest, TenantLocationConfigRequest, @@ -25,6 +26,7 @@ use tenant_size_model::{SizeResult, StorageModel}; use tokio_util::sync::CancellationToken; use tracing::*; use utils::auth::JwtAuth; +use utils::failpoint_support::failpoints_handler; use utils::http::endpoint::request_span; use utils::http::json::json_request_or_empty_body; use utils::http::request::{get_request_param, must_get_query_param, parse_query_param}; @@ -36,6 +38,7 @@ use crate::pgdatadir_mapping::LsnForTimestamp; use crate::task_mgr::TaskKind; use crate::tenant::config::{LocationConf, TenantConfOpt}; use crate::tenant::mgr::GetActiveTenantError; +use crate::tenant::mgr::UpsertLocationError; use crate::tenant::mgr::{ GetTenantError, SetNewTenantConfigError, TenantManager, TenantMapError, TenantMapInsertError, TenantSlotError, TenantSlotUpsertError, TenantStateError, @@ -45,7 +48,8 @@ use crate::tenant::size::ModelInputs; use crate::tenant::storage_layer::LayerAccessStatsReset; use crate::tenant::timeline::CompactFlags; use crate::tenant::timeline::Timeline; -use crate::tenant::{LogicalSizeCalculationCause, PageReconstructError, TenantSharedResources}; +use crate::tenant::SpawnMode; +use crate::tenant::{LogicalSizeCalculationCause, PageReconstructError}; use crate::{config::PageServerConf, tenant::mgr}; use crate::{disk_usage_eviction_task, tenant}; use pageserver_api::models::{ @@ -66,9 +70,6 @@ use utils::{ lsn::Lsn, }; -// Imports only used for testing APIs -use pageserver_api::models::ConfigureFailpointsRequest; - // For APIs that require an Active tenant, how long should we block waiting for that state? // This is not functionally necessary (clients will retry), but avoids generating a lot of // failed API calls while tenants are activating. 
@@ -114,14 +115,6 @@ impl State { secondary_controller, }) } - - fn tenant_resources(&self) -> TenantSharedResources { - TenantSharedResources { - broker_client: self.broker_client.clone(), - remote_storage: self.remote_storage.clone(), - deletion_queue_client: self.deletion_queue_client.clone(), - } - } } #[inline(always)] @@ -154,6 +147,7 @@ impl From for ApiError { PageReconstructError::AncestorStopping(_) => { ApiError::ResourceUnavailable(format!("{pre}").into()) } + PageReconstructError::AncestorLsnTimeout(e) => ApiError::Timeout(format!("{e}").into()), PageReconstructError::WalRedo(pre) => ApiError::InternalServerError(pre), } } @@ -176,7 +170,7 @@ impl From for ApiError { NotFound(tenant_id) => { ApiError::NotFound(anyhow::anyhow!("NotFound: tenant {tenant_id}").into()) } - e @ (AlreadyExists(_, _) | Conflict(_)) => ApiError::Conflict(format!("{e}")), + e @ AlreadyExists(_, _) => ApiError::Conflict(format!("{e}")), InProgress => { ApiError::ResourceUnavailable("Tenant is being modified concurrently".into()) } @@ -195,6 +189,18 @@ impl From for ApiError { } } +impl From for ApiError { + fn from(e: UpsertLocationError) -> ApiError { + use UpsertLocationError::*; + match e { + BadRequest(e) => ApiError::BadRequest(e), + Unavailable(_) => ApiError::ShuttingDown, + e @ InProgress => ApiError::Conflict(format!("{e}")), + Flush(e) | Other(e) => ApiError::InternalServerError(e), + } + } +} + impl From for ApiError { fn from(e: TenantMapError) -> ApiError { use TenantMapError::*; @@ -308,6 +314,7 @@ impl From for ApiError { SlotUpsertError(e) => e.into(), Other(o) => ApiError::InternalServerError(o), e @ InvalidState(_) => ApiError::PreconditionFailed(e.to_string().into_boxed_str()), + Cancelled => ApiError::ShuttingDown, } } } @@ -316,11 +323,21 @@ impl From for ApiError { async fn build_timeline_info( timeline: &Arc, include_non_incremental_logical_size: bool, + force_await_initial_logical_size: bool, ctx: &RequestContext, ) -> anyhow::Result { 
crate::tenant::debug_assert_current_span_has_tenant_and_timeline_id(); - let mut info = build_timeline_info_common(timeline, ctx).await?; + if force_await_initial_logical_size { + timeline.clone().await_initial_logical_size().await + } + + let mut info = build_timeline_info_common( + timeline, + ctx, + tenant::timeline::GetLogicalSizePriority::Background, + ) + .await?; if include_non_incremental_logical_size { // XXX we should be using spawn_ondemand_logical_size_calculation here. // Otherwise, if someone deletes the timeline / detaches the tenant while @@ -337,6 +354,7 @@ async fn build_timeline_info( async fn build_timeline_info_common( timeline: &Arc, ctx: &RequestContext, + logical_size_task_priority: tenant::timeline::GetLogicalSizePriority, ) -> anyhow::Result { crate::tenant::debug_assert_current_span_has_tenant_and_timeline_id(); let initdb_lsn = timeline.initdb_lsn; @@ -359,8 +377,7 @@ async fn build_timeline_info_common( Lsn(0) => None, lsn @ Lsn(_) => Some(lsn), }; - let current_logical_size = - timeline.get_current_logical_size(tenant::timeline::GetLogicalSizePriority::User, ctx); + let current_logical_size = timeline.get_current_logical_size(logical_size_task_priority, ctx); let current_physical_size = Some(timeline.layer_size_sum().await); let state = timeline.current_state(); let remote_consistent_lsn_projected = timeline @@ -471,7 +488,7 @@ async fn timeline_create_handler( .await { Ok(new_timeline) => { // Created. Construct a TimelineInfo for it. 
- let timeline_info = build_timeline_info_common(&new_timeline, &ctx) + let timeline_info = build_timeline_info_common(&new_timeline, &ctx, tenant::timeline::GetLogicalSizePriority::User) .await .map_err(ApiError::InternalServerError)?; json_response(StatusCode::CREATED, timeline_info) @@ -507,6 +524,8 @@ async fn timeline_list_handler( let tenant_shard_id: TenantShardId = parse_request_param(&request, "tenant_shard_id")?; let include_non_incremental_logical_size: Option = parse_query_param(&request, "include-non-incremental-logical-size")?; + let force_await_initial_logical_size: Option = + parse_query_param(&request, "force-await-initial-logical-size")?; check_permission(&request, Some(tenant_shard_id.tenant_id))?; let ctx = RequestContext::new(TaskKind::MgmtRequest, DownloadBehavior::Download); @@ -520,6 +539,7 @@ async fn timeline_list_handler( let timeline_info = build_timeline_info( &timeline, include_non_incremental_logical_size.unwrap_or(false), + force_await_initial_logical_size.unwrap_or(false), &ctx, ) .instrument(info_span!("build_timeline_info", timeline_id = %timeline.timeline_id)) @@ -547,6 +567,8 @@ async fn timeline_detail_handler( let timeline_id: TimelineId = parse_request_param(&request, "timeline_id")?; let include_non_incremental_logical_size: Option = parse_query_param(&request, "include-non-incremental-logical-size")?; + let force_await_initial_logical_size: Option = + parse_query_param(&request, "force-await-initial-logical-size")?; check_permission(&request, Some(tenant_shard_id.tenant_id))?; // Logical size calculation needs downloading. 
@@ -562,6 +584,7 @@ async fn timeline_detail_handler( let timeline_info = build_timeline_info( &timeline, include_non_incremental_logical_size.unwrap_or(false), + force_await_initial_logical_size.unwrap_or(false), &ctx, ) .await @@ -680,16 +703,37 @@ async fn tenant_attach_handler( ))); } - mgr::attach_tenant( - state.conf, - tenant_id, - generation, - tenant_conf, - state.tenant_resources(), - &ctx, - ) - .instrument(info_span!("tenant_attach", %tenant_id)) - .await?; + let tenant_shard_id = TenantShardId::unsharded(tenant_id); + let location_conf = LocationConf::attached_single(tenant_conf, generation); + let tenant = state + .tenant_manager + .upsert_location( + tenant_shard_id, + location_conf, + None, + SpawnMode::Normal, + &ctx, + ) + .await?; + + let Some(tenant) = tenant else { + // This should never happen: indicates a bug in upsert_location + return Err(ApiError::InternalServerError(anyhow::anyhow!( + "Upsert succeeded but didn't return tenant!" + ))); + }; + + // We might have successfully constructed a Tenant, but it could still + // end up in a broken state: + if let TenantState::Broken { + reason, + backtrace: _, + } = tenant.current_state() + { + return Err(ApiError::InternalServerError(anyhow::anyhow!( + "Tenant state is Broken: {reason}" + ))); + } json_response(StatusCode::ACCEPTED, ()) } @@ -886,7 +930,9 @@ async fn tenant_delete_handler( let state = get_state(&request); - mgr::delete_tenant(state.conf, state.remote_storage.clone(), tenant_shard_id) + state + .tenant_manager + .delete_tenant(tenant_shard_id, ACTIVE_TENANT_TIMEOUT) .instrument(info_span!("tenant_delete_handler", tenant_id = %tenant_shard_id.tenant_id, shard = %tenant_shard_id.shard_slug() @@ -1146,16 +1192,25 @@ async fn tenant_create_handler( let ctx = RequestContext::new(TaskKind::MgmtRequest, DownloadBehavior::Warn); - let new_tenant = mgr::create_tenant( - state.conf, - tenant_conf, - target_tenant_id, - generation, - state.tenant_resources(), - &ctx, - ) - 
.instrument(info_span!("tenant_create", tenant_id = %target_tenant_id)) - .await?; + let location_conf = LocationConf::attached_single(tenant_conf, generation); + + let new_tenant = state + .tenant_manager + .upsert_location( + target_tenant_id, + location_conf, + None, + SpawnMode::Create, + &ctx, + ) + .await?; + + let Some(new_tenant) = new_tenant else { + // This should never happen: indicates a bug in upsert_location + return Err(ApiError::InternalServerError(anyhow::anyhow!( + "Upsert succeeded but didn't return tenant!" + ))); + }; // We created the tenant. Existing API semantics are that the tenant // is Active when this function returns. @@ -1164,7 +1219,7 @@ async fn tenant_create_handler( .await { // This shouldn't happen because we just created the tenant directory - // in tenant::mgr::create_tenant, and there aren't any remote timelines + // in upsert_location, and there aren't any remote timelines // to load, so, nothing can really fail during load. // Don't do cleanup because we don't know how we got here. // The tenant will likely be in `Broken` state and subsequent @@ -1265,12 +1320,31 @@ async fn put_tenant_location_config_handler( state .tenant_manager - .upsert_location(tenant_shard_id, location_conf, flush, &ctx) - .await - // TODO: badrequest assumes the caller was asking for something unreasonable, but in - // principle we might have hit something like concurrent API calls to the same tenant, - // which is not a 400 but a 409. 
- .map_err(ApiError::BadRequest)?; + .upsert_location( + tenant_shard_id, + location_conf, + flush, + tenant::SpawnMode::Normal, + &ctx, + ) + .await?; + + if let Some(_flush_ms) = flush { + match state + .secondary_controller + .upload_tenant(tenant_shard_id) + .await + { + Ok(()) => { + tracing::info!("Uploaded heatmap during flush"); + } + Err(e) => { + tracing::warn!("Failed to flush heatmap: {e}"); + } + } + } else { + tracing::info!("No flush requested when configuring"); + } json_response(StatusCode::OK, ()) } @@ -1290,34 +1364,6 @@ async fn handle_tenant_break( json_response(StatusCode::OK, ()) } -async fn failpoints_handler( - mut request: Request, - _cancel: CancellationToken, -) -> Result, ApiError> { - if !fail::has_failpoints() { - return Err(ApiError::BadRequest(anyhow!( - "Cannot manage failpoints because pageserver was compiled without failpoints support" - ))); - } - - let failpoints: ConfigureFailpointsRequest = json_request(&mut request).await?; - for fp in failpoints { - info!("cfg failpoint: {} {}", fp.name, fp.actions); - - // We recognize one extra "action" that's not natively recognized - // by the failpoints crate: exit, to immediately kill the process - let cfg_result = crate::failpoint_support::apply_failpoint(&fp.name, &fp.actions); - - if let Err(err_msg) = cfg_result { - return Err(ApiError::BadRequest(anyhow!( - "Failed to configure failpoints: {err_msg}" - ))); - } - } - - json_response(StatusCode::OK, ()) -} - // Run GC immediately on given timeline. async fn timeline_gc_handler( mut request: Request, @@ -1566,19 +1612,22 @@ async fn disk_usage_eviction_run( struct Config { /// How many bytes to evict before reporting that pressure is relieved. 
evict_bytes: u64, + + #[serde(default)] + eviction_order: crate::disk_usage_eviction_task::EvictionOrder, } #[derive(Debug, Clone, Copy, serde::Serialize)] struct Usage { // remains unchanged after instantiation of the struct - config: Config, + evict_bytes: u64, // updated by `add_available_bytes` freed_bytes: u64, } impl crate::disk_usage_eviction_task::Usage for Usage { fn has_pressure(&self) -> bool { - self.config.evict_bytes > self.freed_bytes + self.evict_bytes > self.freed_bytes } fn add_available_bytes(&mut self, bytes: u64) { @@ -1589,7 +1638,7 @@ async fn disk_usage_eviction_run( let config = json_request::(&mut r).await?; let usage = Usage { - config, + evict_bytes: config.evict_bytes, freed_bytes: 0, }; @@ -1604,7 +1653,11 @@ async fn disk_usage_eviction_run( let state = state.disk_usage_eviction_state.clone(); let res = crate::disk_usage_eviction_task::disk_usage_eviction_task_iteration_impl( - &state, storage, usage, &cancel, + &state, + storage, + usage, + config.eviction_order, + &cancel, ) .await; @@ -1630,6 +1683,21 @@ async fn secondary_upload_handler( json_response(StatusCode::OK, ()) } +async fn secondary_download_handler( + request: Request, + _cancel: CancellationToken, +) -> Result, ApiError> { + let state = get_state(&request); + let tenant_shard_id: TenantShardId = parse_request_param(&request, "tenant_shard_id")?; + state + .secondary_controller + .download_tenant(tenant_shard_id) + .await + .map_err(ApiError::InternalServerError)?; + + json_response(StatusCode::OK, ()) +} + async fn handler_404(_: Request) -> Result, ApiError> { json_response( StatusCode::NOT_FOUND, @@ -1898,6 +1966,9 @@ pub fn make_router( .put("/v1/deletion_queue/flush", |r| { api_handler(r, deletion_queue_flush) }) + .post("/v1/tenant/:tenant_shard_id/secondary/download", |r| { + api_handler(r, secondary_download_handler) + }) .put("/v1/tenant/:tenant_shard_id/break", |r| { testing_api_handler("set tenant state to broken", r, handle_tenant_break) }) diff --git 
a/pageserver/src/import_datadir.rs b/pageserver/src/import_datadir.rs index d95d75449d..d66df36b3a 100644 --- a/pageserver/src/import_datadir.rs +++ b/pageserver/src/import_datadir.rs @@ -21,6 +21,7 @@ use tracing::*; use walkdir::WalkDir; use crate::context::RequestContext; +use crate::metrics::WAL_INGEST; use crate::pgdatadir_mapping::*; use crate::tenant::remote_timeline_client::INITDB_PATH; use crate::tenant::Timeline; @@ -312,13 +313,16 @@ async fn import_wal( waldecoder.feed_bytes(&buf); let mut nrecords = 0; - let mut modification = tline.begin_modification(endpoint); + let mut modification = tline.begin_modification(last_lsn); let mut decoded = DecodedWALRecord::default(); while last_lsn <= endpoint { if let Some((lsn, recdata)) = waldecoder.poll_decode()? { walingest .ingest_record(recdata, lsn, &mut modification, &mut decoded, ctx) .await?; + WAL_INGEST.records_committed.inc(); + + modification.commit(ctx).await?; last_lsn = lsn; nrecords += 1; @@ -448,13 +452,14 @@ pub async fn import_wal_from_tar( waldecoder.feed_bytes(&bytes[offset..]); - let mut modification = tline.begin_modification(end_lsn); + let mut modification = tline.begin_modification(last_lsn); let mut decoded = DecodedWALRecord::default(); while last_lsn <= end_lsn { if let Some((lsn, recdata)) = waldecoder.poll_decode()? { walingest .ingest_record(recdata, lsn, &mut modification, &mut decoded, ctx) .await?; + modification.commit(ctx).await?; last_lsn = lsn; debug!("imported record at {} (end {})", lsn, end_lsn); diff --git a/pageserver/src/lib.rs b/pageserver/src/lib.rs index 22b1721292..bcde1166b7 100644 --- a/pageserver/src/lib.rs +++ b/pageserver/src/lib.rs @@ -26,8 +26,6 @@ pub mod walingest; pub mod walrecord; pub mod walredo; -pub mod failpoint_support; - use crate::task_mgr::TaskKind; use camino::Utf8Path; use deletion_queue::DeletionQueue; @@ -120,6 +118,10 @@ pub const TENANT_CONFIG_NAME: &str = "config"; /// Full path: `tenants//config`. 
pub const TENANT_LOCATION_CONFIG_NAME: &str = "config-v1"; +/// Per-tenant copy of their remote heatmap, downloaded into the local +/// tenant path while in secondary mode. +pub const TENANT_HEATMAP_BASENAME: &str = "heatmap-v1.json"; + /// A suffix used for various temporary files. Any temporary files found in the /// data directory at pageserver startup can be automatically removed. pub const TEMP_FILE_SUFFIX: &str = "___temp"; diff --git a/pageserver/src/metrics.rs b/pageserver/src/metrics.rs index f2bf65da24..6ed9d2ad0b 100644 --- a/pageserver/src/metrics.rs +++ b/pageserver/src/metrics.rs @@ -29,7 +29,7 @@ const CRITICAL_OP_BUCKETS: &[f64] = &[ // Metrics collected on operations on the storage repository. #[derive(Debug, EnumVariantNames, IntoStaticStr)] #[strum(serialize_all = "kebab_case")] -pub enum StorageTimeOperation { +pub(crate) enum StorageTimeOperation { #[strum(serialize = "layer flush")] LayerFlush, @@ -55,7 +55,7 @@ pub enum StorageTimeOperation { CreateTenant, } -pub static STORAGE_TIME_SUM_PER_TIMELINE: Lazy = Lazy::new(|| { +pub(crate) static STORAGE_TIME_SUM_PER_TIMELINE: Lazy = Lazy::new(|| { register_counter_vec!( "pageserver_storage_operations_seconds_sum", "Total time spent on storage operations with operation, tenant and timeline dimensions", @@ -64,7 +64,7 @@ pub static STORAGE_TIME_SUM_PER_TIMELINE: Lazy = Lazy::new(|| { .expect("failed to define a metric") }); -pub static STORAGE_TIME_COUNT_PER_TIMELINE: Lazy = Lazy::new(|| { +pub(crate) static STORAGE_TIME_COUNT_PER_TIMELINE: Lazy = Lazy::new(|| { register_int_counter_vec!( "pageserver_storage_operations_seconds_count", "Count of storage operations with operation, tenant and timeline dimensions", @@ -150,7 +150,7 @@ pub(crate) static MATERIALIZED_PAGE_CACHE_HIT: Lazy = Lazy::new(|| { .expect("failed to define a metric") }); -pub struct PageCacheMetricsForTaskKind { +pub(crate) struct PageCacheMetricsForTaskKind { pub read_accesses_materialized_page: IntCounter, pub 
read_accesses_immutable: IntCounter, @@ -159,7 +159,7 @@ pub struct PageCacheMetricsForTaskKind { pub read_hits_materialized_page_older_lsn: IntCounter, } -pub struct PageCacheMetrics { +pub(crate) struct PageCacheMetrics { map: EnumMap>, } @@ -181,7 +181,7 @@ static PAGE_CACHE_READ_ACCESSES: Lazy = Lazy::new(|| { .expect("failed to define a metric") }); -pub static PAGE_CACHE: Lazy = Lazy::new(|| PageCacheMetrics { +pub(crate) static PAGE_CACHE: Lazy = Lazy::new(|| PageCacheMetrics { map: EnumMap::from_array(std::array::from_fn(|task_kind| { let task_kind = ::from_usize(task_kind); let task_kind: &'static str = task_kind.into(); @@ -243,10 +243,9 @@ impl PageCacheMetrics { } } -pub struct PageCacheSizeMetrics { +pub(crate) struct PageCacheSizeMetrics { pub max_bytes: UIntGauge, - pub current_bytes_ephemeral: UIntGauge, pub current_bytes_immutable: UIntGauge, pub current_bytes_materialized_page: UIntGauge, } @@ -260,31 +259,26 @@ static PAGE_CACHE_SIZE_CURRENT_BYTES: Lazy = Lazy::new(|| { .expect("failed to define a metric") }); -pub static PAGE_CACHE_SIZE: Lazy = Lazy::new(|| PageCacheSizeMetrics { - max_bytes: { - register_uint_gauge!( - "pageserver_page_cache_size_max_bytes", - "Maximum size of the page cache in bytes" - ) - .expect("failed to define a metric") - }, - - current_bytes_ephemeral: { - PAGE_CACHE_SIZE_CURRENT_BYTES - .get_metric_with_label_values(&["ephemeral"]) - .unwrap() - }, - current_bytes_immutable: { - PAGE_CACHE_SIZE_CURRENT_BYTES - .get_metric_with_label_values(&["immutable"]) - .unwrap() - }, - current_bytes_materialized_page: { - PAGE_CACHE_SIZE_CURRENT_BYTES - .get_metric_with_label_values(&["materialized_page"]) - .unwrap() - }, -}); +pub(crate) static PAGE_CACHE_SIZE: Lazy = + Lazy::new(|| PageCacheSizeMetrics { + max_bytes: { + register_uint_gauge!( + "pageserver_page_cache_size_max_bytes", + "Maximum size of the page cache in bytes" + ) + .expect("failed to define a metric") + }, + current_bytes_immutable: { + 
PAGE_CACHE_SIZE_CURRENT_BYTES + .get_metric_with_label_values(&["immutable"]) + .unwrap() + }, + current_bytes_materialized_page: { + PAGE_CACHE_SIZE_CURRENT_BYTES + .get_metric_with_label_values(&["materialized_page"]) + .unwrap() + }, + }); pub(crate) mod page_cache_eviction_metrics { use std::num::NonZeroUsize; @@ -740,13 +734,13 @@ pub(crate) static TENANT: Lazy = Lazy::new(|| { /// Each `Timeline`'s [`EVICTIONS_WITH_LOW_RESIDENCE_DURATION`] metric. #[derive(Debug)] -pub struct EvictionsWithLowResidenceDuration { +pub(crate) struct EvictionsWithLowResidenceDuration { data_source: &'static str, threshold: Duration, counter: Option, } -pub struct EvictionsWithLowResidenceDurationBuilder { +pub(crate) struct EvictionsWithLowResidenceDurationBuilder { data_source: &'static str, threshold: Duration, } @@ -1010,7 +1004,7 @@ pub enum SmgrQueryType { } #[derive(Debug)] -pub struct SmgrQueryTimePerTimeline { +pub(crate) struct SmgrQueryTimePerTimeline { metrics: [GlobalAndPerTimelineHistogram; SmgrQueryType::COUNT], } @@ -1182,8 +1176,8 @@ static COMPUTE_STARTUP_BUCKETS: Lazy<[f64; 28]> = Lazy::new(|| { .map(|ms| (ms as f64) / 1000.0) }); -pub struct BasebackupQueryTime(HistogramVec); -pub static BASEBACKUP_QUERY_TIME: Lazy = Lazy::new(|| { +pub(crate) struct BasebackupQueryTime(HistogramVec); +pub(crate) static BASEBACKUP_QUERY_TIME: Lazy = Lazy::new(|| { BasebackupQueryTime({ register_histogram_vec!( "pageserver_basebackup_query_seconds", @@ -1203,7 +1197,7 @@ impl DurationResultObserver for BasebackupQueryTime { } } -pub static LIVE_CONNECTIONS_COUNT: Lazy = Lazy::new(|| { +pub(crate) static LIVE_CONNECTIONS_COUNT: Lazy = Lazy::new(|| { register_int_gauge_vec!( "pageserver_live_connections", "Number of live network connections", @@ -1370,6 +1364,8 @@ pub(crate) struct SecondaryModeMetrics { pub(crate) upload_heatmap: IntCounter, pub(crate) upload_heatmap_errors: IntCounter, pub(crate) upload_heatmap_duration: Histogram, + pub(crate) download_heatmap: IntCounter, + 
pub(crate) download_layer: IntCounter, } pub(crate) static SECONDARY_MODE: Lazy = Lazy::new(|| SecondaryModeMetrics { upload_heatmap: register_int_counter!( @@ -1387,6 +1383,16 @@ pub(crate) static SECONDARY_MODE: Lazy = Lazy::new(|| Seco "Time to build and upload a heatmap, including any waiting inside the S3 client" ) .expect("failed to define a metric"), + download_heatmap: register_int_counter!( + "pageserver_secondary_download_heatmap", + "Number of downloads of heatmaps by secondary mode locations" + ) + .expect("failed to define a metric"), + download_layer: register_int_counter!( + "pageserver_secondary_download_layer", + "Number of downloads of layers by secondary mode locations" + ) + .expect("failed to define a metric"), }); #[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)] @@ -1656,7 +1662,7 @@ pub(crate) static WAL_REDO_PROCESS_COUNTERS: Lazy = Lazy::new(WalRedoProcessCounters::default); /// Similar to `prometheus::HistogramTimer` but does not record on drop. -pub struct StorageTimeMetricsTimer { +pub(crate) struct StorageTimeMetricsTimer { metrics: StorageTimeMetrics, start: Instant, } @@ -1681,7 +1687,7 @@ impl StorageTimeMetricsTimer { /// Timing facilities for an globally histogrammed metric, which is supported by per tenant and /// timeline total sum and count. 
#[derive(Clone, Debug)] -pub struct StorageTimeMetrics { +pub(crate) struct StorageTimeMetrics { /// Sum of f64 seconds, per operation, tenant_id and timeline_id timeline_sum: Counter, /// Number of oeprations, per operation, tenant_id and timeline_id @@ -1720,7 +1726,7 @@ impl StorageTimeMetrics { } #[derive(Debug)] -pub struct TimelineMetrics { +pub(crate) struct TimelineMetrics { tenant_id: String, shard_id: String, timeline_id: String, @@ -1928,7 +1934,7 @@ impl Drop for PerTimelineRemotePhysicalSizeGauge { } } -pub struct RemoteTimelineClientMetrics { +pub(crate) struct RemoteTimelineClientMetrics { tenant_id: String, timeline_id: String, remote_physical_size_gauge: Mutex>, @@ -2226,7 +2232,7 @@ impl Drop for RemoteTimelineClientMetrics { /// Wrapper future that measures the time spent by a remote storage operation, /// and records the time and success/failure as a prometheus metric. -pub trait MeasureRemoteOp: Sized { +pub(crate) trait MeasureRemoteOp: Sized { fn measure_remote_op( self, tenant_id: TenantId, @@ -2251,7 +2257,7 @@ pub trait MeasureRemoteOp: Sized { impl MeasureRemoteOp for T {} pin_project! 
{ - pub struct MeasuredRemoteOp + pub(crate) struct MeasuredRemoteOp { #[pin] inner: F, diff --git a/pageserver/src/page_service.rs b/pageserver/src/page_service.rs index d5ca7f7382..291490d016 100644 --- a/pageserver/src/page_service.rs +++ b/pageserver/src/page_service.rs @@ -25,6 +25,7 @@ use postgres_backend::{self, is_expected_io_error, AuthType, PostgresBackend, Qu use pq_proto::framed::ConnectionError; use pq_proto::FeStartupPacket; use pq_proto::{BeMessage, FeMessage, RowDescriptor}; +use std::borrow::Cow; use std::io; use std::net::TcpListener; use std::pin::pin; @@ -53,7 +54,7 @@ use crate::context::{DownloadBehavior, RequestContext}; use crate::import_datadir::import_wal_from_tar; use crate::metrics; use crate::metrics::LIVE_CONNECTIONS_COUNT; -use crate::pgdatadir_mapping::rel_block_to_key; +use crate::pgdatadir_mapping::{rel_block_to_key, Version}; use crate::task_mgr; use crate::task_mgr::TaskKind; use crate::tenant::debug_assert_current_span_has_tenant_and_timeline_id; @@ -61,6 +62,9 @@ use crate::tenant::mgr; use crate::tenant::mgr::get_active_tenant_with_timeout; use crate::tenant::mgr::GetActiveTenantError; use crate::tenant::mgr::ShardSelector; +use crate::tenant::timeline::WaitLsnError; +use crate::tenant::GetTimelineError; +use crate::tenant::PageReconstructError; use crate::tenant::Timeline; use crate::trace::Tracer; @@ -283,6 +287,64 @@ struct PageServerHandler { connection_ctx: RequestContext, } +#[derive(thiserror::Error, Debug)] +enum PageStreamError { + /// We encountered an error that should prompt the client to reconnect: + /// in practice this means we drop the connection without sending a response. 
+ #[error("Reconnect required: {0}")] + Reconnect(Cow<'static, str>), + + /// We were instructed to shutdown while processing the query + #[error("Shutting down")] + Shutdown, + + /// Something went wrong reading a page: this likely indicates a pageserver bug + #[error("Read error: {0}")] + Read(PageReconstructError), + + /// Ran out of time waiting for an LSN + #[error("LSN timeout: {0}")] + LsnTimeout(WaitLsnError), + + /// The entity required to serve the request (tenant or timeline) is not found, + /// or is not found in a suitable state to serve a request. + #[error("Not found: {0}")] + NotFound(std::borrow::Cow<'static, str>), + + /// Request asked for something that doesn't make sense, like an invalid LSN + #[error("Bad request: {0}")] + BadRequest(std::borrow::Cow<'static, str>), +} + +impl From for PageStreamError { + fn from(value: PageReconstructError) -> Self { + match value { + PageReconstructError::Cancelled => Self::Shutdown, + e => Self::Read(e), + } + } +} + +impl From for PageStreamError { + fn from(value: GetActiveTimelineError) -> Self { + match value { + GetActiveTimelineError::Tenant(GetActiveTenantError::Cancelled) => Self::Shutdown, + GetActiveTimelineError::Tenant(e) => Self::NotFound(format!("{e}").into()), + GetActiveTimelineError::Timeline(e) => Self::NotFound(format!("{e}").into()), + } + } +} + +impl From for PageStreamError { + fn from(value: WaitLsnError) -> Self { + match value { + e @ WaitLsnError::Timeout(_) => Self::LsnTimeout(e), + WaitLsnError::Shutdown => Self::Shutdown, + WaitLsnError::BadState => Self::Reconnect("Timeline is not active".into()), + } + } +} + impl PageServerHandler { pub fn new( conf: &'static PageServerConf, @@ -428,7 +490,7 @@ impl PageServerHandler { // Check that the timeline exists let timeline = tenant .get_timeline(timeline_id, true) - .map_err(|e| anyhow::anyhow!(e))?; + .map_err(|e| QueryError::NotFound(format!("{e}").into()))?; // Avoid starting new requests if the timeline has already started 
shutting down, // and block timeline shutdown until this request is complete, or drops out due @@ -520,32 +582,44 @@ impl PageServerHandler { } }; - if let Err(e) = &response { - // Requests may fail as soon as we are Stopping, even if the Timeline's cancellation token wasn't fired yet, - // because wait_lsn etc will drop out - // is_stopping(): [`Timeline::flush_and_shutdown`] has entered - // is_canceled(): [`Timeline::shutdown`]` has entered - if timeline.cancel.is_cancelled() || timeline.is_stopping() { + match response { + Err(PageStreamError::Shutdown) => { // If we fail to fulfil a request during shutdown, which may be _because_ of // shutdown, then do not send the error to the client. Instead just drop the // connection. - span.in_scope(|| info!("dropped response during shutdown: {e:#}")); + span.in_scope(|| info!("dropping connection due to shutdown")); return Err(QueryError::Shutdown); } + Err(PageStreamError::Reconnect(reason)) => { + span.in_scope(|| info!("handler requested reconnect: {reason}")); + return Err(QueryError::Reconnect); + } + Err(e) if timeline.cancel.is_cancelled() || timeline.is_stopping() => { + // This branch accomodates code within request handlers that returns an anyhow::Error instead of a clean + // shutdown error, this may be buried inside a PageReconstructError::Other for example. + // + // Requests may fail as soon as we are Stopping, even if the Timeline's cancellation token wasn't fired yet, + // because wait_lsn etc will drop out + // is_stopping(): [`Timeline::flush_and_shutdown`] has entered + // is_canceled(): [`Timeline::shutdown`]` has entered + span.in_scope(|| info!("dropped error response during shutdown: {e:#}")); + return Err(QueryError::Shutdown); + } + r => { + let response_msg = r.unwrap_or_else(|e| { + // print the all details to the log with {:#}, but for the client the + // error message is enough. Do not log if shutting down, as the anyhow::Error + // here includes cancellation which is not an error. 
+ span.in_scope(|| error!("error reading relation or page version: {:#}", e)); + PagestreamBeMessage::Error(PagestreamErrorResponse { + message: e.to_string(), + }) + }); + + pgb.write_message_noflush(&BeMessage::CopyData(&response_msg.serialize()))?; + self.flush_cancellable(pgb, &timeline.cancel).await?; + } } - - let response = response.unwrap_or_else(|e| { - // print the all details to the log with {:#}, but for the client the - // error message is enough. Do not log if shutting down, as the anyhow::Error - // here includes cancellation which is not an error. - span.in_scope(|| error!("error reading relation or page version: {:#}", e)); - PagestreamBeMessage::Error(PagestreamErrorResponse { - message: e.to_string(), - }) - }); - - pgb.write_message_noflush(&BeMessage::CopyData(&response.serialize()))?; - self.flush_cancellable(pgb, &timeline.cancel).await?; } Ok(()) } @@ -692,7 +766,7 @@ impl PageServerHandler { latest: bool, latest_gc_cutoff_lsn: &RcuReadGuard, ctx: &RequestContext, - ) -> anyhow::Result { + ) -> Result { if latest { // Latest page version was requested. If LSN is given, it is a hint // to the page server that there have been no modifications to the @@ -723,15 +797,19 @@ impl PageServerHandler { } } else { if lsn == Lsn(0) { - anyhow::bail!("invalid LSN(0) in request"); + return Err(PageStreamError::BadRequest( + "invalid LSN(0) in request".into(), + )); } timeline.wait_lsn(lsn, ctx).await?; } - anyhow::ensure!( - lsn >= **latest_gc_cutoff_lsn, - "tried to request a page version that was garbage collected. requested at {} gc cutoff {}", - lsn, **latest_gc_cutoff_lsn - ); + + if lsn < **latest_gc_cutoff_lsn { + return Err(PageStreamError::BadRequest(format!( + "tried to request a page version that was garbage collected. 
requested at {} gc cutoff {}", + lsn, **latest_gc_cutoff_lsn + ).into())); + } Ok(lsn) } @@ -740,14 +818,14 @@ impl PageServerHandler { timeline: &Timeline, req: &PagestreamExistsRequest, ctx: &RequestContext, - ) -> anyhow::Result { + ) -> Result { let latest_gc_cutoff_lsn = timeline.get_latest_gc_cutoff_lsn(); let lsn = Self::wait_or_get_last_lsn(timeline, req.lsn, req.latest, &latest_gc_cutoff_lsn, ctx) .await?; let exists = timeline - .get_rel_exists(req.rel, lsn, req.latest, ctx) + .get_rel_exists(req.rel, Version::Lsn(lsn), req.latest, ctx) .await?; Ok(PagestreamBeMessage::Exists(PagestreamExistsResponse { @@ -760,13 +838,15 @@ impl PageServerHandler { timeline: &Timeline, req: &PagestreamNblocksRequest, ctx: &RequestContext, - ) -> anyhow::Result { + ) -> Result { let latest_gc_cutoff_lsn = timeline.get_latest_gc_cutoff_lsn(); let lsn = Self::wait_or_get_last_lsn(timeline, req.lsn, req.latest, &latest_gc_cutoff_lsn, ctx) .await?; - let n_blocks = timeline.get_rel_size(req.rel, lsn, req.latest, ctx).await?; + let n_blocks = timeline + .get_rel_size(req.rel, Version::Lsn(lsn), req.latest, ctx) + .await?; Ok(PagestreamBeMessage::Nblocks(PagestreamNblocksResponse { n_blocks, @@ -778,14 +858,20 @@ impl PageServerHandler { timeline: &Timeline, req: &PagestreamDbSizeRequest, ctx: &RequestContext, - ) -> anyhow::Result { + ) -> Result { let latest_gc_cutoff_lsn = timeline.get_latest_gc_cutoff_lsn(); let lsn = Self::wait_or_get_last_lsn(timeline, req.lsn, req.latest, &latest_gc_cutoff_lsn, ctx) .await?; let total_blocks = timeline - .get_db_size(DEFAULTTABLESPACE_OID, req.dbnode, lsn, req.latest, ctx) + .get_db_size( + DEFAULTTABLESPACE_OID, + req.dbnode, + Version::Lsn(lsn), + req.latest, + ctx, + ) .await?; let db_size = total_blocks as i64 * BLCKSZ as i64; @@ -794,30 +880,35 @@ impl PageServerHandler { })) } + async fn do_handle_get_page_at_lsn_request( + &self, + timeline: &Timeline, + req: &PagestreamGetPageRequest, + ctx: &RequestContext, + ) -> Result { + let 
latest_gc_cutoff_lsn = timeline.get_latest_gc_cutoff_lsn(); + let lsn = + Self::wait_or_get_last_lsn(timeline, req.lsn, req.latest, &latest_gc_cutoff_lsn, ctx) + .await?; + let page = timeline + .get_rel_page_at_lsn(req.rel, req.blkno, Version::Lsn(lsn), req.latest, ctx) + .await?; + + Ok(PagestreamBeMessage::GetPage(PagestreamGetPageResponse { + page, + })) + } + async fn handle_get_page_at_lsn_request( &self, timeline: &Timeline, req: &PagestreamGetPageRequest, ctx: &RequestContext, - ) -> anyhow::Result { - let latest_gc_cutoff_lsn = timeline.get_latest_gc_cutoff_lsn(); - let lsn = - Self::wait_or_get_last_lsn(timeline, req.lsn, req.latest, &latest_gc_cutoff_lsn, ctx) - .await?; - /* - // Add a 1s delay to some requests. The delay helps the requests to - // hit the race condition from github issue #1047 more easily. - use rand::Rng; - if rand::thread_rng().gen::() < 5 { - std::thread::sleep(std::time::Duration::from_millis(1000)); - } - */ - + ) -> Result { let key = rel_block_to_key(req.rel, req.blkno); - let page = if timeline.get_shard_identity().is_key_local(&key) { - timeline - .get_rel_page_at_lsn(req.rel, req.blkno, lsn, req.latest, ctx) - .await? + if timeline.get_shard_identity().is_key_local(&key) { + self.do_handle_get_page_at_lsn_request(timeline, req, ctx) + .await } else { // The Tenant shard we looked up at connection start does not hold this particular // key: look for other shards in this tenant. This scenario occurs if a pageserver @@ -836,30 +927,30 @@ impl PageServerHandler { Err(GetActiveTimelineError::Tenant(GetActiveTenantError::NotFound(_))) => { // We already know this tenant exists in general, because we resolved it at // start of connection. Getting a NotFound here indicates that the shard containing - // the requested page is not present on this node. 
- - // TODO: this should be some kind of structured error that the client will understand, - // so that it can block until its config is updated: this error is expected in the case - // that the Tenant's shards' placements are being updated and the client hasn't been - // informed yet. - // - // https://github.com/neondatabase/neon/issues/6038 - return Err(anyhow::anyhow!("Request routed to wrong shard")); + // the requested page is not present on this node: the client's knowledge of shard->pageserver + // mapping is out of date. + tracing::info!("Page request routed to wrong shard: my identity {:?}, should go to shard {}, key {}", + timeline.get_shard_identity(), timeline.get_shard_identity().get_shard_number(&key).0, key); + // Closing the connection by returning ``::Reconnect` has the side effect of rate-limiting above message, via + // client's reconnect backoff, as well as hopefully prompting the client to load its updated configuration + // and talk to a different pageserver. + return Err(PageStreamError::Reconnect( + "getpage@lsn request routed to wrong shard".into(), + )); } Err(e) => return Err(e.into()), }; // Take a GateGuard for the duration of this request. If we were using our main Timeline object, // the GateGuard was already held over the whole connection. - let _timeline_guard = timeline.gate.enter().map_err(|_| QueryError::Shutdown)?; - timeline - .get_rel_page_at_lsn(req.rel, req.blkno, lsn, req.latest, ctx) - .await? 
- }; + let _timeline_guard = timeline + .gate + .enter() + .map_err(|_| PageStreamError::Shutdown)?; - Ok(PagestreamBeMessage::GetPage(PagestreamGetPageResponse { - page, - })) + self.do_handle_get_page_at_lsn_request(&timeline, req, ctx) + .await + } } #[allow(clippy::too_many_arguments)] @@ -1000,9 +1091,7 @@ impl PageServerHandler { ) .await .map_err(GetActiveTimelineError::Tenant)?; - let timeline = tenant - .get_timeline(timeline_id, true) - .map_err(|e| GetActiveTimelineError::Timeline(anyhow::anyhow!(e)))?; + let timeline = tenant.get_timeline(timeline_id, true)?; Ok(timeline) } } @@ -1424,14 +1513,15 @@ enum GetActiveTimelineError { #[error(transparent)] Tenant(GetActiveTenantError), #[error(transparent)] - Timeline(anyhow::Error), + Timeline(#[from] GetTimelineError), } impl From for QueryError { fn from(e: GetActiveTimelineError) -> Self { match e { + GetActiveTimelineError::Tenant(GetActiveTenantError::Cancelled) => QueryError::Shutdown, GetActiveTimelineError::Tenant(e) => e.into(), - GetActiveTimelineError::Timeline(e) => QueryError::Other(e), + GetActiveTimelineError::Timeline(e) => QueryError::NotFound(format!("{e}").into()), } } } diff --git a/pageserver/src/pgdatadir_mapping.rs b/pageserver/src/pgdatadir_mapping.rs index b81037ae47..f11a72f2ab 100644 --- a/pageserver/src/pgdatadir_mapping.rs +++ b/pageserver/src/pgdatadir_mapping.rs @@ -11,7 +11,7 @@ use crate::context::RequestContext; use crate::keyspace::{KeySpace, KeySpaceAccum}; use crate::repository::*; use crate::walrecord::NeonWalRecord; -use anyhow::Context; +use anyhow::{ensure, Context}; use bytes::{Buf, Bytes}; use pageserver_api::key::is_rel_block_key; use pageserver_api::reltag::{RelTag, SlruKind}; @@ -147,6 +147,7 @@ impl Timeline { { DatadirModification { tline: self, + pending_lsns: Vec::new(), pending_updates: HashMap::new(), pending_deletions: Vec::new(), pending_nblocks: 0, @@ -159,11 +160,11 @@ impl Timeline { 
//------------------------------------------------------------------------------ /// Look up given page version. - pub async fn get_rel_page_at_lsn( + pub(crate) async fn get_rel_page_at_lsn( &self, tag: RelTag, blknum: BlockNumber, - lsn: Lsn, + version: Version<'_>, latest: bool, ctx: &RequestContext, ) -> Result { @@ -173,44 +174,47 @@ impl Timeline { )); } - let nblocks = self.get_rel_size(tag, lsn, latest, ctx).await?; + let nblocks = self.get_rel_size(tag, version, latest, ctx).await?; if blknum >= nblocks { debug!( "read beyond EOF at {} blk {} at {}, size is {}: returning all-zeros page", - tag, blknum, lsn, nblocks + tag, + blknum, + version.get_lsn(), + nblocks ); return Ok(ZERO_PAGE.clone()); } let key = rel_block_to_key(tag, blknum); - self.get(key, lsn, ctx).await + version.get(self, key, ctx).await } // Get size of a database in blocks - pub async fn get_db_size( + pub(crate) async fn get_db_size( &self, spcnode: Oid, dbnode: Oid, - lsn: Lsn, + version: Version<'_>, latest: bool, ctx: &RequestContext, ) -> Result { let mut total_blocks = 0; - let rels = self.list_rels(spcnode, dbnode, lsn, ctx).await?; + let rels = self.list_rels(spcnode, dbnode, version, ctx).await?; for rel in rels { - let n_blocks = self.get_rel_size(rel, lsn, latest, ctx).await?; + let n_blocks = self.get_rel_size(rel, version, latest, ctx).await?; total_blocks += n_blocks as usize; } Ok(total_blocks) } /// Get size of a relation file - pub async fn get_rel_size( + pub(crate) async fn get_rel_size( &self, tag: RelTag, - lsn: Lsn, + version: Version<'_>, latest: bool, ctx: &RequestContext, ) -> Result { @@ -220,12 +224,12 @@ impl Timeline { )); } - if let Some(nblocks) = self.get_cached_rel_size(&tag, lsn) { + if let Some(nblocks) = self.get_cached_rel_size(&tag, version.get_lsn()) { return Ok(nblocks); } if (tag.forknum == FSM_FORKNUM || tag.forknum == VISIBILITYMAP_FORKNUM) - && !self.get_rel_exists(tag, lsn, latest, ctx).await? 
+ && !self.get_rel_exists(tag, version, latest, ctx).await? { // FIXME: Postgres sometimes calls smgrcreate() to create // FSM, and smgrnblocks() on it immediately afterwards, @@ -235,7 +239,7 @@ impl Timeline { } let key = rel_size_to_key(tag); - let mut buf = self.get(key, lsn, ctx).await?; + let mut buf = version.get(self, key, ctx).await?; let nblocks = buf.get_u32_le(); if latest { @@ -246,16 +250,16 @@ impl Timeline { // latest=true, then it can not cause cache corruption, because with latest=true // pageserver choose max(request_lsn, last_written_lsn) and so cached value will be // associated with most recent value of LSN. - self.update_cached_rel_size(tag, lsn, nblocks); + self.update_cached_rel_size(tag, version.get_lsn(), nblocks); } Ok(nblocks) } /// Does relation exist? - pub async fn get_rel_exists( + pub(crate) async fn get_rel_exists( &self, tag: RelTag, - lsn: Lsn, + version: Version<'_>, _latest: bool, ctx: &RequestContext, ) -> Result { @@ -266,12 +270,12 @@ impl Timeline { } // first try to lookup relation in cache - if let Some(_nblocks) = self.get_cached_rel_size(&tag, lsn) { + if let Some(_nblocks) = self.get_cached_rel_size(&tag, version.get_lsn()) { return Ok(true); } // fetch directory listing let key = rel_dir_to_key(tag.spcnode, tag.dbnode); - let buf = self.get(key, lsn, ctx).await?; + let buf = version.get(self, key, ctx).await?; match RelDirectory::des(&buf).context("deserialization failure") { Ok(dir) => { @@ -287,16 +291,16 @@ impl Timeline { /// # Cancel-Safety /// /// This method is cancellation-safe. 
- pub async fn list_rels( + pub(crate) async fn list_rels( &self, spcnode: Oid, dbnode: Oid, - lsn: Lsn, + version: Version<'_>, ctx: &RequestContext, ) -> Result, PageReconstructError> { // fetch directory listing let key = rel_dir_to_key(spcnode, dbnode); - let buf = self.get(key, lsn, ctx).await?; + let buf = version.get(self, key, ctx).await?; match RelDirectory::des(&buf).context("deserialization failure") { Ok(dir) => { @@ -315,7 +319,7 @@ impl Timeline { } /// Look up given SLRU page version. - pub async fn get_slru_page_at_lsn( + pub(crate) async fn get_slru_page_at_lsn( &self, kind: SlruKind, segno: u32, @@ -328,29 +332,29 @@ impl Timeline { } /// Get size of an SLRU segment - pub async fn get_slru_segment_size( + pub(crate) async fn get_slru_segment_size( &self, kind: SlruKind, segno: u32, - lsn: Lsn, + version: Version<'_>, ctx: &RequestContext, ) -> Result { let key = slru_segment_size_to_key(kind, segno); - let mut buf = self.get(key, lsn, ctx).await?; + let mut buf = version.get(self, key, ctx).await?; Ok(buf.get_u32_le()) } /// Get size of an SLRU segment - pub async fn get_slru_segment_exists( + pub(crate) async fn get_slru_segment_exists( &self, kind: SlruKind, segno: u32, - lsn: Lsn, + version: Version<'_>, ctx: &RequestContext, ) -> Result { // fetch directory listing let key = slru_dir_to_key(kind); - let buf = self.get(key, lsn, ctx).await?; + let buf = version.get(self, key, ctx).await?; match SlruSegmentDirectory::des(&buf).context("deserialization failure") { Ok(dir) => { @@ -368,7 +372,7 @@ impl Timeline { /// so it's not well defined which LSN you get if there were multiple commits /// "in flight" at that point in time. /// - pub async fn find_lsn_for_timestamp( + pub(crate) async fn find_lsn_for_timestamp( &self, search_timestamp: TimestampTz, cancel: &CancellationToken, @@ -448,7 +452,7 @@ impl Timeline { /// Additionally, sets 'found_smaller'/'found_Larger, if encounters any commits /// with a smaller/larger timestamp. 
/// - pub async fn is_latest_commit_timestamp_ge_than( + pub(crate) async fn is_latest_commit_timestamp_ge_than( &self, search_timestamp: TimestampTz, probe_lsn: Lsn, @@ -471,7 +475,7 @@ impl Timeline { /// Obtain the possible timestamp range for the given lsn. /// /// If the lsn has no timestamps, returns None. returns `(min, max, median)` if it has timestamps. - pub async fn get_timestamp_for_lsn( + pub(crate) async fn get_timestamp_for_lsn( &self, probe_lsn: Lsn, ctx: &RequestContext, @@ -501,11 +505,11 @@ impl Timeline { mut f: impl FnMut(TimestampTz) -> ControlFlow, ) -> Result { for segno in self - .list_slru_segments(SlruKind::Clog, probe_lsn, ctx) + .list_slru_segments(SlruKind::Clog, Version::Lsn(probe_lsn), ctx) .await? { let nblocks = self - .get_slru_segment_size(SlruKind::Clog, segno, probe_lsn, ctx) + .get_slru_segment_size(SlruKind::Clog, segno, Version::Lsn(probe_lsn), ctx) .await?; for blknum in (0..nblocks).rev() { let clog_page = self @@ -528,36 +532,36 @@ impl Timeline { } /// Get a list of SLRU segments - pub async fn list_slru_segments( + pub(crate) async fn list_slru_segments( &self, kind: SlruKind, - lsn: Lsn, + version: Version<'_>, ctx: &RequestContext, ) -> Result, PageReconstructError> { // fetch directory entry let key = slru_dir_to_key(kind); - let buf = self.get(key, lsn, ctx).await?; + let buf = version.get(self, key, ctx).await?; match SlruSegmentDirectory::des(&buf).context("deserialization failure") { Ok(dir) => Ok(dir.segments), Err(e) => Err(PageReconstructError::from(e)), } } - pub async fn get_relmap_file( + pub(crate) async fn get_relmap_file( &self, spcnode: Oid, dbnode: Oid, - lsn: Lsn, + version: Version<'_>, ctx: &RequestContext, ) -> Result { let key = relmap_file_key(spcnode, dbnode); - let buf = self.get(key, lsn, ctx).await?; + let buf = version.get(self, key, ctx).await?; Ok(buf) } - pub async fn list_dbdirs( + pub(crate) async fn list_dbdirs( &self, lsn: Lsn, ctx: &RequestContext, @@ -571,7 +575,7 @@ impl Timeline { 
} } - pub async fn get_twophase_file( + pub(crate) async fn get_twophase_file( &self, xid: TransactionId, lsn: Lsn, @@ -582,7 +586,7 @@ impl Timeline { Ok(buf) } - pub async fn list_twophase_files( + pub(crate) async fn list_twophase_files( &self, lsn: Lsn, ctx: &RequestContext, @@ -596,7 +600,7 @@ impl Timeline { } } - pub async fn get_control_file( + pub(crate) async fn get_control_file( &self, lsn: Lsn, ctx: &RequestContext, @@ -604,7 +608,7 @@ impl Timeline { self.get(CONTROLFILE_KEY, lsn, ctx).await } - pub async fn get_checkpoint( + pub(crate) async fn get_checkpoint( &self, lsn: Lsn, ctx: &RequestContext, @@ -612,7 +616,7 @@ impl Timeline { self.get(CHECKPOINT_KEY, lsn, ctx).await } - pub async fn list_aux_files( + pub(crate) async fn list_aux_files( &self, lsn: Lsn, ctx: &RequestContext, @@ -652,7 +656,10 @@ impl Timeline { let mut total_size: u64 = 0; for (spcnode, dbnode) in dbdir.dbdirs.keys() { - for rel in self.list_rels(*spcnode, *dbnode, lsn, ctx).await? { + for rel in self + .list_rels(*spcnode, *dbnode, Version::Lsn(lsn), ctx) + .await? + { if self.cancel.is_cancelled() { return Err(CalculateLogicalSizeError::Cancelled); } @@ -692,7 +699,7 @@ impl Timeline { result.add_key(rel_dir_to_key(spcnode, dbnode)); let mut rels: Vec = self - .list_rels(spcnode, dbnode, lsn, ctx) + .list_rels(spcnode, dbnode, Version::Lsn(lsn), ctx) .await? .into_iter() .collect(); @@ -799,18 +806,39 @@ pub struct DatadirModification<'a> { /// in the state in 'tline' yet. pub tline: &'a Timeline, - /// Lsn assigned by begin_modification - pub lsn: Lsn, + /// Current LSN of the modification + lsn: Lsn, // The modifications are not applied directly to the underlying key-value store. // The put-functions add the modifications here, and they are flushed to the // underlying key-value store by the 'finish' function. 
- pending_updates: HashMap, - pending_deletions: Vec>, + pending_lsns: Vec, + pending_updates: HashMap>, + pending_deletions: Vec<(Range, Lsn)>, pending_nblocks: i64, } impl<'a> DatadirModification<'a> { + /// Get the current lsn + pub(crate) fn get_lsn(&self) -> Lsn { + self.lsn + } + + /// Set the current lsn + pub(crate) fn set_lsn(&mut self, lsn: Lsn) -> anyhow::Result<()> { + ensure!( + lsn >= self.lsn, + "setting an older lsn {} than {} is not allowed", + lsn, + self.lsn + ); + if lsn > self.lsn { + self.pending_lsns.push(self.lsn); + self.lsn = lsn; + } + Ok(()) + } + /// Initialize a completely new repository. /// /// This inserts the directory metadata entries that are assumed to @@ -984,11 +1012,9 @@ impl<'a> DatadirModification<'a> { dbnode: Oid, ctx: &RequestContext, ) -> anyhow::Result<()> { - let req_lsn = self.tline.get_last_record_lsn(); - let total_blocks = self .tline - .get_db_size(spcnode, dbnode, req_lsn, true, ctx) + .get_db_size(spcnode, dbnode, Version::Modified(self), true, ctx) .await?; // Remove entry from dbdir @@ -1077,8 +1103,11 @@ impl<'a> DatadirModification<'a> { ctx: &RequestContext, ) -> anyhow::Result<()> { anyhow::ensure!(rel.relnode != 0, RelationError::InvalidRelnode); - let last_lsn = self.tline.get_last_record_lsn(); - if self.tline.get_rel_exists(rel, last_lsn, true, ctx).await? { + if self + .tline + .get_rel_exists(rel, Version::Modified(self), true, ctx) + .await? + { let size_key = rel_size_to_key(rel); // Fetch the old size first let old_size = self.get(size_key, ctx).await?.get_u32_le(); @@ -1323,17 +1352,23 @@ impl<'a> DatadirModification<'a> { let writer = self.tline.writer().await; // Flush relation and SLRU data blocks, keep metadata. - let mut retained_pending_updates = HashMap::new(); - for (key, value) in self.pending_updates.drain() { - if is_rel_block_key(&key) || is_slru_block_key(key) { - // This bails out on first error without modifying pending_updates. - // That's Ok, cf this function's doc comment. 
- writer.put(key, self.lsn, &value, ctx).await?; - } else { - retained_pending_updates.insert(key, value); + let mut retained_pending_updates = HashMap::<_, Vec<_>>::new(); + for (key, values) in self.pending_updates.drain() { + for (lsn, value) in values { + if is_rel_block_key(&key) || is_slru_block_key(key) { + // This bails out on first error without modifying pending_updates. + // That's Ok, cf this function's doc comment. + writer.put(key, lsn, &value, ctx).await?; + } else { + retained_pending_updates + .entry(key) + .or_default() + .push((lsn, value)); + } } } - self.pending_updates.extend(retained_pending_updates); + + self.pending_updates = retained_pending_updates; if pending_nblocks != 0 { writer.update_current_logical_size(pending_nblocks * i64::from(BLCKSZ)); @@ -1350,18 +1385,28 @@ impl<'a> DatadirModification<'a> { /// pub async fn commit(&mut self, ctx: &RequestContext) -> anyhow::Result<()> { let writer = self.tline.writer().await; - let lsn = self.lsn; + let pending_nblocks = self.pending_nblocks; self.pending_nblocks = 0; - for (key, value) in self.pending_updates.drain() { - writer.put(key, lsn, &value, ctx).await?; - } - for key_range in self.pending_deletions.drain(..) { - writer.delete(key_range, lsn).await?; + if !self.pending_updates.is_empty() { + writer.put_batch(&self.pending_updates, ctx).await?; + self.pending_updates.clear(); } - writer.finish_write(lsn); + if !self.pending_deletions.is_empty() { + writer.delete_batch(&self.pending_deletions).await?; + self.pending_deletions.clear(); + } + + self.pending_lsns.push(self.lsn); + for pending_lsn in self.pending_lsns.drain(..) { + // Ideally, we should be able to call writer.finish_write() only once + // with the highest LSN. However, the last_record_lsn variable in the + // timeline keeps track of the latest LSN and the immediate previous LSN + // so we need to record every LSN to not leave a gap between them. 
+ writer.finish_write(pending_lsn); + } if pending_nblocks != 0 { writer.update_current_logical_size(pending_nblocks * i64::from(BLCKSZ)); @@ -1370,44 +1415,86 @@ impl<'a> DatadirModification<'a> { Ok(()) } - pub(crate) fn is_empty(&self) -> bool { - self.pending_updates.is_empty() && self.pending_deletions.is_empty() + pub(crate) fn len(&self) -> usize { + self.pending_updates.len() + self.pending_deletions.len() } // Internal helper functions to batch the modifications async fn get(&self, key: Key, ctx: &RequestContext) -> Result { - // Have we already updated the same key? Read the pending updated + // Have we already updated the same key? Read the latest pending updated // version in that case. // // Note: we don't check pending_deletions. It is an error to request a // value that has been removed, deletion only avoids leaking storage. - if let Some(value) = self.pending_updates.get(&key) { - if let Value::Image(img) = value { - Ok(img.clone()) - } else { - // Currently, we never need to read back a WAL record that we - // inserted in the same "transaction". All the metadata updates - // work directly with Images, and we never need to read actual - // data pages. We could handle this if we had to, by calling - // the walredo manager, but let's keep it simple for now. - Err(PageReconstructError::from(anyhow::anyhow!( - "unexpected pending WAL record" - ))) + if let Some(values) = self.pending_updates.get(&key) { + if let Some((_, value)) = values.last() { + return if let Value::Image(img) = value { + Ok(img.clone()) + } else { + // Currently, we never need to read back a WAL record that we + // inserted in the same "transaction". All the metadata updates + // work directly with Images, and we never need to read actual + // data pages. We could handle this if we had to, by calling + // the walredo manager, but let's keep it simple for now. 
+ Err(PageReconstructError::from(anyhow::anyhow!( + "unexpected pending WAL record" + ))) + }; } - } else { - let lsn = Lsn::max(self.tline.get_last_record_lsn(), self.lsn); - self.tline.get(key, lsn, ctx).await } + let lsn = Lsn::max(self.tline.get_last_record_lsn(), self.lsn); + self.tline.get(key, lsn, ctx).await } fn put(&mut self, key: Key, val: Value) { - self.pending_updates.insert(key, val); + let values = self.pending_updates.entry(key).or_default(); + // Replace the previous value if it exists at the same lsn + if let Some((last_lsn, last_value)) = values.last_mut() { + if *last_lsn == self.lsn { + *last_value = val; + return; + } + } + values.push((self.lsn, val)); } fn delete(&mut self, key_range: Range) { trace!("DELETE {}-{}", key_range.start, key_range.end); - self.pending_deletions.push(key_range); + self.pending_deletions.push((key_range, self.lsn)); + } +} + +/// This struct facilitates accessing either a committed key from the timeline at a +/// specific LSN, or the latest uncommitted key from a pending modification. +/// During WAL ingestion, the records from multiple LSNs may be batched in the same +/// modification before being flushed to the timeline. Hence, the routines in WalIngest +/// need to look up the keys in the modification first before looking them up in the +/// timeline to not miss the latest updates. 
+#[derive(Clone, Copy)] +pub enum Version<'a> { + Lsn(Lsn), + Modified(&'a DatadirModification<'a>), +} + +impl<'a> Version<'a> { + async fn get( + &self, + timeline: &Timeline, + key: Key, + ctx: &RequestContext, + ) -> Result { + match self { + Version::Lsn(lsn) => timeline.get(key, *lsn, ctx).await, + Version::Modified(modification) => modification.get(key, ctx).await, + } + } + + fn get_lsn(&self) -> Lsn { + match self { + Version::Lsn(lsn) => *lsn, + Version::Modified(modification) => modification.lsn, + } } } @@ -1776,6 +1863,7 @@ pub fn is_inherited_key(key: Key) -> bool { key != AUX_FILES_KEY } +/// Guaranteed to return `Ok()` if [[is_rel_block_key]] returns `true` for `key`. pub fn key_to_rel_block(key: Key) -> anyhow::Result<(RelTag, BlockNumber)> { Ok(match key.field1 { 0x00 => ( @@ -1790,7 +1878,6 @@ pub fn key_to_rel_block(key: Key) -> anyhow::Result<(RelTag, BlockNumber)> { _ => anyhow::bail!("unexpected value kind 0x{:02x}", key.field1), }) } - pub fn is_rel_fsm_block_key(key: Key) -> bool { key.field1 == 0x00 && key.field4 != 0 && key.field5 == FSM_FORKNUM && key.field6 != 0xffffffff } diff --git a/pageserver/src/statvfs.rs b/pageserver/src/statvfs.rs index 08b5264290..45a516566f 100644 --- a/pageserver/src/statvfs.rs +++ b/pageserver/src/statvfs.rs @@ -23,7 +23,7 @@ impl Statvfs { } // NB: allow() because the block count type is u32 on macOS. - #[allow(clippy::useless_conversion)] + #[allow(clippy::useless_conversion, clippy::unnecessary_fallible_conversions)] pub fn blocks(&self) -> u64 { match self { Statvfs::Real(stat) => u64::try_from(stat.blocks()).unwrap(), @@ -32,7 +32,7 @@ impl Statvfs { } // NB: allow() because the block count type is u32 on macOS. 
- #[allow(clippy::useless_conversion)] + #[allow(clippy::useless_conversion, clippy::unnecessary_fallible_conversions)] pub fn blocks_available(&self) -> u64 { match self { Statvfs::Real(stat) => u64::try_from(stat.blocks_available()).unwrap(), diff --git a/pageserver/src/task_mgr.rs b/pageserver/src/task_mgr.rs index cb1b2b8011..5a06a97525 100644 --- a/pageserver/src/task_mgr.rs +++ b/pageserver/src/task_mgr.rs @@ -147,7 +147,7 @@ pub(crate) static BACKGROUND_RUNTIME_WORKER_THREADS: Lazy = Lazy::new(|| // else, but that has not been needed in a long time. std::env::var("TOKIO_WORKER_THREADS") .map(|s| s.parse::().unwrap()) - .unwrap_or_else(|_e| usize::max(1, num_cpus::get())) + .unwrap_or_else(|_e| usize::max(2, num_cpus::get())) }); #[derive(Debug, Clone, Copy)] @@ -258,6 +258,9 @@ pub enum TaskKind { /// See [`crate::disk_usage_eviction_task`]. DiskUsageEviction, + /// See [`crate::tenant::secondary`]. + SecondaryDownloads, + /// See [`crate::tenant::secondary`]. SecondaryUploads, diff --git a/pageserver/src/tenant.rs b/pageserver/src/tenant.rs index 1d6f1001db..371b7465eb 100644 --- a/pageserver/src/tenant.rs +++ b/pageserver/src/tenant.rs @@ -12,7 +12,7 @@ //! 
use anyhow::{bail, Context}; -use camino::{Utf8Path, Utf8PathBuf}; +use camino::Utf8Path; use enumset::EnumSet; use futures::stream::FuturesUnordered; use futures::FutureExt; @@ -33,6 +33,7 @@ use tracing::*; use utils::backoff; use utils::completion; use utils::crashsafe::path_with_suffix_extension; +use utils::failpoint_support; use utils::fs_ext; use utils::sync::gate::Gate; use utils::sync::gate::GateGuard; @@ -55,6 +56,7 @@ use self::timeline::uninit::TimelineUninitMark; use self::timeline::uninit::UninitializedTimeline; use self::timeline::EvictionTaskTenantState; use self::timeline::TimelineResources; +use self::timeline::WaitLsnError; use crate::config::PageServerConf; use crate::context::{DownloadBehavior, RequestContext}; use crate::deletion_queue::DeletionQueueClient; @@ -128,6 +130,13 @@ macro_rules! pausable_failpoint { .expect("spawn_blocking"); } }; + ($name:literal, $cond:expr) => { + if cfg!(feature = "testing") { + if $cond { + pausable_failpoint!($name) + } + } + }; } pub mod blob_io; @@ -594,10 +603,9 @@ impl Tenant { mode: SpawnMode, ctx: &RequestContext, ) -> anyhow::Result> { - // TODO(sharding): make WalRedoManager shard-aware let wal_redo_manager = Arc::new(WalRedoManager::from(PostgresRedoManager::new( conf, - tenant_shard_id.tenant_id, + tenant_shard_id, ))); let TenantSharedResources { @@ -890,7 +898,7 @@ impl Tenant { ) -> anyhow::Result<()> { span::debug_assert_current_span_has_tenant_id(); - crate::failpoint_support::sleep_millis_async!("before-attaching-tenant"); + failpoint_support::sleep_millis_async!("before-attaching-tenant"); let preload = match preload { Some(p) => p, @@ -1002,7 +1010,7 @@ impl Tenant { // IndexPart is the source of truth. 
self.clean_up_timelines(&existent_timelines)?; - crate::failpoint_support::sleep_millis_async!("attach-before-activate"); + failpoint_support::sleep_millis_async!("attach-before-activate", &self.cancel); info!("Done"); @@ -1144,10 +1152,9 @@ impl Tenant { tenant_shard_id: TenantShardId, reason: String, ) -> Arc { - // TODO(sharding): make WalRedoManager shard-aware let wal_redo_manager = Arc::new(WalRedoManager::from(PostgresRedoManager::new( conf, - tenant_shard_id.tenant_id, + tenant_shard_id, ))); Arc::new(Tenant::new( TenantState::Broken { @@ -1759,7 +1766,15 @@ impl Tenant { // decoding the new WAL might need to look up previous pages, relation // sizes etc. and that would get confused if the previous page versions // are not in the repository yet. - ancestor_timeline.wait_lsn(*lsn, ctx).await?; + ancestor_timeline + .wait_lsn(*lsn, ctx) + .await + .map_err(|e| match e { + e @ (WaitLsnError::Timeout(_) | WaitLsnError::BadState) => { + CreateTimelineError::AncestorLsn(anyhow::anyhow!(e)) + } + WaitLsnError::Shutdown => CreateTimelineError::ShuttingDown, + })?; } self.branch_timeline( @@ -2028,6 +2043,13 @@ impl Tenant { // It's mesed up. // we just ignore the failure to stop + // If we're still attaching, fire the cancellation token early to drop out: this + // will prevent us flushing, but ensures timely shutdown if some I/O during attach + // is very slow. + if matches!(self.current_state(), TenantState::Attaching) { + self.cancel.cancel(); + } + match self.set_stopping(shutdown_progress, false, false).await { Ok(()) => {} Err(SetStoppingError::Broken) => { @@ -2726,6 +2748,10 @@ impl Tenant { "# .to_string(); + fail::fail_point!("tenant-config-before-write", |_| { + anyhow::bail!("tenant-config-before-write"); + }); + // Convert the config to a toml file. 
conf_content += &toml_edit::ser::to_string_pretty(&location_conf)?; @@ -2839,9 +2865,7 @@ impl Tenant { } }; - crate::failpoint_support::sleep_millis_async!( - "gc_iteration_internal_after_getting_gc_timelines" - ); + failpoint_support::sleep_millis_async!("gc_iteration_internal_after_getting_gc_timelines"); // If there is nothing to GC, we don't want any messages in the INFO log. if !gc_timelines.is_empty() { @@ -3134,6 +3158,7 @@ impl Tenant { /// For unit tests, make this visible so that other modules can directly create timelines #[cfg(test)] + #[tracing::instrument(fields(tenant_id=%self.tenant_shard_id.tenant_id, shard_id=%self.tenant_shard_id.shard_slug(), %timeline_id))] pub(crate) async fn bootstrap_timeline_test( &self, timeline_id: TimelineId, @@ -3643,140 +3668,6 @@ fn remove_timeline_and_uninit_mark( Ok(()) } -pub(crate) async fn create_tenant_files( - conf: &'static PageServerConf, - location_conf: &LocationConf, - tenant_shard_id: &TenantShardId, -) -> anyhow::Result { - let target_tenant_directory = conf.tenant_path(tenant_shard_id); - anyhow::ensure!( - !target_tenant_directory - .try_exists() - .context("check existence of tenant directory")?, - "tenant directory already exists", - ); - - let temporary_tenant_dir = - path_with_suffix_extension(&target_tenant_directory, TEMP_FILE_SUFFIX); - debug!("Creating temporary directory structure in {temporary_tenant_dir}"); - - // top-level dir may exist if we are creating it through CLI - crashsafe::create_dir_all(&temporary_tenant_dir).with_context(|| { - format!("could not create temporary tenant directory {temporary_tenant_dir}") - })?; - - let creation_result = try_create_target_tenant_dir( - conf, - location_conf, - tenant_shard_id, - &temporary_tenant_dir, - &target_tenant_directory, - ) - .await; - - if creation_result.is_err() { - error!( - "Failed to create directory structure for tenant {tenant_shard_id}, cleaning tmp data" - ); - if let Err(e) = fs::remove_dir_all(&temporary_tenant_dir) { - 
error!("Failed to remove temporary tenant directory {temporary_tenant_dir:?}: {e}") - } else if let Err(e) = crashsafe::fsync(&temporary_tenant_dir) { - error!( - "Failed to fsync removed temporary tenant directory {temporary_tenant_dir:?}: {e}" - ) - } - } - - creation_result?; - - Ok(target_tenant_directory) -} - -async fn try_create_target_tenant_dir( - conf: &'static PageServerConf, - location_conf: &LocationConf, - tenant_shard_id: &TenantShardId, - temporary_tenant_dir: &Utf8Path, - target_tenant_directory: &Utf8Path, -) -> Result<(), anyhow::Error> { - let temporary_tenant_timelines_dir = rebase_directory( - &conf.timelines_path(tenant_shard_id), - target_tenant_directory, - temporary_tenant_dir, - ) - .with_context(|| format!("resolve tenant {tenant_shard_id} temporary timelines dir"))?; - let temporary_legacy_tenant_config_path = rebase_directory( - &conf.tenant_config_path(tenant_shard_id), - target_tenant_directory, - temporary_tenant_dir, - ) - .with_context(|| format!("resolve tenant {tenant_shard_id} temporary config path"))?; - let temporary_tenant_config_path = rebase_directory( - &conf.tenant_location_config_path(tenant_shard_id), - target_tenant_directory, - temporary_tenant_dir, - ) - .with_context(|| format!("resolve tenant {tenant_shard_id} temporary config path"))?; - - Tenant::persist_tenant_config_at( - tenant_shard_id, - &temporary_tenant_config_path, - &temporary_legacy_tenant_config_path, - location_conf, - ) - .await?; - - crashsafe::create_dir(&temporary_tenant_timelines_dir).with_context(|| { - format!( - "create tenant {} temporary timelines directory {}", - tenant_shard_id, temporary_tenant_timelines_dir, - ) - })?; - fail::fail_point!("tenant-creation-before-tmp-rename", |_| { - anyhow::bail!("failpoint tenant-creation-before-tmp-rename"); - }); - - // Make sure the current tenant directory entries are durable before renaming. - // Without this, a crash may reorder any of the directory entry creations above. 
- crashsafe::fsync(temporary_tenant_dir) - .with_context(|| format!("sync temporary tenant directory {temporary_tenant_dir:?}"))?; - - fs::rename(temporary_tenant_dir, target_tenant_directory).with_context(|| { - format!( - "move tenant {} temporary directory {} into the permanent one {}", - tenant_shard_id, temporary_tenant_dir, target_tenant_directory - ) - })?; - let target_dir_parent = target_tenant_directory.parent().with_context(|| { - format!( - "get tenant {} dir parent for {}", - tenant_shard_id, target_tenant_directory, - ) - })?; - crashsafe::fsync(target_dir_parent).with_context(|| { - format!( - "fsync renamed directory's parent {} for tenant {}", - target_dir_parent, tenant_shard_id, - ) - })?; - - Ok(()) -} - -fn rebase_directory( - original_path: &Utf8Path, - base: &Utf8Path, - new_base: &Utf8Path, -) -> anyhow::Result { - let relative_path = original_path.strip_prefix(base).with_context(|| { - format!( - "Failed to strip base prefix '{}' off path '{}'", - base, original_path - ) - })?; - Ok(new_base.join(relative_path)) -} - /// Create the cluster temporarily in 'initdbpath' directory inside the repository /// to get bootstrap data for timeline initialization. 
async fn run_initdb( @@ -3871,6 +3762,7 @@ pub async fn dump_layerfile_from_path( #[cfg(test)] pub(crate) mod harness { use bytes::{Bytes, BytesMut}; + use camino::Utf8PathBuf; use once_cell::sync::OnceCell; use pageserver_api::shard::ShardIndex; use std::fs; @@ -3938,8 +3830,6 @@ pub(crate) mod harness { pub struct TenantHarness { pub conf: &'static PageServerConf, pub tenant_conf: TenantConf, - // TODO(sharding): remove duplicative `tenant_id` in favor of access to tenant_shard_id - pub(crate) tenant_id: TenantId, pub tenant_shard_id: TenantShardId, pub generation: Generation, pub shard: ShardIndex, @@ -4001,7 +3891,6 @@ pub(crate) mod harness { Ok(Self { conf, tenant_conf, - tenant_id, tenant_shard_id, generation: Generation::new(0xdeadbeef), shard: ShardIndex::unsharded(), diff --git a/pageserver/src/tenant/config.rs b/pageserver/src/tenant/config.rs index 25d97f51ce..2d4cd350d7 100644 --- a/pageserver/src/tenant/config.rs +++ b/pageserver/src/tenant/config.rs @@ -46,6 +46,8 @@ pub mod defaults { pub const DEFAULT_WALRECEIVER_LAGGING_WAL_TIMEOUT: &str = "10 seconds"; pub const DEFAULT_MAX_WALRECEIVER_LSN_WAL_LAG: u64 = 10 * 1024 * 1024; pub const DEFAULT_EVICTIONS_LOW_RESIDENCE_DURATION_METRIC_THRESHOLD: &str = "24 hour"; + + pub const DEFAULT_INGEST_BATCH_SIZE: u64 = 100; } #[derive(Debug, Clone, Serialize, Deserialize, PartialEq, Eq)] diff --git a/pageserver/src/tenant/delete.rs b/pageserver/src/tenant/delete.rs index e8491f26db..2f606ed822 100644 --- a/pageserver/src/tenant/delete.rs +++ b/pageserver/src/tenant/delete.rs @@ -48,6 +48,9 @@ pub(crate) enum DeleteTenantError { #[error("Timeline {0}")] Timeline(#[from] DeleteTimelineError), + #[error("Cancelled")] + Cancelled, + #[error(transparent)] Other(#[from] anyhow::Error), } @@ -585,7 +588,7 @@ impl DeleteTenantFlow { } break; } - TenantsMapRemoveResult::Occupied(TenantSlot::Secondary) => { + TenantsMapRemoveResult::Occupied(TenantSlot::Secondary(_)) => { // This is unexpected: this secondary tenants 
should not have been created, and we // are not in a position to shut it down from here. tracing::warn!("Tenant transitioned to secondary mode while deleting!"); diff --git a/pageserver/src/tenant/mgr.rs b/pageserver/src/tenant/mgr.rs index 31d80026f0..5d2a87d5b7 100644 --- a/pageserver/src/tenant/mgr.rs +++ b/pageserver/src/tenant/mgr.rs @@ -35,7 +35,7 @@ use crate::tenant::config::{ }; use crate::tenant::delete::DeleteTenantFlow; use crate::tenant::span::debug_assert_current_span_has_tenant_id; -use crate::tenant::{create_tenant_files, AttachedTenantConf, SpawnMode, Tenant, TenantState}; +use crate::tenant::{AttachedTenantConf, SpawnMode, Tenant, TenantState}; use crate::{InitializationOrder, IGNORED_TENANT_FILE_NAME, TEMP_FILE_SUFFIX}; use utils::crashsafe::path_with_suffix_extension; @@ -44,6 +44,7 @@ use utils::generation::Generation; use utils::id::{TenantId, TimelineId}; use super::delete::DeleteTenantError; +use super::secondary::SecondaryTenant; use super::TenantSharedResources; /// For a tenant that appears in TenantsMap, it may either be @@ -57,7 +58,7 @@ use super::TenantSharedResources; /// having a properly acquired generation (Secondary doesn't need a generation) pub(crate) enum TenantSlot { Attached(Arc), - Secondary, + Secondary(Arc), /// In this state, other administrative operations acting on the TenantId should /// block, or return a retry indicator equivalent to HTTP 503. 
InProgress(utils::completion::Barrier), @@ -67,7 +68,7 @@ impl std::fmt::Debug for TenantSlot { fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { match self { Self::Attached(tenant) => write!(f, "Attached({})", tenant.current_state()), - Self::Secondary => write!(f, "Secondary"), + Self::Secondary(_) => write!(f, "Secondary"), Self::InProgress(_) => write!(f, "InProgress"), } } @@ -78,7 +79,7 @@ impl TenantSlot { fn get_attached(&self) -> Option<&Arc> { match self { Self::Attached(t) => Some(t), - Self::Secondary => None, + Self::Secondary(_) => None, Self::InProgress(_) => None, } } @@ -130,7 +131,7 @@ impl TenantsMap { /// A page service client sends a TenantId, and to look up the correct Tenant we must /// resolve this to a fully qualified TenantShardId. - fn resolve_shard( + fn resolve_attached_shard( &self, tenant_id: &TenantId, selector: ShardSelector, @@ -140,25 +141,27 @@ impl TenantsMap { TenantsMap::Initializing => None, TenantsMap::Open(m) | TenantsMap::ShuttingDown(m) => { for slot in m.range(TenantShardId::tenant_range(*tenant_id)) { + // Ignore all slots that don't contain an attached tenant + let tenant = match &slot.1 { + TenantSlot::Attached(t) => t, + _ => continue, + }; + match selector { ShardSelector::First => return Some(*slot.0), ShardSelector::Zero if slot.0.shard_number == ShardNumber(0) => { return Some(*slot.0) } ShardSelector::Page(key) => { - if let Some(tenant) = slot.1.get_attached() { - // First slot we see for this tenant, calculate the expected shard number - // for the key: we will use this for checking if this and subsequent - // slots contain the key, rather than recalculating the hash each time. - if want_shard.is_none() { - want_shard = Some(tenant.shard_identity.get_shard_number(&key)); - } + // First slot we see for this tenant, calculate the expected shard number + // for the key: we will use this for checking if this and subsequent + // slots contain the key, rather than recalculating the hash each time. 
+ if want_shard.is_none() { + want_shard = Some(tenant.shard_identity.get_shard_number(&key)); + } - if Some(tenant.shard_identity.number) == want_shard { - return Some(*slot.0); - } - } else { - continue; + if Some(tenant.shard_identity.number) == want_shard { + return Some(*slot.0); } } _ => continue, @@ -464,12 +467,18 @@ pub async fn init_tenant_mgr( *gen } else { match &location_conf.mode { - LocationMode::Secondary(_) => { + LocationMode::Secondary(secondary_config) => { // We do not require the control plane's permission for secondary mode // tenants, because they do no remote writes and hence require no // generation number info!(tenant_id=%tenant_shard_id.tenant_id, shard_id=%tenant_shard_id.shard_slug(), "Loaded tenant in secondary mode"); - tenants.insert(tenant_shard_id, TenantSlot::Secondary); + tenants.insert( + tenant_shard_id, + TenantSlot::Secondary(SecondaryTenant::new( + tenant_shard_id, + secondary_config, + )), + ); } LocationMode::Attached(_) => { // TODO: augment re-attach API to enable the control plane to @@ -661,8 +670,14 @@ async fn shutdown_all_tenants0(tenants: &std::sync::RwLock) { total_attached += 1; } - TenantSlot::Secondary => { - shutdown_state.insert(tenant_shard_id, TenantSlot::Secondary); + TenantSlot::Secondary(state) => { + // We don't need to wait for this individually per-tenant: the + // downloader task will be waited on eventually, this cancel + // is just to encourage it to drop out if it is doing work + // for this tenant right now. 
+ state.cancel.cancel(); + + shutdown_state.insert(tenant_shard_id, TenantSlot::Secondary(state)); } TenantSlot::InProgress(notify) => { // InProgress tenants are not visible in TenantsMap::ShuttingDown: we will @@ -739,45 +754,6 @@ async fn shutdown_all_tenants0(tenants: &std::sync::RwLock) { // caller will log how long we took } -pub(crate) async fn create_tenant( - conf: &'static PageServerConf, - tenant_conf: TenantConfOpt, - tenant_shard_id: TenantShardId, - generation: Generation, - resources: TenantSharedResources, - ctx: &RequestContext, -) -> Result, TenantMapInsertError> { - let location_conf = LocationConf::attached_single(tenant_conf, generation); - info!("Creating tenant at location {location_conf:?}"); - - let slot_guard = - tenant_map_acquire_slot(&tenant_shard_id, TenantSlotAcquireMode::MustNotExist)?; - let tenant_path = super::create_tenant_files(conf, &location_conf, &tenant_shard_id).await?; - - let shard_identity = location_conf.shard; - let created_tenant = tenant_spawn( - conf, - tenant_shard_id, - &tenant_path, - resources, - AttachedTenantConf::try_from(location_conf)?, - shard_identity, - None, - &TENANTS, - SpawnMode::Create, - ctx, - )?; - // TODO: tenant object & its background loops remain, untracked in tenant map, if we fail here. 
- // See https://github.com/neondatabase/neon/issues/4233 - - let created_tenant_id = created_tenant.tenant_id(); - debug_assert_eq!(created_tenant_id, tenant_shard_id.tenant_id); - - slot_guard.upsert(TenantSlot::Attached(created_tenant.clone()))?; - - Ok(created_tenant) -} - #[derive(Debug, thiserror::Error)] pub(crate) enum SetNewTenantConfigError { #[error(transparent)] @@ -809,6 +785,24 @@ pub(crate) async fn set_new_tenant_config( Ok(()) } +#[derive(thiserror::Error, Debug)] +pub(crate) enum UpsertLocationError { + #[error("Bad config request: {0}")] + BadRequest(anyhow::Error), + + #[error("Cannot change config in this state: {0}")] + Unavailable(#[from] TenantMapError), + + #[error("Tenant is already being modified")] + InProgress, + + #[error("Failed to flush: {0}")] + Flush(anyhow::Error), + + #[error("Internal error: {0}")] + Other(#[from] anyhow::Error), +} + impl TenantManager { /// Convenience function so that anyone with a TenantManager can get at the global configuration, without /// having to pass it around everywhere as a separate object. 
@@ -845,27 +839,49 @@ impl TenantManager { Some(TenantSlot::InProgress(_)) => { Err(GetTenantError::NotActive(tenant_shard_id.tenant_id)) } - None | Some(TenantSlot::Secondary) => { + None | Some(TenantSlot::Secondary(_)) => { Err(GetTenantError::NotFound(tenant_shard_id.tenant_id)) } } } + pub(crate) fn get_secondary_tenant_shard( + &self, + tenant_shard_id: TenantShardId, + ) -> Option> { + let locked = self.tenants.read().unwrap(); + + let peek_slot = tenant_map_peek_slot(&locked, &tenant_shard_id, TenantSlotPeekMode::Read) + .ok() + .flatten(); + + match peek_slot { + Some(TenantSlot::Secondary(s)) => Some(s.clone()), + _ => None, + } + } + #[instrument(skip_all, fields(tenant_id=%tenant_shard_id.tenant_id, shard_id=%tenant_shard_id.shard_slug()))] pub(crate) async fn upsert_location( &self, tenant_shard_id: TenantShardId, new_location_config: LocationConf, flush: Option, + spawn_mode: SpawnMode, ctx: &RequestContext, - ) -> Result<(), anyhow::Error> { + ) -> Result>, UpsertLocationError> { debug_assert_current_span_has_tenant_id(); info!("configuring tenant location to state {new_location_config:?}"); - // Special case fast-path for updates to Tenant: if our upsert is only updating configuration, + enum FastPathModified { + Attached(Arc), + Secondary(Arc), + } + + // Special case fast-path for updates to existing slots: if our upsert is only updating configuration, // then we do not need to set the slot to InProgress, we can just call into the // existng tenant. - let modify_tenant = { + let fast_path_taken = { let locked = self.tenants.read().unwrap(); let peek_slot = tenant_map_peek_slot(&locked, &tenant_shard_id, TenantSlotPeekMode::Write)?; @@ -875,16 +891,24 @@ impl TenantManager { // A transition from Attached to Attached in the same generation, we may // take our fast path and just provide the updated configuration // to the tenant. 
- tenant.set_new_location_config(AttachedTenantConf::try_from( - new_location_config.clone(), - )?); + tenant.set_new_location_config( + AttachedTenantConf::try_from(new_location_config.clone()) + .map_err(UpsertLocationError::BadRequest)?, + ); - Some(tenant.clone()) + Some(FastPathModified::Attached(tenant.clone())) } else { // Different generations, fall through to general case None } } + ( + LocationMode::Secondary(secondary_conf), + Some(TenantSlot::Secondary(secondary_tenant)), + ) => { + secondary_tenant.set_config(secondary_conf); + Some(FastPathModified::Secondary(secondary_tenant.clone())) + } _ => { // Not an Attached->Attached transition, fall through to general case None @@ -893,69 +917,107 @@ impl TenantManager { }; // Fast-path continued: having dropped out of the self.tenants lock, do the async - // phase of waiting for flush, before returning. - if let Some(tenant) = modify_tenant { - // Transition to AttachedStale means we may well hold a valid generation - // still, and have been requested to go stale as part of a migration. If - // the caller set `flush`, then flush to remote storage. - if let LocationMode::Attached(AttachedLocationConfig { - generation: _, - attach_mode: AttachmentMode::Stale, - }) = &new_location_config.mode - { - if let Some(flush_timeout) = flush { - match tokio::time::timeout(flush_timeout, tenant.flush_remote()).await { - Ok(Err(e)) => { - return Err(e); - } - Ok(Ok(_)) => return Ok(()), - Err(_) => { - tracing::warn!( + // phase of writing config and/or waiting for flush, before returning. + match fast_path_taken { + Some(FastPathModified::Attached(tenant)) => { + Tenant::persist_tenant_config(self.conf, &tenant_shard_id, &new_location_config) + .await?; + + // Transition to AttachedStale means we may well hold a valid generation + // still, and have been requested to go stale as part of a migration. If + // the caller set `flush`, then flush to remote storage. 
+ if let LocationMode::Attached(AttachedLocationConfig { + generation: _, + attach_mode: AttachmentMode::Stale, + }) = &new_location_config.mode + { + if let Some(flush_timeout) = flush { + match tokio::time::timeout(flush_timeout, tenant.flush_remote()).await { + Ok(Err(e)) => { + return Err(UpsertLocationError::Flush(e)); + } + Ok(Ok(_)) => return Ok(Some(tenant)), + Err(_) => { + tracing::warn!( timeout_ms = flush_timeout.as_millis(), "Timed out waiting for flush to remote storage, proceeding anyway." ) + } } } } - } - return Ok(()); - } + return Ok(Some(tenant)); + } + Some(FastPathModified::Secondary(_secondary_tenant)) => { + Tenant::persist_tenant_config(self.conf, &tenant_shard_id, &new_location_config) + .await?; + + return Ok(None); + } + None => { + // Proceed with the general case procedure, where we will shutdown & remove any existing + // slot contents and replace with a fresh one + } + }; // General case for upserts to TenantsMap, excluding the case above: we will substitute an // InProgress value to the slot while we make whatever changes are required. The state for // the tenant is inaccessible to the outside world while we are doing this, but that is sensible: // the state is ill-defined while we're in transition. Transitions are async, but fast: we do // not do significant I/O, and shutdowns should be prompt via cancellation tokens. - let mut slot_guard = tenant_map_acquire_slot(&tenant_shard_id, TenantSlotAcquireMode::Any)?; - - if let Some(TenantSlot::Attached(tenant)) = slot_guard.get_old_value() { - // The case where we keep a Tenant alive was covered above in the special case - // for Attached->Attached transitions in the same generation. By this point, - // if we see an attached tenant we know it will be discarded and should be - // shut down. 
- let (_guard, progress) = utils::completion::channel(); - - match tenant.get_attach_mode() { - AttachmentMode::Single | AttachmentMode::Multi => { - // Before we leave our state as the presumed holder of the latest generation, - // flush any outstanding deletions to reduce the risk of leaking objects. - self.resources.deletion_queue_client.flush_advisory() + let mut slot_guard = tenant_map_acquire_slot(&tenant_shard_id, TenantSlotAcquireMode::Any) + .map_err(|e| match e { + TenantSlotError::AlreadyExists(_, _) | TenantSlotError::NotFound(_) => { + unreachable!("Called with mode Any") } - AttachmentMode::Stale => { - // If we're stale there's not point trying to flush deletions - } - }; + TenantSlotError::InProgress => UpsertLocationError::InProgress, + TenantSlotError::MapState(s) => UpsertLocationError::Unavailable(s), + })?; - info!("Shutting down attached tenant"); - match tenant.shutdown(progress, false).await { - Ok(()) => {} - Err(barrier) => { - info!("Shutdown already in progress, waiting for it to complete"); - barrier.wait().await; + match slot_guard.get_old_value() { + Some(TenantSlot::Attached(tenant)) => { + // The case where we keep a Tenant alive was covered above in the special case + // for Attached->Attached transitions in the same generation. By this point, + // if we see an attached tenant we know it will be discarded and should be + // shut down. + let (_guard, progress) = utils::completion::channel(); + + match tenant.get_attach_mode() { + AttachmentMode::Single | AttachmentMode::Multi => { + // Before we leave our state as the presumed holder of the latest generation, + // flush any outstanding deletions to reduce the risk of leaking objects. 
+ self.resources.deletion_queue_client.flush_advisory() + } + AttachmentMode::Stale => { + // If we're stale there's not point trying to flush deletions + } + }; + + info!("Shutting down attached tenant"); + match tenant.shutdown(progress, false).await { + Ok(()) => {} + Err(barrier) => { + info!("Shutdown already in progress, waiting for it to complete"); + barrier.wait().await; + } } + slot_guard.drop_old_value().expect("We just shut it down"); + } + Some(TenantSlot::Secondary(state)) => { + info!("Shutting down secondary tenant"); + state.shutdown().await; + } + Some(TenantSlot::InProgress(_)) => { + // This should never happen: acquire_slot should error out + // if the contents of a slot were InProgress. + return Err(UpsertLocationError::Other(anyhow::anyhow!( + "Acquired an InProgress slot, this is a bug." + ))); + } + None => { + // Slot was vacant, nothing needs shutting down. } - slot_guard.drop_old_value().expect("We just shut it down"); } let tenant_path = self.conf.tenant_path(&tenant_shard_id); @@ -973,12 +1035,12 @@ impl TenantManager { // Before activating either secondary or attached mode, persist the // configuration, so that on restart we will re-attach (or re-start // secondary) on the tenant. 
- Tenant::persist_tenant_config(self.conf, &tenant_shard_id, &new_location_config) - .await - .map_err(SetNewTenantConfigError::Persist)?; + Tenant::persist_tenant_config(self.conf, &tenant_shard_id, &new_location_config).await?; let new_slot = match &new_location_config.mode { - LocationMode::Secondary(_) => TenantSlot::Secondary, + LocationMode::Secondary(secondary_config) => { + TenantSlot::Secondary(SecondaryTenant::new(tenant_shard_id, secondary_config)) + } LocationMode::Attached(_attach_config) => { let shard_identity = new_location_config.shard; let tenant = tenant_spawn( @@ -990,7 +1052,7 @@ impl TenantManager { shard_identity, None, self.tenants, - SpawnMode::Normal, + spawn_mode, ctx, )?; @@ -998,9 +1060,20 @@ impl TenantManager { } }; - slot_guard.upsert(new_slot)?; + let attached_tenant = if let TenantSlot::Attached(tenant) = &new_slot { + Some(tenant.clone()) + } else { + None + }; - Ok(()) + slot_guard.upsert(new_slot).map_err(|e| match e { + TenantSlotUpsertError::InternalError(e) => { + UpsertLocationError::Other(anyhow::anyhow!(e)) + } + TenantSlotUpsertError::MapState(e) => UpsertLocationError::Unavailable(e), + })?; + + Ok(attached_tenant) } /// Resetting a tenant is equivalent to detaching it, then attaching it again with the same @@ -1091,6 +1164,95 @@ impl TenantManager { .collect(), } } + // Do some synchronous work for all tenant slots in Secondary state. The provided + // callback should be small and fast, as it will be called inside the global + // TenantsMap lock. 
+ pub(crate) fn foreach_secondary_tenants(&self, mut func: F) + where + // TODO: let the callback return a hint to drop out of the loop early + F: FnMut(&TenantShardId, &Arc), + { + let locked = self.tenants.read().unwrap(); + + let map = match &*locked { + TenantsMap::Initializing | TenantsMap::ShuttingDown(_) => return, + TenantsMap::Open(m) => m, + }; + + for (tenant_id, slot) in map { + if let TenantSlot::Secondary(state) = slot { + // Only expose secondary tenants that are not currently shutting down + if !state.cancel.is_cancelled() { + func(tenant_id, state) + } + } + } + } + + pub(crate) async fn delete_tenant( + &self, + tenant_shard_id: TenantShardId, + activation_timeout: Duration, + ) -> Result<(), DeleteTenantError> { + // We acquire a SlotGuard during this function to protect against concurrent + // changes while the ::prepare phase of DeleteTenantFlow executes, but then + // have to return the Tenant to the map while the background deletion runs. + // + // TODO: refactor deletion to happen outside the lifetime of a Tenant. + // Currently, deletion requires a reference to the tenants map in order to + // keep the Tenant in the map until deletion is complete, and then remove + // it at the end. + // + // See https://github.com/neondatabase/neon/issues/5080 + + let slot_guard = + tenant_map_acquire_slot(&tenant_shard_id, TenantSlotAcquireMode::MustExist)?; + + // unwrap is safe because we used MustExist mode when acquiring + let tenant = match slot_guard.get_old_value().as_ref().unwrap() { + TenantSlot::Attached(tenant) => tenant.clone(), + _ => { + // Express "not attached" as equivalent to "not found" + return Err(DeleteTenantError::NotAttached); + } + }; + + match tenant.current_state() { + TenantState::Broken { .. } | TenantState::Stopping { .. } => { + // If a tenant is broken or stopping, DeleteTenantFlow can + // handle it: broken tenants proceed to delete, stopping tenants + // are checked for deletion already in progress. 
+ } + _ => { + tenant + .wait_to_become_active(activation_timeout) + .await + .map_err(|e| match e { + GetActiveTenantError::WillNotBecomeActive(_) => { + DeleteTenantError::InvalidState(tenant.current_state()) + } + GetActiveTenantError::Cancelled => DeleteTenantError::Cancelled, + GetActiveTenantError::NotFound(_) => DeleteTenantError::NotAttached, + GetActiveTenantError::WaitForActiveTimeout { + latest_state: _latest_state, + wait_time: _wait_time, + } => DeleteTenantError::InvalidState(tenant.current_state()), + })?; + } + } + + let result = DeleteTenantFlow::run( + self.conf, + self.resources.remote_storage.clone(), + &TENANTS, + tenant, + ) + .await; + + // The Tenant goes back into the map in Stopping state, it will eventually be removed by DeleteTenantFLow + slot_guard.revert(); + result + } } #[derive(Debug, thiserror::Error)] @@ -1140,7 +1302,7 @@ pub(crate) fn get_tenant( Some(TenantSlot::InProgress(_)) => { Err(GetTenantError::NotActive(tenant_shard_id.tenant_id)) } - None | Some(TenantSlot::Secondary) => { + None | Some(TenantSlot::Secondary(_)) => { Err(GetTenantError::NotFound(tenant_shard_id.tenant_id)) } } @@ -1192,9 +1354,11 @@ pub(crate) async fn get_active_tenant_with_timeout( let locked = TENANTS.read().unwrap(); // Resolve TenantId to TenantShardId - let tenant_shard_id = locked.resolve_shard(&tenant_id, shard_selector).ok_or( - GetActiveTenantError::NotFound(GetTenantError::NotFound(tenant_id)), - )?; + let tenant_shard_id = locked + .resolve_attached_shard(&tenant_id, shard_selector) + .ok_or(GetActiveTenantError::NotFound(GetTenantError::NotFound( + tenant_id, + )))?; let peek_slot = tenant_map_peek_slot(&locked, &tenant_shard_id, TenantSlotPeekMode::Read) .map_err(GetTenantError::MapState)?; @@ -1211,7 +1375,7 @@ pub(crate) async fn get_active_tenant_with_timeout( } } } - Some(TenantSlot::Secondary) => { + Some(TenantSlot::Secondary(_)) => { return Err(GetActiveTenantError::NotFound(GetTenantError::NotActive( tenant_id, ))) @@ -1268,41 
+1432,6 @@ pub(crate) async fn get_active_tenant_with_timeout( Ok(tenant) } -pub(crate) async fn delete_tenant( - conf: &'static PageServerConf, - remote_storage: Option, - tenant_shard_id: TenantShardId, -) -> Result<(), DeleteTenantError> { - // We acquire a SlotGuard during this function to protect against concurrent - // changes while the ::prepare phase of DeleteTenantFlow executes, but then - // have to return the Tenant to the map while the background deletion runs. - // - // TODO: refactor deletion to happen outside the lifetime of a Tenant. - // Currently, deletion requires a reference to the tenants map in order to - // keep the Tenant in the map until deletion is complete, and then remove - // it at the end. - // - // See https://github.com/neondatabase/neon/issues/5080 - - // TODO(sharding): make delete API sharding-aware - let slot_guard = tenant_map_acquire_slot(&tenant_shard_id, TenantSlotAcquireMode::MustExist)?; - - // unwrap is safe because we used MustExist mode when acquiring - let tenant = match slot_guard.get_old_value().as_ref().unwrap() { - TenantSlot::Attached(tenant) => tenant.clone(), - _ => { - // Express "not attached" as equivalent to "not found" - return Err(DeleteTenantError::NotAttached); - } - }; - - let result = DeleteTenantFlow::run(conf, remote_storage, &TENANTS, tenant).await; - - // The Tenant goes back into the map in Stopping state, it will eventually be removed by DeleteTenantFLow - slot_guard.revert(); - result -} - #[derive(Debug, thiserror::Error)] pub(crate) enum DeleteTimelineError { #[error("Tenant {0}")] @@ -1510,61 +1639,12 @@ pub(crate) async fn list_tenants() -> Result, Ok(m.iter() .filter_map(|(id, tenant)| match tenant { TenantSlot::Attached(tenant) => Some((*id, tenant.current_state())), - TenantSlot::Secondary => None, + TenantSlot::Secondary(_) => None, TenantSlot::InProgress(_) => None, }) .collect()) } -/// Execute Attach mgmt API command. 
-/// -/// Downloading all the tenant data is performed in the background, this merely -/// spawns the background task and returns quickly. -pub(crate) async fn attach_tenant( - conf: &'static PageServerConf, - tenant_id: TenantId, - generation: Generation, - tenant_conf: TenantConfOpt, - resources: TenantSharedResources, - ctx: &RequestContext, -) -> Result<(), TenantMapInsertError> { - // This is a legacy API (replaced by `/location_conf`). It does not support sharding - let tenant_shard_id = TenantShardId::unsharded(tenant_id); - - let slot_guard = - tenant_map_acquire_slot(&tenant_shard_id, TenantSlotAcquireMode::MustNotExist)?; - let location_conf = LocationConf::attached_single(tenant_conf, generation); - let tenant_dir = create_tenant_files(conf, &location_conf, &tenant_shard_id).await?; - // TODO: tenant directory remains on disk if we bail out from here on. - // See https://github.com/neondatabase/neon/issues/4233 - - let shard_identity = location_conf.shard; - let attached_tenant = tenant_spawn( - conf, - tenant_shard_id, - &tenant_dir, - resources, - AttachedTenantConf::try_from(location_conf)?, - shard_identity, - None, - &TENANTS, - SpawnMode::Normal, - ctx, - )?; - // TODO: tenant object & its background loops remain, untracked in tenant map, if we fail here. - // See https://github.com/neondatabase/neon/issues/4233 - - let attached_tenant_id = attached_tenant.tenant_id(); - if tenant_id != attached_tenant_id { - return Err(TenantMapInsertError::Other(anyhow::anyhow!( - "loaded created tenant has unexpected tenant id (expect {tenant_id} != actual {attached_tenant_id})", - ))); - } - - slot_guard.upsert(TenantSlot::Attached(attached_tenant))?; - Ok(()) -} - #[derive(Debug, thiserror::Error)] pub(crate) enum TenantMapInsertError { #[error(transparent)] @@ -1578,7 +1658,7 @@ pub(crate) enum TenantMapInsertError { /// Superset of TenantMapError: issues that can occur when acquiring a slot /// for a particular tenant ID. 
#[derive(Debug, thiserror::Error)] -pub enum TenantSlotError { +pub(crate) enum TenantSlotError { /// When acquiring a slot with the expectation that the tenant already exists. #[error("Tenant {0} not found")] NotFound(TenantShardId), @@ -1587,9 +1667,6 @@ pub enum TenantSlotError { #[error("tenant {0} already exists, state: {1:?}")] AlreadyExists(TenantShardId, TenantState), - #[error("tenant {0} already exists in but is not attached")] - Conflict(TenantShardId), - // Tried to read a slot that is currently being mutated by another administrative // operation. #[error("tenant has a state change in progress, try again later")] @@ -1767,11 +1844,7 @@ impl SlotGuard { fn old_value_is_shutdown(&self) -> bool { match self.old_value.as_ref() { Some(TenantSlot::Attached(tenant)) => tenant.gate.close_complete(), - Some(TenantSlot::Secondary) => { - // TODO: when adding secondary mode tenants, this will check for shutdown - // in the same way that we do for `Tenant` above - true - } + Some(TenantSlot::Secondary(secondary_tenant)) => secondary_tenant.gate.close_complete(), Some(TenantSlot::InProgress(_)) => { // A SlotGuard cannot be constructed for a slot that was already InProgress unreachable!() @@ -1981,26 +2054,19 @@ where let mut slot_guard = tenant_map_acquire_slot_impl(&tenant_shard_id, tenants, TenantSlotAcquireMode::MustExist)?; - // The SlotGuard allows us to manipulate the Tenant object without fear of some - // concurrent API request doing something else for the same tenant ID. - let attached_tenant = match slot_guard.get_old_value() { - Some(TenantSlot::Attached(t)) => Some(t), - _ => None, - }; - // allow pageserver shutdown to await for our completion let (_guard, progress) = completion::channel(); - // If the tenant was attached, shut it down gracefully. 
For secondary - // locations this part is not necessary - match &attached_tenant { - Some(attached_tenant) => { + // The SlotGuard allows us to manipulate the Tenant object without fear of some + // concurrent API request doing something else for the same tenant ID. + let attached_tenant = match slot_guard.get_old_value() { + Some(TenantSlot::Attached(tenant)) => { // whenever we remove a tenant from memory, we don't want to flush and wait for upload let freeze_and_flush = false; // shutdown is sure to transition tenant to stopping, and wait for all tasks to complete, so // that we can continue safely to cleanup. - match attached_tenant.shutdown(progress, freeze_and_flush).await { + match tenant.shutdown(progress, freeze_and_flush).await { Ok(()) => {} Err(_other) => { // if pageserver shutdown or other detach/ignore is already ongoing, we don't want to @@ -2009,11 +2075,19 @@ where return Err(TenantStateError::IsStopping(tenant_shard_id.tenant_id)); } } + Some(tenant) } - None => { - // Nothing to wait on when not attached, proceed. 
+ Some(TenantSlot::Secondary(secondary_state)) => { + tracing::info!("Shutting down in secondary mode"); + secondary_state.shutdown().await; + None } - } + Some(TenantSlot::InProgress(_)) => { + // Acquiring a slot guarantees its old value was not InProgress + unreachable!(); + } + None => None, + }; match tenant_cleanup .await diff --git a/pageserver/src/tenant/remote_timeline_client.rs b/pageserver/src/tenant/remote_timeline_client.rs index 52ee8f49ce..ec2a6efef6 100644 --- a/pageserver/src/tenant/remote_timeline_client.rs +++ b/pageserver/src/tenant/remote_timeline_client.rs @@ -229,6 +229,7 @@ use crate::{ tenant::upload_queue::{ UploadOp, UploadQueue, UploadQueueInitialized, UploadQueueStopped, UploadTask, }, + TENANT_HEATMAP_BASENAME, }; use utils::id::{TenantId, TimelineId}; @@ -818,8 +819,25 @@ impl RemoteTimelineClient { fn schedule_deletion_of_unlinked0( self: &Arc, upload_queue: &mut UploadQueueInitialized, - with_metadata: Vec<(LayerFileName, LayerFileMetadata)>, + mut with_metadata: Vec<(LayerFileName, LayerFileMetadata)>, ) { + // Filter out any layers which were not created by this tenant shard. These are + // layers that originate from some ancestor shard after a split, and may still + // be referenced by other shards. We are free to delete them locally and remove + // them from our index (and would have already done so when we reach this point + // in the code), but we may not delete them remotely. 
+ with_metadata.retain(|(name, meta)| { + let retain = meta.shard.shard_number == self.tenant_shard_id.shard_number + && meta.shard.shard_count == self.tenant_shard_id.shard_count; + if !retain { + tracing::debug!( + "Skipping deletion of ancestor-shard layer {name}, from shard {}", + meta.shard + ); + } + retain + }); + for (name, meta) in &with_metadata { info!( "scheduling deletion of layer {}{} (shard {})", @@ -1724,11 +1742,11 @@ pub fn remote_index_path( .expect("Failed to construct path") } -pub const HEATMAP_BASENAME: &str = "heatmap-v1.json"; - pub(crate) fn remote_heatmap_path(tenant_shard_id: &TenantShardId) -> RemotePath { - RemotePath::from_string(&format!("tenants/{tenant_shard_id}/{HEATMAP_BASENAME}")) - .expect("Failed to construct path") + RemotePath::from_string(&format!( + "tenants/{tenant_shard_id}/{TENANT_HEATMAP_BASENAME}" + )) + .expect("Failed to construct path") } /// Given the key of an index, parse out the generation part of the name @@ -1885,7 +1903,7 @@ mod tests { fn span(&self) -> tracing::Span { tracing::info_span!( "test", - tenant_id = %self.harness.tenant_id, + tenant_id = %self.harness.tenant_shard_id.tenant_id, timeline_id = %TIMELINE_ID ) } @@ -2192,15 +2210,6 @@ mod tests { let index_part_bytes = serde_json::to_vec(&example_index_part).unwrap(); - let timeline_path = test_state.harness.timeline_path(&TIMELINE_ID); - let remote_timeline_dir = test_state.harness.remote_fs_dir.join( - timeline_path - .strip_prefix(&test_state.harness.conf.workdir) - .unwrap(), - ); - - std::fs::create_dir_all(remote_timeline_dir).expect("creating test dir should work"); - let index_path = test_state.harness.remote_fs_dir.join( remote_index_path( &test_state.harness.tenant_shard_id, @@ -2209,6 +2218,10 @@ mod tests { ) .get_path(), ); + + std::fs::create_dir_all(index_path.parent().unwrap()) + .expect("creating test dir should work"); + eprintln!("Writing {index_path}"); std::fs::write(&index_path, index_part_bytes).unwrap(); example_index_part 
diff --git a/pageserver/src/tenant/secondary.rs b/pageserver/src/tenant/secondary.rs index d25fe56b92..2331447266 100644 --- a/pageserver/src/tenant/secondary.rs +++ b/pageserver/src/tenant/secondary.rs @@ -1,24 +1,48 @@ +mod downloader; pub mod heatmap; mod heatmap_uploader; +mod scheduler; use std::sync::Arc; use crate::task_mgr::{self, TaskKind, BACKGROUND_RUNTIME}; -use self::heatmap_uploader::heatmap_uploader_task; +use self::{ + downloader::{downloader_task, SecondaryDetail}, + heatmap_uploader::heatmap_uploader_task, +}; -use super::mgr::TenantManager; +use super::{config::SecondaryLocationConfig, mgr::TenantManager}; use pageserver_api::shard::TenantShardId; use remote_storage::GenericRemoteStorage; use tokio_util::sync::CancellationToken; -use utils::completion::Barrier; +use utils::{completion::Barrier, sync::gate::Gate}; +enum DownloadCommand { + Download(TenantShardId), +} enum UploadCommand { Upload(TenantShardId), } +impl UploadCommand { + fn get_tenant_shard_id(&self) -> &TenantShardId { + match self { + Self::Upload(id) => id, + } + } +} + +impl DownloadCommand { + fn get_tenant_shard_id(&self) -> &TenantShardId { + match self { + Self::Download(id) => id, + } + } +} + struct CommandRequest { payload: T, response_tx: tokio::sync::oneshot::Sender, @@ -28,12 +52,73 @@ struct CommandResponse { result: anyhow::Result<()>, } +// Whereas [`Tenant`] represents an attached tenant, this type represents the work +// we do for secondary tenant locations: where we are not serving clients or +// ingesting WAL, but we are maintaining a warm cache of layer files. +// +// This type is all about the _download_ path for secondary mode. The upload path +// runs separately (see [`heatmap_uploader`]) while a regular attached `Tenant` exists. 
+// +// This structure coordinates TenantManager and SecondaryDownloader, +// so that the downloader can indicate which tenants it is currently +// operating on, and the manager can indicate when a particular +// secondary tenant should cancel any work in flight. +#[derive(Debug)] +pub(crate) struct SecondaryTenant { + /// Carrying a tenant shard ID simplifies callers such as the downloader + /// which need to organize many of these objects by ID. + tenant_shard_id: TenantShardId, + + /// Cancellation token indicates to SecondaryDownloader that it should stop doing + /// any work for this tenant at the next opportunity. + pub(crate) cancel: CancellationToken, + + pub(crate) gate: Gate, + + detail: std::sync::Mutex, +} + +impl SecondaryTenant { + pub(crate) fn new( + tenant_shard_id: TenantShardId, + config: &SecondaryLocationConfig, + ) -> Arc { + Arc::new(Self { + tenant_shard_id, + // todo: shall we make this a descendent of the + // main cancellation token, or is it sufficient that + // on shutdown we walk the tenants and fire their + // individual cancellations? + cancel: CancellationToken::new(), + gate: Gate::new(format!("SecondaryTenant {tenant_shard_id}")), + + detail: std::sync::Mutex::new(SecondaryDetail::new(config.clone())), + }) + } + + pub(crate) async fn shutdown(&self) { + self.cancel.cancel(); + + // Wait for any secondary downloader work to complete + self.gate.close().await; + } + + pub(crate) fn set_config(&self, config: &SecondaryLocationConfig) { + self.detail.lock().unwrap().config = config.clone(); + } + + fn get_tenant_shard_id(&self) -> &TenantShardId { + &self.tenant_shard_id + } +} + /// The SecondaryController is a pseudo-rpc client for administrative control of secondary mode downloads, /// and heatmap uploads. This is not a hot data path: it's primarily a hook for tests, /// where we want to immediately upload/download for a particular tenant. In normal operation /// uploads & downloads are autonomous and not driven by this interface. 
pub struct SecondaryController { upload_req_tx: tokio::sync::mpsc::Sender>, + download_req_tx: tokio::sync::mpsc::Sender>, } impl SecondaryController { @@ -63,6 +148,13 @@ impl SecondaryController { self.dispatch(&self.upload_req_tx, UploadCommand::Upload(tenant_shard_id)) .await } + pub async fn download_tenant(&self, tenant_shard_id: TenantShardId) -> anyhow::Result<()> { + self.dispatch( + &self.download_req_tx, + DownloadCommand::Download(tenant_shard_id), + ) + .await + } } pub fn spawn_tasks( @@ -71,9 +163,37 @@ pub fn spawn_tasks( background_jobs_can_start: Barrier, cancel: CancellationToken, ) -> SecondaryController { + let mgr_clone = tenant_manager.clone(); + let storage_clone = remote_storage.clone(); + let cancel_clone = cancel.clone(); + let bg_jobs_clone = background_jobs_can_start.clone(); + + let (download_req_tx, download_req_rx) = + tokio::sync::mpsc::channel::>(16); let (upload_req_tx, upload_req_rx) = tokio::sync::mpsc::channel::>(16); + task_mgr::spawn( + BACKGROUND_RUNTIME.handle(), + TaskKind::SecondaryDownloads, + None, + None, + "secondary tenant downloads", + false, + async move { + downloader_task( + mgr_clone, + storage_clone, + download_req_rx, + bg_jobs_clone, + cancel_clone, + ) + .await; + + Ok(()) + }, + ); + task_mgr::spawn( BACKGROUND_RUNTIME.handle(), TaskKind::SecondaryUploads, @@ -89,16 +209,26 @@ pub fn spawn_tasks( background_jobs_can_start, cancel, ) - .await + .await; + + Ok(()) }, ); - SecondaryController { upload_req_tx } + SecondaryController { + download_req_tx, + upload_req_tx, + } } /// For running with remote storage disabled: a SecondaryController that is connected to nothing. 
pub fn null_controller() -> SecondaryController { + let (download_req_tx, _download_req_rx) = + tokio::sync::mpsc::channel::>(16); let (upload_req_tx, _upload_req_rx) = tokio::sync::mpsc::channel::>(16); - SecondaryController { upload_req_tx } + SecondaryController { + upload_req_tx, + download_req_tx, + } } diff --git a/pageserver/src/tenant/secondary/downloader.rs b/pageserver/src/tenant/secondary/downloader.rs new file mode 100644 index 0000000000..2a79c406cf --- /dev/null +++ b/pageserver/src/tenant/secondary/downloader.rs @@ -0,0 +1,800 @@ +use std::{ + collections::{HashMap, HashSet}, + pin::Pin, + str::FromStr, + sync::Arc, + time::{Duration, Instant, SystemTime}, +}; + +use crate::{ + config::PageServerConf, + metrics::SECONDARY_MODE, + tenant::{ + config::SecondaryLocationConfig, + debug_assert_current_span_has_tenant_and_timeline_id, + remote_timeline_client::{ + index::LayerFileMetadata, FAILED_DOWNLOAD_WARN_THRESHOLD, FAILED_REMOTE_OP_RETRIES, + }, + span::debug_assert_current_span_has_tenant_id, + storage_layer::LayerFileName, + tasks::{warn_when_period_overrun, BackgroundLoopKind}, + }, + virtual_file::{on_fatal_io_error, MaybeFatalIo, VirtualFile}, + METADATA_FILE_NAME, TEMP_FILE_SUFFIX, +}; + +use super::{ + heatmap::HeatMapLayer, + scheduler::{self, Completion, JobGenerator, SchedulingResult, TenantBackgroundJobs}, + SecondaryTenant, +}; + +use crate::tenant::{ + mgr::TenantManager, + remote_timeline_client::{download::download_layer_file, remote_heatmap_path}, +}; + +use chrono::format::{DelayedFormat, StrftimeItems}; +use futures::Future; +use pageserver_api::shard::TenantShardId; +use rand::Rng; +use remote_storage::{DownloadError, GenericRemoteStorage}; + +use tokio_util::sync::CancellationToken; +use tracing::{info_span, instrument, Instrument}; +use utils::{ + backoff, completion::Barrier, crashsafe::path_with_suffix_extension, fs_ext, id::TimelineId, +}; + +use super::{ + heatmap::{HeatMapTenant, HeatMapTimeline}, + CommandRequest, 
DownloadCommand, +}; + +/// For each tenant, how long must have passed since the last download_tenant call before +/// calling it again. This is approximately the time by which local data is allowed +/// to fall behind remote data. +/// +/// TODO: this should just be a default, and the actual period should be controlled +/// via the heatmap itself +/// `` +const DOWNLOAD_FRESHEN_INTERVAL: Duration = Duration::from_millis(60000); + +pub(super) async fn downloader_task( + tenant_manager: Arc, + remote_storage: GenericRemoteStorage, + command_queue: tokio::sync::mpsc::Receiver>, + background_jobs_can_start: Barrier, + cancel: CancellationToken, +) { + let concurrency = tenant_manager.get_conf().secondary_download_concurrency; + + let generator = SecondaryDownloader { + tenant_manager, + remote_storage, + }; + let mut scheduler = Scheduler::new(generator, concurrency); + + scheduler + .run(command_queue, background_jobs_can_start, cancel) + .instrument(info_span!("secondary_downloads")) + .await +} + +struct SecondaryDownloader { + tenant_manager: Arc, + remote_storage: GenericRemoteStorage, +} + +#[derive(Debug, Clone)] +pub(super) struct OnDiskState { + metadata: LayerFileMetadata, + access_time: SystemTime, +} + +impl OnDiskState { + fn new( + _conf: &'static PageServerConf, + _tenant_shard_id: &TenantShardId, + _timeline_id: &TimelineId, + _name: LayerFileName, + metadata: LayerFileMetadata, + access_time: SystemTime, + ) -> Self { + Self { + metadata, + access_time, + } + } +} + +#[derive(Debug, Clone, Default)] +pub(super) struct SecondaryDetailTimeline { + pub(super) on_disk_layers: HashMap, + + /// We remember when layers were evicted, to prevent re-downloading them. 
+ pub(super) evicted_at: HashMap, +} + +/// This state is written by the secondary downloader, it is opaque +/// to TenantManager +#[derive(Debug)] +pub(super) struct SecondaryDetail { + pub(super) config: SecondaryLocationConfig, + + last_download: Option, + next_download: Option, + pub(super) timelines: HashMap, +} + +/// Helper for logging SystemTime +fn strftime(t: &'_ SystemTime) -> DelayedFormat> { + let datetime: chrono::DateTime = (*t).into(); + datetime.format("%d/%m/%Y %T") +} + +impl SecondaryDetail { + pub(super) fn new(config: SecondaryLocationConfig) -> Self { + Self { + config, + last_download: None, + next_download: None, + timelines: HashMap::new(), + } + } +} + +struct PendingDownload { + secondary_state: Arc, + last_download: Option, + target_time: Option, + period: Option, +} + +impl scheduler::PendingJob for PendingDownload { + fn get_tenant_shard_id(&self) -> &TenantShardId { + self.secondary_state.get_tenant_shard_id() + } +} + +struct RunningDownload { + barrier: Barrier, +} + +impl scheduler::RunningJob for RunningDownload { + fn get_barrier(&self) -> Barrier { + self.barrier.clone() + } +} + +struct CompleteDownload { + secondary_state: Arc, + completed_at: Instant, +} + +impl scheduler::Completion for CompleteDownload { + fn get_tenant_shard_id(&self) -> &TenantShardId { + self.secondary_state.get_tenant_shard_id() + } +} + +type Scheduler = TenantBackgroundJobs< + SecondaryDownloader, + PendingDownload, + RunningDownload, + CompleteDownload, + DownloadCommand, +>; + +impl JobGenerator + for SecondaryDownloader +{ + #[instrument(skip_all, fields(tenant_id=%completion.get_tenant_shard_id().tenant_id, shard_id=%completion.get_tenant_shard_id().shard_slug()))] + fn on_completion(&mut self, completion: CompleteDownload) { + let CompleteDownload { + secondary_state, + completed_at: _completed_at, + } = completion; + + tracing::debug!("Secondary tenant download completed"); + + // Update freshened_at even if there was an error: we don't want 
errored tenants to implicitly + // take priority to run again. + let mut detail = secondary_state.detail.lock().unwrap(); + detail.next_download = Some(Instant::now() + DOWNLOAD_FRESHEN_INTERVAL); + } + + async fn schedule(&mut self) -> SchedulingResult { + let mut result = SchedulingResult { + jobs: Vec::new(), + want_interval: None, + }; + + // Step 1: identify some tenants that we may work on + let mut tenants: Vec> = Vec::new(); + self.tenant_manager + .foreach_secondary_tenants(|_id, secondary_state| { + tenants.push(secondary_state.clone()); + }); + + // Step 2: filter out tenants which are not yet eligible to run (next_download still in the future) + let now = Instant::now(); + result.jobs = tenants + .into_iter() + .filter_map(|secondary_tenant| { + let (last_download, next_download) = { + let mut detail = secondary_tenant.detail.lock().unwrap(); + + if !detail.config.warm { + // Downloads are disabled for this tenant + detail.next_download = None; + return None; + } + + if detail.next_download.is_none() { + // Initialize with a jitter: this spreads initial downloads on startup + // or mass-attach across our freshen interval. + let jittered_period = + rand::thread_rng().gen_range(Duration::ZERO..DOWNLOAD_FRESHEN_INTERVAL); + detail.next_download = Some(now.checked_add(jittered_period).expect( + "Using our constant, which is known to be small compared with clock range", + )); + } + (detail.last_download, detail.next_download.unwrap()) + }; + + if now >= next_download { + Some(PendingDownload { + secondary_state: secondary_tenant, + last_download, + target_time: Some(next_download), + period: Some(DOWNLOAD_FRESHEN_INTERVAL), + }) + } else { + None + } + }) + .collect(); + + // Step 3: sort by target execution time to run most urgent first. 
+ result.jobs.sort_by_key(|j| j.target_time); + + result + } + + fn on_command(&mut self, command: DownloadCommand) -> anyhow::Result { + let tenant_shard_id = command.get_tenant_shard_id(); + + let tenant = self + .tenant_manager + .get_secondary_tenant_shard(*tenant_shard_id); + let Some(tenant) = tenant else { + { + return Err(anyhow::anyhow!("Not found or not in Secondary mode")); + } + }; + + Ok(PendingDownload { + target_time: None, + period: None, + last_download: None, + secondary_state: tenant, + }) + } + + fn spawn( + &mut self, + job: PendingDownload, + ) -> ( + RunningDownload, + Pin + Send>>, + ) { + let PendingDownload { + secondary_state, + last_download, + target_time, + period, + } = job; + + let (completion, barrier) = utils::completion::channel(); + let remote_storage = self.remote_storage.clone(); + let conf = self.tenant_manager.get_conf(); + let tenant_shard_id = *secondary_state.get_tenant_shard_id(); + (RunningDownload { barrier }, Box::pin(async move { + let _completion = completion; + + match TenantDownloader::new(conf, &remote_storage, &secondary_state) + .download() + .await + { + Err(UpdateError::NoData) => { + tracing::info!("No heatmap found for tenant. This is fine if it is new."); + }, + Err(UpdateError::NoSpace) => { + tracing::warn!("Insufficient space while downloading. Will retry later."); + } + Err(UpdateError::Cancelled) => { + tracing::debug!("Shut down while downloading"); + }, + Err(UpdateError::Deserialize(e)) => { + tracing::error!("Corrupt content while downloading tenant: {e}"); + }, + Err(e @ (UpdateError::DownloadError(_) | UpdateError::Other(_))) => { + tracing::error!("Error while downloading tenant: {e}"); + }, + Ok(()) => {} + }; + + // Irrespective of the result, we will reschedule ourselves to run after our usual period. + + // If the job had a target execution time, we may check our final execution + // time against that for observability purposes. 
+ if let (Some(target_time), Some(period)) = (target_time, period) { + // Only track execution lag if this isn't our first download: otherwise, it is expected + // that execution will have taken longer than our configured interval, for example + // when starting up a pageserver. + if last_download.is_some() { + // Elapsed time includes any scheduling lag as well as the execution of the job + let elapsed = Instant::now().duration_since(target_time); + + warn_when_period_overrun( + elapsed, + period, + BackgroundLoopKind::SecondaryDownload, + ); + } + } + + CompleteDownload { + secondary_state, + completed_at: Instant::now(), + } + }.instrument(info_span!(parent: None, "secondary_download", tenant_id=%tenant_shard_id.tenant_id, shard_id=%tenant_shard_id.shard_slug())))) + } +} + +/// This type is a convenience to group together the various functions involved in +/// freshening a secondary tenant. +struct TenantDownloader<'a> { + conf: &'static PageServerConf, + remote_storage: &'a GenericRemoteStorage, + secondary_state: &'a SecondaryTenant, +} + +/// Errors that may be encountered while updating a tenant +#[derive(thiserror::Error, Debug)] +enum UpdateError { + #[error("No remote data found")] + NoData, + #[error("Insufficient local storage space")] + NoSpace, + #[error("Failed to download")] + DownloadError(DownloadError), + #[error(transparent)] + Deserialize(#[from] serde_json::Error), + #[error("Cancelled")] + Cancelled, + #[error(transparent)] + Other(#[from] anyhow::Error), +} + +impl From for UpdateError { + fn from(value: DownloadError) -> Self { + match &value { + DownloadError::Cancelled => Self::Cancelled, + DownloadError::NotFound => Self::NoData, + _ => Self::DownloadError(value), + } + } +} + +impl From for UpdateError { + fn from(value: std::io::Error) -> Self { + if let Some(nix::errno::Errno::ENOSPC) = value.raw_os_error().map(nix::errno::from_i32) { + UpdateError::NoSpace + } else { + // An I/O error from e.g. 
tokio::io::copy is most likely a remote storage issue + UpdateError::Other(anyhow::anyhow!(value)) + } + } +} + +impl<'a> TenantDownloader<'a> { + fn new( + conf: &'static PageServerConf, + remote_storage: &'a GenericRemoteStorage, + secondary_state: &'a SecondaryTenant, + ) -> Self { + Self { + conf, + remote_storage, + secondary_state, + } + } + + async fn download(&self) -> Result<(), UpdateError> { + debug_assert_current_span_has_tenant_id(); + + // For the duration of a download, we must hold the SecondaryTenant::gate, to ensure + // cover our access to local storage. + let Ok(_guard) = self.secondary_state.gate.enter() else { + // Shutting down + return Ok(()); + }; + + let tenant_shard_id = self.secondary_state.get_tenant_shard_id(); + // Download the tenant's heatmap + let heatmap_bytes = tokio::select!( + bytes = self.download_heatmap() => {bytes?}, + _ = self.secondary_state.cancel.cancelled() => return Ok(()) + ); + + let heatmap = serde_json::from_slice::(&heatmap_bytes)?; + + // Save the heatmap: this will be useful on restart, allowing us to reconstruct + // layer metadata without having to re-download it. 
+ let heatmap_path = self.conf.tenant_heatmap_path(tenant_shard_id); + + let temp_path = path_with_suffix_extension(&heatmap_path, TEMP_FILE_SUFFIX); + let context_msg = format!("write tenant {tenant_shard_id} heatmap to {heatmap_path}"); + let heatmap_path_bg = heatmap_path.clone(); + tokio::task::spawn_blocking(move || { + tokio::runtime::Handle::current().block_on(async move { + VirtualFile::crashsafe_overwrite(&heatmap_path_bg, &temp_path, &heatmap_bytes).await + }) + }) + .await + .expect("Blocking task is never aborted") + .maybe_fatal_err(&context_msg)?; + + tracing::debug!("Wrote local heatmap to {}", heatmap_path); + + // Download the layers in the heatmap + for timeline in heatmap.timelines { + if self.secondary_state.cancel.is_cancelled() { + return Ok(()); + } + + let timeline_id = timeline.timeline_id; + self.download_timeline(timeline) + .instrument(tracing::info_span!( + "secondary_download_timeline", + tenant_id=%tenant_shard_id.tenant_id, + shard_id=%tenant_shard_id.shard_slug(), + %timeline_id + )) + .await?; + } + + Ok(()) + } + + async fn download_heatmap(&self) -> Result, UpdateError> { + debug_assert_current_span_has_tenant_id(); + let tenant_shard_id = self.secondary_state.get_tenant_shard_id(); + // TODO: make download conditional on ETag having changed since last download + // (https://github.com/neondatabase/neon/issues/6199) + tracing::debug!("Downloading heatmap for secondary tenant",); + + let heatmap_path = remote_heatmap_path(tenant_shard_id); + + let heatmap_bytes = backoff::retry( + || async { + let download = self + .remote_storage + .download(&heatmap_path) + .await + .map_err(UpdateError::from)?; + let mut heatmap_bytes = Vec::new(); + let mut body = tokio_util::io::StreamReader::new(download.download_stream); + let _size = tokio::io::copy(&mut body, &mut heatmap_bytes).await?; + Ok(heatmap_bytes) + }, + |e| matches!(e, UpdateError::NoData | UpdateError::Cancelled), + FAILED_DOWNLOAD_WARN_THRESHOLD, + FAILED_REMOTE_OP_RETRIES, + 
"download heatmap", + backoff::Cancel::new(self.secondary_state.cancel.clone(), || { + UpdateError::Cancelled + }), + ) + .await?; + + SECONDARY_MODE.download_heatmap.inc(); + + Ok(heatmap_bytes) + } + + async fn download_timeline(&self, timeline: HeatMapTimeline) -> Result<(), UpdateError> { + debug_assert_current_span_has_tenant_and_timeline_id(); + let tenant_shard_id = self.secondary_state.get_tenant_shard_id(); + let timeline_path = self + .conf + .timeline_path(tenant_shard_id, &timeline.timeline_id); + + // Accumulate updates to the state + let mut touched = Vec::new(); + + // Clone a view of what layers already exist on disk + let timeline_state = self + .secondary_state + .detail + .lock() + .unwrap() + .timelines + .get(&timeline.timeline_id) + .cloned(); + + let timeline_state = match timeline_state { + Some(t) => t, + None => { + // We have no existing state: need to scan local disk for layers first. + let timeline_state = + init_timeline_state(self.conf, tenant_shard_id, &timeline).await; + + // Re-acquire detail lock now that we're done with async load from local FS + self.secondary_state + .detail + .lock() + .unwrap() + .timelines + .insert(timeline.timeline_id, timeline_state.clone()); + timeline_state + } + }; + + let layers_in_heatmap = timeline + .layers + .iter() + .map(|l| &l.name) + .collect::>(); + let layers_on_disk = timeline_state + .on_disk_layers + .iter() + .map(|l| l.0) + .collect::>(); + + // Remove on-disk layers that are no longer present in heatmap + for layer in layers_on_disk.difference(&layers_in_heatmap) { + let local_path = timeline_path.join(layer.to_string()); + tracing::info!("Removing secondary local layer {layer} because it's absent in heatmap",); + tokio::fs::remove_file(&local_path) + .await + .or_else(fs_ext::ignore_not_found) + .maybe_fatal_err("Removing secondary layer")?; + } + + // Download heatmap layers that are not present on local disk, or update their + // access time if they are already present. 
+ for layer in timeline.layers { + if self.secondary_state.cancel.is_cancelled() { + return Ok(()); + } + + // Existing on-disk layers: just update their access time. + if let Some(on_disk) = timeline_state.on_disk_layers.get(&layer.name) { + tracing::debug!("Layer {} is already on disk", layer.name); + if on_disk.metadata != LayerFileMetadata::from(&layer.metadata) + || on_disk.access_time != layer.access_time + { + // We already have this layer on disk. Update its access time. + tracing::debug!( + "Access time updated for layer {}: {} -> {}", + layer.name, + strftime(&on_disk.access_time), + strftime(&layer.access_time) + ); + touched.push(layer); + } + continue; + } else { + tracing::debug!("Layer {} not present on disk yet", layer.name); + } + + // Eviction: if we evicted a layer, then do not re-download it unless it was accessed more + // recently than it was evicted. + if let Some(evicted_at) = timeline_state.evicted_at.get(&layer.name) { + if &layer.access_time > evicted_at { + tracing::info!( + "Re-downloading evicted layer {}, accessed at {}, evicted at {}", + layer.name, + strftime(&layer.access_time), + strftime(evicted_at) + ); + } else { + tracing::trace!( + "Not re-downloading evicted layer {}, accessed at {}, evicted at {}", + layer.name, + strftime(&layer.access_time), + strftime(evicted_at) + ); + continue; + } + } + + // Note: no backoff::retry wrapper here because download_layer_file does its own retries internally + let downloaded_bytes = match download_layer_file( + self.conf, + self.remote_storage, + *tenant_shard_id, + timeline.timeline_id, + &layer.name, + &LayerFileMetadata::from(&layer.metadata), + &self.secondary_state.cancel, + ) + .await + { + Ok(bytes) => bytes, + Err(e) => { + if let DownloadError::NotFound = e { + // A heatmap might be out of date and refer to a layer that doesn't exist any more. + // This is harmless: continue to download the next layer. It is expected during compaction + // GC. 
+ tracing::debug!( + "Skipped downloading missing layer {}, raced with compaction/gc?", + layer.name + ); + continue; + } else { + return Err(e.into()); + } + } + }; + + if downloaded_bytes != layer.metadata.file_size { + let local_path = timeline_path.join(layer.name.to_string()); + + tracing::warn!( + "Downloaded layer {} with unexpected size {} != {}. Removing download.", + layer.name, + downloaded_bytes, + layer.metadata.file_size + ); + + tokio::fs::remove_file(&local_path) + .await + .or_else(fs_ext::ignore_not_found)?; + } + + SECONDARY_MODE.download_layer.inc(); + touched.push(layer) + } + + // Write updates to state to record layers we just downloaded or touched. + { + let mut detail = self.secondary_state.detail.lock().unwrap(); + let timeline_detail = detail.timelines.entry(timeline.timeline_id).or_default(); + + tracing::info!("Wrote timeline_detail for {} touched layers", touched.len()); + + for t in touched { + use std::collections::hash_map::Entry; + match timeline_detail.on_disk_layers.entry(t.name.clone()) { + Entry::Occupied(mut v) => { + v.get_mut().access_time = t.access_time; + } + Entry::Vacant(e) => { + e.insert(OnDiskState::new( + self.conf, + tenant_shard_id, + &timeline.timeline_id, + t.name, + LayerFileMetadata::from(&t.metadata), + t.access_time, + )); + } + } + } + } + + Ok(()) + } +} + +/// Scan local storage and build up Layer objects based on the metadata in a HeatMapTimeline +async fn init_timeline_state( + conf: &'static PageServerConf, + tenant_shard_id: &TenantShardId, + heatmap: &HeatMapTimeline, +) -> SecondaryDetailTimeline { + let timeline_path = conf.timeline_path(tenant_shard_id, &heatmap.timeline_id); + let mut detail = SecondaryDetailTimeline::default(); + + let mut dir = match tokio::fs::read_dir(&timeline_path).await { + Ok(d) => d, + Err(e) => { + if e.kind() == std::io::ErrorKind::NotFound { + let context = format!("Creating timeline directory {timeline_path}"); + tracing::info!("{}", context); + 
tokio::fs::create_dir_all(&timeline_path) + .await + .fatal_err(&context); + + // No entries to report: drop out. + return detail; + } else { + on_fatal_io_error(&e, &format!("Reading timeline dir {timeline_path}")); + } + } + }; + + // As we iterate through layers found on disk, we will look up their metadata from this map. + // Layers not present in metadata will be discarded. + let heatmap_metadata: HashMap<&LayerFileName, &HeatMapLayer> = + heatmap.layers.iter().map(|l| (&l.name, l)).collect(); + + while let Some(dentry) = dir + .next_entry() + .await + .fatal_err(&format!("Listing {timeline_path}")) + { + let dentry_file_name = dentry.file_name(); + let file_name = dentry_file_name.to_string_lossy(); + let local_meta = dentry.metadata().await.fatal_err(&format!( + "Read metadata on {}", + dentry.path().to_string_lossy() + )); + + // Secondary mode doesn't use local metadata files, but they might have been left behind by an attached tenant. + if file_name == METADATA_FILE_NAME { + continue; + } + + match LayerFileName::from_str(&file_name) { + Ok(name) => { + let remote_meta = heatmap_metadata.get(&name); + match remote_meta { + Some(remote_meta) => { + // TODO: checksums for layers (https://github.com/neondatabase/neon/issues/2784) + if local_meta.len() != remote_meta.metadata.file_size { + // This should not happen, because we do crashsafe write-then-rename when downloading + // layers, and layers in remote storage are immutable. Remove the local file because + // we cannot trust it. + tracing::warn!( + "Removing local layer {name} with unexpected local size {} != {}", + local_meta.len(), + remote_meta.metadata.file_size + ); + } else { + // We expect the access time to be initialized immediately afterwards, when + // the latest heatmap is applied to the state. 
+ detail.on_disk_layers.insert( + name.clone(), + OnDiskState::new( + conf, + tenant_shard_id, + &heatmap.timeline_id, + name, + LayerFileMetadata::from(&remote_meta.metadata), + remote_meta.access_time, + ), + ); + } + } + None => { + // FIXME: consider some optimization when transitioning from attached to secondary: maybe + // wait until we have seen a heatmap that is more recent than the most recent on-disk state? Otherwise + // we will end up deleting any layers which were created+uploaded more recently than the heatmap. + tracing::info!( + "Removing secondary local layer {} because it's absent in heatmap", + name + ); + tokio::fs::remove_file(&dentry.path()) + .await + .or_else(fs_ext::ignore_not_found) + .fatal_err(&format!( + "Removing layer {}", + dentry.path().to_string_lossy() + )); + } + } + } + Err(_) => { + // Ignore it. + tracing::warn!("Unexpected file in timeline directory: {file_name}"); + } + } + } + + detail +} diff --git a/pageserver/src/tenant/secondary/heatmap_uploader.rs b/pageserver/src/tenant/secondary/heatmap_uploader.rs index ece2b93ce1..df865658a4 100644 --- a/pageserver/src/tenant/secondary/heatmap_uploader.rs +++ b/pageserver/src/tenant/secondary/heatmap_uploader.rs @@ -1,5 +1,6 @@ use std::{ collections::HashMap, + pin::Pin, sync::{Arc, Weak}, time::{Duration, Instant}, }; @@ -7,35 +8,86 @@ use std::{ use crate::{ metrics::SECONDARY_MODE, tenant::{ - config::AttachmentMode, mgr::TenantManager, remote_timeline_client::remote_heatmap_path, - secondary::CommandResponse, span::debug_assert_current_span_has_tenant_id, Tenant, + config::AttachmentMode, + mgr::TenantManager, + remote_timeline_client::remote_heatmap_path, + span::debug_assert_current_span_has_tenant_id, + tasks::{warn_when_period_overrun, BackgroundLoopKind}, + Tenant, }, }; +use futures::Future; use md5; use pageserver_api::shard::TenantShardId; +use rand::Rng; use remote_storage::GenericRemoteStorage; -use tokio::task::JoinSet; +use super::{ + scheduler::{self, 
JobGenerator, RunningJob, SchedulingResult, TenantBackgroundJobs}, + CommandRequest, +}; use tokio_util::sync::CancellationToken; -use tracing::instrument; -use utils::{backoff, completion::Barrier}; +use tracing::{info_span, instrument, Instrument}; +use utils::{backoff, completion::Barrier, yielding_loop::yielding_loop}; -use super::{heatmap::HeatMapTenant, CommandRequest, UploadCommand}; +use super::{heatmap::HeatMapTenant, UploadCommand}; -/// Period between heatmap uploader walking Tenants to look for work to do. -/// If any tenants have a heatmap upload period lower than this, it will be adjusted -/// downward to match. -const DEFAULT_SCHEDULING_INTERVAL: Duration = Duration::from_millis(60000); -const MIN_SCHEDULING_INTERVAL: Duration = Duration::from_millis(1000); +pub(super) async fn heatmap_uploader_task( + tenant_manager: Arc, + remote_storage: GenericRemoteStorage, + command_queue: tokio::sync::mpsc::Receiver>, + background_jobs_can_start: Barrier, + cancel: CancellationToken, +) { + let concurrency = tenant_manager.get_conf().heatmap_upload_concurrency; + + let generator = HeatmapUploader { + tenant_manager, + remote_storage, + cancel: cancel.clone(), + tenants: HashMap::new(), + }; + let mut scheduler = Scheduler::new(generator, concurrency); + + scheduler + .run(command_queue, background_jobs_can_start, cancel) + .instrument(info_span!("heatmap_uploader")) + .await +} + +/// This type is owned by a single task ([`heatmap_uploader_task`]) which runs an event +/// handling loop and mutates it as needed: there are no locks here, because that event loop +/// can hold &mut references to this type throughout. 
+struct HeatmapUploader { + tenant_manager: Arc, + remote_storage: GenericRemoteStorage, + cancel: CancellationToken, + + tenants: HashMap, +} struct WriteInProgress { barrier: Barrier, } +impl RunningJob for WriteInProgress { + fn get_barrier(&self) -> Barrier { + self.barrier.clone() + } +} + struct UploadPending { tenant: Arc, last_digest: Option, + target_time: Option, + period: Option, +} + +impl scheduler::PendingJob for UploadPending { + fn get_tenant_shard_id(&self) -> &TenantShardId { + self.tenant.get_tenant_shard_id() + } } struct WriteComplete { @@ -45,6 +97,12 @@ struct WriteComplete { next_upload: Option, } +impl scheduler::Completion for WriteComplete { + fn get_tenant_shard_id(&self) -> &TenantShardId { + &self.tenant_shard_id + } +} + /// The heatmap uploader keeps a little bit of per-tenant state, mainly to remember /// when we last did a write. We only populate this after doing at least one /// write for a tenant -- this avoids holding state for tenants that have @@ -68,267 +126,110 @@ struct UploaderTenantState { next_upload: Option, } -/// This type is owned by a single task ([`heatmap_uploader_task`]) which runs an event -/// handling loop and mutates it as needed: there are no locks here, because that event loop -/// can hold &mut references to this type throughout. -struct HeatmapUploader { - tenant_manager: Arc, - remote_storage: GenericRemoteStorage, - cancel: CancellationToken, +type Scheduler = TenantBackgroundJobs< + HeatmapUploader, + UploadPending, + WriteInProgress, + WriteComplete, + UploadCommand, +>; - tenants: HashMap, - - /// Tenants with work to do, for which tasks should be spawned as soon as concurrency - /// limits permit it. - tenants_pending: std::collections::VecDeque, - - /// Tenants for which a task in `tasks` has been spawned. 
- tenants_uploading: HashMap, - - tasks: JoinSet<()>, - - /// Channel for our child tasks to send results to: we use a channel for results rather than - /// just getting task results via JoinSet because we need the channel's recv() "sleep until something - /// is available" semantic, rather than JoinSet::join_next()'s "sleep until next thing is available _or_ I'm empty" - /// behavior. - task_result_tx: tokio::sync::mpsc::UnboundedSender, - task_result_rx: tokio::sync::mpsc::UnboundedReceiver, - - concurrent_uploads: usize, - - scheduling_interval: Duration, -} - -/// The uploader task runs a loop that periodically wakes up and schedules tasks for -/// tenants that require an upload, or handles any commands that have been sent into -/// `command_queue`. No I/O is done in this loop: that all happens in the tasks we -/// spawn. -/// -/// Scheduling iterations are somewhat infrequent. However, each one will enqueue -/// all tenants that require an upload, and in between scheduling iterations we will -/// continue to spawn new tasks for pending tenants, as our concurrency limit permits. -/// -/// While we take a CancellationToken here, it is subordinate to the CancellationTokens -/// of tenants: i.e. we expect all Tenants to have been shut down before we are shut down, otherwise -/// we might block waiting on a Tenant. 
-pub(super) async fn heatmap_uploader_task( - tenant_manager: Arc, - remote_storage: GenericRemoteStorage, - mut command_queue: tokio::sync::mpsc::Receiver>, - background_jobs_can_start: Barrier, - cancel: CancellationToken, -) -> anyhow::Result<()> { - let concurrent_uploads = tenant_manager.get_conf().heatmap_upload_concurrency; - - let (result_tx, result_rx) = tokio::sync::mpsc::unbounded_channel(); - - let mut uploader = HeatmapUploader { - tenant_manager, - remote_storage, - cancel: cancel.clone(), - tasks: JoinSet::new(), - tenants: HashMap::new(), - tenants_pending: std::collections::VecDeque::new(), - tenants_uploading: HashMap::new(), - task_result_tx: result_tx, - task_result_rx: result_rx, - concurrent_uploads, - scheduling_interval: DEFAULT_SCHEDULING_INTERVAL, - }; - - tracing::info!("Waiting for background_jobs_can start..."); - background_jobs_can_start.wait().await; - tracing::info!("background_jobs_can is ready, proceeding."); - - while !cancel.is_cancelled() { - // Look for new work: this is relatively expensive because we have to go acquire the lock on - // the tenant manager to retrieve tenants, and then iterate over them to figure out which ones - // require an upload. - uploader.schedule_iteration().await?; - - // Between scheduling iterations, we will: - // - Drain any complete tasks and spawn pending tasks - // - Handle incoming administrative commands - // - Check our cancellation token - let next_scheduling_iteration = Instant::now() - .checked_add(uploader.scheduling_interval) - .unwrap_or_else(|| { - tracing::warn!( - "Scheduling interval invalid ({}s), running immediately!", - uploader.scheduling_interval.as_secs_f64() - ); - Instant::now() - }); - loop { - tokio::select! { - _ = cancel.cancelled() => { - // We do not simply drop the JoinSet, in order to have an orderly shutdown without cancellation. 
- tracing::info!("Heatmap uploader joining tasks"); - while let Some(_r) = uploader.tasks.join_next().await {}; - tracing::info!("Heatmap uploader terminating"); - - break; - }, - _ = tokio::time::sleep(next_scheduling_iteration.duration_since(Instant::now())) => { - tracing::debug!("heatmap_uploader_task: woke for scheduling interval"); - break;}, - cmd = command_queue.recv() => { - tracing::debug!("heatmap_uploader_task: woke for command queue"); - let cmd = match cmd { - Some(c) =>c, - None => { - // SecondaryController was destroyed, and this has raced with - // our CancellationToken - tracing::info!("Heatmap uploader terminating"); - cancel.cancel(); - break; - } - }; - - let CommandRequest{ - response_tx, - payload - } = cmd; - uploader.handle_command(payload, response_tx); - }, - _ = uploader.process_next_completion() => { - if !cancel.is_cancelled() { - uploader.spawn_pending(); - } - } - } - } - } - - Ok(()) -} - -impl HeatmapUploader { - /// Periodic execution phase: inspect all attached tenants and schedule any work they require. - async fn schedule_iteration(&mut self) -> anyhow::Result<()> { +impl JobGenerator + for HeatmapUploader +{ + async fn schedule(&mut self) -> SchedulingResult { // Cull any entries in self.tenants whose Arc is gone self.tenants .retain(|_k, v| v.tenant.upgrade().is_some() && v.next_upload.is_some()); - // The priority order of previously scheduled work may be invalidated by current state: drop - // all pending work (it will be re-scheduled if still needed) - self.tenants_pending.clear(); - - // Used a fixed 'now' through the following loop, for efficiency and fairness. let now = Instant::now(); - // While iterating over the potentially-long list of tenants, we will periodically yield - // to avoid blocking executor. - const YIELD_ITERATIONS: usize = 1000; + let mut result = SchedulingResult { + jobs: Vec::new(), + want_interval: None, + }; - // Iterate over tenants looking for work to do. 
let tenants = self.tenant_manager.get_attached_active_tenant_shards(); - for (i, tenant) in tenants.into_iter().enumerate() { - // Process is shutting down, drop out - if self.cancel.is_cancelled() { - return Ok(()); - } - // Skip tenants that already have a write in flight - if self - .tenants_uploading - .contains_key(tenant.get_tenant_shard_id()) - { - continue; - } + yielding_loop(1000, &self.cancel, tenants.into_iter(), |tenant| { + let period = match tenant.get_heatmap_period() { + None => { + // Heatmaps are disabled for this tenant + return; + } + Some(period) => { + // If any tenant has asked for uploads more frequent than our scheduling interval, + // reduce it to match so that we can keep up. This is mainly useful in testing, where + // we may set rather short intervals. + result.want_interval = match result.want_interval { + None => Some(period), + Some(existing) => Some(std::cmp::min(period, existing)), + }; - self.maybe_schedule_upload(&now, tenant); + period + } + }; - if i + 1 % YIELD_ITERATIONS == 0 { - tokio::task::yield_now().await; - } - } - - // Spawn tasks for as many of our pending tenants as we can. - self.spawn_pending(); - - Ok(()) - } - - /// - /// Cancellation: this method is cancel-safe. - async fn process_next_completion(&mut self) { - match self.task_result_rx.recv().await { - Some(r) => { - self.on_completion(r); - } - None => { - unreachable!("Result sender is stored on Self"); - } - } - } - - /// The 'maybe' refers to the tenant's state: whether it is configured - /// for heatmap uploads at all, and whether sufficient time has passed - /// since the last upload. 
- fn maybe_schedule_upload(&mut self, now: &Instant, tenant: Arc) { - match tenant.get_heatmap_period() { - None => { - // Heatmaps are disabled for this tenant + // Stale attachments do not upload anything: if we are in this state, there is probably some + // other attachment in mode Single or Multi running on another pageserver, and we don't + // want to thrash and overwrite their heatmap uploads. + if tenant.get_attach_mode() == AttachmentMode::Stale { return; } - Some(period) => { - // If any tenant has asked for uploads more frequent than our scheduling interval, - // reduce it to match so that we can keep up. This is mainly useful in testing, where - // we may set rather short intervals. - if period < self.scheduling_interval { - self.scheduling_interval = std::cmp::max(period, MIN_SCHEDULING_INTERVAL); - } + + // Create an entry in self.tenants if one doesn't already exist: this will later be updated + // with the completion time in on_completion. + let state = self + .tenants + .entry(*tenant.get_tenant_shard_id()) + .or_insert_with(|| { + let jittered_period = rand::thread_rng().gen_range(Duration::ZERO..period); + + UploaderTenantState { + tenant: Arc::downgrade(&tenant), + last_upload: None, + next_upload: Some(now.checked_add(jittered_period).unwrap_or(now)), + last_digest: None, + } + }); + + // Decline to do the upload if insufficient time has passed + if state.next_upload.map(|nu| nu > now).unwrap_or(false) { + return; } - } - // Stale attachments do not upload anything: if we are in this state, there is probably some - // other attachment in mode Single or Multi running on another pageserver, and we don't - // want to thrash and overwrite their heatmap uploads. - if tenant.get_attach_mode() == AttachmentMode::Stale { - return; - } - - // Create an entry in self.tenants if one doesn't already exist: this will later be updated - // with the completion time in on_completion. 
- let state = self - .tenants - .entry(*tenant.get_tenant_shard_id()) - .or_insert_with(|| UploaderTenantState { - tenant: Arc::downgrade(&tenant), - last_upload: None, - next_upload: Some(Instant::now()), - last_digest: None, + let last_digest = state.last_digest; + result.jobs.push(UploadPending { + tenant, + last_digest, + target_time: state.next_upload, + period: Some(period), }); + }) + .await + .ok(); - // Decline to do the upload if insufficient time has passed - if state.next_upload.map(|nu| &nu > now).unwrap_or(false) { - return; - } + result + } - let last_digest = state.last_digest; - self.tenants_pending.push_back(UploadPending { + fn spawn( + &mut self, + job: UploadPending, + ) -> ( + WriteInProgress, + Pin + Send>>, + ) { + let UploadPending { tenant, last_digest, - }) - } + target_time, + period, + } = job; - fn spawn_pending(&mut self) { - while !self.tenants_pending.is_empty() - && self.tenants_uploading.len() < self.concurrent_uploads - { - // unwrap: loop condition includes !is_empty() - let pending = self.tenants_pending.pop_front().unwrap(); - self.spawn_upload(pending.tenant, pending.last_digest); - } - } - - fn spawn_upload(&mut self, tenant: Arc, last_digest: Option) { let remote_storage = self.remote_storage.clone(); - let tenant_shard_id = *tenant.get_tenant_shard_id(); let (completion, barrier) = utils::completion::channel(); - let result_tx = self.task_result_tx.clone(); - self.tasks.spawn(async move { + let tenant_shard_id = *tenant.get_tenant_shard_id(); + (WriteInProgress { barrier }, Box::pin(async move { // Guard for the barrier in [`WriteInProgress`] let _completion = completion; @@ -362,22 +263,47 @@ impl HeatmapUploader { }; let now = Instant::now(); + + // If the job had a target execution time, we may check our final execution + // time against that for observability purposes. 
+ if let (Some(target_time), Some(period)) = (target_time, period) { + // Elapsed time includes any scheduling lag as well as the execution of the job + let elapsed = now.duration_since(target_time); + + warn_when_period_overrun(elapsed, period, BackgroundLoopKind::HeatmapUpload); + } + let next_upload = tenant .get_heatmap_period() .and_then(|period| now.checked_add(period)); - result_tx - .send(WriteComplete { + WriteComplete { tenant_shard_id: *tenant.get_tenant_shard_id(), completed_at: now, digest, next_upload, - }) - .ok(); - }); + } + }.instrument(info_span!(parent: None, "heatmap_upload", tenant_id=%tenant_shard_id.tenant_id, shard_id=%tenant_shard_id.shard_slug())))) + } - self.tenants_uploading - .insert(tenant_shard_id, WriteInProgress { barrier }); + fn on_command(&mut self, command: UploadCommand) -> anyhow::Result { + let tenant_shard_id = command.get_tenant_shard_id(); + + tracing::info!( + tenant_id=%tenant_shard_id.tenant_id, shard_id=%tenant_shard_id.shard_slug(), + "Starting heatmap write on command"); + let tenant = self + .tenant_manager + .get_attached_tenant_shard(*tenant_shard_id, true) + .map_err(|e| anyhow::anyhow!(e))?; + + Ok(UploadPending { + // Ignore our state for last digest: this forces an upload even if nothing has changed + last_digest: None, + tenant, + target_time: None, + period: None, + }) } #[instrument(skip_all, fields(tenant_id=%completion.tenant_shard_id.tenant_id, shard_id=%completion.tenant_shard_id.shard_slug()))] @@ -389,7 +315,6 @@ impl HeatmapUploader { digest, next_upload, } = completion; - self.tenants_uploading.remove(&tenant_shard_id); use std::collections::hash_map::Entry; match self.tenants.entry(tenant_shard_id) { Entry::Vacant(_) => { @@ -402,69 +327,6 @@ impl HeatmapUploader { } } } - - fn handle_command( - &mut self, - command: UploadCommand, - response_tx: tokio::sync::oneshot::Sender, - ) { - match command { - UploadCommand::Upload(tenant_shard_id) => { - // If an upload was ongoing for this tenant, let 
it finish first. - let barrier = if let Some(writing_state) = - self.tenants_uploading.get(&tenant_shard_id) - { - tracing::info!( - tenant_id=%tenant_shard_id.tenant_id, shard_id=%tenant_shard_id.shard_slug(), - "Waiting for heatmap write to complete"); - writing_state.barrier.clone() - } else { - // Spawn the upload then immediately wait for it. This will block processing of other commands and - // starting of other background work. - tracing::info!( - tenant_id=%tenant_shard_id.tenant_id, shard_id=%tenant_shard_id.shard_slug(), - "Starting heatmap write on command"); - let tenant = match self - .tenant_manager - .get_attached_tenant_shard(tenant_shard_id, true) - { - Ok(t) => t, - Err(e) => { - // Drop result of send: we don't care if caller dropped their receiver - drop(response_tx.send(CommandResponse { - result: Err(e.into()), - })); - return; - } - }; - self.spawn_upload(tenant, None); - let writing_state = self - .tenants_uploading - .get(&tenant_shard_id) - .expect("We just inserted this"); - tracing::info!( - tenant_id=%tenant_shard_id.tenant_id, shard_id=%tenant_shard_id.shard_slug(), - "Waiting for heatmap upload to complete"); - - writing_state.barrier.clone() - }; - - // This task does no I/O: it only listens for a barrier's completion and then - // sends to the command response channel. It is therefore safe to spawn this without - // any gates/task_mgr hooks. - tokio::task::spawn(async move { - barrier.wait().await; - - tracing::info!( - tenant_id=%tenant_shard_id.tenant_id, shard_id=%tenant_shard_id.shard_slug(), - "Heatmap upload complete"); - - // Drop result of send: we don't care if caller dropped their receiver - drop(response_tx.send(CommandResponse { result: Ok(()) })) - }); - } - } - } } enum UploadHeatmapOutcome { @@ -487,7 +349,6 @@ enum UploadHeatmapError { /// The inner upload operation. This will skip if `last_digest` is Some and matches the digest /// of the object we would have uploaded. 
-#[instrument(skip_all, fields(tenant_id = %tenant.get_tenant_shard_id().tenant_id, shard_id = %tenant.get_tenant_shard_id().shard_slug()))] async fn upload_tenant_heatmap( remote_storage: GenericRemoteStorage, tenant: &Arc, diff --git a/pageserver/src/tenant/secondary/scheduler.rs b/pageserver/src/tenant/secondary/scheduler.rs new file mode 100644 index 0000000000..58bdb54161 --- /dev/null +++ b/pageserver/src/tenant/secondary/scheduler.rs @@ -0,0 +1,359 @@ +use futures::Future; +use std::{ + collections::HashMap, + marker::PhantomData, + pin::Pin, + time::{Duration, Instant}, +}; + +use pageserver_api::shard::TenantShardId; +use tokio::task::JoinSet; +use tokio_util::sync::CancellationToken; +use utils::{completion::Barrier, yielding_loop::yielding_loop}; + +use super::{CommandRequest, CommandResponse}; + +/// Scheduling interval is the time between calls to JobGenerator::schedule. +/// When we schedule jobs, the job generator may provide a hint of its preferred +/// interval, which we will respect within these intervals. +const MAX_SCHEDULING_INTERVAL: Duration = Duration::from_secs(10); +const MIN_SCHEDULING_INTERVAL: Duration = Duration::from_secs(1); + +/// Scheduling helper for background work across many tenants. +/// +/// Systems that need to run background work across many tenants may use this type +/// to schedule jobs within a concurrency limit, along with their own [`JobGenerator`] +/// implementation to provide the work to execute. This is a simple scheduler that just +/// polls the generator for outstanding work, replacing its queue of pending work with +/// what the generator yields on each call: the job generator can change its mind about +/// the order of jobs between calls. The job generator is notified when jobs complete, +/// and additionally may expose a command hook to generate jobs on-demand (e.g. to implement +/// admin APIs). 
+/// +/// For an example see [`crate::tenant::secondary::heatmap_uploader`] +/// +/// G: A JobGenerator that this scheduler will poll to find pending jobs +/// PJ: 'Pending Job': type for job descriptors that are ready to run +/// RJ: 'Running Job' type' for jobs that have been spawned +/// C : 'Completion' type that spawned jobs will send when they finish +/// CMD: 'Command' type that the job generator will accept to create jobs on-demand +pub(super) struct TenantBackgroundJobs +where + G: JobGenerator, + C: Completion, + PJ: PendingJob, + RJ: RunningJob, +{ + generator: G, + + /// Ready to run. Will progress to `running` once concurrent limit is satisfied, or + /// be removed on next scheduling pass. + pending: std::collections::VecDeque, + + /// Tasks currently running in Self::tasks for these tenants. Check this map + /// before pushing more work into pending for the same tenant. + running: HashMap, + + tasks: JoinSet, + + concurrency: usize, + + /// How often we would like schedule_interval to be called. + pub(super) scheduling_interval: Duration, + + _phantom: PhantomData<(PJ, RJ, C, CMD)>, +} + +pub(crate) trait JobGenerator +where + C: Completion, + PJ: PendingJob, + RJ: RunningJob, +{ + /// Called at each scheduling interval. Return a list of jobs to run, most urgent first. + /// + /// This function may be expensive (e.g. walk all tenants), but should not do any I/O. + /// Implementations should take care to yield the executor periodically if running + /// very long loops. + /// + /// Yielding a job here does _not_ guarantee that it will run: if the queue of pending + /// jobs is not drained by the next scheduling interval, pending jobs will be cleared + /// and re-generated. + async fn schedule(&mut self) -> SchedulingResult; + + /// Called when a pending job is ready to be run. + /// + /// The job generation provides a future, and a RJ (Running Job) descriptor that tracks it. 
+ fn spawn(&mut self, pending_job: PJ) -> (RJ, Pin + Send>>); + + /// Called when a job previously spawned with spawn() transmits its completion + fn on_completion(&mut self, completion: C); + + /// Called when a command is received. A job will be spawned immediately if the return + /// value is Some, ignoring concurrency limits and the pending queue. + fn on_command(&mut self, cmd: CMD) -> anyhow::Result; +} + +/// [`JobGenerator`] returns this to provide pending jobs, and hints about scheduling +pub(super) struct SchedulingResult { + pub(super) jobs: Vec, + /// The job generator would like to be called again this soon + pub(super) want_interval: Option, +} + +/// See [`TenantBackgroundJobs`]. +pub(super) trait PendingJob { + fn get_tenant_shard_id(&self) -> &TenantShardId; +} + +/// See [`TenantBackgroundJobs`]. +pub(super) trait Completion: Send + 'static { + fn get_tenant_shard_id(&self) -> &TenantShardId; +} + +/// See [`TenantBackgroundJobs`]. +pub(super) trait RunningJob { + fn get_barrier(&self) -> Barrier; +} + +impl TenantBackgroundJobs +where + C: Completion, + PJ: PendingJob, + RJ: RunningJob, + G: JobGenerator, +{ + pub(super) fn new(generator: G, concurrency: usize) -> Self { + Self { + generator, + pending: std::collections::VecDeque::new(), + running: HashMap::new(), + tasks: JoinSet::new(), + concurrency, + scheduling_interval: MAX_SCHEDULING_INTERVAL, + _phantom: PhantomData, + } + } + + pub(super) async fn run( + &mut self, + mut command_queue: tokio::sync::mpsc::Receiver>, + background_jobs_can_start: Barrier, + cancel: CancellationToken, + ) { + tracing::info!("Waiting for background_jobs_can start..."); + background_jobs_can_start.wait().await; + tracing::info!("background_jobs_can is ready, proceeding."); + + while !cancel.is_cancelled() { + // Look for new work: this is relatively expensive because we have to go acquire the lock on + // the tenant manager to retrieve tenants, and then iterate over them to figure out which ones + // require 
an upload. + self.schedule_iteration(&cancel).await; + + if cancel.is_cancelled() { + return; + } + + // Schedule some work, if concurrency limit permits it + self.spawn_pending(); + + // Between scheduling iterations, we will: + // - Drain any complete tasks and spawn pending tasks + // - Handle incoming administrative commands + // - Check our cancellation token + let next_scheduling_iteration = Instant::now() + .checked_add(self.scheduling_interval) + .unwrap_or_else(|| { + tracing::warn!( + "Scheduling interval invalid ({}s)", + self.scheduling_interval.as_secs_f64() + ); + // unwrap(): this constant is small, cannot fail to add to time unless + // we are close to the end of the universe. + Instant::now().checked_add(MIN_SCHEDULING_INTERVAL).unwrap() + }); + loop { + tokio::select! { + _ = cancel.cancelled() => { + tracing::info!("joining tasks"); + // We do not simply drop the JoinSet, in order to have an orderly shutdown without cancellation. + // It is the callers responsibility to make sure that the tasks they scheduled + // respect an appropriate cancellation token, to shut down promptly. It is only + // safe to wait on joining these tasks because we can see the cancellation token + // has been set. 
+ while let Some(_r) = self.tasks.join_next().await {} + tracing::info!("terminating on cancellation token."); + + break; + }, + _ = tokio::time::sleep(next_scheduling_iteration.duration_since(Instant::now())) => { + tracing::debug!("woke for scheduling interval"); + break;}, + cmd = command_queue.recv() => { + tracing::debug!("woke for command queue"); + let cmd = match cmd { + Some(c) =>c, + None => { + // SecondaryController was destroyed, and this has raced with + // our CancellationToken + tracing::info!("terminating on command queue destruction"); + cancel.cancel(); + break; + } + }; + + let CommandRequest{ + response_tx, + payload + } = cmd; + self.handle_command(payload, response_tx); + }, + _ = async { + let completion = self.process_next_completion().await; + match completion { + Some(c) => { + self.generator.on_completion(c); + if !cancel.is_cancelled() { + self.spawn_pending(); + } + }, + None => { + // Nothing is running, so just wait: expect that this future + // will be dropped when something in the outer select! fires. + cancel.cancelled().await; + } + } + + } => {} + } + } + } + } + + fn do_spawn(&mut self, job: PJ) { + let tenant_shard_id = *job.get_tenant_shard_id(); + let (in_progress, fut) = self.generator.spawn(job); + + self.tasks.spawn(fut); + + self.running.insert(tenant_shard_id, in_progress); + } + + /// For all pending tenants that are eligible for execution, spawn their task. + /// + /// Caller provides the spawn operation, we track the resulting execution. 
+ fn spawn_pending(&mut self) { + while !self.pending.is_empty() && self.running.len() < self.concurrency { + // unwrap: loop condition includes !is_empty() + let pending = self.pending.pop_front().unwrap(); + self.do_spawn(pending); + } + } + + /// For administrative commands: skip the pending queue, ignore concurrency limits + fn spawn_now(&mut self, job: PJ) -> &RJ { + let tenant_shard_id = *job.get_tenant_shard_id(); + self.do_spawn(job); + self.running + .get(&tenant_shard_id) + .expect("We just inserted this") + } + + /// Wait until the next task completes, and handle its completion + /// + /// Cancellation: this method is cancel-safe. + async fn process_next_completion(&mut self) -> Option { + match self.tasks.join_next().await { + Some(r) => { + // We use a channel to drive completions, but also + // need to drain the JoinSet to avoid completed tasks + // accumulating. These calls are 1:1 because every task + // we spawn into this joinset submits its result to the channel. + let completion = r.expect("Panic in background task"); + + self.running.remove(completion.get_tenant_shard_id()); + Some(completion) + } + None => { + // Nothing is running, so we have nothing to wait for. We may drop out: the + // main event loop will call us again after the next time it has run something. + None + } + } + } + + /// Convert the command into a pending job, spawn it, and when the spawned + /// job completes, send the result down `response_tx`. 
+ fn handle_command( + &mut self, + cmd: CMD, + response_tx: tokio::sync::oneshot::Sender, + ) { + let job = match self.generator.on_command(cmd) { + Ok(j) => j, + Err(e) => { + response_tx.send(CommandResponse { result: Err(e) }).ok(); + return; + } + }; + + let tenant_shard_id = job.get_tenant_shard_id(); + let barrier = if let Some(barrier) = self.get_running(tenant_shard_id) { + barrier + } else { + let running = self.spawn_now(job); + running.get_barrier().clone() + }; + + // This task does no I/O: it only listens for a barrier's completion and then + // sends to the command response channel. It is therefore safe to spawn this without + // any gates/task_mgr hooks. + tokio::task::spawn(async move { + barrier.wait().await; + + response_tx.send(CommandResponse { result: Ok(()) }).ok(); + }); + } + + fn get_running(&self, tenant_shard_id: &TenantShardId) -> Option { + self.running.get(tenant_shard_id).map(|r| r.get_barrier()) + } + + /// Periodic execution phase: inspect all attached tenants and schedule any work they require. + /// + /// The type in `tenants` should be a tenant-like structure, e.g. [`crate::tenant::Tenant`] or [`crate::tenant::secondary::SecondaryTenant`] + /// + /// This function resets the pending list: it is assumed that the caller may change their mind about + /// which tenants need work between calls to schedule_iteration. 
+ async fn schedule_iteration(&mut self, cancel: &CancellationToken) { + let SchedulingResult { + jobs, + want_interval, + } = self.generator.schedule().await; + + // Adjust interval based on feedback from the job generator + if let Some(want_interval) = want_interval { + // Calculation uses second granularity: this scheduler is not intended for high frequency tasks + self.scheduling_interval = Duration::from_secs(std::cmp::min( + std::cmp::max(MIN_SCHEDULING_INTERVAL.as_secs(), want_interval.as_secs()), + MAX_SCHEDULING_INTERVAL.as_secs(), + )); + } + + // The priority order of previously scheduled work may be invalidated by current state: drop + // all pending work (it will be re-scheduled if still needed) + self.pending.clear(); + + // While iterating over the potentially-long list of tenants, we will periodically yield + // to avoid blocking executor. + yielding_loop(1000, cancel, jobs.into_iter(), |job| { + // Skip tenants that already have a write in flight + if !self.running.contains_key(job.get_tenant_shard_id()) { + self.pending.push_back(job); + } + }) + .await + .ok(); + } +} diff --git a/pageserver/src/tenant/storage_layer/delta_layer.rs b/pageserver/src/tenant/storage_layer/delta_layer.rs index d0822d220f..2050a82f3b 100644 --- a/pageserver/src/tenant/storage_layer/delta_layer.rs +++ b/pageserver/src/tenant/storage_layer/delta_layer.rs @@ -320,8 +320,8 @@ impl DeltaLayer { .metadata() .context("get file metadata to determine size")?; - // TODO(sharding): we must get the TenantShardId from the path instead of reading the Summary. - // we should also validate the path against the Summary, as both should contain the same tenant, timeline, key, lsn. + // This function is never used for constructing layers in a running pageserver, + // so it does not need an accurate TenantShardId. 
let tenant_shard_id = TenantShardId::unsharded(summary.tenant_id); Ok(DeltaLayer { diff --git a/pageserver/src/tenant/storage_layer/image_layer.rs b/pageserver/src/tenant/storage_layer/image_layer.rs index 75174f4745..988dceb6ea 100644 --- a/pageserver/src/tenant/storage_layer/image_layer.rs +++ b/pageserver/src/tenant/storage_layer/image_layer.rs @@ -278,8 +278,8 @@ impl ImageLayer { .metadata() .context("get file metadata to determine size")?; - // TODO(sharding): we should get TenantShardId from path. - // OR, not at all: any layer we load from disk should also get reconciled with remote IndexPart. + // This function is never used for constructing layers in a running pageserver, + // so it does not need an accurate TenantShardId. let tenant_shard_id = TenantShardId::unsharded(summary.tenant_id); Ok(ImageLayer { diff --git a/pageserver/src/tenant/storage_layer/inmemory_layer.rs b/pageserver/src/tenant/storage_layer/inmemory_layer.rs index 003cf0e92b..7c9103eea8 100644 --- a/pageserver/src/tenant/storage_layer/inmemory_layer.rs +++ b/pageserver/src/tenant/storage_layer/inmemory_layer.rs @@ -23,7 +23,7 @@ use utils::{bin_ser::BeSer, id::TimelineId, lsn::Lsn, vec_map::VecMap}; // while being able to use std::fmt::Write's methods use std::fmt::Write as _; use std::ops::Range; -use tokio::sync::RwLock; +use tokio::sync::{RwLock, RwLockWriteGuard}; use super::{DeltaLayerWriter, ResidentLayer}; @@ -246,16 +246,43 @@ impl InMemoryLayer { /// Common subroutine of the public put_wal_record() and put_page_image() functions. 
/// Adds the page version to the in-memory tree - pub async fn put_value( + pub(crate) async fn put_value( &self, key: Key, lsn: Lsn, val: &Value, ctx: &RequestContext, ) -> Result<()> { - trace!("put_value key {} at {}/{}", key, self.timeline_id, lsn); - let inner: &mut _ = &mut *self.inner.write().await; + let mut inner = self.inner.write().await; self.assert_writable(); + self.put_value_locked(&mut inner, key, lsn, val, ctx).await + } + + pub(crate) async fn put_values( + &self, + values: &HashMap>, + ctx: &RequestContext, + ) -> Result<()> { + let mut inner = self.inner.write().await; + self.assert_writable(); + for (key, vals) in values { + for (lsn, val) in vals { + self.put_value_locked(&mut inner, *key, *lsn, val, ctx) + .await?; + } + } + Ok(()) + } + + async fn put_value_locked( + &self, + locked_inner: &mut RwLockWriteGuard<'_, InMemoryLayerInner>, + key: Key, + lsn: Lsn, + val: &Value, + ctx: &RequestContext, + ) -> Result<()> { + trace!("put_value key {} at {}/{}", key, self.timeline_id, lsn); let off = { // Avoid doing allocations for "small" values. @@ -264,7 +291,7 @@ impl InMemoryLayer { let mut buf = smallvec::SmallVec::<[u8; 256]>::new(); buf.clear(); val.ser_into(&mut buf)?; - inner + locked_inner .file .write_blob( &buf, @@ -275,7 +302,7 @@ impl InMemoryLayer { .await? }; - let vec_map = inner.index.entry(key).or_default(); + let vec_map = locked_inner.index.entry(key).or_default(); let old = vec_map.append_or_update_last(lsn, off).unwrap().0; if old.is_some() { // We already had an entry for this LSN. That's odd.. @@ -285,13 +312,11 @@ impl InMemoryLayer { Ok(()) } - pub async fn put_tombstone(&self, _key_range: Range, _lsn: Lsn) -> Result<()> { + pub(crate) async fn put_tombstones(&self, _key_ranges: &[(Range, Lsn)]) -> Result<()> { // TODO: Currently, we just leak the storage for any deleted keys - Ok(()) } - /// Make the layer non-writeable. Only call once. /// Records the end_lsn for non-dropped layers. 
/// `end_lsn` is exclusive pub async fn freeze(&self, end_lsn: Lsn) { diff --git a/pageserver/src/tenant/storage_layer/layer.rs b/pageserver/src/tenant/storage_layer/layer.rs index 8ae911b31e..3f29e9f6a5 100644 --- a/pageserver/src/tenant/storage_layer/layer.rs +++ b/pageserver/src/tenant/storage_layer/layer.rs @@ -945,8 +945,18 @@ impl LayerInner { Ok((Err(e), _permit)) => { // sleep already happened in the spawned task, if it was not cancelled let consecutive_failures = self.consecutive_failures.load(Ordering::Relaxed); - tracing::error!(consecutive_failures, "layer file download failed: {e:#}"); - Err(DownloadError::DownloadFailed) + + match e.downcast_ref::() { + // If the download failed due to its cancellation token, + // propagate the cancellation error upstream. + Some(remote_storage::DownloadError::Cancelled) => { + Err(DownloadError::DownloadCancelled) + } + _ => { + tracing::error!(consecutive_failures, "layer file download failed: {e:#}"); + Err(DownloadError::DownloadFailed) + } + } } Err(_gone) => Err(DownloadError::DownloadCancelled), } @@ -1118,6 +1128,7 @@ impl LayerInner { tracing::info!("evicted layer after unknown residence period"); } } + timeline.metrics.evictions.inc(); timeline .metrics .resident_physical_size_sub(self.desc.file_size); diff --git a/pageserver/src/tenant/tasks.rs b/pageserver/src/tenant/tasks.rs index 7ff1873eda..2b2fcc7711 100644 --- a/pageserver/src/tenant/tasks.rs +++ b/pageserver/src/tenant/tasks.rs @@ -45,6 +45,8 @@ pub(crate) enum BackgroundLoopKind { ConsumptionMetricsCollectMetrics, ConsumptionMetricsSyntheticSizeWorker, InitialLogicalSizeCalculation, + HeatmapUpload, + SecondaryDownload, } impl BackgroundLoopKind { @@ -63,6 +65,11 @@ pub(crate) async fn concurrent_background_tasks_rate_limit_permit( .with_label_values(&[loop_kind.as_static_str()]) .guard(); + pausable_failpoint!( + "initial-size-calculation-permit-pause", + loop_kind == BackgroundLoopKind::InitialLogicalSizeCalculation + ); + match 
CONCURRENT_BACKGROUND_TASKS.acquire().await { Ok(permit) => permit, Err(_closed) => unreachable!("we never close the semaphore"), diff --git a/pageserver/src/tenant/timeline.rs b/pageserver/src/tenant/timeline.rs index 1e84fa1848..ea1ab1a828 100644 --- a/pageserver/src/tenant/timeline.rs +++ b/pageserver/src/tenant/timeline.rs @@ -373,15 +373,20 @@ pub struct GcInfo { } /// An error happened in a get() operation. -#[derive(thiserror::Error)] -pub enum PageReconstructError { +#[derive(thiserror::Error, Debug)] +pub(crate) enum PageReconstructError { #[error(transparent)] Other(#[from] anyhow::Error), + #[error("Ancestor LSN wait error: {0}")] + AncestorLsnTimeout(#[from] WaitLsnError), + /// The operation was cancelled + #[error("Cancelled")] Cancelled, /// The ancestor of this is being stopped + #[error("ancestor timeline {0} is being stopped")] AncestorStopping(TimelineId), /// An error happened replaying WAL records @@ -402,32 +407,6 @@ enum FlushLayerError { Other(#[from] anyhow::Error), } -impl std::fmt::Debug for PageReconstructError { - fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> Result<(), std::fmt::Error> { - match self { - Self::Other(err) => err.fmt(f), - Self::Cancelled => write!(f, "cancelled"), - Self::AncestorStopping(timeline_id) => { - write!(f, "ancestor timeline {timeline_id} is being stopped") - } - Self::WalRedo(err) => err.fmt(f), - } - } -} - -impl std::fmt::Display for PageReconstructError { - fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> Result<(), std::fmt::Error> { - match self { - Self::Other(err) => err.fmt(f), - Self::Cancelled => write!(f, "cancelled"), - Self::AncestorStopping(timeline_id) => { - write!(f, "ancestor timeline {timeline_id} is being stopped") - } - Self::WalRedo(err) => err.fmt(f), - } - } -} - #[derive(Clone, Copy)] pub enum LogicalSizeCalculationCause { Initial, @@ -452,6 +431,21 @@ impl std::fmt::Debug for Timeline { } } +#[derive(thiserror::Error, Debug)] +pub(crate) enum WaitLsnError { + // Called on a 
timeline which is shutting down + #[error("Shutdown")] + Shutdown, + + // Called on a timeline not in active state or shutting down + #[error("Bad state (not active)")] + BadState, + + // Timeout expired while waiting for LSN to catch up with goal. + #[error("{0}")] + Timeout(String), +} + /// Public interface functions impl Timeline { /// Get the LSN where this branch was created @@ -486,7 +480,7 @@ impl Timeline { /// # Cancel-Safety /// /// This method is cancellation-safe. - pub async fn get( + pub(crate) async fn get( &self, key: Key, lsn: Lsn, @@ -496,6 +490,11 @@ impl Timeline { return Err(PageReconstructError::Other(anyhow::anyhow!("Invalid LSN"))); } + // This check is debug-only because of the cost of hashing, and because it's a double-check: we + // already checked the key against the shard_identity when looking up the Timeline from + // page_service. + debug_assert!(!self.shard_identity.is_key_disposable(&key)); + // XXX: structured stats collection for layer eviction here. trace!( "get page request for {}@{} from task kind {:?}", @@ -629,24 +628,28 @@ impl Timeline { /// You should call this before any of the other get_* or list_* functions. Calling /// those functions with an LSN that has been processed yet is an error. /// - pub async fn wait_lsn( + pub(crate) async fn wait_lsn( &self, lsn: Lsn, _ctx: &RequestContext, /* Prepare for use by cancellation */ - ) -> anyhow::Result<()> { - anyhow::ensure!(self.is_active(), "Cannot wait for Lsn on inactive timeline"); + ) -> Result<(), WaitLsnError> { + if self.cancel.is_cancelled() { + return Err(WaitLsnError::Shutdown); + } else if !self.is_active() { + return Err(WaitLsnError::BadState); + } // This should never be called from the WAL receiver, because that could lead // to a deadlock. 
- anyhow::ensure!( + debug_assert!( task_mgr::current_task_kind() != Some(TaskKind::WalReceiverManager), "wait_lsn cannot be called in WAL receiver" ); - anyhow::ensure!( + debug_assert!( task_mgr::current_task_kind() != Some(TaskKind::WalReceiverConnectionHandler), "wait_lsn cannot be called in WAL receiver" ); - anyhow::ensure!( + debug_assert!( task_mgr::current_task_kind() != Some(TaskKind::WalReceiverConnectionPoller), "wait_lsn cannot be called in WAL receiver" ); @@ -660,18 +663,22 @@ impl Timeline { { Ok(()) => Ok(()), Err(e) => { - // don't count the time spent waiting for lock below, and also in walreceiver.status(), towards the wait_lsn_time_histo - drop(_timer); - let walreceiver_status = self.walreceiver_status(); - Err(anyhow::Error::new(e).context({ - format!( + use utils::seqwait::SeqWaitError::*; + match e { + Shutdown => Err(WaitLsnError::Shutdown), + Timeout => { + // don't count the time spent waiting for lock below, and also in walreceiver.status(), towards the wait_lsn_time_histo + drop(_timer); + let walreceiver_status = self.walreceiver_status(); + Err(WaitLsnError::Timeout(format!( "Timed out while waiting for WAL record at LSN {} to arrive, last_record_lsn {} disk consistent LSN={}, WalReceiver status: {}", lsn, self.get_last_record_lsn(), self.get_disk_consistent_lsn(), walreceiver_status, - ) - })) + ))) + } + } } } } @@ -1459,6 +1466,7 @@ impl Timeline { max_lsn_wal_lag, auth_token: crate::config::SAFEKEEPER_AUTH_TOKEN.get().cloned(), availability_zone: self.conf.availability_zone.clone(), + ingest_batch_size: self.conf.ingest_batch_size, }, broker_client, ctx, @@ -2223,13 +2231,13 @@ impl Timeline { return Err(layer_traversal_error( if cfg!(test) { format!( - "could not find data for key {} at LSN {}, for request at LSN {}\n{}", - key, cont_lsn, request_lsn, std::backtrace::Backtrace::force_capture(), + "could not find data for key {} (shard {:?}) at LSN {}, for request at LSN {}\n{}", + key, self.shard_identity.get_shard_number(&key), 
cont_lsn, request_lsn, std::backtrace::Backtrace::force_capture(), ) } else { format!( - "could not find data for key {} at LSN {}, for request at LSN {}", - key, cont_lsn, request_lsn + "could not find data for key {} (shard {:?}) at LSN {}, for request at LSN {}", + key, self.shard_identity.get_shard_number(&key), cont_lsn, request_lsn ) }, traversal_path, @@ -2289,11 +2297,12 @@ impl Timeline { ancestor .wait_lsn(timeline.ancestor_lsn, ctx) .await - .with_context(|| { - format!( - "wait for lsn {} on ancestor timeline_id={}", - timeline.ancestor_lsn, ancestor.timeline_id - ) + .map_err(|e| match e { + e @ WaitLsnError::Timeout(_) => PageReconstructError::AncestorLsnTimeout(e), + WaitLsnError::Shutdown => PageReconstructError::Cancelled, + e @ WaitLsnError::BadState => { + PageReconstructError::Other(anyhow::anyhow!(e)) + } })?; timeline_owned = ancestor; @@ -2471,9 +2480,27 @@ impl Timeline { Ok(()) } - async fn put_tombstone(&self, key_range: Range, lsn: Lsn) -> anyhow::Result<()> { - let layer = self.get_layer_for_write(lsn).await?; - layer.put_tombstone(key_range, lsn).await?; + async fn put_values( + &self, + values: &HashMap>, + ctx: &RequestContext, + ) -> anyhow::Result<()> { + // Pick the first LSN in the batch to get the layer to write to. 
+ for lsns in values.values() { + if let Some((lsn, _)) = lsns.first() { + let layer = self.get_layer_for_write(*lsn).await?; + layer.put_values(values, ctx).await?; + break; + } + } + Ok(()) + } + + async fn put_tombstones(&self, tombstones: &[(Range, Lsn)]) -> anyhow::Result<()> { + if let Some((_, lsn)) = tombstones.first() { + let layer = self.get_layer_for_write(*lsn).await?; + layer.put_tombstones(tombstones).await?; + } Ok(()) } @@ -3035,6 +3062,15 @@ impl Timeline { for range in &partition.ranges { let mut key = range.start; while key < range.end { + if self.shard_identity.is_key_disposable(&key) { + debug!( + "Dropping key {} during compaction (it belongs on shard {:?})", + key, + self.shard_identity.get_shard_number(&key) + ); + key = key.next(); + continue; + } let img = match self.get(key, lsn, ctx).await { Ok(img) => img, Err(err) => { @@ -3061,6 +3097,7 @@ impl Timeline { } } }; + image_layer_writer.put_image(key, &img).await?; key = key.next(); } @@ -3094,11 +3131,13 @@ impl Timeline { .await .context("fsync of newly created layer files")?; - par_fsync::par_fsync_async(&[self - .conf - .timeline_path(&self.tenant_shard_id, &self.timeline_id)]) - .await - .context("fsync of timeline dir")?; + if !all_paths.is_empty() { + par_fsync::par_fsync_async(&[self + .conf + .timeline_path(&self.tenant_shard_id, &self.timeline_id)]) + .await + .context("fsync of timeline dir")?; + } let mut guard = self.layers.write().await; @@ -3631,7 +3670,15 @@ impl Timeline { ))) }); - writer.as_mut().unwrap().put_value(key, lsn, value).await?; + if !self.shard_identity.is_key_disposable(&key) { + writer.as_mut().unwrap().put_value(key, lsn, value).await?; + } else { + debug!( + "Dropping key {} during compaction (it belongs on shard {:?})", + key, + self.shard_identity.get_shard_number(&key) + ); + } if !new_layers.is_empty() { fail_point!("after-timeline-compacted-first-L1"); @@ -4186,7 +4233,7 @@ impl Timeline { .context("Failed to reconstruct a page image:") { Ok(img) => 
img, - Err(e) => return Err(PageReconstructError::from(e)), + Err(e) => return Err(PageReconstructError::WalRedo(e)), }; if img.len() == page_cache::PAGE_SZ { @@ -4529,8 +4576,16 @@ impl<'a> TimelineWriter<'a> { self.tl.put_value(key, lsn, value, ctx).await } - pub async fn delete(&self, key_range: Range, lsn: Lsn) -> anyhow::Result<()> { - self.tl.put_tombstone(key_range, lsn).await + pub(crate) async fn put_batch( + &self, + batch: &HashMap>, + ctx: &RequestContext, + ) -> anyhow::Result<()> { + self.tl.put_values(batch, ctx).await + } + + pub(crate) async fn delete_batch(&self, batch: &[(Range, Lsn)]) -> anyhow::Result<()> { + self.tl.put_tombstones(batch).await } /// Track the end of the latest digested WAL record. @@ -4541,11 +4596,11 @@ impl<'a> TimelineWriter<'a> { /// 'lsn' must be aligned. This wakes up any wait_lsn() callers waiting for /// the 'lsn' or anything older. The previous last record LSN is stored alongside /// the latest and can be read. - pub fn finish_write(&self, new_lsn: Lsn) { + pub(crate) fn finish_write(&self, new_lsn: Lsn) { self.tl.finish_write(new_lsn); } - pub fn update_current_logical_size(&self, delta: i64) { + pub(crate) fn update_current_logical_size(&self, delta: i64) { self.tl.update_current_logical_size(delta) } } diff --git a/pageserver/src/tenant/timeline/walreceiver.rs b/pageserver/src/tenant/timeline/walreceiver.rs index e32265afb5..2fab6722b8 100644 --- a/pageserver/src/tenant/timeline/walreceiver.rs +++ b/pageserver/src/tenant/timeline/walreceiver.rs @@ -58,6 +58,7 @@ pub struct WalReceiverConf { pub max_lsn_wal_lag: NonZeroU64, pub auth_token: Option>, pub availability_zone: Option, + pub ingest_batch_size: u64, } pub struct WalReceiver { diff --git a/pageserver/src/tenant/timeline/walreceiver/connection_manager.rs b/pageserver/src/tenant/timeline/walreceiver/connection_manager.rs index 5a5b3d7586..cf6dee114f 100644 --- a/pageserver/src/tenant/timeline/walreceiver/connection_manager.rs +++ 
b/pageserver/src/tenant/timeline/walreceiver/connection_manager.rs @@ -411,6 +411,7 @@ impl ConnectionManagerState { let node_id = new_sk.safekeeper_id; let connect_timeout = self.conf.wal_connect_timeout; + let ingest_batch_size = self.conf.ingest_batch_size; let timeline = Arc::clone(&self.timeline); let ctx = ctx.detached_child( TaskKind::WalReceiverConnectionHandler, @@ -430,6 +431,7 @@ impl ConnectionManagerState { connect_timeout, ctx, node_id, + ingest_batch_size, ) .await; @@ -1335,7 +1337,7 @@ mod tests { ConnectionManagerState { id: TenantTimelineId { - tenant_id: harness.tenant_id, + tenant_id: harness.tenant_shard_id.tenant_id, timeline_id: TIMELINE_ID, }, timeline, @@ -1345,6 +1347,7 @@ mod tests { max_lsn_wal_lag: NonZeroU64::new(1024 * 1024).unwrap(), auth_token: None, availability_zone: None, + ingest_batch_size: 1, }, wal_connection: None, wal_stream_candidates: HashMap::new(), diff --git a/pageserver/src/tenant/timeline/walreceiver/walreceiver_connection.rs b/pageserver/src/tenant/timeline/walreceiver/walreceiver_connection.rs index 61ab236322..e398d683e5 100644 --- a/pageserver/src/tenant/timeline/walreceiver/walreceiver_connection.rs +++ b/pageserver/src/tenant/timeline/walreceiver/walreceiver_connection.rs @@ -26,7 +26,7 @@ use tracing::{debug, error, info, trace, warn, Instrument}; use super::TaskStateUpdate; use crate::{ context::RequestContext, - metrics::{LIVE_CONNECTIONS_COUNT, WALRECEIVER_STARTED_CONNECTIONS}, + metrics::{LIVE_CONNECTIONS_COUNT, WALRECEIVER_STARTED_CONNECTIONS, WAL_INGEST}, task_mgr, task_mgr::TaskKind, task_mgr::WALRECEIVER_RUNTIME, @@ -106,6 +106,7 @@ impl From for WalReceiverError { /// Open a connection to the given safekeeper and receive WAL, sending back progress /// messages as we go. 
+#[allow(clippy::too_many_arguments)] pub(super) async fn handle_walreceiver_connection( timeline: Arc, wal_source_connconf: PgConnectionConfig, @@ -114,6 +115,7 @@ pub(super) async fn handle_walreceiver_connection( connect_timeout: Duration, ctx: RequestContext, node: NodeId, + ingest_batch_size: u64, ) -> Result<(), WalReceiverError> { debug_assert_current_span_has_tenant_and_timeline_id(); @@ -305,7 +307,9 @@ pub(super) async fn handle_walreceiver_connection( { let mut decoded = DecodedWALRecord::default(); - let mut modification = timeline.begin_modification(endlsn); + let mut modification = timeline.begin_modification(startlsn); + let mut uncommitted_records = 0; + let mut filtered_records = 0; while let Some((lsn, recdata)) = waldecoder.poll_decode()? { // It is important to deal with the aligned records as lsn in getPage@LSN is // aligned and can be several bytes bigger. Without this alignment we are @@ -314,14 +318,40 @@ pub(super) async fn handle_walreceiver_connection( return Err(WalReceiverError::Other(anyhow!("LSN not aligned"))); } - walingest + // Ingest the records without immediately committing them. + let ingested = walingest .ingest_record(recdata, lsn, &mut modification, &mut decoded, &ctx) .await .with_context(|| format!("could not ingest record at {lsn}"))?; + if !ingested { + tracing::debug!("ingest: filtered out record @ LSN {lsn}"); + WAL_INGEST.records_filtered.inc(); + filtered_records += 1; + } fail_point!("walreceiver-after-ingest"); last_rec_lsn = lsn; + + // Commit every ingest_batch_size records. Even if we filtered out + // all records, we still need to call commit to advance the LSN. + uncommitted_records += 1; + if uncommitted_records >= ingest_batch_size { + WAL_INGEST + .records_committed + .inc_by(uncommitted_records - filtered_records); + modification.commit(&ctx).await?; + uncommitted_records = 0; + filtered_records = 0; + } + } + + // Commit the remaining records. 
+ if uncommitted_records > 0 { + WAL_INGEST + .records_committed + .inc_by(uncommitted_records - filtered_records); + modification.commit(&ctx).await?; } } diff --git a/pageserver/src/walingest.rs b/pageserver/src/walingest.rs index 16b245c488..8df0c81c7a 100644 --- a/pageserver/src/walingest.rs +++ b/pageserver/src/walingest.rs @@ -29,6 +29,7 @@ use postgres_ffi::{fsm_logical_to_physical, page_is_new, page_set_lsn}; use anyhow::{bail, Context, Result}; use bytes::{Buf, Bytes, BytesMut}; use tracing::*; +use utils::failpoint_support; use crate::context::RequestContext; use crate::metrics::WAL_INGEST; @@ -47,20 +48,18 @@ use postgres_ffi::TransactionId; use postgres_ffi::BLCKSZ; use utils::lsn::Lsn; -pub struct WalIngest<'a> { +pub struct WalIngest { shard: ShardIdentity, - timeline: &'a Timeline, - checkpoint: CheckPoint, checkpoint_modified: bool, } -impl<'a> WalIngest<'a> { +impl WalIngest { pub async fn new( - timeline: &'a Timeline, + timeline: &Timeline, startpoint: Lsn, - ctx: &'_ RequestContext, - ) -> anyhow::Result> { + ctx: &RequestContext, + ) -> anyhow::Result { // Fetch the latest checkpoint into memory, so that we can compare with it // quickly in `ingest_record` and update it when it changes. let checkpoint_bytes = timeline.get_checkpoint(startpoint, ctx).await?; @@ -69,7 +68,6 @@ impl<'a> WalIngest<'a> { Ok(WalIngest { shard: *timeline.get_shard_identity(), - timeline, checkpoint, checkpoint_modified: false, }) @@ -83,6 +81,8 @@ impl<'a> WalIngest<'a> { /// Helper function to parse a WAL record and call the Timeline's PUT functions for all the /// relations/pages that the record affects. 
/// + /// This function returns `true` if the record was ingested, and `false` if it was filtered out + /// pub async fn ingest_record( &mut self, recdata: Bytes, @@ -90,11 +90,13 @@ impl<'a> WalIngest<'a> { modification: &mut DatadirModification<'_>, decoded: &mut DecodedWALRecord, ctx: &RequestContext, - ) -> anyhow::Result<()> { + ) -> anyhow::Result { WAL_INGEST.records_received.inc(); + let pg_version = modification.tline.pg_version; + let prev_len = modification.len(); - modification.lsn = lsn; - decode_wal_record(recdata, decoded, self.timeline.pg_version)?; + modification.set_lsn(lsn)?; + decode_wal_record(recdata, decoded, pg_version)?; let mut buf = decoded.record.clone(); buf.advance(decoded.main_data_offset); @@ -131,9 +133,9 @@ impl<'a> WalIngest<'a> { } pg_constants::RM_DBASE_ID => { let info = decoded.xl_info & pg_constants::XLR_RMGR_INFO_MASK; - debug!(%info, pg_version=%self.timeline.pg_version, "handle RM_DBASE_ID"); + debug!(%info, %pg_version, "handle RM_DBASE_ID"); - if self.timeline.pg_version == 14 { + if pg_version == 14 { if info == postgres_ffi::v14::bindings::XLOG_DBASE_CREATE { let createdb = XlCreateDatabase::decode(&mut buf); debug!("XLOG_DBASE_CREATE v14"); @@ -149,7 +151,7 @@ impl<'a> WalIngest<'a> { .await?; } } - } else if self.timeline.pg_version == 15 { + } else if pg_version == 15 { if info == postgres_ffi::v15::bindings::XLOG_DBASE_CREATE_WAL_LOG { debug!("XLOG_DBASE_CREATE_WAL_LOG: noop"); } else if info == postgres_ffi::v15::bindings::XLOG_DBASE_CREATE_FILE_COPY { @@ -169,7 +171,7 @@ impl<'a> WalIngest<'a> { .await?; } } - } else if self.timeline.pg_version == 16 { + } else if pg_version == 16 { if info == postgres_ffi::v16::bindings::XLOG_DBASE_CREATE_WAL_LOG { debug!("XLOG_DBASE_CREATE_WAL_LOG: noop"); } else if info == postgres_ffi::v16::bindings::XLOG_DBASE_CREATE_FILE_COPY { @@ -344,9 +346,7 @@ impl<'a> WalIngest<'a> { // particular point in the WAL. 
For more fine-grained control, // we could peek into the message and only pause if it contains // a particular string, for example, but this is enough for now. - crate::failpoint_support::sleep_millis_async!( - "wal-ingest-logical-message-sleep" - ); + failpoint_support::sleep_millis_async!("wal-ingest-logical-message-sleep"); } else if let Some(path) = prefix.strip_prefix("neon-file:") { modification.put_file(path, message, ctx).await?; } @@ -400,19 +400,11 @@ impl<'a> WalIngest<'a> { self.checkpoint_modified = false; } - if modification.is_empty() { - tracing::debug!("ingest: filtered out record @ LSN {lsn}"); - WAL_INGEST.records_filtered.inc(); - modification.tline.finish_write(lsn); - } else { - WAL_INGEST.records_committed.inc(); - modification.commit(ctx).await?; - } + // Note that at this point this record is only cached in the modification + // until commit() is called to flush the data into the repository and update + // the latest LSN. - // Now that this record has been fully handled, including updating the - // checkpoint data, let the repository know that it is up-to-date to this LSN. - - Ok(()) + Ok(modification.len() > prev_len) } /// Do not store this block, but observe it for the purposes of updating our relation size state. @@ -459,7 +451,7 @@ impl<'a> WalIngest<'a> { && (decoded.xl_info == pg_constants::XLOG_FPI || decoded.xl_info == pg_constants::XLOG_FPI_FOR_HINT) // compression of WAL is not yet supported: fall back to storing the original WAL record - && !postgres_ffi::bkpimage_is_compressed(blk.bimg_info, self.timeline.pg_version)? + && !postgres_ffi::bkpimage_is_compressed(blk.bimg_info, modification.tline.pg_version)? 
// do not materialize null pages because them most likely be soon replaced with real data && blk.bimg_len != 0 { @@ -512,7 +504,7 @@ impl<'a> WalIngest<'a> { let mut old_heap_blkno: Option = None; let mut flags = pg_constants::VISIBILITYMAP_VALID_BITS; - match self.timeline.pg_version { + match modification.tline.pg_version { 14 => { if decoded.xl_rmid == pg_constants::RM_HEAP_ID { let info = decoded.xl_info & pg_constants::XLOG_HEAP_OPMASK; @@ -736,7 +728,7 @@ impl<'a> WalIngest<'a> { // replaying it would fail to find the previous image of the page, because // it doesn't exist. So check if the VM page(s) exist, and skip the WAL // record if it doesn't. - let vm_size = self.get_relsize(vm_rel, modification.lsn, ctx).await?; + let vm_size = get_relsize(modification, vm_rel, ctx).await?; if let Some(blknum) = new_vm_blk { if blknum >= vm_size { new_vm_blk = None; @@ -817,10 +809,11 @@ impl<'a> WalIngest<'a> { let mut new_heap_blkno: Option = None; let mut old_heap_blkno: Option = None; let mut flags = pg_constants::VISIBILITYMAP_VALID_BITS; + let pg_version = modification.tline.pg_version; assert_eq!(decoded.xl_rmid, pg_constants::RM_NEON_ID); - match self.timeline.pg_version { + match pg_version { 16 => { let info = decoded.xl_info & pg_constants::XLOG_HEAP_OPMASK; @@ -883,7 +876,7 @@ impl<'a> WalIngest<'a> { } _ => bail!( "Neon RMGR has no known compatibility with PostgreSQL version {}", - self.timeline.pg_version + pg_version ), } @@ -906,7 +899,7 @@ impl<'a> WalIngest<'a> { // replaying it would fail to find the previous image of the page, because // it doesn't exist. So check if the VM page(s) exist, and skip the WAL // record if it doesn't. 
- let vm_size = self.get_relsize(vm_rel, modification.lsn, ctx).await?; + let vm_size = get_relsize(modification, vm_rel, ctx).await?; if let Some(blknum) = new_vm_blk { if blknum >= vm_size { new_vm_blk = None; @@ -984,16 +977,14 @@ impl<'a> WalIngest<'a> { let src_db_id = rec.src_db_id; let src_tablespace_id = rec.src_tablespace_id; - // Creating a database is implemented by copying the template (aka. source) database. - // To copy all the relations, we need to ask for the state as of the same LSN, but we - // cannot pass 'lsn' to the Timeline.get_* functions, or they will block waiting for - // the last valid LSN to advance up to it. So we use the previous record's LSN in the - // get calls instead. - let req_lsn = modification.tline.get_last_record_lsn(); - let rels = modification .tline - .list_rels(src_tablespace_id, src_db_id, req_lsn, ctx) + .list_rels( + src_tablespace_id, + src_db_id, + Version::Modified(modification), + ctx, + ) .await?; debug!("ingest_xlog_dbase_create: {} rels", rels.len()); @@ -1001,7 +992,12 @@ impl<'a> WalIngest<'a> { // Copy relfilemap let filemap = modification .tline - .get_relmap_file(src_tablespace_id, src_db_id, req_lsn, ctx) + .get_relmap_file( + src_tablespace_id, + src_db_id, + Version::Modified(modification), + ctx, + ) .await?; modification .put_relmap_file(tablespace_id, db_id, filemap, ctx) @@ -1015,7 +1011,7 @@ impl<'a> WalIngest<'a> { let nblocks = modification .tline - .get_rel_size(src_rel, req_lsn, true, ctx) + .get_rel_size(src_rel, Version::Modified(modification), true, ctx) .await?; let dst_rel = RelTag { spcnode: tablespace_id, @@ -1033,7 +1029,13 @@ impl<'a> WalIngest<'a> { let content = modification .tline - .get_rel_page_at_lsn(src_rel, blknum, req_lsn, true, ctx) + .get_rel_page_at_lsn( + src_rel, + blknum, + Version::Modified(modification), + true, + ctx, + ) .await?; modification.put_rel_page_image(dst_rel, blknum, content)?; num_blocks_copied += 1; @@ -1104,7 +1106,7 @@ impl<'a> WalIngest<'a> { 
modification.put_rel_page_image(rel, fsm_physical_page_no, ZERO_PAGE.clone())?; fsm_physical_page_no += 1; } - let nblocks = self.get_relsize(rel, modification.lsn, ctx).await?; + let nblocks = get_relsize(modification, rel, ctx).await?; if nblocks > fsm_physical_page_no { // check if something to do: FSM is larger than truncate position self.put_rel_truncation(modification, rel, fsm_physical_page_no, ctx) @@ -1126,7 +1128,7 @@ impl<'a> WalIngest<'a> { modification.put_rel_page_image(rel, vm_page_no, ZERO_PAGE.clone())?; vm_page_no += 1; } - let nblocks = self.get_relsize(rel, modification.lsn, ctx).await?; + let nblocks = get_relsize(modification, rel, ctx).await?; if nblocks > vm_page_no { // check if something to do: VM is larger than truncate position self.put_rel_truncation(modification, rel, vm_page_no, ctx) @@ -1199,10 +1201,9 @@ impl<'a> WalIngest<'a> { dbnode: xnode.dbnode, relnode: xnode.relnode, }; - let last_lsn = self.timeline.get_last_record_lsn(); if modification .tline - .get_rel_exists(rel, last_lsn, true, ctx) + .get_rel_exists(rel, Version::Modified(modification), true, ctx) .await? { self.put_rel_drop(modification, rel, ctx).await?; @@ -1256,10 +1257,9 @@ impl<'a> WalIngest<'a> { // will block waiting for the last valid LSN to advance up to // it. So we use the previous record's LSN in the get calls // instead. - let req_lsn = modification.tline.get_last_record_lsn(); for segno in modification .tline - .list_slru_segments(SlruKind::Clog, req_lsn, ctx) + .list_slru_segments(SlruKind::Clog, Version::Modified(modification), ctx) .await? { let segpage = segno * pg_constants::SLRU_PAGES_PER_SEGMENT; @@ -1471,20 +1471,6 @@ impl<'a> WalIngest<'a> { Ok(()) } - async fn get_relsize( - &mut self, - rel: RelTag, - lsn: Lsn, - ctx: &RequestContext, - ) -> anyhow::Result { - let nblocks = if !self.timeline.get_rel_exists(rel, lsn, true, ctx).await? { - 0 - } else { - self.timeline.get_rel_size(rel, lsn, true, ctx).await? 
- }; - Ok(nblocks) - } - async fn handle_rel_extend( &mut self, modification: &mut DatadirModification<'_>, @@ -1496,7 +1482,6 @@ impl<'a> WalIngest<'a> { // Check if the relation exists. We implicitly create relations on first // record. // TODO: would be nice if to be more explicit about it - let last_lsn = modification.lsn; // Get current size and put rel creation if rel doesn't exist // @@ -1504,11 +1489,14 @@ impl<'a> WalIngest<'a> { // check the cache too. This is because eagerly checking the cache results in // less work overall and 10% better performance. It's more work on cache miss // but cache miss is rare. - let old_nblocks = if let Some(nblocks) = self.timeline.get_cached_rel_size(&rel, last_lsn) { + let old_nblocks = if let Some(nblocks) = modification + .tline + .get_cached_rel_size(&rel, modification.get_lsn()) + { nblocks - } else if !self - .timeline - .get_rel_exists(rel, last_lsn, true, ctx) + } else if !modification + .tline + .get_rel_exists(rel, Version::Modified(modification), true, ctx) .await? { // create it with 0 size initially, the logic below will extend it @@ -1518,7 +1506,10 @@ impl<'a> WalIngest<'a> { .context("Relation Error")?; 0 } else { - self.timeline.get_rel_size(rel, last_lsn, true, ctx).await? + modification + .tline + .get_rel_size(rel, Version::Modified(modification), true, ctx) + .await? }; if new_nblocks > old_nblocks { @@ -1571,10 +1562,9 @@ impl<'a> WalIngest<'a> { // Check if the relation exists. We implicitly create relations on first // record. // TODO: would be nice if to be more explicit about it - let last_lsn = self.timeline.get_last_record_lsn(); - let old_nblocks = if !self - .timeline - .get_slru_segment_exists(kind, segno, last_lsn, ctx) + let old_nblocks = if !modification + .tline + .get_slru_segment_exists(kind, segno, Version::Modified(modification), ctx) .await? 
{ // create it with 0 size initially, the logic below will extend it @@ -1583,8 +1573,9 @@ impl<'a> WalIngest<'a> { .await?; 0 } else { - self.timeline - .get_slru_segment_size(kind, segno, last_lsn, ctx) + modification + .tline + .get_slru_segment_size(kind, segno, Version::Modified(modification), ctx) .await? }; @@ -1607,11 +1598,32 @@ impl<'a> WalIngest<'a> { } } +async fn get_relsize( + modification: &DatadirModification<'_>, + rel: RelTag, + ctx: &RequestContext, +) -> anyhow::Result { + let nblocks = if !modification + .tline + .get_rel_exists(rel, Version::Modified(modification), true, ctx) + .await? + { + 0 + } else { + modification + .tline + .get_rel_size(rel, Version::Modified(modification), true, ctx) + .await? + }; + Ok(nblocks) +} + #[allow(clippy::bool_assert_comparison)] #[cfg(test)] mod tests { use super::*; use crate::tenant::harness::*; + use crate::tenant::remote_timeline_client::{remote_initdb_archive_path, INITDB_PATH}; use crate::tenant::Timeline; use postgres_ffi::v14::xlog_utils::SIZEOF_CHECKPOINT; use postgres_ffi::RELSEG_SIZE; @@ -1632,10 +1644,7 @@ mod tests { static ZERO_CHECKPOINT: Bytes = Bytes::from_static(&[0u8; SIZEOF_CHECKPOINT]); - async fn init_walingest_test<'a>( - tline: &'a Timeline, - ctx: &RequestContext, - ) -> Result> { + async fn init_walingest_test(tline: &Timeline, ctx: &RequestContext) -> Result { let mut m = tline.begin_modification(Lsn(0x10)); m.put_checkpoint(ZERO_CHECKPOINT.clone())?; m.put_relmap_file(0, 111, Bytes::from(""), ctx).await?; // dummy relmapper file @@ -1680,29 +1689,29 @@ mod tests { // The relation was created at LSN 2, not visible at LSN 1 yet. 
assert_eq!( tline - .get_rel_exists(TESTREL_A, Lsn(0x10), false, &ctx) + .get_rel_exists(TESTREL_A, Version::Lsn(Lsn(0x10)), false, &ctx) .await?, false ); assert!(tline - .get_rel_size(TESTREL_A, Lsn(0x10), false, &ctx) + .get_rel_size(TESTREL_A, Version::Lsn(Lsn(0x10)), false, &ctx) .await .is_err()); assert_eq!( tline - .get_rel_exists(TESTREL_A, Lsn(0x20), false, &ctx) + .get_rel_exists(TESTREL_A, Version::Lsn(Lsn(0x20)), false, &ctx) .await?, true ); assert_eq!( tline - .get_rel_size(TESTREL_A, Lsn(0x20), false, &ctx) + .get_rel_size(TESTREL_A, Version::Lsn(Lsn(0x20)), false, &ctx) .await?, 1 ); assert_eq!( tline - .get_rel_size(TESTREL_A, Lsn(0x50), false, &ctx) + .get_rel_size(TESTREL_A, Version::Lsn(Lsn(0x50)), false, &ctx) .await?, 3 ); @@ -1710,46 +1719,46 @@ mod tests { // Check page contents at each LSN assert_eq!( tline - .get_rel_page_at_lsn(TESTREL_A, 0, Lsn(0x20), false, &ctx) + .get_rel_page_at_lsn(TESTREL_A, 0, Version::Lsn(Lsn(0x20)), false, &ctx) .await?, TEST_IMG("foo blk 0 at 2") ); assert_eq!( tline - .get_rel_page_at_lsn(TESTREL_A, 0, Lsn(0x30), false, &ctx) + .get_rel_page_at_lsn(TESTREL_A, 0, Version::Lsn(Lsn(0x30)), false, &ctx) .await?, TEST_IMG("foo blk 0 at 3") ); assert_eq!( tline - .get_rel_page_at_lsn(TESTREL_A, 0, Lsn(0x40), false, &ctx) + .get_rel_page_at_lsn(TESTREL_A, 0, Version::Lsn(Lsn(0x40)), false, &ctx) .await?, TEST_IMG("foo blk 0 at 3") ); assert_eq!( tline - .get_rel_page_at_lsn(TESTREL_A, 1, Lsn(0x40), false, &ctx) + .get_rel_page_at_lsn(TESTREL_A, 1, Version::Lsn(Lsn(0x40)), false, &ctx) .await?, TEST_IMG("foo blk 1 at 4") ); assert_eq!( tline - .get_rel_page_at_lsn(TESTREL_A, 0, Lsn(0x50), false, &ctx) + .get_rel_page_at_lsn(TESTREL_A, 0, Version::Lsn(Lsn(0x50)), false, &ctx) .await?, TEST_IMG("foo blk 0 at 3") ); assert_eq!( tline - .get_rel_page_at_lsn(TESTREL_A, 1, Lsn(0x50), false, &ctx) + .get_rel_page_at_lsn(TESTREL_A, 1, Version::Lsn(Lsn(0x50)), false, &ctx) .await?, TEST_IMG("foo blk 1 at 4") ); assert_eq!( 
tline - .get_rel_page_at_lsn(TESTREL_A, 2, Lsn(0x50), false, &ctx) + .get_rel_page_at_lsn(TESTREL_A, 2, Version::Lsn(Lsn(0x50)), false, &ctx) .await?, TEST_IMG("foo blk 2 at 5") ); @@ -1765,19 +1774,19 @@ mod tests { // Check reported size and contents after truncation assert_eq!( tline - .get_rel_size(TESTREL_A, Lsn(0x60), false, &ctx) + .get_rel_size(TESTREL_A, Version::Lsn(Lsn(0x60)), false, &ctx) .await?, 2 ); assert_eq!( tline - .get_rel_page_at_lsn(TESTREL_A, 0, Lsn(0x60), false, &ctx) + .get_rel_page_at_lsn(TESTREL_A, 0, Version::Lsn(Lsn(0x60)), false, &ctx) .await?, TEST_IMG("foo blk 0 at 3") ); assert_eq!( tline - .get_rel_page_at_lsn(TESTREL_A, 1, Lsn(0x60), false, &ctx) + .get_rel_page_at_lsn(TESTREL_A, 1, Version::Lsn(Lsn(0x60)), false, &ctx) .await?, TEST_IMG("foo blk 1 at 4") ); @@ -1785,13 +1794,13 @@ mod tests { // should still see the truncated block with older LSN assert_eq!( tline - .get_rel_size(TESTREL_A, Lsn(0x50), false, &ctx) + .get_rel_size(TESTREL_A, Version::Lsn(Lsn(0x50)), false, &ctx) .await?, 3 ); assert_eq!( tline - .get_rel_page_at_lsn(TESTREL_A, 2, Lsn(0x50), false, &ctx) + .get_rel_page_at_lsn(TESTREL_A, 2, Version::Lsn(Lsn(0x50)), false, &ctx) .await?, TEST_IMG("foo blk 2 at 5") ); @@ -1804,7 +1813,7 @@ mod tests { m.commit(&ctx).await?; assert_eq!( tline - .get_rel_size(TESTREL_A, Lsn(0x68), false, &ctx) + .get_rel_size(TESTREL_A, Version::Lsn(Lsn(0x68)), false, &ctx) .await?, 0 ); @@ -1817,19 +1826,19 @@ mod tests { m.commit(&ctx).await?; assert_eq!( tline - .get_rel_size(TESTREL_A, Lsn(0x70), false, &ctx) + .get_rel_size(TESTREL_A, Version::Lsn(Lsn(0x70)), false, &ctx) .await?, 2 ); assert_eq!( tline - .get_rel_page_at_lsn(TESTREL_A, 0, Lsn(0x70), false, &ctx) + .get_rel_page_at_lsn(TESTREL_A, 0, Version::Lsn(Lsn(0x70)), false, &ctx) .await?, ZERO_PAGE ); assert_eq!( tline - .get_rel_page_at_lsn(TESTREL_A, 1, Lsn(0x70), false, &ctx) + .get_rel_page_at_lsn(TESTREL_A, 1, Version::Lsn(Lsn(0x70)), false, &ctx) .await?, 
TEST_IMG("foo blk 1") ); @@ -1842,21 +1851,21 @@ mod tests { m.commit(&ctx).await?; assert_eq!( tline - .get_rel_size(TESTREL_A, Lsn(0x80), false, &ctx) + .get_rel_size(TESTREL_A, Version::Lsn(Lsn(0x80)), false, &ctx) .await?, 1501 ); for blk in 2..1500 { assert_eq!( tline - .get_rel_page_at_lsn(TESTREL_A, blk, Lsn(0x80), false, &ctx) + .get_rel_page_at_lsn(TESTREL_A, blk, Version::Lsn(Lsn(0x80)), false, &ctx) .await?, ZERO_PAGE ); } assert_eq!( tline - .get_rel_page_at_lsn(TESTREL_A, 1500, Lsn(0x80), false, &ctx) + .get_rel_page_at_lsn(TESTREL_A, 1500, Version::Lsn(Lsn(0x80)), false, &ctx) .await?, TEST_IMG("foo blk 1500") ); @@ -1883,13 +1892,13 @@ mod tests { // Check that rel exists and size is correct assert_eq!( tline - .get_rel_exists(TESTREL_A, Lsn(0x20), false, &ctx) + .get_rel_exists(TESTREL_A, Version::Lsn(Lsn(0x20)), false, &ctx) .await?, true ); assert_eq!( tline - .get_rel_size(TESTREL_A, Lsn(0x20), false, &ctx) + .get_rel_size(TESTREL_A, Version::Lsn(Lsn(0x20)), false, &ctx) .await?, 1 ); @@ -1902,7 +1911,7 @@ mod tests { // Check that rel is not visible anymore assert_eq!( tline - .get_rel_exists(TESTREL_A, Lsn(0x30), false, &ctx) + .get_rel_exists(TESTREL_A, Version::Lsn(Lsn(0x30)), false, &ctx) .await?, false ); @@ -1920,13 +1929,13 @@ mod tests { // Check that rel exists and size is correct assert_eq!( tline - .get_rel_exists(TESTREL_A, Lsn(0x40), false, &ctx) + .get_rel_exists(TESTREL_A, Version::Lsn(Lsn(0x40)), false, &ctx) .await?, true ); assert_eq!( tline - .get_rel_size(TESTREL_A, Lsn(0x40), false, &ctx) + .get_rel_size(TESTREL_A, Version::Lsn(Lsn(0x40)), false, &ctx) .await?, 1 ); @@ -1959,24 +1968,24 @@ mod tests { // The relation was created at LSN 20, not visible at LSN 1 yet. 
assert_eq!( tline - .get_rel_exists(TESTREL_A, Lsn(0x10), false, &ctx) + .get_rel_exists(TESTREL_A, Version::Lsn(Lsn(0x10)), false, &ctx) .await?, false ); assert!(tline - .get_rel_size(TESTREL_A, Lsn(0x10), false, &ctx) + .get_rel_size(TESTREL_A, Version::Lsn(Lsn(0x10)), false, &ctx) .await .is_err()); assert_eq!( tline - .get_rel_exists(TESTREL_A, Lsn(0x20), false, &ctx) + .get_rel_exists(TESTREL_A, Version::Lsn(Lsn(0x20)), false, &ctx) .await?, true ); assert_eq!( tline - .get_rel_size(TESTREL_A, Lsn(0x20), false, &ctx) + .get_rel_size(TESTREL_A, Version::Lsn(Lsn(0x20)), false, &ctx) .await?, relsize ); @@ -1987,7 +1996,7 @@ mod tests { let data = format!("foo blk {} at {}", blkno, lsn); assert_eq!( tline - .get_rel_page_at_lsn(TESTREL_A, blkno, lsn, false, &ctx) + .get_rel_page_at_lsn(TESTREL_A, blkno, Version::Lsn(lsn), false, &ctx) .await?, TEST_IMG(&data) ); @@ -2004,7 +2013,7 @@ mod tests { // Check reported size and contents after truncation assert_eq!( tline - .get_rel_size(TESTREL_A, Lsn(0x60), false, &ctx) + .get_rel_size(TESTREL_A, Version::Lsn(Lsn(0x60)), false, &ctx) .await?, 1 ); @@ -2014,7 +2023,7 @@ mod tests { let data = format!("foo blk {} at {}", blkno, lsn); assert_eq!( tline - .get_rel_page_at_lsn(TESTREL_A, blkno, Lsn(0x60), false, &ctx) + .get_rel_page_at_lsn(TESTREL_A, blkno, Version::Lsn(Lsn(0x60)), false, &ctx) .await?, TEST_IMG(&data) ); @@ -2023,7 +2032,7 @@ mod tests { // should still see all blocks with older LSN assert_eq!( tline - .get_rel_size(TESTREL_A, Lsn(0x50), false, &ctx) + .get_rel_size(TESTREL_A, Version::Lsn(Lsn(0x50)), false, &ctx) .await?, relsize ); @@ -2032,7 +2041,7 @@ mod tests { let data = format!("foo blk {} at {}", blkno, lsn); assert_eq!( tline - .get_rel_page_at_lsn(TESTREL_A, blkno, Lsn(0x50), false, &ctx) + .get_rel_page_at_lsn(TESTREL_A, blkno, Version::Lsn(Lsn(0x50)), false, &ctx) .await?, TEST_IMG(&data) ); @@ -2052,13 +2061,13 @@ mod tests { assert_eq!( tline - .get_rel_exists(TESTREL_A, Lsn(0x80), false, 
&ctx) + .get_rel_exists(TESTREL_A, Version::Lsn(Lsn(0x80)), false, &ctx) .await?, true ); assert_eq!( tline - .get_rel_size(TESTREL_A, Lsn(0x80), false, &ctx) + .get_rel_size(TESTREL_A, Version::Lsn(Lsn(0x80)), false, &ctx) .await?, relsize ); @@ -2068,7 +2077,7 @@ mod tests { let data = format!("foo blk {} at {}", blkno, lsn); assert_eq!( tline - .get_rel_page_at_lsn(TESTREL_A, blkno, Lsn(0x80), false, &ctx) + .get_rel_page_at_lsn(TESTREL_A, blkno, Version::Lsn(Lsn(0x80)), false, &ctx) .await?, TEST_IMG(&data) ); @@ -2101,7 +2110,9 @@ mod tests { assert_current_logical_size(&tline, Lsn(lsn)); assert_eq!( - tline.get_rel_size(TESTREL_A, Lsn(lsn), false, &ctx).await?, + tline + .get_rel_size(TESTREL_A, Version::Lsn(Lsn(lsn)), false, &ctx) + .await?, RELSEG_SIZE + 1 ); @@ -2113,7 +2124,9 @@ mod tests { .await?; m.commit(&ctx).await?; assert_eq!( - tline.get_rel_size(TESTREL_A, Lsn(lsn), false, &ctx).await?, + tline + .get_rel_size(TESTREL_A, Version::Lsn(Lsn(lsn)), false, &ctx) + .await?, RELSEG_SIZE ); assert_current_logical_size(&tline, Lsn(lsn)); @@ -2126,7 +2139,9 @@ mod tests { .await?; m.commit(&ctx).await?; assert_eq!( - tline.get_rel_size(TESTREL_A, Lsn(lsn), false, &ctx).await?, + tline + .get_rel_size(TESTREL_A, Version::Lsn(Lsn(lsn)), false, &ctx) + .await?, RELSEG_SIZE - 1 ); assert_current_logical_size(&tline, Lsn(lsn)); @@ -2142,7 +2157,9 @@ mod tests { .await?; m.commit(&ctx).await?; assert_eq!( - tline.get_rel_size(TESTREL_A, Lsn(lsn), false, &ctx).await?, + tline + .get_rel_size(TESTREL_A, Version::Lsn(Lsn(lsn)), false, &ctx) + .await?, size as BlockNumber ); @@ -2177,21 +2194,25 @@ mod tests { let pg_version = 15; // The test data was generated by pg15 let path = "test_data/sk_wal_segment_from_pgbench"; let wal_segment_path = format!("{path}/000000010000000000000001.zst"); + let source_initdb_path = format!("{path}/{INITDB_PATH}"); let startpoint = Lsn::from_hex("14AEC08").unwrap(); - let endpoint = Lsn::from_hex("1FFFF98").unwrap(); + let _endpoint 
= Lsn::from_hex("1FFFF98").unwrap(); + + let harness = TenantHarness::create("test_ingest_real_wal").unwrap(); + let (tenant, ctx) = harness.load().await; + + let remote_initdb_path = remote_initdb_archive_path(&tenant.tenant_id(), &TIMELINE_ID); + let initdb_path = harness.remote_fs_dir.join(remote_initdb_path.get_path()); + + std::fs::create_dir_all(initdb_path.parent().unwrap()) + .expect("creating test dir should work"); + std::fs::copy(source_initdb_path, initdb_path).expect("copying the initdb.tar.zst works"); // Bootstrap a real timeline. We can't use create_test_timeline because // it doesn't create a real checkpoint, and Walingest::new tries to parse // the garbage data. - // - // TODO use the initdb.tar.zst file stored with the test data to avoid - // problems with inconsistent initdb results after pg minor version bumps. - let (tenant, ctx) = TenantHarness::create("test_ingest_real_wal") - .unwrap() - .load() - .await; let tline = tenant - .bootstrap_timeline_test(TIMELINE_ID, pg_version, None, &ctx) + .bootstrap_timeline_test(TIMELINE_ID, pg_version, Some(TIMELINE_ID), &ctx) .await .unwrap(); @@ -2217,7 +2238,7 @@ mod tests { let mut walingest = WalIngest::new(tline.as_ref(), startpoint, &ctx) .await .unwrap(); - let mut modification = tline.begin_modification(endpoint); + let mut modification = tline.begin_modification(startpoint); let mut decoded = DecodedWALRecord::default(); println!("decoding {} bytes", bytes.len() - xlogoff); @@ -2231,6 +2252,7 @@ mod tests { .await .unwrap(); } + modification.commit(&ctx).await.unwrap(); } let duration = started_at.elapsed(); diff --git a/pageserver/src/walredo.rs b/pageserver/src/walredo.rs index 94e95fd3b3..6918698f29 100644 --- a/pageserver/src/walredo.rs +++ b/pageserver/src/walredo.rs @@ -22,6 +22,7 @@ use anyhow::Context; use byteorder::{ByteOrder, LittleEndian}; use bytes::{BufMut, Bytes, BytesMut}; use nix::poll::*; +use pageserver_api::shard::TenantShardId; use serde::Serialize; use 
std::collections::VecDeque; use std::io; @@ -35,14 +36,11 @@ use std::sync::{Arc, Mutex, MutexGuard, RwLock}; use std::time::Duration; use std::time::Instant; use tracing::*; -use utils::{bin_ser::BeSer, id::TenantId, lsn::Lsn, nonblock::set_nonblock}; +use utils::{bin_ser::BeSer, lsn::Lsn, nonblock::set_nonblock}; #[cfg(feature = "testing")] use std::sync::atomic::{AtomicUsize, Ordering}; -#[cfg(feature = "testing")] -use pageserver_api::shard::TenantShardId; - use crate::config::PageServerConf; use crate::metrics::{ WalRedoKillCause, WAL_REDO_BYTES_HISTOGRAM, WAL_REDO_PROCESS_COUNTERS, @@ -92,7 +90,7 @@ struct ProcessOutput { /// records. /// pub struct PostgresRedoManager { - tenant_id: TenantId, + tenant_shard_id: TenantShardId, conf: &'static PageServerConf, last_redo_at: std::sync::Mutex>, redo_process: RwLock>>, @@ -186,10 +184,13 @@ impl PostgresRedoManager { /// /// Create a new PostgresRedoManager. /// - pub fn new(conf: &'static PageServerConf, tenant_id: TenantId) -> PostgresRedoManager { + pub fn new( + conf: &'static PageServerConf, + tenant_shard_id: TenantShardId, + ) -> PostgresRedoManager { // The actual process is launched lazily, on first request. PostgresRedoManager { - tenant_id, + tenant_shard_id, conf, last_redo_at: std::sync::Mutex::default(), redo_process: RwLock::new(None), @@ -244,8 +245,12 @@ impl PostgresRedoManager { let timer = WAL_REDO_PROCESS_LAUNCH_DURATION_HISTOGRAM.start_timer(); let proc = Arc::new( - WalRedoProcess::launch(self.conf, self.tenant_id, pg_version) - .context("launch walredo process")?, + WalRedoProcess::launch( + self.conf, + self.tenant_shard_id, + pg_version, + ) + .context("launch walredo process")?, ); timer.observe_duration(); *proc_guard = Some(Arc::clone(&proc)); @@ -638,7 +643,7 @@ impl CloseFileDescriptors for C { struct WalRedoProcess { #[allow(dead_code)] conf: &'static PageServerConf, - tenant_id: TenantId, + tenant_shard_id: TenantShardId, // Some() on construction, only becomes None on Drop. 
child: Option, stdout: Mutex, @@ -652,10 +657,10 @@ impl WalRedoProcess { // // Start postgres binary in special WAL redo mode. // - #[instrument(skip_all,fields(tenant_id=%tenant_id, pg_version=pg_version))] + #[instrument(skip_all,fields(tenant_id=%tenant_shard_id.tenant_id, shard_id=%tenant_shard_id.shard_slug(), pg_version=pg_version))] fn launch( conf: &'static PageServerConf, - tenant_id: TenantId, + tenant_shard_id: TenantShardId, pg_version: u32, ) -> anyhow::Result { let pg_bin_dir_path = conf.pg_bin_dir(pg_version).context("pg_bin_dir")?; // TODO these should be infallible. @@ -680,7 +685,7 @@ impl WalRedoProcess { // as close-on-exec by default, but that's not enough, since we use // libraries that directly call libc open without setting that flag. .close_fds() - .spawn_no_leak_child(tenant_id) + .spawn_no_leak_child(tenant_shard_id) .context("spawn process")?; WAL_REDO_PROCESS_COUNTERS.started.inc(); let mut child = scopeguard::guard(child, |child| { @@ -741,12 +746,12 @@ impl WalRedoProcess { error!(error=?e, "failed to read from walredo stderr"); } } - }.instrument(tracing::info_span!(parent: None, "wal-redo-postgres-stderr", pid = child.id(), tenant_id = %tenant_id, %pg_version)) + }.instrument(tracing::info_span!(parent: None, "wal-redo-postgres-stderr", pid = child.id(), tenant_id = %tenant_shard_id.tenant_id, shard_id = %tenant_shard_id.shard_slug(), %pg_version)) ); Ok(Self { conf, - tenant_id, + tenant_shard_id, child: Some(child), stdin: Mutex::new(ProcessInput { stdin, @@ -772,7 +777,7 @@ impl WalRedoProcess { // Apply given WAL records ('records') over an old page image. Returns // new page image. 
// - #[instrument(skip_all, fields(tenant_id=%self.tenant_id, pid=%self.id()))] + #[instrument(skip_all, fields(tenant_id=%self.tenant_shard_id.tenant_id, shard_id=%self.tenant_shard_id.shard_slug(), pid=%self.id()))] fn apply_wal_records( &self, tag: BufferTag, @@ -966,11 +971,7 @@ impl WalRedoProcess { // these files will be collected to an allure report let filename = format!("walredo-{millis}-{}-{seq}.walredo", writebuf.len()); - // TODO(sharding): update this call when WalRedoProcess gets a TenantShardId. - let path = self - .conf - .tenant_path(&TenantShardId::unsharded(self.tenant_id)) - .join(&filename); + let path = self.conf.tenant_path(&self.tenant_shard_id).join(&filename); let res = std::fs::OpenOptions::new() .write(true) @@ -1004,7 +1005,7 @@ impl Drop for WalRedoProcess { /// Wrapper type around `std::process::Child` which guarantees that the child /// will be killed and waited-for by this process before being dropped. struct NoLeakChild { - tenant_id: TenantId, + tenant_id: TenantShardId, child: Option, } @@ -1023,7 +1024,7 @@ impl DerefMut for NoLeakChild { } impl NoLeakChild { - fn spawn(tenant_id: TenantId, command: &mut Command) -> io::Result { + fn spawn(tenant_id: TenantShardId, command: &mut Command) -> io::Result { let child = command.spawn()?; Ok(NoLeakChild { tenant_id, @@ -1078,7 +1079,7 @@ impl Drop for NoLeakChild { Some(child) => child, None => return, }; - let tenant_id = self.tenant_id; + let tenant_shard_id = self.tenant_id; // Offload the kill+wait of the child process into the background. // If someone stops the runtime, we'll leak the child process. // We can ignore that case because we only stop the runtime on pageserver exit. @@ -1086,7 +1087,11 @@ impl Drop for NoLeakChild { tokio::task::spawn_blocking(move || { // Intentionally don't inherit the tracing context from whoever is dropping us. // This thread here is going to outlive of our dropper. 
- let span = tracing::info_span!("walredo", %tenant_id); + let span = tracing::info_span!( + "walredo", + tenant_id = %tenant_shard_id.tenant_id, + shard_id = %tenant_shard_id.shard_slug() + ); let _entered = span.enter(); Self::kill_and_wait_impl(child, WalRedoKillCause::NoLeakChildDrop); }) @@ -1096,11 +1101,11 @@ impl Drop for NoLeakChild { } trait NoLeakChildCommandExt { - fn spawn_no_leak_child(&mut self, tenant_id: TenantId) -> io::Result; + fn spawn_no_leak_child(&mut self, tenant_id: TenantShardId) -> io::Result; } impl NoLeakChildCommandExt for Command { - fn spawn_no_leak_child(&mut self, tenant_id: TenantId) -> io::Result { + fn spawn_no_leak_child(&mut self, tenant_id: TenantShardId) -> io::Result { NoLeakChild::spawn(tenant_id, self) } } @@ -1155,6 +1160,7 @@ mod tests { use crate::repository::Key; use crate::{config::PageServerConf, walrecord::NeonWalRecord}; use bytes::Bytes; + use pageserver_api::shard::TenantShardId; use std::str::FromStr; use utils::{id::TenantId, lsn::Lsn}; @@ -1264,9 +1270,9 @@ mod tests { let repo_dir = camino_tempfile::tempdir()?; let conf = PageServerConf::dummy_conf(repo_dir.path().to_path_buf()); let conf = Box::leak(Box::new(conf)); - let tenant_id = TenantId::generate(); + let tenant_shard_id = TenantShardId::unsharded(TenantId::generate()); - let manager = PostgresRedoManager::new(conf, tenant_id); + let manager = PostgresRedoManager::new(conf, tenant_shard_id); Ok(RedoHarness { _repo_dir: repo_dir, diff --git a/pgxn/neon/Makefile b/pgxn/neon/Makefile index 466e346e46..c6b224a14d 100644 --- a/pgxn/neon/Makefile +++ b/pgxn/neon/Makefile @@ -9,6 +9,7 @@ OBJS = \ libpagestore.o \ neon.o \ neon_utils.o \ + neon_walreader.o \ pagestore_smgr.o \ relsize_cache.o \ walproposer.o \ diff --git a/pgxn/neon/libpagestore.c b/pgxn/neon/libpagestore.c index 3b038f906f..3a7c0f1bb6 100644 --- a/pgxn/neon/libpagestore.c +++ b/pgxn/neon/libpagestore.c @@ -35,7 +35,8 @@ #define PageStoreTrace DEBUG5 -#define RECONNECT_INTERVAL_USEC 1000000 
+#define MIN_RECONNECT_INTERVAL_USEC 1000 +#define MAX_RECONNECT_INTERVAL_USEC 1000000 bool connected = false; PGconn *pageserver_conn = NULL; @@ -133,6 +134,11 @@ pageserver_connect(int elevel) const char *values[3]; int n; + static TimestampTz last_connect_time = 0; + static uint64_t delay_us = MIN_RECONNECT_INTERVAL_USEC; + TimestampTz now; + uint64_t us_since_last_connect; + Assert(!connected); if (CheckConnstringUpdated()) @@ -140,6 +146,22 @@ pageserver_connect(int elevel) ReloadConnstring(); } + now = GetCurrentTimestamp(); + us_since_last_connect = now - last_connect_time; + if (us_since_last_connect < delay_us) + { + pg_usleep(delay_us - us_since_last_connect); + delay_us *= 2; + if (delay_us > MAX_RECONNECT_INTERVAL_USEC) + delay_us = MAX_RECONNECT_INTERVAL_USEC; + last_connect_time = GetCurrentTimestamp(); + } + else + { + delay_us = MIN_RECONNECT_INTERVAL_USEC; + last_connect_time = now; + } + /* * Connect using the connection string we got from the * neon.pageserver_connstring GUC. If the NEON_AUTH_TOKEN environment @@ -333,7 +355,6 @@ pageserver_send(NeonRequest *request) { HandleMainLoopInterrupts(); n_reconnect_attempts += 1; - pg_usleep(RECONNECT_INTERVAL_USEC); } n_reconnect_attempts = 0; } diff --git a/pgxn/neon/libpqwalproposer.h b/pgxn/neon/libpqwalproposer.h new file mode 100644 index 0000000000..cd7e568a47 --- /dev/null +++ b/pgxn/neon/libpqwalproposer.h @@ -0,0 +1,96 @@ +/* + * Interface to set of libpq wrappers walproposer and neon_walreader need. + * Similar to libpqwalreceiver, but it has blocking connection establishment and + * pqexec which don't fit us. Implementation is at walproposer_pg.c. + */ +#ifndef ___LIBPQWALPROPOSER_H__ +#define ___LIBPQWALPROPOSER_H__ + +/* Re-exported and modified ExecStatusType */ +typedef enum +{ + /* We received a single CopyBoth result */ + WP_EXEC_SUCCESS_COPYBOTH, + + /* + * Any success result other than a single CopyBoth was received. 
The + * specifics of the result were already logged, but it may be useful to + * provide an error message indicating which safekeeper messed up. + * + * Do not expect PQerrorMessage to be appropriately set. + */ + WP_EXEC_UNEXPECTED_SUCCESS, + + /* + * No result available at this time. Wait until read-ready, then call + * again. Internally, this is returned when PQisBusy indicates that + * PQgetResult would block. + */ + WP_EXEC_NEEDS_INPUT, + /* Catch-all failure. Check PQerrorMessage. */ + WP_EXEC_FAILED, +} WalProposerExecStatusType; + +/* Possible return values from walprop_async_read */ +typedef enum +{ + /* The full read was successful. buf now points to the data */ + PG_ASYNC_READ_SUCCESS, + + /* + * The read is ongoing. Wait until the connection is read-ready, then try + * again. + */ + PG_ASYNC_READ_TRY_AGAIN, + /* Reading failed. Check PQerrorMessage(conn) */ + PG_ASYNC_READ_FAIL, +} PGAsyncReadResult; + +/* Possible return values from walprop_async_write */ +typedef enum +{ + /* The write fully completed */ + PG_ASYNC_WRITE_SUCCESS, + + /* + * The write started, but you'll need to call PQflush some more times to + * finish it off. We just tried, so it's best to wait until the connection + * is read- or write-ready to try again. + * + * If it becomes read-ready, call PQconsumeInput and flush again. If it + * becomes write-ready, just call PQflush. + */ + PG_ASYNC_WRITE_TRY_FLUSH, + /* Writing failed. Check PQerrorMessage(conn) */ + PG_ASYNC_WRITE_FAIL, +} PGAsyncWriteResult; + +/* + * This header is included by walproposer.h to define walproposer_api; if we're + * building walproposer without pg, ignore libpq part, leaving only interface + * types. + */ +#ifndef WALPROPOSER_LIB + +#include "libpq-fe.h" + +/* + * Sometimes working directly with underlying PGconn is simpler, export the + * whole thing for simplicity. 
+ */ +typedef struct WalProposerConn +{ + PGconn *pg_conn; + bool is_nonblocking; /* whether the connection is non-blocking */ + char *recvbuf; /* last received CopyData message from + * walprop_async_read */ +} WalProposerConn; + +extern WalProposerConn *libpqwp_connect_start(char *conninfo); +extern bool libpqwp_send_query(WalProposerConn *conn, char *query); +extern WalProposerExecStatusType libpqwp_get_query_result(WalProposerConn *conn); +extern PGAsyncReadResult libpqwp_async_read(WalProposerConn *conn, char **buf, int *amount); +extern void libpqwp_disconnect(WalProposerConn *conn); + +#endif /* WALPROPOSER_LIB */ +#endif /* ___LIBPQWALPROPOSER_H__ */ diff --git a/pgxn/neon/neon_walreader.c b/pgxn/neon/neon_walreader.c new file mode 100644 index 0000000000..f7ec9e5bfa --- /dev/null +++ b/pgxn/neon/neon_walreader.c @@ -0,0 +1,742 @@ +/* + * Like WALRead, but when WAL segment doesn't exist locally instead of throwing + * ERROR asynchronously tries to fetch it from the most advanced safekeeper. + * + * We can't use libpqwalreceiver as it blocks during connection establishment + * (and waiting for PQExec result), so use libpqwalproposer instead. + * + * TODO: keepalives are currently never sent, so the other side can close the + * connection prematurely. + * + * TODO: close conn if reading takes too long to prevent stuck connections. + */ +#include "postgres.h" + +#include +#include + +#include "access/xlog_internal.h" +#include "access/xlogdefs.h" +#include "access/xlogreader.h" +#include "libpq/pqformat.h" +#include "storage/fd.h" +#include "utils/wait_event.h" + +#include "libpq-fe.h" + +#include "neon_walreader.h" +#include "walproposer.h" + +#define NEON_WALREADER_ERR_MSG_LEN 512 + +/* + * Can be called where NeonWALReader *state is available in the context, adds log_prefix. + */ +#define nwr_log(elevel, fmt, ...) 
elog(elevel, "%s" fmt, state->log_prefix, ## __VA_ARGS__) + +static NeonWALReadResult NeonWALReadRemote(NeonWALReader *state, char *buf, XLogRecPtr startptr, Size count, TimeLineID tli); +static NeonWALReadResult NeonWALReaderReadMsg(NeonWALReader *state); +static void NeonWALReaderResetRemote(NeonWALReader *state); +static bool NeonWALReadLocal(NeonWALReader *state, char *buf, XLogRecPtr startptr, Size count, TimeLineID tli); +static bool neon_wal_segment_open(NeonWALReader *state, XLogSegNo nextSegNo, TimeLineID *tli_p); +static void neon_wal_segment_close(NeonWALReader *state); +static bool is_wal_segment_exists(XLogSegNo segno, int segsize, + TimeLineID tli); + +/* + * State of connection to donor safekeeper. + */ +typedef enum +{ + /* no remote connection */ + RS_NONE, + /* doing PQconnectPoll, need readable socket */ + RS_CONNECTING_READ, + /* doing PQconnectPoll, need writable socket */ + RS_CONNECTING_WRITE, + /* Waiting for START_REPLICATION result */ + RS_WAIT_EXEC_RESULT, + /* replication stream established */ + RS_ESTABLISHED, +} NeonWALReaderRemoteState; + +struct NeonWALReader +{ + /* + * LSN before which we assume WAL is not available locally. Exists because + * though first segment after startup always exists, part before + * basebackup LSN is filled with zeros. + */ + XLogRecPtr available_lsn; + WALSegmentContext segcxt; + WALOpenSegment seg; + int wre_errno; + /* Explains failure to read, static for simplicity. */ + char err_msg[NEON_WALREADER_ERR_MSG_LEN]; + + /* + * Saved info about request in progress, used to check validity of + * arguments after resume and remember how far we accomplished it. req_lsn + * is 0 if there is no request in progress. 
+ */ + XLogRecPtr req_lsn; + Size req_len; + Size req_progress; + WalProposer *wp; /* we learn donor through walproposer */ + char donor_name[64]; /* saved donor safekeeper name for logging */ + /* state of connection to safekeeper */ + NeonWALReaderRemoteState rem_state; + WalProposerConn *wp_conn; + + /* + * position in wp_conn recvbuf from which we'll copy WAL next time, or + * NULL if there is no unprocessed message + */ + char *wal_ptr; + Size wal_rem_len; /* how many unprocessed bytes left in recvbuf */ + + /* + * LSN of wal_ptr position according to walsender to cross check against + * read request + */ + XLogRecPtr rem_lsn; + + /* prepended to lines logged by neon_walreader, if provided */ + char log_prefix[64]; +}; + +/* palloc and initialize NeonWALReader */ +NeonWALReader * +NeonWALReaderAllocate(int wal_segment_size, XLogRecPtr available_lsn, WalProposer *wp, char *log_prefix) +{ + NeonWALReader *reader; + + reader = (NeonWALReader *) + palloc_extended(sizeof(NeonWALReader), + MCXT_ALLOC_NO_OOM | MCXT_ALLOC_ZERO); + if (!reader) + return NULL; + + reader->available_lsn = available_lsn; + reader->seg.ws_file = -1; + reader->seg.ws_segno = 0; + reader->seg.ws_tli = 0; + reader->segcxt.ws_segsize = wal_segment_size; + + reader->wp = wp; + + reader->rem_state = RS_NONE; + + if (log_prefix) + strlcpy(reader->log_prefix, log_prefix, sizeof(reader->log_prefix)); + + return reader; +} + +void +NeonWALReaderFree(NeonWALReader *state) +{ + if (state->seg.ws_file != -1) + neon_wal_segment_close(state); + if (state->wp_conn) + libpqwp_disconnect(state->wp_conn); + pfree(state); +} + +/* + * Like vanilla WALRead, but if requested position is before available_lsn or + * WAL segment doesn't exist on disk, it tries to fetch needed segment from the + * advanced safekeeper. + * + * Read 'count' bytes into 'buf', starting at location 'startptr', from WAL + * fetched from timeline 'tli'. 
+ * + * Returns NEON_WALREAD_SUCCESS if succeeded, NEON_WALREAD_ERROR if an error + * occurs, in which case 'err' has the desciption. Error always closes remote + * connection, if there was any, so socket subscription should be removed. + * + * NEON_WALREAD_WOULDBLOCK means caller should obtain socket to wait for with + * NeonWALReaderSocket and call NeonWALRead again with exactly the same + * arguments when NeonWALReaderEvents happen on the socket. Note that per libpq + * docs during connection establishment (before first successful read) socket + * underneath might change. + * + * Also, eventually walreader should switch from remote to local read; caller + * should remove subscription to socket then by checking NeonWALReaderEvents + * after successful read (otherwise next read might reopen the connection with + * different socket). + * + * Reading not monotonically is not supported and will result in error. + * + * Caller should be sure that WAL up to requested LSN exists, otherwise + * NEON_WALREAD_WOULDBLOCK might be always returned. + */ +NeonWALReadResult +NeonWALRead(NeonWALReader *state, char *buf, XLogRecPtr startptr, Size count, TimeLineID tli) +{ + /* + * If requested data is before known available basebackup lsn or there is + * already active remote state, do remote read. + */ + if (startptr < state->available_lsn || state->rem_state != RS_NONE) + { + return NeonWALReadRemote(state, buf, startptr, count, tli); + } + if (NeonWALReadLocal(state, buf, startptr, count, tli)) + { + return NEON_WALREAD_SUCCESS; + } + else if (state->wre_errno == ENOENT) + { + nwr_log(LOG, "local read failed as segment at %X/%X doesn't exist, attempting remote", + LSN_FORMAT_ARGS(startptr)); + return NeonWALReadRemote(state, buf, startptr, count, tli); + } + else + { + return NEON_WALREAD_ERROR; + } +} + +/* Do the read from remote safekeeper. 
*/ +static NeonWALReadResult +NeonWALReadRemote(NeonWALReader *state, char *buf, XLogRecPtr startptr, Size count, TimeLineID tli) +{ + if (state->rem_state == RS_NONE) + { + XLogRecPtr donor_lsn; + + /* no connection yet; start one */ + Safekeeper *donor = GetDonor(state->wp, &donor_lsn); + + if (donor == NULL) + { + snprintf(state->err_msg, sizeof(state->err_msg), + "failed to establish remote connection to fetch WAL: no donor available"); + return NEON_WALREAD_ERROR; + } + snprintf(state->donor_name, sizeof(state->donor_name), "%s:%s", donor->host, donor->port); + nwr_log(LOG, "establishing connection to %s, flush_lsn %X/%X to fetch WAL", + state->donor_name, LSN_FORMAT_ARGS(donor_lsn)); + state->wp_conn = libpqwp_connect_start(donor->conninfo); + if (PQstatus(state->wp_conn->pg_conn) == CONNECTION_BAD) + { + snprintf(state->err_msg, sizeof(state->err_msg), + "failed to connect to %s to fetch WAL: immediately failed with %s", + state->donor_name, PQerrorMessage(state->wp_conn->pg_conn)); + NeonWALReaderResetRemote(state); + return NEON_WALREAD_ERROR; + } + /* we'll poll immediately */ + state->rem_state = RS_CONNECTING_READ; + } + + if (state->rem_state == RS_CONNECTING_READ || state->rem_state == RS_CONNECTING_WRITE) + { + switch (PQconnectPoll(state->wp_conn->pg_conn)) + { + case PGRES_POLLING_FAILED: + snprintf(state->err_msg, sizeof(state->err_msg), + "failed to connect to %s to fetch WAL: poll error: %s", + state->donor_name, PQerrorMessage(state->wp_conn->pg_conn)); + NeonWALReaderResetRemote(state); + return NEON_WALREAD_ERROR; + case PGRES_POLLING_READING: + state->rem_state = RS_CONNECTING_READ; + return NEON_WALREAD_WOULDBLOCK; + case PGRES_POLLING_WRITING: + state->rem_state = RS_CONNECTING_WRITE; + return NEON_WALREAD_WOULDBLOCK; + case PGRES_POLLING_OK: + { + /* connection successfully established */ + char start_repl_query[128]; + + snprintf(start_repl_query, sizeof(start_repl_query), + "START_REPLICATION PHYSICAL %X/%X (term='" UINT64_FORMAT "')", 
+ LSN_FORMAT_ARGS(startptr), state->wp->propTerm); + nwr_log(LOG, "connection to %s to fetch WAL succeeded, running %s", + state->donor_name, start_repl_query); + if (!libpqwp_send_query(state->wp_conn, start_repl_query)) + { + snprintf(state->err_msg, sizeof(state->err_msg), + "failed to send %s query to %s: %s", + start_repl_query, state->donor_name, PQerrorMessage(state->wp_conn->pg_conn)); + NeonWALReaderResetRemote(state); + return NEON_WALREAD_ERROR; + } + state->rem_state = RS_WAIT_EXEC_RESULT; + break; + } + + default: /* there is unused PGRES_POLLING_ACTIVE */ + Assert(false); + return NEON_WALREAD_ERROR; /* keep the compiler quiet */ + } + } + + if (state->rem_state == RS_WAIT_EXEC_RESULT) + { + switch (libpqwp_get_query_result(state->wp_conn)) + { + case WP_EXEC_SUCCESS_COPYBOTH: + state->rem_state = RS_ESTABLISHED; + break; + case WP_EXEC_NEEDS_INPUT: + return NEON_WALREAD_WOULDBLOCK; + case WP_EXEC_FAILED: + snprintf(state->err_msg, sizeof(state->err_msg), + "get START_REPLICATION result from %s failed: %s", + state->donor_name, PQerrorMessage(state->wp_conn->pg_conn)); + NeonWALReaderResetRemote(state); + return NEON_WALREAD_ERROR; + default: /* can't happen */ + snprintf(state->err_msg, sizeof(state->err_msg), + "get START_REPLICATION result from %s: unexpected result", + state->donor_name); + NeonWALReaderResetRemote(state); + return NEON_WALREAD_ERROR; + } + } + + Assert(state->rem_state == RS_ESTABLISHED); + + /* + * If we had the request before, verify args are the same and advance the + * result ptr according to the progress; otherwise register the request. 
+ */ + if (state->req_lsn != InvalidXLogRecPtr) + { + if (state->req_lsn != startptr || state->req_len != count) + { + snprintf(state->err_msg, sizeof(state->err_msg), + "args changed during request, was %X/%X %zu, now %X/%X %zu", + LSN_FORMAT_ARGS(state->req_lsn), state->req_len, LSN_FORMAT_ARGS(startptr), count); + NeonWALReaderResetRemote(state); + return NEON_WALREAD_ERROR; + } + nwr_log(DEBUG5, "continuing remote read at req_lsn=%X/%X len=%zu, req_progress=%zu", + LSN_FORMAT_ARGS(startptr), + count, + state->req_progress); + buf += state->req_progress; + } + else + { + state->req_lsn = startptr; + state->req_len = count; + state->req_progress = 0; + nwr_log(DEBUG5, "starting remote read req_lsn=%X/%X len=%zu", + LSN_FORMAT_ARGS(startptr), + count); + } + + while (true) + { + Size to_copy; + + /* + * If we have no ready data, receive new message. + */ + if (state->wal_rem_len == 0 && + + /* + * check for the sake of 0 length reads; walproposer does these for + * heartbeats, though generally they shouldn't hit remote source. + */ + state->req_len - state->req_progress > 0) + { + NeonWALReadResult read_msg_res = NeonWALReaderReadMsg(state); + + if (read_msg_res != NEON_WALREAD_SUCCESS) + return read_msg_res; + } + + if (state->req_lsn + state->req_progress != state->rem_lsn) + { + snprintf(state->err_msg, sizeof(state->err_msg), + "expected remote WAL at %X/%X but got %X/%X. Non monotonic read requests could have caused this. req_lsn=%X/%X len=%zu", + LSN_FORMAT_ARGS(state->req_lsn + state->req_progress), + LSN_FORMAT_ARGS(state->rem_lsn), + LSN_FORMAT_ARGS(state->req_lsn), + state->req_len); + NeonWALReaderResetRemote(state); + return NEON_WALREAD_ERROR; + } + + /* We can copy min of (available, requested) bytes. 
*/ + to_copy = + Min(state->req_len - state->req_progress, state->wal_rem_len); + memcpy(buf, state->wal_ptr, to_copy); + state->wal_ptr += to_copy; + state->wal_rem_len -= to_copy; + state->rem_lsn += to_copy; + if (state->wal_rem_len == 0) + state->wal_ptr = NULL; /* freed by libpqwalproposer */ + buf += to_copy; + state->req_progress += to_copy; + if (state->req_progress == state->req_len) + { + XLogSegNo next_segno; + XLogSegNo req_segno; + + XLByteToSeg(state->req_lsn, req_segno, state->segcxt.ws_segsize); + XLByteToSeg(state->rem_lsn, next_segno, state->segcxt.ws_segsize); + + /* + * Request completed. If there is a chance of serving next one + * locally, close the connection. + */ + if (state->req_lsn < state->available_lsn && + state->rem_lsn >= state->available_lsn) + { + nwr_log(LOG, "closing remote connection as available_lsn %X/%X crossed and next read at %X/%X is likely to be served locally", + LSN_FORMAT_ARGS(state->available_lsn), LSN_FORMAT_ARGS(state->rem_lsn)); + NeonWALReaderResetRemote(state); + } + else if (state->rem_lsn >= state->available_lsn && next_segno > req_segno && + is_wal_segment_exists(next_segno, state->segcxt.ws_segsize, tli)) + { + nwr_log(LOG, "closing remote connection as WAL file at next lsn %X/%X exists", + LSN_FORMAT_ARGS(state->rem_lsn)); + NeonWALReaderResetRemote(state); + } + state->req_lsn = InvalidXLogRecPtr; + state->req_len = 0; + state->req_progress = 0; + return NEON_WALREAD_SUCCESS; + } + } +} + +/* + * Read one WAL message from the stream, sets state->wal_ptr in case of success. + * Resets remote state in case of failure. 
+ */ +static NeonWALReadResult +NeonWALReaderReadMsg(NeonWALReader *state) +{ + while (true) /* loop until we get 'w' */ + { + char *copydata_ptr; + int copydata_size; + StringInfoData s; + char msg_type; + int hdrlen; + + Assert(state->rem_state == RS_ESTABLISHED); + Assert(state->wal_ptr == NULL && state->wal_rem_len == 0); + + switch (libpqwp_async_read(state->wp_conn, + ©data_ptr, + ©data_size)) + { + case PG_ASYNC_READ_SUCCESS: + break; + case PG_ASYNC_READ_TRY_AGAIN: + return NEON_WALREAD_WOULDBLOCK; + case PG_ASYNC_READ_FAIL: + snprintf(state->err_msg, + sizeof(state->err_msg), + "req_lsn=%X/%X, req_len=%zu, req_progress=%zu, get copydata failed: %s", + LSN_FORMAT_ARGS(state->req_lsn), + state->req_len, + state->req_progress, + PQerrorMessage(state->wp_conn->pg_conn)); + goto err; + } + + /* put data on StringInfo to parse */ + s.data = copydata_ptr; + s.len = copydata_size; + s.cursor = 0; + s.maxlen = -1; + + if (copydata_size == 0) + { + snprintf(state->err_msg, + sizeof(state->err_msg), + "zero length copydata received"); + goto err; + } + msg_type = pq_getmsgbyte(&s); + switch (msg_type) + { + case 'w': + { + XLogRecPtr start_lsn; + + hdrlen = sizeof(int64) + sizeof(int64) + sizeof(int64); + if (s.len - s.cursor < hdrlen) + { + snprintf(state->err_msg, + sizeof(state->err_msg), + "invalid WAL message received from primary"); + goto err; + } + + start_lsn = pq_getmsgint64(&s); + pq_getmsgint64(&s); /* XLogRecPtr end_lsn; */ + pq_getmsgint64(&s); /* TimestampTz send_time */ + + state->rem_lsn = start_lsn; + state->wal_rem_len = (Size) (s.len - s.cursor); + state->wal_ptr = (char *) pq_getmsgbytes(&s, s.len - s.cursor); + nwr_log(DEBUG5, "received WAL msg at %X/%X len %zu", + LSN_FORMAT_ARGS(state->rem_lsn), state->wal_rem_len); + + return NEON_WALREAD_SUCCESS; + } + case 'k': + { + XLogRecPtr end_lsn; + bool reply_requested; + + hdrlen = sizeof(int64) + sizeof(int64) + sizeof(char); + if (s.len - s.cursor < hdrlen) + { + snprintf(state->err_msg, 
sizeof(state->err_msg), + "invalid keepalive message received from primary"); + goto err; + } + + end_lsn = pq_getmsgint64(&s); + pq_getmsgint64(&s); /* TimestampTz timestamp; */ + reply_requested = pq_getmsgbyte(&s); + nwr_log(DEBUG5, "received keepalive end_lsn=%X/%X reply_requested=%d", + LSN_FORMAT_ARGS(end_lsn), + reply_requested); + if (end_lsn < state->req_lsn + state->req_len) + { + snprintf(state->err_msg, sizeof(state->err_msg), + "closing remote connection: requested WAL up to %X/%X, but current donor %s has only up to %X/%X", + LSN_FORMAT_ARGS(state->req_lsn + state->req_len), state->donor_name, LSN_FORMAT_ARGS(end_lsn)); + goto err; + } + continue; + } + default: + nwr_log(WARNING, "invalid replication message type %d", msg_type); + continue; + } + } +err: + NeonWALReaderResetRemote(state); + return NEON_WALREAD_ERROR; +} + +/* reset remote connection and request in progress */ +static void +NeonWALReaderResetRemote(NeonWALReader *state) +{ + state->req_lsn = InvalidXLogRecPtr; + state->req_len = 0; + state->req_progress = 0; + state->rem_state = RS_NONE; + if (state->wp_conn) + { + libpqwp_disconnect(state->wp_conn); + state->wp_conn = NULL; + } + state->donor_name[0] = '\0'; + state->wal_ptr = NULL; + state->wal_rem_len = 0; + state->rem_lsn = InvalidXLogRecPtr; +} + +/* + * Return socket of connection to remote source. Must be called only when + * connection exists (NeonWALReaderEvents returns non zero). + */ +pgsocket +NeonWALReaderSocket(NeonWALReader *state) +{ + if (!state->wp_conn) + nwr_log(FATAL, "NeonWALReaderSocket is called without active remote connection"); + return PQsocket(state->wp_conn->pg_conn); +} + +/* + * Whether remote connection is established. Once this is done, until successful + * local read or error socket is stable and user can update socket events + * instead of readding it each time. 
+ */ +bool +NeonWALReaderIsRemConnEstablished(NeonWALReader *state) +{ + return state->rem_state == RS_ESTABLISHED; +} + +/* + * Returns events user should wait on connection socket or 0 if remote + * connection is not active. + */ +extern uint32 +NeonWALReaderEvents(NeonWALReader *state) +{ + switch (state->rem_state) + { + case RS_NONE: + return 0; + case RS_CONNECTING_READ: + return WL_SOCKET_READABLE; + case RS_CONNECTING_WRITE: + return WL_SOCKET_WRITEABLE; + case RS_WAIT_EXEC_RESULT: + case RS_ESTABLISHED: + return WL_SOCKET_READABLE; + default: + Assert(false); + return 0; /* make compiler happy */ + } +} + +static bool +NeonWALReadLocal(NeonWALReader *state, char *buf, XLogRecPtr startptr, Size count, TimeLineID tli) +{ + char *p; + XLogRecPtr recptr; + Size nbytes; + + p = buf; + recptr = startptr; + nbytes = count; + + while (nbytes > 0) + { + uint32 startoff; + int segbytes; + int readbytes; + + startoff = XLogSegmentOffset(recptr, state->segcxt.ws_segsize); + + /* + * If the data we want is not in a segment we have open, close what we + * have (if anything) and open the next one, using the caller's + * provided openSegment callback. + */ + if (state->seg.ws_file < 0 || + !XLByteInSeg(recptr, state->seg.ws_segno, state->segcxt.ws_segsize) || + tli != state->seg.ws_tli) + { + XLogSegNo nextSegNo; + + neon_wal_segment_close(state); + + XLByteToSeg(recptr, nextSegNo, state->segcxt.ws_segsize); + if (!neon_wal_segment_open(state, nextSegNo, &tli)) + { + char fname[MAXFNAMELEN]; + + state->wre_errno = errno; + + XLogFileName(fname, tli, nextSegNo, state->segcxt.ws_segsize); + snprintf(state->err_msg, sizeof(state->err_msg), "failed to open WAL segment %s while reading at %X/%X: %s", + fname, LSN_FORMAT_ARGS(recptr), strerror(state->wre_errno)); + return false; + } + + /* This shouldn't happen -- indicates a bug in segment_open */ + Assert(state->seg.ws_file >= 0); + + /* Update the current segment info. 
*/ + state->seg.ws_tli = tli; + state->seg.ws_segno = nextSegNo; + } + + /* How many bytes are within this segment? */ + if (nbytes > (state->segcxt.ws_segsize - startoff)) + segbytes = state->segcxt.ws_segsize - startoff; + else + segbytes = nbytes; + +#ifndef FRONTEND + pgstat_report_wait_start(WAIT_EVENT_WAL_READ); +#endif + + /* Reset errno first; eases reporting non-errno-affecting errors */ + errno = 0; + readbytes = pg_pread(state->seg.ws_file, p, segbytes, (off_t) startoff); + +#ifndef FRONTEND + pgstat_report_wait_end(); +#endif + + if (readbytes <= 0) + { + char fname[MAXFNAMELEN]; + + XLogFileName(fname, state->seg.ws_tli, state->seg.ws_segno, state->segcxt.ws_segsize); + + if (readbytes < 0) + { + state->wre_errno = errno; + snprintf(state->err_msg, sizeof(state->err_msg), "could not read from log segment %s, offset %d: %m: %s", + fname, startoff, strerror(state->wre_errno)); + } + else + { + snprintf(state->err_msg, sizeof(state->err_msg), "could not read from log segment %s, offset %d: %m: unexpected EOF", + fname, startoff); + } + return false; + } + + /* Update state for read */ + recptr += readbytes; + nbytes -= readbytes; + p += readbytes; + } + + return true; +} + +/* + * Copy of vanilla wal_segment_open, but returns false in case of error instead + * of ERROR, with errno set. 
+ * + * XLogReaderRoutine->segment_open callback for local pg_wal files + */ +static bool +neon_wal_segment_open(NeonWALReader *state, XLogSegNo nextSegNo, + TimeLineID *tli_p) +{ + TimeLineID tli = *tli_p; + char path[MAXPGPATH]; + + XLogFilePath(path, tli, nextSegNo, state->segcxt.ws_segsize); + nwr_log(DEBUG5, "opening %s", path); + state->seg.ws_file = BasicOpenFile(path, O_RDONLY | PG_BINARY); + if (state->seg.ws_file >= 0) + return true; + + return false; +} + +static bool +is_wal_segment_exists(XLogSegNo segno, int segsize, TimeLineID tli) +{ + struct stat stat_buffer; + char path[MAXPGPATH]; + + XLogFilePath(path, tli, segno, segsize); + return stat(path, &stat_buffer) == 0; +} + +/* copy of vanilla wal_segment_close with NeonWALReader */ +static void +neon_wal_segment_close(NeonWALReader *state) +{ + if (state->seg.ws_file >= 0) + { + close(state->seg.ws_file); + /* need to check errno? */ + state->seg.ws_file = -1; + } +} + +char * +NeonWALReaderErrMsg(NeonWALReader *state) +{ + return state->err_msg; +} diff --git a/pgxn/neon/neon_walreader.h b/pgxn/neon/neon_walreader.h new file mode 100644 index 0000000000..6be9f149aa --- /dev/null +++ b/pgxn/neon/neon_walreader.h @@ -0,0 +1,30 @@ +#ifndef __NEON_WALREADER_H__ +#define __NEON_WALREADER_H__ + +#include "access/xlogdefs.h" + +/* forward declare so we don't have to expose the struct to the public */ +struct NeonWALReader; +typedef struct NeonWALReader NeonWALReader; + +/* avoid including walproposer.h as it includes us */ +struct WalProposer; +typedef struct WalProposer WalProposer; + +/* NeonWALRead return value */ +typedef enum +{ + NEON_WALREAD_SUCCESS, + NEON_WALREAD_WOULDBLOCK, + NEON_WALREAD_ERROR, +} NeonWALReadResult; + +extern NeonWALReader *NeonWALReaderAllocate(int wal_segment_size, XLogRecPtr available_lsn, WalProposer *wp, char *log_prefix); +extern void NeonWALReaderFree(NeonWALReader *state); +extern NeonWALReadResult NeonWALRead(NeonWALReader *state, char *buf, XLogRecPtr startptr, Size 
count, TimeLineID tli); +extern pgsocket NeonWALReaderSocket(NeonWALReader *state); +extern uint32 NeonWALReaderEvents(NeonWALReader *state); +extern bool NeonWALReaderIsRemConnEstablished(NeonWALReader *state); +extern char *NeonWALReaderErrMsg(NeonWALReader *state); + +#endif /* __NEON_WALREADER_H__ */ diff --git a/pgxn/neon/walproposer.c b/pgxn/neon/walproposer.c index fc3332612c..1f7c473e7d 100644 --- a/pgxn/neon/walproposer.c +++ b/pgxn/neon/walproposer.c @@ -45,7 +45,6 @@ /* Prototypes for private functions */ static void WalProposerLoop(WalProposer *wp); -static void HackyRemoveWalProposerEvent(Safekeeper *to_remove); static void ShutdownConnection(Safekeeper *sk); static void ResetConnection(Safekeeper *sk); static long TimeToReconnect(WalProposer *wp, TimestampTz now); @@ -78,11 +77,11 @@ static bool BlockingWrite(Safekeeper *sk, void *msg, size_t msg_size, Safekeeper static bool AsyncWrite(Safekeeper *sk, void *msg, size_t msg_size, SafekeeperState flush_state); static bool AsyncFlush(Safekeeper *sk); static int CompareLsn(const void *a, const void *b); -static char *FormatSafekeeperState(SafekeeperState state); +static char *FormatSafekeeperState(Safekeeper *sk); static void AssertEventsOkForState(uint32 events, Safekeeper *sk); -static uint32 SafekeeperStateDesiredEvents(SafekeeperState state); static char *FormatEvents(WalProposer *wp, uint32 events); + WalProposer * WalProposerCreate(WalProposerConfig *config, walproposer_api api) { @@ -100,7 +99,7 @@ WalProposerCreate(WalProposerConfig *config, walproposer_api api) port = strchr(host, ':'); if (port == NULL) { - walprop_log(FATAL, "port is not specified"); + wp_log(FATAL, "port is not specified"); } *port++ = '\0'; sep = strchr(port, ','); @@ -108,11 +107,12 @@ WalProposerCreate(WalProposerConfig *config, walproposer_api api) *sep++ = '\0'; if (wp->n_safekeepers + 1 >= MAX_SAFEKEEPERS) { - walprop_log(FATAL, "Too many safekeepers"); + wp_log(FATAL, "too many safekeepers"); } 
wp->safekeeper[wp->n_safekeepers].host = host; wp->safekeeper[wp->n_safekeepers].port = port; wp->safekeeper[wp->n_safekeepers].state = SS_OFFLINE; + wp->safekeeper[wp->n_safekeepers].active_state = SS_ACTIVE_SEND; wp->safekeeper[wp->n_safekeepers].wp = wp; { @@ -123,19 +123,17 @@ WalProposerCreate(WalProposerConfig *config, walproposer_api api) "host=%s port=%s dbname=replication options='-c timeline_id=%s tenant_id=%s'", sk->host, sk->port, wp->config->neon_timeline, wp->config->neon_tenant); if (written > MAXCONNINFO || written < 0) - walprop_log(FATAL, "could not create connection string for safekeeper %s:%s", sk->host, sk->port); + wp_log(FATAL, "could not create connection string for safekeeper %s:%s", sk->host, sk->port); } initStringInfo(&wp->safekeeper[wp->n_safekeepers].outbuf); - wp->api.wal_reader_allocate(&wp->safekeeper[wp->n_safekeepers]); - wp->safekeeper[wp->n_safekeepers].flushWrite = false; wp->safekeeper[wp->n_safekeepers].startStreamingAt = InvalidXLogRecPtr; wp->safekeeper[wp->n_safekeepers].streamingAt = InvalidXLogRecPtr; wp->n_safekeepers += 1; } if (wp->n_safekeepers < 1) { - walprop_log(FATAL, "Safekeepers addresses are not specified"); + wp_log(FATAL, "safekeepers addresses are not specified"); } wp->quorum = wp->n_safekeepers / 2 + 1; @@ -146,15 +144,15 @@ WalProposerCreate(WalProposerConfig *config, walproposer_api api) wp->api.strong_random(wp, &wp->greetRequest.proposerId, sizeof(wp->greetRequest.proposerId)); wp->greetRequest.systemId = wp->config->systemId; if (!wp->config->neon_timeline) - walprop_log(FATAL, "neon.timeline_id is not provided"); + wp_log(FATAL, "neon.timeline_id is not provided"); if (*wp->config->neon_timeline != '\0' && !HexDecodeString(wp->greetRequest.timeline_id, wp->config->neon_timeline, 16)) - walprop_log(FATAL, "Could not parse neon.timeline_id, %s", wp->config->neon_timeline); + wp_log(FATAL, "could not parse neon.timeline_id, %s", wp->config->neon_timeline); if (!wp->config->neon_tenant) - 
walprop_log(FATAL, "neon.tenant_id is not provided"); + wp_log(FATAL, "neon.tenant_id is not provided"); if (*wp->config->neon_tenant != '\0' && !HexDecodeString(wp->greetRequest.tenant_id, wp->config->neon_tenant, 16)) - walprop_log(FATAL, "Could not parse neon.tenant_id, %s", wp->config->neon_tenant); + wp_log(FATAL, "could not parse neon.tenant_id, %s", wp->config->neon_tenant); wp->greetRequest.timeline = wp->config->pgTimeline; wp->greetRequest.walSegSize = wp->config->wal_segment_size; @@ -276,8 +274,8 @@ WalProposerPoll(WalProposer *wp) if (TimestampDifferenceExceeds(sk->latestMsgReceivedAt, now, wp->config->safekeeper_connection_timeout)) { - walprop_log(WARNING, "terminating connection to safekeeper '%s:%s' in '%s' state: no messages received during the last %dms or connection attempt took longer than that", - sk->host, sk->port, FormatSafekeeperState(sk->state), wp->config->safekeeper_connection_timeout); + wp_log(WARNING, "terminating connection to safekeeper '%s:%s' in '%s' state: no messages received during the last %dms or connection attempt took longer than that", + sk->host, sk->port, FormatSafekeeperState(sk), wp->config->safekeeper_connection_timeout); ShutdownConnection(sk); } } @@ -305,58 +303,20 @@ WalProposerLoop(WalProposer *wp) WalProposerPoll(wp); } -/* - * Hack: provides a way to remove the event corresponding to an individual walproposer from the set. - * - * Note: Internally, this completely reconstructs the event set. It should be avoided if possible. - */ -static void -HackyRemoveWalProposerEvent(Safekeeper *to_remove) -{ - WalProposer *wp = to_remove->wp; - - /* Remove the existing event set, assign sk->eventPos = -1 */ - wp->api.free_event_set(wp); - /* Re-initialize it without adding any safekeeper events */ - wp->api.init_event_set(wp); - - /* - * loop through the existing safekeepers. If they aren't the one we're - * removing, and if they have a socket we can use, re-add the applicable - * events. 
- */ - for (int i = 0; i < wp->n_safekeepers; i++) - { - uint32 desired_events = WL_NO_EVENTS; - Safekeeper *sk = &wp->safekeeper[i]; - - if (sk == to_remove) - continue; - - /* If this safekeeper isn't offline, add an event for it! */ - if (sk->state != SS_OFFLINE) - { - desired_events = SafekeeperStateDesiredEvents(sk->state); - /* will set sk->eventPos */ - wp->api.add_safekeeper_event_set(sk, desired_events); - } - } -} /* Shuts down and cleans up the connection for a safekeeper. Sets its state to SS_OFFLINE */ static void ShutdownConnection(Safekeeper *sk) { - sk->wp->api.conn_finish(sk); sk->state = SS_OFFLINE; - sk->flushWrite = false; sk->streamingAt = InvalidXLogRecPtr; if (sk->voteResponse.termHistory.entries) pfree(sk->voteResponse.termHistory.entries); sk->voteResponse.termHistory.entries = NULL; - HackyRemoveWalProposerEvent(sk); + sk->wp->api.conn_finish(sk); + sk->wp->api.rm_safekeeper_event_set(sk); } /* @@ -396,8 +356,8 @@ ResetConnection(Safekeeper *sk) * * https://www.postgresql.org/docs/devel/libpq-connect.html#LIBPQ-PQCONNECTSTARTPARAMS */ - walprop_log(WARNING, "Immediate failure to connect with node '%s:%s':\n\terror: %s", - sk->host, sk->port, wp->api.conn_error_message(sk)); + wp_log(WARNING, "immediate failure to connect with node '%s:%s':\n\terror: %s", + sk->host, sk->port, wp->api.conn_error_message(sk)); /* * Even though the connection failed, we still need to clean up the @@ -420,7 +380,7 @@ ResetConnection(Safekeeper *sk) * (see libpqrcv_connect, defined in * src/backend/replication/libpqwalreceiver/libpqwalreceiver.c) */ - walprop_log(LOG, "connecting with node %s:%s", sk->host, sk->port); + wp_log(LOG, "connecting with node %s:%s", sk->host, sk->port); sk->state = SS_CONNECTING_WRITE; sk->latestMsgReceivedAt = wp->api.get_current_timestamp(wp); @@ -474,7 +434,9 @@ ReconnectSafekeepers(WalProposer *wp) static void AdvancePollState(Safekeeper *sk, uint32 events) { +#ifdef WALPROPOSER_LIB /* wp_log needs wp in lib build */ WalProposer 
*wp = sk->wp; +#endif /* * Sanity check. We assume further down that the operations don't block @@ -490,8 +452,8 @@ AdvancePollState(Safekeeper *sk, uint32 events) * ResetConnection */ case SS_OFFLINE: - walprop_log(FATAL, "Unexpected safekeeper %s:%s state advancement: is offline", - sk->host, sk->port); + wp_log(FATAL, "unexpected safekeeper %s:%s state advancement: is offline", + sk->host, sk->port); break; /* actually unreachable, but prevents * -Wimplicit-fallthrough */ @@ -526,8 +488,8 @@ AdvancePollState(Safekeeper *sk, uint32 events) * requests. */ case SS_VOTING: - walprop_log(WARNING, "EOF from node %s:%s in %s state", sk->host, - sk->port, FormatSafekeeperState(sk->state)); + wp_log(WARNING, "EOF from node %s:%s in %s state", sk->host, + sk->port, FormatSafekeeperState(sk)); ResetConnection(sk); return; @@ -555,8 +517,8 @@ AdvancePollState(Safekeeper *sk, uint32 events) * Idle state for waiting votes from quorum. */ case SS_IDLE: - walprop_log(WARNING, "EOF from node %s:%s in %s state", sk->host, - sk->port, FormatSafekeeperState(sk->state)); + wp_log(WARNING, "EOF from node %s:%s in %s state", sk->host, + sk->port, FormatSafekeeperState(sk)); ResetConnection(sk); return; @@ -581,8 +543,8 @@ HandleConnectionEvent(Safekeeper *sk) switch (result) { case WP_CONN_POLLING_OK: - walprop_log(LOG, "connected with node %s:%s", sk->host, - sk->port); + wp_log(LOG, "connected with node %s:%s", sk->host, + sk->port); sk->latestMsgReceivedAt = wp->api.get_current_timestamp(wp); /* @@ -605,8 +567,8 @@ HandleConnectionEvent(Safekeeper *sk) break; case WP_CONN_POLLING_FAILED: - walprop_log(WARNING, "failed to connect to node '%s:%s': %s", - sk->host, sk->port, wp->api.conn_error_message(sk)); + wp_log(WARNING, "failed to connect to node '%s:%s': %s", + sk->host, sk->port, wp->api.conn_error_message(sk)); /* * If connecting failed, we don't want to restart the connection @@ -622,7 +584,7 @@ HandleConnectionEvent(Safekeeper *sk) * Because PQconnectPoll can change the 
socket, we have to un-register the * old event and re-register an event on the new socket. */ - HackyRemoveWalProposerEvent(sk); + wp->api.rm_safekeeper_event_set(sk); wp->api.add_safekeeper_event_set(sk, new_events); /* If we successfully connected, send START_WAL_PUSH query */ @@ -642,8 +604,8 @@ SendStartWALPush(Safekeeper *sk) if (!wp->api.conn_send_query(sk, "START_WAL_PUSH")) { - walprop_log(WARNING, "Failed to send 'START_WAL_PUSH' query to safekeeper %s:%s: %s", - sk->host, sk->port, wp->api.conn_error_message(sk)); + wp_log(WARNING, "failed to send 'START_WAL_PUSH' query to safekeeper %s:%s: %s", + sk->host, sk->port, wp->api.conn_error_message(sk)); ShutdownConnection(sk); return; } @@ -679,8 +641,8 @@ RecvStartWALPushResult(Safekeeper *sk) break; case WP_EXEC_FAILED: - walprop_log(WARNING, "Failed to send query to safekeeper %s:%s: %s", - sk->host, sk->port, wp->api.conn_error_message(sk)); + wp_log(WARNING, "failed to send query to safekeeper %s:%s: %s", + sk->host, sk->port, wp->api.conn_error_message(sk)); ShutdownConnection(sk); return; @@ -690,8 +652,8 @@ RecvStartWALPushResult(Safekeeper *sk) * wrong" */ case WP_EXEC_UNEXPECTED_SUCCESS: - walprop_log(WARNING, "Received bad response from safekeeper %s:%s query execution", - sk->host, sk->port); + wp_log(WARNING, "received bad response from safekeeper %s:%s query execution", + sk->host, sk->port); ShutdownConnection(sk); return; } @@ -726,7 +688,7 @@ RecvAcceptorGreeting(Safekeeper *sk) if (!AsyncReadMessage(sk, (AcceptorProposerMessage *) &sk->greetResponse)) return; - walprop_log(LOG, "received AcceptorGreeting from safekeeper %s:%s", sk->host, sk->port); + wp_log(LOG, "received AcceptorGreeting from safekeeper %s:%s", sk->host, sk->port); /* Protocol is all good, move to voting. 
*/ sk->state = SS_VOTING; @@ -746,7 +708,7 @@ RecvAcceptorGreeting(Safekeeper *sk) if (wp->n_connected == wp->quorum) { wp->propTerm++; - walprop_log(LOG, "proposer connected to quorum (%d) safekeepers, propTerm=" INT64_FORMAT, wp->quorum, wp->propTerm); + wp_log(LOG, "proposer connected to quorum (%d) safekeepers, propTerm=" INT64_FORMAT, wp->quorum, wp->propTerm); wp->voteRequest = (VoteRequest) { @@ -759,9 +721,9 @@ RecvAcceptorGreeting(Safekeeper *sk) else if (sk->greetResponse.term > wp->propTerm) { /* Another compute with higher term is running. */ - walprop_log(FATAL, "WAL acceptor %s:%s with term " INT64_FORMAT " rejects our connection request with term " INT64_FORMAT "", - sk->host, sk->port, - sk->greetResponse.term, wp->propTerm); + wp_log(FATAL, "WAL acceptor %s:%s with term " INT64_FORMAT " rejects our connection request with term " INT64_FORMAT "", + sk->host, sk->port, + sk->greetResponse.term, wp->propTerm); } /* @@ -801,7 +763,7 @@ SendVoteRequest(Safekeeper *sk) WalProposer *wp = sk->wp; /* We have quorum for voting, send our vote request */ - walprop_log(LOG, "requesting vote from %s:%s for term " UINT64_FORMAT, sk->host, sk->port, wp->voteRequest.term); + wp_log(LOG, "requesting vote from %s:%s for term " UINT64_FORMAT, sk->host, sk->port, wp->voteRequest.term); /* On failure, logging & resetting is handled */ if (!BlockingWrite(sk, &wp->voteRequest, sizeof(wp->voteRequest), SS_WAIT_VERDICT)) return; @@ -818,12 +780,12 @@ RecvVoteResponse(Safekeeper *sk) if (!AsyncReadMessage(sk, (AcceptorProposerMessage *) &sk->voteResponse)) return; - walprop_log(LOG, - "got VoteResponse from acceptor %s:%s, voteGiven=" UINT64_FORMAT ", epoch=" UINT64_FORMAT ", flushLsn=%X/%X, truncateLsn=%X/%X, timelineStartLsn=%X/%X", - sk->host, sk->port, sk->voteResponse.voteGiven, GetHighestTerm(&sk->voteResponse.termHistory), - LSN_FORMAT_ARGS(sk->voteResponse.flushLsn), - LSN_FORMAT_ARGS(sk->voteResponse.truncateLsn), - 
LSN_FORMAT_ARGS(sk->voteResponse.timelineStartLsn)); + wp_log(LOG, + "got VoteResponse from acceptor %s:%s, voteGiven=" UINT64_FORMAT ", epoch=" UINT64_FORMAT ", flushLsn=%X/%X, truncateLsn=%X/%X, timelineStartLsn=%X/%X", + sk->host, sk->port, sk->voteResponse.voteGiven, GetHighestTerm(&sk->voteResponse.termHistory), + LSN_FORMAT_ARGS(sk->voteResponse.flushLsn), + LSN_FORMAT_ARGS(sk->voteResponse.truncateLsn), + LSN_FORMAT_ARGS(sk->voteResponse.timelineStartLsn)); /* * In case of acceptor rejecting our vote, bail out, but only if either it @@ -833,9 +795,9 @@ RecvVoteResponse(Safekeeper *sk) if ((!sk->voteResponse.voteGiven) && (sk->voteResponse.term > wp->propTerm || wp->n_votes < wp->quorum)) { - walprop_log(FATAL, "WAL acceptor %s:%s with term " INT64_FORMAT " rejects our connection request with term " INT64_FORMAT "", - sk->host, sk->port, - sk->voteResponse.term, wp->propTerm); + wp_log(FATAL, "WAL acceptor %s:%s with term " INT64_FORMAT " rejects our connection request with term " INT64_FORMAT "", + sk->host, sk->port, + sk->voteResponse.term, wp->propTerm); } Assert(sk->voteResponse.term == wp->propTerm); @@ -847,7 +809,7 @@ RecvVoteResponse(Safekeeper *sk) } else if (wp->n_votes > wp->quorum) { - /* recovery already performed, just start streaming */ + /* already elected, start streaming */ SendProposerElected(sk); } else @@ -873,21 +835,16 @@ HandleElectedProposer(WalProposer *wp) DetermineEpochStartLsn(wp); /* - * Check if not all safekeepers are up-to-date, we need to download WAL - * needed to synchronize them + * Synchronously download WAL from the most advanced safekeeper. We do + * that only for logical replication (and switching logical walsenders to + * neon_walreader is a todo.) 
*/ - if (wp->truncateLsn < wp->propEpochStartLsn) + if (!wp->api.recovery_download(wp, &wp->safekeeper[wp->donor])) { - walprop_log(LOG, - "start recovery because truncateLsn=%X/%X is not " - "equal to epochStartLsn=%X/%X", - LSN_FORMAT_ARGS(wp->truncateLsn), - LSN_FORMAT_ARGS(wp->propEpochStartLsn)); - /* Perform recovery */ - if (!wp->api.recovery_download(&wp->safekeeper[wp->donor], wp->greetRequest.timeline, wp->truncateLsn, wp->propEpochStartLsn)) - walprop_log(FATAL, "Failed to recover state"); + wp_log(FATAL, "failed to download WAL for logical replicaiton"); } - else if (wp->config->syncSafekeepers) + + if (wp->truncateLsn == wp->propEpochStartLsn && wp->config->syncSafekeepers) { /* Sync is not needed: just exit */ wp->api.finish_sync_safekeepers(wp, wp->propEpochStartLsn); @@ -991,10 +948,10 @@ DetermineEpochStartLsn(WalProposer *wp) if (wp->timelineStartLsn != InvalidXLogRecPtr && wp->timelineStartLsn != wp->safekeeper[i].voteResponse.timelineStartLsn) { - walprop_log(WARNING, - "inconsistent timelineStartLsn: current %X/%X, received %X/%X", - LSN_FORMAT_ARGS(wp->timelineStartLsn), - LSN_FORMAT_ARGS(wp->safekeeper[i].voteResponse.timelineStartLsn)); + wp_log(WARNING, + "inconsistent timelineStartLsn: current %X/%X, received %X/%X", + LSN_FORMAT_ARGS(wp->timelineStartLsn), + LSN_FORMAT_ARGS(wp->safekeeper[i].voteResponse.timelineStartLsn)); } wp->timelineStartLsn = wp->safekeeper[i].voteResponse.timelineStartLsn; } @@ -1012,7 +969,7 @@ DetermineEpochStartLsn(WalProposer *wp) { wp->timelineStartLsn = wp->api.get_redo_start_lsn(wp); } - walprop_log(LOG, "bumped epochStartLsn to the first record %X/%X", LSN_FORMAT_ARGS(wp->propEpochStartLsn)); + wp_log(LOG, "bumped epochStartLsn to the first record %X/%X", LSN_FORMAT_ARGS(wp->propEpochStartLsn)); } /* @@ -1039,12 +996,12 @@ DetermineEpochStartLsn(WalProposer *wp) wp->propTermHistory.entries[wp->propTermHistory.n_entries - 1].term = wp->propTerm; wp->propTermHistory.entries[wp->propTermHistory.n_entries - 
1].lsn = wp->propEpochStartLsn; - walprop_log(LOG, "got votes from majority (%d) of nodes, term " UINT64_FORMAT ", epochStartLsn %X/%X, donor %s:%s, truncate_lsn %X/%X", - wp->quorum, - wp->propTerm, - LSN_FORMAT_ARGS(wp->propEpochStartLsn), - wp->safekeeper[wp->donor].host, wp->safekeeper[wp->donor].port, - LSN_FORMAT_ARGS(wp->truncateLsn)); + wp_log(LOG, "got votes from majority (%d) of nodes, term " UINT64_FORMAT ", epochStartLsn %X/%X, donor %s:%s, truncate_lsn %X/%X", + wp->quorum, + wp->propTerm, + LSN_FORMAT_ARGS(wp->propEpochStartLsn), + wp->safekeeper[wp->donor].host, wp->safekeeper[wp->donor].port, + LSN_FORMAT_ARGS(wp->truncateLsn)); /* * Ensure the basebackup we are running (at RedoStartLsn) matches LSN @@ -1077,21 +1034,14 @@ DetermineEpochStartLsn(WalProposer *wp) * scenario. */ disable_core_dump(); - walprop_log(PANIC, - "collected propEpochStartLsn %X/%X, but basebackup LSN %X/%X", - LSN_FORMAT_ARGS(wp->propEpochStartLsn), - LSN_FORMAT_ARGS(wp->api.get_redo_start_lsn(wp))); + wp_log(PANIC, + "collected propEpochStartLsn %X/%X, but basebackup LSN %X/%X", + LSN_FORMAT_ARGS(wp->propEpochStartLsn), + LSN_FORMAT_ARGS(wp->api.get_redo_start_lsn(wp))); } } walprop_shared->mineLastElectedTerm = wp->propTerm; } - - /* - * WalProposer has just elected itself and initialized history, so we can - * call election callback. Usually it updates truncateLsn to fetch WAL for - * logical replication. - */ - wp->api.after_election(wp); } /* @@ -1112,6 +1062,9 @@ SendProposerElected(Safekeeper *sk) term_t lastCommonTerm; int i; + /* Now that we are ready to send it's a good moment to create WAL reader */ + wp->api.wal_reader_allocate(sk); + /* * Determine start LSN by comparing safekeeper's log term switch history * and proposer's, searching for the divergence point. 
@@ -1138,34 +1091,10 @@ SendProposerElected(Safekeeper *sk) { /* safekeeper is empty or no common point, start from the beginning */ sk->startStreamingAt = wp->propTermHistory.entries[0].lsn; - - if (sk->startStreamingAt < wp->truncateLsn) - { - /* - * There's a gap between the WAL starting point and a truncateLsn, - * which can't appear in a normal working cluster. That gap means - * that all safekeepers reported that they have persisted WAL up - * to the truncateLsn before, but now current safekeeper tells - * otherwise. - * - * Also we have a special condition here, which is empty - * safekeeper with no history. In combination with a gap, that can - * happen when we introduce a new safekeeper to the cluster. This - * is a rare case, which is triggered manually for now, and should - * be treated with care. - */ - - /* - * truncateLsn will not change without ack from current - * safekeeper, and it's aligned to the WAL record, so we can - * safely start streaming from this point. - */ - sk->startStreamingAt = wp->truncateLsn; - - walprop_log(WARNING, "empty safekeeper joined cluster as %s:%s, historyStart=%X/%X, sk->startStreamingAt=%X/%X", - sk->host, sk->port, LSN_FORMAT_ARGS(wp->propTermHistory.entries[0].lsn), - LSN_FORMAT_ARGS(sk->startStreamingAt)); - } + wp_log(LOG, "no common point with sk %s:%s, streaming since first term at %X/%X, timelineStartLsn=%X/%X, termHistory.n_entries=%u" , + sk->host, sk->port, LSN_FORMAT_ARGS(sk->startStreamingAt), LSN_FORMAT_ARGS(wp->timelineStartLsn), wp->propTermHistory.n_entries); + /* wp->timelineStartLsn == InvalidXLogRecPtr can be only when timeline is created manually (test_s3_wal_replay) */ + Assert(sk->startStreamingAt == wp->timelineStartLsn || wp->timelineStartLsn == InvalidXLogRecPtr); } else { @@ -1188,7 +1117,7 @@ SendProposerElected(Safekeeper *sk) } } - Assert(sk->startStreamingAt >= wp->truncateLsn && sk->startStreamingAt <= wp->availableLsn); + Assert(sk->startStreamingAt <= wp->availableLsn); msg.tag = 'e'; 
msg.term = wp->propTerm; @@ -1197,9 +1126,9 @@ SendProposerElected(Safekeeper *sk) msg.timelineStartLsn = wp->timelineStartLsn; lastCommonTerm = i >= 0 ? wp->propTermHistory.entries[i].term : 0; - walprop_log(LOG, - "sending elected msg to node " UINT64_FORMAT " term=" UINT64_FORMAT ", startStreamingAt=%X/%X (lastCommonTerm=" UINT64_FORMAT "), termHistory.n_entries=%u to %s:%s, timelineStartLsn=%X/%X", - sk->greetResponse.nodeId, msg.term, LSN_FORMAT_ARGS(msg.startStreamingAt), lastCommonTerm, msg.termHistory->n_entries, sk->host, sk->port, LSN_FORMAT_ARGS(msg.timelineStartLsn)); + wp_log(LOG, + "sending elected msg to node " UINT64_FORMAT " term=" UINT64_FORMAT ", startStreamingAt=%X/%X (lastCommonTerm=" UINT64_FORMAT "), termHistory.n_entries=%u to %s:%s, timelineStartLsn=%X/%X", + sk->greetResponse.nodeId, msg.term, LSN_FORMAT_ARGS(msg.startStreamingAt), lastCommonTerm, msg.termHistory->n_entries, sk->host, sk->port, LSN_FORMAT_ARGS(msg.timelineStartLsn)); resetStringInfo(&sk->outbuf); pq_sendint64_le(&sk->outbuf, msg.tag); @@ -1231,6 +1160,7 @@ StartStreaming(Safekeeper *sk) * once for a connection. */ sk->state = SS_ACTIVE; + sk->active_state = SS_ACTIVE_SEND; sk->streamingAt = sk->startStreamingAt; /* event set will be updated inside SendMessageToNode */ @@ -1289,9 +1219,13 @@ HandleActiveState(Safekeeper *sk, uint32 events) { WalProposer *wp = sk->wp; - uint32 newEvents = WL_SOCKET_READABLE; - - if (events & WL_SOCKET_WRITEABLE) + /* + * Note: we don't known which socket awoke us (sk or nwr). However, as + * SendAppendRequests always tries to send at least one msg in + * SS_ACTIVE_SEND be careful not to go there if are only after sk + * response, otherwise it'd create busy loop of pings. 
+ */ + if (events & WL_SOCKET_WRITEABLE || sk->active_state == SS_ACTIVE_READ_WAL) if (!SendAppendRequests(sk)) return; @@ -1299,28 +1233,29 @@ HandleActiveState(Safekeeper *sk, uint32 events) if (!RecvAppendResponses(sk)) return; - /* - * We should wait for WL_SOCKET_WRITEABLE event if we have unflushed data - * in the buffer. - * - * LSN comparison checks if we have pending unsent messages. This check - * isn't necessary now, because we always send append messages immediately - * after arrival. But it's good to have it here in case we change this - * behavior in the future. - */ - if (sk->streamingAt != wp->availableLsn || sk->flushWrite) - newEvents |= WL_SOCKET_WRITEABLE; +#if PG_VERSION_NUM >= 150000 + /* expected never to happen, c.f. walprop_pg_active_state_update_event_set */ + if (events & WL_SOCKET_CLOSED) + { + wp_log(WARNING, "connection to %s:%s in active state failed, got WL_SOCKET_CLOSED on neon_walreader socket", + sk->host, sk->port); + ShutdownConnection(sk); + return; + } +#endif - wp->api.update_event_set(sk, newEvents); + /* configures event set for yield whatever is the substate */ + wp->api.active_state_update_event_set(sk); } /* * Send WAL messages starting from sk->streamingAt until the end or non-writable - * socket, whichever comes first. Caller should take care of updating event set. - * Even if no unsent WAL is available, at least one empty message will be sent - * as a heartbeat, if socket is ready. + * socket or neon_walreader blocks, whichever comes first; active_state is + * updated accordingly. Caller should take care of updating event set. Even if + * no unsent WAL is available, at least one empty message will be sent as a + * heartbeat, if socket is ready. * - * Can change state if Async* functions encounter errors and reset connection. + * Resets state and kills the connections if any error on them is encountered. * Returns false in this case, true otherwise. 
*/ static bool @@ -1328,11 +1263,11 @@ SendAppendRequests(Safekeeper *sk) { WalProposer *wp = sk->wp; XLogRecPtr endLsn; - AppendRequestHeader *req; PGAsyncWriteResult writeResult; bool sentAnything = false; + AppendRequestHeader *req; - if (sk->flushWrite) + if (sk->active_state == SS_ACTIVE_FLUSH) { if (!AsyncFlush(sk)) @@ -1343,76 +1278,101 @@ SendAppendRequests(Safekeeper *sk) return sk->state == SS_ACTIVE; /* Event set will be updated in the end of HandleActiveState */ - sk->flushWrite = false; + sk->active_state = SS_ACTIVE_SEND; } while (sk->streamingAt != wp->availableLsn || !sentAnything) { - sentAnything = true; - - endLsn = sk->streamingAt; - endLsn += MAX_SEND_SIZE; - - /* if we went beyond available WAL, back off */ - if (endLsn > wp->availableLsn) + if (sk->active_state == SS_ACTIVE_SEND) { - endLsn = wp->availableLsn; + sentAnything = true; + + endLsn = sk->streamingAt; + endLsn += MAX_SEND_SIZE; + + /* if we went beyond available WAL, back off */ + if (endLsn > wp->availableLsn) + { + endLsn = wp->availableLsn; + } + + req = &sk->appendRequest; + PrepareAppendRequest(sk->wp, &sk->appendRequest, sk->streamingAt, endLsn); + + wp_log(DEBUG5, "sending message len %ld beginLsn=%X/%X endLsn=%X/%X commitLsn=%X/%X truncateLsn=%X/%X to %s:%s", + req->endLsn - req->beginLsn, + LSN_FORMAT_ARGS(req->beginLsn), + LSN_FORMAT_ARGS(req->endLsn), + LSN_FORMAT_ARGS(req->commitLsn), + LSN_FORMAT_ARGS(wp->truncateLsn), sk->host, sk->port); + + resetStringInfo(&sk->outbuf); + + /* write AppendRequest header */ + appendBinaryStringInfo(&sk->outbuf, (char *) req, sizeof(AppendRequestHeader)); + enlargeStringInfo(&sk->outbuf, req->endLsn - req->beginLsn); + sk->active_state = SS_ACTIVE_READ_WAL; } - req = &sk->appendRequest; - PrepareAppendRequest(sk->wp, &sk->appendRequest, sk->streamingAt, endLsn); - - walprop_log(DEBUG2, "sending message len %ld beginLsn=%X/%X endLsn=%X/%X commitLsn=%X/%X truncateLsn=%X/%X to %s:%s", - req->endLsn - req->beginLsn, - 
LSN_FORMAT_ARGS(req->beginLsn), - LSN_FORMAT_ARGS(req->endLsn), - LSN_FORMAT_ARGS(req->commitLsn), - LSN_FORMAT_ARGS(wp->truncateLsn), sk->host, sk->port); - - resetStringInfo(&sk->outbuf); - - /* write AppendRequest header */ - appendBinaryStringInfo(&sk->outbuf, (char *) req, sizeof(AppendRequestHeader)); - - /* write the WAL itself */ - enlargeStringInfo(&sk->outbuf, req->endLsn - req->beginLsn); - /* wal_read will raise error on failure */ - wp->api.wal_read(sk, - &sk->outbuf.data[sk->outbuf.len], - req->beginLsn, - req->endLsn - req->beginLsn); - sk->outbuf.len += req->endLsn - req->beginLsn; - - writeResult = wp->api.conn_async_write(sk, sk->outbuf.data, sk->outbuf.len); - - /* Mark current message as sent, whatever the result is */ - sk->streamingAt = endLsn; - - switch (writeResult) + if (sk->active_state == SS_ACTIVE_READ_WAL) { - case PG_ASYNC_WRITE_SUCCESS: - /* Continue writing the next message */ - break; + char *errmsg; - case PG_ASYNC_WRITE_TRY_FLUSH: + req = &sk->appendRequest; - /* - * * We still need to call PQflush some more to finish the - * job. Caller function will handle this by setting right - * event* set. 
- */ - sk->flushWrite = true; - return true; + switch (wp->api.wal_read(sk, + &sk->outbuf.data[sk->outbuf.len], + req->beginLsn, + req->endLsn - req->beginLsn, + &errmsg)) + { + case NEON_WALREAD_SUCCESS: + break; + case NEON_WALREAD_WOULDBLOCK: + return true; + case NEON_WALREAD_ERROR: + wp_log(WARNING, "WAL reading for node %s:%s failed: %s", + sk->host, sk->port, errmsg); + ShutdownConnection(sk); + return false; + default: + Assert(false); + } - case PG_ASYNC_WRITE_FAIL: - walprop_log(WARNING, "Failed to send to node %s:%s in %s state: %s", - sk->host, sk->port, FormatSafekeeperState(sk->state), - wp->api.conn_error_message(sk)); - ShutdownConnection(sk); - return false; - default: - Assert(false); - return false; + sk->outbuf.len += req->endLsn - req->beginLsn; + + writeResult = wp->api.conn_async_write(sk, sk->outbuf.data, sk->outbuf.len); + + /* Mark current message as sent, whatever the result is */ + sk->streamingAt = req->endLsn; + + switch (writeResult) + { + case PG_ASYNC_WRITE_SUCCESS: + /* Continue writing the next message */ + sk->active_state = SS_ACTIVE_SEND; + break; + + case PG_ASYNC_WRITE_TRY_FLUSH: + + /* + * We still need to call PQflush some more to finish the + * job. Caller function will handle this by setting right + * event set. + */ + sk->active_state = SS_ACTIVE_FLUSH; + return true; + + case PG_ASYNC_WRITE_FAIL: + wp_log(WARNING, "failed to send to node %s:%s in %s state: %s", + sk->host, sk->port, FormatSafekeeperState(sk), + wp->api.conn_error_message(sk)); + ShutdownConnection(sk); + return false; + default: + Assert(false); + return false; + } } } @@ -1422,7 +1382,7 @@ SendAppendRequests(Safekeeper *sk) /* * Receive and process all available feedback. * - * Can change state if Async* functions encounter errors and reset connection. + * Resets state and kills the connection if any error on it is encountered. * Returns false in this case, true otherwise. * * NB: This function can call SendMessageToNode and produce new messages. 
@@ -1445,11 +1405,11 @@ RecvAppendResponses(Safekeeper *sk) if (!AsyncReadMessage(sk, (AcceptorProposerMessage *) &sk->appendResponse)) break; - walprop_log(DEBUG2, "received message term=" INT64_FORMAT " flushLsn=%X/%X commitLsn=%X/%X from %s:%s", - sk->appendResponse.term, - LSN_FORMAT_ARGS(sk->appendResponse.flushLsn), - LSN_FORMAT_ARGS(sk->appendResponse.commitLsn), - sk->host, sk->port); + wp_log(DEBUG2, "received message term=" INT64_FORMAT " flushLsn=%X/%X commitLsn=%X/%X from %s:%s", + sk->appendResponse.term, + LSN_FORMAT_ARGS(sk->appendResponse.flushLsn), + LSN_FORMAT_ARGS(sk->appendResponse.commitLsn), + sk->host, sk->port); if (sk->appendResponse.term > wp->propTerm) { @@ -1459,9 +1419,9 @@ RecvAppendResponses(Safekeeper *sk) * core as this is kinda expected scenario. */ disable_core_dump(); - walprop_log(PANIC, "WAL acceptor %s:%s with term " INT64_FORMAT " rejected our request, our term " INT64_FORMAT "", - sk->host, sk->port, - sk->appendResponse.term, wp->propTerm); + wp_log(PANIC, "WAL acceptor %s:%s with term " INT64_FORMAT " rejected our request, our term " INT64_FORMAT "", + sk->host, sk->port, + sk->appendResponse.term, wp->propTerm); } readAnything = true; @@ -1505,32 +1465,32 @@ ParsePageserverFeedbackMessage(WalProposer *wp, StringInfo reply_message, Pagese pq_getmsgint(reply_message, sizeof(int32)); /* read value length */ rf->currentClusterSize = pq_getmsgint64(reply_message); - walprop_log(DEBUG2, "ParsePageserverFeedbackMessage: current_timeline_size %lu", - rf->currentClusterSize); + wp_log(DEBUG2, "ParsePageserverFeedbackMessage: current_timeline_size %lu", + rf->currentClusterSize); } else if ((strcmp(key, "ps_writelsn") == 0) || (strcmp(key, "last_received_lsn") == 0)) { pq_getmsgint(reply_message, sizeof(int32)); /* read value length */ rf->last_received_lsn = pq_getmsgint64(reply_message); - walprop_log(DEBUG2, "ParsePageserverFeedbackMessage: last_received_lsn %X/%X", - LSN_FORMAT_ARGS(rf->last_received_lsn)); + wp_log(DEBUG2, 
"ParsePageserverFeedbackMessage: last_received_lsn %X/%X", + LSN_FORMAT_ARGS(rf->last_received_lsn)); } else if ((strcmp(key, "ps_flushlsn") == 0) || (strcmp(key, "disk_consistent_lsn") == 0)) { pq_getmsgint(reply_message, sizeof(int32)); /* read value length */ rf->disk_consistent_lsn = pq_getmsgint64(reply_message); - walprop_log(DEBUG2, "ParsePageserverFeedbackMessage: disk_consistent_lsn %X/%X", - LSN_FORMAT_ARGS(rf->disk_consistent_lsn)); + wp_log(DEBUG2, "ParsePageserverFeedbackMessage: disk_consistent_lsn %X/%X", + LSN_FORMAT_ARGS(rf->disk_consistent_lsn)); } else if ((strcmp(key, "ps_applylsn") == 0) || (strcmp(key, "remote_consistent_lsn") == 0)) { pq_getmsgint(reply_message, sizeof(int32)); /* read value length */ rf->remote_consistent_lsn = pq_getmsgint64(reply_message); - walprop_log(DEBUG2, "ParsePageserverFeedbackMessage: remote_consistent_lsn %X/%X", - LSN_FORMAT_ARGS(rf->remote_consistent_lsn)); + wp_log(DEBUG2, "ParsePageserverFeedbackMessage: remote_consistent_lsn %X/%X", + LSN_FORMAT_ARGS(rf->remote_consistent_lsn)); } else if ((strcmp(key, "ps_replytime") == 0) || (strcmp(key, "replytime") == 0)) { @@ -1542,8 +1502,8 @@ ParsePageserverFeedbackMessage(WalProposer *wp, StringInfo reply_message, Pagese /* Copy because timestamptz_to_str returns a static buffer */ replyTimeStr = pstrdup(timestamptz_to_str(rf->replytime)); - walprop_log(DEBUG2, "ParsePageserverFeedbackMessage: replytime %lu reply_time: %s", - rf->replytime, replyTimeStr); + wp_log(DEBUG2, "ParsePageserverFeedbackMessage: replytime %lu reply_time: %s", + rf->replytime, replyTimeStr); pfree(replyTimeStr); } @@ -1557,7 +1517,7 @@ ParsePageserverFeedbackMessage(WalProposer *wp, StringInfo reply_message, Pagese * Skip unknown keys to support backward compatibile protocol * changes */ - walprop_log(LOG, "ParsePageserverFeedbackMessage: unknown key: %s len %d", key, len); + wp_log(LOG, "ParsePageserverFeedbackMessage: unknown key: %s len %d", key, len); pq_getmsgbytes(reply_message, len); 
}; } @@ -1608,39 +1568,77 @@ GetAcknowledgedByQuorumWALPosition(WalProposer *wp) return responses[wp->n_safekeepers - wp->quorum]; } +/* + * Return safekeeper with active connection from which WAL can be downloaded, or + * none if it doesn't exist. donor_lsn is set to end position of the donor to + * the best of our knowledge. + */ +Safekeeper * +GetDonor(WalProposer *wp, XLogRecPtr *donor_lsn) +{ + *donor_lsn = InvalidXLogRecPtr; + Safekeeper *donor = NULL; + int i; + + if (wp->n_votes < wp->quorum) + { + wp_log(WARNING, "GetDonor called before elections are won"); + return NULL; + } + + /* + * First, consider node which had determined our term start LSN as we know + * about its position immediately after election before any feedbacks are + * sent. + */ + if (wp->safekeeper[wp->donor].state >= SS_IDLE) + { + donor = &wp->safekeeper[wp->donor]; + *donor_lsn = wp->propEpochStartLsn; + } + + /* + * But also check feedbacks from all nodes with live connections and take + * the highest one. Note: if node sends feedbacks it already processed + * elected message so its term is fine. + */ + for (i = 0; i < wp->n_safekeepers; i++) + { + Safekeeper *sk = &wp->safekeeper[i]; + + if (sk->state == SS_ACTIVE && sk->appendResponse.flushLsn > *donor_lsn) + { + donor = sk; + *donor_lsn = sk->appendResponse.flushLsn; + } + } + return donor; +} + static void HandleSafekeeperResponse(WalProposer *wp) { XLogRecPtr minQuorumLsn; - XLogRecPtr minFlushLsn; + XLogRecPtr candidateTruncateLsn; minQuorumLsn = GetAcknowledgedByQuorumWALPosition(wp); wp->api.process_safekeeper_feedback(wp, minQuorumLsn); /* - * Try to advance truncateLsn to minFlushLsn, which is the last record - * flushed to all safekeepers. We must always start streaming from the - * beginning of the record, which simplifies decoding on the far end. + * Try to advance truncateLsn -- the last record flushed to all + * safekeepers. * - * Advanced truncateLsn should be not further than nearest commitLsn. 
This - * prevents surprising violation of truncateLsn <= commitLsn invariant - * which might occur because 1) truncateLsn can be advanced immediately - * once chunk is broadcast to all safekeepers, and commitLsn generally - * can't be advanced based on feedback from safekeeper who is still in the - * previous epoch (similar to 'leader can't commit entries from previous - * term' in Raft); 2) chunks we read from WAL and send are plain sheets of - * bytes, but safekeepers ack only on record boundaries. + * Advanced truncateLsn should not be higher than commitLsn. This prevents + * surprising violation of truncateLsn <= commitLsn invariant which might + * occur because commitLsn generally can't be advanced based on feedback + * from safekeeper who is still in the previous epoch (similar to 'leader + * can't commit entries from previous term' in Raft). + */ - minFlushLsn = CalculateMinFlushLsn(wp); - if (minFlushLsn > wp->truncateLsn) + candidateTruncateLsn = CalculateMinFlushLsn(wp); + candidateTruncateLsn = Min(candidateTruncateLsn, minQuorumLsn); + if (candidateTruncateLsn > wp->truncateLsn) { - wp->truncateLsn = minFlushLsn; - - /* - * Advance the replication slot to free up old WAL files. Note that - * slot doesn't exist if we are in syncSafekeepers mode.
- */ - wp->api.confirm_wal_streamed(wp, wp->truncateLsn); + wp->truncateLsn = candidateTruncateLsn; } /* @@ -1712,9 +1710,9 @@ AsyncRead(Safekeeper *sk, char **buf, int *buf_size) return false; case PG_ASYNC_READ_FAIL: - walprop_log(WARNING, "Failed to read from node %s:%s in %s state: %s", sk->host, - sk->port, FormatSafekeeperState(sk->state), - wp->api.conn_error_message(sk)); + wp_log(WARNING, "failed to read from node %s:%s in %s state: %s", sk->host, + sk->port, FormatSafekeeperState(sk), + wp->api.conn_error_message(sk)); ShutdownConnection(sk); return false; } @@ -1752,8 +1750,8 @@ AsyncReadMessage(Safekeeper *sk, AcceptorProposerMessage *anymsg) tag = pq_getmsgint64_le(&s); if (tag != anymsg->tag) { - walprop_log(WARNING, "unexpected message tag %c from node %s:%s in state %s", (char) tag, sk->host, - sk->port, FormatSafekeeperState(sk->state)); + wp_log(WARNING, "unexpected message tag %c from node %s:%s in state %s", (char) tag, sk->host, + sk->port, FormatSafekeeperState(sk)); ResetConnection(sk); return false; } @@ -1824,13 +1822,14 @@ static bool BlockingWrite(Safekeeper *sk, void *msg, size_t msg_size, SafekeeperState success_state) { WalProposer *wp = sk->wp; - uint32 events; + uint32 sk_events; + uint32 nwr_events; if (!wp->api.conn_blocking_write(sk, msg, msg_size)) { - walprop_log(WARNING, "Failed to send to node %s:%s in %s state: %s", - sk->host, sk->port, FormatSafekeeperState(sk->state), - wp->api.conn_error_message(sk)); + wp_log(WARNING, "failed to send to node %s:%s in %s state: %s", + sk->host, sk->port, FormatSafekeeperState(sk), + wp->api.conn_error_message(sk)); ShutdownConnection(sk); return false; } @@ -1841,9 +1840,15 @@ BlockingWrite(Safekeeper *sk, void *msg, size_t msg_size, SafekeeperState succes * If the new state will be waiting for events to happen, update the event * set to wait for those */ - events = SafekeeperStateDesiredEvents(success_state); - if (events) - wp->api.update_event_set(sk, events); + 
SafekeeperStateDesiredEvents(sk, &sk_events, &nwr_events); + + /* + * nwr_events is relevant only during SS_ACTIVE which doesn't use + * BlockingWrite + */ + Assert(!nwr_events); + if (sk_events) + wp->api.update_event_set(sk, sk_events); return true; } @@ -1875,9 +1880,9 @@ AsyncWrite(Safekeeper *sk, void *msg, size_t msg_size, SafekeeperState flush_sta wp->api.update_event_set(sk, WL_SOCKET_READABLE | WL_SOCKET_WRITEABLE); return false; case PG_ASYNC_WRITE_FAIL: - walprop_log(WARNING, "Failed to send to node %s:%s in %s state: %s", - sk->host, sk->port, FormatSafekeeperState(sk->state), - wp->api.conn_error_message(sk)); + wp_log(WARNING, "failed to send to node %s:%s in %s state: %s", + sk->host, sk->port, FormatSafekeeperState(sk), + wp->api.conn_error_message(sk)); ShutdownConnection(sk); return false; default: @@ -1914,9 +1919,9 @@ AsyncFlush(Safekeeper *sk) /* Nothing to do; try again when the socket's ready */ return false; case -1: - walprop_log(WARNING, "Failed to flush write to node %s:%s in %s state: %s", - sk->host, sk->port, FormatSafekeeperState(sk->state), - wp->api.conn_error_message(sk)); + wp_log(WARNING, "failed to flush write to node %s:%s in %s state: %s", + sk->host, sk->port, FormatSafekeeperState(sk), + wp->api.conn_error_message(sk)); ResetConnection(sk); return false; default: @@ -1945,18 +1950,18 @@ CompareLsn(const void *a, const void *b) * * The strings are intended to be used as a prefix to "state", e.g.: * - * walprop_log(LOG, "currently in %s state", FormatSafekeeperState(sk->state)); + * wp_log(LOG, "currently in %s state", FormatSafekeeperState(sk)); * * If this sort of phrasing doesn't fit the message, instead use something like: * - * walprop_log(LOG, "currently in state [%s]", FormatSafekeeperState(sk->state)); + * wp_log(LOG, "currently in state [%s]", FormatSafekeeperState(sk)); */ static char * -FormatSafekeeperState(SafekeeperState state) +FormatSafekeeperState(Safekeeper *sk) { char *return_val = NULL; - switch (state) + 
switch (sk->state) { case SS_OFFLINE: return_val = "offline"; @@ -1984,7 +1989,18 @@ FormatSafekeeperState(SafekeeperState state) return_val = "idle"; break; case SS_ACTIVE: - return_val = "active"; + switch (sk->active_state) + { + case SS_ACTIVE_SEND: + return_val = "active send"; + break; + case SS_ACTIVE_READ_WAL: + return_val = "active read WAL"; + break; + case SS_ACTIVE_FLUSH: + return_val = "active flush"; + break; + } break; } @@ -1997,22 +2013,21 @@ FormatSafekeeperState(SafekeeperState state) static void AssertEventsOkForState(uint32 events, Safekeeper *sk) { - WalProposer *wp = sk->wp; - uint32 expected = SafekeeperStateDesiredEvents(sk->state); - - /* - * The events are in-line with what we're expecting, under two conditions: - * (a) if we aren't expecting anything, `events` has no read- or - * write-ready component. (b) if we are expecting something, there's - * overlap (i.e. `events & expected != 0`) - */ + uint32 sk_events; + uint32 nwr_events; + uint32 expected; bool events_ok_for_state; /* long name so the `Assert` is more * clear later */ + WalProposer *wp = sk->wp; - if (expected == WL_NO_EVENTS) - events_ok_for_state = ((events & (WL_SOCKET_READABLE | WL_SOCKET_WRITEABLE)) == 0); - else - events_ok_for_state = ((events & expected) != 0); + SafekeeperStateDesiredEvents(sk, &sk_events, &nwr_events); + + /* + * Without one more level of notify target indirection we have no way to + * distinguish which socket woke us up, so just union expected events. + */ + expected = sk_events | nwr_events; + events_ok_for_state = ((events & expected) != 0); if (!events_ok_for_state) { @@ -2020,37 +2035,40 @@ AssertEventsOkForState(uint32 events, Safekeeper *sk) * To give a descriptive message in the case of failure, we use elog * and then an assertion that's guaranteed to fail.
*/ - walprop_log(WARNING, "events %s mismatched for safekeeper %s:%s in state [%s]", - FormatEvents(wp, events), sk->host, sk->port, FormatSafekeeperState(sk->state)); + wp_log(WARNING, "events %s mismatched for safekeeper %s:%s in state [%s]", + FormatEvents(wp, events), sk->host, sk->port, FormatSafekeeperState(sk)); Assert(events_ok_for_state); } } -/* Returns the set of events a safekeeper in this state should be waiting on +/* Returns the set of events for both safekeeper (sk_events) and neon_walreader + * (nwr_events) sockets a safekeeper in this state should be waiting on. * * This will return WL_NO_EVENTS (= 0) for some events. */ -static uint32 -SafekeeperStateDesiredEvents(SafekeeperState state) +void +SafekeeperStateDesiredEvents(Safekeeper *sk, uint32 *sk_events, uint32 *nwr_events) { - uint32 result = WL_NO_EVENTS; + WalProposer *wp = sk->wp; + + *nwr_events = 0; /* nwr_events is empty for most states */ /* If the state doesn't have a modifier, we can check the base state */ - switch (state) + switch (sk->state) { /* Connecting states say what they want in the name */ case SS_CONNECTING_READ: - result = WL_SOCKET_READABLE; - break; + *sk_events = WL_SOCKET_READABLE; + return; case SS_CONNECTING_WRITE: - result = WL_SOCKET_WRITEABLE; - break; + *sk_events = WL_SOCKET_WRITEABLE; + return; /* Reading states need the socket to be read-ready to continue */ case SS_WAIT_EXEC_RESULT: case SS_HANDSHAKE_RECV: case SS_WAIT_VERDICT: - result = WL_SOCKET_READABLE; - break; + *sk_events = WL_SOCKET_READABLE; + return; /* * Idle states use read-readiness as a sign that the connection @@ -2058,32 +2076,66 @@ SafekeeperStateDesiredEvents(SafekeeperState state) */ case SS_VOTING: case SS_IDLE: - result = WL_SOCKET_READABLE; - break; + *sk_events = WL_SOCKET_READABLE; + return; - /* - * Flush states require write-ready for flushing. Active state - * does both reading and writing. - * - * TODO: SS_ACTIVE sometimes doesn't need to be write-ready. 
We - * should check sk->flushWrite here to set WL_SOCKET_WRITEABLE. - */ case SS_SEND_ELECTED_FLUSH: + *sk_events = WL_SOCKET_READABLE | WL_SOCKET_WRITEABLE; + return; + case SS_ACTIVE: - result = WL_SOCKET_READABLE | WL_SOCKET_WRITEABLE; - break; + switch (sk->active_state) + { + /* + * Everything is sent; we just wait for sk responses and + * latch. + * + * Note: this assumes we send all available WAL to + * safekeeper in one wakeup (unless it blocks). Otherwise + * we would want WL_SOCKET_WRITEABLE here to finish the + * work. + */ + case SS_ACTIVE_SEND: + *sk_events = WL_SOCKET_READABLE; + /* c.f. walprop_pg_active_state_update_event_set */ +#if PG_VERSION_NUM >= 150000 + if (wp->api.wal_reader_events(sk)) + *nwr_events = WL_SOCKET_CLOSED; +#endif /* on PG 14 nwr_events remains 0 */ + return; + + /* + * Waiting for neon_walreader socket, but we still read + * responses from sk socket. + */ + case SS_ACTIVE_READ_WAL: + *sk_events = WL_SOCKET_READABLE; + *nwr_events = wp->api.wal_reader_events(sk); + return; + + /* + * Need to flush the sk socket, so ignore neon_walreader + * one and set write interest on sk. + */ + case SS_ACTIVE_FLUSH: + *sk_events = WL_SOCKET_READABLE | WL_SOCKET_WRITEABLE; +#if PG_VERSION_NUM >= 150000 + /* c.f. walprop_pg_active_state_update_event_set */ + if (wp->api.wal_reader_events(sk)) + *nwr_events = WL_SOCKET_CLOSED; +#endif /* on PG 14 nwr_events remains 0 */ + return; + } + return; /* The offline state expects no events. 
*/ case SS_OFFLINE: - result = WL_NO_EVENTS; - break; + *sk_events = 0; + return; default: Assert(false); - break; } - - return result; } /* Returns a human-readable string corresponding to the event set @@ -2123,8 +2175,8 @@ FormatEvents(WalProposer *wp, uint32 events) if (events & (~all_flags)) { - walprop_log(WARNING, "Event formatting found unexpected component %d", - events & (~all_flags)); + wp_log(WARNING, "event formatting found unexpected component %d", + events & (~all_flags)); return_str[6] = '*'; return_str[7] = '\0'; } diff --git a/pgxn/neon/walproposer.h b/pgxn/neon/walproposer.h index 6ba2aae75b..688d8e6e52 100644 --- a/pgxn/neon/walproposer.h +++ b/pgxn/neon/walproposer.h @@ -8,6 +8,9 @@ #include "replication/walreceiver.h" #include "utils/uuid.h" +#include "libpqwalproposer.h" +#include "neon_walreader.h" + #define SK_MAGIC 0xCafeCeefu #define SK_PROTOCOL_VERSION 2 @@ -20,43 +23,9 @@ */ #define WL_NO_EVENTS 0 -struct WalProposerConn; /* Defined in implementation (walprop_pg.c) */ +struct WalProposerConn; /* Defined in libpqwalproposer.h */ typedef struct WalProposerConn WalProposerConn; -/* Possible return values from ReadPGAsync */ -typedef enum -{ - /* The full read was successful. buf now points to the data */ - PG_ASYNC_READ_SUCCESS, - - /* - * The read is ongoing. Wait until the connection is read-ready, then try - * again. - */ - PG_ASYNC_READ_TRY_AGAIN, - /* Reading failed. Check PQerrorMessage(conn) */ - PG_ASYNC_READ_FAIL, -} PGAsyncReadResult; - -/* Possible return values from WritePGAsync */ -typedef enum -{ - /* The write fully completed */ - PG_ASYNC_WRITE_SUCCESS, - - /* - * The write started, but you'll need to call PQflush some more times to - * finish it off. We just tried, so it's best to wait until the connection - * is read- or write-ready to try again. - * - * If it becomes read-ready, call PQconsumeInput and flush again. If it - * becomes write-ready, just call PQflush. - */ - PG_ASYNC_WRITE_TRY_FLUSH, - /* Writing failed. 
Check PQerrorMessage(conn) */ - PG_ASYNC_WRITE_FAIL, -} PGAsyncWriteResult; - /* * WAL safekeeper state, which is used to wait for some event. * @@ -133,6 +102,40 @@ typedef enum SS_ACTIVE, } SafekeeperState; +/* + * Sending WAL substates of SS_ACTIVE. + */ +typedef enum +{ + /* + * We are ready to send more WAL, waiting for latch set to learn about + * more WAL becoming available (or just a timeout to send heartbeat). + */ + SS_ACTIVE_SEND, + + /* + * Polling neon_walreader to receive chunk of WAL (probably remotely) to + * send to this safekeeper. + * + * Note: socket management is done completely inside walproposer_pg for + * simplicity, and thus simulation doesn't test it. Which is fine as + * simulation is mainly aimed at consensus checks, not waiteventset + * management. + * + * Also, while in this state we don't touch safekeeper socket, so in + * theory it might close connection as inactive. This can be addressed if + * needed; however, while fetching WAL we should regularly send it, so the + * problem is unlikely. Vice versa is also true (SS_ACTIVE doesn't handle + * walreader socket), but similarly shouldn't be a problem. + */ + SS_ACTIVE_READ_WAL, + + /* + * Waiting for write readiness to flush the socket. + */ + SS_ACTIVE_FLUSH, +} SafekeeperActiveState; + /* Consensus logical timestamp. 
*/ typedef uint64 term_t; @@ -341,12 +344,11 @@ typedef struct Safekeeper */ XLogRecPtr startStreamingAt; - bool flushWrite; /* set to true if we need to call AsyncFlush,* - * to flush pending messages */ XLogRecPtr streamingAt; /* current streaming position */ AppendRequestHeader appendRequest; /* request for sending to safekeeper */ SafekeeperState state; /* safekeeper state machine state */ + SafekeeperActiveState active_state; TimestampTz latestMsgReceivedAt; /* when latest msg is received */ AcceptorGreeting greetResponse; /* acceptor greeting */ VoteResponse voteResponse; /* the vote */ @@ -367,12 +369,27 @@ typedef struct Safekeeper /* * WAL reader, allocated for each safekeeper. */ - XLogReaderState *xlogreader; + NeonWALReader *xlogreader; /* * Position in wait event set. Equal to -1 if no event */ int eventPos; + + /* + * Neon WAL reader position in wait event set, or -1 if no socket. Note + * that event must be removed not only on error/failure, but also on + * successful *local* read, as next read might again be remote, but with + * different socket. + */ + int nwrEventPos; + + /* + * Per libpq docs, during connection establishment socket might change, + * remember here if it is stable to avoid readding to the event set if + * possible. Must be reset whenever nwr event is deleted. + */ + bool nwrConnEstablished; #endif @@ -401,31 +418,6 @@ typedef enum */ } WalProposerConnectPollStatusType; -/* Re-exported and modified ExecStatusType */ -typedef enum -{ - /* We received a single CopyBoth result */ - WP_EXEC_SUCCESS_COPYBOTH, - - /* - * Any success result other than a single CopyBoth was received. The - * specifics of the result were already logged, but it may be useful to - * provide an error message indicating which safekeeper messed up. - * - * Do not expect PQerrorMessage to be appropriately set. - */ - WP_EXEC_UNEXPECTED_SUCCESS, - - /* - * No result available at this time. Wait until read-ready, then call - * again. 
Internally, this is returned when PQisBusy indicates that - * PQgetResult would block. - */ - WP_EXEC_NEEDS_INPUT, - /* Catch-all failure. Check PQerrorMessage. */ - WP_EXEC_FAILED, -} WalProposerExecStatusType; - /* Re-exported ConnStatusType */ typedef enum { @@ -486,7 +478,7 @@ typedef struct walproposer_api /* Flush buffer to the network, aka PQflush. */ int (*conn_flush) (Safekeeper *sk); - /* Close the connection, aka PQfinish. */ + /* Reset sk state: close pq connection, deallocate xlogreader. */ void (*conn_finish) (Safekeeper *sk); /* @@ -503,17 +495,20 @@ typedef struct walproposer_api /* Blocking CopyData write, aka PQputCopyData + PQflush. */ bool (*conn_blocking_write) (Safekeeper *sk, void const *buf, size_t size); - /* Download WAL from startpos to endpos and make it available locally. */ - bool (*recovery_download) (Safekeeper *sk, TimeLineID timeline, XLogRecPtr startpos, XLogRecPtr endpos); - - /* Read WAL from disk to buf. */ - void (*wal_read) (Safekeeper *sk, char *buf, XLogRecPtr startptr, Size count); + /* + * Download WAL before basebackup for logical walsenders from sk, if + * needed + */ + bool (*recovery_download) (WalProposer *wp, Safekeeper *sk); /* Allocate WAL reader. */ void (*wal_reader_allocate) (Safekeeper *sk); - /* Deallocate event set. */ - void (*free_event_set) (WalProposer *wp); + /* Read WAL from disk to buf. */ + NeonWALReadResult (*wal_read) (Safekeeper *sk, char *buf, XLogRecPtr startptr, Size count, char **errmsg); + + /* Returns events to be awaited on WAL reader, if any. */ + uint32 (*wal_reader_events) (Safekeeper *sk); /* Initialize event set. */ void (*init_event_set) (WalProposer *wp); @@ -521,9 +516,15 @@ typedef struct walproposer_api /* Update events for an existing safekeeper connection. */ void (*update_event_set) (Safekeeper *sk, uint32 events); + /* Configure wait event set for yield in SS_ACTIVE. 
*/ + void (*active_state_update_event_set) (Safekeeper *sk); + /* Add a new safekeeper connection to the event set. */ void (*add_safekeeper_event_set) (Safekeeper *sk, uint32 events); + /* Remove safekeeper connection from event set */ + void (*rm_safekeeper_event_set) (Safekeeper *sk); + /* * Wait until some event happens: - timeout is reached - socket event for * safekeeper connection - new WAL is available @@ -556,26 +557,12 @@ typedef struct walproposer_api */ void (*process_safekeeper_feedback) (WalProposer *wp, XLogRecPtr commitLsn); - /* - * Called on peer_horizon_lsn updates. Used to advance replication slot - * and to free up disk space by deleting unnecessary WAL. - */ - void (*confirm_wal_streamed) (WalProposer *wp, XLogRecPtr lsn); - /* * Write a log message to the internal log processor. This is used only * when walproposer is compiled as a library. Otherwise, all logging is * handled by elog(). */ void (*log_internal) (WalProposer *wp, int level, const char *line); - - /* - * Called right after the proposer was elected, but before it started - * recovery and sent ProposerElected message to the safekeepers. - * - * Used by logical replication to update truncateLsn. - */ - void (*after_election) (WalProposer *wp); } walproposer_api; /* @@ -709,15 +696,34 @@ extern void WalProposerBroadcast(WalProposer *wp, XLogRecPtr startpos, XLogRecPt extern void WalProposerPoll(WalProposer *wp); extern void WalProposerFree(WalProposer *wp); +/* + * WaitEventSet API doesn't allow to remove socket, so walproposer_pg uses it to + * recreate set from scratch, hence the export. + */ +extern void SafekeeperStateDesiredEvents(Safekeeper *sk, uint32 *sk_events, uint32 *nwr_events); +extern Safekeeper *GetDonor(WalProposer *wp, XLogRecPtr *donor_lsn); + #define WPEVENT 1337 /* special log level for walproposer internal * events */ +#define WP_LOG_PREFIX "[WP] " + +/* + * wp_log is used in pure wp code (walproposer.c), allowing API callback to + * catch logging. 
+ */ #ifdef WALPROPOSER_LIB extern void WalProposerLibLog(WalProposer *wp, int elevel, char *fmt,...); -#define walprop_log(elevel, ...) WalProposerLibLog(wp, elevel, __VA_ARGS__) +#define wp_log(elevel, fmt, ...) WalProposerLibLog(wp, elevel, fmt, ## __VA_ARGS__) #else -#define walprop_log(elevel, ...) elog(elevel, __VA_ARGS__) +#define wp_log(elevel, fmt, ...) elog(elevel, WP_LOG_PREFIX fmt, ## __VA_ARGS__) #endif +/* + * And wpg_log is used in all other (postgres specific) walproposer code, just + * adding the prefix. + */ +#define wpg_log(elevel, fmt, ...) elog(elevel, WP_LOG_PREFIX fmt, ## __VA_ARGS__) + #endif /* __NEON_WALPROPOSER_H__ */ diff --git a/pgxn/neon/walproposer_pg.c b/pgxn/neon/walproposer_pg.c index 9361f08ad2..61a2a54809 100644 --- a/pgxn/neon/walproposer_pg.c +++ b/pgxn/neon/walproposer_pg.c @@ -12,6 +12,7 @@ #include #include #include "access/xact.h" +#include "access/xlog.h" #include "access/xlogdefs.h" #include "access/xlogutils.h" #include "access/xloginsert.h" @@ -43,14 +44,19 @@ #include "utils/ps_status.h" #include "utils/timestamp.h" -#include "neon.h" -#include "walproposer.h" #include "libpq-fe.h" +#include "libpqwalproposer.h" +#include "neon.h" +#include "neon_walreader.h" +#include "walproposer.h" + #define XLOG_HDR_SIZE (1 + 8 * 3) /* 'w' + startPos + walEnd + timestamp */ #define XLOG_HDR_START_POS 1 /* offset of start position in wal sender* * message header */ +#define MB ((XLogRecPtr)1024 * 1024) + #define WAL_PROPOSER_SLOT_NAME "wal_proposer_slot" char *wal_acceptors_list = ""; @@ -91,6 +97,12 @@ static void XLogBroadcastWalProposer(WalProposer *wp); static void XLogWalPropWrite(WalProposer *wp, char *buf, Size nbytes, XLogRecPtr recptr); static void XLogWalPropClose(XLogRecPtr recptr); +static void add_nwr_event_set(Safekeeper *sk, uint32 events); +static void update_nwr_event_set(Safekeeper *sk, uint32 events); +static void rm_safekeeper_event_set(Safekeeper *to_remove, bool is_sk); + +static XLogRecPtr
GetLogRepRestartLSN(WalProposer *wp); + static void init_walprop_config(bool syncSafekeepers) { @@ -214,7 +226,6 @@ backpressure_lag_impl(void) XLogRecPtr myFlushLsn = GetFlushRecPtr(); #endif replication_feedback_get_lsns(&writePtr, &flushPtr, &applyPtr); -#define MB ((XLogRecPtr)1024 * 1024) elog(DEBUG2, "current flushLsn %X/%X PageserverFeedback: write %X/%X flush %X/%X apply %X/%X", LSN_FORMAT_ARGS(myFlushLsn), @@ -413,8 +424,8 @@ walprop_pg_start_streaming(WalProposer *wp, XLogRecPtr startpos) { StartReplicationCmd cmd; - elog(LOG, "WAL proposer starts streaming at %X/%X", - LSN_FORMAT_ARGS(startpos)); + wpg_log(LOG, "WAL proposer starts streaming at %X/%X", + LSN_FORMAT_ARGS(startpos)); cmd.slotname = WAL_PROPOSER_SLOT_NAME; cmd.timeline = wp->greetRequest.timeline; cmd.startpoint = startpos; @@ -538,17 +549,9 @@ walprop_pg_load_libpqwalreceiver(void) { load_file("libpqwalreceiver", false); if (WalReceiverFunctions == NULL) - elog(ERROR, "libpqwalreceiver didn't initialize correctly"); + wpg_log(ERROR, "libpqwalreceiver didn't initialize correctly"); } -/* Header in walproposer.h -- Wrapper struct to abstract away the libpq connection */ -struct WalProposerConn -{ - PGconn *pg_conn; - bool is_nonblocking; /* whether the connection is non-blocking */ - char *recvbuf; /* last received data from walprop_async_read */ -}; - /* Helper function */ static bool ensure_nonblocking_status(WalProposerConn *conn, bool is_nonblocking) @@ -586,16 +589,17 @@ walprop_status(Safekeeper *sk) } } -static void -walprop_connect_start(Safekeeper *sk) +WalProposerConn * +libpqwp_connect_start(char *conninfo) { + PGconn *pg_conn; + WalProposerConn *conn; const char *keywords[3]; const char *values[3]; int n; char *password = neon_auth_token; - Assert(sk->conn == NULL); /* * Connect using the given connection string. 
If the NEON_AUTH_TOKEN @@ -614,7 +618,7 @@ walprop_connect_start(Safekeeper *sk) n++; } keywords[n] = "dbname"; - values[n] = sk->conninfo; + values[n] = conninfo; n++; keywords[n] = NULL; values[n] = NULL; @@ -626,7 +630,7 @@ walprop_connect_start(Safekeeper *sk) * PGconn structure" */ if (!pg_conn) - elog(FATAL, "failed to allocate new PGconn object"); + wpg_log(FATAL, "failed to allocate new PGconn object"); /* * And in theory this allocation can fail as well, but it's incredibly @@ -635,11 +639,20 @@ walprop_connect_start(Safekeeper *sk) * palloc will exit on failure though, so there's not much we could do if * it *did* fail. */ - sk->conn = palloc(sizeof(WalProposerConn)); - sk->conn->pg_conn = pg_conn; - sk->conn->is_nonblocking = false; /* connections always start in - * blocking mode */ - sk->conn->recvbuf = NULL; + conn = palloc(sizeof(WalProposerConn)); + conn->pg_conn = pg_conn; + conn->is_nonblocking = false; /* connections always start in blocking + * mode */ + conn->recvbuf = NULL; + return conn; +} + +static void +walprop_connect_start(Safekeeper *sk) +{ + Assert(sk->conn == NULL); + sk->conn = libpqwp_connect_start(sk->conninfo); + } static WalProposerConnectPollStatusType @@ -667,7 +680,7 @@ walprop_connect_poll(Safekeeper *sk) * unused. We'll expect it's never returned. 
*/ case PGRES_POLLING_ACTIVE: - elog(FATAL, "Unexpected PGRES_POLLING_ACTIVE returned from PQconnectPoll"); + wpg_log(FATAL, "unexpected PGRES_POLLING_ACTIVE returned from PQconnectPoll"); /* * This return is never actually reached, but it's here to make @@ -683,26 +696,33 @@ walprop_connect_poll(Safekeeper *sk) return return_val; } -static bool -walprop_send_query(Safekeeper *sk, char *query) +extern bool +libpqwp_send_query(WalProposerConn *conn, char *query) { /* * We need to be in blocking mode for sending the query to run without * requiring a call to PQflush */ - if (!ensure_nonblocking_status(sk->conn, false)) + if (!ensure_nonblocking_status(conn, false)) return false; /* PQsendQuery returns 1 on success, 0 on failure */ - if (!PQsendQuery(sk->conn->pg_conn, query)) + if (!PQsendQuery(conn->pg_conn, query)) return false; return true; } -static WalProposerExecStatusType -walprop_get_query_result(Safekeeper *sk) +static bool +walprop_send_query(Safekeeper *sk, char *query) { + return libpqwp_send_query(sk->conn, query); +} + +WalProposerExecStatusType +libpqwp_get_query_result(WalProposerConn *conn) +{ + PGresult *result; WalProposerExecStatusType return_val; @@ -710,14 +730,14 @@ walprop_get_query_result(Safekeeper *sk) char *unexpected_success = NULL; /* Consume any input that we might be missing */ - if (!PQconsumeInput(sk->conn->pg_conn)) + if (!PQconsumeInput(conn->pg_conn)) return WP_EXEC_FAILED; - if (PQisBusy(sk->conn->pg_conn)) + if (PQisBusy(conn->pg_conn)) return WP_EXEC_NEEDS_INPUT; - result = PQgetResult(sk->conn->pg_conn); + result = PQgetResult(conn->pg_conn); /* * PQgetResult returns NULL only if getting the result was successful & @@ -725,7 +745,7 @@ walprop_get_query_result(Safekeeper *sk) */ if (!result) { - elog(WARNING, "[libpqwalproposer] Unexpected successful end of command results"); + wpg_log(WARNING, "[libpqwalproposer] Unexpected successful end of command results"); return WP_EXEC_UNEXPECTED_SUCCESS; } @@ -773,11 +793,17 @@ 
walprop_get_query_result(Safekeeper *sk) } if (unexpected_success) - elog(WARNING, "[libpqwalproposer] Unexpected successful %s", unexpected_success); + wpg_log(WARNING, "[libpqwalproposer] Unexpected successful %s", unexpected_success); return return_val; } +static WalProposerExecStatusType +walprop_get_query_result(Safekeeper *sk) +{ + return libpqwp_get_query_result(sk->conn); +} + static pgsocket walprop_socket(Safekeeper *sk) { @@ -790,42 +816,31 @@ walprop_flush(Safekeeper *sk) return (PQflush(sk->conn->pg_conn)); } -static void -walprop_finish(Safekeeper *sk) +/* Like libpqrcv_receive. *buf is valid until the next call. */ +PGAsyncReadResult +libpqwp_async_read(WalProposerConn *conn, char **buf, int *amount) { - if (!sk->conn) - return; + int rawlen; - if (sk->conn->recvbuf != NULL) - PQfreemem(sk->conn->recvbuf); - PQfinish(sk->conn->pg_conn); - pfree(sk->conn); - sk->conn = NULL; -} - -/* - * Receive a message from the safekeeper. - * - * On success, the data is placed in *buf. It is valid until the next call - * to this function. - */ -static PGAsyncReadResult -walprop_async_read(Safekeeper *sk, char **buf, int *amount) -{ - int result; - - if (sk->conn->recvbuf != NULL) + if (conn->recvbuf != NULL) { - PQfreemem(sk->conn->recvbuf); - sk->conn->recvbuf = NULL; + PQfreemem(conn->recvbuf); + conn->recvbuf = NULL; } - /* Call PQconsumeInput so that we have the data we need */ - if (!PQconsumeInput(sk->conn->pg_conn)) + /* Try to receive a CopyData message */ + rawlen = PQgetCopyData(conn->pg_conn, &conn->recvbuf, true); + if (rawlen == 0) { - *amount = 0; - *buf = NULL; - return PG_ASYNC_READ_FAIL; + /* Try consuming some data. 
*/ + if (!PQconsumeInput(conn->pg_conn)) + { + *amount = 0; + *buf = NULL; + return PG_ASYNC_READ_FAIL; + } + /* Now that we've consumed some input, try again */ + rawlen = PQgetCopyData(conn->pg_conn, &conn->recvbuf, true); } /* @@ -839,7 +854,7 @@ walprop_async_read(Safekeeper *sk, char **buf, int *amount) * sometimes be triggered by the server returning an ErrorResponse (which * also happens to have the effect that the copy is done). */ - switch (result = PQgetCopyData(sk->conn->pg_conn, &sk->conn->recvbuf, true)) + switch (rawlen) { case 0: *amount = 0; @@ -854,10 +869,10 @@ walprop_async_read(Safekeeper *sk, char **buf, int *amount) * We can check PQgetResult to make sure that the server * failed; it'll always result in PGRES_FATAL_ERROR */ - ExecStatusType status = PQresultStatus(PQgetResult(sk->conn->pg_conn)); + ExecStatusType status = PQresultStatus(PQgetResult(conn->pg_conn)); if (status != PGRES_FATAL_ERROR) - elog(FATAL, "unexpected result status %d after failed PQgetCopyData", status); + wpg_log(FATAL, "unexpected result status %d after failed PQgetCopyData", status); /* * If there was actually an error, it'll be properly reported @@ -874,12 +889,24 @@ walprop_async_read(Safekeeper *sk, char **buf, int *amount) return PG_ASYNC_READ_FAIL; default: /* Positive values indicate the size of the returned result */ - *amount = result; - *buf = sk->conn->recvbuf; + *amount = rawlen; + *buf = conn->recvbuf; return PG_ASYNC_READ_SUCCESS; } } +/* + * Receive a message from the safekeeper. + * + * On success, the data is placed in *buf. It is valid until the next call + * to this function. 
+ */ +static PGAsyncReadResult +walprop_async_read(Safekeeper *sk, char **buf, int *amount) +{ + return libpqwp_async_read(sk->conn, buf, amount); +} + static PGAsyncWriteResult walprop_async_write(Safekeeper *sk, void const *buf, size_t size) { @@ -910,7 +937,7 @@ walprop_async_write(Safekeeper *sk, void const *buf, size_t size) case -1: return PG_ASYNC_WRITE_FAIL; default: - elog(FATAL, "invalid return %d from PQputCopyData", result); + wpg_log(FATAL, "invalid return %d from PQputCopyData", result); } /* @@ -931,7 +958,7 @@ walprop_async_write(Safekeeper *sk, void const *buf, size_t size) case -1: return PG_ASYNC_WRITE_FAIL; default: - elog(FATAL, "invalid return %d from PQflush", result); + wpg_log(FATAL, "invalid return %d from PQflush", result); } } @@ -962,6 +989,33 @@ walprop_blocking_write(Safekeeper *sk, void const *buf, size_t size) return true; } +void +libpqwp_disconnect(WalProposerConn *conn) +{ + if (conn->recvbuf != NULL) + PQfreemem(conn->recvbuf); + PQfinish(conn->pg_conn); + pfree(conn); +} + +static void +walprop_finish(Safekeeper *sk) +{ + if (sk->conn) + { + libpqwp_disconnect(sk->conn); + sk->conn = NULL; + } + + /* free xlogreader */ + if (sk->xlogreader) + { + NeonWALReaderFree(sk->xlogreader); + sk->xlogreader = NULL; + } + rm_safekeeper_event_set(sk, false); +} + /* * Subscribe for new WAL and stream it in the loop to safekeepers. 
* @@ -1165,16 +1219,25 @@ XLogBroadcastWalProposer(WalProposer *wp) } } -/* - * Receive WAL from most advanced safekeeper - */ +/* Download WAL before basebackup for logical walsenders from sk, if needed */ static bool -WalProposerRecovery(Safekeeper *sk, TimeLineID timeline, XLogRecPtr startpos, XLogRecPtr endpos) +WalProposerRecovery(WalProposer *wp, Safekeeper *sk) { char *err; WalReceiverConn *wrconn; WalRcvStreamOptions options; char conninfo[MAXCONNINFO]; + TimeLineID timeline; + XLogRecPtr startpos; + XLogRecPtr endpos; + uint64 download_range_mb; + + startpos = GetLogRepRestartLSN(wp); + if (startpos == InvalidXLogRecPtr) + return true; /* recovery not needed */ + endpos = wp->propEpochStartLsn; + + timeline = wp->greetRequest.timeline; if (!neon_auth_token) { @@ -1186,7 +1249,7 @@ WalProposerRecovery(Safekeeper *sk, TimeLineID timeline, XLogRecPtr startpos, XL written = snprintf((char *) conninfo, MAXCONNINFO, "password=%s %s", neon_auth_token, sk->conninfo); if (written > MAXCONNINFO || written < 0) - elog(FATAL, "could not append password to the safekeeper connection string"); + wpg_log(FATAL, "could not append password to the safekeeper connection string"); } #if PG_MAJORVERSION_NUM < 16 @@ -1203,11 +1266,11 @@ WalProposerRecovery(Safekeeper *sk, TimeLineID timeline, XLogRecPtr startpos, XL err))); return false; } - elog(LOG, - "start recovery from %s:%s starting from %X/%08X till %X/%08X timeline " - "%d", - sk->host, sk->port, (uint32) (startpos >> 32), - (uint32) startpos, (uint32) (endpos >> 32), (uint32) endpos, timeline); + wpg_log(LOG, + "start recovery for logical replication from %s:%s starting from %X/%08X till %X/%08X timeline " + "%d", + sk->host, sk->port, (uint32) (startpos >> 32), + (uint32) startpos, (uint32) (endpos >> 32), (uint32) endpos, timeline); options.logical = false; options.startpoint = startpos; @@ -1400,28 +1463,54 @@ XLogWalPropClose(XLogRecPtr recptr) walpropFile = -1; } -static void -walprop_pg_wal_read(Safekeeper *sk, 
char *buf, XLogRecPtr startptr, Size count) -{ - WALReadError errinfo; - - if (!WALRead(sk->xlogreader, - buf, - startptr, - count, - walprop_pg_get_timeline_id(), - &errinfo)) - { - WALReadRaiseError(&errinfo); - } -} - static void walprop_pg_wal_reader_allocate(Safekeeper *sk) { - sk->xlogreader = XLogReaderAllocate(wal_segment_size, NULL, XL_ROUTINE(.segment_open = wal_segment_open,.segment_close = wal_segment_close), NULL); + char log_prefix[64]; + + snprintf(log_prefix, sizeof(log_prefix), WP_LOG_PREFIX "sk %s:%s nwr: ", sk->host, sk->port); + Assert(!sk->xlogreader); + sk->xlogreader = NeonWALReaderAllocate(wal_segment_size, sk->wp->propEpochStartLsn, sk->wp, log_prefix); if (sk->xlogreader == NULL) - elog(FATAL, "Failed to allocate xlog reader"); + wpg_log(FATAL, "failed to allocate xlog reader"); +} + +static NeonWALReadResult +walprop_pg_wal_read(Safekeeper *sk, char *buf, XLogRecPtr startptr, Size count, char **errmsg) +{ + NeonWALReadResult res; + + res = NeonWALRead(sk->xlogreader, + buf, + startptr, + count, + walprop_pg_get_timeline_id()); + + if (res == NEON_WALREAD_SUCCESS) + { + /* + * If we have the socket subscribed, but walreader doesn't need any + * events, it must mean that remote connection just closed hoping to + * do next read locally. Remove the socket then. It is important to do + * as otherwise next read might open another connection and we won't + * be able to distinguish whether we have correct socket added in wait + * event set. 
+ */ + if (NeonWALReaderEvents(sk->xlogreader) == 0) + rm_safekeeper_event_set(sk, false); + } + else if (res == NEON_WALREAD_ERROR) + { + *errmsg = NeonWALReaderErrMsg(sk->xlogreader); + } + + return res; +} + +static uint32 +walprop_pg_wal_reader_events(Safekeeper *sk) +{ + return NeonWALReaderEvents(sk->xlogreader); } static WaitEventSet *waitEvents; @@ -1438,6 +1527,8 @@ walprop_pg_free_event_set(WalProposer *wp) for (int i = 0; i < wp->n_safekeepers; i++) { wp->safekeeper[i].eventPos = -1; + wp->safekeeper[i].nwrEventPos = -1; + wp->safekeeper[i].nwrConnEstablished = false; } } @@ -1445,13 +1536,39 @@ static void walprop_pg_init_event_set(WalProposer *wp) { if (waitEvents) - elog(FATAL, "double-initialization of event set"); + wpg_log(FATAL, "double-initialization of event set"); - waitEvents = CreateWaitEventSet(TopMemoryContext, 2 + wp->n_safekeepers); + /* for each sk, we have socket plus potentially socket for neon walreader */ + waitEvents = CreateWaitEventSet(TopMemoryContext, 2 + 2 * wp->n_safekeepers); AddWaitEventToSet(waitEvents, WL_LATCH_SET, PGINVALID_SOCKET, MyLatch, NULL); AddWaitEventToSet(waitEvents, WL_EXIT_ON_PM_DEATH, PGINVALID_SOCKET, NULL, NULL); + + for (int i = 0; i < wp->n_safekeepers; i++) + { + wp->safekeeper[i].eventPos = -1; + wp->safekeeper[i].nwrEventPos = -1; + wp->safekeeper[i].nwrConnEstablished = false; + } +} + +/* add safekeeper socket to wait event set */ +static void +walprop_pg_add_safekeeper_event_set(Safekeeper *sk, uint32 events) +{ + Assert(sk->eventPos == -1); + sk->eventPos = AddWaitEventToSet(waitEvents, events, walprop_socket(sk), NULL, sk); +} + +/* add neon wal reader socket to wait event set */ +static void +add_nwr_event_set(Safekeeper *sk, uint32 events) +{ + Assert(sk->nwrEventPos == -1); + sk->nwrEventPos = AddWaitEventToSet(waitEvents, events, NeonWALReaderSocket(sk->xlogreader), NULL, sk); + sk->nwrConnEstablished = NeonWALReaderIsRemConnEstablished(sk->xlogreader); + wpg_log(DEBUG5, "sk %s:%s: added nwr 
socket events %d", sk->host, sk->port, events); } static void @@ -1463,10 +1580,144 @@ walprop_pg_update_event_set(Safekeeper *sk, uint32 events) ModifyWaitEvent(waitEvents, sk->eventPos, events, NULL); } +/* + * Update neon_walreader event. + * Can be called when nwr socket doesn't exist, does nothing in this case. + */ static void -walprop_pg_add_safekeeper_event_set(Safekeeper *sk, uint32 events) +update_nwr_event_set(Safekeeper *sk, uint32 events) { - sk->eventPos = AddWaitEventToSet(waitEvents, events, walprop_socket(sk), NULL, sk); + /* eventPos = -1 when we don't have an event */ + if (sk->nwrEventPos != -1) + ModifyWaitEvent(waitEvents, sk->nwrEventPos, events, NULL); +} + + +static void +walprop_pg_active_state_update_event_set(Safekeeper *sk) +{ + uint32 sk_events; + uint32 nwr_events; + + Assert(sk->state == SS_ACTIVE); + SafekeeperStateDesiredEvents(sk, &sk_events, &nwr_events); + + /* + * If we need to wait for neon_walreader, ensure we have up to date socket + * in the wait event set. + */ + if (sk->active_state == SS_ACTIVE_READ_WAL) + { + /* + * If conn is established and socket is thus stable, update the event + * directly; otherwise re-add it. + */ + if (sk->nwrConnEstablished) + { + Assert(sk->nwrEventPos != -1); + update_nwr_event_set(sk, nwr_events); + } + else + { + rm_safekeeper_event_set(sk, false); + add_nwr_event_set(sk, nwr_events); + } + } + else + { + /* + * Hack: we should always set 0 here, but for random reasons + * WaitEventSet (WaitEventAdjustEpoll) asserts that there is at least + * some event. Since there is also no way to remove socket except + * reconstructing the whole set, SafekeeperStateDesiredEvents instead + * gives WL_SOCKET_CLOSED if socket exists. We never expect it to + * trigger. + * + * On PG 14 which doesn't have WL_SOCKET_CLOSED resort to event + * removal. 
+ */ +#if PG_VERSION_NUM >= 150000 + Assert(nwr_events == WL_SOCKET_CLOSED || nwr_events == 0); + update_nwr_event_set(sk, WL_SOCKET_CLOSED); +#else /* pg 14 */ + rm_safekeeper_event_set(sk, false); +#endif + } + walprop_pg_update_event_set(sk, sk_events); +} + +static void +walprop_pg_rm_safekeeper_event_set(Safekeeper *to_remove) +{ + rm_safekeeper_event_set(to_remove, true); +} + +/* + * A hacky way to remove single event from the event set. Can be called if event + * doesn't exist, does nothing in this case. + * + * Note: Internally, this completely reconstructs the event set. It should be + * avoided if possible. + * + * If is_sk is true, socket of connection to safekeeper is removed; otherwise + * socket of neon_walreader. + */ +static void +rm_safekeeper_event_set(Safekeeper *to_remove, bool is_sk) +{ + WalProposer *wp = to_remove->wp; + + wpg_log(DEBUG5, "sk %s:%s: removing event, is_sk %d", + to_remove->host, to_remove->port, is_sk); + + /* + * Shortpath for exiting if have nothing to do. We never call this + * function with safekeeper socket not existing, but do that with neon + * walreader socket. + */ + if ((is_sk && to_remove->eventPos == -1) || + (!is_sk && to_remove->nwrEventPos == -1)) + { + return; + } + + /* Remove the existing event set, assign sk->eventPos = -1 */ + walprop_pg_free_event_set(wp); + + /* Re-initialize it without adding any safekeeper events */ + wp->api.init_event_set(wp); + + /* + * loop through the existing safekeepers. If they aren't the one we're + * removing, and if they have a socket we can use, re-add the applicable + * events. + */ + for (int i = 0; i < wp->n_safekeepers; i++) + { + Safekeeper *sk = &wp->safekeeper[i]; + + /* + * If this safekeeper isn't offline, add events for it, except for the + * event requested to remove. 
+ */ + if (sk->state != SS_OFFLINE) + { + uint32 sk_events; + uint32 nwr_events; + + SafekeeperStateDesiredEvents(sk, &sk_events, &nwr_events); + + if (sk != to_remove || !is_sk) + { + /* will set sk->eventPos */ + wp->api.add_safekeeper_event_set(sk, sk_events); + } + if ((sk != to_remove || is_sk) && nwr_events) + { + add_nwr_event_set(sk, nwr_events); + } + } + } } static int @@ -1484,8 +1735,8 @@ walprop_pg_wait_event_set(WalProposer *wp, long timeout, Safekeeper **sk, uint32 ConditionVariablePrepareToSleep(&WalSndCtl->wal_flush_cv); /* - * Now that we prepared the condvar, check flush ptr again -- it might have - * changed before we subscribed to cv so we missed the wakeup. + * Now that we prepared the condvar, check flush ptr again -- it might + * have changed before we subscribed to cv so we missed the wakeup. * * Do that only when we're interested in new WAL: without sync-safekeepers * and if election already passed. @@ -1548,7 +1799,7 @@ walprop_pg_finish_sync_safekeepers(WalProposer *wp, XLogRecPtr lsn) } /* - * Get PageserverFeedback fields from the most advanced safekeeper + * Choose most advanced PageserverFeedback and set it to *rf. 
*/ static void GetLatestNeonFeedback(PageserverFeedback *rf, WalProposer *wp) @@ -1571,15 +1822,13 @@ GetLatestNeonFeedback(PageserverFeedback *rf, WalProposer *wp) rf->remote_consistent_lsn = wp->safekeeper[latest_safekeeper].appendResponse.rf.remote_consistent_lsn; rf->replytime = wp->safekeeper[latest_safekeeper].appendResponse.rf.replytime; - elog(DEBUG2, "GetLatestNeonFeedback: currentClusterSize %lu," - " last_received_lsn %X/%X, disk_consistent_lsn %X/%X, remote_consistent_lsn %X/%X, replytime %lu", - rf->currentClusterSize, - LSN_FORMAT_ARGS(rf->last_received_lsn), - LSN_FORMAT_ARGS(rf->disk_consistent_lsn), - LSN_FORMAT_ARGS(rf->remote_consistent_lsn), - rf->replytime); - - replication_feedback_set(rf); + wpg_log(DEBUG2, "GetLatestNeonFeedback: currentClusterSize %lu," + " last_received_lsn %X/%X, disk_consistent_lsn %X/%X, remote_consistent_lsn %X/%X, replytime %lu", + rf->currentClusterSize, + LSN_FORMAT_ARGS(rf->last_received_lsn), + LSN_FORMAT_ARGS(rf->disk_consistent_lsn), + LSN_FORMAT_ARGS(rf->remote_consistent_lsn), + rf->replytime); } /* @@ -1619,63 +1868,69 @@ CombineHotStanbyFeedbacks(HotStandbyFeedback *hs, WalProposer *wp) hs->catalog_xmin = InvalidFullTransactionId; } +/* + * Based on commitLsn and safekeeper responses including pageserver feedback, + * 1) Propagate cluster size received from ps to ensure the limit. + * 2) Propagate pageserver LSN positions to ensure backpressure limits. + * 3) Advance walproposer slot to commitLsn (releasing WAL & waking up waiters). + * 4) Propagate hot standby feedback. + * + * None of that is functional in sync-safekeepers. 
+ */ static void walprop_pg_process_safekeeper_feedback(WalProposer *wp, XLogRecPtr commitLsn) { HotStandbyFeedback hsFeedback; - XLogRecPtr diskConsistentLsn; + XLogRecPtr oldDiskConsistentLsn; - diskConsistentLsn = quorumFeedback.rf.disk_consistent_lsn; + if (wp->config->syncSafekeepers) + return; - if (!wp->config->syncSafekeepers) + oldDiskConsistentLsn = quorumFeedback.rf.disk_consistent_lsn; + + /* Get PageserverFeedback fields from the most advanced safekeeper */ + GetLatestNeonFeedback(&quorumFeedback.rf, wp); + replication_feedback_set(&quorumFeedback.rf); + SetZenithCurrentClusterSize(quorumFeedback.rf.currentClusterSize); + + if (commitLsn > quorumFeedback.flushLsn || oldDiskConsistentLsn != quorumFeedback.rf.disk_consistent_lsn) { - /* Get PageserverFeedback fields from the most advanced safekeeper */ - GetLatestNeonFeedback(&quorumFeedback.rf, wp); - SetZenithCurrentClusterSize(quorumFeedback.rf.currentClusterSize); - } - - if (commitLsn > quorumFeedback.flushLsn || diskConsistentLsn != quorumFeedback.rf.disk_consistent_lsn) - { - if (commitLsn > quorumFeedback.flushLsn) quorumFeedback.flushLsn = commitLsn; - /* advance the replication slot */ - if (!wp->config->syncSafekeepers) - ProcessStandbyReply( - /* write_lsn - This is what durably stored in WAL service. */ - quorumFeedback.flushLsn, - /* flush_lsn - This is what durably stored in WAL service. */ - quorumFeedback.flushLsn, + /* + * Advance the replication slot to commitLsn. WAL before it is + * hardened and will be fetched from one of safekeepers by + * neon_walreader if needed. + * + * Also wakes up syncrep waiters. + */ + ProcessStandbyReply( + /* write_lsn - This is what durably stored in WAL service. */ + quorumFeedback.flushLsn, + /* flush_lsn - This is what durably stored in WAL service. */ + quorumFeedback.flushLsn, - /* - * apply_lsn - This is what processed and durably saved at* - * pageserver. 
- */ - quorumFeedback.rf.disk_consistent_lsn, - walprop_pg_get_current_timestamp(wp), false); + /* + * apply_lsn - This is what processed and durably saved at* + * pageserver. + */ + quorumFeedback.rf.disk_consistent_lsn, + walprop_pg_get_current_timestamp(wp), false); } CombineHotStanbyFeedbacks(&hsFeedback, wp); if (hsFeedback.ts != 0 && memcmp(&hsFeedback, &quorumFeedback.hs, sizeof hsFeedback) != 0) { quorumFeedback.hs = hsFeedback; - if (!wp->config->syncSafekeepers) - ProcessStandbyHSFeedback(hsFeedback.ts, - XidFromFullTransactionId(hsFeedback.xmin), - EpochFromFullTransactionId(hsFeedback.xmin), - XidFromFullTransactionId(hsFeedback.catalog_xmin), - EpochFromFullTransactionId(hsFeedback.catalog_xmin)); + ProcessStandbyHSFeedback(hsFeedback.ts, + XidFromFullTransactionId(hsFeedback.xmin), + EpochFromFullTransactionId(hsFeedback.xmin), + XidFromFullTransactionId(hsFeedback.catalog_xmin), + EpochFromFullTransactionId(hsFeedback.catalog_xmin)); } } -static void -walprop_pg_confirm_wal_streamed(WalProposer *wp, XLogRecPtr lsn) -{ - if (MyReplicationSlot) - PhysicalConfirmReceivedLocation(lsn); -} - static XLogRecPtr walprop_pg_get_redo_start_lsn(WalProposer *wp) { @@ -1694,15 +1949,15 @@ walprop_pg_log_internal(WalProposer *wp, int level, const char *line) elog(FATAL, "unexpected log_internal message at level %d: %s", level, line); } -static void -walprop_pg_after_election(WalProposer *wp) +static XLogRecPtr +GetLogRepRestartLSN(WalProposer *wp) { FILE *f; - XLogRecPtr lrRestartLsn; + XLogRecPtr lrRestartLsn = InvalidXLogRecPtr; /* We don't need to do anything in syncSafekeepers mode. */ if (wp->config->syncSafekeepers) - return; + return InvalidXLogRecPtr; /* * If there are active logical replication subscription we need to provide @@ -1710,22 +1965,40 @@ walprop_pg_after_election(WalProposer *wp) * replication slots. 
*/ f = fopen("restart.lsn", "rb"); - if (f != NULL && !wp->config->syncSafekeepers) + if (f != NULL) { - size_t rc = fread(&lrRestartLsn, sizeof(lrRestartLsn), 1, f); + size_t rc = fread(&lrRestartLsn, sizeof(lrRestartLsn), 1, f); + fclose(f); if (rc == 1 && lrRestartLsn != InvalidXLogRecPtr) { - elog(LOG, "Logical replication restart LSN %X/%X", LSN_FORMAT_ARGS(lrRestartLsn)); + uint64 download_range_mb; + + wpg_log(LOG, "logical replication restart LSN %X/%X", LSN_FORMAT_ARGS(lrRestartLsn)); + + /* + * If we need to download more than a max_slot_wal_keep_size, + * don't do it to avoid risk of exploding pg_wal. Logical + * replication won't work until recreated, but at least compute + * would start; this also follows max_slot_wal_keep_size + * semantics. + */ + download_range_mb = (wp->propEpochStartLsn - lrRestartLsn) / MB; + if (max_slot_wal_keep_size_mb > 0 && download_range_mb >= max_slot_wal_keep_size_mb) + { + wpg_log(WARNING, "not downloading WAL for logical replication since %X/%X as max_slot_wal_keep_size=%dMB", + LSN_FORMAT_ARGS(lrRestartLsn), max_slot_wal_keep_size_mb); + return InvalidXLogRecPtr; + } /* * start from the beginning of the segment to fetch page headers * verifed by XLogReader */ lrRestartLsn = lrRestartLsn - XLogSegmentOffset(lrRestartLsn, wal_segment_size); - wp->truncateLsn = Min(wp->truncateLsn, lrRestartLsn); } } + return lrRestartLsn; } static const walproposer_api walprop_pg = { @@ -1745,18 +2018,18 @@ static const walproposer_api walprop_pg = { .conn_async_write = walprop_async_write, .conn_blocking_write = walprop_blocking_write, .recovery_download = WalProposerRecovery, - .wal_read = walprop_pg_wal_read, .wal_reader_allocate = walprop_pg_wal_reader_allocate, - .free_event_set = walprop_pg_free_event_set, + .wal_read = walprop_pg_wal_read, + .wal_reader_events = walprop_pg_wal_reader_events, .init_event_set = walprop_pg_init_event_set, .update_event_set = walprop_pg_update_event_set, + .active_state_update_event_set = 
walprop_pg_active_state_update_event_set, .add_safekeeper_event_set = walprop_pg_add_safekeeper_event_set, + .rm_safekeeper_event_set = walprop_pg_rm_safekeeper_event_set, .wait_event_set = walprop_pg_wait_event_set, .strong_random = walprop_pg_strong_random, .get_redo_start_lsn = walprop_pg_get_redo_start_lsn, .finish_sync_safekeepers = walprop_pg_finish_sync_safekeepers, .process_safekeeper_feedback = walprop_pg_process_safekeeper_feedback, - .confirm_wal_streamed = walprop_pg_confirm_wal_streamed, .log_internal = walprop_pg_log_internal, - .after_election = walprop_pg_after_election, }; diff --git a/poetry.lock b/poetry.lock index 76dfd6d37d..428698cb5a 100644 --- a/poetry.lock +++ b/poetry.lock @@ -1,4 +1,4 @@ -# This file is automatically @generated by Poetry 1.5.1 and should not be changed by hand. +# This file is automatically @generated by Poetry 1.7.1 and should not be changed by hand. [[package]] name = "aiohttp" @@ -288,70 +288,21 @@ files = [ {file = "backoff-2.2.1.tar.gz", hash = "sha256:03f829f5bb1923180821643f8753b0502c3b682293992485b0eef2807afa5cba"}, ] -[[package]] -name = "black" -version = "23.3.0" -description = "The uncompromising code formatter." 
-optional = false -python-versions = ">=3.7" -files = [ - {file = "black-23.3.0-cp310-cp310-macosx_10_16_arm64.whl", hash = "sha256:0945e13506be58bf7db93ee5853243eb368ace1c08a24c65ce108986eac65915"}, - {file = "black-23.3.0-cp310-cp310-macosx_10_16_universal2.whl", hash = "sha256:67de8d0c209eb5b330cce2469503de11bca4085880d62f1628bd9972cc3366b9"}, - {file = "black-23.3.0-cp310-cp310-macosx_10_16_x86_64.whl", hash = "sha256:7c3eb7cea23904399866c55826b31c1f55bbcd3890ce22ff70466b907b6775c2"}, - {file = "black-23.3.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:32daa9783106c28815d05b724238e30718f34155653d4d6e125dc7daec8e260c"}, - {file = "black-23.3.0-cp310-cp310-win_amd64.whl", hash = "sha256:35d1381d7a22cc5b2be2f72c7dfdae4072a3336060635718cc7e1ede24221d6c"}, - {file = "black-23.3.0-cp311-cp311-macosx_10_16_arm64.whl", hash = "sha256:a8a968125d0a6a404842fa1bf0b349a568634f856aa08ffaff40ae0dfa52e7c6"}, - {file = "black-23.3.0-cp311-cp311-macosx_10_16_universal2.whl", hash = "sha256:c7ab5790333c448903c4b721b59c0d80b11fe5e9803d8703e84dcb8da56fec1b"}, - {file = "black-23.3.0-cp311-cp311-macosx_10_16_x86_64.whl", hash = "sha256:a6f6886c9869d4daae2d1715ce34a19bbc4b95006d20ed785ca00fa03cba312d"}, - {file = "black-23.3.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:6f3c333ea1dd6771b2d3777482429864f8e258899f6ff05826c3a4fcc5ce3f70"}, - {file = "black-23.3.0-cp311-cp311-win_amd64.whl", hash = "sha256:11c410f71b876f961d1de77b9699ad19f939094c3a677323f43d7a29855fe326"}, - {file = "black-23.3.0-cp37-cp37m-macosx_10_16_x86_64.whl", hash = "sha256:1d06691f1eb8de91cd1b322f21e3bfc9efe0c7ca1f0e1eb1db44ea367dff656b"}, - {file = "black-23.3.0-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:50cb33cac881766a5cd9913e10ff75b1e8eb71babf4c7104f2e9c52da1fb7de2"}, - {file = "black-23.3.0-cp37-cp37m-win_amd64.whl", hash = "sha256:e114420bf26b90d4b9daa597351337762b63039752bdf72bf361364c1aa05925"}, - {file = 
"black-23.3.0-cp38-cp38-macosx_10_16_arm64.whl", hash = "sha256:48f9d345675bb7fbc3dd85821b12487e1b9a75242028adad0333ce36ed2a6d27"}, - {file = "black-23.3.0-cp38-cp38-macosx_10_16_universal2.whl", hash = "sha256:714290490c18fb0126baa0fca0a54ee795f7502b44177e1ce7624ba1c00f2331"}, - {file = "black-23.3.0-cp38-cp38-macosx_10_16_x86_64.whl", hash = "sha256:064101748afa12ad2291c2b91c960be28b817c0c7eaa35bec09cc63aa56493c5"}, - {file = "black-23.3.0-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:562bd3a70495facf56814293149e51aa1be9931567474993c7942ff7d3533961"}, - {file = "black-23.3.0-cp38-cp38-win_amd64.whl", hash = "sha256:e198cf27888ad6f4ff331ca1c48ffc038848ea9f031a3b40ba36aced7e22f2c8"}, - {file = "black-23.3.0-cp39-cp39-macosx_10_16_arm64.whl", hash = "sha256:3238f2aacf827d18d26db07524e44741233ae09a584273aa059066d644ca7b30"}, - {file = "black-23.3.0-cp39-cp39-macosx_10_16_universal2.whl", hash = "sha256:f0bd2f4a58d6666500542b26354978218a9babcdc972722f4bf90779524515f3"}, - {file = "black-23.3.0-cp39-cp39-macosx_10_16_x86_64.whl", hash = "sha256:92c543f6854c28a3c7f39f4d9b7694f9a6eb9d3c5e2ece488c327b6e7ea9b266"}, - {file = "black-23.3.0-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:3a150542a204124ed00683f0db1f5cf1c2aaaa9cc3495b7a3b5976fb136090ab"}, - {file = "black-23.3.0-cp39-cp39-win_amd64.whl", hash = "sha256:6b39abdfb402002b8a7d030ccc85cf5afff64ee90fa4c5aebc531e3ad0175ddb"}, - {file = "black-23.3.0-py3-none-any.whl", hash = "sha256:ec751418022185b0c1bb7d7736e6933d40bbb14c14a0abcf9123d1b159f98dd4"}, - {file = "black-23.3.0.tar.gz", hash = "sha256:1c7b8d606e728a41ea1ccbd7264677e494e87cf630e399262ced92d4a8dac940"}, -] - -[package.dependencies] -click = ">=8.0.0" -mypy-extensions = ">=0.4.3" -packaging = ">=22.0" -pathspec = ">=0.9.0" -platformdirs = ">=2" -tomli = {version = ">=1.1.0", markers = "python_version < \"3.11\""} -typing-extensions = {version = ">=3.10.0.0", markers = "python_version < \"3.10\""} - 
-[package.extras] -colorama = ["colorama (>=0.4.3)"] -d = ["aiohttp (>=3.7.4)"] -jupyter = ["ipython (>=7.8.0)", "tokenize-rt (>=3.2.0)"] -uvloop = ["uvloop (>=0.15.2)"] - [[package]] name = "boto3" -version = "1.26.16" +version = "1.34.11" description = "The AWS SDK for Python" optional = false -python-versions = ">= 3.7" +python-versions = ">= 3.8" files = [ - {file = "boto3-1.26.16-py3-none-any.whl", hash = "sha256:4f493a2aed71cee93e626de4f67ce58dd82c0473480a0fc45b131715cd8f4f30"}, - {file = "boto3-1.26.16.tar.gz", hash = "sha256:31c0adf71e4bd19a5428580bb229d7ea3b5795eecaa0847a85385df00c026116"}, + {file = "boto3-1.34.11-py3-none-any.whl", hash = "sha256:1af021e0c6e3040e8de66d403e963566476235bb70f9a8e3f6784813ac2d8026"}, + {file = "boto3-1.34.11.tar.gz", hash = "sha256:31c130a40ec0631059b77d7e87f67ad03ff1685a5b37638ac0c4687026a3259d"}, ] [package.dependencies] -botocore = ">=1.29.16,<1.30.0" +botocore = ">=1.34.11,<1.35.0" jmespath = ">=0.7.1,<2.0.0" -s3transfer = ">=0.6.0,<0.7.0" +s3transfer = ">=0.10.0,<0.11.0" [package.extras] crt = ["botocore[crt] (>=1.21.0,<2.0a0)"] @@ -702,22 +653,25 @@ xray = ["mypy-boto3-xray (>=1.26.0,<1.27.0)"] [[package]] name = "botocore" -version = "1.29.16" +version = "1.34.11" description = "Low-level, data-driven core of boto 3." 
optional = false -python-versions = ">= 3.7" +python-versions = ">= 3.8" files = [ - {file = "botocore-1.29.16-py3-none-any.whl", hash = "sha256:271b599e6cfe214405ed50d41cd967add1d5d469383dd81ff583bc818b47f59b"}, - {file = "botocore-1.29.16.tar.gz", hash = "sha256:8cfcc10f2f1751608c3cec694f2d6b5e16ebcd50d0a104f9914d5616227c62e9"}, + {file = "botocore-1.34.11-py3-none-any.whl", hash = "sha256:1ff1398b6ea670e1c01ac67a33af3da854f8e700d3528289c04f319c330d8250"}, + {file = "botocore-1.34.11.tar.gz", hash = "sha256:51905c3d623c60df5dc5794387de7caf886d350180a01a3dfa762e903edb45a9"}, ] [package.dependencies] jmespath = ">=0.7.1,<2.0.0" python-dateutil = ">=2.1,<3.0.0" -urllib3 = ">=1.25.4,<1.27" +urllib3 = [ + {version = ">=1.25.4,<1.27", markers = "python_version < \"3.10\""}, + {version = ">=1.25.4,<2.1", markers = "python_version >= \"3.10\""}, +] [package.extras] -crt = ["awscrt (==0.14.0)"] +crt = ["awscrt (==0.19.19)"] [[package]] name = "botocore-stubs" @@ -1624,17 +1578,6 @@ files = [ {file = "packaging-23.0.tar.gz", hash = "sha256:b6ad297f8907de0fa2fe1ccbd26fdaf387f5f47c7275fedf8cce89f99446cf97"}, ] -[[package]] -name = "pathspec" -version = "0.9.0" -description = "Utility library for gitignore style pattern matching of file paths." -optional = false -python-versions = "!=3.0.*,!=3.1.*,!=3.2.*,!=3.3.*,!=3.4.*,>=2.7" -files = [ - {file = "pathspec-0.9.0-py2.py3-none-any.whl", hash = "sha256:7d15c4ddb0b5c802d161efc417ec1a2558ea2653c2e8ad9c19098201dc1c993a"}, - {file = "pathspec-0.9.0.tar.gz", hash = "sha256:e564499435a2673d586f6b2130bb5b95f04a3ba06f81b8f895b651a3c76aabb1"}, -] - [[package]] name = "pbr" version = "5.9.0" @@ -1646,21 +1589,6 @@ files = [ {file = "pbr-5.9.0.tar.gz", hash = "sha256:e8dca2f4b43560edef58813969f52a56cef023146cbb8931626db80e6c1c4308"}, ] -[[package]] -name = "platformdirs" -version = "2.5.2" -description = "A small Python module for determining appropriate platform-specific dirs, e.g. a \"user data dir\"." 
-optional = false -python-versions = ">=3.7" -files = [ - {file = "platformdirs-2.5.2-py3-none-any.whl", hash = "sha256:027d8e83a2d7de06bbac4e5ef7e023c02b863d7ea5d079477e722bb41ab25788"}, - {file = "platformdirs-2.5.2.tar.gz", hash = "sha256:58c8abb07dcb441e6ee4b11d8df0ac856038f944ab98b7be6b27b2a3c7feef19"}, -] - -[package.extras] -docs = ["furo (>=2021.7.5b38)", "proselint (>=0.10.2)", "sphinx (>=4)", "sphinx-autodoc-typehints (>=1.12)"] -test = ["appdirs (==1.4.4)", "pytest (>=6)", "pytest-cov (>=2.7)", "pytest-mock (>=3.6)"] - [[package]] name = "pluggy" version = "1.0.0" @@ -1889,13 +1817,13 @@ files = [ [[package]] name = "pytest" -version = "7.3.1" +version = "7.4.4" description = "pytest: simple powerful testing with Python" optional = false python-versions = ">=3.7" files = [ - {file = "pytest-7.3.1-py3-none-any.whl", hash = "sha256:3799fa815351fea3a5e96ac7e503a96fa51cc9942c3753cda7651b93c1cfa362"}, - {file = "pytest-7.3.1.tar.gz", hash = "sha256:434afafd78b1d78ed0addf160ad2b77a30d35d4bdf8af234fe621919d9ed15e3"}, + {file = "pytest-7.4.4-py3-none-any.whl", hash = "sha256:b090cdf5ed60bf4c45261be03239c2c1c22df034fbffe691abe93cd80cea01d8"}, + {file = "pytest-7.4.4.tar.gz", hash = "sha256:2cf0005922c6ace4a3e2ec8b4080eb0d9753fdc93107415332f50ce9e7994280"}, ] [package.dependencies] @@ -1907,7 +1835,7 @@ pluggy = ">=0.12,<2.0" tomli = {version = ">=1.0.0", markers = "python_version < \"3.11\""} [package.extras] -testing = ["argcomplete", "attrs (>=19.2.0)", "hypothesis (>=3.56)", "mock", "nose", "pygments (>=2.7.2)", "requests", "xmlschema"] +testing = ["argcomplete", "attrs (>=19.2.0)", "hypothesis (>=3.56)", "mock", "nose", "pygments (>=2.7.2)", "requests", "setuptools", "xmlschema"] [[package]] name = "pytest-asyncio" @@ -2204,46 +2132,46 @@ pyasn1 = ">=0.1.3" [[package]] name = "ruff" -version = "0.0.269" -description = "An extremely fast Python linter, written in Rust." 
+version = "0.1.11" +description = "An extremely fast Python linter and code formatter, written in Rust." optional = false python-versions = ">=3.7" files = [ - {file = "ruff-0.0.269-py3-none-macosx_10_7_x86_64.whl", hash = "sha256:3569bcdee679045c09c0161fabc057599759c49219a08d9a4aad2cc3982ccba3"}, - {file = "ruff-0.0.269-py3-none-macosx_10_9_x86_64.macosx_11_0_arm64.macosx_10_9_universal2.whl", hash = "sha256:56347da63757a56cbce7d4b3d6044ca4f1941cd1bbff3714f7554360c3361f83"}, - {file = "ruff-0.0.269-py3-none-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:6da8ee25ef2f0cc6cc8e6e20942c1d44d25a36dce35070d7184655bc14f63f63"}, - {file = "ruff-0.0.269-py3-none-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:bd81b8e681b9eaa6cf15484f3985bd8bd97c3d114e95bff3e8ea283bf8865062"}, - {file = "ruff-0.0.269-py3-none-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:1f19f59ca3c28742955241fb452f3346241ddbd34e72ac5cb3d84fadebcf6bc8"}, - {file = "ruff-0.0.269-py3-none-manylinux_2_17_ppc64.manylinux2014_ppc64.whl", hash = "sha256:f062059b8289a4fab7f6064601b811d447c2f9d3d432a17f689efe4d68988450"}, - {file = "ruff-0.0.269-py3-none-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:3f5dc7aac52c58e82510217e3c7efd80765c134c097c2815d59e40face0d1fe6"}, - {file = "ruff-0.0.269-py3-none-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:e131b4dbe798c391090c6407641d6ab12c0fa1bb952379dde45e5000e208dabb"}, - {file = "ruff-0.0.269-py3-none-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:a374434e588e06550df0f8dcb74777290f285678de991fda4e1063c367ab2eb2"}, - {file = "ruff-0.0.269-py3-none-musllinux_1_2_aarch64.whl", hash = "sha256:cec2f4b84a14b87f1b121488649eb5b4eaa06467a2387373f750da74bdcb5679"}, - {file = "ruff-0.0.269-py3-none-musllinux_1_2_armv7l.whl", hash = "sha256:374b161753a247904aec7a32d45e165302b76b6e83d22d099bf3ff7c232c888f"}, - {file = "ruff-0.0.269-py3-none-musllinux_1_2_i686.whl", hash = 
"sha256:9ca0a1ddb1d835b5f742db9711c6cf59f213a1ad0088cb1e924a005fd399e7d8"}, - {file = "ruff-0.0.269-py3-none-musllinux_1_2_x86_64.whl", hash = "sha256:5a20658f0b97d207c7841c13d528f36d666bf445b00b01139f28a8ccb80093bb"}, - {file = "ruff-0.0.269-py3-none-win32.whl", hash = "sha256:03ff42bc91ceca58e0f0f072cb3f9286a9208f609812753474e799a997cdad1a"}, - {file = "ruff-0.0.269-py3-none-win_amd64.whl", hash = "sha256:f3b59ccff57b21ef0967ea8021fd187ec14c528ec65507d8bcbe035912050776"}, - {file = "ruff-0.0.269-py3-none-win_arm64.whl", hash = "sha256:bbeb857b1e508a4487bdb02ca1e6d41dd8d5ac5335a5246e25de8a3dff38c1ff"}, - {file = "ruff-0.0.269.tar.gz", hash = "sha256:11ddcfbab32cf5c420ea9dd5531170ace5a3e59c16d9251c7bd2581f7b16f602"}, + {file = "ruff-0.1.11-py3-none-macosx_10_12_x86_64.macosx_11_0_arm64.macosx_10_12_universal2.whl", hash = "sha256:a7f772696b4cdc0a3b2e527fc3c7ccc41cdcb98f5c80fdd4f2b8c50eb1458196"}, + {file = "ruff-0.1.11-py3-none-macosx_10_12_x86_64.whl", hash = "sha256:934832f6ed9b34a7d5feea58972635c2039c7a3b434fe5ba2ce015064cb6e955"}, + {file = "ruff-0.1.11-py3-none-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:ea0d3e950e394c4b332bcdd112aa566010a9f9c95814844a7468325290aabfd9"}, + {file = "ruff-0.1.11-py3-none-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:9bd4025b9c5b429a48280785a2b71d479798a69f5c2919e7d274c5f4b32c3607"}, + {file = "ruff-0.1.11-py3-none-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:e1ad00662305dcb1e987f5ec214d31f7d6a062cae3e74c1cbccef15afd96611d"}, + {file = "ruff-0.1.11-py3-none-manylinux_2_17_ppc64.manylinux2014_ppc64.whl", hash = "sha256:4b077ce83f47dd6bea1991af08b140e8b8339f0ba8cb9b7a484c30ebab18a23f"}, + {file = "ruff-0.1.11-py3-none-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:c4a88efecec23c37b11076fe676e15c6cdb1271a38f2b415e381e87fe4517f18"}, + {file = "ruff-0.1.11-py3-none-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = 
"sha256:5b25093dad3b055667730a9b491129c42d45e11cdb7043b702e97125bcec48a1"}, + {file = "ruff-0.1.11-py3-none-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:231d8fb11b2cc7c0366a326a66dafc6ad449d7fcdbc268497ee47e1334f66f77"}, + {file = "ruff-0.1.11-py3-none-musllinux_1_2_aarch64.whl", hash = "sha256:09c415716884950080921dd6237767e52e227e397e2008e2bed410117679975b"}, + {file = "ruff-0.1.11-py3-none-musllinux_1_2_armv7l.whl", hash = "sha256:0f58948c6d212a6b8d41cd59e349751018797ce1727f961c2fa755ad6208ba45"}, + {file = "ruff-0.1.11-py3-none-musllinux_1_2_i686.whl", hash = "sha256:190a566c8f766c37074d99640cd9ca3da11d8deae2deae7c9505e68a4a30f740"}, + {file = "ruff-0.1.11-py3-none-musllinux_1_2_x86_64.whl", hash = "sha256:6464289bd67b2344d2a5d9158d5eb81025258f169e69a46b741b396ffb0cda95"}, + {file = "ruff-0.1.11-py3-none-win32.whl", hash = "sha256:9b8f397902f92bc2e70fb6bebfa2139008dc72ae5177e66c383fa5426cb0bf2c"}, + {file = "ruff-0.1.11-py3-none-win_amd64.whl", hash = "sha256:eb85ee287b11f901037a6683b2374bb0ec82928c5cbc984f575d0437979c521a"}, + {file = "ruff-0.1.11-py3-none-win_arm64.whl", hash = "sha256:97ce4d752f964ba559c7023a86e5f8e97f026d511e48013987623915431c7ea9"}, + {file = "ruff-0.1.11.tar.gz", hash = "sha256:f9d4d88cb6eeb4dfe20f9f0519bd2eaba8119bde87c3d5065c541dbae2b5a2cb"}, ] [[package]] name = "s3transfer" -version = "0.6.0" +version = "0.10.0" description = "An Amazon S3 Transfer Manager" optional = false -python-versions = ">= 3.7" +python-versions = ">= 3.8" files = [ - {file = "s3transfer-0.6.0-py3-none-any.whl", hash = "sha256:06176b74f3a15f61f1b4f25a1fc29a4429040b7647133a463da8fa5bd28d5ecd"}, - {file = "s3transfer-0.6.0.tar.gz", hash = "sha256:2ed07d3866f523cc561bf4a00fc5535827981b117dd7876f036b0c1aca42c947"}, + {file = "s3transfer-0.10.0-py3-none-any.whl", hash = "sha256:3cdb40f5cfa6966e812209d0994f2a4709b561c88e90cf00c2696d2df4e56b2e"}, + {file = "s3transfer-0.10.0.tar.gz", hash = 
"sha256:d0c8bbf672d5eebbe4e57945e23b972d963f07d82f661cabf678a5c88831595b"}, ] [package.dependencies] -botocore = ">=1.12.36,<2.0a.0" +botocore = ">=1.33.2,<2.0a.0" [package.extras] -crt = ["botocore[crt] (>=1.20.29,<2.0a.0)"] +crt = ["botocore[crt] (>=1.33.2,<2.0a.0)"] [[package]] name = "sarif-om" @@ -2493,16 +2421,6 @@ files = [ {file = "wrapt-1.14.1-cp310-cp310-musllinux_1_1_x86_64.whl", hash = "sha256:8ad85f7f4e20964db4daadcab70b47ab05c7c1cf2a7c1e51087bfaa83831854c"}, {file = "wrapt-1.14.1-cp310-cp310-win32.whl", hash = "sha256:a9a52172be0b5aae932bef82a79ec0a0ce87288c7d132946d645eba03f0ad8a8"}, {file = "wrapt-1.14.1-cp310-cp310-win_amd64.whl", hash = "sha256:6d323e1554b3d22cfc03cd3243b5bb815a51f5249fdcbb86fda4bf62bab9e164"}, - {file = "wrapt-1.14.1-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:ecee4132c6cd2ce5308e21672015ddfed1ff975ad0ac8d27168ea82e71413f55"}, - {file = "wrapt-1.14.1-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:2020f391008ef874c6d9e208b24f28e31bcb85ccff4f335f15a3251d222b92d9"}, - {file = "wrapt-1.14.1-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:2feecf86e1f7a86517cab34ae6c2f081fd2d0dac860cb0c0ded96d799d20b335"}, - {file = "wrapt-1.14.1-cp311-cp311-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:240b1686f38ae665d1b15475966fe0472f78e71b1b4903c143a842659c8e4cb9"}, - {file = "wrapt-1.14.1-cp311-cp311-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:a9008dad07d71f68487c91e96579c8567c98ca4c3881b9b113bc7b33e9fd78b8"}, - {file = "wrapt-1.14.1-cp311-cp311-musllinux_1_1_aarch64.whl", hash = "sha256:6447e9f3ba72f8e2b985a1da758767698efa72723d5b59accefd716e9e8272bf"}, - {file = "wrapt-1.14.1-cp311-cp311-musllinux_1_1_i686.whl", hash = "sha256:acae32e13a4153809db37405f5eba5bac5fbe2e2ba61ab227926a22901051c0a"}, - {file = "wrapt-1.14.1-cp311-cp311-musllinux_1_1_x86_64.whl", hash = 
"sha256:49ef582b7a1152ae2766557f0550a9fcbf7bbd76f43fbdc94dd3bf07cc7168be"}, - {file = "wrapt-1.14.1-cp311-cp311-win32.whl", hash = "sha256:358fe87cc899c6bb0ddc185bf3dbfa4ba646f05b1b0b9b5a27c2cb92c2cea204"}, - {file = "wrapt-1.14.1-cp311-cp311-win_amd64.whl", hash = "sha256:26046cd03936ae745a502abf44dac702a5e6880b2b01c29aea8ddf3353b68224"}, {file = "wrapt-1.14.1-cp35-cp35m-manylinux1_i686.whl", hash = "sha256:43ca3bbbe97af00f49efb06e352eae40434ca9d915906f77def219b88e85d907"}, {file = "wrapt-1.14.1-cp35-cp35m-manylinux1_x86_64.whl", hash = "sha256:6b1a564e6cb69922c7fe3a678b9f9a3c54e72b469875aa8018f18b4d1dd1adf3"}, {file = "wrapt-1.14.1-cp35-cp35m-manylinux2010_i686.whl", hash = "sha256:00b6d4ea20a906c0ca56d84f93065b398ab74b927a7a3dbd470f6fc503f95dc3"}, @@ -2740,4 +2658,4 @@ cffi = ["cffi (>=1.11)"] [metadata] lock-version = "2.0" python-versions = "^3.9" -content-hash = "c4e38082d246636903e15c02fbf8364c6afc1fd35d36a81c49f596ba68fc739b" +content-hash = "35c237fe6a9278b2dc65b06ed96bde5afb9e393d52c01b00c59acf1df3a8d482" diff --git a/pre-commit.py b/pre-commit.py index dc0b9ed588..c5ed63ac44 100755 --- a/pre-commit.py +++ b/pre-commit.py @@ -36,17 +36,17 @@ def rustfmt(fix_inplace: bool = False, no_color: bool = False) -> str: return cmd -def black(fix_inplace: bool) -> str: - cmd = "poetry run black" - if not fix_inplace: - cmd += " --diff --check" +def ruff_check(fix_inplace: bool) -> str: + cmd = "poetry run ruff check" + if fix_inplace: + cmd += " --fix" return cmd -def ruff(fix_inplace: bool) -> str: - cmd = "poetry run ruff" - if fix_inplace: - cmd += " --fix" +def ruff_format(fix_inplace: bool) -> str: + cmd = "poetry run ruff format" + if not fix_inplace: + cmd += " --diff --check" return cmd @@ -109,16 +109,16 @@ if __name__ == "__main__": no_color=args.no_color, ) check( - name="black", + name="ruff check", suffix=".py", - cmd=black(fix_inplace=args.fix_inplace), + cmd=ruff_check(fix_inplace=args.fix_inplace), changed_files=files, no_color=args.no_color, ) 
check( - name="ruff", + name="ruff format", suffix=".py", - cmd=ruff(fix_inplace=args.fix_inplace), + cmd=ruff_format(fix_inplace=args.fix_inplace), changed_files=files, no_color=args.no_color, ) diff --git a/proxy/Cargo.toml b/proxy/Cargo.toml index c94cd55417..23a9bb178d 100644 --- a/proxy/Cargo.toml +++ b/proxy/Cargo.toml @@ -5,7 +5,7 @@ edition.workspace = true license.workspace = true [features] -default = [] +default = ["testing"] testing = [] [dependencies] @@ -14,6 +14,7 @@ async-trait.workspace = true base64.workspace = true bstr.workspace = true bytes = { workspace = true, features = ["serde"] } +camino.workspace = true chrono.workspace = true clap.workspace = true consumption_metrics.workspace = true @@ -35,6 +36,8 @@ metrics.workspace = true once_cell.workspace = true opentelemetry.workspace = true parking_lot.workspace = true +parquet.workspace = true +parquet_derive.workspace = true pbkdf2 = { workspace = true, features = ["simple", "std"] } pin-project-lite.workspace = true postgres_backend.workspace = true @@ -42,6 +45,7 @@ pq_proto.workspace = true prometheus.workspace = true rand.workspace = true regex.workspace = true +remote_storage = { version = "0.1", path = "../libs/remote_storage/" } reqwest = { workspace = true, features = ["json"] } reqwest-middleware.workspace = true reqwest-retry.workspace = true @@ -75,11 +79,13 @@ x509-parser.workspace = true native-tls.workspace = true postgres-native-tls.workspace = true postgres-protocol.workspace = true +redis.workspace = true smol_str.workspace = true workspace_hack.workspace = true [dev-dependencies] +camino-tempfile.workspace = true rcgen.workspace = true rstest.workspace = true tokio-postgres-rustls.workspace = true diff --git a/proxy/src/auth.rs b/proxy/src/auth.rs index 64ef108e11..0707c1331f 100644 --- a/proxy/src/auth.rs +++ b/proxy/src/auth.rs @@ -4,7 +4,7 @@ pub mod backend; pub use backend::BackendType; mod credentials; -pub use credentials::{check_peer_addr_is_in_list, 
ClientCredentials}; +pub use credentials::{check_peer_addr_is_in_list, endpoint_sni, ComputeUserInfoMaybeEndpoint}; mod password_hack; pub use password_hack::parse_endpoint_param; diff --git a/proxy/src/auth/backend.rs b/proxy/src/auth/backend.rs index 923bd02560..120ed46992 100644 --- a/proxy/src/auth/backend.rs +++ b/proxy/src/auth/backend.rs @@ -8,26 +8,27 @@ use tokio_postgres::config::AuthKeys; use crate::auth::credentials::check_peer_addr_is_in_list; use crate::auth::validate_password_and_exchange; +use crate::cache::Cached; use crate::console::errors::GetAuthInfoError; use crate::console::AuthSecret; +use crate::context::RequestMonitoring; use crate::proxy::connect_compute::handle_try_wake; use crate::proxy::retry::retry_after; +use crate::proxy::NeonOptions; use crate::scram; use crate::stream::Stream; use crate::{ - auth::{self, ClientCredentials}, + auth::{self, ComputeUserInfoMaybeEndpoint}, config::AuthenticationConfig, console::{ self, - provider::{CachedNodeInfo, ConsoleReqExtra}, + provider::{CachedAllowedIps, CachedNodeInfo}, Api, }, - metrics::LatencyTimer, stream, url, }; use futures::TryFutureExt; use std::borrow::Cow; -use std::net::IpAddr; use std::ops::ControlFlow; use std::sync::Arc; use tokio::io::{AsyncRead, AsyncWrite}; @@ -38,7 +39,7 @@ use tracing::{error, info, warn}; /// * When `T` is `()`, it's just a regular auth backend selector /// which we use in [`crate::config::ProxyConfig`]. /// -/// * However, when we substitute `T` with [`ClientCredentials`], +/// * However, when we substitute `T` with [`ComputeUserInfoMaybeEndpoint`], /// this helps us provide the credentials only to those auth /// backends which require them for the authentication process. 
pub enum BackendType<'a, T> { @@ -56,7 +57,7 @@ pub enum BackendType<'a, T> { pub trait TestBackend: Send + Sync + 'static { fn wake_compute(&self) -> Result; - fn get_allowed_ips(&self) -> Result>, console::errors::GetAuthInfoError>; + fn get_allowed_ips(&self) -> Result, console::errors::GetAuthInfoError>; } impl std::fmt::Display for BackendType<'_, ()> { @@ -127,15 +128,23 @@ pub struct ComputeCredentials { pub keys: T, } +#[derive(Debug, Clone)] pub struct ComputeUserInfoNoEndpoint { pub user: SmolStr, - pub peer_addr: IpAddr, - pub cache_key: SmolStr, + pub options: NeonOptions, } +#[derive(Debug, Clone)] pub struct ComputeUserInfo { pub endpoint: SmolStr, - pub inner: ComputeUserInfoNoEndpoint, + pub user: SmolStr, + pub options: NeonOptions, +} + +impl ComputeUserInfo { + pub fn endpoint_cache_key(&self) -> SmolStr { + self.options.get_cache_key(&self.endpoint) + } } pub enum ComputeCredentialKeys { @@ -144,19 +153,21 @@ pub enum ComputeCredentialKeys { AuthKeys(AuthKeys), } -impl TryFrom for ComputeUserInfo { +impl TryFrom for ComputeUserInfo { // user name type Error = ComputeUserInfoNoEndpoint; - fn try_from(creds: ClientCredentials) -> Result { - let inner = ComputeUserInfoNoEndpoint { - user: creds.user, - peer_addr: creds.peer_addr, - cache_key: creds.cache_key, - }; - match creds.project { - None => Err(inner), - Some(endpoint) => Ok(ComputeUserInfo { endpoint, inner }), + fn try_from(user_info: ComputeUserInfoMaybeEndpoint) -> Result { + match user_info.project { + None => Err(ComputeUserInfoNoEndpoint { + user: user_info.user, + options: user_info.options, + }), + Some(endpoint) => Ok(ComputeUserInfo { + endpoint, + user: user_info.user, + options: user_info.options, + }), } } } @@ -166,49 +177,53 @@ impl TryFrom for ComputeUserInfo { /// /// All authentication flows will emit an AuthenticationOk message if successful. 
async fn auth_quirks( + ctx: &mut RequestMonitoring, api: &impl console::Api, - extra: &ConsoleReqExtra, - creds: ClientCredentials, + user_info: ComputeUserInfoMaybeEndpoint, client: &mut stream::PqStream>, allow_cleartext: bool, config: &'static AuthenticationConfig, - latency_timer: &mut LatencyTimer, ) -> auth::Result> { // If there's no project so far, that entails that client doesn't // support SNI or other means of passing the endpoint (project) name. // We now expect to see a very specific payload in the place of password. - let (info, unauthenticated_password) = match creds.try_into() { + let (info, unauthenticated_password) = match user_info.try_into() { Err(info) => { - let res = hacks::password_hack_no_authentication(info, client, latency_timer).await?; + let res = hacks::password_hack_no_authentication(info, client, &mut ctx.latency_timer) + .await?; + ctx.set_endpoint_id(Some(res.info.endpoint.clone())); (res.info, Some(res.keys)) } Ok(info) => (info, None), }; info!("fetching user's authentication info"); - let allowed_ips = api.get_allowed_ips(extra, &info).await?; + let allowed_ips = api.get_allowed_ips(ctx, &info).await?; // check allowed list - if !check_peer_addr_is_in_list(&info.inner.peer_addr, &allowed_ips) { + if !check_peer_addr_is_in_list(&ctx.peer_addr, &allowed_ips) { return Err(auth::AuthError::ip_address_not_allowed()); } - let cached_secret = api.get_role_secret(extra, &info).await?; + let maybe_secret = api.get_role_secret(ctx, &info).await?; - let secret = cached_secret.clone().unwrap_or_else(|| { + let cached_secret = maybe_secret.unwrap_or_else(|| { // If we don't have an authentication secret, we mock one to // prevent malicious probing (possible due to missing protocol steps). // This mocked secret will never lead to successful authentication. 
info!("authentication info not found, mocking it"); - AuthSecret::Scram(scram::ServerSecret::mock(&info.inner.user, rand::random())) + Cached::new_uncached(AuthSecret::Scram(scram::ServerSecret::mock( + &info.user, + rand::random(), + ))) }); match authenticate_with_secret( - secret, + ctx, + cached_secret.value.clone(), info, client, unauthenticated_password, allow_cleartext, config, - latency_timer, ) .await { @@ -224,13 +239,13 @@ async fn auth_quirks( } async fn authenticate_with_secret( + ctx: &mut RequestMonitoring, secret: AuthSecret, info: ComputeUserInfo, client: &mut stream::PqStream>, unauthenticated_password: Option>, allow_cleartext: bool, config: &'static AuthenticationConfig, - latency_timer: &mut LatencyTimer, ) -> auth::Result> { if let Some(password) = unauthenticated_password { let auth_outcome = validate_password_and_exchange(&password, secret)?; @@ -238,7 +253,7 @@ async fn authenticate_with_secret( crate::sasl::Outcome::Success(key) => key, crate::sasl::Outcome::Failure(reason) => { info!("auth backend failed with an error: {reason}"); - return Err(auth::AuthError::auth_failed(&*info.inner.user)); + return Err(auth::AuthError::auth_failed(&*info.user)); } }; @@ -253,38 +268,29 @@ async fn authenticate_with_secret( // Perform cleartext auth if we're allowed to do that. // Currently, we use it for websocket connections (latency). if allow_cleartext { - return hacks::authenticate_cleartext(info, client, latency_timer, secret).await; + return hacks::authenticate_cleartext(info, client, &mut ctx.latency_timer, secret).await; } // Finally, proceed with the main auth flow (SCRAM-based). - classic::authenticate(info, client, config, latency_timer, secret).await + classic::authenticate(info, client, config, &mut ctx.latency_timer, secret).await } /// Authenticate the user and then wake a compute (or retrieve an existing compute session from cache) /// only if authentication was successfuly. 
async fn auth_and_wake_compute( + ctx: &mut RequestMonitoring, api: &impl console::Api, - extra: &ConsoleReqExtra, - creds: ClientCredentials, + user_info: ComputeUserInfoMaybeEndpoint, client: &mut stream::PqStream>, allow_cleartext: bool, config: &'static AuthenticationConfig, - latency_timer: &mut LatencyTimer, ) -> auth::Result<(CachedNodeInfo, ComputeUserInfo)> { - let compute_credentials = auth_quirks( - api, - extra, - creds, - client, - allow_cleartext, - config, - latency_timer, - ) - .await?; + let compute_credentials = + auth_quirks(ctx, api, user_info, client, allow_cleartext, config).await?; let mut num_retries = 0; let mut node = loop { - let wake_res = api.wake_compute(extra, &compute_credentials.info).await; + let wake_res = api.wake_compute(ctx, &compute_credentials.info).await; match handle_try_wake(wake_res, num_retries) { Err(e) => { error!(error = ?e, num_retries, retriable = false, "couldn't wake compute node"); @@ -301,6 +307,8 @@ async fn auth_and_wake_compute( tokio::time::sleep(wait_duration).await; }; + ctx.set_project(node.aux.clone()); + match compute_credentials.keys { #[cfg(feature = "testing")] ComputeCredentialKeys::Password(password) => node.config.password(password), @@ -310,15 +318,15 @@ async fn auth_and_wake_compute( Ok((node, compute_credentials.info)) } -impl<'a> BackendType<'a, ClientCredentials> { +impl<'a> BackendType<'a, ComputeUserInfoMaybeEndpoint> { /// Get compute endpoint name from the credentials. 
pub fn get_endpoint(&self) -> Option { use BackendType::*; match self { - Console(_, creds) => creds.project.clone(), + Console(_, user_info) => user_info.project.clone(), #[cfg(feature = "testing")] - Postgres(_, creds) => creds.project.clone(), + Postgres(_, user_info) => user_info.project.clone(), Link(_) => Some("link".into()), #[cfg(test)] Test(_) => Some("test".into()), @@ -330,9 +338,9 @@ impl<'a> BackendType<'a, ClientCredentials> { use BackendType::*; match self { - Console(_, creds) => &creds.user, + Console(_, user_info) => &user_info.user, #[cfg(feature = "testing")] - Postgres(_, creds) => &creds.user, + Postgres(_, user_info) => &user_info.user, Link(_) => "link", #[cfg(test)] Test(_) => "test", @@ -343,52 +351,37 @@ impl<'a> BackendType<'a, ClientCredentials> { #[tracing::instrument(fields(allow_cleartext = allow_cleartext), skip_all)] pub async fn authenticate( self, - extra: &ConsoleReqExtra, + ctx: &mut RequestMonitoring, client: &mut stream::PqStream>, allow_cleartext: bool, config: &'static AuthenticationConfig, - latency_timer: &mut LatencyTimer, ) -> auth::Result<(CachedNodeInfo, BackendType<'a, ComputeUserInfo>)> { use BackendType::*; let res = match self { - Console(api, creds) => { + Console(api, user_info) => { info!( - user = &*creds.user, - project = creds.project(), + user = &*user_info.user, + project = user_info.project(), "performing authentication using the console" ); - let (cache_info, user_info) = auth_and_wake_compute( - &*api, - extra, - creds, - client, - allow_cleartext, - config, - latency_timer, - ) - .await?; + let (cache_info, user_info) = + auth_and_wake_compute(ctx, &*api, user_info, client, allow_cleartext, config) + .await?; (cache_info, BackendType::Console(api, user_info)) } #[cfg(feature = "testing")] - Postgres(api, creds) => { + Postgres(api, user_info) => { info!( - user = &*creds.user, - project = creds.project(), + user = &*user_info.user, + project = user_info.project(), "performing authentication using a 
local postgres instance" ); - let (cache_info, user_info) = auth_and_wake_compute( - &*api, - extra, - creds, - client, - allow_cleartext, - config, - latency_timer, - ) - .await?; + let (cache_info, user_info) = + auth_and_wake_compute(ctx, &*api, user_info, client, allow_cleartext, config) + .await?; (cache_info, BackendType::Postgres(api, user_info)) } // NOTE: this auth backend doesn't use client credentials. @@ -416,16 +409,16 @@ impl<'a> BackendType<'a, ClientCredentials> { impl BackendType<'_, ComputeUserInfo> { pub async fn get_allowed_ips( &self, - extra: &ConsoleReqExtra, - ) -> Result>, GetAuthInfoError> { + ctx: &mut RequestMonitoring, + ) -> Result { use BackendType::*; match self { - Console(api, creds) => api.get_allowed_ips(extra, creds).await, + Console(api, user_info) => api.get_allowed_ips(ctx, user_info).await, #[cfg(feature = "testing")] - Postgres(api, creds) => api.get_allowed_ips(extra, creds).await, - Link(_) => Ok(Arc::new(vec![])), + Postgres(api, user_info) => api.get_allowed_ips(ctx, user_info).await, + Link(_) => Ok(Cached::new_uncached(Arc::new(vec![]))), #[cfg(test)] - Test(x) => x.get_allowed_ips(), + Test(x) => Ok(Cached::new_uncached(Arc::new(x.get_allowed_ips()?))), } } @@ -433,14 +426,14 @@ impl BackendType<'_, ComputeUserInfo> { /// The link auth flow doesn't support this, so we return [`None`] in that case. 
pub async fn wake_compute( &self, - extra: &ConsoleReqExtra, + ctx: &mut RequestMonitoring, ) -> Result, console::errors::WakeComputeError> { use BackendType::*; match self { - Console(api, creds) => api.wake_compute(extra, creds).map_ok(Some).await, + Console(api, user_info) => api.wake_compute(ctx, user_info).map_ok(Some).await, #[cfg(feature = "testing")] - Postgres(api, creds) => api.wake_compute(extra, creds).map_ok(Some).await, + Postgres(api, user_info) => api.wake_compute(ctx, user_info).map_ok(Some).await, Link(_) => Ok(None), #[cfg(test)] Test(x) => x.wake_compute().map(Some), diff --git a/proxy/src/auth/backend/classic.rs b/proxy/src/auth/backend/classic.rs index 5c394ec649..358b335b88 100644 --- a/proxy/src/auth/backend/classic.rs +++ b/proxy/src/auth/backend/classic.rs @@ -54,7 +54,7 @@ pub(super) async fn authenticate( sasl::Outcome::Success(key) => key, sasl::Outcome::Failure(reason) => { info!("auth backend failed with an error: {reason}"); - return Err(auth::AuthError::auth_failed(&*creds.inner.user)); + return Err(auth::AuthError::auth_failed(&*creds.user)); } }; diff --git a/proxy/src/auth/backend/hacks.rs b/proxy/src/auth/backend/hacks.rs index 5dde514bca..b6c1a92d3c 100644 --- a/proxy/src/auth/backend/hacks.rs +++ b/proxy/src/auth/backend/hacks.rs @@ -36,7 +36,7 @@ pub async fn authenticate_cleartext( sasl::Outcome::Success(key) => key, sasl::Outcome::Failure(reason) => { info!("auth backend failed with an error: {reason}"); - return Err(auth::AuthError::auth_failed(&*info.inner.user)); + return Err(auth::AuthError::auth_failed(&*info.user)); } }; @@ -67,7 +67,8 @@ pub async fn password_hack_no_authentication( // Report tentative success; compute node will check the password anyway. 
Ok(ComputeCredentials { info: ComputeUserInfo { - inner: info, + user: info.user, + options: info.options, endpoint: payload.endpoint, }, keys: payload.password, diff --git a/proxy/src/auth/credentials.rs b/proxy/src/auth/credentials.rs index c04769a199..ada7f3614c 100644 --- a/proxy/src/auth/credentials.rs +++ b/proxy/src/auth/credentials.rs @@ -1,8 +1,8 @@ //! User credentials used in authentication. use crate::{ - auth::password_hack::parse_endpoint_param, error::UserFacingError, - metrics::NUM_CONNECTION_ACCEPTED_BY_SNI, proxy::neon_options_str, + auth::password_hack::parse_endpoint_param, context::RequestMonitoring, error::UserFacingError, + metrics::NUM_CONNECTION_ACCEPTED_BY_SNI, proxy::NeonOptions, }; use itertools::Itertools; use pq_proto::StartupMessageParams; @@ -12,7 +12,7 @@ use thiserror::Error; use tracing::{info, warn}; #[derive(Debug, Error, PartialEq, Eq, Clone)] -pub enum ClientCredsParseError { +pub enum ComputeUserInfoParseError { #[error("Parameter '{0}' is missing in startup packet.")] MissingKey(&'static str), @@ -33,39 +33,58 @@ pub enum ClientCredsParseError { MalformedProjectName(SmolStr), } -impl UserFacingError for ClientCredsParseError {} +impl UserFacingError for ComputeUserInfoParseError {} /// Various client credentials which we use for authentication. /// Note that we don't store any kind of client key or password here. #[derive(Debug, Clone, PartialEq, Eq)] -pub struct ClientCredentials { +pub struct ComputeUserInfoMaybeEndpoint { pub user: SmolStr, // TODO: this is a severe misnomer! We should think of a new name ASAP. 
pub project: Option, - pub cache_key: SmolStr, - pub peer_addr: IpAddr, + pub options: NeonOptions, } -impl ClientCredentials { +impl ComputeUserInfoMaybeEndpoint { #[inline] pub fn project(&self) -> Option<&str> { self.project.as_deref() } } -impl ClientCredentials { +pub fn endpoint_sni<'a>( + sni: &'a str, + common_names: &HashSet, +) -> Result<&'a str, ComputeUserInfoParseError> { + let Some((subdomain, common_name)) = sni.split_once('.') else { + return Err(ComputeUserInfoParseError::UnknownCommonName { cn: sni.into() }); + }; + if !common_names.contains(common_name) { + return Err(ComputeUserInfoParseError::UnknownCommonName { + cn: common_name.into(), + }); + } + Ok(subdomain) +} + +impl ComputeUserInfoMaybeEndpoint { pub fn parse( + ctx: &mut RequestMonitoring, params: &StartupMessageParams, sni: Option<&str>, - common_names: Option>, - peer_addr: IpAddr, - ) -> Result { - use ClientCredsParseError::*; + common_names: Option<&HashSet>, + ) -> Result { + use ComputeUserInfoParseError::*; // Some parameters are stored in the startup message. let get_param = |key| params.get(key).ok_or(MissingKey(key)); - let user = get_param("user")?.into(); + let user: SmolStr = get_param("user")?.into(); + + // record the values if we have them + ctx.set_application(params.get("application_name").map(SmolStr::from)); + ctx.set_user(user.clone()); + ctx.set_endpoint_id(sni.map(SmolStr::from)); // Project name might be passed via PG's command-line options. 
let project_option = params @@ -83,21 +102,7 @@ impl ClientCredentials { let project_from_domain = if let Some(sni_str) = sni { if let Some(cn) = common_names { - let common_name_from_sni = sni_str.split_once('.').map(|(_, domain)| domain); - - let project = common_name_from_sni - .and_then(|domain| { - if cn.contains(domain) { - subdomain_from_sni(sni_str, domain) - } else { - None - } - }) - .ok_or_else(|| UnknownCommonName { - cn: common_name_from_sni.unwrap_or("").into(), - })?; - - Some(project) + Some(SmolStr::from(endpoint_sni(sni_str, cn)?)) } else { None } @@ -136,23 +141,17 @@ impl ClientCredentials { info!("Connection with password hack"); } - let cache_key = format!( - "{}{}", - project.as_deref().unwrap_or(""), - neon_options_str(params) - ) - .into(); + let options = NeonOptions::parse_params(params); Ok(Self { user, project, - cache_key, - peer_addr, + options, }) } } -pub fn check_peer_addr_is_in_list(peer_addr: &IpAddr, ip_list: &Vec) -> bool { +pub fn check_peer_addr_is_in_list(peer_addr: &IpAddr, ip_list: &Vec) -> bool { if ip_list.is_empty() { return true; } @@ -204,25 +203,19 @@ fn project_name_valid(name: &str) -> bool { name.chars().all(|c| c.is_alphanumeric() || c == '-') } -fn subdomain_from_sni(sni: &str, common_name: &str) -> Option { - sni.strip_suffix(common_name)? - .strip_suffix('.') - .map(SmolStr::from) -} - #[cfg(test)] mod tests { use super::*; - use ClientCredsParseError::*; + use ComputeUserInfoParseError::*; #[test] fn parse_bare_minimum() -> anyhow::Result<()> { // According to postgresql, only `user` should be required. 
let options = StartupMessageParams::new([("user", "john_doe")]); - let peer_addr = IpAddr::from([127, 0, 0, 1]); - let creds = ClientCredentials::parse(&options, None, None, peer_addr)?; - assert_eq!(creds.user, "john_doe"); - assert_eq!(creds.project, None); + let mut ctx = RequestMonitoring::test(); + let user_info = ComputeUserInfoMaybeEndpoint::parse(&mut ctx, &options, None, None)?; + assert_eq!(user_info.user, "john_doe"); + assert_eq!(user_info.project, None); Ok(()) } @@ -234,10 +227,10 @@ mod tests { ("database", "world"), // should be ignored ("foo", "bar"), // should be ignored ]); - let peer_addr = IpAddr::from([127, 0, 0, 1]); - let creds = ClientCredentials::parse(&options, None, None, peer_addr)?; - assert_eq!(creds.user, "john_doe"); - assert_eq!(creds.project, None); + let mut ctx = RequestMonitoring::test(); + let user_info = ComputeUserInfoMaybeEndpoint::parse(&mut ctx, &options, None, None)?; + assert_eq!(user_info.user, "john_doe"); + assert_eq!(user_info.project, None); Ok(()) } @@ -249,11 +242,12 @@ mod tests { let sni = Some("foo.localhost"); let common_names = Some(["localhost".into()].into()); - let peer_addr = IpAddr::from([127, 0, 0, 1]); - let creds = ClientCredentials::parse(&options, sni, common_names, peer_addr)?; - assert_eq!(creds.user, "john_doe"); - assert_eq!(creds.project.as_deref(), Some("foo")); - assert_eq!(creds.cache_key, "foo"); + let mut ctx = RequestMonitoring::test(); + let user_info = + ComputeUserInfoMaybeEndpoint::parse(&mut ctx, &options, sni, common_names.as_ref())?; + assert_eq!(user_info.user, "john_doe"); + assert_eq!(user_info.project.as_deref(), Some("foo")); + assert_eq!(user_info.options.get_cache_key("foo"), "foo"); Ok(()) } @@ -265,10 +259,10 @@ mod tests { ("options", "-ckey=1 project=bar -c geqo=off"), ]); - let peer_addr = IpAddr::from([127, 0, 0, 1]); - let creds = ClientCredentials::parse(&options, None, None, peer_addr)?; - assert_eq!(creds.user, "john_doe"); - assert_eq!(creds.project.as_deref(), 
Some("bar")); + let mut ctx = RequestMonitoring::test(); + let user_info = ComputeUserInfoMaybeEndpoint::parse(&mut ctx, &options, None, None)?; + assert_eq!(user_info.user, "john_doe"); + assert_eq!(user_info.project.as_deref(), Some("bar")); Ok(()) } @@ -280,10 +274,10 @@ mod tests { ("options", "-ckey=1 endpoint=bar -c geqo=off"), ]); - let peer_addr = IpAddr::from([127, 0, 0, 1]); - let creds = ClientCredentials::parse(&options, None, None, peer_addr)?; - assert_eq!(creds.user, "john_doe"); - assert_eq!(creds.project.as_deref(), Some("bar")); + let mut ctx = RequestMonitoring::test(); + let user_info = ComputeUserInfoMaybeEndpoint::parse(&mut ctx, &options, None, None)?; + assert_eq!(user_info.user, "john_doe"); + assert_eq!(user_info.project.as_deref(), Some("bar")); Ok(()) } @@ -298,10 +292,10 @@ mod tests { ), ]); - let peer_addr = IpAddr::from([127, 0, 0, 1]); - let creds = ClientCredentials::parse(&options, None, None, peer_addr)?; - assert_eq!(creds.user, "john_doe"); - assert!(creds.project.is_none()); + let mut ctx = RequestMonitoring::test(); + let user_info = ComputeUserInfoMaybeEndpoint::parse(&mut ctx, &options, None, None)?; + assert_eq!(user_info.user, "john_doe"); + assert!(user_info.project.is_none()); Ok(()) } @@ -313,10 +307,10 @@ mod tests { ("options", "-ckey=1 endpoint=bar project=foo -c geqo=off"), ]); - let peer_addr = IpAddr::from([127, 0, 0, 1]); - let creds = ClientCredentials::parse(&options, None, None, peer_addr)?; - assert_eq!(creds.user, "john_doe"); - assert!(creds.project.is_none()); + let mut ctx = RequestMonitoring::test(); + let user_info = ComputeUserInfoMaybeEndpoint::parse(&mut ctx, &options, None, None)?; + assert_eq!(user_info.user, "john_doe"); + assert!(user_info.project.is_none()); Ok(()) } @@ -328,10 +322,11 @@ mod tests { let sni = Some("baz.localhost"); let common_names = Some(["localhost".into()].into()); - let peer_addr = IpAddr::from([127, 0, 0, 1]); - let creds = ClientCredentials::parse(&options, sni, 
common_names, peer_addr)?; - assert_eq!(creds.user, "john_doe"); - assert_eq!(creds.project.as_deref(), Some("baz")); + let mut ctx = RequestMonitoring::test(); + let user_info = + ComputeUserInfoMaybeEndpoint::parse(&mut ctx, &options, sni, common_names.as_ref())?; + assert_eq!(user_info.user, "john_doe"); + assert_eq!(user_info.project.as_deref(), Some("baz")); Ok(()) } @@ -342,15 +337,17 @@ mod tests { let common_names = Some(["a.com".into(), "b.com".into()].into()); let sni = Some("p1.a.com"); - let peer_addr = IpAddr::from([127, 0, 0, 1]); - let creds = ClientCredentials::parse(&options, sni, common_names, peer_addr)?; - assert_eq!(creds.project.as_deref(), Some("p1")); + let mut ctx = RequestMonitoring::test(); + let user_info = + ComputeUserInfoMaybeEndpoint::parse(&mut ctx, &options, sni, common_names.as_ref())?; + assert_eq!(user_info.project.as_deref(), Some("p1")); let common_names = Some(["a.com".into(), "b.com".into()].into()); let sni = Some("p1.b.com"); - let peer_addr = IpAddr::from([127, 0, 0, 1]); - let creds = ClientCredentials::parse(&options, sni, common_names, peer_addr)?; - assert_eq!(creds.project.as_deref(), Some("p1")); + let mut ctx = RequestMonitoring::test(); + let user_info = + ComputeUserInfoMaybeEndpoint::parse(&mut ctx, &options, sni, common_names.as_ref())?; + assert_eq!(user_info.project.as_deref(), Some("p1")); Ok(()) } @@ -363,9 +360,10 @@ mod tests { let sni = Some("second.localhost"); let common_names = Some(["localhost".into()].into()); - let peer_addr = IpAddr::from([127, 0, 0, 1]); - let err = ClientCredentials::parse(&options, sni, common_names, peer_addr) - .expect_err("should fail"); + let mut ctx = RequestMonitoring::test(); + let err = + ComputeUserInfoMaybeEndpoint::parse(&mut ctx, &options, sni, common_names.as_ref()) + .expect_err("should fail"); match err { InconsistentProjectNames { domain, option } => { assert_eq!(option, "first"); @@ -382,9 +380,10 @@ mod tests { let sni = Some("project.localhost"); let 
common_names = Some(["example.com".into()].into()); - let peer_addr = IpAddr::from([127, 0, 0, 1]); - let err = ClientCredentials::parse(&options, sni, common_names, peer_addr) - .expect_err("should fail"); + let mut ctx = RequestMonitoring::test(); + let err = + ComputeUserInfoMaybeEndpoint::parse(&mut ctx, &options, sni, common_names.as_ref()) + .expect_err("should fail"); match err { UnknownCommonName { cn } => { assert_eq!(cn, "localhost"); @@ -402,10 +401,14 @@ mod tests { let sni = Some("project.localhost"); let common_names = Some(["localhost".into()].into()); - let peer_addr = IpAddr::from([127, 0, 0, 1]); - let creds = ClientCredentials::parse(&options, sni, common_names, peer_addr)?; - assert_eq!(creds.project.as_deref(), Some("project")); - assert_eq!(creds.cache_key, "projectendpoint_type:read_write lsn:0/2"); + let mut ctx = RequestMonitoring::test(); + let user_info = + ComputeUserInfoMaybeEndpoint::parse(&mut ctx, &options, sni, common_names.as_ref())?; + assert_eq!(user_info.project.as_deref(), Some("project")); + assert_eq!( + user_info.options.get_cache_key("project"), + "project endpoint_type:read_write lsn:0/2" + ); Ok(()) } diff --git a/proxy/src/bin/pg_sni_router.rs b/proxy/src/bin/pg_sni_router.rs index d48ba3a54e..1edbc1e7e7 100644 --- a/proxy/src/bin/pg_sni_router.rs +++ b/proxy/src/bin/pg_sni_router.rs @@ -8,6 +8,7 @@ use std::{net::SocketAddr, sync::Arc}; use futures::future::Either; use itertools::Itertools; use proxy::config::TlsServerEndPoint; +use proxy::context::RequestMonitoring; use proxy::proxy::run_until_cancelled; use tokio::net::TcpListener; @@ -170,7 +171,16 @@ async fn task_main( .context("failed to set socket option")?; info!(%peer_addr, "serving"); - handle_client(dest_suffix, tls_config, tls_server_end_point, socket).await + let mut ctx = + RequestMonitoring::new(session_id, peer_addr.ip(), "sni_router", "sni"); + handle_client( + &mut ctx, + dest_suffix, + tls_config, + tls_server_end_point, + socket, + ) + .await } 
.unwrap_or_else(|e| { // Acknowledge that the task has finished with an error. @@ -236,6 +246,7 @@ async fn ssl_handshake( } async fn handle_client( + ctx: &mut RequestMonitoring, dest_suffix: Arc, tls_config: Arc, tls_server_end_point: TlsServerEndPoint, @@ -261,5 +272,5 @@ async fn handle_client( let client = tokio::net::TcpStream::connect(destination).await?; let metrics_aux: MetricsAuxInfo = Default::default(); - proxy::proxy::proxy_pass(tls_stream, client, metrics_aux).await + proxy::proxy::proxy_pass(ctx, tls_stream, client, metrics_aux).await } diff --git a/proxy/src/bin/proxy.rs b/proxy/src/bin/proxy.rs index be3989d387..e1dac34a59 100644 --- a/proxy/src/bin/proxy.rs +++ b/proxy/src/bin/proxy.rs @@ -3,14 +3,15 @@ use proxy::auth; use proxy::config::AuthenticationConfig; use proxy::config::CacheOptions; use proxy::config::HttpConfig; +use proxy::config::ProjectInfoCacheOptions; use proxy::console; -use proxy::console::provider::AllowedIpsCache; -use proxy::console::provider::NodeInfoCache; -use proxy::console::provider::RoleSecretCache; +use proxy::context::parquet::ParquetUploadArgs; use proxy::http; use proxy::rate_limiter::EndpointRateLimiter; use proxy::rate_limiter::RateBucketInfo; use proxy::rate_limiter::RateLimiterConfig; +use proxy::redis::notifications; +use proxy::serverless::GlobalConnPoolOptions; use proxy::usage_metrics; use anyhow::bail; @@ -43,6 +44,9 @@ enum AuthBackend { #[derive(Parser)] #[command(version = GIT_VERSION, about)] struct ProxyCliArgs { + /// Name of the region this proxy is deployed in + #[clap(long, default_value_t = String::new())] + region: String, /// listen for incoming client connections on ip:port #[clap(short, long, default_value = "127.0.0.1:4432")] proxy: String, @@ -95,12 +99,8 @@ struct ProxyCliArgs { /// Allow self-signed certificates for compute nodes (for testing) #[clap(long, default_value_t = false, value_parser = clap::builder::BoolishValueParser::new(), action = clap::ArgAction::Set)] 
allow_self_signed_compute: bool, - /// timeout for http connections - #[clap(long, default_value = "15s", value_parser = humantime::parse_duration)] - sql_over_http_timeout: tokio::time::Duration, - /// Whether the SQL over http pool is opt-in - #[clap(long, default_value_t = true, value_parser = clap::builder::BoolishValueParser::new(), action = clap::ArgAction::Set)] - sql_over_http_pool_opt_in: bool, + #[clap(flatten)] + sql_over_http: SqlOverHttpArgs, /// timeout for scram authentication protocol #[clap(long, default_value = "15s", value_parser = humantime::parse_duration)] scram_protocol_timeout: tokio::time::Duration, @@ -136,6 +136,45 @@ struct ProxyCliArgs { /// disable ip check for http requests. If it is too time consuming, it could be turned off. #[clap(long, default_value_t = false, value_parser = clap::builder::BoolishValueParser::new(), action = clap::ArgAction::Set)] disable_ip_check_for_http: bool, + /// redis url for notifications. + #[clap(long)] + redis_notifications: Option, + /// cache for `project_info` (use `size=0` to disable) + #[clap(long, default_value = config::ProjectInfoCacheOptions::CACHE_DEFAULT_OPTIONS)] + project_info_cache: String, + + #[clap(flatten)] + parquet_upload: ParquetUploadArgs, +} + +#[derive(clap::Args, Clone, Copy, Debug)] +struct SqlOverHttpArgs { + /// timeout for http connection requests + #[clap(long, default_value = "15s", value_parser = humantime::parse_duration)] + sql_over_http_timeout: tokio::time::Duration, + + /// Whether the SQL over http pool is opt-in + #[clap(long, default_value_t = true, value_parser = clap::builder::BoolishValueParser::new(), action = clap::ArgAction::Set)] + sql_over_http_pool_opt_in: bool, + + /// How many connections to pool for each endpoint. 
Excess connections are discarded + #[clap(long, default_value_t = 20)] + sql_over_http_pool_max_conns_per_endpoint: usize, + + /// How long pooled connections should remain idle for before closing + #[clap(long, default_value = "5m", value_parser = humantime::parse_duration)] + sql_over_http_idle_timeout: tokio::time::Duration, + + /// Duration each shard will wait on average before a GC sweep. + /// A longer time will cause sweeps to take longer but will interfere less frequently. + #[clap(long, default_value = "10m", value_parser = humantime::parse_duration)] + sql_over_http_pool_gc_epoch: tokio::time::Duration, + + /// How many shards should the global pool have. Must be a power of two. + /// More shards will introduce less contention for pool operations, but can + /// increase memory used by the pool + #[clap(long, default_value_t = 128)] + sql_over_http_pool_shards: usize, } #[tokio::main] @@ -194,6 +233,11 @@ async fn main() -> anyhow::Result<()> { )); } + client_tasks.spawn(proxy::context::parquet::worker( + cancellation_token.clone(), + args.parquet_upload, + )); + // maintenance tasks.
these never return unless there's an error let mut maintenance_tasks = JoinSet::new(); maintenance_tasks.spawn(proxy::handle_signals(cancellation_token)); @@ -204,6 +248,15 @@ async fn main() -> anyhow::Result<()> { maintenance_tasks.spawn(usage_metrics::task_main(metrics_config)); } + if let auth::BackendType::Console(api, _) = &config.auth_backend { + let cache = api.caches.project_info.clone(); + if let Some(url) = args.redis_notifications { + info!("Starting redis notifications listener ({url})"); + maintenance_tasks.spawn(notifications::task_main(url.to_owned(), cache.clone())); + } + maintenance_tasks.spawn(async move { cache.clone().gc_worker().await }); + } + let maintenance = loop { // get one complete task match futures::future::select( @@ -269,32 +322,17 @@ fn build_config(args: &ProxyCliArgs) -> anyhow::Result<&'static ProxyConfig> { let auth_backend = match &args.auth_backend { AuthBackend::Console => { let wake_compute_cache_config: CacheOptions = args.wake_compute_cache.parse()?; - let allowed_ips_cache_config: CacheOptions = args.allowed_ips_cache.parse()?; - let role_secret_cache_config: CacheOptions = args.role_secret_cache.parse()?; + let project_info_cache_config: ProjectInfoCacheOptions = + args.project_info_cache.parse()?; info!("Using NodeInfoCache (wake_compute) with options={wake_compute_cache_config:?}"); - info!("Using AllowedIpsCache (wake_compute) with options={allowed_ips_cache_config:?}"); - info!("Using RoleSecretCache (wake_compute) with options={role_secret_cache_config:?}"); - let caches = Box::leak(Box::new(console::caches::ApiCaches { - node_info: NodeInfoCache::new( - "node_info_cache", - wake_compute_cache_config.size, - wake_compute_cache_config.ttl, - true, - ), - allowed_ips: AllowedIpsCache::new( - "allowed_ips_cache", - allowed_ips_cache_config.size, - allowed_ips_cache_config.ttl, - false, - ), - role_secret: RoleSecretCache::new( - "role_secret_cache", - role_secret_cache_config.size, - role_secret_cache_config.ttl, - 
false, - ), - })); + info!( + "Using AllowedIpsCache (wake_compute) with options={project_info_cache_config:?}" + ); + let caches = Box::leak(Box::new(console::caches::ApiCaches::new( + wake_compute_cache_config, + project_info_cache_config, + ))); let config::WakeComputeLockOptions { shards, @@ -327,8 +365,14 @@ fn build_config(args: &ProxyCliArgs) -> anyhow::Result<&'static ProxyConfig> { } }; let http_config = HttpConfig { - timeout: args.sql_over_http_timeout, - pool_opt_in: args.sql_over_http_pool_opt_in, + request_timeout: args.sql_over_http.sql_over_http_timeout, + pool_options: GlobalConnPoolOptions { + max_conns_per_endpoint: args.sql_over_http.sql_over_http_pool_max_conns_per_endpoint, + gc_epoch: args.sql_over_http.sql_over_http_pool_gc_epoch, + pool_shards: args.sql_over_http.sql_over_http_pool_shards, + idle_timeout: args.sql_over_http.sql_over_http_idle_timeout, + opt_in: args.sql_over_http.sql_over_http_pool_opt_in, + }, }; let authentication_config = AuthenticationConfig { scram_protocol_timeout: args.scram_protocol_timeout, @@ -347,6 +391,8 @@ fn build_config(args: &ProxyCliArgs) -> anyhow::Result<&'static ProxyConfig> { require_client_ip: args.require_client_ip, disable_ip_check_for_http: args.disable_ip_check_for_http, endpoint_rps_limit, + // TODO: add this argument + region: args.region.clone(), })); Ok(config) diff --git a/proxy/src/cache.rs b/proxy/src/cache.rs index f54f360b01..fc5f416395 100644 --- a/proxy/src/cache.rs +++ b/proxy/src/cache.rs @@ -1,311 +1,6 @@ -use std::{ - borrow::Borrow, - hash::Hash, - ops::{Deref, DerefMut}, - time::{Duration, Instant}, -}; -use tracing::debug; - -// This seems to make more sense than `lru` or `cached`: -// -// * `near/nearcore` ditched `cached` in favor of `lru` -// (https://github.com/near/nearcore/issues?q=is%3Aissue+lru+is%3Aclosed). -// -// * `lru` methods use an obscure `KeyRef` type in their contraints (which is deliberately excluded from docs). 
-// This severely hinders its usage both in terms of creating wrappers and supported key types. -// -// On the other hand, `hashlink` has good download stats and appears to be maintained. -use hashlink::{linked_hash_map::RawEntryMut, LruCache}; - -/// A generic trait which exposes types of cache's key and value, -/// as well as the notion of cache entry invalidation. -/// This is useful for [`timed_lru::Cached`]. -pub trait Cache { - /// Entry's key. - type Key; - - /// Entry's value. - type Value; - - /// Used for entry invalidation. - type LookupInfo; - - /// Invalidate an entry using a lookup info. - /// We don't have an empty default impl because it's error-prone. - fn invalidate(&self, _: &Self::LookupInfo); -} - -impl Cache for &C { - type Key = C::Key; - type Value = C::Value; - type LookupInfo = C::LookupInfo; - - fn invalidate(&self, info: &Self::LookupInfo) { - C::invalidate(self, info) - } -} +pub mod common; +pub mod project_info; +mod timed_lru; +pub use common::{Cache, Cached}; pub use timed_lru::TimedLru; -pub mod timed_lru { - use super::*; - - /// An implementation of timed LRU cache with fixed capacity. - /// Key properties: - /// - /// * Whenever a new entry is inserted, the least recently accessed one is evicted. - /// The cache also keeps track of entry's insertion time (`created_at`) and TTL (`expires_at`). - /// - /// * If `update_ttl_on_retrieval` is `true`. When the entry is about to be retrieved, we check its expiration timestamp. - /// If the entry has expired, we remove it from the cache; Otherwise we bump the - /// expiration timestamp (e.g. +5mins) and change its place in LRU list to prolong - /// its existence. - /// - /// * There's an API for immediate invalidation (removal) of a cache entry; - /// It's useful in case we know for sure that the entry is no longer correct. - /// See [`timed_lru::LookupInfo`] & [`timed_lru::Cached`] for more information. 
- /// - /// * Expired entries are kept in the cache, until they are evicted by the LRU policy, - /// or by a successful lookup (i.e. the entry hasn't expired yet). - /// There is no background job to reap the expired records. - /// - /// * It's possible for an entry that has not yet expired entry to be evicted - /// before expired items. That's a bit wasteful, but probably fine in practice. - pub struct TimedLru { - /// Cache's name for tracing. - name: &'static str, - - /// The underlying cache implementation. - cache: parking_lot::Mutex>>, - - /// Default time-to-live of a single entry. - ttl: Duration, - - update_ttl_on_retrieval: bool, - } - - impl Cache for TimedLru { - type Key = K; - type Value = V; - type LookupInfo = LookupInfo; - - fn invalidate(&self, info: &Self::LookupInfo) { - self.invalidate_raw(info) - } - } - - struct Entry { - created_at: Instant, - expires_at: Instant, - value: T, - } - - impl TimedLru { - /// Construct a new LRU cache with timed entries. - pub fn new( - name: &'static str, - capacity: usize, - ttl: Duration, - update_ttl_on_retrieval: bool, - ) -> Self { - Self { - name, - cache: LruCache::new(capacity).into(), - ttl, - update_ttl_on_retrieval, - } - } - - /// Drop an entry from the cache if it's outdated. - #[tracing::instrument(level = "debug", fields(cache = self.name), skip_all)] - fn invalidate_raw(&self, info: &LookupInfo) { - let now = Instant::now(); - - // Do costly things before taking the lock. - let mut cache = self.cache.lock(); - let raw_entry = match cache.raw_entry_mut().from_key(&info.key) { - RawEntryMut::Vacant(_) => return, - RawEntryMut::Occupied(x) => x, - }; - - // Remove the entry if it was created prior to lookup timestamp. 
- let entry = raw_entry.get(); - let (created_at, expires_at) = (entry.created_at, entry.expires_at); - let should_remove = created_at <= info.created_at || expires_at <= now; - - if should_remove { - raw_entry.remove(); - } - - drop(cache); // drop lock before logging - debug!( - created_at = format_args!("{created_at:?}"), - expires_at = format_args!("{expires_at:?}"), - entry_removed = should_remove, - "processed a cache entry invalidation event" - ); - } - - /// Try retrieving an entry by its key, then execute `extract` if it exists. - #[tracing::instrument(level = "debug", fields(cache = self.name), skip_all)] - fn get_raw(&self, key: &Q, extract: impl FnOnce(&K, &Entry) -> R) -> Option - where - K: Borrow, - Q: Hash + Eq + ?Sized, - { - let now = Instant::now(); - let deadline = now.checked_add(self.ttl).expect("time overflow"); - - // Do costly things before taking the lock. - let mut cache = self.cache.lock(); - let mut raw_entry = match cache.raw_entry_mut().from_key(key) { - RawEntryMut::Vacant(_) => return None, - RawEntryMut::Occupied(x) => x, - }; - - // Immeditely drop the entry if it has expired. - let entry = raw_entry.get(); - if entry.expires_at <= now { - raw_entry.remove(); - return None; - } - - let value = extract(raw_entry.key(), entry); - let (created_at, expires_at) = (entry.created_at, entry.expires_at); - - // Update the deadline and the entry's position in the LRU list. - if self.update_ttl_on_retrieval { - raw_entry.get_mut().expires_at = deadline; - } - raw_entry.to_back(); - - drop(cache); // drop lock before logging - debug!( - created_at = format_args!("{created_at:?}"), - old_expires_at = format_args!("{expires_at:?}"), - new_expires_at = format_args!("{deadline:?}"), - "accessed a cache entry" - ); - - Some(value) - } - - /// Insert an entry to the cache. If an entry with the same key already - /// existed, return the previous value and its creation timestamp. 
- #[tracing::instrument(level = "debug", fields(cache = self.name), skip_all)] - fn insert_raw(&self, key: K, value: V) -> (Instant, Option) { - let created_at = Instant::now(); - let expires_at = created_at.checked_add(self.ttl).expect("time overflow"); - - let entry = Entry { - created_at, - expires_at, - value, - }; - - // Do costly things before taking the lock. - let old = self - .cache - .lock() - .insert(key, entry) - .map(|entry| entry.value); - - debug!( - created_at = format_args!("{created_at:?}"), - expires_at = format_args!("{expires_at:?}"), - replaced = old.is_some(), - "created a cache entry" - ); - - (created_at, old) - } - } - - impl TimedLru { - pub fn insert(&self, key: K, value: V) -> (Option, Cached<&Self>) { - let (created_at, old) = self.insert_raw(key.clone(), value.clone()); - - let cached = Cached { - token: Some((self, LookupInfo { created_at, key })), - value, - }; - - (old, cached) - } - } - - impl TimedLru { - /// Retrieve a cached entry in convenient wrapper. - pub fn get(&self, key: &Q) -> Option> - where - K: Borrow + Clone, - Q: Hash + Eq + ?Sized, - { - self.get_raw(key, |key, entry| { - let info = LookupInfo { - created_at: entry.created_at, - key: key.clone(), - }; - - Cached { - token: Some((self, info)), - value: entry.value.clone(), - } - }) - } - } - - /// Lookup information for key invalidation. - pub struct LookupInfo { - /// Time of creation of a cache [`Entry`]. - /// We use this during invalidation lookups to prevent eviction of a newer - /// entry sharing the same key (it might've been inserted by a different - /// task after we got the entry we're trying to invalidate now). - created_at: Instant, - - /// Search by this key. - key: K, - } - - /// Wrapper for convenient entry invalidation. - pub struct Cached { - /// Cache + lookup info. - token: Option<(C, C::LookupInfo)>, - - /// The value itself. - value: C::Value, - } - - impl Cached { - /// Place any entry into this wrapper; invalidation will be a no-op. 
- pub fn new_uncached(value: C::Value) -> Self { - Self { token: None, value } - } - - /// Drop this entry from a cache if it's still there. - pub fn invalidate(self) -> C::Value { - if let Some((cache, info)) = &self.token { - cache.invalidate(info); - } - self.value - } - - /// Tell if this entry is actually cached. - pub fn cached(&self) -> bool { - self.token.is_some() - } - } - - impl Deref for Cached { - type Target = C::Value; - - fn deref(&self) -> &Self::Target { - &self.value - } - } - - impl DerefMut for Cached { - fn deref_mut(&mut self) -> &mut Self::Target { - &mut self.value - } - } -} diff --git a/proxy/src/cache/common.rs b/proxy/src/cache/common.rs new file mode 100644 index 0000000000..2af6a70e90 --- /dev/null +++ b/proxy/src/cache/common.rs @@ -0,0 +1,72 @@ +use std::ops::{Deref, DerefMut}; + +/// A generic trait which exposes types of cache's key and value, +/// as well as the notion of cache entry invalidation. +/// This is useful for [`Cached`]. +pub trait Cache { + /// Entry's key. + type Key; + + /// Entry's value. + type Value; + + /// Used for entry invalidation. + type LookupInfo; + + /// Invalidate an entry using a lookup info. + /// We don't have an empty default impl because it's error-prone. + fn invalidate(&self, _: &Self::LookupInfo); +} + +impl Cache for &C { + type Key = C::Key; + type Value = C::Value; + type LookupInfo = C::LookupInfo; + + fn invalidate(&self, info: &Self::LookupInfo) { + C::invalidate(self, info) + } +} + +/// Wrapper for convenient entry invalidation. +pub struct Cached::Value> { + /// Cache + lookup info. + pub token: Option<(C, C::LookupInfo)>, + + /// The value itself. + pub value: V, +} + +impl Cached { + /// Place any entry into this wrapper; invalidation will be a no-op. + pub fn new_uncached(value: V) -> Self { + Self { token: None, value } + } + + /// Drop this entry from a cache if it's still there. 
+ pub fn invalidate(self) -> V { + if let Some((cache, info)) = &self.token { + cache.invalidate(info); + } + self.value + } + + /// Tell if this entry is actually cached. + pub fn cached(&self) -> bool { + self.token.is_some() + } +} + +impl Deref for Cached { + type Target = V; + + fn deref(&self) -> &Self::Target { + &self.value + } +} + +impl DerefMut for Cached { + fn deref_mut(&mut self) -> &mut Self::Target { + &mut self.value + } +} diff --git a/proxy/src/cache/project_info.rs b/proxy/src/cache/project_info.rs new file mode 100644 index 0000000000..7af2118873 --- /dev/null +++ b/proxy/src/cache/project_info.rs @@ -0,0 +1,496 @@ +use std::{ + collections::HashSet, + convert::Infallible, + sync::{atomic::AtomicU64, Arc}, + time::Duration, +}; + +use dashmap::DashMap; +use rand::{thread_rng, Rng}; +use smol_str::SmolStr; +use tokio::time::Instant; +use tracing::{debug, info}; + +use crate::{config::ProjectInfoCacheOptions, console::AuthSecret}; + +use super::{Cache, Cached}; + +pub trait ProjectInfoCache { + fn invalidate_allowed_ips_for_project(&self, project_id: &SmolStr); + fn invalidate_role_secret_for_project(&self, project_id: &SmolStr, role_name: &SmolStr); + fn enable_ttl(&self); + fn disable_ttl(&self); +} + +struct Entry { + created_at: Instant, + value: T, +} + +impl Entry { + pub fn new(value: T) -> Self { + Self { + created_at: Instant::now(), + value, + } + } +} + +impl From for Entry { + fn from(value: T) -> Self { + Self::new(value) + } +} + +#[derive(Default)] +struct EndpointInfo { + secret: std::collections::HashMap>, + allowed_ips: Option>>>, +} + +impl EndpointInfo { + fn check_ignore_cache(ignore_cache_since: Option, created_at: Instant) -> bool { + match ignore_cache_since { + None => false, + Some(t) => t < created_at, + } + } + pub fn get_role_secret( + &self, + role_name: &SmolStr, + valid_since: Instant, + ignore_cache_since: Option, + ) -> Option<(AuthSecret, bool)> { + if let Some(secret) = self.secret.get(role_name) { + if 
valid_since < secret.created_at { + return Some(( + secret.value.clone(), + Self::check_ignore_cache(ignore_cache_since, secret.created_at), + )); + } + } + None + } + + pub fn get_allowed_ips( + &self, + valid_since: Instant, + ignore_cache_since: Option, + ) -> Option<(Arc>, bool)> { + if let Some(allowed_ips) = &self.allowed_ips { + if valid_since < allowed_ips.created_at { + return Some(( + allowed_ips.value.clone(), + Self::check_ignore_cache(ignore_cache_since, allowed_ips.created_at), + )); + } + } + None + } + pub fn invalidate_allowed_ips(&mut self) { + self.allowed_ips = None; + } + pub fn invalidate_role_secret(&mut self, role_name: &SmolStr) { + self.secret.remove(role_name); + } +} + +/// Cache for project info. +/// This is used to cache auth data for endpoints. +/// Invalidation is done by console notifications or by TTL (if console notifications are disabled). +/// +/// We also store endpoint-to-project mapping in the cache, to be able to access per-endpoint data. +/// One may ask, why the data is stored per project, when on the user request there is only data about the endpoint available? +/// On the cplane side updates are done per project (or per branch), so it's easier to invalidate the whole project cache. 
+pub struct ProjectInfoCacheImpl { + cache: DashMap, + + project2ep: DashMap>, + config: ProjectInfoCacheOptions, + + start_time: Instant, + ttl_disabled_since_us: AtomicU64, +} + +impl ProjectInfoCache for ProjectInfoCacheImpl { + fn invalidate_allowed_ips_for_project(&self, project_id: &SmolStr) { + info!("invalidating allowed ips for project `{}`", project_id); + let endpoints = self + .project2ep + .get(project_id) + .map(|kv| kv.value().clone()) + .unwrap_or_default(); + for endpoint_id in endpoints { + if let Some(mut endpoint_info) = self.cache.get_mut(&endpoint_id) { + endpoint_info.invalidate_allowed_ips(); + } + } + } + fn invalidate_role_secret_for_project(&self, project_id: &SmolStr, role_name: &SmolStr) { + info!( + "invalidating role secret for project_id `{}` and role_name `{}`", + project_id, role_name + ); + let endpoints = self + .project2ep + .get(project_id) + .map(|kv| kv.value().clone()) + .unwrap_or_default(); + for endpoint_id in endpoints { + if let Some(mut endpoint_info) = self.cache.get_mut(&endpoint_id) { + endpoint_info.invalidate_role_secret(role_name); + } + } + } + fn enable_ttl(&self) { + self.ttl_disabled_since_us + .store(u64::MAX, std::sync::atomic::Ordering::Relaxed); + } + + fn disable_ttl(&self) { + let new_ttl = (self.start_time.elapsed() + self.config.ttl).as_micros() as u64; + self.ttl_disabled_since_us + .store(new_ttl, std::sync::atomic::Ordering::Relaxed); + } +} + +impl ProjectInfoCacheImpl { + pub fn new(config: ProjectInfoCacheOptions) -> Self { + Self { + cache: DashMap::new(), + project2ep: DashMap::new(), + config, + ttl_disabled_since_us: AtomicU64::new(u64::MAX), + start_time: Instant::now(), + } + } + + pub fn get_role_secret( + &self, + endpoint_id: &SmolStr, + role_name: &SmolStr, + ) -> Option> { + let (valid_since, ignore_cache_since) = self.get_cache_times(); + let endpoint_info = self.cache.get(endpoint_id)?; + let (value, ignore_cache) = + endpoint_info.get_role_secret(role_name, valid_since, 
ignore_cache_since)?; + if !ignore_cache { + let cached = Cached { + token: Some(( + self, + CachedLookupInfo::new_role_secret(endpoint_id.clone(), role_name.clone()), + )), + value, + }; + return Some(cached); + } + Some(Cached::new_uncached(value)) + } + pub fn get_allowed_ips( + &self, + endpoint_id: &SmolStr, + ) -> Option>>> { + let (valid_since, ignore_cache_since) = self.get_cache_times(); + let endpoint_info = self.cache.get(endpoint_id)?; + let value = endpoint_info.get_allowed_ips(valid_since, ignore_cache_since); + let (value, ignore_cache) = value?; + if !ignore_cache { + let cached = Cached { + token: Some((self, CachedLookupInfo::new_allowed_ips(endpoint_id.clone()))), + value, + }; + return Some(cached); + } + Some(Cached::new_uncached(value)) + } + pub fn insert_role_secret( + &self, + project_id: &SmolStr, + endpoint_id: &SmolStr, + role_name: &SmolStr, + secret: AuthSecret, + ) { + if self.cache.len() >= self.config.size { + // If there are too many entries, wait until the next gc cycle. + return; + } + self.inser_project2endpoint(project_id, endpoint_id); + let mut entry = self.cache.entry(endpoint_id.clone()).or_default(); + if entry.secret.len() < self.config.max_roles { + entry.secret.insert(role_name.clone(), secret.into()); + } + } + pub fn insert_allowed_ips( + &self, + project_id: &SmolStr, + endpoint_id: &SmolStr, + allowed_ips: Arc>, + ) { + if self.cache.len() >= self.config.size { + // If there are too many entries, wait until the next gc cycle. 
+ return; + } + self.inser_project2endpoint(project_id, endpoint_id); + self.cache + .entry(endpoint_id.clone()) + .or_default() + .allowed_ips = Some(allowed_ips.into()); + } + fn inser_project2endpoint(&self, project_id: &SmolStr, endpoint_id: &SmolStr) { + if let Some(mut endpoints) = self.project2ep.get_mut(project_id) { + endpoints.insert(endpoint_id.clone()); + } else { + self.project2ep + .insert(project_id.clone(), HashSet::from([endpoint_id.clone()])); + } + } + fn get_cache_times(&self) -> (Instant, Option) { + let mut valid_since = Instant::now() - self.config.ttl; + // Only ignore cache if ttl is disabled. + let ttl_disabled_since_us = self + .ttl_disabled_since_us + .load(std::sync::atomic::Ordering::Relaxed); + let ignore_cache_since = if ttl_disabled_since_us != u64::MAX { + let ignore_cache_since = self.start_time + Duration::from_micros(ttl_disabled_since_us); + // We are fine if entry is not older than ttl or was added before we are getting notifications. + valid_since = valid_since.min(ignore_cache_since); + Some(ignore_cache_since) + } else { + None + }; + (valid_since, ignore_cache_since) + } + + pub async fn gc_worker(&self) -> anyhow::Result { + let mut interval = + tokio::time::interval(self.config.gc_interval / (self.cache.shards().len()) as u32); + loop { + interval.tick().await; + if self.cache.len() <= self.config.size { + // If there are not too many entries, wait until the next gc cycle. + continue; + } + self.gc(); + } + } + + fn gc(&self) { + let shard = thread_rng().gen_range(0..self.project2ep.shards().len()); + debug!(shard, "project_info_cache: performing epoch reclamation"); + + // acquire a random shard lock + let mut removed = 0; + let shard = self.project2ep.shards()[shard].write(); + for (_, endpoints) in shard.iter() { + for endpoint in endpoints.get().iter() { + self.cache.remove(endpoint); + removed += 1; + } + } + // We can drop this shard only after making sure that all endpoints are removed. 
+ drop(shard); + info!("project_info_cache: removed {removed} endpoints"); + } +} + +/// Lookup info for project info cache. +/// This is used to invalidate cache entries. +pub struct CachedLookupInfo { + /// Search by this key. + endpoint_id: SmolStr, + lookup_type: LookupType, +} + +impl CachedLookupInfo { + pub(self) fn new_role_secret(endpoint_id: SmolStr, role_name: SmolStr) -> Self { + Self { + endpoint_id, + lookup_type: LookupType::RoleSecret(role_name), + } + } + pub(self) fn new_allowed_ips(endpoint_id: SmolStr) -> Self { + Self { + endpoint_id, + lookup_type: LookupType::AllowedIps, + } + } +} + +enum LookupType { + RoleSecret(SmolStr), + AllowedIps, +} + +impl Cache for ProjectInfoCacheImpl { + type Key = SmolStr; + // Value is not really used here, but we need to specify it. + type Value = SmolStr; + + type LookupInfo = CachedLookupInfo; + + fn invalidate(&self, key: &Self::LookupInfo) { + match &key.lookup_type { + LookupType::RoleSecret(role_name) => { + if let Some(mut endpoint_info) = self.cache.get_mut(&key.endpoint_id) { + endpoint_info.invalidate_role_secret(role_name); + } + } + LookupType::AllowedIps => { + if let Some(mut endpoint_info) = self.cache.get_mut(&key.endpoint_id) { + endpoint_info.invalidate_allowed_ips(); + } + } + } + } +} + +#[cfg(test)] +mod tests { + use super::*; + use crate::{console::AuthSecret, scram::ServerSecret}; + use smol_str::SmolStr; + use std::{sync::Arc, time::Duration}; + + #[tokio::test] + async fn test_project_info_cache_settings() { + tokio::time::pause(); + let cache = ProjectInfoCacheImpl::new(ProjectInfoCacheOptions { + size: 2, + max_roles: 2, + ttl: Duration::from_secs(1), + gc_interval: Duration::from_secs(600), + }); + let project_id = "project".into(); + let endpoint_id = "endpoint".into(); + let user1: SmolStr = "user1".into(); + let user2: SmolStr = "user2".into(); + let secret1 = AuthSecret::Scram(ServerSecret::mock(user1.as_str(), [1; 32])); + let secret2 = 
AuthSecret::Scram(ServerSecret::mock(user2.as_str(), [2; 32])); + let allowed_ips = Arc::new(vec!["allowed_ip1".into(), "allowed_ip2".into()]); + cache.insert_role_secret(&project_id, &endpoint_id, &user1, secret1.clone()); + cache.insert_role_secret(&project_id, &endpoint_id, &user2, secret2.clone()); + cache.insert_allowed_ips(&project_id, &endpoint_id, allowed_ips.clone()); + + let cached = cache.get_role_secret(&endpoint_id, &user1).unwrap(); + assert!(cached.cached()); + assert_eq!(cached.value, secret1); + let cached = cache.get_role_secret(&endpoint_id, &user2).unwrap(); + assert!(cached.cached()); + assert_eq!(cached.value, secret2); + + // Shouldn't add more than 2 roles. + let user3: SmolStr = "user3".into(); + let secret3 = AuthSecret::Scram(ServerSecret::mock(user3.as_str(), [3; 32])); + cache.insert_role_secret(&project_id, &endpoint_id, &user3, secret3.clone()); + assert!(cache.get_role_secret(&endpoint_id, &user3).is_none()); + + let cached = cache.get_allowed_ips(&endpoint_id).unwrap(); + assert!(cached.cached()); + assert_eq!(cached.value, allowed_ips); + + tokio::time::advance(Duration::from_secs(2)).await; + let cached = cache.get_role_secret(&endpoint_id, &user1); + assert!(cached.is_none()); + let cached = cache.get_role_secret(&endpoint_id, &user2); + assert!(cached.is_none()); + let cached = cache.get_allowed_ips(&endpoint_id); + assert!(cached.is_none()); + } + + #[tokio::test] + async fn test_project_info_cache_invalidations() { + tokio::time::pause(); + let cache = Arc::new(ProjectInfoCacheImpl::new(ProjectInfoCacheOptions { + size: 2, + max_roles: 2, + ttl: Duration::from_secs(1), + gc_interval: Duration::from_secs(600), + })); + cache.clone().disable_ttl(); + tokio::time::advance(Duration::from_secs(2)).await; + + let project_id = "project".into(); + let endpoint_id = "endpoint".into(); + let user1: SmolStr = "user1".into(); + let user2: SmolStr = "user2".into(); + let secret1 = AuthSecret::Scram(ServerSecret::mock(user1.as_str(), [1; 
32])); + let secret2 = AuthSecret::Scram(ServerSecret::mock(user2.as_str(), [2; 32])); + let allowed_ips = Arc::new(vec!["allowed_ip1".into(), "allowed_ip2".into()]); + cache.insert_role_secret(&project_id, &endpoint_id, &user1, secret1.clone()); + cache.insert_role_secret(&project_id, &endpoint_id, &user2, secret2.clone()); + cache.insert_allowed_ips(&project_id, &endpoint_id, allowed_ips.clone()); + + tokio::time::advance(Duration::from_secs(2)).await; + // Nothing should be invalidated. + + let cached = cache.get_role_secret(&endpoint_id, &user1).unwrap(); + // TTL is disabled, so it should be impossible to invalidate this value. + assert!(!cached.cached()); + assert_eq!(cached.value, secret1); + + cached.invalidate(); // Shouldn't do anything. + let cached = cache.get_role_secret(&endpoint_id, &user1).unwrap(); + assert_eq!(cached.value, secret1); + + let cached = cache.get_role_secret(&endpoint_id, &user2).unwrap(); + assert!(!cached.cached()); + assert_eq!(cached.value, secret2); + + // The only way to invalidate this value is to invalidate via the api. 
+ cache.invalidate_role_secret_for_project(&project_id, &user2); + assert!(cache.get_role_secret(&endpoint_id, &user2).is_none()); + + let cached = cache.get_allowed_ips(&endpoint_id).unwrap(); + assert!(!cached.cached()); + assert_eq!(cached.value, allowed_ips); + } + + #[tokio::test] + async fn test_disable_ttl_invalidate_added_before() { + tokio::time::pause(); + let cache = Arc::new(ProjectInfoCacheImpl::new(ProjectInfoCacheOptions { + size: 2, + max_roles: 2, + ttl: Duration::from_secs(1), + gc_interval: Duration::from_secs(600), + })); + + let project_id = "project".into(); + let endpoint_id = "endpoint".into(); + let user1: SmolStr = "user1".into(); + let user2: SmolStr = "user2".into(); + let secret1 = AuthSecret::Scram(ServerSecret::mock(user1.as_str(), [1; 32])); + let secret2 = AuthSecret::Scram(ServerSecret::mock(user2.as_str(), [2; 32])); + let allowed_ips = Arc::new(vec!["allowed_ip1".into(), "allowed_ip2".into()]); + cache.insert_role_secret(&project_id, &endpoint_id, &user1, secret1.clone()); + cache.clone().disable_ttl(); + tokio::time::advance(Duration::from_millis(100)).await; + cache.insert_role_secret(&project_id, &endpoint_id, &user2, secret2.clone()); + + // Added before ttl was disabled + ttl should be still cached. + let cached = cache.get_role_secret(&endpoint_id, &user1).unwrap(); + assert!(cached.cached()); + let cached = cache.get_role_secret(&endpoint_id, &user2).unwrap(); + assert!(cached.cached()); + + tokio::time::advance(Duration::from_secs(1)).await; + // Added before ttl was disabled + ttl should expire. + assert!(cache.get_role_secret(&endpoint_id, &user1).is_none()); + assert!(cache.get_role_secret(&endpoint_id, &user2).is_none()); + + // Added after ttl was disabled + ttl should not be cached. 
+ cache.insert_allowed_ips(&project_id, &endpoint_id, allowed_ips.clone()); + let cached = cache.get_allowed_ips(&endpoint_id).unwrap(); + assert!(!cached.cached()); + + tokio::time::advance(Duration::from_secs(1)).await; + // Added before ttl was disabled + ttl still should expire. + assert!(cache.get_role_secret(&endpoint_id, &user1).is_none()); + assert!(cache.get_role_secret(&endpoint_id, &user2).is_none()); + // Shouldn't be invalidated. + + let cached = cache.get_allowed_ips(&endpoint_id).unwrap(); + assert!(!cached.cached()); + assert_eq!(cached.value, allowed_ips); + } +} diff --git a/proxy/src/cache/timed_lru.rs b/proxy/src/cache/timed_lru.rs new file mode 100644 index 0000000000..3b21381bb9 --- /dev/null +++ b/proxy/src/cache/timed_lru.rs @@ -0,0 +1,258 @@ +use std::{ + borrow::Borrow, + hash::Hash, + time::{Duration, Instant}, +}; +use tracing::debug; + +// This seems to make more sense than `lru` or `cached`: +// +// * `near/nearcore` ditched `cached` in favor of `lru` +// (https://github.com/near/nearcore/issues?q=is%3Aissue+lru+is%3Aclosed). +// +// * `lru` methods use an obscure `KeyRef` type in their contraints (which is deliberately excluded from docs). +// This severely hinders its usage both in terms of creating wrappers and supported key types. +// +// On the other hand, `hashlink` has good download stats and appears to be maintained. +use hashlink::{linked_hash_map::RawEntryMut, LruCache}; + +use super::{common::Cached, *}; + +/// An implementation of timed LRU cache with fixed capacity. +/// Key properties: +/// +/// * Whenever a new entry is inserted, the least recently accessed one is evicted. +/// The cache also keeps track of entry's insertion time (`created_at`) and TTL (`expires_at`). +/// +/// * If `update_ttl_on_retrieval` is `true`. When the entry is about to be retrieved, we check its expiration timestamp. +/// If the entry has expired, we remove it from the cache; Otherwise we bump the +/// expiration timestamp (e.g. 
+5mins) and change its place in LRU list to prolong +/// its existence. +/// +/// * There's an API for immediate invalidation (removal) of a cache entry; +/// It's useful in case we know for sure that the entry is no longer correct. +/// See [`timed_lru::LookupInfo`] & [`timed_lru::Cached`] for more information. +/// +/// * Expired entries are kept in the cache, until they are evicted by the LRU policy, +/// or by a successful lookup (i.e. the entry hasn't expired yet). +/// There is no background job to reap the expired records. +/// +/// * It's possible for an entry that has not yet expired entry to be evicted +/// before expired items. That's a bit wasteful, but probably fine in practice. +pub struct TimedLru { + /// Cache's name for tracing. + name: &'static str, + + /// The underlying cache implementation. + cache: parking_lot::Mutex>>, + + /// Default time-to-live of a single entry. + ttl: Duration, + + update_ttl_on_retrieval: bool, +} + +impl Cache for TimedLru { + type Key = K; + type Value = V; + type LookupInfo = LookupInfo; + + fn invalidate(&self, info: &Self::LookupInfo) { + self.invalidate_raw(info) + } +} + +struct Entry { + created_at: Instant, + expires_at: Instant, + value: T, +} + +impl TimedLru { + /// Construct a new LRU cache with timed entries. + pub fn new( + name: &'static str, + capacity: usize, + ttl: Duration, + update_ttl_on_retrieval: bool, + ) -> Self { + Self { + name, + cache: LruCache::new(capacity).into(), + ttl, + update_ttl_on_retrieval, + } + } + + /// Drop an entry from the cache if it's outdated. + #[tracing::instrument(level = "debug", fields(cache = self.name), skip_all)] + fn invalidate_raw(&self, info: &LookupInfo) { + let now = Instant::now(); + + // Do costly things before taking the lock. 
+ let mut cache = self.cache.lock(); + let raw_entry = match cache.raw_entry_mut().from_key(&info.key) { + RawEntryMut::Vacant(_) => return, + RawEntryMut::Occupied(x) => x, + }; + + // Remove the entry if it was created prior to lookup timestamp. + let entry = raw_entry.get(); + let (created_at, expires_at) = (entry.created_at, entry.expires_at); + let should_remove = created_at <= info.created_at || expires_at <= now; + + if should_remove { + raw_entry.remove(); + } + + drop(cache); // drop lock before logging + debug!( + created_at = format_args!("{created_at:?}"), + expires_at = format_args!("{expires_at:?}"), + entry_removed = should_remove, + "processed a cache entry invalidation event" + ); + } + + /// Try retrieving an entry by its key, then execute `extract` if it exists. + #[tracing::instrument(level = "debug", fields(cache = self.name), skip_all)] + fn get_raw(&self, key: &Q, extract: impl FnOnce(&K, &Entry) -> R) -> Option + where + K: Borrow, + Q: Hash + Eq + ?Sized, + { + let now = Instant::now(); + let deadline = now.checked_add(self.ttl).expect("time overflow"); + + // Do costly things before taking the lock. + let mut cache = self.cache.lock(); + let mut raw_entry = match cache.raw_entry_mut().from_key(key) { + RawEntryMut::Vacant(_) => return None, + RawEntryMut::Occupied(x) => x, + }; + + // Immeditely drop the entry if it has expired. + let entry = raw_entry.get(); + if entry.expires_at <= now { + raw_entry.remove(); + return None; + } + + let value = extract(raw_entry.key(), entry); + let (created_at, expires_at) = (entry.created_at, entry.expires_at); + + // Update the deadline and the entry's position in the LRU list. 
+ if self.update_ttl_on_retrieval { + raw_entry.get_mut().expires_at = deadline; + } + raw_entry.to_back(); + + drop(cache); // drop lock before logging + debug!( + created_at = format_args!("{created_at:?}"), + old_expires_at = format_args!("{expires_at:?}"), + new_expires_at = format_args!("{deadline:?}"), + "accessed a cache entry" + ); + + Some(value) + } + + /// Insert an entry to the cache. If an entry with the same key already + /// existed, return the previous value and its creation timestamp. + #[tracing::instrument(level = "debug", fields(cache = self.name), skip_all)] + fn insert_raw(&self, key: K, value: V) -> (Instant, Option) { + let created_at = Instant::now(); + let expires_at = created_at.checked_add(self.ttl).expect("time overflow"); + + let entry = Entry { + created_at, + expires_at, + value, + }; + + // Do costly things before taking the lock. + let old = self + .cache + .lock() + .insert(key, entry) + .map(|entry| entry.value); + + debug!( + created_at = format_args!("{created_at:?}"), + expires_at = format_args!("{expires_at:?}"), + replaced = old.is_some(), + "created a cache entry" + ); + + (created_at, old) + } +} + +impl TimedLru { + pub fn insert(&self, key: K, value: V) -> (Option, Cached<&Self>) { + let (created_at, old) = self.insert_raw(key.clone(), value.clone()); + + let cached = Cached { + token: Some((self, LookupInfo { created_at, key })), + value, + }; + + (old, cached) + } +} + +impl TimedLru { + /// Retrieve a cached entry in convenient wrapper. + pub fn get(&self, key: &Q) -> Option> + where + K: Borrow + Clone, + Q: Hash + Eq + ?Sized, + { + self.get_raw(key, |key, entry| { + let info = LookupInfo { + created_at: entry.created_at, + key: key.clone(), + }; + + Cached { + token: Some((self, info)), + value: entry.value.clone(), + } + }) + } + + /// Retrieve a cached entry in convenient wrapper, ignoring its TTL. 
+ pub fn get_ignoring_ttl(&self, key: &Q) -> Option> + where + K: Borrow, + Q: Hash + Eq + ?Sized, + { + let mut cache = self.cache.lock(); + cache + .get(key) + .map(|entry| Cached::new_uncached(entry.value.clone())) + } + + /// Remove an entry from the cache. + pub fn remove(&self, key: &Q) -> Option + where + K: Borrow + Clone, + Q: Hash + Eq + ?Sized, + { + let mut cache = self.cache.lock(); + cache.remove(key).map(|entry| entry.value) + } +} + +/// Lookup information for key invalidation. +pub struct LookupInfo { + /// Time of creation of a cache [`Entry`]. + /// We use this during invalidation lookups to prevent eviction of a newer + /// entry sharing the same key (it might've been inserted by a different + /// task after we got the entry we're trying to invalidate now). + created_at: Instant, + + /// Search by this key. + key: K, +} diff --git a/proxy/src/compute.rs b/proxy/src/compute.rs index a54ba56e43..aef1aab733 100644 --- a/proxy/src/compute.rs +++ b/proxy/src/compute.rs @@ -1,6 +1,7 @@ use crate::{ auth::parse_endpoint_param, cancellation::CancelClosure, console::errors::WakeComputeError, - error::UserFacingError, metrics::NUM_DB_CONNECTIONS_GAUGE, proxy::neon_option, + context::RequestMonitoring, error::UserFacingError, metrics::NUM_DB_CONNECTIONS_GAUGE, + proxy::neon_option, }; use futures::{FutureExt, TryFutureExt}; use itertools::Itertools; @@ -38,7 +39,17 @@ impl UserFacingError for ConnectionError { // This helps us drop irrelevant library-specific prefixes. // TODO: propagate severity level and other parameters. Postgres(err) => match err.as_db_error() { - Some(err) => err.message().to_owned(), + Some(err) => { + let msg = err.message(); + + if msg.starts_with("unsupported startup parameter: ") + || msg.starts_with("unsupported startup parameter in options: ") + { + format!("{msg}. Please use unpooled connection or remove this parameter from the startup package. 
More details: https://neon.tech/docs/connect/connection-errors#unsupported-startup-parameter") + } else { + msg.to_owned() + } + } None => err.to_string(), }, WakeComputeError(err) => err.to_string_client(), @@ -232,9 +243,9 @@ impl ConnCfg { /// Connect to a corresponding compute node. pub async fn connect( &self, + ctx: &mut RequestMonitoring, allow_self_signed_compute: bool, timeout: Duration, - proto: &'static str, ) -> Result { let (socket_addr, stream, host) = self.connect_raw(timeout).await?; @@ -268,7 +279,9 @@ impl ConnCfg { stream, params, cancel_closure, - _guage: NUM_DB_CONNECTIONS_GAUGE.with_label_values(&[proto]).guard(), + _guage: NUM_DB_CONNECTIONS_GAUGE + .with_label_values(&[ctx.protocol]) + .guard(), }; Ok(connection) diff --git a/proxy/src/config.rs b/proxy/src/config.rs index 2ed248af8d..2c46458a49 100644 --- a/proxy/src/config.rs +++ b/proxy/src/config.rs @@ -1,4 +1,4 @@ -use crate::{auth, rate_limiter::RateBucketInfo}; +use crate::{auth, rate_limiter::RateBucketInfo, serverless::GlobalConnPoolOptions}; use anyhow::{bail, ensure, Context, Ok}; use rustls::{sign, Certificate, PrivateKey}; use sha2::{Digest, Sha256}; @@ -21,6 +21,7 @@ pub struct ProxyConfig { pub require_client_ip: bool, pub disable_ip_check_for_http: bool, pub endpoint_rps_limit: Vec, + pub region: String, } #[derive(Debug)] @@ -31,13 +32,13 @@ pub struct MetricCollectionConfig { pub struct TlsConfig { pub config: Arc, - pub common_names: Option>, + pub common_names: HashSet, pub cert_resolver: Arc, } pub struct HttpConfig { - pub timeout: tokio::time::Duration, - pub pool_opt_in: bool, + pub request_timeout: tokio::time::Duration, + pub pool_options: GlobalConnPoolOptions, } pub struct AuthenticationConfig { @@ -96,7 +97,7 @@ pub fn configure_tls( Ok(TlsConfig { config, - common_names: Some(common_names), + common_names, cert_resolver, }) } @@ -351,6 +352,69 @@ impl FromStr for CacheOptions { } } +/// Helper for cmdline cache options parsing. 
+#[derive(Debug)] +pub struct ProjectInfoCacheOptions { + /// Max number of entries. + pub size: usize, + /// Entry's time-to-live. + pub ttl: Duration, + /// Max number of roles per endpoint. + pub max_roles: usize, + /// Gc interval. + pub gc_interval: Duration, +} + +impl ProjectInfoCacheOptions { + /// Default options for [`crate::console::provider::NodeInfoCache`]. + pub const CACHE_DEFAULT_OPTIONS: &'static str = + "size=10000,ttl=4m,max_roles=10,gc_interval=60m"; + + /// Parse cache options passed via cmdline. + /// Example: [`Self::CACHE_DEFAULT_OPTIONS`]. + fn parse(options: &str) -> anyhow::Result { + let mut size = None; + let mut ttl = None; + let mut max_roles = None; + let mut gc_interval = None; + + for option in options.split(',') { + let (key, value) = option + .split_once('=') + .with_context(|| format!("bad key-value pair: {option}"))?; + + match key { + "size" => size = Some(value.parse()?), + "ttl" => ttl = Some(humantime::parse_duration(value)?), + "max_roles" => max_roles = Some(value.parse()?), + "gc_interval" => gc_interval = Some(humantime::parse_duration(value)?), + unknown => bail!("unknown key: {unknown}"), + } + } + + // TTL doesn't matter if cache is always empty. + if let Some(0) = size { + ttl.get_or_insert(Duration::default()); + } + + Ok(Self { + size: size.context("missing `size`")?, + ttl: ttl.context("missing `ttl`")?, + max_roles: max_roles.context("missing `max_roles`")?, + gc_interval: gc_interval.context("missing `gc_interval`")?, + }) + } +} + +impl FromStr for ProjectInfoCacheOptions { + type Err = anyhow::Error; + + fn from_str(options: &str) -> Result { + let error = || format!("failed to parse cache options '{options}'"); + Self::parse(options).with_context(error) + } +} + /// Helper for cmdline cache options parsing. 
pub struct WakeComputeLockOptions { /// The number of shards the lock map should have diff --git a/proxy/src/console.rs b/proxy/src/console.rs index 07bc807950..fd3c46b946 100644 --- a/proxy/src/console.rs +++ b/proxy/src/console.rs @@ -6,7 +6,7 @@ pub mod messages; /// Wrappers for console APIs and their mocks. pub mod provider; -pub use provider::{errors, Api, AuthSecret, CachedNodeInfo, ConsoleReqExtra, NodeInfo}; +pub use provider::{errors, Api, AuthSecret, CachedNodeInfo, NodeInfo}; /// Various cache-related types. pub mod caches { diff --git a/proxy/src/console/messages.rs b/proxy/src/console/messages.rs index 837379b21f..c02d65668f 100644 --- a/proxy/src/console/messages.rs +++ b/proxy/src/console/messages.rs @@ -15,6 +15,7 @@ pub struct ConsoleError { pub struct GetRoleSecret { pub role_secret: Box, pub allowed_ips: Option>>, + pub project_id: Option>, } // Manually implement debug to omit sensitive info. @@ -207,12 +208,17 @@ mod tests { "role_secret": "secret", }); let _: GetRoleSecret = serde_json::from_str(&json.to_string())?; - // Empty `allowed_ips` field. 
let json = json!({ "role_secret": "secret", "allowed_ips": ["8.8.8.8"], }); let _: GetRoleSecret = serde_json::from_str(&json.to_string())?; + let json = json!({ + "role_secret": "secret", + "allowed_ips": ["8.8.8.8"], + "project_id": "project", + }); + let _: GetRoleSecret = serde_json::from_str(&json.to_string())?; Ok(()) } diff --git a/proxy/src/console/provider.rs b/proxy/src/console/provider.rs index e4cf1e8c8e..84c43183cc 100644 --- a/proxy/src/console/provider.rs +++ b/proxy/src/console/provider.rs @@ -5,17 +5,18 @@ pub mod neon; use super::messages::MetricsAuxInfo; use crate::{ auth::backend::ComputeUserInfo, - cache::{timed_lru, TimedLru}, - compute, scram, + cache::{project_info::ProjectInfoCacheImpl, Cached, TimedLru}, + compute, + config::{CacheOptions, ProjectInfoCacheOptions}, + context::RequestMonitoring, + scram, }; use async_trait::async_trait; use dashmap::DashMap; use smol_str::SmolStr; use std::{sync::Arc, time::Duration}; -use tokio::{ - sync::{OwnedSemaphorePermit, Semaphore}, - time::Instant, -}; +use tokio::sync::{OwnedSemaphorePermit, Semaphore}; +use tokio::time::Instant; use tracing::info; pub mod errors { @@ -196,28 +197,8 @@ pub mod errors { } } -/// Extra query params we'd like to pass to the console. -pub struct ConsoleReqExtra { - /// A unique identifier for a connection. - pub session_id: uuid::Uuid, - /// Name of client application, if set. - pub application_name: String, - pub options: Vec<(String, String)>, -} - -impl ConsoleReqExtra { - // https://swagger.io/docs/specification/serialization/ DeepObject format - // paramName[prop1]=value1¶mName[prop2]=value2&.... - pub fn options_as_deep_object(&self) -> Vec<(String, String)> { - self.options - .iter() - .map(|(k, v)| (format!("options[{}]", k), v.to_string())) - .collect() - } -} - /// Auth secret which is managed by the cloud. -#[derive(Clone)] +#[derive(Clone, Eq, PartialEq, Debug)] pub enum AuthSecret { #[cfg(feature = "testing")] /// Md5 hash of user's password. 
@@ -231,7 +212,9 @@ pub enum AuthSecret { pub struct AuthInfo { pub secret: Option, /// List of IP addresses allowed for the autorization. - pub allowed_ips: Vec, + pub allowed_ips: Vec, + /// Project ID. This is used for cache invalidation. + pub project_id: Option, } /// Info for establishing a connection to a compute node. @@ -250,33 +233,34 @@ pub struct NodeInfo { pub allow_self_signed_compute: bool, } -pub type NodeInfoCache = TimedLru, NodeInfo>; -pub type CachedNodeInfo = timed_lru::Cached<&'static NodeInfoCache>; -pub type AllowedIpsCache = TimedLru>>; -pub type RoleSecretCache = TimedLru<(SmolStr, SmolStr), Option>; -pub type CachedRoleSecret = timed_lru::Cached<&'static RoleSecretCache>; +pub type NodeInfoCache = TimedLru; +pub type CachedNodeInfo = Cached<&'static NodeInfoCache>; +pub type CachedRoleSecret = Cached<&'static ProjectInfoCacheImpl, AuthSecret>; +pub type CachedAllowedIps = Cached<&'static ProjectInfoCacheImpl, Arc>>; /// This will allocate per each call, but the http requests alone /// already require a few allocations, so it should be fine. #[async_trait] pub trait Api { /// Get the client's auth secret for authentication. + /// Returns option because user not found situation is special. + /// We still have to mock the scram to avoid leaking information that user doesn't exist. async fn get_role_secret( &self, - extra: &ConsoleReqExtra, + ctx: &mut RequestMonitoring, creds: &ComputeUserInfo, - ) -> Result; + ) -> Result, errors::GetAuthInfoError>; async fn get_allowed_ips( &self, - extra: &ConsoleReqExtra, + ctx: &mut RequestMonitoring, creds: &ComputeUserInfo, - ) -> Result>, errors::GetAuthInfoError>; + ) -> Result; /// Wake up the compute node and return the corresponding connection info. async fn wake_compute( &self, - extra: &ConsoleReqExtra, + ctx: &mut RequestMonitoring, creds: &ComputeUserInfo, ) -> Result; } @@ -285,16 +269,31 @@ pub trait Api { pub struct ApiCaches { /// Cache for the `wake_compute` API method. 
pub node_info: NodeInfoCache, - /// Cache for the `get_allowed_ips`. TODO(anna): use notifications listener instead. - pub allowed_ips: AllowedIpsCache, - /// Cache for the `get_role_secret`. TODO(anna): use notifications listener instead. - pub role_secret: RoleSecretCache, + /// Cache which stores project_id -> endpoint_ids mapping. + pub project_info: Arc, +} + +impl ApiCaches { + pub fn new( + wake_compute_cache_config: CacheOptions, + project_info_cache_config: ProjectInfoCacheOptions, + ) -> Self { + Self { + node_info: NodeInfoCache::new( + "node_info_cache", + wake_compute_cache_config.size, + wake_compute_cache_config.ttl, + true, + ), + project_info: Arc::new(ProjectInfoCacheImpl::new(project_info_cache_config)), + } + } } /// Various caches for [`console`](super). pub struct ApiLocks { name: &'static str, - node_locks: DashMap, Arc>, + node_locks: DashMap>, permits: usize, timeout: Duration, registered: prometheus::IntCounter, @@ -362,7 +361,7 @@ impl ApiLocks { pub async fn get_wake_compute_permit( &self, - key: &Arc, + key: &SmolStr, ) -> Result { if self.permits == 0 { return Ok(WakeComputePermit { permit: None }); diff --git a/proxy/src/console/provider/mock.rs b/proxy/src/console/provider/mock.rs index dba5e5863f..cc35a06708 100644 --- a/proxy/src/console/provider/mock.rs +++ b/proxy/src/console/provider/mock.rs @@ -1,15 +1,17 @@ //! Mock console backend which relies on a user-provided postgres instance. 
-use std::sync::Arc; - use super::{ errors::{ApiError, GetAuthInfoError, WakeComputeError}, - AuthInfo, AuthSecret, CachedNodeInfo, ConsoleReqExtra, NodeInfo, + AuthInfo, AuthSecret, CachedNodeInfo, NodeInfo, }; -use crate::console::provider::CachedRoleSecret; +use crate::cache::Cached; +use crate::console::provider::{CachedAllowedIps, CachedRoleSecret}; +use crate::context::RequestMonitoring; use crate::{auth::backend::ComputeUserInfo, compute, error::io_error, scram, url::ApiUrl}; use async_trait::async_trait; use futures::TryFutureExt; +use smol_str::SmolStr; +use std::sync::Arc; use thiserror::Error; use tokio_postgres::{config::SslMode, Client}; use tracing::{error, info, info_span, warn, Instrument}; @@ -48,7 +50,7 @@ impl Api { async fn do_get_auth_info( &self, - creds: &ComputeUserInfo, + user_info: &ComputeUserInfo, ) -> Result { let (secret, allowed_ips) = async { // Perhaps we could persist this connection, but then we'd have to @@ -61,7 +63,7 @@ impl Api { let secret = match get_execute_postgres_query( &client, "select rolpassword from pg_catalog.pg_authid where rolname = $1", - &[&&*creds.inner.user], + &[&&*user_info.user], "rolpassword", ) .await? @@ -72,14 +74,14 @@ impl Api { secret.or_else(|| parse_md5(&entry).map(AuthSecret::Md5)) } None => { - warn!("user '{}' does not exist", creds.inner.user); + warn!("user '{}' does not exist", user_info.user); None } }; let allowed_ips = match get_execute_postgres_query( &client, "select allowed_ips from neon_control_plane.endpoints where endpoint_id = $1", - &[&creds.endpoint.as_str()], + &[&user_info.endpoint.as_str()], "allowed_ips", ) .await? 
@@ -98,7 +100,8 @@ impl Api { .await?; Ok(AuthInfo { secret, - allowed_ips, + allowed_ips: allowed_ips.iter().map(SmolStr::from).collect(), + project_id: None, }) } @@ -145,27 +148,31 @@ impl super::Api for Api { #[tracing::instrument(skip_all)] async fn get_role_secret( &self, - _extra: &ConsoleReqExtra, - creds: &ComputeUserInfo, - ) -> Result { - Ok(CachedRoleSecret::new_uncached( - self.do_get_auth_info(creds).await?.secret, - )) + _ctx: &mut RequestMonitoring, + user_info: &ComputeUserInfo, + ) -> Result, GetAuthInfoError> { + Ok(self + .do_get_auth_info(user_info) + .await? + .secret + .map(CachedRoleSecret::new_uncached)) } async fn get_allowed_ips( &self, - _extra: &ConsoleReqExtra, - creds: &ComputeUserInfo, - ) -> Result>, GetAuthInfoError> { - Ok(Arc::new(self.do_get_auth_info(creds).await?.allowed_ips)) + _ctx: &mut RequestMonitoring, + user_info: &ComputeUserInfo, + ) -> Result { + Ok(Cached::new_uncached(Arc::new( + self.do_get_auth_info(user_info).await?.allowed_ips, + ))) } #[tracing::instrument(skip_all)] async fn wake_compute( &self, - _extra: &ConsoleReqExtra, - _creds: &ComputeUserInfo, + _ctx: &mut RequestMonitoring, + _user_info: &ComputeUserInfo, ) -> Result { self.do_wake_compute() .map_ok(CachedNodeInfo::new_uncached) diff --git a/proxy/src/console/provider/neon.rs b/proxy/src/console/provider/neon.rs index 628d98df49..b61e7d2301 100644 --- a/proxy/src/console/provider/neon.rs +++ b/proxy/src/console/provider/neon.rs @@ -3,15 +3,20 @@ use super::{ super::messages::{ConsoleError, GetRoleSecret, WakeCompute}, errors::{ApiError, GetAuthInfoError, WakeComputeError}, - ApiCaches, ApiLocks, AuthInfo, AuthSecret, CachedNodeInfo, CachedRoleSecret, ConsoleReqExtra, + ApiCaches, ApiLocks, AuthInfo, AuthSecret, CachedAllowedIps, CachedNodeInfo, CachedRoleSecret, NodeInfo, }; -use crate::metrics::{ALLOWED_IPS_BY_CACHE_OUTCOME, ALLOWED_IPS_NUMBER}; use crate::{auth::backend::ComputeUserInfo, compute, http, scram}; +use crate::{ + cache::Cached, + 
context::RequestMonitoring, + metrics::{ALLOWED_IPS_BY_CACHE_OUTCOME, ALLOWED_IPS_NUMBER}, +}; use async_trait::async_trait; use futures::TryFutureExt; use itertools::Itertools; -use std::{net::SocketAddr, sync::Arc}; +use smol_str::SmolStr; +use std::sync::Arc; use tokio::time::Instant; use tokio_postgres::config::SslMode; use tracing::{error, info, info_span, warn, Instrument}; @@ -19,7 +24,7 @@ use tracing::{error, info, info_span, warn, Instrument}; #[derive(Clone)] pub struct Api { endpoint: http::Endpoint, - caches: &'static ApiCaches, + pub caches: &'static ApiCaches, locks: &'static ApiLocks, jwt: String, } @@ -49,21 +54,22 @@ impl Api { async fn do_get_auth_info( &self, - extra: &ConsoleReqExtra, - creds: &ComputeUserInfo, + ctx: &mut RequestMonitoring, + user_info: &ComputeUserInfo, ) -> Result { let request_id = uuid::Uuid::new_v4().to_string(); + let application_name = ctx.console_application_name(); async { let request = self .endpoint .get("proxy_get_role_secret") .header("X-Request-ID", &request_id) .header("Authorization", format!("Bearer {}", &self.jwt)) - .query(&[("session_id", extra.session_id)]) + .query(&[("session_id", ctx.session_id)]) .query(&[ - ("application_name", extra.application_name.as_str()), - ("project", creds.endpoint.as_str()), - ("role", creds.inner.user.as_str()), + ("application_name", application_name.as_str()), + ("project", user_info.endpoint.as_str()), + ("role", user_info.user.as_str()), ]) .build()?; @@ -87,12 +93,13 @@ impl Api { .allowed_ips .into_iter() .flatten() - .map(String::from) + .map(SmolStr::from) .collect_vec(); ALLOWED_IPS_NUMBER.observe(allowed_ips.len() as f64); Ok(AuthInfo { secret: Some(secret), allowed_ips, + project_id: body.project_id.map(SmolStr::from), }) } .map_err(crate::error::log_error) @@ -102,27 +109,28 @@ impl Api { async fn do_wake_compute( &self, - extra: &ConsoleReqExtra, - creds: &ComputeUserInfo, + ctx: &mut RequestMonitoring, + user_info: &ComputeUserInfo, ) -> Result { let request_id 
= uuid::Uuid::new_v4().to_string(); + let application_name = ctx.console_application_name(); async { let mut request_builder = self .endpoint .get("proxy_wake_compute") .header("X-Request-ID", &request_id) .header("Authorization", format!("Bearer {}", &self.jwt)) - .query(&[("session_id", extra.session_id)]) + .query(&[("session_id", ctx.session_id)]) .query(&[ - ("application_name", extra.application_name.as_str()), - ("project", creds.endpoint.as_str()), + ("application_name", application_name.as_str()), + ("project", user_info.endpoint.as_str()), ]); - request_builder = if extra.options.is_empty() { - request_builder - } else { - request_builder.query(&extra.options_as_deep_object()) - }; + let options = user_info.options.to_deep_object(); + if !options.is_empty() { + request_builder = request_builder.query(&options); + } + let request = request_builder.build()?; info!(url = request.url().as_str(), "sending http request"); @@ -141,7 +149,7 @@ impl Api { // We'll set username and such later using the startup message. // TODO: add more type safety (in progress). let mut config = compute::ConnCfg::new(); - config.host(&host).port(port).ssl_mode(SslMode::Disable); // TLS is not configured on compute nodes. + config.host(host).port(port).ssl_mode(SslMode::Disable); // TLS is not configured on compute nodes. 
let node = NodeInfo { config, @@ -162,69 +170,77 @@ impl super::Api for Api { #[tracing::instrument(skip_all)] async fn get_role_secret( &self, - extra: &ConsoleReqExtra, - creds: &ComputeUserInfo, - ) -> Result { - let ep = creds.endpoint.clone(); - let user = creds.inner.user.clone(); - if let Some(role_secret) = self.caches.role_secret.get(&(ep.clone(), user.clone())) { - return Ok(role_secret); + ctx: &mut RequestMonitoring, + user_info: &ComputeUserInfo, + ) -> Result, GetAuthInfoError> { + let ep = &user_info.endpoint; + let user = &user_info.user; + if let Some(role_secret) = self.caches.project_info.get_role_secret(ep, user) { + return Ok(Some(role_secret)); } - let auth_info = self.do_get_auth_info(extra, creds).await?; - let (_, secret) = self - .caches - .role_secret - .insert((ep.clone(), user), auth_info.secret.clone()); - self.caches - .allowed_ips - .insert(ep, Arc::new(auth_info.allowed_ips)); - Ok(secret) + let auth_info = self.do_get_auth_info(ctx, user_info).await?; + let project_id = auth_info.project_id.unwrap_or(ep.clone()); + if let Some(secret) = &auth_info.secret { + self.caches + .project_info + .insert_role_secret(&project_id, ep, user, secret.clone()) + } + self.caches.project_info.insert_allowed_ips( + &project_id, + ep, + Arc::new(auth_info.allowed_ips), + ); + // When we just got a secret, we don't need to invalidate it. 
+ Ok(auth_info.secret.map(Cached::new_uncached)) } async fn get_allowed_ips( &self, - extra: &ConsoleReqExtra, - creds: &ComputeUserInfo, - ) -> Result>, GetAuthInfoError> { - if let Some(allowed_ips) = self.caches.allowed_ips.get(&creds.endpoint) { + ctx: &mut RequestMonitoring, + user_info: &ComputeUserInfo, + ) -> Result { + let ep = &user_info.endpoint; + if let Some(allowed_ips) = self.caches.project_info.get_allowed_ips(ep) { ALLOWED_IPS_BY_CACHE_OUTCOME .with_label_values(&["hit"]) .inc(); - return Ok(Arc::new(allowed_ips.to_vec())); + return Ok(allowed_ips); } ALLOWED_IPS_BY_CACHE_OUTCOME .with_label_values(&["miss"]) .inc(); - let auth_info = self.do_get_auth_info(extra, creds).await?; + let auth_info = self.do_get_auth_info(ctx, user_info).await?; let allowed_ips = Arc::new(auth_info.allowed_ips); - let ep = creds.endpoint.clone(); - let user = creds.inner.user.clone(); + let user = &user_info.user; + let project_id = auth_info.project_id.unwrap_or(ep.clone()); + if let Some(secret) = &auth_info.secret { + self.caches + .project_info + .insert_role_secret(&project_id, ep, user, secret.clone()) + } self.caches - .role_secret - .insert((ep.clone(), user), auth_info.secret); - self.caches.allowed_ips.insert(ep, allowed_ips.clone()); - Ok(allowed_ips) + .project_info + .insert_allowed_ips(&project_id, ep, allowed_ips.clone()); + Ok(Cached::new_uncached(allowed_ips)) } #[tracing::instrument(skip_all)] async fn wake_compute( &self, - extra: &ConsoleReqExtra, - creds: &ComputeUserInfo, + ctx: &mut RequestMonitoring, + user_info: &ComputeUserInfo, ) -> Result { - let key: &str = &creds.inner.cache_key; + let key = user_info.endpoint_cache_key(); // Every time we do a wakeup http request, the compute node will stay up // for some time (highly depends on the console's scale-to-zero policy); // The connection info remains the same during that period of time, // which means that we might cache it to reduce the load and latency. 
- if let Some(cached) = self.caches.node_info.get(key) { - info!(key = key, "found cached compute node info"); + if let Some(cached) = self.caches.node_info.get(&*key) { + info!(key = &*key, "found cached compute node info"); return Ok(cached); } - let key: Arc = key.into(); - let permit = self.locks.get_wake_compute_permit(&key).await?; // after getting back a permit - it's possible the cache was filled @@ -236,7 +252,7 @@ impl super::Api for Api { } } - let node = self.do_wake_compute(extra, creds).await?; + let node = self.do_wake_compute(ctx, user_info).await?; let (_, cached) = self.caches.node_info.insert(key.clone(), node); info!(key = &*key, "created a cache entry for compute node info"); @@ -269,9 +285,10 @@ async fn parse_body serde::Deserialize<'a>>( Err(ApiError::Console { status, text }) } -fn parse_host_port(input: &str) -> Option<(String, u16)> { - let parsed: SocketAddr = input.parse().ok()?; - Some((parsed.ip().to_string(), parsed.port())) +fn parse_host_port(input: &str) -> Option<(&str, u16)> { + let (host, port) = input.rsplit_once(':')?; + let ipv6_brackets: &[_] = &['[', ']']; + Some((host.trim_matches(ipv6_brackets), port.parse().ok()?)) } #[cfg(test)] @@ -279,9 +296,24 @@ mod tests { use super::*; #[test] - fn test_parse_host_port() { + fn test_parse_host_port_v4() { let (host, port) = parse_host_port("127.0.0.1:5432").expect("failed to parse"); assert_eq!(host, "127.0.0.1"); assert_eq!(port, 5432); } + + #[test] + fn test_parse_host_port_v6() { + let (host, port) = parse_host_port("[2001:db8::1]:5432").expect("failed to parse"); + assert_eq!(host, "2001:db8::1"); + assert_eq!(port, 5432); + } + + #[test] + fn test_parse_host_port_url() { + let (host, port) = parse_host_port("compute-foo-bar-1234.default.svc.cluster.local:5432") + .expect("failed to parse"); + assert_eq!(host, "compute-foo-bar-1234.default.svc.cluster.local"); + assert_eq!(port, 5432); + } } diff --git a/proxy/src/context.rs b/proxy/src/context.rs new file mode 100644 index 
0000000000..47449cf59a --- /dev/null +++ b/proxy/src/context.rs @@ -0,0 +1,110 @@ +//! Connection request monitoring contexts + +use chrono::Utc; +use once_cell::sync::OnceCell; +use smol_str::SmolStr; +use std::net::IpAddr; +use tokio::sync::mpsc; +use uuid::Uuid; + +use crate::{console::messages::MetricsAuxInfo, error::ErrorKind, metrics::LatencyTimer}; + +pub mod parquet; + +static LOG_CHAN: OnceCell> = OnceCell::new(); + +#[derive(Clone)] +/// Context data for a single request to connect to a database. +/// +/// This data should **not** be used for connection logic, only for observability and limiting purposes. +/// All connection logic should instead use strongly typed state machines, not a bunch of Options. +pub struct RequestMonitoring { + pub peer_addr: IpAddr, + pub session_id: Uuid, + pub protocol: &'static str, + first_packet: chrono::DateTime, + region: &'static str, + + // filled in as they are discovered + project: Option, + branch: Option, + endpoint_id: Option, + user: Option, + application: Option, + error_kind: Option, + + // extra + // This sender is here to keep the request monitoring channel open while requests are taking place. 
+ sender: Option>, + pub latency_timer: LatencyTimer, +} + +impl RequestMonitoring { + pub fn new( + session_id: Uuid, + peer_addr: IpAddr, + protocol: &'static str, + region: &'static str, + ) -> Self { + Self { + peer_addr, + session_id, + protocol, + first_packet: Utc::now(), + region, + + project: None, + branch: None, + endpoint_id: None, + user: None, + application: None, + error_kind: None, + + sender: LOG_CHAN.get().and_then(|tx| tx.upgrade()), + latency_timer: LatencyTimer::new(protocol), + } + } + + #[cfg(test)] + pub fn test() -> Self { + RequestMonitoring::new(Uuid::now_v7(), [127, 0, 0, 1].into(), "test", "test") + } + + pub fn console_application_name(&self) -> String { + format!( + "{}/{}", + self.application.as_deref().unwrap_or_default(), + self.protocol + ) + } + + pub fn set_project(&mut self, x: MetricsAuxInfo) { + self.branch = Some(x.branch_id); + self.endpoint_id = Some(x.endpoint_id); + self.project = Some(x.project_id); + } + + pub fn set_endpoint_id(&mut self, endpoint_id: Option) { + self.endpoint_id = endpoint_id.or_else(|| self.endpoint_id.clone()); + } + + pub fn set_application(&mut self, app: Option) { + self.application = app.or_else(|| self.application.clone()); + } + + pub fn set_user(&mut self, user: SmolStr) { + self.user = Some(user); + } + + pub fn log(&mut self) { + if let Some(tx) = self.sender.take() { + let _: Result<(), _> = tx.send(self.clone()); + } + } +} + +impl Drop for RequestMonitoring { + fn drop(&mut self) { + self.log() + } +} diff --git a/proxy/src/context/parquet.rs b/proxy/src/context/parquet.rs new file mode 100644 index 0000000000..ca4eff5ddf --- /dev/null +++ b/proxy/src/context/parquet.rs @@ -0,0 +1,641 @@ +use std::sync::Arc; + +use anyhow::Context; +use bytes::BytesMut; +use futures::{Stream, StreamExt}; +use parquet::{ + basic::Compression, + file::{ + metadata::RowGroupMetaDataPtr, + properties::{WriterProperties, WriterPropertiesPtr, DEFAULT_PAGE_SIZE}, + writer::SerializedFileWriter, + }, + 
record::RecordWriter, +}; +use remote_storage::{GenericRemoteStorage, RemotePath, RemoteStorageConfig}; +use tokio::{sync::mpsc, time}; +use tokio_util::sync::CancellationToken; +use tracing::{debug, info, Span}; +use utils::backoff; + +use super::{RequestMonitoring, LOG_CHAN}; + +#[derive(clap::Args, Clone, Debug)] +pub struct ParquetUploadArgs { + /// Storage location to upload the parquet files to. + /// Encoded as toml (same format as pageservers), eg + /// `{bucket_name='the-bucket',bucket_region='us-east-1',prefix_in_bucket='proxy',endpoint='http://minio:9000'}` + #[clap(long, default_value = "{}", value_parser = remote_storage_from_toml)] + parquet_upload_remote_storage: OptRemoteStorageConfig, + + /// How many rows to include in a row group + #[clap(long, default_value_t = 8192)] + parquet_upload_row_group_size: usize, + + /// How large each column page should be in bytes + #[clap(long, default_value_t = DEFAULT_PAGE_SIZE)] + parquet_upload_page_size: usize, + + /// How large the total parquet file should be in bytes + #[clap(long, default_value_t = 100_000_000)] + parquet_upload_size: i64, + + /// How long to wait before forcing a file upload + #[clap(long, default_value = "20m", value_parser = humantime::parse_duration)] + parquet_upload_maximum_duration: tokio::time::Duration, + + /// What level of compression to use + #[clap(long, default_value_t = Compression::UNCOMPRESSED)] + parquet_upload_compression: Compression, +} + +/// Hack to avoid clap being smarter. If you don't use this type alias, clap assumes more about the optional state and you get +/// runtime type errors from the value parser we use. +type OptRemoteStorageConfig = Option; + +fn remote_storage_from_toml(s: &str) -> anyhow::Result { + RemoteStorageConfig::from_toml(&s.parse()?) +} + +// Occasional network issues and such can cause remote operations to fail, and +// that's expected. If a upload fails, we log it at info-level, and retry. 
+// But after FAILED_UPLOAD_WARN_THRESHOLD retries, we start to log it at WARN +// level instead, as repeated failures can mean a more serious problem. If it +// fails more than FAILED_UPLOAD_RETRIES times, we give up +pub(crate) const FAILED_UPLOAD_WARN_THRESHOLD: u32 = 3; +pub(crate) const FAILED_UPLOAD_MAX_RETRIES: u32 = 10; + +// the parquet crate leaves a lot to be desired... +// what follows is an attempt to write parquet files with minimal allocs. +// complication: parquet is a columnar format, while we want to write in as rows. +// design: +// * we batch up to 1024 rows, then flush them into a 'row group' +// * after each rowgroup write, we check the length of the file and upload to s3 if large enough + +#[derive(parquet_derive::ParquetRecordWriter)] +struct RequestData { + region: &'static str, + protocol: &'static str, + /// Must be UTC. The derive macro doesn't like the timezones + timestamp: chrono::NaiveDateTime, + session_id: uuid::Uuid, + peer_addr: String, + username: Option, + application_name: Option, + endpoint_id: Option, + project: Option, + branch: Option, + error: Option<&'static str>, +} + +impl From for RequestData { + fn from(value: RequestMonitoring) -> Self { + Self { + session_id: value.session_id, + peer_addr: value.peer_addr.to_string(), + timestamp: value.first_packet.naive_utc(), + username: value.user.as_deref().map(String::from), + application_name: value.application.as_deref().map(String::from), + endpoint_id: value.endpoint_id.as_deref().map(String::from), + project: value.project.as_deref().map(String::from), + branch: value.branch.as_deref().map(String::from), + protocol: value.protocol, + region: value.region, + error: value.error_kind.as_ref().map(|e| e.to_str()), + } + } +} + +/// Parquet request context worker +/// +/// It listened on a channel for all completed requests, extracts the data and writes it into a parquet file, +/// then uploads a completed batch to S3 +pub async fn worker( + cancellation_token: 
CancellationToken, + config: ParquetUploadArgs, +) -> anyhow::Result<()> { + let Some(remote_storage_config) = config.parquet_upload_remote_storage else { + tracing::warn!("parquet request upload: no s3 bucket configured"); + return Ok(()); + }; + + let (tx, mut rx) = mpsc::unbounded_channel(); + LOG_CHAN.set(tx.downgrade()).unwrap(); + + // setup row stream that will close on cancellation + tokio::spawn(async move { + cancellation_token.cancelled().await; + // dropping this sender will cause the channel to close only once + // all the remaining inflight requests have been completed. + drop(tx); + }); + let rx = futures::stream::poll_fn(move |cx| rx.poll_recv(cx)); + let rx = rx.map(RequestData::from); + + let storage = + GenericRemoteStorage::from_config(&remote_storage_config).context("remote storage init")?; + + let properties = WriterProperties::builder() + .set_data_page_size_limit(config.parquet_upload_page_size) + .set_compression(config.parquet_upload_compression); + + let parquet_config = ParquetConfig { + propeties: Arc::new(properties.build()), + rows_per_group: config.parquet_upload_row_group_size, + file_size: config.parquet_upload_size, + max_duration: config.parquet_upload_maximum_duration, + + #[cfg(any(test, feature = "testing"))] + test_remote_failures: 0, + }; + + worker_inner(storage, rx, parquet_config).await +} + +struct ParquetConfig { + propeties: WriterPropertiesPtr, + rows_per_group: usize, + file_size: i64, + + max_duration: tokio::time::Duration, + + #[cfg(any(test, feature = "testing"))] + test_remote_failures: u64, +} + +async fn worker_inner( + storage: GenericRemoteStorage, + rx: impl Stream, + config: ParquetConfig, +) -> anyhow::Result<()> { + #[cfg(any(test, feature = "testing"))] + let storage = if config.test_remote_failures > 0 { + GenericRemoteStorage::unreliable_wrapper(storage, config.test_remote_failures) + } else { + storage + }; + + let mut rx = std::pin::pin!(rx); + + let mut rows = 
Vec::with_capacity(config.rows_per_group); + + let schema = rows.as_slice().schema()?; + let file = BytesWriter::default(); + let mut w = SerializedFileWriter::new(file, schema.clone(), config.propeties.clone())?; + + let mut last_upload = time::Instant::now(); + + let mut len = 0; + while let Some(row) = rx.next().await { + rows.push(row); + let force = last_upload.elapsed() > config.max_duration; + if rows.len() == config.rows_per_group || force { + let rg_meta; + (rows, w, rg_meta) = flush_rows(rows, w).await?; + len += rg_meta.compressed_size(); + } + if len > config.file_size || force { + last_upload = time::Instant::now(); + let file = upload_parquet(w, len, &storage).await?; + w = SerializedFileWriter::new(file, schema.clone(), config.propeties.clone())?; + len = 0; + } + } + + if !rows.is_empty() { + let rg_meta; + (_, w, rg_meta) = flush_rows(rows, w).await?; + len += rg_meta.compressed_size(); + } + + if !w.flushed_row_groups().is_empty() { + let _: BytesWriter = upload_parquet(w, len, &storage).await?; + } + + Ok(()) +} + +async fn flush_rows( + rows: Vec, + mut w: SerializedFileWriter, +) -> anyhow::Result<( + Vec, + SerializedFileWriter, + RowGroupMetaDataPtr, +)> { + let span = Span::current(); + let (mut rows, w, rg_meta) = tokio::task::spawn_blocking(move || { + let _enter = span.enter(); + + let mut rg = w.next_row_group()?; + rows.as_slice().write_to_row_group(&mut rg)?; + let rg_meta = rg.close()?; + + let size = rg_meta.compressed_size(); + let compression = rg_meta.compressed_size() as f64 / rg_meta.total_byte_size() as f64; + + debug!(size, compression, "flushed row group to parquet file"); + + Ok::<_, parquet::errors::ParquetError>((rows, w, rg_meta)) + }) + .await + .unwrap()?; + + rows.clear(); + Ok((rows, w, rg_meta)) +} + +async fn upload_parquet( + w: SerializedFileWriter, + len: i64, + storage: &GenericRemoteStorage, +) -> anyhow::Result { + let len_uncompressed = w + .flushed_row_groups() + .iter() + .map(|rg| rg.total_byte_size()) + 
.sum::(); + + // I don't know how compute intensive this is, although it probably isn't much... better be safe than sorry. + // finish method only available on the fork: https://github.com/apache/arrow-rs/issues/5253 + let (mut file, metadata) = tokio::task::spawn_blocking(move || w.finish()) + .await + .unwrap()?; + + let data = file.buf.split().freeze(); + + let compression = len as f64 / len_uncompressed as f64; + let size = data.len(); + let id = uuid::Uuid::now_v7(); + + info!( + %id, + rows = metadata.num_rows, + size, compression, "uploading request parquet file" + ); + + let path = RemotePath::from_string(&format!("requests_{id}.parquet"))?; + backoff::retry( + || async { + let stream = futures::stream::once(futures::future::ready(Ok(data.clone()))); + storage.upload(stream, data.len(), &path, None).await + }, + |_e| false, + FAILED_UPLOAD_WARN_THRESHOLD, + FAILED_UPLOAD_MAX_RETRIES, + "request_data_upload", + // we don't want cancellation to interrupt here, so we make a dummy cancel token + backoff::Cancel::new(CancellationToken::new(), || anyhow::anyhow!("Cancelled")), + ) + .await + .context("request_data_upload")?; + + Ok(file) +} + +// why doesn't BytesMut impl io::Write? 
+#[derive(Default)] +struct BytesWriter { + buf: BytesMut, +} + +impl std::io::Write for BytesWriter { + fn write(&mut self, buf: &[u8]) -> std::io::Result { + self.buf.extend_from_slice(buf); + Ok(buf.len()) + } + + fn flush(&mut self) -> std::io::Result<()> { + Ok(()) + } +} + +#[cfg(test)] +mod tests { + use std::{net::Ipv4Addr, num::NonZeroUsize, sync::Arc}; + + use camino::Utf8Path; + use clap::Parser; + use futures::{Stream, StreamExt}; + use itertools::Itertools; + use parquet::{ + basic::{Compression, ZstdLevel}, + file::{ + properties::{WriterProperties, DEFAULT_PAGE_SIZE}, + reader::FileReader, + serialized_reader::SerializedFileReader, + }, + }; + use rand::{rngs::StdRng, Rng, SeedableRng}; + use remote_storage::{ + GenericRemoteStorage, RemoteStorageConfig, RemoteStorageKind, S3Config, + DEFAULT_MAX_KEYS_PER_LIST_RESPONSE, DEFAULT_REMOTE_STORAGE_S3_CONCURRENCY_LIMIT, + }; + use tokio::{sync::mpsc, time}; + + use super::{worker_inner, ParquetConfig, ParquetUploadArgs, RequestData}; + + #[derive(Parser)] + struct ProxyCliArgs { + #[clap(flatten)] + parquet_upload: ParquetUploadArgs, + } + + #[test] + fn default_parser() { + let ProxyCliArgs { parquet_upload } = ProxyCliArgs::parse_from(["proxy"]); + assert_eq!(parquet_upload.parquet_upload_remote_storage, None); + assert_eq!(parquet_upload.parquet_upload_row_group_size, 8192); + assert_eq!(parquet_upload.parquet_upload_page_size, DEFAULT_PAGE_SIZE); + assert_eq!(parquet_upload.parquet_upload_size, 100_000_000); + assert_eq!( + parquet_upload.parquet_upload_maximum_duration, + time::Duration::from_secs(20 * 60) + ); + assert_eq!( + parquet_upload.parquet_upload_compression, + Compression::UNCOMPRESSED + ); + } + + #[test] + fn full_parser() { + let ProxyCliArgs { parquet_upload } = ProxyCliArgs::parse_from([ + "proxy", + "--parquet-upload-remote-storage", + "{bucket_name='default',prefix_in_bucket='proxy/',bucket_region='us-east-1',endpoint='http://minio:9000'}", + "--parquet-upload-row-group-size", + 
"100", + "--parquet-upload-page-size", + "10000", + "--parquet-upload-size", + "10000000", + "--parquet-upload-maximum-duration", + "10m", + "--parquet-upload-compression", + "zstd(5)", + ]); + assert_eq!( + parquet_upload.parquet_upload_remote_storage, + Some(RemoteStorageConfig { + storage: RemoteStorageKind::AwsS3(S3Config { + bucket_name: "default".into(), + bucket_region: "us-east-1".into(), + prefix_in_bucket: Some("proxy/".into()), + endpoint: Some("http://minio:9000".into()), + concurrency_limit: NonZeroUsize::new( + DEFAULT_REMOTE_STORAGE_S3_CONCURRENCY_LIMIT + ) + .unwrap(), + max_keys_per_list_response: DEFAULT_MAX_KEYS_PER_LIST_RESPONSE, + }) + }) + ); + assert_eq!(parquet_upload.parquet_upload_row_group_size, 100); + assert_eq!(parquet_upload.parquet_upload_page_size, 10000); + assert_eq!(parquet_upload.parquet_upload_size, 10_000_000); + assert_eq!( + parquet_upload.parquet_upload_maximum_duration, + time::Duration::from_secs(10 * 60) + ); + assert_eq!( + parquet_upload.parquet_upload_compression, + Compression::ZSTD(ZstdLevel::try_new(5).unwrap()) + ); + } + + fn generate_request_data(rng: &mut impl Rng) -> RequestData { + RequestData { + session_id: uuid::Builder::from_random_bytes(rng.gen()).into_uuid(), + peer_addr: Ipv4Addr::from(rng.gen::<[u8; 4]>()).to_string(), + timestamp: chrono::NaiveDateTime::from_timestamp_millis( + rng.gen_range(1703862754..1803862754), + ) + .unwrap(), + application_name: Some("test".to_owned()), + username: Some(hex::encode(rng.gen::<[u8; 4]>())), + endpoint_id: Some(hex::encode(rng.gen::<[u8; 16]>())), + project: Some(hex::encode(rng.gen::<[u8; 16]>())), + branch: Some(hex::encode(rng.gen::<[u8; 16]>())), + protocol: ["tcp", "ws", "http"][rng.gen_range(0..3)], + region: "us-east-1", + error: None, + } + } + + fn random_stream(len: usize) -> impl Stream + Unpin { + let mut rng = StdRng::from_seed([0x39; 32]); + futures::stream::iter( + std::iter::repeat_with(move || generate_request_data(&mut rng)).take(len), + ) + } + 
+ async fn run_test( + tmpdir: &Utf8Path, + config: ParquetConfig, + rx: impl Stream, + ) -> Vec<(u64, usize, i64)> { + let remote_storage_config = RemoteStorageConfig { + storage: RemoteStorageKind::LocalFs(tmpdir.to_path_buf()), + }; + let storage = GenericRemoteStorage::from_config(&remote_storage_config).unwrap(); + + worker_inner(storage, rx, config).await.unwrap(); + + let mut files = std::fs::read_dir(tmpdir.as_std_path()) + .unwrap() + .map(|entry| entry.unwrap().path()) + .collect_vec(); + files.sort(); + + files + .into_iter() + .map(|path| std::fs::File::open(tmpdir.as_std_path().join(path)).unwrap()) + .map(|file| { + ( + file.metadata().unwrap(), + SerializedFileReader::new(file).unwrap().metadata().clone(), + ) + }) + .map(|(file_meta, parquet_meta)| { + ( + file_meta.len(), + parquet_meta.num_row_groups(), + parquet_meta.file_metadata().num_rows(), + ) + }) + .collect() + } + + #[tokio::test] + async fn verify_parquet_no_compression() { + let tmpdir = camino_tempfile::tempdir().unwrap(); + + let config = ParquetConfig { + propeties: Arc::new(WriterProperties::new()), + rows_per_group: 2_000, + file_size: 1_000_000, + max_duration: time::Duration::from_secs(20 * 60), + test_remote_failures: 0, + }; + + let rx = random_stream(50_000); + let file_stats = run_test(tmpdir.path(), config, rx).await; + + assert_eq!( + file_stats, + [ + (1029153, 3, 6000), + (1029075, 3, 6000), + (1029216, 3, 6000), + (1029129, 3, 6000), + (1029250, 3, 6000), + (1029017, 3, 6000), + (1029175, 3, 6000), + (1029247, 3, 6000), + (343124, 1, 2000) + ], + ); + + tmpdir.close().unwrap(); + } + + #[tokio::test] + async fn verify_parquet_min_compression() { + let tmpdir = camino_tempfile::tempdir().unwrap(); + + let config = ParquetConfig { + propeties: Arc::new( + WriterProperties::builder() + .set_compression(parquet::basic::Compression::ZSTD(ZstdLevel::default())) + .build(), + ), + rows_per_group: 2_000, + file_size: 1_000_000, + max_duration: time::Duration::from_secs(20 * 60), 
+ test_remote_failures: 0, + }; + + let rx = random_stream(50_000); + let file_stats = run_test(tmpdir.path(), config, rx).await; + + // with compression, there are fewer files with more rows per file + assert_eq!( + file_stats, + [ + (1166201, 6, 12000), + (1163577, 6, 12000), + (1164641, 6, 12000), + (1168772, 6, 12000), + (196761, 1, 2000) + ], + ); + + tmpdir.close().unwrap(); + } + + #[tokio::test] + async fn verify_parquet_strong_compression() { + let tmpdir = camino_tempfile::tempdir().unwrap(); + + let config = ParquetConfig { + propeties: Arc::new( + WriterProperties::builder() + .set_compression(parquet::basic::Compression::ZSTD( + ZstdLevel::try_new(10).unwrap(), + )) + .build(), + ), + rows_per_group: 2_000, + file_size: 1_000_000, + max_duration: time::Duration::from_secs(20 * 60), + test_remote_failures: 0, + }; + + let rx = random_stream(50_000); + let file_stats = run_test(tmpdir.path(), config, rx).await; + + // with strong compression, the files are smaller + assert_eq!( + file_stats, + [ + (1144934, 6, 12000), + (1144941, 6, 12000), + (1144735, 6, 12000), + (1144936, 6, 12000), + (191035, 1, 2000) + ], + ); + + tmpdir.close().unwrap(); + } + + #[tokio::test] + async fn verify_parquet_unreliable_upload() { + let tmpdir = camino_tempfile::tempdir().unwrap(); + + let config = ParquetConfig { + propeties: Arc::new(WriterProperties::new()), + rows_per_group: 2_000, + file_size: 1_000_000, + max_duration: time::Duration::from_secs(20 * 60), + test_remote_failures: 2, + }; + + let rx = random_stream(50_000); + let file_stats = run_test(tmpdir.path(), config, rx).await; + + assert_eq!( + file_stats, + [ + (1029153, 3, 6000), + (1029075, 3, 6000), + (1029216, 3, 6000), + (1029129, 3, 6000), + (1029250, 3, 6000), + (1029017, 3, 6000), + (1029175, 3, 6000), + (1029247, 3, 6000), + (343124, 1, 2000) + ], + ); + + tmpdir.close().unwrap(); + } + + #[tokio::test(start_paused = true)] + async fn verify_parquet_regular_upload() { + let tmpdir = 
camino_tempfile::tempdir().unwrap(); + + let config = ParquetConfig { + propeties: Arc::new(WriterProperties::new()), + rows_per_group: 2_000, + file_size: 1_000_000, + max_duration: time::Duration::from_secs(60), + test_remote_failures: 2, + }; + + let (tx, mut rx) = mpsc::unbounded_channel(); + + tokio::spawn(async move { + for _ in 0..3 { + let mut s = random_stream(3000); + while let Some(r) = s.next().await { + tx.send(r).unwrap(); + } + time::sleep(time::Duration::from_secs(70)).await + } + }); + + let rx = futures::stream::poll_fn(move |cx| rx.poll_recv(cx)); + let file_stats = run_test(tmpdir.path(), config, rx).await; + + // files are smaller than the size threshold, but they took too long to fill so were flushed early + assert_eq!( + file_stats, + [(515807, 2, 3001), (515585, 2, 3000), (515425, 2, 2999)], + ); + + tmpdir.close().unwrap(); + } +} diff --git a/proxy/src/error.rs b/proxy/src/error.rs index f1cb44b1a8..5b2dd7ecfd 100644 --- a/proxy/src/error.rs +++ b/proxy/src/error.rs @@ -28,3 +28,37 @@ pub trait UserFacingError: fmt::Display { self.to_string() } } + +#[derive(Clone)] +pub enum ErrorKind { + /// Wrong password, unknown endpoint, protocol violation, etc... + User, + + /// Network error between user and proxy. 
Not necessarily user error + Disconnect, + + /// Proxy self-imposed rate limits + RateLimit, + + /// internal errors + Service, + + /// Error communicating with control plane + ControlPlane, + + /// Error communicating with compute + Compute, +} + +impl ErrorKind { + pub fn to_str(&self) -> &'static str { + match self { + ErrorKind::User => "request failed due to user error", + ErrorKind::Disconnect => "client disconnected", + ErrorKind::RateLimit => "request cancelled due to rate limit", + ErrorKind::Service => "internal service error", + ErrorKind::ControlPlane => "non-retryable control plane error", + ErrorKind::Compute => "non-retryable compute error (or exhausted retry capacity)", + } + } +} diff --git a/proxy/src/lib.rs b/proxy/src/lib.rs index 2da1eaf482..a22b2459b8 100644 --- a/proxy/src/lib.rs +++ b/proxy/src/lib.rs @@ -13,6 +13,7 @@ pub mod cancellation; pub mod compute; pub mod config; pub mod console; +pub mod context; pub mod error; pub mod http; pub mod logging; @@ -21,6 +22,7 @@ pub mod parse; pub mod protocol2; pub mod proxy; pub mod rate_limiter; +pub mod redis; pub mod sasl; pub mod scram; pub mod serverless; diff --git a/proxy/src/metrics.rs b/proxy/src/metrics.rs index 8e2a6105b1..6e4cbb3f3a 100644 --- a/proxy/src/metrics.rs +++ b/proxy/src/metrics.rs @@ -115,11 +115,12 @@ pub static ALLOWED_IPS_NUMBER: Lazy = Lazy::new(|| { .unwrap() }); +#[derive(Clone)] pub struct LatencyTimer { // time since the stopwatch was started start: Option, // accumulated time on the stopwatch - accumulated: std::time::Duration, + pub accumulated: std::time::Duration, // label data protocol: &'static str, cache_miss: bool, @@ -160,7 +161,12 @@ impl LatencyTimer { self.pool_miss = false; } - pub fn success(mut self) { + pub fn success(&mut self) { + // stop the stopwatch and record the time that we have accumulated + let start = self.start.take().expect("latency timer should be started"); + self.accumulated += start.elapsed(); + + // success self.outcome = "success"; 
} } diff --git a/proxy/src/proxy.rs b/proxy/src/proxy.rs index 17e910860c..84b4c266e6 100644 --- a/proxy/src/proxy.rs +++ b/proxy/src/proxy.rs @@ -9,9 +9,10 @@ use crate::{ cancellation::{self, CancelMap}, compute, config::{AuthenticationConfig, ProxyConfig, TlsConfig}, - console::{self, messages::MetricsAuxInfo}, + console::messages::MetricsAuxInfo, + context::RequestMonitoring, metrics::{ - LatencyTimer, NUM_BYTES_PROXIED_COUNTER, NUM_BYTES_PROXIED_PER_CLIENT_COUNTER, + NUM_BYTES_PROXIED_COUNTER, NUM_BYTES_PROXIED_PER_CLIENT_COUNTER, NUM_CLIENT_CONNECTION_GAUGE, NUM_CONNECTION_REQUESTS_GAUGE, }, protocol2::WithClientIp, @@ -25,7 +26,8 @@ use itertools::Itertools; use once_cell::sync::OnceCell; use pq_proto::{BeMessage as Be, FeStartupPacket, StartupMessageParams}; use regex::Regex; -use std::{net::IpAddr, sync::Arc}; +use smol_str::SmolStr; +use std::sync::Arc; use tokio::io::{AsyncRead, AsyncWrite, AsyncWriteExt}; use tokio_util::sync::CancellationToken; use tracing::{error, info, info_span, Instrument}; @@ -82,14 +84,16 @@ pub async fn task_main( info!("accepted postgres client connection"); let mut socket = WithClientIp::new(socket); - let mut peer_addr = peer_addr; - if let Some(ip) = socket.wait_for_addr().await? { - peer_addr = ip; - tracing::Span::current().record("peer_addr", &tracing::field::display(ip)); + let mut peer_addr = peer_addr.ip(); + if let Some(addr) = socket.wait_for_addr().await? 
{ + peer_addr = addr.ip(); + tracing::Span::current().record("peer_addr", &tracing::field::display(addr)); } else if config.require_client_ip { bail!("missing required client IP"); } + let mut ctx = RequestMonitoring::new(session_id, peer_addr, "tcp", &config.region); + socket .inner .set_nodelay(true) @@ -97,11 +101,10 @@ pub async fn task_main( handle_client( config, + &mut ctx, &cancel_map, - session_id, socket, ClientMode::Tcp, - peer_addr.ip(), endpoint_rate_limiter, ) .await @@ -134,13 +137,6 @@ pub enum ClientMode { /// Abstracts the logic of handling TCP vs WS clients impl ClientMode { - fn protocol_label(&self) -> &'static str { - match self { - ClientMode::Tcp => "tcp", - ClientMode::Websockets { .. } => "ws", - } - } - fn allow_cleartext(&self) -> bool { match self { ClientMode::Tcp => false, @@ -173,19 +169,18 @@ impl ClientMode { pub async fn handle_client( config: &'static ProxyConfig, + ctx: &mut RequestMonitoring, cancel_map: &CancelMap, - session_id: uuid::Uuid, stream: S, mode: ClientMode, - peer_addr: IpAddr, endpoint_rate_limiter: Arc, ) -> anyhow::Result<()> { info!( - protocol = mode.protocol_label(), + protocol = ctx.protocol, "handling interactive connection from client" ); - let proto = mode.protocol_label(); + let proto = ctx.protocol; let _client_gauge = NUM_CLIENT_CONNECTION_GAUGE .with_label_values(&[proto]) .guard(); @@ -195,38 +190,46 @@ pub async fn handle_client( let tls = config.tls_config.as_ref(); + let pause = ctx.latency_timer.pause(); let do_handshake = handshake(stream, mode.handshake_tls(tls), cancel_map); let (mut stream, params) = match do_handshake.await? { Some(x) => x, None => return Ok(()), // it's a cancellation request }; + drop(pause); // Extract credentials which we're going to use for auth. 
- let creds = { + let user_info = { let hostname = mode.hostname(stream.get_ref()); - let common_names = tls.and_then(|tls| tls.common_names.clone()); + + let common_names = tls.map(|tls| &tls.common_names); let result = config .auth_backend .as_ref() - .map(|_| auth::ClientCredentials::parse(¶ms, hostname, common_names, peer_addr)) + .map(|_| { + auth::ComputeUserInfoMaybeEndpoint::parse(ctx, ¶ms, hostname, common_names) + }) .transpose(); match result { - Ok(creds) => creds, + Ok(user_info) => user_info, Err(e) => stream.throw_error(e).await?, } }; + ctx.set_endpoint_id(user_info.get_endpoint()); + let client = Client::new( stream, - creds, + user_info, ¶ms, - session_id, mode.allow_self_signed_compute(config), endpoint_rate_limiter, ); cancel_map - .with_session(|session| client.connect_to_db(session, mode, &config.authentication_config)) + .with_session(|session| { + client.connect_to_db(ctx, session, mode, &config.authentication_config) + }) .await } @@ -348,10 +351,13 @@ async fn prepare_client_connection( /// Forward bytes in both directions (client <-> compute). #[tracing::instrument(skip_all)] pub async fn proxy_pass( + ctx: &mut RequestMonitoring, client: impl AsyncRead + AsyncWrite + Unpin, compute: impl AsyncRead + AsyncWrite + Unpin, aux: MetricsAuxInfo, ) -> anyhow::Result<()> { + ctx.log(); + let usage = USAGE_METRICS.register(Ids { endpoint_id: aux.endpoint_id.clone(), branch_id: aux.branch_id.clone(), @@ -394,11 +400,9 @@ struct Client<'a, S> { /// The underlying libpq protocol stream. stream: PqStream>, /// Client credentials that we care about. - creds: auth::BackendType<'a, auth::ClientCredentials>, + user_info: auth::BackendType<'a, auth::ComputeUserInfoMaybeEndpoint>, /// KV-dictionary with PostgreSQL connection params. params: &'a StartupMessageParams, - /// Unique connection ID. - session_id: uuid::Uuid, /// Allow self-signed certificates (for testing). 
allow_self_signed_compute: bool, /// Rate limiter for endpoints @@ -409,17 +413,15 @@ impl<'a, S> Client<'a, S> { /// Construct a new connection context. fn new( stream: PqStream>, - creds: auth::BackendType<'a, auth::ClientCredentials>, + user_info: auth::BackendType<'a, auth::ComputeUserInfoMaybeEndpoint>, params: &'a StartupMessageParams, - session_id: uuid::Uuid, allow_self_signed_compute: bool, endpoint_rate_limiter: Arc, ) -> Self { Self { stream, - creds, + user_info, params, - session_id, allow_self_signed_compute, endpoint_rate_limiter, } @@ -430,24 +432,24 @@ impl Client<'_, S> { /// Let the client authenticate and connect to the designated compute node. // Instrumentation logs endpoint name everywhere. Doesn't work for link // auth; strictly speaking we don't know endpoint name in its case. - #[tracing::instrument(name = "", fields(ep = %self.creds.get_endpoint().unwrap_or_default()), skip_all)] + #[tracing::instrument(name = "", fields(ep = %self.user_info.get_endpoint().unwrap_or_default()), skip_all)] async fn connect_to_db( self, + ctx: &mut RequestMonitoring, session: cancellation::Session<'_>, mode: ClientMode, config: &'static AuthenticationConfig, ) -> anyhow::Result<()> { let Self { mut stream, - creds, + user_info, params, - session_id, allow_self_signed_compute, endpoint_rate_limiter, } = self; // check rate limit - if let Some(ep) = creds.get_endpoint() { + if let Some(ep) = user_info.get_endpoint() { if !endpoint_rate_limiter.check(ep) { return stream .throw_error(auth::AuthError::too_many_connections()) @@ -455,27 +457,9 @@ impl Client<'_, S> { } } - let proto = mode.protocol_label(); - let extra = console::ConsoleReqExtra { - session_id, // aka this connection's id - application_name: format!( - "{}/{}", - params.get("application_name").unwrap_or_default(), - proto - ), - options: neon_options(params), - }; - let mut latency_timer = LatencyTimer::new(proto); - - let user = creds.get_user().to_owned(); - let auth_result = match creds - 
.authenticate( - &extra, - &mut stream, - mode.allow_cleartext(), - config, - &mut latency_timer, - ) + let user = user_info.get_user().to_owned(); + let auth_result = match user_info + .authenticate(ctx, &mut stream, mode.allow_cleartext(), config) .await { Ok(auth_result) => auth_result, @@ -488,20 +472,14 @@ impl Client<'_, S> { } }; - let (mut node_info, creds) = auth_result; + let (mut node_info, user_info) = auth_result; node_info.allow_self_signed_compute = allow_self_signed_compute; let aux = node_info.aux.clone(); - let mut node = connect_to_compute( - &TcpMechanism { params, proto }, - node_info, - &extra, - &creds, - latency_timer, - ) - .or_else(|e| stream.throw_error(e)) - .await?; + let mut node = connect_to_compute(ctx, &TcpMechanism { params }, node_info, &user_info) + .or_else(|e| stream.throw_error(e)) + .await?; prepare_client_connection(&node, session, &mut stream).await?; // Before proxy passing, forward to compute whatever data is left in the @@ -510,33 +488,56 @@ impl Client<'_, S> { // immediately after opening the connection. 
let (stream, read_buf) = stream.into_inner(); node.stream.write_all(&read_buf).await?; - proxy_pass(stream, node.stream, aux).await + proxy_pass(ctx, stream, node.stream, aux).await } } -pub fn neon_options(params: &StartupMessageParams) -> Vec<(String, String)> { - #[allow(unstable_name_collisions)] - match params.options_raw() { - Some(options) => options.filter_map(neon_option).collect(), - None => vec![], +#[derive(Debug, Clone, PartialEq, Eq, Default)] +pub struct NeonOptions(Vec<(SmolStr, SmolStr)>); + +impl NeonOptions { + pub fn parse_params(params: &StartupMessageParams) -> Self { + params + .options_raw() + .map(Self::parse_from_iter) + .unwrap_or_default() + } + pub fn parse_options_raw(options: &str) -> Self { + Self::parse_from_iter(StartupMessageParams::parse_options_raw(options)) + } + + fn parse_from_iter<'a>(options: impl Iterator) -> Self { + let mut options = options + .filter_map(neon_option) + .map(|(k, v)| (k.into(), v.into())) + .collect_vec(); + options.sort(); + Self(options) + } + + pub fn get_cache_key(&self, prefix: &str) -> SmolStr { + // prefix + format!(" {k}:{v}") + // kinda jank because SmolStr is immutable + std::iter::once(prefix) + .chain(self.0.iter().flat_map(|(k, v)| [" ", &**k, ":", &**v])) + .collect() + } + + /// DeepObject format + /// `paramName[prop1]=value1¶mName[prop2]=value2&...` + pub fn to_deep_object(&self) -> Vec<(String, SmolStr)> { + self.0 + .iter() + .map(|(k, v)| (format!("options[{}]", k), v.clone())) + .collect() } } -pub fn neon_options_str(params: &StartupMessageParams) -> String { - #[allow(unstable_name_collisions)] - neon_options(params) - .iter() - .map(|(k, v)| format!("{}:{}", k, v)) - .sorted() // we sort it to use as cache key - .intersperse(" ".to_owned()) - .collect() -} - -pub fn neon_option(bytes: &str) -> Option<(String, String)> { +pub fn neon_option(bytes: &str) -> Option<(&str, &str)> { static RE: OnceCell = OnceCell::new(); let re = RE.get_or_init(|| 
Regex::new(r"^neon_(\w+):(.+)").unwrap()); let cap = re.captures(bytes)?; let (_, [k, v]) = cap.extract(); - Some((k.to_owned(), v.to_owned())) + Some((k, v)) } diff --git a/proxy/src/proxy/connect_compute.rs b/proxy/src/proxy/connect_compute.rs index 88b0019c49..72cab1fe5d 100644 --- a/proxy/src/proxy/connect_compute.rs +++ b/proxy/src/proxy/connect_compute.rs @@ -2,7 +2,8 @@ use crate::{ auth, compute::{self, PostgresConnection}, console::{self, errors::WakeComputeError, Api}, - metrics::{bool_to_str, LatencyTimer, NUM_CONNECTION_FAILURES, NUM_WAKEUP_FAILURES}, + context::RequestMonitoring, + metrics::{bool_to_str, NUM_CONNECTION_FAILURES, NUM_WAKEUP_FAILURES}, proxy::retry::{retry_after, ShouldRetry}, }; use async_trait::async_trait; @@ -35,15 +36,15 @@ pub fn invalidate_cache(node_info: console::CachedNodeInfo) -> compute::ConnCfg /// Try to connect to the compute node once. #[tracing::instrument(name = "connect_once", fields(pid = tracing::field::Empty), skip_all)] async fn connect_to_compute_once( + ctx: &mut RequestMonitoring, node_info: &console::CachedNodeInfo, timeout: time::Duration, - proto: &'static str, ) -> Result { let allow_self_signed_compute = node_info.allow_self_signed_compute; node_info .config - .connect(allow_self_signed_compute, timeout, proto) + .connect(ctx, allow_self_signed_compute, timeout) .await } @@ -54,6 +55,7 @@ pub trait ConnectMechanism { type Error: From; async fn connect_once( &self, + ctx: &mut RequestMonitoring, node_info: &console::CachedNodeInfo, timeout: time::Duration, ) -> Result; @@ -64,7 +66,6 @@ pub trait ConnectMechanism { pub struct TcpMechanism<'a> { /// KV-dictionary with PostgreSQL connection params. 
pub params: &'a StartupMessageParams, - pub proto: &'static str, } #[async_trait] @@ -75,10 +76,11 @@ impl ConnectMechanism for TcpMechanism<'_> { async fn connect_once( &self, + ctx: &mut RequestMonitoring, node_info: &console::CachedNodeInfo, timeout: time::Duration, ) -> Result { - connect_to_compute_once(node_info, timeout, self.proto).await + connect_to_compute_once(ctx, node_info, timeout).await } fn update_connect_config(&self, config: &mut compute::ConnCfg) { @@ -123,11 +125,10 @@ fn report_error(e: &WakeComputeError, retry: bool) { /// This function might update `node_info`, so we take it by `&mut`. #[tracing::instrument(skip_all)] pub async fn connect_to_compute( + ctx: &mut RequestMonitoring, mechanism: &M, mut node_info: console::CachedNodeInfo, - extra: &console::ConsoleReqExtra, - creds: &auth::BackendType<'_, auth::backend::ComputeUserInfo>, - mut latency_timer: LatencyTimer, + user_info: &auth::BackendType<'_, auth::backend::ComputeUserInfo>, ) -> Result where M::ConnectError: ShouldRetry + std::fmt::Debug, @@ -136,9 +137,12 @@ where mechanism.update_connect_config(&mut node_info.config); // try once - let (config, err) = match mechanism.connect_once(&node_info, CONNECT_TIMEOUT).await { + let (config, err) = match mechanism + .connect_once(ctx, &node_info, CONNECT_TIMEOUT) + .await + { Ok(res) => { - latency_timer.success(); + ctx.latency_timer.success(); return Ok(res); } Err(e) => { @@ -147,17 +151,17 @@ where } }; - latency_timer.cache_miss(); + ctx.latency_timer.cache_miss(); let mut num_retries = 1; // if we failed to connect, it's likely that the compute node was suspended, wake a new compute node info!("compute node's state has likely changed; requesting a wake-up"); let node_info = loop { - let wake_res = match creds { - auth::BackendType::Console(api, creds) => api.wake_compute(extra, creds).await, + let wake_res = match user_info { + auth::BackendType::Console(api, user_info) => api.wake_compute(ctx, user_info).await, #[cfg(feature = 
"testing")] - auth::BackendType::Postgres(api, creds) => api.wake_compute(extra, creds).await, + auth::BackendType::Postgres(api, user_info) => api.wake_compute(ctx, user_info).await, // nothing to do? auth::BackendType::Link(_) => return Err(err.into()), // test backend @@ -195,9 +199,12 @@ where // * DNS connection settings haven't quite propagated yet info!("wake_compute success. attempting to connect"); loop { - match mechanism.connect_once(&node_info, CONNECT_TIMEOUT).await { + match mechanism + .connect_once(ctx, &node_info, CONNECT_TIMEOUT) + .await + { Ok(res) => { - latency_timer.success(); + ctx.latency_timer.success(); return Ok(res); } Err(e) => { diff --git a/proxy/src/proxy/tests.rs b/proxy/src/proxy/tests.rs index 3c483c59ee..73fde2d7d0 100644 --- a/proxy/src/proxy/tests.rs +++ b/proxy/src/proxy/tests.rs @@ -7,11 +7,12 @@ use super::retry::ShouldRetry; use super::*; use crate::auth::backend::{ComputeUserInfo, TestBackend}; use crate::config::CertResolver; -use crate::console::{CachedNodeInfo, NodeInfo}; +use crate::console::{self, CachedNodeInfo, NodeInfo}; use crate::proxy::retry::{retry_after, NUM_RETRIES_CONNECT}; use crate::{auth, http, sasl, scram}; use async_trait::async_trait; use rstest::rstest; +use smol_str::SmolStr; use tokio_postgres::config::SslMode; use tokio_postgres::tls::{MakeTlsConnect, NoTls}; use tokio_postgres_rustls::{MakeRustlsConnect, RustlsStream}; @@ -82,7 +83,7 @@ fn generate_tls_config<'a>( let mut cert_resolver = CertResolver::new(); cert_resolver.add_cert(key, vec![cert], true)?; - let common_names = Some(cert_resolver.get_common_names()); + let common_names = cert_resolver.get_common_names(); TlsConfig { config, @@ -425,6 +426,7 @@ impl ConnectMechanism for TestConnectMechanism { async fn connect_once( &self, + _ctx: &mut RequestMonitoring, _node_info: &console::CachedNodeInfo, _timeout: std::time::Duration, ) -> Result { @@ -469,7 +471,7 @@ impl TestBackend for TestConnectMechanism { } } - fn get_allowed_ips(&self) -> 
Result>, console::errors::GetAuthInfoError> { + fn get_allowed_ips(&self) -> Result, console::errors::GetAuthInfoError> { unimplemented!("not used in tests") } } @@ -485,27 +487,19 @@ fn helper_create_cached_node_info() -> CachedNodeInfo { fn helper_create_connect_info( mechanism: &TestConnectMechanism, -) -> ( - CachedNodeInfo, - console::ConsoleReqExtra, - auth::BackendType<'_, ComputeUserInfo>, -) { +) -> (CachedNodeInfo, auth::BackendType<'_, ComputeUserInfo>) { let cache = helper_create_cached_node_info(); - let extra = console::ConsoleReqExtra { - session_id: uuid::Uuid::new_v4(), - application_name: "TEST".into(), - options: vec![], - }; - let creds = auth::BackendType::Test(mechanism); - (cache, extra, creds) + let user_info = auth::BackendType::Test(mechanism); + (cache, user_info) } #[tokio::test] async fn connect_to_compute_success() { use ConnectAction::*; + let mut ctx = RequestMonitoring::test(); let mechanism = TestConnectMechanism::new(vec![Connect]); - let (cache, extra, creds) = helper_create_connect_info(&mechanism); - connect_to_compute(&mechanism, cache, &extra, &creds, LatencyTimer::new("test")) + let (cache, user_info) = helper_create_connect_info(&mechanism); + connect_to_compute(&mut ctx, &mechanism, cache, &user_info) .await .unwrap(); mechanism.verify(); @@ -514,9 +508,10 @@ async fn connect_to_compute_success() { #[tokio::test] async fn connect_to_compute_retry() { use ConnectAction::*; + let mut ctx = RequestMonitoring::test(); let mechanism = TestConnectMechanism::new(vec![Retry, Wake, Retry, Connect]); - let (cache, extra, creds) = helper_create_connect_info(&mechanism); - connect_to_compute(&mechanism, cache, &extra, &creds, LatencyTimer::new("test")) + let (cache, user_info) = helper_create_connect_info(&mechanism); + connect_to_compute(&mut ctx, &mechanism, cache, &user_info) .await .unwrap(); mechanism.verify(); @@ -526,9 +521,10 @@ async fn connect_to_compute_retry() { #[tokio::test] async fn connect_to_compute_non_retry_1() { 
use ConnectAction::*; + let mut ctx = RequestMonitoring::test(); let mechanism = TestConnectMechanism::new(vec![Retry, Wake, Retry, Fail]); - let (cache, extra, creds) = helper_create_connect_info(&mechanism); - connect_to_compute(&mechanism, cache, &extra, &creds, LatencyTimer::new("test")) + let (cache, user_info) = helper_create_connect_info(&mechanism); + connect_to_compute(&mut ctx, &mechanism, cache, &user_info) .await .unwrap_err(); mechanism.verify(); @@ -538,9 +534,10 @@ async fn connect_to_compute_non_retry_1() { #[tokio::test] async fn connect_to_compute_non_retry_2() { use ConnectAction::*; + let mut ctx = RequestMonitoring::test(); let mechanism = TestConnectMechanism::new(vec![Fail, Wake, Retry, Connect]); - let (cache, extra, creds) = helper_create_connect_info(&mechanism); - connect_to_compute(&mechanism, cache, &extra, &creds, LatencyTimer::new("test")) + let (cache, user_info) = helper_create_connect_info(&mechanism); + connect_to_compute(&mut ctx, &mechanism, cache, &user_info) .await .unwrap(); mechanism.verify(); @@ -551,12 +548,13 @@ async fn connect_to_compute_non_retry_2() { async fn connect_to_compute_non_retry_3() { assert_eq!(NUM_RETRIES_CONNECT, 16); use ConnectAction::*; + let mut ctx = RequestMonitoring::test(); let mechanism = TestConnectMechanism::new(vec![ Retry, Wake, Retry, Retry, Retry, Retry, Retry, Retry, Retry, Retry, Retry, Retry, Retry, Retry, Retry, Retry, Retry, /* the 17th time */ Retry, ]); - let (cache, extra, creds) = helper_create_connect_info(&mechanism); - connect_to_compute(&mechanism, cache, &extra, &creds, LatencyTimer::new("test")) + let (cache, user_info) = helper_create_connect_info(&mechanism); + connect_to_compute(&mut ctx, &mechanism, cache, &user_info) .await .unwrap_err(); mechanism.verify(); @@ -566,9 +564,10 @@ async fn connect_to_compute_non_retry_3() { #[tokio::test] async fn wake_retry() { use ConnectAction::*; + let mut ctx = RequestMonitoring::test(); let mechanism = 
TestConnectMechanism::new(vec![Retry, WakeRetry, Wake, Connect]); - let (cache, extra, creds) = helper_create_connect_info(&mechanism); - connect_to_compute(&mechanism, cache, &extra, &creds, LatencyTimer::new("test")) + let (cache, user_info) = helper_create_connect_info(&mechanism); + connect_to_compute(&mut ctx, &mechanism, cache, &user_info) .await .unwrap(); mechanism.verify(); @@ -578,9 +577,10 @@ async fn wake_retry() { #[tokio::test] async fn wake_non_retry() { use ConnectAction::*; + let mut ctx = RequestMonitoring::test(); let mechanism = TestConnectMechanism::new(vec![Retry, WakeFail]); - let (cache, extra, creds) = helper_create_connect_info(&mechanism); - connect_to_compute(&mechanism, cache, &extra, &creds, LatencyTimer::new("test")) + let (cache, user_info) = helper_create_connect_info(&mechanism); + connect_to_compute(&mut ctx, &mechanism, cache, &user_info) .await .unwrap_err(); mechanism.verify(); diff --git a/proxy/src/redis.rs b/proxy/src/redis.rs new file mode 100644 index 0000000000..c2a91bed97 --- /dev/null +++ b/proxy/src/redis.rs @@ -0,0 +1 @@ +pub mod notifications; diff --git a/proxy/src/redis/notifications.rs b/proxy/src/redis/notifications.rs new file mode 100644 index 0000000000..933f2a1bdb --- /dev/null +++ b/proxy/src/redis/notifications.rs @@ -0,0 +1,202 @@ +use std::{convert::Infallible, sync::Arc}; + +use futures::StreamExt; +use redis::aio::PubSub; +use serde::Deserialize; +use smol_str::SmolStr; + +use crate::cache::project_info::ProjectInfoCache; + +const CHANNEL_NAME: &str = "neondb-proxy-ws-updates"; +const RECONNECT_TIMEOUT: std::time::Duration = std::time::Duration::from_secs(20); +const INVALIDATION_LAG: std::time::Duration = std::time::Duration::from_secs(20); + +struct ConsoleRedisClient { + client: redis::Client, +} + +impl ConsoleRedisClient { + pub fn new(url: &str) -> anyhow::Result { + let client = redis::Client::open(url)?; + Ok(Self { client }) + } + async fn try_connect(&self) -> anyhow::Result { + let mut conn = 
self.client.get_async_connection().await?.into_pubsub(); + tracing::info!("subscribing to a channel `{CHANNEL_NAME}`"); + conn.subscribe(CHANNEL_NAME).await?; + Ok(conn) + } +} + +#[derive(Clone, Debug, Deserialize, Eq, PartialEq)] +#[serde(tag = "topic", content = "data")] +enum Notification { + #[serde( + rename = "/allowed_ips_updated", + deserialize_with = "deserialize_json_string" + )] + AllowedIpsUpdate { + allowed_ips_update: AllowedIpsUpdate, + }, + #[serde( + rename = "/password_updated", + deserialize_with = "deserialize_json_string" + )] + PasswordUpdate { password_update: PasswordUpdate }, +} +#[derive(Clone, Debug, Deserialize, Eq, PartialEq)] +struct AllowedIpsUpdate { + #[serde(rename = "project")] + project_id: SmolStr, +} +#[derive(Clone, Debug, Deserialize, Eq, PartialEq)] +struct PasswordUpdate { + #[serde(rename = "project")] + project_id: SmolStr, + #[serde(rename = "role")] + role_name: SmolStr, +} +fn deserialize_json_string<'de, D, T>(deserializer: D) -> Result +where + T: for<'de2> serde::Deserialize<'de2>, + D: serde::Deserializer<'de>, +{ + let s = String::deserialize(deserializer)?; + serde_json::from_str(&s).map_err(::custom) +} + +fn invalidate_cache(cache: Arc, msg: Notification) { + use Notification::*; + match msg { + AllowedIpsUpdate { allowed_ips_update } => { + cache.invalidate_allowed_ips_for_project(&allowed_ips_update.project_id) + } + PasswordUpdate { password_update } => cache.invalidate_role_secret_for_project( + &password_update.project_id, + &password_update.role_name, + ), + } +} + +#[tracing::instrument(skip(cache))] +fn handle_message(msg: redis::Msg, cache: Arc) -> anyhow::Result<()> +where + C: ProjectInfoCache + Send + Sync + 'static, +{ + let payload: String = msg.get_payload()?; + tracing::debug!(?payload, "received a message payload"); + + let msg: Notification = match serde_json::from_str(&payload) { + Ok(msg) => msg, + Err(e) => { + tracing::error!("broken message: {e}"); + return Ok(()); + } + }; + 
tracing::debug!(?msg, "received a message"); + invalidate_cache(cache.clone(), msg.clone()); + // It might happen that the invalid entry is on the way to be cached. + // To make sure that the entry is invalidated, let's repeat the invalidation in INVALIDATION_LAG seconds. + // TODO: include the version (or the timestamp) in the message and invalidate only if the entry is cached before the message. + tokio::spawn(async move { + tokio::time::sleep(INVALIDATION_LAG).await; + invalidate_cache(cache, msg.clone()); + }); + + Ok(()) +} + +/// Handle console's invalidation messages. +#[tracing::instrument(name = "console_notifications", skip_all)] +pub async fn task_main(url: String, cache: Arc) -> anyhow::Result +where + C: ProjectInfoCache + Send + Sync + 'static, +{ + cache.enable_ttl(); + + loop { + let redis = ConsoleRedisClient::new(&url)?; + let conn = match redis.try_connect().await { + Ok(conn) => { + cache.disable_ttl(); + conn + } + Err(e) => { + tracing::error!( + "failed to connect to redis: {e}, will try to reconnect in {RECONNECT_TIMEOUT:#?}" + ); + tokio::time::sleep(RECONNECT_TIMEOUT).await; + continue; + } + }; + let mut stream = conn.into_on_message(); + while let Some(msg) = stream.next().await { + match handle_message(msg, cache.clone()) { + Ok(()) => {} + Err(e) => { + tracing::error!("failed to handle message: {e}, will try to reconnect"); + break; + } + } + } + cache.enable_ttl(); + } +} + +#[cfg(test)] +mod tests { + use super::*; + use serde_json::json; + + #[test] + fn parse_allowed_ips() -> anyhow::Result<()> { + let project_id = "new_project".to_string(); + let data = format!("{{\"project\": \"{project_id}\"}}"); + let text = json!({ + "type": "message", + "topic": "/allowed_ips_updated", + "data": data, + "extre_fields": "something" + }) + .to_string(); + + let result: Notification = serde_json::from_str(&text)?; + assert_eq!( + result, + Notification::AllowedIpsUpdate { + allowed_ips_update: AllowedIpsUpdate { + project_id: project_id.into() 
+ } + } + ); + + Ok(()) + } + + #[test] + fn parse_password_updated() -> anyhow::Result<()> { + let project_id = "new_project".to_string(); + let role_name = "new_role".to_string(); + let data = format!("{{\"project\": \"{project_id}\", \"role\": \"{role_name}\"}}"); + let text = json!({ + "type": "message", + "topic": "/password_updated", + "data": data, + "extre_fields": "something" + }) + .to_string(); + + let result: Notification = serde_json::from_str(&text)?; + assert_eq!( + result, + Notification::PasswordUpdate { + password_update: PasswordUpdate { + project_id: project_id.into(), + role_name: role_name.into() + } + } + ); + + Ok(()) + } +} diff --git a/proxy/src/scram/key.rs b/proxy/src/scram/key.rs index bd93fb2b70..66c2c6b207 100644 --- a/proxy/src/scram/key.rs +++ b/proxy/src/scram/key.rs @@ -6,7 +6,7 @@ pub const SCRAM_KEY_LEN: usize = 32; /// One of the keys derived from the [password](super::password::SaltedPassword). /// We use the same structure for all keys, i.e. /// `ClientKey`, `StoredKey`, and `ServerKey`. -#[derive(Clone, Default, PartialEq, Eq)] +#[derive(Clone, Default, PartialEq, Eq, Debug)] #[repr(transparent)] pub struct ScramKey { bytes: [u8; SCRAM_KEY_LEN], diff --git a/proxy/src/scram/secret.rs b/proxy/src/scram/secret.rs index 9e74e07af1..041548014a 100644 --- a/proxy/src/scram/secret.rs +++ b/proxy/src/scram/secret.rs @@ -5,7 +5,7 @@ use super::key::ScramKey; /// Server secret is produced from [password](super::password::SaltedPassword) /// and is used throughout the authentication process. -#[derive(Clone)] +#[derive(Clone, Eq, PartialEq, Debug)] pub struct ServerSecret { /// Number of iterations for `PBKDF2` function. 
pub iterations: u32, diff --git a/proxy/src/serverless.rs b/proxy/src/serverless.rs index e358a0712f..8af008394a 100644 --- a/proxy/src/serverless.rs +++ b/proxy/src/serverless.rs @@ -6,13 +6,19 @@ mod conn_pool; mod sql_over_http; mod websocket; +pub use conn_pool::GlobalConnPoolOptions; + use anyhow::bail; use hyper::StatusCode; use metrics::IntCounterPairGuard; +use rand::rngs::StdRng; +use rand::SeedableRng; pub use reqwest_middleware::{ClientWithMiddleware, Error}; pub use reqwest_retry::{policies::ExponentialBackoff, RetryTransientMiddleware}; use tokio_util::task::TaskTracker; +use crate::config::TlsConfig; +use crate::context::RequestMonitoring; use crate::metrics::NUM_CLIENT_CONNECTION_GAUGE; use crate::protocol2::{ProxyProtocolAccept, WithClientIp}; use crate::rate_limiter::EndpointRateLimiter; @@ -47,6 +53,11 @@ pub async fn task_main( let conn_pool = conn_pool::GlobalConnPool::new(config); + let conn_pool2 = Arc::clone(&conn_pool); + tokio::spawn(async move { + conn_pool2.gc_worker(StdRng::from_entropy()).await; + }); + // shutdown the connection pool tokio::spawn({ let cancellation_token = cancellation_token.clone(); @@ -59,14 +70,14 @@ pub async fn task_main( } }); - let tls_config = config.tls_config.as_ref().map(|cfg| cfg.to_server_config()); - let tls_acceptor: tokio_rustls::TlsAcceptor = match tls_config { - Some(config) => config.into(), + let tls_config = match config.tls_config.as_ref() { + Some(config) => config, None => { warn!("TLS config is missing, WebSocket Secure server will not be started"); return Ok(()); } }; + let tls_acceptor: tokio_rustls::TlsAcceptor = tls_config.to_server_config().into(); let mut addr_incoming = AddrIncoming::from_listener(ws_listener)?; let _ = addr_incoming.set_nodelay(true); @@ -116,6 +127,7 @@ pub async fn task_main( request_handler( req, config, + tls_config, conn_pool, ws_connections, cancel_map, @@ -185,6 +197,7 @@ where async fn request_handler( mut request: Request, config: &'static ProxyConfig, + tls: 
&'static TlsConfig, conn_pool: Arc, ws_connections: TaskTracker, cancel_map: Arc, @@ -209,13 +222,14 @@ async fn request_handler( ws_connections.spawn( async move { + let mut ctx = RequestMonitoring::new(session_id, peer_addr, "ws", &config.region); + if let Err(e) = websocket::serve_websocket( - websocket, config, + &mut ctx, + websocket, &cancel_map, - session_id, host, - peer_addr, endpoint_rate_limiter, ) .await @@ -229,13 +243,15 @@ async fn request_handler( // Return the response so the spawned future can continue. Ok(response) } else if request.uri().path() == "/sql" && request.method() == Method::POST { + let mut ctx = RequestMonitoring::new(session_id, peer_addr, "http", &config.region); + sql_over_http::handle( + tls, + &config.http_config, + &mut ctx, request, sni_hostname, conn_pool, - session_id, - peer_addr, - &config.http_config, ) .await } else if request.uri().path() == "/sql" && request.method() == Method::OPTIONS { diff --git a/proxy/src/serverless/conn_pool.rs b/proxy/src/serverless/conn_pool.rs index df2d1bea32..787b8bb28e 100644 --- a/proxy/src/serverless/conn_pool.rs +++ b/proxy/src/serverless/conn_pool.rs @@ -1,15 +1,18 @@ -use anyhow::{anyhow, Context}; +use anyhow::Context; use async_trait::async_trait; use dashmap::DashMap; -use futures::future::poll_fn; +use futures::{future::poll_fn, Future}; +use metrics::{register_int_counter_pair, IntCounterPair, IntCounterPairGuard}; +use once_cell::sync::Lazy; use parking_lot::RwLock; use pbkdf2::{ password_hash::{PasswordHashString, PasswordHasher, PasswordVerifier, SaltString}, Params, Pbkdf2, }; -use pq_proto::StartupMessageParams; +use prometheus::{exponential_buckets, register_histogram, Histogram}; +use rand::Rng; use smol_str::SmolStr; -use std::{collections::HashMap, net::IpAddr, sync::Arc}; +use std::{collections::HashMap, pin::pin, sync::Arc, sync::Weak, time::Duration}; use std::{ fmt, task::{ready, Poll}, @@ -18,44 +21,53 @@ use std::{ ops::Deref, sync::atomic::{self, AtomicUsize}, }; 
-use tokio::time; +use tokio::time::{self, Instant}; use tokio_postgres::{AsyncMessage, ReadyForQueryStatus}; use crate::{ auth::{self, backend::ComputeUserInfo, check_peer_addr_is_in_list}, console, - metrics::{LatencyTimer, NUM_DB_CONNECTIONS_GAUGE}, - proxy::{connect_compute::ConnectMechanism, neon_options}, + context::RequestMonitoring, + metrics::NUM_DB_CONNECTIONS_GAUGE, + proxy::connect_compute::ConnectMechanism, usage_metrics::{Ids, MetricCounter, USAGE_METRICS}, }; use crate::{compute, config}; -use tracing::{error, warn, Span}; +use tracing::{debug, error, warn, Span}; use tracing::{info, info_span, Instrument}; -pub const APP_NAME: &str = "/sql_over_http"; -const MAX_CONNS_PER_ENDPOINT: usize = 20; +pub const APP_NAME: SmolStr = SmolStr::new_inline("/sql_over_http"); #[derive(Debug, Clone)] pub struct ConnInfo { - pub username: SmolStr, + pub user_info: ComputeUserInfo, pub dbname: SmolStr, - pub hostname: SmolStr, pub password: SmolStr, - pub options: Option, } impl ConnInfo { // hm, change to hasher to avoid cloning? pub fn db_and_user(&self) -> (SmolStr, SmolStr) { - (self.dbname.clone(), self.username.clone()) + (self.dbname.clone(), self.user_info.user.clone()) + } + + pub fn endpoint_cache_key(&self) -> SmolStr { + self.user_info.endpoint_cache_key() } } impl fmt::Display for ConnInfo { // use custom display to avoid logging password fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { - write!(f, "{}@{}/{}", self.username, self.hostname, self.dbname) + write!( + f, + "{}@{}/{}?{}", + self.user_info.user, + self.user_info.endpoint, + self.dbname, + self.user_info.options.get_cache_key("") + ) } } @@ -69,234 +81,51 @@ struct ConnPoolEntry { pub struct EndpointConnPool { pools: HashMap<(SmolStr, SmolStr), DbUserConnPool>, total_conns: usize, + max_conns: usize, + _guard: IntCounterPairGuard, } -/// 4096 is the number of rounds that SCRAM-SHA-256 recommends. -/// It's not the 600,000 that OWASP recommends... 
but our passwords are high entropy anyway. -/// -/// Still takes 1.4ms to hash on my hardware. -/// We don't want to ruin the latency improvements of using the pool by making password verification take too long -const PARAMS: Params = Params { - rounds: 4096, - output_length: 32, -}; - -#[derive(Default)] -pub struct DbUserConnPool { - conns: Vec, - password_hash: Option, -} - -pub struct GlobalConnPool { - // endpoint -> per-endpoint connection pool - // - // That should be a fairly conteded map, so return reference to the per-endpoint - // pool as early as possible and release the lock. - global_pool: DashMap>>, - - /// [`DashMap::len`] iterates over all inner pools and acquires a read lock on each. - /// That seems like far too much effort, so we're using a relaxed increment counter instead. - /// It's only used for diagnostics. - global_pool_size: AtomicUsize, - - // Maximum number of connections per one endpoint. - // Can mix different (dbname, username) connections. - // When running out of free slots for a particular endpoint, - // falls back to opening a new connection for each request. - max_conns_per_endpoint: usize, - - proxy_config: &'static crate::config::ProxyConfig, - - // Using a lock to remove any race conditions. - // Eg cleaning up connections while a new connection is returned - closed: RwLock, -} - -impl GlobalConnPool { - pub fn new(config: &'static crate::config::ProxyConfig) -> Arc { - Arc::new(Self { - global_pool: DashMap::new(), - global_pool_size: AtomicUsize::new(0), - max_conns_per_endpoint: MAX_CONNS_PER_ENDPOINT, - proxy_config: config, - closed: RwLock::new(false), - }) +impl EndpointConnPool { + fn get_conn_entry(&mut self, db_user: (SmolStr, SmolStr)) -> Option { + let Self { + pools, total_conns, .. 
+ } = self; + pools + .get_mut(&db_user) + .and_then(|pool_entries| pool_entries.get_conn_entry(total_conns)) } - pub fn shutdown(&self) { - *self.closed.write() = true; - - self.global_pool.retain(|_, endpoint_pool| { - let mut pool = endpoint_pool.write(); - // by clearing this hashmap, we remove the slots that a connection can be returned to. - // when returning, it drops the connection if the slot doesn't exist - pool.pools.clear(); - pool.total_conns = 0; - + fn remove_client(&mut self, db_user: (SmolStr, SmolStr), conn_id: uuid::Uuid) -> bool { + let Self { + pools, total_conns, .. + } = self; + if let Some(pool) = pools.get_mut(&db_user) { + let old_len = pool.conns.len(); + pool.conns.retain(|conn| conn.conn.conn_id != conn_id); + let new_len = pool.conns.len(); + let removed = old_len - new_len; + *total_conns -= removed; + removed > 0 + } else { false - }); + } } - pub async fn get( - self: &Arc, - conn_info: &ConnInfo, - force_new: bool, - session_id: uuid::Uuid, - peer_addr: IpAddr, - ) -> anyhow::Result { - let mut client: Option = None; - let mut latency_timer = LatencyTimer::new("http"); - - let pool = if force_new { - None - } else { - Some((conn_info.clone(), self.clone())) - }; - - let mut hash_valid = false; - if !force_new { - let pool = self.get_or_create_endpoint_pool(&conn_info.hostname); - let mut hash = None; - - // find a pool entry by (dbname, username) if exists - { - let pool = pool.read(); - if let Some(pool_entries) = pool.pools.get(&conn_info.db_and_user()) { - if !pool_entries.conns.is_empty() { - hash = pool_entries.password_hash.clone(); - } - } - } - - // a connection exists in the pool, verify the password hash - if let Some(hash) = hash { - let pw = conn_info.password.clone(); - let validate = tokio::task::spawn_blocking(move || { - Pbkdf2.verify_password(pw.as_bytes(), &hash.password_hash()) - }) - .await?; - - // if the hash is invalid, don't error - // we will continue with the regular connection flow - if validate.is_ok() { 
- hash_valid = true; - let mut pool = pool.write(); - if let Some(pool_entries) = pool.pools.get_mut(&conn_info.db_and_user()) { - if let Some(entry) = pool_entries.conns.pop() { - client = Some(entry.conn); - pool.total_conns -= 1; - } - } - } - } - } - - // ok return cached connection if found and establish a new one otherwise - let new_client = if let Some(client) = client { - if client.inner.is_closed() { - let conn_id = uuid::Uuid::new_v4(); - info!(%conn_id, "pool: cached connection '{conn_info}' is closed, opening a new one"); - connect_to_compute( - self.proxy_config, - conn_info, - conn_id, - session_id, - latency_timer, - peer_addr, - ) - .await - } else { - info!("pool: reusing connection '{conn_info}'"); - client.session.send(session_id)?; - tracing::Span::current().record( - "pid", - &tracing::field::display(client.inner.get_process_id()), - ); - latency_timer.pool_hit(); - latency_timer.success(); - return Ok(Client::new(client, pool).await); - } - } else { - let conn_id = uuid::Uuid::new_v4(); - info!(%conn_id, "pool: opening a new connection '{conn_info}'"); - connect_to_compute( - self.proxy_config, - conn_info, - conn_id, - session_id, - latency_timer, - peer_addr, - ) - .await - }; - if let Ok(client) = &new_client { - tracing::Span::current().record( - "pid", - &tracing::field::display(client.inner.get_process_id()), - ); - } - - match &new_client { - // clear the hash. 
it's no longer valid - // TODO: update tokio-postgres fork to allow access to this error kind directly - Err(err) - if hash_valid && err.to_string().contains("password authentication failed") => - { - let pool = self.get_or_create_endpoint_pool(&conn_info.hostname); - let mut pool = pool.write(); - if let Some(entry) = pool.pools.get_mut(&conn_info.db_and_user()) { - entry.password_hash = None; - } - } - // new password is valid and we should insert/update it - Ok(_) if !force_new && !hash_valid => { - let pw = conn_info.password.clone(); - let new_hash = tokio::task::spawn_blocking(move || { - let salt = SaltString::generate(rand::rngs::OsRng); - Pbkdf2 - .hash_password_customized(pw.as_bytes(), None, None, PARAMS, &salt) - .map(|s| s.serialize()) - }) - .await??; - - let pool = self.get_or_create_endpoint_pool(&conn_info.hostname); - let mut pool = pool.write(); - pool.pools - .entry(conn_info.db_and_user()) - .or_default() - .password_hash = Some(new_hash); - } - _ => {} - } - let new_client = new_client?; - Ok(Client::new(new_client, pool).await) - } - - fn put(&self, conn_info: &ConnInfo, client: ClientInner) -> anyhow::Result<()> { + fn put(pool: &RwLock, conn_info: &ConnInfo, client: ClientInner) -> anyhow::Result<()> { let conn_id = client.conn_id; - // We want to hold this open while we return. This ensures that the pool can't close - // while we are in the middle of returning the connection. 
- let closed = self.closed.read(); - if *closed { - info!(%conn_id, "pool: throwing away connection '{conn_info}' because pool is closed"); - return Ok(()); - } - if client.inner.is_closed() { info!(%conn_id, "pool: throwing away connection '{conn_info}' because connection is closed"); return Ok(()); } - let pool = self.get_or_create_endpoint_pool(&conn_info.hostname); - // return connection to the pool let mut returned = false; let mut per_db_size = 0; let total_conns = { let mut pool = pool.write(); - if pool.total_conns < self.max_conns_per_endpoint { + if pool.total_conns < pool.max_conns { // we create this db-user entry in get, so it should not be None if let Some(pool_entries) = pool.pools.get_mut(&conn_info.db_and_user()) { pool_entries.conns.push(ConnPoolEntry { @@ -323,6 +152,292 @@ impl GlobalConnPool { Ok(()) } +} + +/// 4096 is the number of rounds that SCRAM-SHA-256 recommends. +/// It's not the 600,000 that OWASP recommends... but our passwords are high entropy anyway. +/// +/// Still takes 1.4ms to hash on my hardware. 
+/// We don't want to ruin the latency improvements of using the pool by making password verification take too long +const PARAMS: Params = Params { + rounds: 4096, + output_length: 32, +}; + +#[derive(Default)] +pub struct DbUserConnPool { + conns: Vec, + password_hash: Option, +} + +impl DbUserConnPool { + fn clear_closed_clients(&mut self, conns: &mut usize) { + let old_len = self.conns.len(); + + self.conns.retain(|conn| !conn.conn.inner.is_closed()); + + let new_len = self.conns.len(); + let removed = old_len - new_len; + *conns -= removed; + } + + fn get_conn_entry(&mut self, conns: &mut usize) -> Option { + self.clear_closed_clients(conns); + let conn = self.conns.pop(); + if conn.is_some() { + *conns -= 1; + } + conn + } +} + +pub struct GlobalConnPool { + // endpoint -> per-endpoint connection pool + // + // That should be a fairly conteded map, so return reference to the per-endpoint + // pool as early as possible and release the lock. + global_pool: DashMap>>, + + /// Number of endpoint-connection pools + /// + /// [`DashMap::len`] iterates over all inner pools and acquires a read lock on each. + /// That seems like far too much effort, so we're using a relaxed increment counter instead. + /// It's only used for diagnostics. + global_pool_size: AtomicUsize, + + proxy_config: &'static crate::config::ProxyConfig, +} + +#[derive(Debug, Clone, Copy)] +pub struct GlobalConnPoolOptions { + // Maximum number of connections per one endpoint. + // Can mix different (dbname, username) connections. + // When running out of free slots for a particular endpoint, + // falls back to opening a new connection for each request. 
+ pub max_conns_per_endpoint: usize, + + pub gc_epoch: Duration, + + pub pool_shards: usize, + + pub idle_timeout: Duration, + + pub opt_in: bool, +} + +pub static GC_LATENCY: Lazy = Lazy::new(|| { + register_histogram!( + "proxy_http_pool_reclaimation_lag_seconds", + "Time it takes to reclaim unused connection pools", + // 1us -> 65ms + exponential_buckets(1e-6, 2.0, 16).unwrap(), + ) + .unwrap() +}); + +pub static ENDPOINT_POOLS: Lazy = Lazy::new(|| { + register_int_counter_pair!( + "proxy_http_pool_endpoints_registered_total", + "Number of endpoints we have registered pools for", + "proxy_http_pool_endpoints_unregistered_total", + "Number of endpoints we have unregistered pools for", + ) + .unwrap() +}); + +impl GlobalConnPool { + pub fn new(config: &'static crate::config::ProxyConfig) -> Arc { + let shards = config.http_config.pool_options.pool_shards; + Arc::new(Self { + global_pool: DashMap::with_shard_amount(shards), + global_pool_size: AtomicUsize::new(0), + proxy_config: config, + }) + } + + pub fn shutdown(&self) { + // drops all strong references to endpoint-pools + self.global_pool.clear(); + } + + pub async fn gc_worker(&self, mut rng: impl Rng) { + let epoch = self.proxy_config.http_config.pool_options.gc_epoch; + let mut interval = tokio::time::interval(epoch / (self.global_pool.shards().len()) as u32); + loop { + interval.tick().await; + + let shard = rng.gen_range(0..self.global_pool.shards().len()); + self.gc(shard); + } + } + + fn gc(&self, shard: usize) { + debug!(shard, "pool: performing epoch reclamation"); + + // acquire a random shard lock + let mut shard = self.global_pool.shards()[shard].write(); + + let timer = GC_LATENCY.start_timer(); + let current_len = shard.len(); + shard.retain(|endpoint, x| { + // if the current endpoint pool is unique (no other strong or weak references) + // then it is currently not in use by any connections. + if let Some(pool) = Arc::get_mut(x.get_mut()) { + let EndpointConnPool { + pools, total_conns, .. 
+ } = pool.get_mut(); + + // ensure that closed clients are removed + pools + .iter_mut() + .for_each(|(_, db_pool)| db_pool.clear_closed_clients(total_conns)); + + // we only remove this pool if it has no active connections + if *total_conns == 0 { + info!("pool: discarding pool for endpoint {endpoint}"); + return false; + } + } + + true + }); + let new_len = shard.len(); + drop(shard); + timer.observe_duration(); + + let removed = current_len - new_len; + + if removed > 0 { + let global_pool_size = self + .global_pool_size + .fetch_sub(removed, atomic::Ordering::Relaxed) + - removed; + info!("pool: performed global pool gc. size now {global_pool_size}"); + } + } + + pub async fn get( + self: &Arc, + ctx: &mut RequestMonitoring, + conn_info: ConnInfo, + force_new: bool, + ) -> anyhow::Result { + let mut client: Option = None; + + let mut hash_valid = false; + let mut endpoint_pool = Weak::new(); + if !force_new { + let pool = self.get_or_create_endpoint_pool(&conn_info.endpoint_cache_key()); + endpoint_pool = Arc::downgrade(&pool); + let mut hash = None; + + // find a pool entry by (dbname, username) if exists + { + let pool = pool.read(); + if let Some(pool_entries) = pool.pools.get(&conn_info.db_and_user()) { + if !pool_entries.conns.is_empty() { + hash = pool_entries.password_hash.clone(); + } + } + } + + // a connection exists in the pool, verify the password hash + if let Some(hash) = hash { + let pw = conn_info.password.clone(); + let validate = tokio::task::spawn_blocking(move || { + Pbkdf2.verify_password(pw.as_bytes(), &hash.password_hash()) + }) + .await?; + + // if the hash is invalid, don't error + // we will continue with the regular connection flow + if validate.is_ok() { + hash_valid = true; + if let Some(entry) = pool.write().get_conn_entry(conn_info.db_and_user()) { + client = Some(entry.conn) + } + } + } + } + + // ok return cached connection if found and establish a new one otherwise + let new_client = if let Some(client) = client { + if 
client.inner.is_closed() { + let conn_id = uuid::Uuid::new_v4(); + info!(%conn_id, "pool: cached connection '{conn_info}' is closed, opening a new one"); + connect_to_compute( + self.proxy_config, + ctx, + &conn_info, + conn_id, + endpoint_pool.clone(), + ) + .await + } else { + info!("pool: reusing connection '{conn_info}'"); + client.session.send(ctx.session_id)?; + tracing::Span::current().record( + "pid", + &tracing::field::display(client.inner.get_process_id()), + ); + ctx.latency_timer.pool_hit(); + ctx.latency_timer.success(); + return Ok(Client::new(client, conn_info, endpoint_pool).await); + } + } else { + let conn_id = uuid::Uuid::new_v4(); + info!(%conn_id, "pool: opening a new connection '{conn_info}'"); + connect_to_compute( + self.proxy_config, + ctx, + &conn_info, + conn_id, + endpoint_pool.clone(), + ) + .await + }; + if let Ok(client) = &new_client { + tracing::Span::current().record( + "pid", + &tracing::field::display(client.inner.get_process_id()), + ); + } + + match &new_client { + // clear the hash. 
it's no longer valid + // TODO: update tokio-postgres fork to allow access to this error kind directly + Err(err) + if hash_valid && err.to_string().contains("password authentication failed") => + { + let pool = self.get_or_create_endpoint_pool(&conn_info.endpoint_cache_key()); + let mut pool = pool.write(); + if let Some(entry) = pool.pools.get_mut(&conn_info.db_and_user()) { + entry.password_hash = None; + } + } + // new password is valid and we should insert/update it + Ok(_) if !force_new && !hash_valid => { + let pw = conn_info.password.clone(); + let new_hash = tokio::task::spawn_blocking(move || { + let salt = SaltString::generate(rand::rngs::OsRng); + Pbkdf2 + .hash_password_customized(pw.as_bytes(), None, None, PARAMS, &salt) + .map(|s| s.serialize()) + }) + .await??; + + let pool = self.get_or_create_endpoint_pool(&conn_info.endpoint_cache_key()); + let mut pool = pool.write(); + pool.pools + .entry(conn_info.db_and_user()) + .or_default() + .password_hash = Some(new_hash); + } + _ => {} + } + let new_client = new_client?; + Ok(Client::new(new_client, conn_info, endpoint_pool).await) + } fn get_or_create_endpoint_pool(&self, endpoint: &SmolStr) -> Arc> { // fast path @@ -334,6 +449,12 @@ impl GlobalConnPool { let new_pool = Arc::new(RwLock::new(EndpointConnPool { pools: HashMap::new(), total_conns: 0, + max_conns: self + .proxy_config + .http_config + .pool_options + .max_conns_per_endpoint, + _guard: ENDPOINT_POOLS.guard(), })); // find or create a pool for this endpoint @@ -363,9 +484,10 @@ impl GlobalConnPool { } struct TokioMechanism<'a> { + pool: Weak>, conn_info: &'a ConnInfo, - session_id: uuid::Uuid, conn_id: uuid::Uuid, + idle: Duration, } #[async_trait] @@ -376,15 +498,18 @@ impl ConnectMechanism for TokioMechanism<'_> { async fn connect_once( &self, + ctx: &mut RequestMonitoring, node_info: &console::CachedNodeInfo, timeout: time::Duration, ) -> Result { connect_to_compute_once( + ctx, node_info, self.conn_info, timeout, self.conn_id, - 
self.session_id, + self.pool.clone(), + self.idle, ) .await } @@ -398,75 +523,58 @@ impl ConnectMechanism for TokioMechanism<'_> { #[tracing::instrument(fields(pid = tracing::field::Empty), skip_all)] async fn connect_to_compute( config: &config::ProxyConfig, + ctx: &mut RequestMonitoring, conn_info: &ConnInfo, conn_id: uuid::Uuid, - session_id: uuid::Uuid, - latency_timer: LatencyTimer, - peer_addr: IpAddr, + pool: Weak>, ) -> anyhow::Result { - let tls = config.tls_config.as_ref(); - let common_names = tls.and_then(|tls| tls.common_names.clone()); + ctx.set_application(Some(APP_NAME)); + let backend = config + .auth_backend + .as_ref() + .map(|_| conn_info.user_info.clone()); - let params = StartupMessageParams::new([ - ("user", &conn_info.username), - ("database", &conn_info.dbname), - ("application_name", APP_NAME), - ("options", conn_info.options.as_deref().unwrap_or("")), - ]); - let creds = auth::ClientCredentials::parse( - ¶ms, - Some(&conn_info.hostname), - common_names, - peer_addr, - )?; - - let creds = - ComputeUserInfo::try_from(creds).map_err(|_| anyhow!("missing endpoint identifier"))?; - let backend = config.auth_backend.as_ref().map(|_| creds); - - let console_options = neon_options(¶ms); - - let extra = console::ConsoleReqExtra { - session_id: uuid::Uuid::new_v4(), - application_name: APP_NAME.to_string(), - options: console_options, - }; if !config.disable_ip_check_for_http { - let allowed_ips = backend.get_allowed_ips(&extra).await?; - if !check_peer_addr_is_in_list(&peer_addr, &allowed_ips) { + let allowed_ips = backend.get_allowed_ips(ctx).await?; + if !check_peer_addr_is_in_list(&ctx.peer_addr, &allowed_ips) { return Err(auth::AuthError::ip_address_not_allowed().into()); } } let node_info = backend - .wake_compute(&extra) + .wake_compute(ctx) .await? 
.context("missing cache entry from wake_compute")?; + ctx.set_project(node_info.aux.clone()); + crate::proxy::connect_compute::connect_to_compute( + ctx, &TokioMechanism { conn_id, conn_info, - session_id, + pool, + idle: config.http_config.pool_options.idle_timeout, }, node_info, - &extra, &backend, - latency_timer, ) .await } async fn connect_to_compute_once( + ctx: &mut RequestMonitoring, node_info: &console::CachedNodeInfo, conn_info: &ConnInfo, timeout: time::Duration, conn_id: uuid::Uuid, - mut session: uuid::Uuid, + pool: Weak>, + idle: Duration, ) -> Result { let mut config = (*node_info.config).clone(); + let mut session = ctx.session_id; let (client, mut connection) = config - .user(&conn_info.username) + .user(&conn_info.user_info.user) .password(&*conn_info.password) .dbname(&conn_info.dbname) .connect_timeout(timeout) @@ -474,7 +582,7 @@ async fn connect_to_compute_once( .await?; let conn_gauge = NUM_DB_CONNECTIONS_GAUGE - .with_label_values(&["http"]) + .with_label_values(&[ctx.protocol]) .guard(); tracing::Span::current().record("pid", &tracing::field::display(client.get_process_id())); @@ -490,13 +598,29 @@ async fn connect_to_compute_once( branch_id: node_info.aux.branch_id.clone(), }; + let db_user = conn_info.db_and_user(); tokio::spawn( async move { let _conn_gauge = conn_gauge; + let mut idle_timeout = pin!(tokio::time::sleep(idle)); poll_fn(move |cx| { if matches!(rx.has_changed(), Ok(true)) { session = *rx.borrow_and_update(); info!(%session, "changed session"); + idle_timeout.as_mut().reset(Instant::now() + idle); + } + + // 5 minute idle connection timeout + if idle_timeout.as_mut().poll(cx).is_ready() { + idle_timeout.as_mut().reset(Instant::now() + idle); + info!("connection idle"); + if let Some(pool) = pool.clone().upgrade() { + // remove client from pool - should close the connection if it's idle. 
+ // does nothing if the client is currently checked-out and in-use + if pool.write().remove_client(db_user.clone(), conn_id) { + info!("idle connection removed"); + } + } } loop { @@ -514,15 +638,25 @@ async fn connect_to_compute_once( } Some(Err(e)) => { error!(%session, "connection error: {}", e); - return Poll::Ready(()) + break } None => { info!("connection closed"); - return Poll::Ready(()) + break } } } - }).await + + // remove from connection pool + if let Some(pool) = pool.clone().upgrade() { + if pool.write().remove_client(db_user.clone(), conn_id) { + info!("closed connection removed"); + } + } + + Poll::Ready(()) + }).await; + } .instrument(span) ); @@ -552,23 +686,27 @@ pub struct Client { conn_id: uuid::Uuid, span: Span, inner: Option, - pool: Option<(ConnInfo, Arc)>, + conn_info: ConnInfo, + pool: Weak>, } pub struct Discard<'a> { conn_id: uuid::Uuid, - pool: &'a mut Option<(ConnInfo, Arc)>, + conn_info: &'a ConnInfo, + pool: &'a mut Weak>, } impl Client { pub(self) async fn new( inner: ClientInner, - pool: Option<(ConnInfo, Arc)>, + conn_info: ConnInfo, + pool: Weak>, ) -> Self { Self { conn_id: inner.conn_id, inner: Some(inner), span: Span::current(), + conn_info, pool, } } @@ -577,6 +715,7 @@ impl Client { inner, pool, conn_id, + conn_info, span: _, } = self; ( @@ -586,6 +725,7 @@ impl Client { .inner, Discard { pool, + conn_info, conn_id: *conn_id, }, ) @@ -601,14 +741,14 @@ impl Client { impl Discard<'_> { pub fn check_idle(&mut self, status: ReadyForQueryStatus) { - if status != ReadyForQueryStatus::Idle { - if let Some((conn_info, _)) = self.pool.take() { - info!(conn_id = %self.conn_id, "pool: throwing away connection '{conn_info}' because connection is not idle") - } + let conn_info = &self.conn_info; + if status != ReadyForQueryStatus::Idle && std::mem::take(self.pool).strong_count() > 0 { + info!(conn_id = %self.conn_id, "pool: throwing away connection '{conn_info}' because connection is not idle") } } pub fn discard(&mut self) { - if let 
Some((conn_info, _)) = self.pool.take() { + let conn_info = &self.conn_info; + if std::mem::take(self.pool).strong_count() > 0 { info!(conn_id = %self.conn_id, "pool: throwing away connection '{conn_info}' because connection is potentially in a broken state") } } @@ -628,16 +768,17 @@ impl Deref for Client { impl Drop for Client { fn drop(&mut self) { + let conn_info = self.conn_info.clone(); let client = self .inner .take() .expect("client inner should not be removed"); - if let Some((conn_info, conn_pool)) = self.pool.take() { + if let Some(conn_pool) = std::mem::take(&mut self.pool).upgrade() { let current_span = self.span.clone(); // return connection to the pool tokio::task::spawn_blocking(move || { let _span = current_span.enter(); - let _ = conn_pool.put(&conn_info, client); + let _ = EndpointConnPool::put(&conn_pool, &conn_info, client); }); } } diff --git a/proxy/src/serverless/sql_over_http.rs b/proxy/src/serverless/sql_over_http.rs index 307b085ce0..719559ed48 100644 --- a/proxy/src/serverless/sql_over_http.rs +++ b/proxy/src/serverless/sql_over_http.rs @@ -1,4 +1,3 @@ -use std::net::IpAddr; use std::sync::Arc; use anyhow::bail; @@ -14,6 +13,7 @@ use hyper::{Body, HeaderMap, Request}; use serde_json::json; use serde_json::Map; use serde_json::Value; +use smol_str::SmolStr; use tokio_postgres::error::DbError; use tokio_postgres::types::Kind; use tokio_postgres::types::Type; @@ -28,8 +28,13 @@ use url::Url; use utils::http::error::ApiError; use utils::http::json::json_response; +use crate::auth::backend::ComputeUserInfo; +use crate::auth::endpoint_sni; use crate::config::HttpConfig; +use crate::config::TlsConfig; +use crate::context::RequestMonitoring; use crate::metrics::NUM_CONNECTION_REQUESTS_GAUGE; +use crate::proxy::NeonOptions; use super::conn_pool::ConnInfo; use super::conn_pool::GlobalConnPool; @@ -121,8 +126,10 @@ fn json_array_to_pg_array(value: &Value) -> Option { } fn get_conn_info( + ctx: &mut RequestMonitoring, headers: &HeaderMap, 
sni_hostname: Option, + tls: &TlsConfig, ) -> Result { let connection_string = headers .get("Neon-Connection-String") @@ -146,10 +153,11 @@ fn get_conn_info( .next() .ok_or(anyhow::anyhow!("invalid database name"))?; - let username = connection_url.username(); + let username = SmolStr::from(connection_url.username()); if username.is_empty() { return Err(anyhow::anyhow!("missing username")); } + ctx.set_user(username.clone()); let password = connection_url .password() @@ -176,45 +184,47 @@ fn get_conn_info( } } + let endpoint = endpoint_sni(hostname, &tls.common_names)?; + + let endpoint: SmolStr = endpoint.into(); + ctx.set_endpoint_id(Some(endpoint.clone())); + let pairs = connection_url.query_pairs(); let mut options = Option::None; for (key, value) in pairs { if key == "options" { - options = Some(value.into()); + options = Some(NeonOptions::parse_options_raw(&value)); break; } } + let user_info = ComputeUserInfo { + endpoint, + user: username, + options: options.unwrap_or_default(), + }; + Ok(ConnInfo { - username: username.into(), + user_info, dbname: dbname.into(), - hostname: hostname.into(), password: password.into(), - options, }) } // TODO: return different http error codes pub async fn handle( + tls: &'static TlsConfig, + config: &'static HttpConfig, + ctx: &mut RequestMonitoring, request: Request, sni_hostname: Option, conn_pool: Arc, - session_id: uuid::Uuid, - peer_addr: IpAddr, - config: &'static HttpConfig, ) -> Result, ApiError> { let result = tokio::time::timeout( - config.timeout, - handle_inner( - config, - request, - sni_hostname, - conn_pool, - session_id, - peer_addr, - ), + config.request_timeout, + handle_inner(tls, config, ctx, request, sni_hostname, conn_pool), ) .await; let mut response = match result { @@ -278,7 +288,7 @@ pub async fn handle( Err(_) => { let message = format!( "HTTP-Connection timed out, execution time exeeded {} seconds", - config.timeout.as_secs() + config.request_timeout.as_secs() ); error!(message); json_response( 
@@ -296,12 +306,12 @@ pub async fn handle( #[instrument(name = "sql-over-http", fields(pid = tracing::field::Empty), skip_all)] async fn handle_inner( + tls: &'static TlsConfig, config: &'static HttpConfig, + ctx: &mut RequestMonitoring, request: Request, sni_hostname: Option, conn_pool: Arc, - session_id: uuid::Uuid, - peer_addr: IpAddr, ) -> anyhow::Result> { let _request_gauge = NUM_CONNECTION_REQUESTS_GAUGE .with_label_values(&["http"]) @@ -311,7 +321,7 @@ async fn handle_inner( // Determine the destination and connection params // let headers = request.headers(); - let conn_info = get_conn_info(headers, sni_hostname)?; + let conn_info = get_conn_info(ctx, headers, sni_hostname, tls)?; // Determine the output options. Default behaviour is 'false'. Anything that is not // strictly 'true' assumed to be false. @@ -320,7 +330,8 @@ async fn handle_inner( // Allow connection pooling only if explicitly requested // or if we have decided that http pool is no longer opt-in - let allow_pool = !config.pool_opt_in || headers.get(&ALLOW_POOL) == Some(&HEADER_VALUE_TRUE); + let allow_pool = + !config.pool_options.opt_in || headers.get(&ALLOW_POOL) == Some(&HEADER_VALUE_TRUE); // isolation level, read only and deferrable @@ -339,10 +350,12 @@ async fn handle_inner( let txn_read_only = headers.get(&TXN_READ_ONLY) == Some(&HEADER_VALUE_TRUE); let txn_deferrable = headers.get(&TXN_DEFERRABLE) == Some(&HEADER_VALUE_TRUE); + let paused = ctx.latency_timer.pause(); let request_content_length = match request.body().size_hint().upper() { Some(v) => v, None => MAX_REQUEST_SIZE + 1, }; + drop(paused); // we don't have a streaming request support yet so this is to prevent OOM // from a malicious user sending an extremely large request body @@ -358,9 +371,7 @@ async fn handle_inner( let body = hyper::body::to_bytes(request.into_body()).await?; let payload: Payload = serde_json::from_slice(&body)?; - let mut client = conn_pool - .get(&conn_info, !allow_pool, session_id, peer_addr) - 
.await?; + let mut client = conn_pool.get(ctx, conn_info, !allow_pool).await?; let mut response = Response::builder() .status(StatusCode::OK) @@ -448,6 +459,7 @@ async fn handle_inner( } }; + ctx.log(); let metrics = client.metrics(); // how could this possibly fail diff --git a/proxy/src/serverless/websocket.rs b/proxy/src/serverless/websocket.rs index 071add3bca..a6529c920a 100644 --- a/proxy/src/serverless/websocket.rs +++ b/proxy/src/serverless/websocket.rs @@ -1,6 +1,7 @@ use crate::{ cancellation::CancelMap, config::ProxyConfig, + context::RequestMonitoring, error::io_error, proxy::{handle_client, ClientMode}, rate_limiter::EndpointRateLimiter, @@ -12,7 +13,6 @@ use hyper_tungstenite::{tungstenite::Message, HyperWebsocket, WebSocketStream}; use pin_project_lite::pin_project; use std::{ - net::IpAddr, pin::Pin, sync::Arc, task::{ready, Context, Poll}, @@ -130,22 +130,20 @@ impl AsyncBufRead for WebSocketRw { } pub async fn serve_websocket( - websocket: HyperWebsocket, config: &'static ProxyConfig, + ctx: &mut RequestMonitoring, + websocket: HyperWebsocket, cancel_map: &CancelMap, - session_id: uuid::Uuid, hostname: Option, - peer_addr: IpAddr, endpoint_rate_limiter: Arc, ) -> anyhow::Result<()> { let websocket = websocket.await?; handle_client( config, + ctx, cancel_map, - session_id, WebSocketRw::new(websocket), ClientMode::Websockets { hostname }, - peer_addr, endpoint_rate_limiter, ) .await?; diff --git a/pyproject.toml b/pyproject.toml index 401acaeba4..bb04123e05 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -6,7 +6,7 @@ authors = [] [tool.poetry.dependencies] python = "^3.9" -pytest = "^7.3.1" +pytest = "^7.4.4" psycopg2-binary = "^2.9.6" typing-extensions = "^4.6.1" PyJWT = {version = "^2.1.0", extras = ["crypto"]} @@ -17,7 +17,7 @@ aiopg = "^1.4.0" Jinja2 = "^3.0.2" types-requests = "^2.31.0.0" types-psycopg2 = "^2.9.21.10" -boto3 = "^1.26.16" +boto3 = "^1.34.11" boto3-stubs = {extras = ["s3"], version = "^1.26.16"} moto = {extras = ["server"], 
version = "^4.1.2"} backoff = "^2.2.1" @@ -40,22 +40,13 @@ pytest-split = "^0.8.1" zstandard = "^0.21.0" [tool.poetry.group.dev.dependencies] -black = "^23.3.0" mypy = "==1.3.0" -ruff = "^0.0.269" +ruff = "^0.1.11" [build-system] requires = ["poetry-core>=1.0.0"] build-backend = "poetry.core.masonry.api" -[tool.black] -line-length = 100 -extend-exclude = ''' -/( - vendor -)/ -''' - [tool.mypy] exclude = "^vendor/" check_untyped_defs = true @@ -82,7 +73,9 @@ ignore_missing_imports = true [tool.ruff] target-version = "py39" extend-exclude = ["vendor/"] -ignore = ["E501"] +ignore = [ + "E501", # Line too long, we don't want to be too strict about it +] select = [ "E", # pycodestyle "F", # Pyflakes @@ -90,3 +83,4 @@ select = [ "W", # pycodestyle "B", # bugbear ] +line-length = 100 # this setting is rather guidance, it won't fail if it can't make the shorter diff --git a/rust-toolchain.toml b/rust-toolchain.toml index b2cd21d85c..9b5a965f7d 100644 --- a/rust-toolchain.toml +++ b/rust-toolchain.toml @@ -1,5 +1,5 @@ [toolchain] -channel = "1.74.0" +channel = "1.75.0" profile = "default" # The default profile includes rustc, rust-std, cargo, rust-docs, rustfmt and clippy. # https://rust-lang.github.io/rustup/concepts/profiles.html diff --git a/s3_scrubber/Cargo.toml b/s3_scrubber/Cargo.toml index fdae378d55..4d136472e0 100644 --- a/s3_scrubber/Cargo.toml +++ b/s3_scrubber/Cargo.toml @@ -6,6 +6,7 @@ license.workspace = true [dependencies] aws-sdk-s3.workspace = true +aws-smithy-async.workspace = true either.workspace = true tokio-rustls.workspace = true anyhow.workspace = true @@ -39,3 +40,5 @@ tracing-subscriber.workspace = true clap.workspace = true tracing-appender = "0.2" histogram = "0.7" + +futures.workspace = true diff --git a/s3_scrubber/src/garbage.rs b/s3_scrubber/src/garbage.rs index 7192afb91b..93bb115883 100644 --- a/s3_scrubber/src/garbage.rs +++ b/s3_scrubber/src/garbage.rs @@ -2,7 +2,10 @@ //! 
S3 objects which are either not referenced by any metadata, or are referenced by a //! control plane tenant/timeline in a deleted state. -use std::{collections::HashMap, sync::Arc}; +use std::{ + collections::{HashMap, HashSet}, + sync::Arc, +}; use anyhow::Context; use aws_sdk_s3::{ @@ -118,6 +121,13 @@ const S3_CONCURRENCY: usize = 32; // How many concurrent API requests to make to the console API. const CONSOLE_CONCURRENCY: usize = 128; +struct ConsoleCache { + /// Set of tenants found in the control plane API + projects: HashMap, + /// Set of tenants for which the control plane API returned 404 + not_found: HashSet, +} + async fn find_garbage_inner( bucket_config: BucketConfig, console_config: ConsoleConfig, @@ -143,23 +153,49 @@ async fn find_garbage_inner( console_projects.len() ); - // TODO(sharding): batch calls into Console so that we only call once for each TenantId, - // rather than checking the same TenantId for multiple TenantShardId + // Because many tenant shards may look up the same TenantId, we maintain a cache. 
+ let console_cache = Arc::new(std::sync::Mutex::new(ConsoleCache { + projects: console_projects, + not_found: HashSet::new(), + })); // Enumerate Tenants in S3, and check if each one exists in Console tracing::info!("Finding all tenants in bucket {}...", bucket_config.bucket); let tenants = stream_tenants(&s3_client, &target); let tenants_checked = tenants.map_ok(|t| { let api_client = cloud_admin_api_client.clone(); - let console_projects = &console_projects; + let console_cache = console_cache.clone(); async move { - match console_projects.get(&t.tenant_id) { + // Check cache before issuing API call + let project_data = { + let cache = console_cache.lock().unwrap(); + let result = cache.projects.get(&t.tenant_id).cloned(); + if result.is_none() && cache.not_found.contains(&t.tenant_id) { + return Ok((t, None)); + } + result + }; + + match project_data { Some(project_data) => Ok((t, Some(project_data.clone()))), - None => api_client - .find_tenant_project(t.tenant_id) - .await - .map_err(|e| anyhow::anyhow!(e)) - .map(|r| (t, r)), + None => { + let project_data = api_client + .find_tenant_project(t.tenant_id) + .await + .map_err(|e| anyhow::anyhow!(e)); + + // Populate cache with result of API call + { + let mut cache = console_cache.lock().unwrap(); + if let Ok(Some(project_data)) = &project_data { + cache.projects.insert(t.tenant_id, project_data.clone()); + } else if let Ok(None) = &project_data { + cache.not_found.insert(t.tenant_id); + } + } + + project_data.map(|r| (t, r)) + } } } }); diff --git a/s3_scrubber/src/lib.rs b/s3_scrubber/src/lib.rs index d2338c21e5..d2842877d0 100644 --- a/s3_scrubber/src/lib.rs +++ b/s3_scrubber/src/lib.rs @@ -15,10 +15,13 @@ use anyhow::Context; use aws_config::environment::EnvironmentVariableCredentialsProvider; use aws_config::imds::credentials::ImdsCredentialsProvider; use aws_config::meta::credentials::CredentialsProviderChain; +use aws_config::profile::ProfileFileCredentialsProvider; +use aws_config::retry::RetryConfig; 
use aws_config::sso::SsoCredentialsProvider; use aws_config::BehaviorVersion; -use aws_sdk_s3::config::Region; +use aws_sdk_s3::config::{AsyncSleep, Region, SharedAsyncSleep}; use aws_sdk_s3::{Client, Config}; +use aws_smithy_async::rt::sleep::TokioSleep; use clap::ValueEnum; use pageserver::tenant::TENANTS_SEGMENT_NAME; @@ -255,6 +258,11 @@ pub fn init_s3_client(account_id: Option, bucket_region: Region) -> Clie let chain = CredentialsProviderChain::first_try( "env", EnvironmentVariableCredentialsProvider::new(), + ) + // uses "AWS_PROFILE" / `aws sso login --profile ` + .or_else( + "profile-sso", + ProfileFileCredentialsProvider::builder().build(), ); // Use SSO if we were given an account ID @@ -265,7 +273,7 @@ pub fn init_s3_client(account_id: Option, bucket_region: Region) -> Clie .account_id(sso_account) .role_name("PowerUserAccess") .start_url("https://neondb.awsapps.com/start") - .region(Region::from_static("eu-central-1")) + .region(bucket_region.clone()) .build(), ), None => chain, @@ -277,9 +285,13 @@ pub fn init_s3_client(account_id: Option, bucket_region: Region) -> Clie ) }; + let sleep_impl: Arc = Arc::new(TokioSleep::new()); + let mut builder = Config::builder() .behavior_version(BehaviorVersion::v2023_11_09()) .region(bucket_region) + .retry_config(RetryConfig::adaptive().with_max_attempts(3)) + .sleep_impl(SharedAsyncSleep::from(sleep_impl)) .credentials_provider(credentials_provider); if let Ok(endpoint) = env::var("AWS_ENDPOINT_URL") { diff --git a/s3_scrubber/src/main.rs b/s3_scrubber/src/main.rs index ef020edc2a..957213856b 100644 --- a/s3_scrubber/src/main.rs +++ b/s3_scrubber/src/main.rs @@ -1,3 +1,4 @@ +use pageserver_api::shard::TenantShardId; use s3_scrubber::garbage::{find_garbage, purge_garbage, PurgeMode}; use s3_scrubber::scan_metadata::scan_metadata; use s3_scrubber::{init_logging, BucketConfig, ConsoleConfig, NodeKind, TraversingDepth}; @@ -34,6 +35,8 @@ enum Command { ScanMetadata { #[arg(short, long, default_value_t = false)] 
json: bool, + #[arg(long = "tenant-id", num_args = 0..)] + tenant_ids: Vec, }, } @@ -57,35 +60,37 @@ async fn main() -> anyhow::Result<()> { )); match cli.command { - Command::ScanMetadata { json } => match scan_metadata(bucket_config.clone()).await { - Err(e) => { - tracing::error!("Failed: {e}"); - Err(e) - } - Ok(summary) => { - if json { - println!("{}", serde_json::to_string(&summary).unwrap()) - } else { - println!("{}", summary.summary_string()); + Command::ScanMetadata { json, tenant_ids } => { + match scan_metadata(bucket_config.clone(), tenant_ids).await { + Err(e) => { + tracing::error!("Failed: {e}"); + Err(e) } - if summary.is_fatal() { - Err(anyhow::anyhow!("Fatal scrub errors detected")) - } else if summary.is_empty() { - // Strictly speaking an empty bucket is a valid bucket, but if someone ran the - // scrubber they were likely expecting to scan something, and if we see no timelines - // at all then it's likely due to some configuration issues like a bad prefix - Err(anyhow::anyhow!( - "No timelines found in bucket {} prefix {}", - bucket_config.bucket, - bucket_config - .prefix_in_bucket - .unwrap_or("".to_string()) - )) - } else { - Ok(()) + Ok(summary) => { + if json { + println!("{}", serde_json::to_string(&summary).unwrap()) + } else { + println!("{}", summary.summary_string()); + } + if summary.is_fatal() { + Err(anyhow::anyhow!("Fatal scrub errors detected")) + } else if summary.is_empty() { + // Strictly speaking an empty bucket is a valid bucket, but if someone ran the + // scrubber they were likely expecting to scan something, and if we see no timelines + // at all then it's likely due to some configuration issues like a bad prefix + Err(anyhow::anyhow!( + "No timelines found in bucket {} prefix {}", + bucket_config.bucket, + bucket_config + .prefix_in_bucket + .unwrap_or("".to_string()) + )) + } else { + Ok(()) + } } } - }, + } Command::FindGarbage { node_kind, depth, diff --git a/s3_scrubber/src/scan_metadata.rs 
b/s3_scrubber/src/scan_metadata.rs index bcc4d2e618..4b63bb3884 100644 --- a/s3_scrubber/src/scan_metadata.rs +++ b/s3_scrubber/src/scan_metadata.rs @@ -17,7 +17,9 @@ use utils::id::TenantId; #[derive(Serialize)] pub struct MetadataSummary { - count: usize, + tenant_count: usize, + timeline_count: usize, + timeline_shard_count: usize, with_errors: HashSet, with_warnings: HashSet, with_orphans: HashSet, @@ -87,7 +89,9 @@ impl MinMaxHisto { impl MetadataSummary { fn new() -> Self { Self { - count: 0, + tenant_count: 0, + timeline_count: 0, + timeline_shard_count: 0, with_errors: HashSet::new(), with_warnings: HashSet::new(), with_orphans: HashSet::new(), @@ -112,7 +116,7 @@ impl MetadataSummary { } fn update_data(&mut self, data: &S3TimelineBlobData) { - self.count += 1; + self.timeline_shard_count += 1; if let BlobDataParseResult::Parsed { index_part, index_part_generation: _, @@ -158,16 +162,20 @@ impl MetadataSummary { ); format!( - "Timelines: {0} -With errors: {1} -With warnings: {2} -With orphan layers: {3} + "Tenants: {} +Timelines: {} +Timeline-shards: {} +With errors: {} +With warnings: {} +With orphan layers: {} Index versions: {version_summary} -Timeline size bytes: {4} -Layer size bytes: {5} -Timeline layer count: {6} +Timeline size bytes: {} +Layer size bytes: {} +Timeline layer count: {} ", - self.count, + self.tenant_count, + self.timeline_count, + self.timeline_shard_count, self.with_errors.len(), self.with_warnings.len(), self.with_orphans.len(), @@ -182,15 +190,22 @@ Timeline layer count: {6} } pub fn is_empty(&self) -> bool { - self.count == 0 + self.timeline_shard_count == 0 } } /// Scan the pageserver metadata in an S3 bucket, reporting errors and statistics. 
-pub async fn scan_metadata(bucket_config: BucketConfig) -> anyhow::Result { +pub async fn scan_metadata( + bucket_config: BucketConfig, + tenant_ids: Vec, +) -> anyhow::Result { let (s3_client, target) = init_remote(bucket_config, NodeKind::Pageserver)?; - let tenants = stream_tenants(&s3_client, &target); + let tenants = if tenant_ids.is_empty() { + futures::future::Either::Left(stream_tenants(&s3_client, &target)) + } else { + futures::future::Either::Right(futures::stream::iter(tenant_ids.into_iter().map(Ok))) + }; // How many tenants to process in parallel. We need to be mindful of pageservers // accessing the same per tenant prefixes, so use a lower setting than pageservers. @@ -226,8 +241,12 @@ pub async fn scan_metadata(bucket_config: BucketConfig) -> anyhow::Result, ) { + summary.tenant_count += 1; + + let mut timeline_ids = HashSet::new(); let mut timeline_generations = HashMap::new(); for (ttid, data) in timelines { + timeline_ids.insert(ttid.timeline_id); // Stash the generation of each timeline, for later use identifying orphan layers if let BlobDataParseResult::Parsed { index_part: _index_part, @@ -245,6 +264,8 @@ pub async fn scan_metadata(bucket_config: BucketConfig) -> anyhow::Result String { + format!( + "{GIT_VERSION} failpoints: {}, features: {:?}", + fail::has_failpoints(), + FEATURES, + ) +} + const ABOUT: &str = r#" A fleet of safekeepers is responsible for reliably storing WAL received from compute, passing it through consensus (mitigating potential computes brain @@ -167,7 +180,9 @@ async fn main() -> anyhow::Result<()> { // getting 'argument cannot be used multiple times' error. This seems to be // impossible with pure Derive API, so convert struct to Command, modify it, // parse arguments, and then fill the struct back. 
- let cmd = ::command().args_override_self(true); + let cmd = ::command() + .args_override_self(true) + .version(version()); let mut matches = cmd.get_matches(); let mut args = ::from_arg_matches_mut(&mut matches)?; diff --git a/safekeeper/src/control_file.rs b/safekeeper/src/control_file.rs index 7aadd67ac6..591bfea182 100644 --- a/safekeeper/src/control_file.rs +++ b/safekeeper/src/control_file.rs @@ -66,12 +66,10 @@ impl FileStorage { /// Create file storage for a new timeline, but don't persist it yet. pub fn create_new( - ttid: &TenantTimelineId, + timeline_dir: Utf8PathBuf, conf: &SafeKeeperConf, state: SafeKeeperState, ) -> Result { - let timeline_dir = conf.timeline_dir(ttid); - let store = FileStorage { timeline_dir, conf: conf.clone(), @@ -277,7 +275,8 @@ mod test { .await .expect("failed to create timeline dir"); let state = SafeKeeperState::empty(); - let storage = FileStorage::create_new(ttid, conf, state.clone())?; + let timeline_dir = conf.timeline_dir(ttid); + let storage = FileStorage::create_new(timeline_dir, conf, state.clone())?; Ok((storage, state)) } diff --git a/safekeeper/src/copy_timeline.rs b/safekeeper/src/copy_timeline.rs new file mode 100644 index 0000000000..ef88eb27e3 --- /dev/null +++ b/safekeeper/src/copy_timeline.rs @@ -0,0 +1,250 @@ +use std::sync::Arc; + +use anyhow::{bail, Result}; +use camino::Utf8PathBuf; + +use postgres_ffi::{MAX_SEND_SIZE, WAL_SEGMENT_SIZE}; +use tokio::{ + fs::OpenOptions, + io::{AsyncSeekExt, AsyncWriteExt}, +}; +use tracing::{info, warn}; +use utils::{id::TenantTimelineId, lsn::Lsn}; + +use crate::{ + control_file::{FileStorage, Storage}, + pull_timeline::{create_temp_timeline_dir, load_temp_timeline, validate_temp_timeline}, + safekeeper::SafeKeeperState, + timeline::{Timeline, TimelineError}, + wal_backup::copy_s3_segments, + wal_storage::{wal_file_paths, WalReader}, + GlobalTimelines, SafeKeeperConf, +}; + +// we don't want to have more than 10 segments on disk after copy, because they take space 
+const MAX_BACKUP_LAG: u64 = 10 * WAL_SEGMENT_SIZE as u64; + +pub struct Request { + pub source: Arc, + pub until_lsn: Lsn, + pub destination_ttid: TenantTimelineId, +} + +pub async fn handle_request(request: Request) -> Result<()> { + // TODO: request.until_lsn MUST be a valid LSN, and we cannot check it :( + // if LSN will point to the middle of a WAL record, timeline will be in "broken" state + + match GlobalTimelines::get(request.destination_ttid) { + // timeline already exists. would be good to check that this timeline is the copy + // of the source timeline, but it isn't obvious how to do that + Ok(_) => return Ok(()), + // timeline not found, we are going to create it + Err(TimelineError::NotFound(_)) => {} + // error, probably timeline was deleted + res => { + res?; + } + } + + let conf = &GlobalTimelines::get_global_config(); + let ttid = request.destination_ttid; + + let (_tmp_dir, tli_dir_path) = create_temp_timeline_dir(conf, ttid).await?; + + let (mem_state, state) = request.source.get_state().await; + let start_lsn = state.timeline_start_lsn; + if start_lsn == Lsn::INVALID { + bail!("timeline is not initialized"); + } + let backup_lsn = mem_state.backup_lsn; + + { + let commit_lsn = mem_state.commit_lsn; + let flush_lsn = request.source.get_flush_lsn().await; + + info!( + "collected info about source timeline: start_lsn={}, backup_lsn={}, commit_lsn={}, flush_lsn={}", + start_lsn, backup_lsn, commit_lsn, flush_lsn + ); + + assert!(backup_lsn >= start_lsn); + assert!(commit_lsn >= start_lsn); + assert!(flush_lsn >= start_lsn); + + if request.until_lsn > flush_lsn { + bail!("requested LSN is beyond the end of the timeline"); + } + if request.until_lsn < start_lsn { + bail!("requested LSN is before the start of the timeline"); + } + + if request.until_lsn > commit_lsn { + warn!("copy_timeline WAL is not fully committed"); + } + + if backup_lsn < request.until_lsn && request.until_lsn.0 - backup_lsn.0 > MAX_BACKUP_LAG { + // we have a lot of segments that 
are not backed up. we can try to wait here until + // segments will be backed up to remote storage, but it's not clear how long to wait + bail!("too many segments are not backed up"); + } + } + + let wal_seg_size = state.server.wal_seg_size as usize; + if wal_seg_size == 0 { + bail!("wal_seg_size is not set"); + } + + let first_segment = start_lsn.segment_number(wal_seg_size); + let last_segment = request.until_lsn.segment_number(wal_seg_size); + + let new_backup_lsn = { + // we can't have new backup_lsn greater than existing backup_lsn or start of the last segment + let max_backup_lsn = backup_lsn.min(Lsn(last_segment * wal_seg_size as u64)); + + if max_backup_lsn <= start_lsn { + // probably we are starting from the first segment, which was not backed up yet. + // note that start_lsn can be in the middle of the segment + start_lsn + } else { + // we have some segments backed up, so we will assume all WAL below max_backup_lsn is backed up + assert!(max_backup_lsn.segment_offset(wal_seg_size) == 0); + max_backup_lsn + } + }; + + // all previous segments will be copied inside S3 + let first_ondisk_segment = new_backup_lsn.segment_number(wal_seg_size); + assert!(first_ondisk_segment <= last_segment); + assert!(first_ondisk_segment >= first_segment); + + copy_s3_segments( + wal_seg_size, + &request.source.ttid, + &request.destination_ttid, + first_segment, + first_ondisk_segment, + ) + .await?; + + copy_disk_segments( + conf, + &state, + wal_seg_size, + &request.source.ttid, + new_backup_lsn, + request.until_lsn, + &tli_dir_path, + ) + .await?; + + let mut new_state = SafeKeeperState::new( + &request.destination_ttid, + state.server.clone(), + vec![], + request.until_lsn, + start_lsn, + ); + new_state.timeline_start_lsn = start_lsn; + new_state.peer_horizon_lsn = request.until_lsn; + new_state.backup_lsn = new_backup_lsn; + + let mut file_storage = FileStorage::create_new(tli_dir_path.clone(), conf, new_state.clone())?; + file_storage.persist(&new_state).await?; + + 
// now we have a ready timeline in a temp directory + validate_temp_timeline(conf, request.destination_ttid, &tli_dir_path).await?; + load_temp_timeline(conf, request.destination_ttid, &tli_dir_path).await?; + + Ok(()) +} + +async fn copy_disk_segments( + conf: &SafeKeeperConf, + persisted_state: &SafeKeeperState, + wal_seg_size: usize, + source_ttid: &TenantTimelineId, + start_lsn: Lsn, + end_lsn: Lsn, + tli_dir_path: &Utf8PathBuf, +) -> Result<()> { + let mut wal_reader = WalReader::new( + conf.workdir.clone(), + conf.timeline_dir(source_ttid), + persisted_state, + start_lsn, + true, + )?; + + let mut buf = [0u8; MAX_SEND_SIZE]; + + let first_segment = start_lsn.segment_number(wal_seg_size); + let last_segment = end_lsn.segment_number(wal_seg_size); + + for segment in first_segment..=last_segment { + let segment_start = segment * wal_seg_size as u64; + let segment_end = segment_start + wal_seg_size as u64; + + let copy_start = segment_start.max(start_lsn.0); + let copy_end = segment_end.min(end_lsn.0); + + let copy_start = copy_start - segment_start; + let copy_end = copy_end - segment_start; + + let wal_file_path = { + let (normal, partial) = wal_file_paths(tli_dir_path, segment, wal_seg_size)?; + + if segment == last_segment { + partial + } else { + normal + } + }; + + write_segment( + &mut buf, + &wal_file_path, + wal_seg_size as u64, + copy_start, + copy_end, + &mut wal_reader, + ) + .await?; + } + + Ok(()) +} + +async fn write_segment( + buf: &mut [u8], + file_path: &Utf8PathBuf, + wal_seg_size: u64, + from: u64, + to: u64, + reader: &mut WalReader, +) -> Result<()> { + assert!(from <= to); + assert!(to <= wal_seg_size); + + let mut file = OpenOptions::new() + .create(true) + .write(true) + .open(&file_path) + .await?; + + // maybe fill with zeros, as in wal_storage.rs? 
+ file.set_len(wal_seg_size).await?; + file.seek(std::io::SeekFrom::Start(from)).await?; + + let mut bytes_left = to - from; + while bytes_left > 0 { + let len = bytes_left as usize; + let len = len.min(buf.len()); + let len = reader.read(&mut buf[..len]).await?; + file.write_all(&buf[..len]).await?; + bytes_left -= len as u64; + } + + file.flush().await?; + file.sync_all().await?; + Ok(()) +} diff --git a/safekeeper/src/debug_dump.rs b/safekeeper/src/debug_dump.rs index daf9255ecb..c9ff1afdea 100644 --- a/safekeeper/src/debug_dump.rs +++ b/safekeeper/src/debug_dump.rs @@ -7,13 +7,16 @@ use std::io::Read; use std::path::PathBuf; use std::sync::Arc; +use anyhow::bail; use anyhow::Result; use camino::Utf8Path; use chrono::{DateTime, Utc}; use postgres_ffi::XLogSegNo; +use postgres_ffi::MAX_SEND_SIZE; use serde::Deserialize; use serde::Serialize; +use sha2::{Digest, Sha256}; use utils::id::NodeId; use utils::id::TenantTimelineId; use utils::id::{TenantId, TimelineId}; @@ -25,6 +28,7 @@ use crate::safekeeper::TermHistory; use crate::SafeKeeperConf; use crate::send_wal::WalSenderState; +use crate::wal_storage::WalReader; use crate::GlobalTimelines; /// Various filters that influence the resulting JSON output. 
@@ -300,3 +304,56 @@ fn build_config(config: SafeKeeperConf) -> Config { wal_backup_enabled: config.wal_backup_enabled, } } + +#[derive(Debug, Clone, Deserialize, Serialize)] +pub struct TimelineDigestRequest { + pub from_lsn: Lsn, + pub until_lsn: Lsn, +} + +#[derive(Debug, Serialize, Deserialize)] +pub struct TimelineDigest { + pub sha256: String, +} + +pub async fn calculate_digest( + tli: &Arc, + request: TimelineDigestRequest, +) -> Result { + if request.from_lsn > request.until_lsn { + bail!("from_lsn is greater than until_lsn"); + } + + let conf = GlobalTimelines::get_global_config(); + let (_, persisted_state) = tli.get_state().await; + + if persisted_state.timeline_start_lsn > request.from_lsn { + bail!("requested LSN is before the start of the timeline"); + } + + let mut wal_reader = WalReader::new( + conf.workdir.clone(), + tli.timeline_dir.clone(), + &persisted_state, + request.from_lsn, + true, + )?; + + let mut hasher = Sha256::new(); + let mut buf = [0u8; MAX_SEND_SIZE]; + + let mut bytes_left = (request.until_lsn.0 - request.from_lsn.0) as usize; + while bytes_left > 0 { + let bytes_to_read = std::cmp::min(buf.len(), bytes_left); + let bytes_read = wal_reader.read(&mut buf[..bytes_to_read]).await?; + if bytes_read == 0 { + bail!("wal_reader.read returned 0 bytes"); + } + hasher.update(&buf[..bytes_read]); + bytes_left -= bytes_read; + } + + let digest = hasher.finalize(); + let digest = hex::encode(digest); + Ok(TimelineDigest { sha256: digest }) +} diff --git a/safekeeper/src/http/routes.rs b/safekeeper/src/http/routes.rs index c48b5330b3..5283ea19c1 100644 --- a/safekeeper/src/http/routes.rs +++ b/safekeeper/src/http/routes.rs @@ -2,7 +2,7 @@ use hyper::{Body, Request, Response, StatusCode, Uri}; use once_cell::sync::Lazy; use postgres_ffi::WAL_SEGMENT_SIZE; -use safekeeper_api::models::SkTimelineInfo; +use safekeeper_api::models::{SkTimelineInfo, TimelineCopyRequest}; use serde::{Deserialize, Serialize}; use std::collections::{HashMap, HashSet}; 
use std::fmt; @@ -12,19 +12,23 @@ use storage_broker::proto::SafekeeperTimelineInfo; use storage_broker::proto::TenantTimelineId as ProtoTenantTimelineId; use tokio::fs::File; use tokio::io::AsyncReadExt; +use tokio_util::sync::CancellationToken; +use utils::failpoint_support::failpoints_handler; +use utils::http::request::parse_query_param; use std::io::Write as _; use tokio::sync::mpsc; use tokio_stream::wrappers::ReceiverStream; -use tracing::info_span; +use tracing::{info_span, Instrument}; use utils::http::endpoint::{request_span, ChannelWriter}; +use crate::debug_dump::TimelineDigestRequest; use crate::receive_wal::WalReceiverState; use crate::safekeeper::Term; use crate::safekeeper::{ServerInfo, TermLsn}; use crate::send_wal::WalSenderState; use crate::timeline::PeerInfo; -use crate::{debug_dump, pull_timeline}; +use crate::{copy_timeline, debug_dump, pull_timeline}; use crate::timelines_global_map::TimelineDeleteForceResult; use crate::GlobalTimelines; @@ -202,6 +206,56 @@ async fn timeline_pull_handler(mut request: Request) -> Result) -> Result, ApiError> { + check_permission(&request, None)?; + + let request_data: TimelineCopyRequest = json_request(&mut request).await?; + let ttid = TenantTimelineId::new( + parse_request_param(&request, "tenant_id")?, + parse_request_param(&request, "source_timeline_id")?, + ); + + let source = GlobalTimelines::get(ttid)?; + + copy_timeline::handle_request(copy_timeline::Request{ + source, + until_lsn: request_data.until_lsn, + destination_ttid: TenantTimelineId::new(ttid.tenant_id, request_data.target_timeline_id), + }) + .instrument(info_span!("copy_timeline", from=%ttid, to=%request_data.target_timeline_id, until_lsn=%request_data.until_lsn)) + .await + .map_err(ApiError::InternalServerError)?; + + json_response(StatusCode::OK, ()) +} + +async fn timeline_digest_handler(request: Request) -> Result, ApiError> { + let ttid = TenantTimelineId::new( + parse_request_param(&request, "tenant_id")?, + 
parse_request_param(&request, "timeline_id")?, + ); + check_permission(&request, Some(ttid.tenant_id))?; + + let from_lsn: Option = parse_query_param(&request, "from_lsn")?; + let until_lsn: Option = parse_query_param(&request, "until_lsn")?; + + let request = TimelineDigestRequest { + from_lsn: from_lsn.ok_or(ApiError::BadRequest(anyhow::anyhow!( + "from_lsn is required" + )))?, + until_lsn: until_lsn.ok_or(ApiError::BadRequest(anyhow::anyhow!( + "until_lsn is required" + )))?, + }; + + let tli = GlobalTimelines::get(ttid).map_err(ApiError::from)?; + + let response = debug_dump::calculate_digest(&tli, request) + .await + .map_err(ApiError::InternalServerError)?; + json_response(StatusCode::OK, response) +} + /// Download a file from the timeline directory. // TODO: figure out a better way to copy files between safekeepers async fn timeline_files_handler(request: Request) -> Result, ApiError> { @@ -444,6 +498,12 @@ pub fn make_router(conf: SafeKeeperConf) -> RouterBuilder .data(Arc::new(conf)) .data(auth) .get("/v1/status", |r| request_span(r, status_handler)) + .put("/v1/failpoints", |r| { + request_span(r, move |r| async { + let cancel = CancellationToken::new(); + failpoints_handler(r, cancel).await + }) + }) // Will be used in the future instead of implicit timeline creation .post("/v1/tenant/timeline", |r| { request_span(r, timeline_create_handler) @@ -464,11 +524,18 @@ pub fn make_router(conf: SafeKeeperConf) -> RouterBuilder "/v1/tenant/:tenant_id/timeline/:timeline_id/file/:filename", |r| request_span(r, timeline_files_handler), ) + .post( + "/v1/tenant/:tenant_id/timeline/:source_timeline_id/copy", + |r| request_span(r, timeline_copy_handler), + ) // for tests .post("/v1/record_safekeeper_info/:tenant_id/:timeline_id", |r| { request_span(r, record_safekeeper_info) }) .get("/v1/debug_dump", |r| request_span(r, dump_debug_handler)) + .get("/v1/tenant/:tenant_id/timeline/:timeline_id/digest", |r| { + request_span(r, timeline_digest_handler) + }) } 
#[cfg(test)] diff --git a/safekeeper/src/lib.rs b/safekeeper/src/lib.rs index 3a086f1f54..fc5f99eb00 100644 --- a/safekeeper/src/lib.rs +++ b/safekeeper/src/lib.rs @@ -16,6 +16,7 @@ mod auth; pub mod broker; pub mod control_file; pub mod control_file_upgrade; +pub mod copy_timeline; pub mod debug_dump; pub mod handler; pub mod http; diff --git a/safekeeper/src/pull_timeline.rs b/safekeeper/src/pull_timeline.rs index ad3a18a536..93b51f32c0 100644 --- a/safekeeper/src/pull_timeline.rs +++ b/safekeeper/src/pull_timeline.rs @@ -1,16 +1,24 @@ +use std::sync::Arc; + +use camino::Utf8PathBuf; +use camino_tempfile::Utf8TempDir; use chrono::{DateTime, Utc}; use serde::{Deserialize, Serialize}; use anyhow::{bail, Context, Result}; use tokio::io::AsyncWriteExt; use tracing::info; -use utils::id::{TenantId, TenantTimelineId, TimelineId}; +use utils::{ + id::{TenantId, TenantTimelineId, TimelineId}, + lsn::Lsn, +}; use crate::{ control_file, debug_dump, http::routes::TimelineStatus, + timeline::{Timeline, TimelineError}, wal_storage::{self, Storage}, - GlobalTimelines, + GlobalTimelines, SafeKeeperConf, }; /// Info about timeline on safekeeper ready for reporting. 
@@ -91,7 +99,7 @@ pub async fn handle_request(request: Request) -> Result { async fn pull_timeline(status: TimelineStatus, host: String) -> Result { let ttid = TenantTimelineId::new(status.tenant_id, status.timeline_id); info!( - "Pulling timeline {} from safekeeper {}, commit_lsn={}, flush_lsn={}, term={}, epoch={}", + "pulling timeline {} from safekeeper {}, commit_lsn={}, flush_lsn={}, term={}, epoch={}", ttid, host, status.commit_lsn, @@ -121,14 +129,14 @@ async fn pull_timeline(status: TimelineStatus, host: String) -> Result if dump.timelines.len() != 1 { bail!( - "Expected to fetch single timeline, got {} timelines", + "expected to fetch single timeline, got {} timelines", dump.timelines.len() ); } let timeline = dump.timelines.into_iter().next().unwrap(); let disk_content = timeline.disk_content.ok_or(anyhow::anyhow!( - "Timeline {} doesn't have disk content", + "timeline {} doesn't have disk content", ttid ))?; @@ -155,29 +163,12 @@ async fn pull_timeline(status: TimelineStatus, host: String) -> Result filenames.insert(0, "safekeeper.control".to_string()); info!( - "Downloading {} files from safekeeper {}", + "downloading {} files from safekeeper {}", filenames.len(), host ); - // Creating temp directory for a new timeline. It needs to be - // located on the same filesystem as the rest of the timelines. - - // conf.workdir is usually /storage/safekeeper/data - // will try to transform it into /storage/safekeeper/tmp - let temp_base = conf - .workdir - .parent() - .ok_or(anyhow::anyhow!("workdir has no parent"))? - .join("tmp"); - - tokio::fs::create_dir_all(&temp_base).await?; - - let tli_dir = camino_tempfile::Builder::new() - .suffix("_temptli") - .prefix(&format!("{}_{}_", ttid.tenant_id, ttid.timeline_id)) - .tempdir_in(temp_base)?; - let tli_dir_path = tli_dir.path().to_path_buf(); + let (_tmp_dir, tli_dir_path) = create_temp_timeline_dir(conf, ttid).await?; // Note: some time happens between fetching list of files and fetching files themselves. 
// It's possible that some files will be removed from safekeeper and we will fail to fetch them. @@ -201,47 +192,105 @@ async fn pull_timeline(status: TimelineStatus, host: String) -> Result // TODO: fsync? // Let's create timeline from temp directory and verify that it's correct + let (commit_lsn, flush_lsn) = validate_temp_timeline(conf, ttid, &tli_dir_path).await?; + info!( + "finished downloading timeline {}, commit_lsn={}, flush_lsn={}", + ttid, commit_lsn, flush_lsn + ); + assert!(status.commit_lsn <= status.flush_lsn); - let control_path = tli_dir_path.join("safekeeper.control"); + // Finally, load the timeline. + let _tli = load_temp_timeline(conf, ttid, &tli_dir_path).await?; + + Ok(Response { + safekeeper_host: host, + }) +} + +/// Create temp directory for a new timeline. It needs to be located on the same +/// filesystem as the rest of the timelines. It will be automatically deleted when +/// Utf8TempDir goes out of scope. +pub async fn create_temp_timeline_dir( + conf: &SafeKeeperConf, + ttid: TenantTimelineId, +) -> Result<(Utf8TempDir, Utf8PathBuf)> { + // conf.workdir is usually /storage/safekeeper/data + // will try to transform it into /storage/safekeeper/tmp + let temp_base = conf + .workdir + .parent() + .ok_or(anyhow::anyhow!("workdir has no parent"))? + .join("tmp"); + + tokio::fs::create_dir_all(&temp_base).await?; + + let tli_dir = camino_tempfile::Builder::new() + .suffix("_temptli") + .prefix(&format!("{}_{}_", ttid.tenant_id, ttid.timeline_id)) + .tempdir_in(temp_base)?; + + let tli_dir_path = tli_dir.path().to_path_buf(); + + Ok((tli_dir, tli_dir_path)) +} + +/// Do basic validation of a temp timeline, before moving it to the global map. 
+pub async fn validate_temp_timeline( + conf: &SafeKeeperConf, + ttid: TenantTimelineId, + path: &Utf8PathBuf, +) -> Result<(Lsn, Lsn)> { + let control_path = path.join("safekeeper.control"); let control_store = control_file::FileStorage::load_control_file(control_path)?; if control_store.server.wal_seg_size == 0 { bail!("wal_seg_size is not set"); } - let wal_store = - wal_storage::PhysicalStorage::new(&ttid, tli_dir_path.clone(), conf, &control_store)?; + let wal_store = wal_storage::PhysicalStorage::new(&ttid, path.clone(), conf, &control_store)?; - let commit_lsn = status.commit_lsn; + let commit_lsn = control_store.commit_lsn; let flush_lsn = wal_store.flush_lsn(); - info!( - "Finished downloading timeline {}, commit_lsn={}, flush_lsn={}", - ttid, commit_lsn, flush_lsn - ); - assert!(status.commit_lsn <= status.flush_lsn); + Ok((commit_lsn, flush_lsn)) +} + +/// Move timeline from a temp directory to the main storage, and load it to the global map. +/// This operation is done under a lock to prevent bugs if several concurrent requests are +/// trying to load the same timeline. Note that it doesn't guard against creating the +/// timeline with the same ttid, but no one should be doing this anyway. 
+pub async fn load_temp_timeline( + conf: &SafeKeeperConf, + ttid: TenantTimelineId, + tmp_path: &Utf8PathBuf, +) -> Result> { + // Take a lock to prevent concurrent loadings + let load_lock = GlobalTimelines::loading_lock().await; + let guard = load_lock.lock().await; + + if !matches!(GlobalTimelines::get(ttid), Err(TimelineError::NotFound(_))) { + bail!("timeline already exists, cannot overwrite it") + } // Move timeline dir to the correct location let timeline_path = conf.timeline_dir(&ttid); info!( - "Moving timeline {} from {} to {}", - ttid, tli_dir_path, timeline_path + "moving timeline {} from {} to {}", + ttid, tmp_path, timeline_path ); tokio::fs::create_dir_all(conf.tenant_dir(&ttid.tenant_id)).await?; - tokio::fs::rename(tli_dir_path, &timeline_path).await?; + tokio::fs::rename(tmp_path, &timeline_path).await?; - let tli = GlobalTimelines::load_timeline(ttid) + let tli = GlobalTimelines::load_timeline(&guard, ttid) .await .context("Failed to load timeline after copy")?; info!( - "Loaded timeline {}, flush_lsn={}", + "loaded timeline {}, flush_lsn={}", ttid, tli.get_flush_lsn().await ); - Ok(Response { - safekeeper_host: host, - }) + Ok(tli) } diff --git a/safekeeper/src/send_wal.rs b/safekeeper/src/send_wal.rs index 44f14f8c7e..9a5657a40d 100644 --- a/safekeeper/src/send_wal.rs +++ b/safekeeper/src/send_wal.rs @@ -17,6 +17,7 @@ use postgres_ffi::{TimestampTz, MAX_SEND_SIZE}; use pq_proto::{BeMessage, WalSndKeepAlive, XLogDataBody}; use serde::{Deserialize, Serialize}; use tokio::io::{AsyncRead, AsyncWrite}; +use utils::failpoint_support; use utils::id::TenantTimelineId; use utils::lsn::AtomicLsn; use utils::pageserver_feedback::PageserverFeedback; @@ -391,15 +392,8 @@ impl SafekeeperPostgresHandler { // application_name: give only committed WAL (used by pageserver) or all // existing WAL (up to flush_lsn, used by walproposer or peer recovery). // The second case is always driven by a consensus leader which term - // must generally be also supplied. 
However we're sloppy to do this in - // walproposer recovery which will be removed soon. So TODO is to make - // it not Option'al then. - // - // Fetching WAL without term in recovery creates a small risk of this - // WAL getting concurrently garbaged if another compute rises which - // collects majority and starts fixing log on this safekeeper itself. - // That's ok as (old) proposer will never be able to commit such WAL. - let end_watch = if self.is_walproposer_recovery() { + // must be supplied. + let end_watch = if term.is_some() { EndWatch::Flush(tli.get_term_flush_lsn_watch_rx()) } else { EndWatch::Commit(tli.get_commit_lsn_watch_rx()) @@ -535,12 +529,19 @@ impl WalSender<'_, IO> { ); // try to send as much as available, capped by MAX_SEND_SIZE - let mut send_size = self - .end_pos - .checked_sub(self.start_pos) - .context("reading wal without waiting for it first")? - .0 as usize; - send_size = min(send_size, self.send_buf.len()); + let mut chunk_end_pos = self.start_pos + MAX_SEND_SIZE as u64; + // if we went behind available WAL, back off + if chunk_end_pos >= self.end_pos { + chunk_end_pos = self.end_pos; + } else { + // If sending not up to end pos, round down to page boundary to + // avoid breaking WAL record not at page boundary, as protocol + // demands. See walsender.c (XLogSendPhysical). + chunk_end_pos = chunk_end_pos + .checked_sub(chunk_end_pos.block_offset()) + .unwrap(); + } + let send_size = (chunk_end_pos.0 - self.start_pos.0) as usize; let send_buf = &mut self.send_buf[..send_size]; let send_size: usize; { @@ -551,7 +552,8 @@ impl WalSender<'_, IO> { } else { None }; - // read wal into buffer + // Read WAL into buffer. send_size can be additionally capped to + // segment boundary here. send_size = self.wal_reader.read(send_buf).await? 
}; let send_buf = &send_buf[..send_size]; @@ -566,6 +568,11 @@ impl WalSender<'_, IO> { })) .await?; + if let Some(appname) = &self.appname { + if appname == "replica" { + failpoint_support::sleep_millis_async!("sk-send-wal-replica-sleep"); + } + } trace!( "sent {} bytes of WAL {}-{}", send_size, diff --git a/safekeeper/src/timeline.rs b/safekeeper/src/timeline.rs index bdc9088138..2f284abe8c 100644 --- a/safekeeper/src/timeline.rs +++ b/safekeeper/src/timeline.rs @@ -141,7 +141,8 @@ impl SharedState { // We don't want to write anything to disk, because we may have existing timeline there. // These functions should not change anything on disk. - let control_store = control_file::FileStorage::create_new(ttid, conf, state)?; + let timeline_dir = conf.timeline_dir(ttid); + let control_store = control_file::FileStorage::create_new(timeline_dir, conf, state)?; let wal_store = wal_storage::PhysicalStorage::new(ttid, conf.timeline_dir(ttid), conf, &control_store)?; let sk = SafeKeeper::new(control_store, wal_store, conf.my_id)?; diff --git a/safekeeper/src/timelines_global_map.rs b/safekeeper/src/timelines_global_map.rs index cbb3342e40..92ac5ba66d 100644 --- a/safekeeper/src/timelines_global_map.rs +++ b/safekeeper/src/timelines_global_map.rs @@ -21,8 +21,12 @@ struct GlobalTimelinesState { timelines: HashMap>, wal_backup_launcher_tx: Option>, conf: Option, + load_lock: Arc>, } +// Used to prevent concurrent timeline loading. +pub struct TimelineLoadLock; + impl GlobalTimelinesState { /// Get configuration, which must be set once during init. fn get_conf(&self) -> &SafeKeeperConf { @@ -63,6 +67,7 @@ static TIMELINES_STATE: Lazy> = Lazy::new(|| { timelines: HashMap::new(), wal_backup_launcher_tx: None, conf: None, + load_lock: Arc::new(tokio::sync::Mutex::new(TimelineLoadLock)), }) }); @@ -174,8 +179,16 @@ impl GlobalTimelines { Ok(()) } + /// Take a lock for timeline loading. 
+ pub async fn loading_lock() -> Arc> { + TIMELINES_STATE.lock().unwrap().load_lock.clone() + } + /// Load timeline from disk to the memory. - pub async fn load_timeline(ttid: TenantTimelineId) -> Result> { + pub async fn load_timeline<'a>( + _guard: &tokio::sync::MutexGuard<'a, TimelineLoadLock>, + ttid: TenantTimelineId, + ) -> Result> { let (conf, wal_backup_launcher_tx) = TIMELINES_STATE.lock().unwrap().get_dependencies(); match Timeline::load_timeline(&conf, ttid, wal_backup_launcher_tx) { diff --git a/safekeeper/src/wal_backup.rs b/safekeeper/src/wal_backup.rs index c99bbc7d61..e4499eaf50 100644 --- a/safekeeper/src/wal_backup.rs +++ b/safekeeper/src/wal_backup.rs @@ -7,7 +7,7 @@ use tokio::task::JoinHandle; use utils::id::NodeId; use std::cmp::min; -use std::collections::HashMap; +use std::collections::{HashMap, HashSet}; use std::pin::Pin; use std::sync::Arc; use std::time::Duration; @@ -531,3 +531,62 @@ pub async fn read_object( Ok(Box::pin(reader)) } + +/// Copy segments from one timeline to another. Used in copy_timeline. 
+pub async fn copy_s3_segments( + wal_seg_size: usize, + src_ttid: &TenantTimelineId, + dst_ttid: &TenantTimelineId, + from_segment: XLogSegNo, + to_segment: XLogSegNo, +) -> Result<()> { + const SEGMENTS_PROGRESS_REPORT_INTERVAL: u64 = 1024; + + let storage = REMOTE_STORAGE + .get() + .expect("failed to get remote storage") + .as_ref() + .unwrap(); + + let relative_dst_path = + Utf8Path::new(&dst_ttid.tenant_id.to_string()).join(dst_ttid.timeline_id.to_string()); + + let remote_path = RemotePath::new(&relative_dst_path)?; + + let files = storage.list_files(Some(&remote_path)).await?; + let uploaded_segments = &files + .iter() + .filter_map(|file| file.object_name().map(ToOwned::to_owned)) + .collect::>(); + + debug!( + "these segments have already been uploaded: {:?}", + uploaded_segments + ); + + let relative_src_path = + Utf8Path::new(&src_ttid.tenant_id.to_string()).join(src_ttid.timeline_id.to_string()); + + for segno in from_segment..to_segment { + if segno % SEGMENTS_PROGRESS_REPORT_INTERVAL == 0 { + info!("copied all segments from {} until {}", from_segment, segno); + } + + let segment_name = XLogFileName(PG_TLI, segno, wal_seg_size); + if uploaded_segments.contains(&segment_name) { + continue; + } + debug!("copying segment {}", segment_name); + + let from = RemotePath::new(&relative_src_path.join(&segment_name))?; + let to = RemotePath::new(&relative_dst_path.join(&segment_name))?; + + storage.copy_object(&from, &to).await?; + } + + info!( + "finished copying segments from {} until {}", + from_segment, to_segment + ); + Ok(()) +} diff --git a/safekeeper/src/wal_storage.rs b/safekeeper/src/wal_storage.rs index fa44b24258..8d138c701f 100644 --- a/safekeeper/src/wal_storage.rs +++ b/safekeeper/src/wal_storage.rs @@ -565,6 +565,9 @@ impl WalReader { }) } + /// Read WAL at current position into provided buf, returns number of bytes + /// read. It can be smaller than buf size only if segment boundary is + /// reached. 
pub async fn read(&mut self, buf: &mut [u8]) -> Result { // If this timeline is new, we may not have a full segment yet, so // we pad the first bytes of the timeline's first WAL segment with 0s @@ -725,7 +728,7 @@ async fn write_zeroes(file: &mut File, mut count: usize) -> Result<()> { } /// Helper returning full path to WAL segment file and its .partial brother. -fn wal_file_paths( +pub fn wal_file_paths( timeline_dir: &Utf8Path, segno: XLogSegNo, wal_seg_size: usize, diff --git a/scripts/export_import_between_pageservers.py b/scripts/export_import_between_pageservers.py index ff584bd4b0..980f343047 100755 --- a/scripts/export_import_between_pageservers.py +++ b/scripts/export_import_between_pageservers.py @@ -63,7 +63,7 @@ def subprocess_capture(capture_dir: str, cmd: List[str], **kwargs: Any) -> str: If those files already exist, we will overwrite them. Returns basepath for files with captured output. """ - assert type(cmd) is list + assert isinstance(cmd, list) base = os.path.basename(cmd[0]) + "_{}".format(global_counter()) basepath = os.path.join(capture_dir, base) stdout_filename = basepath + ".stdout" diff --git a/scripts/reformat b/scripts/reformat index 8688044f66..3533c4dcb8 100755 --- a/scripts/reformat +++ b/scripts/reformat @@ -6,5 +6,5 @@ set -euox pipefail echo 'Reformatting Rust code' cargo fmt echo 'Reformatting Python code' -poetry run ruff --fix test_runner scripts -poetry run black test_runner scripts +poetry run ruff check --fix test_runner scripts +poetry run ruff format test_runner scripts diff --git a/test_runner/fixtures/neon_fixtures.py b/test_runner/fixtures/neon_fixtures.py index bec51ccbd3..80dd78f4ad 100644 --- a/test_runner/fixtures/neon_fixtures.py +++ b/test_runner/fixtures/neon_fixtures.py @@ -40,6 +40,7 @@ from psycopg2.extensions import make_dsn, parse_dsn from typing_extensions import Literal from urllib3.util.retry import Retry +from fixtures import overlayfs from fixtures.broker import NeonBroker from fixtures.log_helper 
import log from fixtures.pageserver.allowed_errors import ( @@ -347,7 +348,9 @@ class PgProtocol: """ return self.safe_psql_many([query], **kwargs)[0] - def safe_psql_many(self, queries: List[str], **kwargs: Any) -> List[List[Tuple[Any, ...]]]: + def safe_psql_many( + self, queries: List[str], log_query=True, **kwargs: Any + ) -> List[List[Tuple[Any, ...]]]: """ Execute queries against the node and return all rows. This method passes all extra params to connstr. @@ -356,7 +359,8 @@ class PgProtocol: with closing(self.connect(**kwargs)) as conn: with conn.cursor() as cur: for query in queries: - log.info(f"Executing query: {query}") + if log_query: + log.info(f"Executing query: {query}") cur.execute(query) if cur.description is None: @@ -365,6 +369,12 @@ class PgProtocol: result.append(cur.fetchall()) return result + def safe_psql_scalar(self, query, log_query=True) -> Any: + """ + Execute query returning single row with single column. + """ + return self.safe_psql(query, log_query=log_query)[0][0] + @dataclass class AuthKeys: @@ -415,6 +425,7 @@ class NeonEnvBuilder: pg_version: PgVersion, test_name: str, test_output_dir: Path, + test_overlay_dir: Optional[Path] = None, pageserver_remote_storage: Optional[RemoteStorage] = None, pageserver_config_override: Optional[str] = None, num_safekeepers: int = 1, @@ -460,6 +471,8 @@ class NeonEnvBuilder: self.initial_timeline = initial_timeline or TimelineId.generate() self.scrub_on_exit = False self.test_output_dir = test_output_dir + self.test_overlay_dir = test_overlay_dir + self.overlay_mounts_created_by_us: List[Tuple[str, Path]] = [] self.pageserver_virtual_file_io_engine: Optional[str] = pageserver_virtual_file_io_engine @@ -541,7 +554,10 @@ class NeonEnvBuilder: tenants_to_dir = self.repo_dir / ps_dir.name / "tenants" log.info(f"Copying pageserver tenants directory {tenants_from_dir} to {tenants_to_dir}") - shutil.copytree(tenants_from_dir, tenants_to_dir) + if self.test_overlay_dir is None: + 
shutil.copytree(tenants_from_dir, tenants_to_dir) + else: + self.overlay_mount(f"{ps_dir.name}:tenants", tenants_from_dir, tenants_to_dir) for sk_from_dir in (repo_dir / "safekeepers").glob("sk*"): sk_to_dir = self.repo_dir / "safekeepers" / sk_from_dir.name @@ -550,9 +566,16 @@ class NeonEnvBuilder: shutil.copytree(sk_from_dir, sk_to_dir, ignore=shutil.ignore_patterns("*.log", "*.pid")) shutil.rmtree(self.repo_dir / "local_fs_remote_storage", ignore_errors=True) - shutil.copytree( - repo_dir / "local_fs_remote_storage", self.repo_dir / "local_fs_remote_storage" - ) + if self.test_overlay_dir is None: + shutil.copytree( + repo_dir / "local_fs_remote_storage", self.repo_dir / "local_fs_remote_storage" + ) + else: + self.overlay_mount( + "local_fs_remote_storage", + repo_dir / "local_fs_remote_storage", + self.repo_dir / "local_fs_remote_storage", + ) if (attachments_json := Path(repo_dir / "attachments.json")).exists(): shutil.copyfile(attachments_json, self.repo_dir / attachments_json.name) @@ -569,6 +592,69 @@ class NeonEnvBuilder: return self.env + def overlay_mount(self, ident: str, srcdir: Path, dstdir: Path): + """ + Mount `srcdir` as an overlayfs mount at `dstdir`. + The overlayfs `upperdir` and `workdir` will be placed in test_overlay_dir. 
+ """ + assert self.test_overlay_dir + assert ( + self.test_output_dir in dstdir.parents + ) # so that teardown & test_overlay_dir fixture work + assert srcdir.is_dir() + dstdir.mkdir(exist_ok=False, parents=False) + ident_state_dir = self.test_overlay_dir / ident + upper = ident_state_dir / "upper" + work = ident_state_dir / "work" + ident_state_dir.mkdir( + exist_ok=False, parents=False + ) # exist_ok=False also checks uniqueness in self.overlay_mounts_created_by_us + upper.mkdir() + work.mkdir() + cmd = [ + "sudo", + "mount", + "-t", + "overlay", + "overlay", + "-o", + f"lowerdir={srcdir},upperdir={upper},workdir={work}", + str(dstdir), + ] + log.info(f"Mounting overlayfs srcdir={srcdir} dstdir={dstdir}: {cmd}") + subprocess_capture( + self.test_output_dir, cmd, check=True, echo_stderr=True, echo_stdout=True + ) + self.overlay_mounts_created_by_us.append((ident, dstdir)) + + def overlay_cleanup_teardown(self): + """ + Unmount the overlayfs mounts created by `self.overlay_mount()`. + Supposed to be called during env teardown.
+ """ + if self.test_overlay_dir is None: + return + while len(self.overlay_mounts_created_by_us) > 0: + (ident, mountpoint) = self.overlay_mounts_created_by_us.pop() + ident_state_dir = self.test_overlay_dir / ident + cmd = ["sudo", "umount", str(mountpoint)] + log.info( + f"Unmounting overlayfs mount created during setup for ident {ident} at {mountpoint}: {cmd}" + ) + subprocess_capture( + self.test_output_dir, cmd, check=True, echo_stderr=True, echo_stdout=True + ) + log.info( + f"Cleaning up overlayfs state dir (owned by root user) for ident {ident} at {ident_state_dir}" + ) + cmd = ["sudo", "rm", "-rf", str(ident_state_dir)] + subprocess_capture( + self.test_output_dir, cmd, check=True, echo_stderr=True, echo_stdout=True + ) + + # assert all overlayfs mounts in our test directory are gone + assert [] == list(overlayfs.iter_mounts_beneath(self.test_overlay_dir)) + def enable_scrub_on_exit(self): """ Call this if you would like the fixture to automatically run @@ -675,7 +761,10 @@ class NeonEnvBuilder: sk.stop(immediate=True) for pageserver in self.env.pageservers: - pageserver.assert_no_metric_errors() + # if the test threw an exception, don't check for errors + # as a failing assertion would cause the cleanup below to fail + if exc_type is not None: + pageserver.assert_no_metric_errors() pageserver.stop(immediate=True) @@ -690,6 +779,13 @@ class NeonEnvBuilder: log.error(f"Error during remote storage scrub: {e}") cleanup_error = e + try: + self.overlay_cleanup_teardown() + except Exception as e: + log.error(f"Error cleaning up overlay state: {e}") + if cleanup_error is not None: + cleanup_error = e + try: self.cleanup_remote_storage() except Exception as e: @@ -892,8 +988,8 @@ class NeonEnv: """Get list of safekeeper endpoints suitable for safekeepers GUC""" return ",".join(f"localhost:{wa.port.pg}" for wa in self.safekeepers) - def get_pageserver_version(self) -> str: - bin_pageserver = str(self.neon_binpath / "pageserver") + def get_binary_version(self, 
binary_name: str) -> str: + bin_pageserver = str(self.neon_binpath / binary_name) res = subprocess.run( [bin_pageserver, "--version"], check=True, @@ -1018,6 +1114,7 @@ def neon_env_builder( default_broker: NeonBroker, run_id: uuid.UUID, request: FixtureRequest, + test_overlay_dir: Path, pageserver_virtual_file_io_engine: str, ) -> Iterator[NeonEnvBuilder]: """ @@ -1050,6 +1147,7 @@ def neon_env_builder( pageserver_virtual_file_io_engine=pageserver_virtual_file_io_engine, test_name=request.node.name, test_output_dir=test_output_dir, + test_overlay_dir=test_overlay_dir, ) as builder: yield builder @@ -1104,8 +1202,8 @@ class AbstractNeonCli(abc.ABC): If `local_binpath` is true, then we are invoking a test utility """ - assert type(arguments) == list - assert type(self.COMMAND) == str + assert isinstance(arguments, list) + assert isinstance(self.COMMAND, str) if local_binpath: # Test utility @@ -1662,7 +1760,7 @@ class NeonPageserver(PgProtocol): self.running = False self.service_port = port self.config_override = config_override - self.version = env.get_pageserver_version() + self.version = env.get_binary_version("pageserver") # After a test finishes, we will scrape the log to see if there are any # unexpected error messages. If your test expects an error, add it to @@ -1831,18 +1929,24 @@ class NeonPageserver(PgProtocol): return None def tenant_attach( - self, tenant_id: TenantId, config: None | Dict[str, Any] = None, config_null: bool = False + self, + tenant_id: TenantId, + config: None | Dict[str, Any] = None, + config_null: bool = False, + generation: Optional[int] = None, ): """ Tenant attachment passes through here to acquire a generation number before proceeding to call into the pageserver HTTP client. 
""" client = self.http_client() + if generation is None: + generation = self.env.attachment_service.attach_hook_issue(tenant_id, self.id) return client.tenant_attach( tenant_id, config, config_null, - generation=self.env.attachment_service.attach_hook_issue(tenant_id, self.id), + generation=generation, ) def tenant_detach(self, tenant_id: TenantId): @@ -2745,6 +2849,13 @@ class Endpoint(PgProtocol): ): self.stop() + # Checkpoints running endpoint and returns pg_wal size in MB. + def get_pg_wal_size(self): + log.info(f'checkpointing at LSN {self.safe_psql("select pg_current_wal_lsn()")[0][0]}') + self.safe_psql("checkpoint") + assert self.pgdata_dir is not None # please mypy + return get_dir_size(os.path.join(self.pgdata_dir, "pg_wal")) / 1024 / 1024 + class EndpointFactory: """An object representing multiple compute endpoints.""" @@ -2923,7 +3034,10 @@ class Safekeeper: return res def http_client(self, auth_token: Optional[str] = None) -> SafekeeperHttpClient: - return SafekeeperHttpClient(port=self.port.http, auth_token=auth_token) + is_testing_enabled = '"testing"' in self.env.get_binary_version("safekeeper") + return SafekeeperHttpClient( + port=self.port.http, auth_token=auth_token, is_testing_enabled=is_testing_enabled + ) def data_dir(self) -> str: return os.path.join(self.env.repo_dir, "safekeepers", f"sk{self.id}") @@ -2943,6 +3057,13 @@ class Safekeeper: return segments +# Walreceiver as returned by sk's timeline status endpoint. 
+@dataclass +class Walreceiver: + conn_id: int + state: str + + @dataclass class SafekeeperTimelineStatus: acceptor_epoch: int @@ -2953,6 +3074,7 @@ class SafekeeperTimelineStatus: backup_lsn: Lsn peer_horizon_lsn: Lsn remote_consistent_lsn: Lsn + walreceivers: List[Walreceiver] @dataclass @@ -2966,10 +3088,11 @@ class SafekeeperMetrics: class SafekeeperHttpClient(requests.Session): HTTPError = requests.HTTPError - def __init__(self, port: int, auth_token: Optional[str] = None): + def __init__(self, port: int, auth_token: Optional[str] = None, is_testing_enabled=False): super().__init__() self.port = port self.auth_token = auth_token + self.is_testing_enabled = is_testing_enabled if auth_token is not None: self.headers["Authorization"] = f"Bearer {auth_token}" @@ -2977,6 +3100,30 @@ class SafekeeperHttpClient(requests.Session): def check_status(self): self.get(f"http://localhost:{self.port}/v1/status").raise_for_status() + def is_testing_enabled_or_skip(self): + if not self.is_testing_enabled: + pytest.skip("safekeeper was built without 'testing' feature") + + def configure_failpoints(self, config_strings: Tuple[str, str] | List[Tuple[str, str]]): + self.is_testing_enabled_or_skip() + + if isinstance(config_strings, tuple): + pairs = [config_strings] + else: + pairs = config_strings + + log.info(f"Requesting config failpoints: {repr(pairs)}") + + res = self.put( + f"http://localhost:{self.port}/v1/failpoints", + json=[{"name": name, "actions": actions} for name, actions in pairs], + ) + log.info(f"Got failpoints request response code {res.status_code}") + res.raise_for_status() + res_json = res.json() + assert res_json is None + return res_json + def debug_dump(self, params: Optional[Dict[str, str]] = None) -> Dict[str, Any]: params = params or {} res = self.get(f"http://localhost:{self.port}/v1/debug_dump", params=params) @@ -2992,6 +3139,28 @@ class SafekeeperHttpClient(requests.Session): assert isinstance(res_json, dict) return res_json + def copy_timeline(self, 
tenant_id: TenantId, timeline_id: TimelineId, body: Dict[str, Any]): + res = self.post( + f"http://localhost:{self.port}/v1/tenant/{tenant_id}/timeline/{timeline_id}/copy", + json=body, + ) + res.raise_for_status() + + def timeline_digest( + self, tenant_id: TenantId, timeline_id: TimelineId, from_lsn: Lsn, until_lsn: Lsn + ) -> Dict[str, Any]: + res = self.get( + f"http://localhost:{self.port}/v1/tenant/{tenant_id}/timeline/{timeline_id}/digest", + params={ + "from_lsn": str(from_lsn), + "until_lsn": str(until_lsn), + }, + ) + res.raise_for_status() + res_json = res.json() + assert isinstance(res_json, dict) + return res_json + def timeline_create( self, tenant_id: TenantId, @@ -3014,6 +3183,7 @@ class SafekeeperHttpClient(requests.Session): res = self.get(f"http://localhost:{self.port}/v1/tenant/{tenant_id}/timeline/{timeline_id}") res.raise_for_status() resj = res.json() + walreceivers = [Walreceiver(wr["conn_id"], wr["status"]) for wr in resj["walreceivers"]] return SafekeeperTimelineStatus( acceptor_epoch=resj["acceptor_state"]["epoch"], pg_version=resj["pg_info"]["pg_version"], @@ -3023,6 +3193,7 @@ class SafekeeperHttpClient(requests.Session): backup_lsn=Lsn(resj["backup_lsn"]), peer_horizon_lsn=Lsn(resj["peer_horizon_lsn"]), remote_consistent_lsn=Lsn(resj["remote_consistent_lsn"]), + walreceivers=walreceivers, ) def record_safekeeper_info(self, tenant_id: TenantId, timeline_id: TimelineId, body): @@ -3130,10 +3301,10 @@ class S3Scrubber: raise -def get_test_output_dir(request: FixtureRequest, top_output_dir: Path) -> Path: - """Compute the working directory for an individual test.""" +def _get_test_dir(request: FixtureRequest, top_output_dir: Path, prefix: str) -> Path: + """Compute the path to a working directory for an individual test.""" test_name = request.node.name - test_dir = top_output_dir / test_name.replace("/", "-") + test_dir = top_output_dir / f"{prefix}{test_name.replace('/', '-')}" # We rerun flaky tests multiple times, use a separate 
directory for each run. if (suffix := getattr(request.node, "execution_count", None)) is not None: @@ -3145,6 +3316,21 @@ def get_test_output_dir(request: FixtureRequest, top_output_dir: Path) -> Path: return test_dir +def get_test_output_dir(request: FixtureRequest, top_output_dir: Path) -> Path: + """ + The working directory for a test. + """ + return _get_test_dir(request, top_output_dir, "") + + +def get_test_overlay_dir(request: FixtureRequest, top_output_dir: Path) -> Path: + """ + Directory that contains `upperdir` and `workdir` for overlayfs mounts + that a test creates. See `NeonEnvBuilder.overlay_mount`. + """ + return _get_test_dir(request, top_output_dir, "overlay-") + + def get_test_repo_dir(request: FixtureRequest, top_output_dir: Path) -> Path: return get_test_output_dir(request, top_output_dir) / "repo" @@ -3172,8 +3358,12 @@ SMALL_DB_FILE_NAME_REGEX: re.Pattern = re.compile( # type: ignore[type-arg] # scope. So it uses the get_test_output_dir() function to get the path, and # this fixture ensures that the directory exists. That works because # 'autouse' fixtures are run before other fixtures. +# +# NB: we request the overlay dir fixture so the fixture does its cleanups @pytest.fixture(scope="function", autouse=True) -def test_output_dir(request: FixtureRequest, top_output_dir: Path) -> Iterator[Path]: +def test_output_dir( + request: FixtureRequest, top_output_dir: Path, test_overlay_dir: Path +) -> Iterator[Path]: """Create the working directory for an individual test.""" # one directory per test @@ -3187,6 +3377,43 @@ def test_output_dir(request: FixtureRequest, top_output_dir: Path) -> Iterator[P allure_attach_from_dir(test_dir) +@pytest.fixture(scope="function") +def test_overlay_dir(request: FixtureRequest, top_output_dir: Path) -> Optional[Path]: + """ + Idempotently create a test's overlayfs mount state directory. + If the functionality isn't enabled via env var, returns None. 
+ + The procedure cleans up after previous runs that were aborted (e.g. due to Ctrl-C, OOM kills, etc). + """ + + if os.getenv("NEON_ENV_BUILDER_FROM_REPO_DIR_USE_OVERLAYFS") is None: + return None + + overlay_dir = get_test_overlay_dir(request, top_output_dir) + log.info(f"test_overlay_dir is {overlay_dir}") + + overlay_dir.mkdir(exist_ok=True) + # unmount stale overlayfs mounts which use subdirectories of `overlay_dir/*` as the overlayfs `upperdir` and `workdir` + for mountpoint in overlayfs.iter_mounts_beneath(get_test_output_dir(request, top_output_dir)): + cmd = ["sudo", "umount", str(mountpoint)] + log.info( + f"Unmounting stale overlayfs mount probably created during earlier test run: {cmd}" + ) + subprocess.run(cmd, capture_output=True, check=True) + # the overlayfs `workdir` is owned by `root`, shutil.rmtree won't work. + cmd = ["sudo", "rm", "-rf", str(overlay_dir)] + subprocess.run(cmd, capture_output=True, check=True) + + overlay_dir.mkdir() + + return overlay_dir + + # no need to clean up anything: on clean shutdown, + # NeonEnvBuilder.overlay_cleanup_teardown takes care of cleanup + # and on unclean shutdown, this function will take care of it + # on the next test run + + SKIP_DIRS = frozenset( ( "pg_wal", diff --git a/test_runner/fixtures/overlayfs.py b/test_runner/fixtures/overlayfs.py new file mode 100644 index 0000000000..3e2f661893 --- /dev/null +++ b/test_runner/fixtures/overlayfs.py @@ -0,0 +1,16 @@ +from pathlib import Path +from typing import Iterator + +import psutil + + +def iter_mounts_beneath(topdir: Path) -> Iterator[Path]: + """ + Iterate over the overlayfs mounts beneath the specified `topdir`. + The `topdir` itself isn't considered.
+ """ + for part in psutil.disk_partitions(all=True): + if part.fstype == "overlay": + mountpoint = Path(part.mountpoint) + if topdir in mountpoint.parents: + yield mountpoint diff --git a/test_runner/fixtures/pageserver/http.py b/test_runner/fixtures/pageserver/http.py index add6c4288a..b24de342f8 100644 --- a/test_runner/fixtures/pageserver/http.py +++ b/test_runner/fixtures/pageserver/http.py @@ -326,6 +326,10 @@ class PageserverHttpClient(requests.Session): res = self.post(f"http://localhost:{self.port}/v1/tenant/{tenant_id}/heatmap_upload") self.verbose_error(res) + def tenant_secondary_download(self, tenant_id: TenantId): + res = self.post(f"http://localhost:{self.port}/v1/tenant/{tenant_id}/secondary/download") + self.verbose_error(res) + def set_tenant_config(self, tenant_id: TenantId, config: dict[str, Any]): assert "tenant_id" not in config.keys() res = self.put( @@ -361,9 +365,9 @@ class PageserverHttpClient(requests.Session): assert isinstance(res, dict) assert TenantId(res["id"]) == tenant_id size = res["size"] - assert type(size) == int + assert isinstance(size, int) inputs = res["inputs"] - assert type(inputs) is dict + assert isinstance(inputs, dict) return (size, inputs) def tenant_size_debug(self, tenant_id: TenantId) -> str: @@ -437,6 +441,7 @@ class PageserverHttpClient(requests.Session): timeline_id: TimelineId, include_non_incremental_logical_size: bool = False, include_timeline_dir_layer_file_size_sum: bool = False, + force_await_initial_logical_size: bool = False, **kwargs, ) -> Dict[Any, Any]: params = {} @@ -444,6 +449,8 @@ class PageserverHttpClient(requests.Session): params["include-non-incremental-logical-size"] = "true" if include_timeline_dir_layer_file_size_sum: params["include-timeline-dir-layer-file-size-sum"] = "true" + if force_await_initial_logical_size: + params["force-await-initial-logical-size"] = "true" res = self.get( f"http://localhost:{self.port}/v1/tenant/{tenant_id}/timeline/{timeline_id}", @@ -714,7 +721,7 @@ class 
PageserverHttpClient(requests.Session): ) self.verbose_error(res) - assert res.status_code == 200 + assert res.status_code in (200, 304) def evict_all_layers(self, tenant_id: TenantId, timeline_id: TimelineId): info = self.layer_map_info(tenant_id, timeline_id) diff --git a/test_runner/performance/test_perf_olap.py b/test_runner/performance/test_perf_olap.py index 1de7e95bbe..8a9509ea44 100644 --- a/test_runner/performance/test_perf_olap.py +++ b/test_runner/performance/test_perf_olap.py @@ -42,9 +42,10 @@ def test_clickbench_create_pg_stat_statements(remote_compare: RemoteCompare): # Please do not alter the label for the query, as it is used to identify it. # Labels for ClickBench queries match the labels in ClickBench reports # on https://benchmark.clickhouse.com/ (the DB size may differ). +# +# Disable auto formatting for the list of queries so that it's easier to read +# fmt: off QUERIES: Tuple[LabelledQuery, ...] = ( - # Disable `black` formatting for the list of queries so that it's easier to read - # fmt: off ### ClickBench queries: LabelledQuery("Q0", r"SELECT COUNT(*) FROM hits;"), LabelledQuery("Q1", r"SELECT COUNT(*) FROM hits WHERE AdvEngineID <> 0;"), @@ -96,8 +97,8 @@ QUERIES: Tuple[LabelledQuery, ...] = ( # LabelledQuery("NQ0", r"..."), # LabelledQuery("NQ1", r"..."), # ... - # fmt: on ) +# fmt: on EXPLAIN_STRING: str = "EXPLAIN (ANALYZE, VERBOSE, BUFFERS, COSTS, SETTINGS, FORMAT JSON)" @@ -151,7 +152,9 @@ def test_clickbench(query: LabelledQuery, remote_compare: RemoteCompare, scale: An OLAP-style ClickHouse benchmark Based on https://github.com/ClickHouse/ClickBench/tree/c00135ca5b6a0d86fedcdbf998fdaa8ed85c1c3b/aurora-postgresql - The DB prepared manually in advance + The DB is prepared manually in advance. + Important: after initial data load, run `VACUUM (DISABLE_PAGE_SKIPPING, FREEZE, ANALYZE) hits;` + to ensure that Postgres optimizer chooses the same plans as RDS and Aurora.
""" explain: bool = os.getenv("TEST_OLAP_COLLECT_EXPLAIN", "false").lower() == "true" diff --git a/test_runner/performance/test_wal_backpressure.py b/test_runner/performance/test_wal_backpressure.py index 3cb4b667ff..7eb244d378 100644 --- a/test_runner/performance/test_wal_backpressure.py +++ b/test_runner/performance/test_wal_backpressure.py @@ -32,8 +32,7 @@ def pg_compare(request) -> PgCompare: else: assert ( len(x) == 2 - ), f"request param ({request.param}) should have a format of \ - `neon_{{safekeepers_enable_fsync}}`" + ), f"request param ({request.param}) should have a format of `neon_{{safekeepers_enable_fsync}}`" # `NeonCompare` interface neon_env_builder = request.getfixturevalue("neon_env_builder") diff --git a/test_runner/regress/test_attach_tenant_config.py b/test_runner/regress/test_attach_tenant_config.py index 32397bbcc1..ed389b1aa2 100644 --- a/test_runner/regress/test_attach_tenant_config.py +++ b/test_runner/regress/test_attach_tenant_config.py @@ -194,12 +194,13 @@ def test_fully_custom_config(positive_env: NeonEnv): assert set(our_tenant_config.effective_config.keys()) == set( fully_custom_config.keys() ), "ensure we cover all config options" - assert { - k: initial_tenant_config.effective_config[k] != our_tenant_config.effective_config[k] - for k in fully_custom_config.keys() - } == { - k: True for k in fully_custom_config.keys() - }, "ensure our custom config has different values than the default config for all config options, so we know we overrode everything" + assert ( + { + k: initial_tenant_config.effective_config[k] != our_tenant_config.effective_config[k] + for k in fully_custom_config.keys() + } + == {k: True for k in fully_custom_config.keys()} + ), "ensure our custom config has different values than the default config for all config options, so we know we overrode everything" ps_http.tenant_detach(tenant_id) env.pageserver.tenant_attach(tenant_id, config=fully_custom_config) diff --git a/test_runner/regress/test_compatibility.py 
b/test_runner/regress/test_compatibility.py index 5a9c2782e6..f9d6d0a934 100644 --- a/test_runner/regress/test_compatibility.py +++ b/test_runner/regress/test_compatibility.py @@ -186,9 +186,7 @@ def test_backward_compatibility( else: raise - assert ( - not breaking_changes_allowed - ), "Breaking changes are allowed by ALLOW_BACKWARD_COMPATIBILITY_BREAKAGE, but the test has passed without any breakage" + assert not breaking_changes_allowed, "Breaking changes are allowed by ALLOW_BACKWARD_COMPATIBILITY_BREAKAGE, but the test has passed without any breakage" @check_ondisk_data_compatibility_if_enabled @@ -247,9 +245,7 @@ def test_forward_compatibility( else: raise - assert ( - not breaking_changes_allowed - ), "Breaking changes are allowed by ALLOW_FORWARD_COMPATIBILITY_BREAKAGE, but the test has passed without any breakage" + assert not breaking_changes_allowed, "Breaking changes are allowed by ALLOW_FORWARD_COMPATIBILITY_BREAKAGE, but the test has passed without any breakage" def check_neon_works(env: NeonEnv, test_output_dir: Path, sql_dump_path: Path, repo_dir: Path): diff --git a/test_runner/regress/test_crafted_wal_end.py b/test_runner/regress/test_crafted_wal_end.py index 7ec901af34..01ecc2b95f 100644 --- a/test_runner/regress/test_crafted_wal_end.py +++ b/test_runner/regress/test_crafted_wal_end.py @@ -2,7 +2,6 @@ import pytest from fixtures.log_helper import log from fixtures.neon_fixtures import NeonEnvBuilder, WalCraft - # Restart nodes with WAL end having specially crafted shape, like last record # crossing segment boundary, to test decoding issues. 
diff --git a/test_runner/regress/test_disk_usage_eviction.py b/test_runner/regress/test_disk_usage_eviction.py index f3f3a1ddf3..9fdc4d59f5 100644 --- a/test_runner/regress/test_disk_usage_eviction.py +++ b/test_runner/regress/test_disk_usage_eviction.py @@ -1,6 +1,7 @@ +import enum import time from dataclasses import dataclass -from typing import Dict, Tuple +from typing import Any, Dict, Tuple import pytest import toml @@ -64,6 +65,23 @@ def test_min_resident_size_override_handling( assert_config(tenant_id, None, config_level_override) +@enum.unique +class EvictionOrder(str, enum.Enum): + ABSOLUTE_ORDER = "absolute" + RELATIVE_ORDER_EQUAL = "relative_equal" + RELATIVE_ORDER_SPARE = "relative_spare" + + def config(self) -> Dict[str, Any]: + if self == EvictionOrder.ABSOLUTE_ORDER: + return {"type": "AbsoluteAccessed"} + elif self == EvictionOrder.RELATIVE_ORDER_EQUAL: + return {"type": "RelativeAccessed", "args": {"highest_layer_count_loses_first": False}} + elif self == EvictionOrder.RELATIVE_ORDER_SPARE: + return {"type": "RelativeAccessed", "args": {"highest_layer_count_loses_first": True}} + else: + raise RuntimeError(f"not implemented: {self}") + + @dataclass class EvictionEnv: timelines: list[Tuple[TenantId, TimelineId]] @@ -108,13 +126,14 @@ class EvictionEnv: _avg = cur.fetchone() def pageserver_start_with_disk_usage_eviction( - self, period, max_usage_pct, min_avail_bytes, mock_behavior + self, period, max_usage_pct, min_avail_bytes, mock_behavior, eviction_order: EvictionOrder ): disk_usage_config = { "period": period, "max_usage_pct": max_usage_pct, "min_avail_bytes": min_avail_bytes, "mock_statvfs": mock_behavior, + "eviction_order": eviction_order.config(), } enc = toml.TomlEncoder() @@ -270,7 +289,13 @@ def test_broken_tenants_are_skipped(eviction_env: EvictionEnv): env.neon_env.pageserver.allowed_errors.append(".*" + GLOBAL_LRU_LOG_LINE) -def test_pageserver_evicts_until_pressure_is_relieved(eviction_env: EvictionEnv): +@pytest.mark.parametrize( + 
"order", + [EvictionOrder.ABSOLUTE_ORDER, EvictionOrder.RELATIVE_ORDER_EQUAL], +) +def test_pageserver_evicts_until_pressure_is_relieved( + eviction_env: EvictionEnv, order: EvictionOrder +): """ Basic test to ensure that we evict enough to relieve pressure. """ @@ -281,7 +306,9 @@ def test_pageserver_evicts_until_pressure_is_relieved(eviction_env: EvictionEnv) target = total_on_disk // 2 - response = pageserver_http.disk_usage_eviction_run({"evict_bytes": target}) + response = pageserver_http.disk_usage_eviction_run( + {"evict_bytes": target, "eviction_order": order.config()} + ) log.info(f"{response}") (later_total_on_disk, _, _) = env.timelines_du() @@ -296,7 +323,13 @@ def test_pageserver_evicts_until_pressure_is_relieved(eviction_env: EvictionEnv) assert response["Finished"]["assumed"]["failed"]["count"] == 0, "zero failures expected" -def test_pageserver_respects_overridden_resident_size(eviction_env: EvictionEnv): +@pytest.mark.parametrize( + "order", + [EvictionOrder.ABSOLUTE_ORDER, EvictionOrder.RELATIVE_ORDER_EQUAL], +) +def test_pageserver_respects_overridden_resident_size( + eviction_env: EvictionEnv, order: EvictionOrder +): """ Override tenant min resident and ensure that it will be respected by eviction. 
""" @@ -336,7 +369,9 @@ def test_pageserver_respects_overridden_resident_size(eviction_env: EvictionEnv) env.warm_up_tenant(large_tenant[0]) # do one run - response = ps_http.disk_usage_eviction_run({"evict_bytes": target}) + response = ps_http.disk_usage_eviction_run( + {"evict_bytes": target, "eviction_order": order.config()} + ) log.info(f"{response}") time.sleep(1) # give log time to flush @@ -365,7 +400,11 @@ def test_pageserver_respects_overridden_resident_size(eviction_env: EvictionEnv) assert du_by_timeline[large_tenant] - later_du_by_timeline[large_tenant] >= target -def test_pageserver_falls_back_to_global_lru(eviction_env: EvictionEnv): +@pytest.mark.parametrize( + "order", + [EvictionOrder.ABSOLUTE_ORDER, EvictionOrder.RELATIVE_ORDER_EQUAL], +) +def test_pageserver_falls_back_to_global_lru(eviction_env: EvictionEnv, order: EvictionOrder): """ If we can't relieve pressure using tenant_min_resident_size-respecting eviction, we should continue to evict layers following global LRU. @@ -376,7 +415,9 @@ def test_pageserver_falls_back_to_global_lru(eviction_env: EvictionEnv): (total_on_disk, _, _) = env.timelines_du() target = total_on_disk - response = ps_http.disk_usage_eviction_run({"evict_bytes": target}) + response = ps_http.disk_usage_eviction_run( + {"evict_bytes": target, "eviction_order": order.config()} + ) log.info(f"{response}") (later_total_on_disk, _, _) = env.timelines_du() @@ -389,7 +430,15 @@ def test_pageserver_falls_back_to_global_lru(eviction_env: EvictionEnv): env.neon_env.pageserver.allowed_errors.append(".*" + GLOBAL_LRU_LOG_LINE) -def test_partial_evict_tenant(eviction_env: EvictionEnv): +@pytest.mark.parametrize( + "order", + [ + EvictionOrder.ABSOLUTE_ORDER, + EvictionOrder.RELATIVE_ORDER_EQUAL, + EvictionOrder.RELATIVE_ORDER_SPARE, + ], +) +def test_partial_evict_tenant(eviction_env: EvictionEnv, order: EvictionOrder): """ Warm up a tenant, then build up pressure to cause in evictions in both. 
We expect @@ -402,7 +451,7 @@ def test_partial_evict_tenant(eviction_env: EvictionEnv): (total_on_disk, _, _) = env.timelines_du() du_by_timeline = env.du_by_timeline() - # pick any tenant + # pick smaller or greater (iteration order is insertion order of scale=4 and scale=6) [warm, cold] = list(du_by_timeline.keys()) (tenant_id, timeline_id) = warm @@ -413,7 +462,9 @@ def test_partial_evict_tenant(eviction_env: EvictionEnv): # but not enough to fall into global LRU. # So, set target to all occupied space, except 2*env.layer_size per tenant target = du_by_timeline[cold] + (du_by_timeline[warm] // 2) - 2 * 2 * env.layer_size - response = ps_http.disk_usage_eviction_run({"evict_bytes": target}) + response = ps_http.disk_usage_eviction_run( + {"evict_bytes": target, "eviction_order": order.config()} + ) log.info(f"{response}") (later_total_on_disk, _, _) = env.timelines_du() @@ -428,28 +479,32 @@ def test_partial_evict_tenant(eviction_env: EvictionEnv): ), "all tenants should have lost some layers" warm_size = later_du_by_timeline[warm] - - # bounds for warmed_size - warm_lower = 0.5 * du_by_timeline[warm] - - # We don't know exactly whether the cold tenant needs 2 or just 1 env.layer_size wiggle room. - # So, check for up to 3 here. - warm_upper = warm_lower + 3 * env.layer_size - cold_size = later_du_by_timeline[cold] - cold_upper = 2 * env.layer_size - log.info( - f"expecting for warm tenant: {human_bytes(warm_lower)} < {human_bytes(warm_size)} < {human_bytes(warm_upper)}" - ) - log.info(f"expecting for cold tenant: {human_bytes(cold_size)} < {human_bytes(cold_upper)}") + if order == EvictionOrder.ABSOLUTE_ORDER: + # bounds for warmed_size + warm_lower = 0.5 * du_by_timeline[warm] - assert warm_size > warm_lower, "warmed up tenant should be at about half size (lower)" - assert warm_size < warm_upper, "warmed up tenant should be at about half size (upper)" + # We don't know exactly whether the cold tenant needs 2 or just 1 env.layer_size wiggle room. 
+ # So, check for up to 3 here. + warm_upper = warm_lower + 3 * env.layer_size - assert ( - cold_size < cold_upper - ), "the cold tenant should be evicted to its min_resident_size, i.e., max layer file size" + cold_upper = 2 * env.layer_size + log.info(f"tenants: warm={warm[0]}, cold={cold[0]}") + log.info( + f"expecting for warm tenant: {human_bytes(warm_lower)} < {human_bytes(warm_size)} < {human_bytes(warm_upper)}" + ) + log.info(f"expecting for cold tenant: {human_bytes(cold_size)} < {human_bytes(cold_upper)}") + + assert warm_size > warm_lower, "warmed up tenant should be at about half size (lower)" + assert warm_size < warm_upper, "warmed up tenant should be at about half size (upper)" + + assert ( + cold_size < cold_upper + ), "the cold tenant should be evicted to its min_resident_size, i.e., max layer file size" + else: + # just go with the space was freed, find proper limits later + pass def poor_mans_du( @@ -501,6 +556,7 @@ def test_statvfs_error_handling(eviction_env: EvictionEnv): "type": "Failure", "mocked_error": "EIO", }, + eviction_order=EvictionOrder.ABSOLUTE_ORDER, ) assert env.neon_env.pageserver.log_contains(".*statvfs failed.*EIO") @@ -533,6 +589,7 @@ def test_statvfs_pressure_usage(eviction_env: EvictionEnv): # This avoids accounting for metadata files & tenant conf in the tests. "name_filter": ".*__.*", }, + eviction_order=EvictionOrder.ABSOLUTE_ORDER, ) def relieved_log_message(): @@ -573,6 +630,7 @@ def test_statvfs_pressure_min_avail_bytes(eviction_env: EvictionEnv): # This avoids accounting for metadata files & tenant conf in the tests. 
"name_filter": ".*__.*", }, + eviction_order=EvictionOrder.ABSOLUTE_ORDER, ) def relieved_log_message(): diff --git a/test_runner/regress/test_hot_standby.py b/test_runner/regress/test_hot_standby.py index 031fd2857d..7822e29ed9 100644 --- a/test_runner/regress/test_hot_standby.py +++ b/test_runner/regress/test_hot_standby.py @@ -1,19 +1,59 @@ +import os +import re import time -from fixtures.neon_fixtures import NeonEnv +from fixtures.log_helper import log +from fixtures.neon_fixtures import Endpoint, NeonEnv + + +def wait_caughtup(primary: Endpoint, secondary: Endpoint): + primary_lsn = primary.safe_psql_scalar( + "SELECT pg_current_wal_insert_lsn()::text", log_query=False + ) + while True: + secondary_lsn = secondary.safe_psql_scalar( + "SELECT pg_last_wal_replay_lsn()", log_query=False + ) + caught_up = secondary_lsn >= primary_lsn + log.info(f"caughtup={caught_up}, primary_lsn={primary_lsn}, secondary_lsn={secondary_lsn}") + if caught_up: + return + time.sleep(1) + + +# Check for corrupted WAL messages which might otherwise go unnoticed if +# reconnection fixes this. +def scan_standby_log_for_errors(secondary): + log_path = secondary.endpoint_path() / "compute.log" + with log_path.open("r") as f: + markers = re.compile( + r"incorrect resource manager data|record with incorrect|invalid magic number|unexpected pageaddr" + ) + for line in f: + if markers.search(line): + log.info(f"bad error in standby log: {line}") + raise AssertionError() def test_hot_standby(neon_simple_env: NeonEnv): env = neon_simple_env + # We've had a bug caused by WAL records split across multiple XLogData + # messages resulting in corrupted WAL complains on standby. It reproduced + # only when sending from safekeeper is slow enough to grab full + # MAX_SEND_SIZE messages. So insert sleep through failpoints, but only in + # one conf to decrease test time. 
+ slow_down_send = "[debug-pg16]" in os.environ.get("PYTEST_CURRENT_TEST", "") + if slow_down_send: + sk_http = env.safekeepers[0].http_client() + sk_http.configure_failpoints([("sk-send-wal-replica-sleep", "return(100)")]) + with env.endpoints.create_start( branch_name="main", endpoint_id="primary", ) as primary: time.sleep(1) with env.endpoints.new_replica_start(origin=primary, endpoint_id="secondary") as secondary: - primary_lsn = None - caught_up = False queries = [ "SHOW neon.timeline_id", "SHOW neon.tenant_id", @@ -26,23 +66,6 @@ def test_hot_standby(neon_simple_env: NeonEnv): with p_con.cursor() as p_cur: p_cur.execute("CREATE TABLE test AS SELECT generate_series(1, 100) AS i") - # Explicit commit to make sure other connections (and replicas) can - # see the changes of this commit. - p_con.commit() - - with p_con.cursor() as p_cur: - p_cur.execute("SELECT pg_current_wal_insert_lsn()::text") - res = p_cur.fetchone() - assert res is not None - (lsn,) = res - primary_lsn = lsn - - # Explicit commit to make sure other connections (and replicas) can - # see the changes of this commit. - # Note that this may generate more WAL if the transaction has changed - # things, but we don't care about that. 
- p_con.commit() - for query in queries: with p_con.cursor() as p_cur: p_cur.execute(query) @@ -51,30 +74,28 @@ def test_hot_standby(neon_simple_env: NeonEnv): response = res responses[query] = response + # insert more data to make safekeeper send MAX_SEND_SIZE messages + if slow_down_send: + primary.safe_psql("create table t(key int, value text)") + primary.safe_psql("insert into t select generate_series(1, 100000), 'payload'") + + wait_caughtup(primary, secondary) + with secondary.connect() as s_con: with s_con.cursor() as s_cur: s_cur.execute("SELECT 1 WHERE pg_is_in_recovery()") res = s_cur.fetchone() assert res is not None - while not caught_up: - with s_con.cursor() as secondary_cursor: - secondary_cursor.execute("SELECT pg_last_wal_replay_lsn()") - res = secondary_cursor.fetchone() - assert res is not None - (secondary_lsn,) = res - # There may be more changes on the primary after we got our LSN - # due to e.g. autovacuum, but that shouldn't impact the content - # of the tables, so we check whether we've replayed up to at - # least after the commit of the `test` table. - caught_up = secondary_lsn >= primary_lsn - - # Explicit commit to flush any transient transaction-level state. 
- s_con.commit() - for query in queries: with s_con.cursor() as secondary_cursor: secondary_cursor.execute(query) response = secondary_cursor.fetchone() assert response is not None assert response == responses[query] + + scan_standby_log_for_errors(secondary) + + # clean up + if slow_down_send: + sk_http.configure_failpoints(("sk-send-wal-replica-sleep", "off")) diff --git a/test_runner/regress/test_layer_eviction.py b/test_runner/regress/test_layer_eviction.py index 2cd2406065..efba2033fb 100644 --- a/test_runner/regress/test_layer_eviction.py +++ b/test_runner/regress/test_layer_eviction.py @@ -102,9 +102,7 @@ def test_basic_eviction( ), f"Did not expect to find {local_layer} layer after evicting" empty_layers = list(filter(lambda path: path.name != "metadata", timeline_path.glob("*"))) - assert ( - not empty_layers - ), f"After evicting all layers, timeline {tenant_id}/{timeline_id} should have no layers locally, but got: {empty_layers}" + assert not empty_layers, f"After evicting all layers, timeline {tenant_id}/{timeline_id} should have no layers locally, but got: {empty_layers}" evicted_layer_map_info = client.layer_map_info(tenant_id=tenant_id, timeline_id=timeline_id) assert ( diff --git a/test_runner/regress/test_layers_from_future.py b/test_runner/regress/test_layers_from_future.py index 340188c1ae..999e077e45 100644 --- a/test_runner/regress/test_layers_from_future.py +++ b/test_runner/regress/test_layers_from_future.py @@ -38,6 +38,9 @@ def test_issue_5878(neon_env_builder: NeonEnvBuilder): neon_env_builder.enable_pageserver_remote_storage(RemoteStorageKind.LOCAL_FS) env = neon_env_builder.init_start() + env.pageserver.allowed_errors.extend( + [".*Dropped remote consistent LSN updates.*", ".*Dropping stale deletions.*"] + ) ps_http = env.pageserver.http_client() diff --git a/test_runner/regress/test_pageserver_api.py b/test_runner/regress/test_pageserver_api.py index 573d2139ce..e29db1e252 100644 --- a/test_runner/regress/test_pageserver_api.py +++ 
b/test_runner/regress/test_pageserver_api.py @@ -145,8 +145,7 @@ def expect_updated_msg_lsn( last_msg_lsn = Lsn(timeline_details["last_received_msg_lsn"]) assert ( prev_msg_lsn is None or prev_msg_lsn < last_msg_lsn - ), f"the last received message's LSN {last_msg_lsn} hasn't been updated \ - compared to the previous message's LSN {prev_msg_lsn}" + ), f"the last received message's LSN {last_msg_lsn} hasn't been updated compared to the previous message's LSN {prev_msg_lsn}" return last_msg_lsn diff --git a/test_runner/regress/test_pageserver_generations.py b/test_runner/regress/test_pageserver_generations.py index 9c2f5786d4..87a4fa01fc 100644 --- a/test_runner/regress/test_pageserver_generations.py +++ b/test_runner/regress/test_pageserver_generations.py @@ -254,7 +254,9 @@ def test_generations_upgrade(neon_env_builder: NeonEnvBuilder): metadata_summary = S3Scrubber( neon_env_builder.test_output_dir, neon_env_builder ).scan_metadata() - assert metadata_summary["count"] == 1 # Scrubber should have seen our timeline + assert metadata_summary["tenant_count"] == 1 # Scrubber should have seen our timeline + assert metadata_summary["timeline_count"] == 1 + assert metadata_summary["timeline_shard_count"] == 1 assert not metadata_summary["with_errors"] assert not metadata_summary["with_warnings"] diff --git a/test_runner/regress/test_pageserver_secondary.py b/test_runner/regress/test_pageserver_secondary.py index 8ae4297983..a9eff99a0c 100644 --- a/test_runner/regress/test_pageserver_secondary.py +++ b/test_runner/regress/test_pageserver_secondary.py @@ -1,9 +1,11 @@ import random +from pathlib import Path from typing import Any, Dict, Optional import pytest from fixtures.log_helper import log -from fixtures.neon_fixtures import NeonEnvBuilder, NeonPageserver +from fixtures.neon_fixtures import NeonEnvBuilder, NeonPageserver, S3Scrubber +from fixtures.pageserver.utils import assert_prefix_empty, tenant_delete_wait_completed from fixtures.remote_storage import 
LocalFsStorage, RemoteStorageKind from fixtures.types import TenantId, TimelineId from fixtures.utils import wait_until @@ -251,6 +253,9 @@ def test_live_migration(neon_env_builder: NeonEnvBuilder): flush_ms=5000, ) + # Encourage the new location to download while still in secondary mode + pageserver_b.http_client().tenant_secondary_download(tenant_id) + migrated_generation = env.attachment_service.attach_hook_issue(tenant_id, pageserver_b.id) log.info(f"Acquired generation {migrated_generation} for destination pageserver") assert migrated_generation == initial_generation + 1 @@ -258,8 +263,6 @@ def test_live_migration(neon_env_builder: NeonEnvBuilder): # Writes and reads still work in AttachedStale. workload.validate(pageserver_a.id) - # TODO: call into secondary mode API hooks to do an upload/download sync - # Generate some more dirty writes: we expect the origin to ingest WAL in # in AttachedStale workload.churn_rows(64, pageserver_a.id, upload=False) @@ -369,3 +372,143 @@ def test_heatmap_uploads(neon_env_builder: NeonEnvBuilder): log.info(f"Read back heatmap: {heatmap_second}") assert heatmap_second != heatmap_first validate_heatmap(heatmap_second) + + +def list_layers(pageserver, tenant_id: TenantId, timeline_id: TimelineId) -> list[Path]: + """ + Inspect local storage on a pageserver to discover which layer files are present. + + :return: list of relative paths to layers, from the timeline root. 
+ """ + timeline_path = pageserver.timeline_dir(tenant_id, timeline_id) + + def relative(p: Path) -> Path: + return p.relative_to(timeline_path) + + return sorted( + list( + map( + relative, + filter( + lambda path: path.name != "metadata" + and "ephemeral" not in path.name + and "temp" not in path.name, + timeline_path.glob("*"), + ), + ) + ) + ) + + +def test_secondary_downloads(neon_env_builder: NeonEnvBuilder): + """ + Test the overall data flow in secondary mode: + - Heatmap uploads from the attached location + - Heatmap & layer downloads from the secondary location + - Eviction of layers on the attached location results in deletion + on the secondary location as well. + """ + neon_env_builder.num_pageservers = 2 + neon_env_builder.enable_pageserver_remote_storage( + remote_storage_kind=RemoteStorageKind.MOCK_S3, + ) + env = neon_env_builder.init_start(initial_tenant_conf=TENANT_CONF) + assert env.attachment_service is not None + + tenant_id = env.initial_tenant + timeline_id = env.initial_timeline + + ps_attached = env.pageservers[0] + ps_secondary = env.pageservers[1] + + workload = Workload(env, tenant_id, timeline_id) + workload.init(env.pageservers[0].id) + workload.write_rows(256, ps_attached.id) + + # Configure a secondary location + log.info("Setting up secondary location...") + ps_secondary.tenant_location_configure( + tenant_id, + { + "mode": "Secondary", + "secondary_conf": {"warm": True}, + "tenant_conf": {}, + }, + ) + readback_conf = ps_secondary.read_tenant_location_conf(tenant_id) + log.info(f"Read back conf: {readback_conf}") + + # Explicit upload/download cycle + # ============================== + log.info("Synchronizing after initial write...") + ps_attached.http_client().tenant_heatmap_upload(tenant_id) + + ps_secondary.http_client().tenant_secondary_download(tenant_id) + + assert list_layers(ps_attached, tenant_id, timeline_id) == list_layers( + ps_secondary, tenant_id, timeline_id + ) + + # Make changes on attached pageserver, check 
secondary downloads them + # =================================================================== + log.info("Synchronizing after subsequent write...") + workload.churn_rows(128, ps_attached.id) + + ps_attached.http_client().tenant_heatmap_upload(tenant_id) + ps_secondary.http_client().tenant_secondary_download(tenant_id) + + assert list_layers(ps_attached, tenant_id, timeline_id) == list_layers( + ps_secondary, tenant_id, timeline_id + ) + + # FIXME: this sleep is needed to avoid on-demand promotion of the layers we evict, while + # walreceiver is still doing something. + import time + + time.sleep(5) + + # Do evictions on attached pageserver, check secondary follows along + # ================================================================== + log.info("Evicting a layer...") + layer_to_evict = list_layers(ps_attached, tenant_id, timeline_id)[0] + ps_attached.http_client().evict_layer(tenant_id, timeline_id, layer_name=layer_to_evict.name) + + log.info("Synchronizing after eviction...") + ps_attached.http_client().tenant_heatmap_upload(tenant_id) + ps_secondary.http_client().tenant_secondary_download(tenant_id) + + assert layer_to_evict not in list_layers(ps_attached, tenant_id, timeline_id) + assert list_layers(ps_attached, tenant_id, timeline_id) == list_layers( + ps_secondary, tenant_id, timeline_id + ) + + # Scrub the remote storage + # ======================== + # This confirms that the scrubber isn't upset by the presence of the heatmap + S3Scrubber(neon_env_builder.test_output_dir, neon_env_builder).scan_metadata() + + # Detach secondary and delete tenant + # =================================== + # This confirms that the heatmap gets cleaned up as well as other normal content. 
+ log.info("Detaching secondary location...") + ps_secondary.tenant_location_configure( + tenant_id, + { + "mode": "Detached", + "secondary_conf": None, + "tenant_conf": {}, + }, + ) + + log.info("Deleting tenant...") + tenant_delete_wait_completed(ps_attached.http_client(), tenant_id, 10) + + assert_prefix_empty( + neon_env_builder, + prefix="/".join( + ( + "tenants", + str(tenant_id), + ) + ), + ) diff --git a/test_runner/regress/test_remote_storage.py b/test_runner/regress/test_remote_storage.py index 2fda56d0f4..98b2e856ec 100644 --- a/test_runner/regress/test_remote_storage.py +++ b/test_runner/regress/test_remote_storage.py @@ -144,8 +144,11 @@ def test_remote_storage_backup_and_restore( # Introduce failpoint in list remote timelines code path to make tenant_attach fail. # This is before the failures injected by test_remote_failures, so it's a permanent error. pageserver_http.configure_failpoints(("storage-sync-list-remote-timelines", "return")) - env.pageserver.allowed_errors.append( - ".*attach failed.*: storage-sync-list-remote-timelines", + env.pageserver.allowed_errors.extend( + [ + ".*attach failed.*: storage-sync-list-remote-timelines", + ".*Tenant state is Broken: storage-sync-list-remote-timelines.*", + ] ) # Attach it. This HTTP request will succeed and launch a # background task to load the tenant. In that background task, @@ -159,9 +162,13 @@ def test_remote_storage_backup_and_restore( "data": {"reason": "storage-sync-list-remote-timelines"}, } - # Ensure that even though the tenant is broken, we can't attach it again. 
- with pytest.raises(Exception, match=f"tenant {tenant_id} already exists, state: Broken"): - env.pageserver.tenant_attach(tenant_id) + # Ensure that even though the tenant is broken, retrying the attachment fails + with pytest.raises(Exception, match="Tenant state is Broken"): + # Use same generation as in previous attempt + gen_state = env.attachment_service.inspect(tenant_id) + assert gen_state is not None + generation = gen_state[0] + env.pageserver.tenant_attach(tenant_id, generation=generation) # Restart again, this implicitly clears the failpoint. # test_remote_failures=1 remains active, though, as it's in the pageserver config. @@ -176,10 +183,8 @@ def test_remote_storage_backup_and_restore( ), "we shouldn't have tried any layer downloads yet since list remote timelines has a failpoint" env.pageserver.start() - # Ensure that the pageserver remembers that the tenant was attaching, by - # trying to attach it again. It should fail. - with pytest.raises(Exception, match=f"tenant {tenant_id} already exists, state:"): - env.pageserver.tenant_attach(tenant_id) + # The attach should have got far enough that it recovers on restart (i.e. tenant's + # config was written to local storage). log.info("waiting for tenant to become active. 
this should be quick with on-demand download") wait_until_tenant_active( diff --git a/test_runner/regress/test_tenant_detach.py b/test_runner/regress/test_tenant_detach.py index 0dcbb23ad4..d548e63cc1 100644 --- a/test_runner/regress/test_tenant_detach.py +++ b/test_runner/regress/test_tenant_detach.py @@ -391,8 +391,7 @@ def test_tenant_detach_ignored_tenant(neon_simple_env: NeonEnv): tenants_after_detach = [tenant["id"] for tenant in client.tenant_list()] assert ( tenant_id not in tenants_after_detach - ), f"Ignored and then detached tenant {tenant_id} \ - should not be present in pageserver's memory" + ), f"Ignored and then detached tenant {tenant_id} should not be present in pageserver's memory" # Creates a tenant, and detaches it with extra paremeter that forces ignored tenant detach. @@ -430,8 +429,7 @@ def test_tenant_detach_regular_tenant(neon_simple_env: NeonEnv): tenants_after_detach = [tenant["id"] for tenant in client.tenant_list()] assert ( tenant_id not in tenants_after_detach - ), f"Ignored and then detached tenant {tenant_id} \ - should not be present in pageserver's memory" + ), f"Ignored and then detached tenant {tenant_id} should not be present in pageserver's memory" def test_detach_while_attaching( @@ -629,7 +627,7 @@ def test_ignored_tenant_download_missing_layers(neon_env_builder: NeonEnvBuilder # Tests that attach is never working on a tenant, ignored or not, as long as it's not absent locally # Similarly, tests that it's not possible to schedule a `load` for tenat that's not ignored. 
-def test_load_attach_negatives(neon_env_builder: NeonEnvBuilder): +def test_load_negatives(neon_env_builder: NeonEnvBuilder): neon_env_builder.enable_pageserver_remote_storage(RemoteStorageKind.LOCAL_FS) env = neon_env_builder.init_start() pageserver_http = env.pageserver.http_client() @@ -646,25 +644,16 @@ def test_load_attach_negatives(neon_env_builder: NeonEnvBuilder): ): env.pageserver.tenant_load(tenant_id) - with pytest.raises( - expected_exception=PageserverApiException, - match=f"tenant {tenant_id} already exists, state: Active", - ): - env.pageserver.tenant_attach(tenant_id) - pageserver_http.tenant_ignore(tenant_id) - env.pageserver.allowed_errors.append(".*tenant directory already exists.*") - with pytest.raises( - expected_exception=PageserverApiException, - match="tenant directory already exists", - ): - env.pageserver.tenant_attach(tenant_id) - -def test_ignore_while_attaching( +def test_detach_while_activating( neon_env_builder: NeonEnvBuilder, ): + """ + Test cancellation behavior for tenants that are stuck somewhere between + being attached and reaching Active state. + """ neon_env_builder.enable_pageserver_remote_storage(RemoteStorageKind.LOCAL_FS) env = neon_env_builder.init_start() @@ -686,39 +675,28 @@ def test_ignore_while_attaching( data_secret = "very secret secret" insert_test_data(pageserver_http, tenant_id, timeline_id, data_id, data_secret, endpoint) - tenants_before_ignore = [tenant["id"] for tenant in pageserver_http.tenant_list()] + tenants_before_detach = [tenant["id"] for tenant in pageserver_http.tenant_list()] # Detach it pageserver_http.tenant_detach(tenant_id) + # And re-attach, but stop attach task_mgr task from completing - pageserver_http.configure_failpoints([("attach-before-activate", "return(5000)")]) + pageserver_http.configure_failpoints([("attach-before-activate", "return(600000)")]) env.pageserver.tenant_attach(tenant_id) - # Run ignore on the task, thereby cancelling the attach. 
- # XXX This should take priority over attach, i.e., it should cancel the attach task. - # But neither the failpoint, nor the proper remote_timeline_client download functions, - # are sensitive to task_mgr::shutdown. - # This problem is tracked in https://github.com/neondatabase/neon/issues/2996 . - # So, for now, effectively, this ignore here will block until attach task completes. - pageserver_http.tenant_ignore(tenant_id) - # Cannot attach it due to some local files existing - env.pageserver.allowed_errors.append(".*tenant directory already exists.*") - with pytest.raises( - expected_exception=PageserverApiException, - match="tenant directory already exists", - ): - env.pageserver.tenant_attach(tenant_id) + # The tenant is in the Activating state. This should not block us from + # shutting it down and detaching it. + pageserver_http.tenant_detach(tenant_id) - tenants_after_ignore = [tenant["id"] for tenant in pageserver_http.tenant_list()] - assert tenant_id not in tenants_after_ignore, "Ignored tenant should be missing" - assert len(tenants_after_ignore) + 1 == len( - tenants_before_ignore + tenants_after_detach = [tenant["id"] for tenant in pageserver_http.tenant_list()] + assert tenant_id not in tenants_after_detach, "Detached tenant should be missing" + assert len(tenants_after_detach) + 1 == len( + tenants_before_detach ), "Only ignored tenant should be missing" - # Calling load will bring the tenant back online + # Subsequently attaching it again should still work pageserver_http.configure_failpoints([("attach-before-activate", "off")]) - env.pageserver.tenant_load(tenant_id) - + env.pageserver.tenant_attach(tenant_id) wait_until_tenant_state(pageserver_http, tenant_id, "Active", 5) endpoint.stop() @@ -817,9 +795,7 @@ def test_metrics_while_ignoring_broken_tenant_and_reloading( if found_broken: break time.sleep(0.5) - assert ( - found_broken - ), f"broken should still be in set, but it is not in the tenant state count: broken={broken}, 
broken_set={broken_set}" + assert found_broken, f"broken should still be in set, but it is not in the tenant state count: broken={broken}, broken_set={broken_set}" env.pageserver.tenant_load(env.initial_tenant) @@ -837,6 +813,4 @@ def test_metrics_while_ignoring_broken_tenant_and_reloading( break time.sleep(0.5) - assert ( - found_active - ), f"reloaded tenant should be active, and broken tenant set item removed: active={active}, broken_set={broken_set}" + assert found_active, f"reloaded tenant should be active, and broken tenant set item removed: active={active}, broken_set={broken_set}" diff --git a/test_runner/regress/test_tenant_relocation.py b/test_runner/regress/test_tenant_relocation.py index dcd7232b1b..1887bca23b 100644 --- a/test_runner/regress/test_tenant_relocation.py +++ b/test_runner/regress/test_tenant_relocation.py @@ -161,12 +161,10 @@ def switch_pg_to_new_pageserver( files_before_detach = os.listdir(timeline_to_detach_local_path) assert ( "metadata" in files_before_detach - ), f"Regular timeline {timeline_to_detach_local_path} should have the metadata file,\ - but got: {files_before_detach}" + ), f"Regular timeline {timeline_to_detach_local_path} should have the metadata file, but got: {files_before_detach}" assert ( len(files_before_detach) >= 2 - ), f"Regular timeline {timeline_to_detach_local_path} should have at least one layer file,\ - but got {files_before_detach}" + ), f"Regular timeline {timeline_to_detach_local_path} should have at least one layer file, but got {files_before_detach}" return timeline_to_detach_local_path diff --git a/test_runner/regress/test_tenants.py b/test_runner/regress/test_tenants.py index 22036884ee..5f2c1500d8 100644 --- a/test_runner/regress/test_tenants.py +++ b/test_runner/regress/test_tenants.py @@ -29,18 +29,13 @@ def test_tenant_creation_fails(neon_simple_env: NeonEnv): initial_tenants = sorted( map(lambda t: t.split()[0], neon_simple_env.neon_cli.list_tenants().stdout.splitlines()) ) - initial_tenant_dirs = 
[d for d in tenants_dir.iterdir()] + [d for d in tenants_dir.iterdir()] - neon_simple_env.pageserver.allowed_errors.extend( - [ - ".*Failed to create directory structure for tenant .*, cleaning tmp data.*", - ".*Failed to fsync removed temporary tenant directory .*", - ] - ) + neon_simple_env.pageserver.allowed_errors.append(".*tenant-config-before-write.*") pageserver_http = neon_simple_env.pageserver.http_client() - pageserver_http.configure_failpoints(("tenant-creation-before-tmp-rename", "return")) - with pytest.raises(Exception, match="tenant-creation-before-tmp-rename"): + pageserver_http.configure_failpoints(("tenant-config-before-write", "return")) + with pytest.raises(Exception, match="tenant-config-before-write"): _ = neon_simple_env.neon_cli.create_tenant() new_tenants = sorted( @@ -48,10 +43,10 @@ def test_tenant_creation_fails(neon_simple_env: NeonEnv): ) assert initial_tenants == new_tenants, "should not create new tenants" - new_tenant_dirs = [d for d in tenants_dir.iterdir()] - assert ( - new_tenant_dirs == initial_tenant_dirs - ), "pageserver should clean its temp tenant dirs on tenant creation failure" + # Any files left behind on disk during failed creation do not prevent + # a retry from succeeding. 
+ pageserver_http.configure_failpoints(("tenant-config-before-write", "off")) + neon_simple_env.neon_cli.create_tenant() def test_tenants_normal_work(neon_env_builder: NeonEnvBuilder): diff --git a/test_runner/regress/test_tenants_with_remote_storage.py b/test_runner/regress/test_tenants_with_remote_storage.py index 07fb6dc5ca..6f05d7f7cb 100644 --- a/test_runner/regress/test_tenants_with_remote_storage.py +++ b/test_runner/regress/test_tenants_with_remote_storage.py @@ -201,8 +201,8 @@ def test_tenants_attached_after_download(neon_env_builder: NeonEnvBuilder): len(restored_timelines) == 1 ), f"Tenant {tenant_id} should have its timeline reattached after its layer is downloaded from the remote storage" restored_timeline = restored_timelines[0] - assert restored_timeline["timeline_id"] == str( - timeline_id + assert ( + restored_timeline["timeline_id"] == str(timeline_id) ), f"Tenant {tenant_id} should have its old timeline {timeline_id} restored from the remote storage" # Check that we had to retry the downloads @@ -280,8 +280,8 @@ def test_tenant_redownloads_truncated_file_on_startup( len(restored_timelines) == 1 ), f"Tenant {tenant_id} should have its timeline reattached after its layer is downloaded from the remote storage" retored_timeline = restored_timelines[0] - assert retored_timeline["timeline_id"] == str( - timeline_id + assert ( + retored_timeline["timeline_id"] == str(timeline_id) ), f"Tenant {tenant_id} should have its old timeline {timeline_id} restored from the remote storage" # Request non-incremental logical size. 
Calculating it needs the layer file that diff --git a/test_runner/regress/test_timeline_size.py b/test_runner/regress/test_timeline_size.py index 6e510b2eba..92f2e72378 100644 --- a/test_runner/regress/test_timeline_size.py +++ b/test_runner/regress/test_timeline_size.py @@ -1,3 +1,4 @@ +import concurrent.futures import math import queue import random @@ -24,6 +25,7 @@ from fixtures.pageserver.utils import ( assert_tenant_state, timeline_delete_wait_completed, wait_for_upload_queue_empty, + wait_tenant_status_404, wait_until_tenant_active, ) from fixtures.pg_version import PgVersion @@ -776,6 +778,7 @@ def test_ondemand_activation(neon_env_builder: NeonEnvBuilder): def get_tenant_states(): states = {} + log.info(f"Tenant ids: {tenant_ids}") for tenant_id in tenant_ids: tenant = pageserver_http.tenant_status(tenant_id=tenant_id) states[tenant_id] = tenant["state"]["slug"] @@ -872,3 +875,116 @@ def test_ondemand_activation(neon_env_builder: NeonEnvBuilder): pageserver_http.get_metric_value("pageserver_tenant_startup_scheduled_total") == n_tenants ) assert pageserver_http.get_metric_value("pageserver_tenant_startup_complete_total") == n_tenants + + # Check that tenant deletion proactively wakes tenants: this is done separately to the main + # body of the test because it will disrupt tenant counts + env.pageserver.stop() + env.pageserver.start( + extra_env_vars={"FAILPOINTS": "timeline-calculate-logical-size-pause=pause"} + ) + + wait_until(10, 1, at_least_one_active) + delete_tenant_id = list( + [(tid, s) for (tid, s) in get_tenant_states().items() if s == "Attaching"] + )[0][0] + + # Deleting a stuck tenant should prompt it to go active + with concurrent.futures.ThreadPoolExecutor() as executor: + log.info("Starting background delete") + + def delete_tenant(): + env.pageserver.http_client().tenant_delete(delete_tenant_id) + + background_delete = executor.submit(delete_tenant) + + # Deletion itself won't complete due to our failpoint: Tenant::shutdown can't complete 
while calculating + # logical size is paused in a failpoint. So instead we will use a log observation to check that + # on-demand activation was triggered by the tenant deletion + log_match = f".*attach{{tenant_id={delete_tenant_id} shard_id=0000}}: Activating tenant \\(on-demand\\).*" + + def activated_on_demand(): + assert env.pageserver.log_contains(log_match) is not None + + log.info(f"Waiting for activation message '{log_match}'") + try: + wait_until(10, 1, activated_on_demand) + finally: + log.info("Clearing failpoint") + pageserver_http.configure_failpoints(("timeline-calculate-logical-size-pause", "off")) + + # Deletion should complete successfully now that failpoint is unblocked + log.info("Joining background delete") + background_delete.result(timeout=10) + + # Poll for deletion to complete + wait_tenant_status_404(pageserver_http, tenant_id=delete_tenant_id, iterations=40) + tenant_ids.remove(delete_tenant_id) + + # Check that all the stuck tenants proceed to active (apart from the one that deletes) + wait_until(10, 1, all_active) + assert len(get_tenant_states()) == n_tenants - 1 + + +def test_timeline_logical_size_task_priority(neon_env_builder: NeonEnvBuilder): + """ + /v1/tenant/:tenant_shard_id/timeline and /v1/tenant/:tenant_shard_id + should not bump the priority of the initial logical size computation + background task, unless the force-await-initial-logical-size query param + is set to true. + + This test verifies the invariant stated above. A couple of tricks are involved: + 1. Detach the tenant and re-attach it after the page server is restarted. This circumvents + the warm-up which forces the initial logical size calculation. + 2. A fail point (initial-size-calculation-permit-pause) is used to block the initial + computation of the logical size until forced. + 3. A fail point (walreceiver-after-ingest) is used to pause the walreceiver since + otherwise it would force the logical size computation. 
+ """ + env = neon_env_builder.init_start() + client = env.pageserver.http_client() + + tenant_id = env.initial_tenant + timeline_id = env.initial_timeline + + # load in some data + endpoint = env.endpoints.create_start("main", tenant_id=tenant_id) + endpoint.safe_psql_many( + [ + "CREATE TABLE foo (x INTEGER)", + "INSERT INTO foo SELECT g FROM generate_series(1, 10000) g", + ] + ) + wait_for_last_flush_lsn(env, endpoint, tenant_id, timeline_id) + + # restart with failpoint inside initial size calculation task + log.info(f"Detaching tenant {tenant_id} and stopping pageserver...") + + endpoint.stop() + env.pageserver.tenant_detach(tenant_id) + env.pageserver.stop() + env.pageserver.start( + extra_env_vars={ + "FAILPOINTS": "initial-size-calculation-permit-pause=pause;walreceiver-after-ingest=pause" + } + ) + + log.info(f"Re-attaching tenant {tenant_id}...") + env.pageserver.tenant_attach(tenant_id) + + # kick off initial size calculation task (the response we get here is the estimated size) + def assert_initial_logical_size_not_prioritised(): + details = client.timeline_detail(tenant_id, timeline_id) + assert details["current_logical_size_is_accurate"] is False + + assert_initial_logical_size_not_prioritised() + + # ensure that's actually the case + time.sleep(2) + assert_initial_logical_size_not_prioritised() + + details = client.timeline_detail(tenant_id, timeline_id, force_await_initial_logical_size=True) + assert details["current_logical_size_is_accurate"] is True + + client.configure_failpoints( + [("initial-size-calculation-permit-pause", "off"), ("walreceiver-after-ingest", "off")] + ) diff --git a/test_runner/regress/test_wal_acceptor.py b/test_runner/regress/test_wal_acceptor.py index 3c40a9cb3e..b4ce633531 100644 --- a/test_runner/regress/test_wal_acceptor.py +++ b/test_runner/regress/test_wal_acceptor.py @@ -419,7 +419,8 @@ def wait(f, desc, timeout=30, wait_f=None): try: if f(): break - except Exception: + except Exception as e: + log.info(f"got 
exception while waiting for {desc}: {e}") pass elapsed = time.time() - started_at if elapsed > timeout: @@ -565,7 +566,7 @@ def test_s3_wal_replay(neon_env_builder: NeonEnvBuilder): f"Pageserver last_record_lsn={pageserver_lsn}; flush_lsn={last_lsn}; lag before replay is {lag / 1024}kb" ) - endpoint.stop_and_destroy() + endpoint.stop() timeline_delete_wait_completed(ps_http, tenant_id, timeline_id) # Also delete and manually create timeline on safekeepers -- this tests @@ -1001,8 +1002,40 @@ def test_restart_endpoint(neon_env_builder: NeonEnvBuilder): endpoint.start() +# Context manager which logs passed time on exit. +class DurationLogger: + def __init__(self, desc): + self.desc = desc + + def __enter__(self): + self.ts_before = time.time() + + def __exit__(self, *exc): + log.info(f"{self.desc} finished in {time.time() - self.ts_before}s") + + +# Context manager which logs WAL position change on exit. +class WalChangeLogger: + def __init__(self, ep, desc_before): + self.ep = ep + self.desc_before = desc_before + + def __enter__(self): + self.ts_before = time.time() + self.lsn_before = Lsn(self.ep.safe_psql_scalar("select pg_current_wal_lsn()")) + log.info(f"{self.desc_before}, lsn_before={self.lsn_before}") + + def __exit__(self, *exc): + lsn_after = Lsn(self.ep.safe_psql_scalar("select pg_current_wal_lsn()")) + log.info( + f"inserted {((lsn_after - self.lsn_before) / 1024 / 1024):.3f} MB of WAL in {(time.time() - self.ts_before):.3f}s" + ) + + # Test that we can create timeline with one safekeeper down and initialize it -# later when some data already had been written. +# later when some data already had been written. It is strictly weaker than +# test_lagging_sk, but also is the simplest test to trigger WAL sk -> compute +# download (recovery) and as such useful for development/testing. 
def test_late_init(neon_env_builder: NeonEnvBuilder): neon_env_builder.num_safekeepers = 3 env = neon_env_builder.init_start() @@ -1010,12 +1043,13 @@ def test_late_init(neon_env_builder: NeonEnvBuilder): sk1 = env.safekeepers[0] sk1.stop() - # create and insert smth while safekeeper is down... - env.neon_cli.create_branch("test_late_init") + tenant_id = env.initial_tenant + timeline_id = env.neon_cli.create_branch("test_late_init") endpoint = env.endpoints.create_start("test_late_init") + # create and insert smth while safekeeper is down... endpoint.safe_psql("create table t(key int, value text)") - endpoint.safe_psql("insert into t select generate_series(1, 1000), 'payload'") - log.info("insert with safekeeper down done") + with WalChangeLogger(endpoint, "doing insert with sk1 down"): + endpoint.safe_psql("insert into t select generate_series(1, 1000), 'payload'") endpoint.stop() # stop compute # stop another safekeeper, and start one which missed timeline creation @@ -1024,28 +1058,213 @@ def test_late_init(neon_env_builder: NeonEnvBuilder): sk1.start() # insert some more - endpoint = env.endpoints.create_start("test_late_init") + with DurationLogger("recovery"): + endpoint = env.endpoints.create_start("test_late_init") endpoint.safe_psql("insert into t select generate_series(1,100), 'payload'") + wait_flush_lsn_align_by_ep( + env, "test_late_init", tenant_id, timeline_id, endpoint, [sk1, env.safekeepers[2]] + ) + # Check that WALs are the same. + cmp_sk_wal([sk1, env.safekeepers[2]], tenant_id, timeline_id) + # is timeline flush_lsn equal on provided safekeepers? 
-def is_flush_lsn_aligned(sk1_http_cli, sk2_http_cli, tenant_id, timeline_id): - status1 = sk1_http_cli.timeline_status(tenant_id, timeline_id) - status2 = sk2_http_cli.timeline_status(tenant_id, timeline_id) - log.info( - f"waiting for flush_lsn alignment, sk1.flush_lsn={status1.flush_lsn}, sk2.flush_lsn={status2.flush_lsn}" +def is_flush_lsn_aligned(sk_http_clis, tenant_id, timeline_id): + flush_lsns = [ + sk_http_cli.timeline_status(tenant_id, timeline_id).flush_lsn + for sk_http_cli in sk_http_clis + ] + log.info(f"waiting for flush_lsn alignment, flush_lsns={flush_lsns}") + return all([flush_lsns[0] == flsn for flsn in flush_lsns]) + + +def are_walreceivers_absent(sk_http_cli, tenant_id: TenantId, timeline_id: TimelineId): + status = sk_http_cli.timeline_status(tenant_id, timeline_id) + log.info(f"waiting for walreceivers to be gone, currently {status.walreceivers}") + return len(status.walreceivers) == 0 + + +# Assert by xxd that WAL on given safekeepers is identical. No compute must be +# running for this to be reliable. +def cmp_sk_wal(sks: List[Safekeeper], tenant_id: TenantId, timeline_id: TimelineId): + assert len(sks) >= 2, "cmp_sk_wal makes sense with >= 2 safekeepers passed" + sk_http_clis = [sk.http_client() for sk in sks] + + # First check that term / flush_lsn are the same: it is easier to + # report/understand if WALs are different due to that. + statuses = [sk_http_cli.timeline_status(tenant_id, timeline_id) for sk_http_cli in sk_http_clis] + term_flush_lsns = [(s.acceptor_epoch, s.flush_lsn) for s in statuses] + for tfl, sk in zip(term_flush_lsns[1:], sks[1:]): + assert ( + term_flush_lsns[0] == tfl + ), f"(term, flush_lsn) are not equal on sks {sks[0].id} and {sk.id}: {term_flush_lsns[0]} != {tfl}" + + # check that WALs are identic. 
+ segs = [sk.list_segments(tenant_id, timeline_id) for sk in sks] + for cmp_segs, sk in zip(segs[1:], sks[1:]): + assert ( + segs[0] == cmp_segs + ), f"lists of segments on sks {sks[0].id} and {sk.id} are not identic: {segs[0]} and {cmp_segs}" + log.info(f"comparing segs {segs[0]}") + + sk0 = sks[0] + for sk in sks[1:]: + (_, mismatch, not_regular) = filecmp.cmpfiles( + sk0.timeline_dir(tenant_id, timeline_id), + sk.timeline_dir(tenant_id, timeline_id), + segs[0], + shallow=False, + ) + log.info( + f"filecmp result mismatch and not regular files:\n\t mismatch={mismatch}\n\t not_regular={not_regular}" + ) + + for f in mismatch: + f1 = os.path.join(sk0.timeline_dir(tenant_id, timeline_id), f) + f2 = os.path.join(sk.timeline_dir(tenant_id, timeline_id), f) + stdout_filename = "{}.filediff".format(f2) + + with open(stdout_filename, "w") as stdout_f: + subprocess.run("xxd {} > {}.hex ".format(f1, f1), shell=True) + subprocess.run("xxd {} > {}.hex ".format(f2, f2), shell=True) + + cmd = "diff {}.hex {}.hex".format(f1, f2) + subprocess.run([cmd], stdout=stdout_f, shell=True) + + assert (mismatch, not_regular) == ( + [], + [], + ), f"WAL segs {f1} and {f2} on sks {sks[0].id} and {sk.id} are not identic" + + +# Wait until flush_lsn on given sks becomes equal, assuming endpoint ep is +# running. ep is stopped by this function. This is used in tests which check +# binary equality of WAL segments on safekeepers; which is inherently racy as +# shutting down endpoint might always write some WAL which can get to only one +# safekeeper. So here we recheck flush_lsn again after ep shutdown and retry if +# it has changed. +def wait_flush_lsn_align_by_ep(env, branch, tenant_id, timeline_id, ep, sks): + sk_http_clis = [sk.http_client() for sk in sks] + # First wait for the alignment. 
+ wait( + partial(is_flush_lsn_aligned, sk_http_clis, tenant_id, timeline_id), + "flush_lsn to get aligned", ) - return status1.flush_lsn == status2.flush_lsn + ep.stop() # then stop endpoint + # Even if there is no compute, there might be some in flight data; ensure + # all walreceivers die before rechecking. + for sk_http_cli in sk_http_clis: + wait( + partial(are_walreceivers_absent, sk_http_cli, tenant_id, timeline_id), + "walreceivers to be gone", + ) + # Now recheck again flush_lsn and exit if it is good + if is_flush_lsn_aligned(sk_http_clis, tenant_id, timeline_id): + return + # Otherwise repeat. + log.info("flush_lsn changed during endpoint shutdown; retrying alignment") + ep = env.endpoints.create_start(branch) -# Test behaviour with one safekeeper down and missing a lot of WAL. Namely, that -# 1) walproposer can't recover node if it misses WAL written by previous computes, but -# still starts up and functions normally if two other sks are ok. -# 2) walproposer doesn't keep WAL after some threshold (pg_wal bloat is limited), but functions -# normally if two other sks are ok. -# 3) Lagged safekeeper can still recover by peer recovery. -def test_one_sk_down(neon_env_builder: NeonEnvBuilder): - pass +# Test behaviour with one safekeeper down and missing a lot of WAL, exercising +# neon_walreader and checking that pg_wal never bloats. Namely, ensures that +# compute doesn't keep many WAL for lagging sk, but still can recover it with +# neon_walreader, in two scenarious: a) WAL never existed on compute (it started +# on basebackup LSN later than lagging sk position) though segment file exists +# b) WAL had been recycled on it and segment file doesn't exist. +# +# Also checks along the way that whenever there are two sks alive, compute +# should be able to commit. +def test_lagging_sk(neon_env_builder: NeonEnvBuilder): + # inserts ~20MB of WAL, a bit more than a segment. 
+ def fill_segment(ep): + ep.safe_psql("insert into t select generate_series(1, 180000), 'payload'") + + neon_env_builder.num_safekeepers = 3 + env = neon_env_builder.init_start() + + (sk1, sk2, sk3) = env.safekeepers + + # create and insert smth while safekeeper is down... + sk1.stop() + tenant_id = env.initial_tenant + timeline_id = env.neon_cli.create_branch("test_lagging_sk") + ep = env.endpoints.create_start("test_lagging_sk") + ep.safe_psql("create table t(key int, value text)") + # make small insert to be on the same segment + ep.safe_psql("insert into t select generate_series(1, 1000), 'payload'") + log.info("insert with safekeeper down done") + ep.stop() # stop compute + + # Stop another safekeeper, and start one which missed timeline creation. + sk2.stop() + sk1.start() + + # Start new ep and insert some more. neon_walreader should download WAL for + # sk1 because it should be filled since the horizon (initial LSN) which is + # earlier than basebackup LSN. + ep = env.endpoints.create_start("test_lagging_sk") + ep.safe_psql("insert into t select generate_series(1,100), 'payload'") + # stop ep and ensure WAL is identical after recovery. + wait_flush_lsn_align_by_ep(env, "test_lagging_sk", tenant_id, timeline_id, ep, [sk1, sk3]) + # Check that WALs are the same. + cmp_sk_wal([sk1, sk3], tenant_id, timeline_id) + + # Now repeat insertion with sk1 down, but with inserting more data to check + # that WAL on compute is removed. + sk1.stop() + sk2.start() + + # min_wal_size must be at least 2x segment size. 
+ min_wal_config = [ + "min_wal_size=32MB", + "max_wal_size=32MB", + "wal_keep_size=0", + "log_checkpoints=on", + ] + ep = env.endpoints.create_start( + "test_lagging_sk", + config_lines=min_wal_config, + ) + with WalChangeLogger(ep, "doing large insert with sk1 down"): + for _ in range(0, 5): + fill_segment(ep) + # there shouldn't be more than 2 WAL segments (but dir may have archive_status files) + assert ep.get_pg_wal_size() < 16 * 2.5 + + sk2.stop() # stop another sk to ensure sk1 and sk3 can work + sk1.start() + with DurationLogger("recovery"): + ep.safe_psql("insert into t select generate_series(1,100), 'payload'") # forces recovery + # stop ep and ensure WAL is identical after recovery. + wait_flush_lsn_align_by_ep(env, "test_lagging_sk", tenant_id, timeline_id, ep, [sk1, sk3]) + # Check that WALs are the same. + cmp_sk_wal([sk1, sk3], tenant_id, timeline_id) + + # Now do the same with different safekeeper sk2 down, and restarting ep + # before recovery (again scenario when recovery starts below basebackup_lsn, + # but multi segment now). + ep = env.endpoints.create_start( + "test_lagging_sk", + config_lines=["min_wal_size=32MB", "max_wal_size=32MB", "log_checkpoints=on"], + ) + with WalChangeLogger(ep, "doing large insert with sk2 down"): + for _ in range(0, 5): + fill_segment(ep) + # there shouldn't be more than 2 WAL segments (but dir may have archive_status files) + assert ep.get_pg_wal_size() < 16 * 2.5 + + ep.stop() + ep = env.endpoints.create_start( + "test_lagging_sk", + config_lines=min_wal_config, + ) + sk2.start() + with DurationLogger("recovery"): + wait_flush_lsn_align_by_ep(env, "test_lagging_sk", tenant_id, timeline_id, ep, [sk2, sk3]) + # Check that WALs are the same. 
+ cmp_sk_wal([sk1, sk2, sk3], tenant_id, timeline_id) # Smaller version of test_one_sk_down testing peer recovery in isolation: that @@ -1065,7 +1284,7 @@ def test_peer_recovery(neon_env_builder: NeonEnvBuilder): sk2_http_cli = sk2.http_client() # ensure tli gets created on sk1, peer recovery won't do that wait( - partial(is_flush_lsn_aligned, sk1_http_cli, sk2_http_cli, tenant_id, timeline_id), + partial(is_flush_lsn_aligned, [sk1_http_cli, sk2_http_cli], tenant_id, timeline_id), "flush_lsn to get aligned", ) @@ -1087,7 +1306,7 @@ def test_peer_recovery(neon_env_builder: NeonEnvBuilder): assert sk2_tli_status.flush_lsn - sk1_tli_status.flush_lsn >= 16 * 1024 * 1024 # wait a bit, lsns shouldn't change - # time.sleep(5) + time.sleep(2) sk1_tli_status = sk1_http_cli.timeline_status(tenant_id, timeline_id) sk2_tli_status = sk2_http_cli.timeline_status(tenant_id, timeline_id) log.info( @@ -1098,37 +1317,11 @@ def test_peer_recovery(neon_env_builder: NeonEnvBuilder): # now restart safekeeper with peer recovery enabled and wait for recovery sk1.stop().start(extra_opts=["--peer-recovery=true"]) wait( - partial(is_flush_lsn_aligned, sk1_http_cli, sk2_http_cli, tenant_id, timeline_id), + partial(is_flush_lsn_aligned, [sk1_http_cli, sk2_http_cli], tenant_id, timeline_id), "flush_lsn to get aligned", ) - # check that WALs are identic after recovery - segs = sk1.list_segments(tenant_id, timeline_id) - log.info(f"segs are {segs}") - - (_, mismatch, not_regular) = filecmp.cmpfiles( - sk1.timeline_dir(tenant_id, timeline_id), - sk2.timeline_dir(tenant_id, timeline_id), - segs, - shallow=False, - ) - log.info( - f"filecmp result mismatch and not regular files:\n\t mismatch={mismatch}\n\t not_regular={not_regular}" - ) - - for f in mismatch: - f1 = os.path.join(sk1.timeline_dir(tenant_id, timeline_id), f) - f2 = os.path.join(sk2.timeline_dir(tenant_id, timeline_id), f) - stdout_filename = "{}.filediff".format(f2) - - with open(stdout_filename, "w") as stdout_f: - 
subprocess.run("xxd {} > {}.hex ".format(f1, f1), shell=True) - subprocess.run("xxd {} > {}.hex ".format(f2, f2), shell=True) - - cmd = "diff {}.hex {}.hex".format(f1, f2) - subprocess.run([cmd], stdout=stdout_f, shell=True) - - assert (mismatch, not_regular) == ([], []) + cmp_sk_wal([sk1, sk2], tenant_id, timeline_id) # stop one of safekeepers which weren't recovering and insert a bit more to check we can commit env.safekeepers[2].stop() @@ -1364,60 +1557,6 @@ def test_replace_safekeeper(neon_env_builder: NeonEnvBuilder): show_statuses(env.safekeepers, tenant_id, timeline_id) -# We have `wal_keep_size=0`, so postgres should trim WAL once it's broadcasted -# to all safekeepers. This test checks that compute WAL can fit into small number -# of WAL segments. -def test_wal_deleted_after_broadcast(neon_env_builder: NeonEnvBuilder): - # used to calculate delta in collect_stats - last_lsn = Lsn(0) - - # returns pg_wal size in MB - def collect_stats(endpoint: Endpoint, cur, enable_logs=True): - nonlocal last_lsn - assert endpoint.pgdata_dir is not None - - log.info("executing INSERT to generate WAL") - current_lsn = Lsn(query_scalar(cur, "select pg_current_wal_lsn()")) - pg_wal_size_mb = get_dir_size(os.path.join(endpoint.pgdata_dir, "pg_wal")) / 1024 / 1024 - if enable_logs: - lsn_delta_mb = (current_lsn - last_lsn) / 1024 / 1024 - log.info(f"LSN delta: {lsn_delta_mb} MB, current WAL size: {pg_wal_size_mb} MB") - last_lsn = current_lsn - return pg_wal_size_mb - - # generates about ~20MB of WAL, to create at least one new segment - def generate_wal(cur): - cur.execute("INSERT INTO t SELECT generate_series(1,300000), 'payload'") - - neon_env_builder.num_safekeepers = 3 - env = neon_env_builder.init_start() - - env.neon_cli.create_branch("test_wal_deleted_after_broadcast") - # Adjust checkpoint config to prevent keeping old WAL segments - endpoint = env.endpoints.create_start( - "test_wal_deleted_after_broadcast", - config_lines=["min_wal_size=32MB", "max_wal_size=32MB", 
"log_checkpoints=on"], - ) - - pg_conn = endpoint.connect() - cur = pg_conn.cursor() - cur.execute("CREATE TABLE t(key int, value text)") - - collect_stats(endpoint, cur) - - # generate WAL to simulate normal workload - for _ in range(5): - generate_wal(cur) - collect_stats(endpoint, cur) - - log.info("executing checkpoint") - cur.execute("CHECKPOINT") - wal_size_after_checkpoint = collect_stats(endpoint, cur) - - # there shouldn't be more than 2 WAL segments (but dir may have archive_status files) - assert wal_size_after_checkpoint < 16 * 2.5 - - @pytest.mark.parametrize("auth_enabled", [False, True]) def test_delete_force(neon_env_builder: NeonEnvBuilder, auth_enabled: bool): neon_env_builder.auth_enabled = auth_enabled @@ -1699,3 +1838,83 @@ def test_idle_reconnections(neon_env_builder: NeonEnvBuilder): assert final_stats.get("START_REPLICATION", 0) >= 1 # walproposer should connect to each safekeeper at least once assert final_stats.get("START_WAL_PUSH", 0) >= 3 + + +@pytest.mark.parametrize("insert_rows", [0, 100, 100000, 500000]) +def test_timeline_copy(neon_env_builder: NeonEnvBuilder, insert_rows: int): + target_percents = [10, 50, 90, 100] + + neon_env_builder.num_safekeepers = 3 + # we need remote storage that supports copy_object S3 API + neon_env_builder.enable_safekeeper_remote_storage(RemoteStorageKind.MOCK_S3) + env = neon_env_builder.init_start() + + tenant_id = env.initial_tenant + timeline_id = env.initial_timeline + + endpoint = env.endpoints.create_start("main") + + lsns = [] + + def remember_lsn(): + lsn = Lsn(endpoint.safe_psql("SELECT pg_current_wal_flush_lsn()")[0][0]) + lsns.append(lsn) + return lsn + + # remember LSN right after timeline creation + lsn = remember_lsn() + log.info(f"LSN after timeline creation: {lsn}") + + endpoint.safe_psql("create table t(key int, value text)") + + timeline_status = env.safekeepers[0].http_client().timeline_status(tenant_id, timeline_id) + timeline_start_lsn = timeline_status.timeline_start_lsn + 
log.info(f"Timeline start LSN: {timeline_start_lsn}") + + current_percent = 0.0 + for new_percent in target_percents: + new_rows = insert_rows * (new_percent - current_percent) / 100 + current_percent = new_percent + + if new_rows == 0: + continue + + endpoint.safe_psql( + f"insert into t select generate_series(1, {new_rows}), repeat('payload!', 10)" + ) + + # remember LSN right after reaching new_percent + lsn = remember_lsn() + log.info(f"LSN after inserting {new_rows} rows: {lsn}") + + # TODO: would be also good to test cases where not all segments are uploaded to S3 + + for lsn in lsns: + new_timeline_id = TimelineId.generate() + log.info(f"Copying branch for LSN {lsn}, to timeline {new_timeline_id}") + + orig_digest = ( + env.safekeepers[0] + .http_client() + .timeline_digest(tenant_id, timeline_id, timeline_start_lsn, lsn) + ) + log.info(f"Original digest: {orig_digest}") + + for sk in env.safekeepers: + sk.http_client().copy_timeline( + tenant_id, + timeline_id, + { + "target_timeline_id": str(new_timeline_id), + "until_lsn": str(lsn), + }, + ) + + new_digest = sk.http_client().timeline_digest( + tenant_id, new_timeline_id, timeline_start_lsn, lsn + ) + log.info(f"Digest after timeline copy on safekeeper {sk.id}: {new_digest}") + + assert orig_digest == new_digest + + # TODO: test timelines can start after copy diff --git a/test_runner/regress/test_wal_acceptor_async.py b/test_runner/regress/test_wal_acceptor_async.py index feab7e605b..77d67cd63a 100644 --- a/test_runner/regress/test_wal_acceptor_async.py +++ b/test_runner/regress/test_wal_acceptor_async.py @@ -475,6 +475,46 @@ def test_unavailability(neon_env_builder: NeonEnvBuilder): asyncio.run(run_unavailability(env, endpoint)) +async def run_recovery_uncommitted(env: NeonEnv): + (sk1, sk2, _) = env.safekeepers + + env.neon_cli.create_branch("test_recovery_uncommitted") + ep = env.endpoints.create_start("test_recovery_uncommitted") + ep.safe_psql("create table t(key int, value text)") + 
ep.safe_psql("insert into t select generate_series(1, 100), 'payload'") + + # insert with only one safekeeper up to create tail of flushed but not committed WAL + sk1.stop() + sk2.stop() + conn = await ep.connect_async() + # query should hang, so execute in separate task + bg_query = asyncio.create_task( + conn.execute("insert into t select generate_series(1, 2000), 'payload'") + ) + sleep_sec = 2 + await asyncio.sleep(sleep_sec) + # it must still be not finished + assert not bg_query.done() + # note: destoy will kill compute_ctl, preventing it waiting for hanging sync-safekeepers. + ep.stop_and_destroy() + + # Start one of sks to make quorum online plus compute and ensure they can + # sync. + sk2.start() + ep = env.endpoints.create_start( + "test_recovery_uncommitted", + ) + ep.safe_psql("insert into t select generate_series(1, 2000), 'payload'") + + +# Test pulling uncommitted WAL (up to flush_lsn) during recovery. +def test_recovery_uncommitted(neon_env_builder: NeonEnvBuilder): + neon_env_builder.num_safekeepers = 3 + env = neon_env_builder.init_start() + + asyncio.run(run_recovery_uncommitted(env)) + + @dataclass class RaceConditionTest: iteration: int diff --git a/vendor/postgres-v14 b/vendor/postgres-v14 index 0bb356aa0c..03358bb0b5 160000 --- a/vendor/postgres-v14 +++ b/vendor/postgres-v14 @@ -1 +1 @@ -Subproject commit 0bb356aa0cd1582112926fbcf0b5370222c2db6d +Subproject commit 03358bb0b5e0d33c238710139e768db9e75cfcc8 diff --git a/vendor/postgres-v15 b/vendor/postgres-v15 index 24333abb81..a2dc225ddf 160000 --- a/vendor/postgres-v15 +++ b/vendor/postgres-v15 @@ -1 +1 @@ -Subproject commit 24333abb81a9ecae4541019478f0bf7d0b289df7 +Subproject commit a2dc225ddfc8cae1849aa2316f435c58f0333d8c diff --git a/vendor/postgres-v16 b/vendor/postgres-v16 index 863b71572b..225071f482 160000 --- a/vendor/postgres-v16 +++ b/vendor/postgres-v16 @@ -1 +1 @@ -Subproject commit 863b71572bc441581efb3bbee2ad18af037be1bb +Subproject commit 
225071f482774943854c2eec4540757e01171557 diff --git a/vendor/revisions.json b/vendor/revisions.json index a9575a2cb7..def4eab069 100644 --- a/vendor/revisions.json +++ b/vendor/revisions.json @@ -1,5 +1,5 @@ { - "postgres-v16": "863b71572bc441581efb3bbee2ad18af037be1bb", - "postgres-v15": "24333abb81a9ecae4541019478f0bf7d0b289df7", - "postgres-v14": "0bb356aa0cd1582112926fbcf0b5370222c2db6d" + "postgres-v16": "225071f482774943854c2eec4540757e01171557", + "postgres-v15": "a2dc225ddfc8cae1849aa2316f435c58f0333d8c", + "postgres-v14": "03358bb0b5e0d33c238710139e768db9e75cfcc8" } diff --git a/vm-image-spec.yaml b/vm-image-spec.yaml index 804405293f..704e3721d6 100644 --- a/vm-image-spec.yaml +++ b/vm-image-spec.yaml @@ -36,6 +36,7 @@ files: max_client_conn=10000 default_pool_size=64 max_prepared_statements=0 + admin_users=cloud_admin - filename: cgconfig.conf content: | # Configuration for cgroups in VM compute nodes @@ -166,22 +167,21 @@ build: | && apt-get update \ && apt-get install -y \ build-essential \ - curl \ + git \ libevent-dev \ - libssl-dev \ - patchutils \ + libtool \ pkg-config - ENV PGBOUNCER_VERSION 1.21.0 - ENV PGBOUNCER_GITPATH 1_21_0 + # Note, we use pgbouncer from neondatabase/pgbouncer fork, which could contain extra commits. 
+ # Use `dist_man_MANS=` to skip manpage generation (which requires python3/pandoc) + ENV PGBOUNCER_TAG pgbouncer_1_21_0-neon-1 RUN set -e \ - && curl -sfSL https://github.com/pgbouncer/pgbouncer/releases/download/pgbouncer_${PGBOUNCER_GITPATH}/pgbouncer-${PGBOUNCER_VERSION}.tar.gz -o pgbouncer-${PGBOUNCER_VERSION}.tar.gz \ - && tar xzvf pgbouncer-${PGBOUNCER_VERSION}.tar.gz \ - && cd pgbouncer-${PGBOUNCER_VERSION} \ - && curl https://github.com/pgbouncer/pgbouncer/commit/a7b3c0a5f4caa9dbe92743d04cf1e28c4c05806c.patch | filterdiff --include a/src/server.c | patch -p1 \ + && git clone --recurse-submodules --depth 1 --branch ${PGBOUNCER_TAG} https://github.com/neondatabase/pgbouncer.git pgbouncer \ + && cd pgbouncer \ + && ./autogen.sh \ && LDFLAGS=-static ./configure --prefix=/usr/local/pgbouncer --without-openssl \ - && make -j $(nproc) \ - && make install + && make -j $(nproc) dist_man_MANS= \ + && make install dist_man_MANS= merge: | # tweak nofile limits RUN set -e \ diff --git a/workspace_hack/Cargo.toml b/workspace_hack/Cargo.toml index 82bbedc4ae..57aa1ef0bc 100644 --- a/workspace_hack/Cargo.toml +++ b/workspace_hack/Cargo.toml @@ -39,6 +39,7 @@ futures-executor = { version = "0.3" } futures-io = { version = "0.3" } futures-sink = { version = "0.3" } futures-util = { version = "0.3", features = ["channel", "io", "sink"] } +getrandom = { version = "0.2", default-features = false, features = ["std"] } hex = { version = "0.4", features = ["serde"] } hmac = { version = "0.12", default-features = false, features = ["reset"] } hyper = { version = "0.14", features = ["full"] } @@ -50,6 +51,8 @@ nom = { version = "7" } num-bigint = { version = "0.4" } num-integer = { version = "0.1", features = ["i128"] } num-traits = { version = "0.2", features = ["i128"] } +once_cell = { version = "1" } +parquet = { git = "https://github.com/neondatabase/arrow-rs", branch = "neon-fix-bugs", default-features = false, features = ["zstd"] } prost = { version = "0.11" } rand = { 
version = "0.8", features = ["small_rng"] } regex = { version = "1" } @@ -74,7 +77,7 @@ tracing = { version = "0.1", features = ["log"] } tracing-core = { version = "0.1" } tungstenite = { version = "0.20" } url = { version = "2", features = ["serde"] } -uuid = { version = "1", features = ["serde", "v4"] } +uuid = { version = "1", features = ["serde", "v4", "v7"] } zstd = { version = "0.13" } zstd-safe = { version = "7", default-features = false, features = ["arrays", "legacy", "std", "zdict_builder"] } zstd-sys = { version = "2", default-features = false, features = ["legacy", "std", "zdict_builder"] } @@ -83,12 +86,19 @@ zstd-sys = { version = "2", default-features = false, features = ["legacy", "std anyhow = { version = "1", features = ["backtrace"] } bytes = { version = "1", features = ["serde"] } cc = { version = "1", default-features = false, features = ["parallel"] } +chrono = { version = "0.4", default-features = false, features = ["clock", "serde", "wasmbind"] } either = { version = "1" } +getrandom = { version = "0.2", default-features = false, features = ["std"] } itertools = { version = "0.10" } libc = { version = "0.2", features = ["extra_traits"] } log = { version = "0.4", default-features = false, features = ["std"] } memchr = { version = "2" } nom = { version = "7" } +num-bigint = { version = "0.4" } +num-integer = { version = "0.1", features = ["i128"] } +num-traits = { version = "0.2", features = ["i128"] } +once_cell = { version = "1" } +parquet = { git = "https://github.com/neondatabase/arrow-rs", branch = "neon-fix-bugs", default-features = false, features = ["zstd"] } prost = { version = "0.11" } regex = { version = "1" } regex-automata = { version = "0.4", default-features = false, features = ["dfa-onepass", "hybrid", "meta", "nfa-backtrack", "perf-inline", "perf-literal", "unicode"] } @@ -97,5 +107,8 @@ serde = { version = "1", features = ["alloc", "derive"] } syn-dff4ba8e3ae991db = { package = "syn", version = "1", features = 
["extra-traits", "full", "visit"] } syn-f595c2ba2a3f28df = { package = "syn", version = "2", features = ["extra-traits", "full", "visit", "visit-mut"] } time-macros = { version = "0.2", default-features = false, features = ["formatting", "parsing", "serde"] } +zstd = { version = "0.13" } +zstd-safe = { version = "7", default-features = false, features = ["arrays", "legacy", "std", "zdict_builder"] } +zstd-sys = { version = "2", default-features = false, features = ["legacy", "std", "zdict_builder"] } ### END HAKARI SECTION