diff --git a/.github/workflows/build_and_push_docker_image.yml b/.github/workflows/build_and_push_docker_image.yml index e401b2f418..892e21114b 100644 --- a/.github/workflows/build_and_push_docker_image.yml +++ b/.github/workflows/build_and_push_docker_image.yml @@ -69,7 +69,15 @@ jobs: run: echo "{\"credsStore\":\"ecr-login\"}" > /kaniko/.docker/config.json - name: Kaniko build - run: /kaniko/executor --reproducible --snapshotMode=redo --skip-unused-stages --dockerfile ${{ inputs.dockerfile-path }} --cache=true --cache-repo 369495373322.dkr.ecr.eu-central-1.amazonaws.com/cache --destination 369495373322.dkr.ecr.eu-central-1.amazonaws.com/${{ inputs.image-name }}:${{ needs.tag.outputs.build-tools-tag }}-amd64 + run: | + /kaniko/executor \ + --reproducible \ + --snapshotMode=redo \ + --skip-unused-stages \ + --dockerfile ${{ inputs.dockerfile-path }} \ + --cache=true \ + --cache-repo 369495373322.dkr.ecr.eu-central-1.amazonaws.com/cache \ + --destination 369495373322.dkr.ecr.eu-central-1.amazonaws.com/${{ inputs.image-name }}:${{ needs.tag.outputs.build-tools-tag }}-amd64 kaniko-arm: if: needs.check-if-build-tools-dockerfile-changed.outputs.docker_file_changed == 'true' @@ -85,7 +93,15 @@ jobs: run: echo "{\"credsStore\":\"ecr-login\"}" > /kaniko/.docker/config.json - name: Kaniko build - run: /kaniko/executor --reproducible --snapshotMode=redo --skip-unused-stages --dockerfile ${{ inputs.dockerfile-path }} --cache=true --cache-repo 369495373322.dkr.ecr.eu-central-1.amazonaws.com/cache --destination 369495373322.dkr.ecr.eu-central-1.amazonaws.com/${{ inputs.image-name }}:${{ needs.tag.outputs.build-tools-tag }}-arm64 + run: | + /kaniko/executor \ + --reproducible \ + --snapshotMode=redo \ + --skip-unused-stages \ + --dockerfile ${{ inputs.dockerfile-path }} \ + --cache=true \ + --cache-repo 369495373322.dkr.ecr.eu-central-1.amazonaws.com/cache \ + --destination 369495373322.dkr.ecr.eu-central-1.amazonaws.com/${{ inputs.image-name }}:${{ 
needs.tag.outputs.build-tools-tag }}-arm64 manifest: if: needs.check-if-build-tools-dockerfile-changed.outputs.docker_file_changed == 'true' @@ -99,7 +115,10 @@ jobs: steps: - name: Create manifest - run: docker manifest create 369495373322.dkr.ecr.eu-central-1.amazonaws.com/${{ inputs.image-name }}:${{ needs.tag.outputs.build-tools-tag }} --amend 369495373322.dkr.ecr.eu-central-1.amazonaws.com/${{ inputs.image-name }}:${{ needs.tag.outputs.build-tools-tag }}-amd64 --amend 369495373322.dkr.ecr.eu-central-1.amazonaws.com/${{ inputs.image-name }}:${{ needs.tag.outputs.build-tools-tag }}-arm64 + run: | + docker manifest create 369495373322.dkr.ecr.eu-central-1.amazonaws.com/${{ inputs.image-name }}:${{ needs.tag.outputs.build-tools-tag }} \ + --amend 369495373322.dkr.ecr.eu-central-1.amazonaws.com/${{ inputs.image-name }}:${{ needs.tag.outputs.build-tools-tag }}-amd64 \ + --amend 369495373322.dkr.ecr.eu-central-1.amazonaws.com/${{ inputs.image-name }}:${{ needs.tag.outputs.build-tools-tag }}-arm64 - name: Push manifest run: docker manifest push 369495373322.dkr.ecr.eu-central-1.amazonaws.com/${{ inputs.image-name }}:${{ needs.tag.outputs.build-tools-tag }} diff --git a/.github/workflows/build_and_test.yml b/.github/workflows/build_and_test.yml index 2b88f09b3d..147d5cae2d 100644 --- a/.github/workflows/build_and_test.yml +++ b/.github/workflows/build_and_test.yml @@ -21,6 +21,8 @@ env: COPT: '-Werror' AWS_ACCESS_KEY_ID: ${{ secrets.AWS_ACCESS_KEY_DEV }} AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_SECRET_KEY_DEV }} + # A concurrency group that we use for e2e-tests runs, matches `concurrency.group` above with `github.repository` as a prefix + E2E_CONCURRENCY_GROUP: ${{ github.repository }}-${{ github.workflow }}-${{ github.ref_name }}-${{ github.ref_name == 'main' && github.sha || 'anysha' }} jobs: check-permissions: @@ -44,6 +46,20 @@ jobs: exit 1 + cancel-previous-e2e-tests: + needs: [ check-permissions ] + if: github.event_name == 'pull_request' + runs-on: ubuntu-latest + 
+ steps: + - name: Cancel previous e2e-tests runs for this PR + env: + GH_TOKEN: ${{ secrets.CI_ACCESS_TOKEN }} + run: | + gh workflow --repo neondatabase/cloud \ + run cancel-previous-in-concurrency-group.yml \ + --field concurrency_group="${{ env.E2E_CONCURRENCY_GROUP }}" + tag: needs: [ check-permissions ] runs-on: [ self-hosted, gen3, small ] @@ -186,7 +202,11 @@ jobs: runs-on: [ self-hosted, gen3, large ] container: image: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/build-tools:${{ needs.build-buildtools-image.outputs.build-tools-tag }} - options: --init + # Raise locked memory limit for tokio-epoll-uring. + # On 5.10 LTS kernels < 5.10.162 (and generally mainline kernels < 5.12), + # io_uring will account the memory of the CQ and SQ as locked. + # More details: https://github.com/neondatabase/neon/issues/6373#issuecomment-1905814391 + options: --init --shm-size=512mb --ulimit memlock=67108864:67108864 strategy: fail-fast: false matrix: @@ -340,8 +360,12 @@ jobs: ${cov_prefix} mold -run cargo build $CARGO_FLAGS $CARGO_FEATURES --bins --tests - name: Run rust tests + env: + NEXTEST_RETRIES: 3 run: | - ${cov_prefix} cargo nextest run $CARGO_FLAGS $CARGO_FEATURES + for io_engine in std-fs tokio-epoll-uring ; do + NEON_PAGESERVER_UNIT_TEST_VIRTUAL_FILE_IOENGINE=$io_engine ${cov_prefix} cargo nextest run $CARGO_FLAGS $CARGO_FEATURES + done # Run separate tests for real S3 export ENABLE_REAL_S3_REMOTE_STORAGE=nonempty @@ -419,8 +443,8 @@ jobs: runs-on: [ self-hosted, gen3, large ] container: image: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/build-tools:${{ needs.build-buildtools-image.outputs.build-tools-tag }} - # Default shared memory is 64mb - options: --init --shm-size=512mb + # for changed limits, see comments on `options:` earlier in this file + options: --init --shm-size=512mb --ulimit memlock=67108864:67108864 strategy: fail-fast: false matrix: @@ -448,6 +472,7 @@ jobs: TEST_RESULT_CONNSTR: ${{ secrets.REGRESS_TEST_RESULT_CONNSTR_NEW }} 
CHECK_ONDISK_DATA_COMPATIBILITY: nonempty BUILD_TAG: ${{ needs.tag.outputs.build-tag }} + PAGESERVER_VIRTUAL_FILE_IO_ENGINE: std-fs - name: Merge and upload coverage data if: matrix.build_type == 'debug' && matrix.pg_version == 'v14' @@ -458,12 +483,13 @@ jobs: runs-on: [ self-hosted, gen3, small ] container: image: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/build-tools:${{ needs.build-buildtools-image.outputs.build-tools-tag }} - # Default shared memory is 64mb - options: --init --shm-size=512mb + # for changed limits, see comments on `options:` earlier in this file + options: --init --shm-size=512mb --ulimit memlock=67108864:67108864 if: github.ref_name == 'main' || contains(github.event.pull_request.labels.*.name, 'run-benchmarks') strategy: fail-fast: false matrix: + # the amount of groups (N) should be reflected in `extra_params: --splits N ...` pytest_split_group: [ 1, 2, 3, 4 ] build_type: [ release ] steps: @@ -477,11 +503,12 @@ jobs: test_selection: performance run_in_parallel: false save_perf_report: ${{ github.ref_name == 'main' }} - extra_params: --splits ${{ strategy.job-total }} --group ${{ matrix.pytest_split_group }} + extra_params: --splits 4 --group ${{ matrix.pytest_split_group }} env: VIP_VAP_ACCESS_TOKEN: "${{ secrets.VIP_VAP_ACCESS_TOKEN }}" PERF_TEST_RESULT_CONNSTR: "${{ secrets.PERF_TEST_RESULT_CONNSTR }}" TEST_RESULT_CONNSTR: "${{ secrets.REGRESS_TEST_RESULT_CONNSTR_NEW }}" + PAGESERVER_VIRTUAL_FILE_IO_ENGINE: std-fs # XXX: no coverage data handling here, since benchmarks are run on release builds, # while coverage is currently collected for the debug ones @@ -695,7 +722,8 @@ jobs: \"commit_hash\": \"$COMMIT_SHA\", \"remote_repo\": \"${{ github.repository }}\", \"storage_image_tag\": \"${{ needs.tag.outputs.build-tag }}\", - \"compute_image_tag\": \"${{ needs.tag.outputs.build-tag }}\" + \"compute_image_tag\": \"${{ needs.tag.outputs.build-tag }}\", + \"concurrency_group\": \"${{ env.E2E_CONCURRENCY_GROUP }}\" } }" diff --git 
a/.github/workflows/neon_extra_builds.yml b/.github/workflows/neon_extra_builds.yml index c6c2b7386a..f8fb62d3f8 100644 --- a/.github/workflows/neon_extra_builds.yml +++ b/.github/workflows/neon_extra_builds.yml @@ -124,12 +124,12 @@ jobs: # Hence keeping target/ (and general cache size) smaller BUILD_TYPE: release CARGO_FEATURES: --features testing - CARGO_FLAGS: --locked --release + CARGO_FLAGS: --release AWS_ACCESS_KEY_ID: ${{ secrets.AWS_ACCESS_KEY_DEV }} AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_SECRET_KEY_DEV }} container: - image: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/rust:pinned + image: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/build-tools:pinned options: --init steps: @@ -210,18 +210,20 @@ jobs: - name: Run cargo build run: | - mold -run cargo build $CARGO_FLAGS $CARGO_FEATURES --bins --tests + mold -run cargo build --locked $CARGO_FLAGS $CARGO_FEATURES --bins --tests - name: Run cargo test + env: + NEXTEST_RETRIES: 3 run: | - cargo test $CARGO_FLAGS $CARGO_FEATURES + cargo nextest run $CARGO_FEATURES # Run separate tests for real S3 export ENABLE_REAL_S3_REMOTE_STORAGE=nonempty export REMOTE_STORAGE_S3_BUCKET=neon-github-ci-tests export REMOTE_STORAGE_S3_REGION=eu-central-1 # Avoid `$CARGO_FEATURES` since there's no `testing` feature in the e2e tests now - cargo test $CARGO_FLAGS --package remote_storage --test test_real_s3 + cargo nextest run --package remote_storage --test test_real_s3 # Run separate tests for real Azure Blob Storage # XXX: replace region with `eu-central-1`-like region @@ -231,7 +233,7 @@ jobs: export REMOTE_STORAGE_AZURE_CONTAINER="${{ vars.REMOTE_STORAGE_AZURE_CONTAINER }}" export REMOTE_STORAGE_AZURE_REGION="${{ vars.REMOTE_STORAGE_AZURE_REGION }}" # Avoid `$CARGO_FEATURES` since there's no `testing` feature in the e2e tests now - cargo test $CARGO_FLAGS --package remote_storage --test test_real_azure + cargo nextest run --package remote_storage --test test_real_azure check-codestyle-rust-arm: timeout-minutes: 90 diff 
--git a/.github/workflows/update_build_tools_image.yml b/.github/workflows/update_build_tools_image.yml index 88bab797b7..900724fc60 100644 --- a/.github/workflows/update_build_tools_image.yml +++ b/.github/workflows/update_build_tools_image.yml @@ -20,111 +20,51 @@ defaults: run: shell: bash -euo pipefail {0} -env: - AWS_ACCESS_KEY_ID: ${{ secrets.AWS_ACCESS_KEY_DEV }} - AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_SECRET_KEY_DEV }} - permissions: {} jobs: tag-image: runs-on: [ self-hosted, gen3, small ] - container: golang:1.19-bullseye env: - IMAGE: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/build-tools - FROM_TAG: ${{ inputs.from-tag }} - TO_TAG: ${{ inputs.to-tag }} - outputs: - next-digest-buildtools: ${{ steps.next-digest.outputs.next-digest-buildtools }} - prev-digest-buildtools: ${{ steps.prev-digest.outputs.prev-digest-buildtools }} - - steps: - - name: Install Crane & ECR helper - run: | - go install github.com/google/go-containerregistry/cmd/crane@a54d64203cffcbf94146e04069aae4a97f228ee2 # v0.16.1 - go install github.com/awslabs/amazon-ecr-credential-helper/ecr-login/cli/docker-credential-ecr-login@adf1bafd791ae7d4ff098108b1e91f36a4da5404 # v0.7.1 - - - name: Configure ECR login - run: | - mkdir /github/home/.docker/ - echo "{\"credsStore\":\"ecr-login\"}" > /github/home/.docker/config.json - - - name: Get source image digest - id: next-digest - run: | - NEXT_DIGEST=$(crane digest ${IMAGE}:${FROM_TAG} || true) - if [ -z "${NEXT_DIGEST}" ]; then - echo >&2 "Image ${IMAGE}:${FROM_TAG} does not exist" - exit 1 - fi - - echo "Current ${IMAGE}@${FROM_TAG} image is ${IMAGE}@${NEXT_DIGEST}" - echo "next-digest-buildtools=$NEXT_DIGEST" >> $GITHUB_OUTPUT - - - name: Get destination image digest (if already exists) - id: prev-digest - run: | - PREV_DIGEST=$(crane digest ${IMAGE}:${TO_TAG} || true) - if [ -z "${PREV_DIGEST}" ]; then - echo >&2 "Image ${IMAGE}:${TO_TAG} does not exist (it's ok)" - else - echo >&2 "Current ${IMAGE}@${TO_TAG} image is 
${IMAGE}@${PREV_DIGEST}" - - echo "prev-digest-buildtools=$PREV_DIGEST" >> $GITHUB_OUTPUT - fi - - - name: Tag image - run: | - crane tag "${IMAGE}:${FROM_TAG}" "${TO_TAG}" - - rollback-tag-image: - needs: tag-image - if: ${{ !success() }} - - runs-on: [ self-hosted, gen3, small ] - container: golang:1.19-bullseye - - env: - IMAGE: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/build-tools + ECR_IMAGE: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/build-tools + DOCKER_HUB_IMAGE: docker.io/neondatabase/build-tools FROM_TAG: ${{ inputs.from-tag }} TO_TAG: ${{ inputs.to-tag }} steps: - - name: Install Crane & ECR helper + # Use custom DOCKER_CONFIG directory to avoid conflicts with default settings + # The default value is ~/.docker + - name: Set custom docker config directory run: | - go install github.com/google/go-containerregistry/cmd/crane@a54d64203cffcbf94146e04069aae4a97f228ee2 # v0.16.1 - go install github.com/awslabs/amazon-ecr-credential-helper/ecr-login/cli/docker-credential-ecr-login@adf1bafd791ae7d4ff098108b1e91f36a4da5404 # v0.7.1 + mkdir -p .docker-custom + echo DOCKER_CONFIG=$(pwd)/.docker-custom >> $GITHUB_ENV - - name: Configure ECR login + - uses: docker/login-action@v2 + with: + username: ${{ secrets.NEON_DOCKERHUB_USERNAME }} + password: ${{ secrets.NEON_DOCKERHUB_PASSWORD }} + + - uses: docker/login-action@v2 + with: + registry: 369495373322.dkr.ecr.eu-central-1.amazonaws.com + username: ${{ secrets.AWS_ACCESS_KEY_DEV }} + password: ${{ secrets.AWS_SECRET_KEY_DEV }} + + - uses: actions/setup-go@v5 + with: + go-version: '1.21' + + - name: Install crane run: | - mkdir /github/home/.docker/ - echo "{\"credsStore\":\"ecr-login\"}" > /github/home/.docker/config.json + go install github.com/google/go-containerregistry/cmd/crane@a0658aa1d0cc7a7f1bcc4a3af9155335b6943f40 # v0.18.0 - - name: Restore previous tag if needed + - name: Copy images run: | - NEXT_DIGEST="${{ needs.tag-image.outputs.next-digest-buildtools }}" - PREV_DIGEST="${{ 
needs.tag-image.outputs.prev-digest-buildtools }}" + crane copy "${ECR_IMAGE}:${FROM_TAG}" "${ECR_IMAGE}:${TO_TAG}" + crane copy "${ECR_IMAGE}:${FROM_TAG}" "${DOCKER_HUB_IMAGE}:${TO_TAG}" - if [ -z "${NEXT_DIGEST}" ]; then - echo >&2 "Image ${IMAGE}:${FROM_TAG} does not exist, nothing to rollback" - exit 0 - fi - - if [ -z "${PREV_DIGEST}" ]; then - # I guess we should delete the tag here/untag the image, but crane does not support it - # - https://github.com/google/go-containerregistry/issues/999 - - echo >&2 "Image ${IMAGE}:${TO_TAG} did not exist, but it was created by the job, no need to rollback" - - exit 0 - fi - - CURRENT_DIGEST=$(crane digest "${IMAGE}:${TO_TAG}") - if [ "${CURRENT_DIGEST}" == "${NEXT_DIGEST}" ]; then - crane tag "${IMAGE}@${PREV_DIGEST}" "${TO_TAG}" - - echo >&2 "Successfully restored ${TO_TAG} tag from ${IMAGE}@${CURRENT_DIGEST} to ${IMAGE}@${PREV_DIGEST}" - else - echo >&2 "Image ${IMAGE}:${TO_TAG}@${CURRENT_DIGEST} is not required to be restored" - fi + - name: Remove custom docker config directory + if: always() + run: | + rm -rf .docker-custom diff --git a/Cargo.lock b/Cargo.lock index 952034a16b..f0bcfb762a 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -10,9 +10,9 @@ checksum = "8b5ace29ee3216de37c0546865ad08edef58b0f9e76838ed8959a84a990e58c5" [[package]] name = "addr2line" -version = "0.19.0" +version = "0.21.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "a76fd60b23679b7d19bd066031410fb7e458ccc5e958eb5c325888ce4baedc97" +checksum = "8a30b2e23b9e17a9f90641c7ab1549cd9b44f296d3ccbf309d2863cfe398a0cb" dependencies = [ "gimli", ] @@ -278,6 +278,7 @@ dependencies = [ "camino", "clap", "control_plane", + "diesel", "futures", "git-version", "hyper", @@ -840,15 +841,15 @@ dependencies = [ [[package]] name = "backtrace" -version = "0.3.67" +version = "0.3.69" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "233d376d6d185f2a3093e58f283f60f880315b6c60075b01f36b3b85154564ca" +checksum 
= "2089b7e3f35b9dd2d0ed921ead4f6d318c27680d4a5bd167b3ee120edb105837" dependencies = [ "addr2line", "cc", "cfg-if", "libc", - "miniz_oxide 0.6.2", + "miniz_oxide", "object", "rustc-demangle", ] @@ -1215,7 +1216,7 @@ dependencies = [ "flate2", "futures", "hyper", - "nix 0.26.2", + "nix 0.27.1", "notify", "num_cpus", "opentelemetry", @@ -1327,11 +1328,13 @@ dependencies = [ "clap", "comfy-table", "compute_api", + "diesel", + "diesel_migrations", "futures", "git-version", "hex", "hyper", - "nix 0.26.2", + "nix 0.27.1", "once_cell", "pageserver_api", "pageserver_client", @@ -1341,6 +1344,7 @@ dependencies = [ "regex", "reqwest", "safekeeper_api", + "scopeguard", "serde", "serde_json", "serde_with", @@ -1636,6 +1640,52 @@ dependencies = [ "rusticata-macros", ] +[[package]] +name = "diesel" +version = "2.1.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "62c6fcf842f17f8c78ecf7c81d75c5ce84436b41ee07e03f490fbb5f5a8731d8" +dependencies = [ + "bitflags 2.4.1", + "byteorder", + "diesel_derives", + "itoa", + "pq-sys", + "serde_json", +] + +[[package]] +name = "diesel_derives" +version = "2.1.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ef8337737574f55a468005a83499da720f20c65586241ffea339db9ecdfd2b44" +dependencies = [ + "diesel_table_macro_syntax", + "proc-macro2", + "quote", + "syn 2.0.32", +] + +[[package]] +name = "diesel_migrations" +version = "2.1.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "6036b3f0120c5961381b570ee20a02432d7e2d27ea60de9578799cf9156914ac" +dependencies = [ + "diesel", + "migrations_internals", + "migrations_macros", +] + +[[package]] +name = "diesel_table_macro_syntax" +version = "0.1.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "fc5557efc453706fed5e4fa85006fe9817c224c3f480a34c7e5959fd700921c5" +dependencies = [ + "syn 2.0.32", +] + [[package]] name = "digest" version = "0.10.7" @@ -1872,13 +1922,13 @@ dependencies 
= [ [[package]] name = "filetime" -version = "0.2.21" +version = "0.2.22" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "5cbc844cecaee9d4443931972e1289c8ff485cb4cc2767cb03ca139ed6885153" +checksum = "d4029edd3e734da6fe05b6cd7bd2960760a616bd2ddd0d59a0124746d6272af0" dependencies = [ "cfg-if", "libc", - "redox_syscall 0.2.16", + "redox_syscall 0.3.5", "windows-sys 0.48.0", ] @@ -1895,7 +1945,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "3b9429470923de8e8cbd4d2dc513535400b4b3fef0319fb5c4e1f520a7bef743" dependencies = [ "crc32fast", - "miniz_oxide 0.7.1", + "miniz_oxide", ] [[package]] @@ -2093,9 +2143,9 @@ dependencies = [ [[package]] name = "gimli" -version = "0.27.2" +version = "0.28.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "ad0a93d233ebf96623465aad4046a8d3aa4da22d4f4beba5388838c8a434bbb4" +checksum = "4271d37baee1b8c7e4b708028c57d816cf9d2434acb33a549475f78c181f6253" [[package]] name = "git-version" @@ -2562,6 +2612,16 @@ dependencies = [ "windows-sys 0.48.0", ] +[[package]] +name = "io-uring" +version = "0.6.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "460648e47a07a43110fbfa2e0b14afb2be920093c31e5dccc50e49568e099762" +dependencies = [ + "bitflags 1.3.2", + "libc", +] + [[package]] name = "ipnet" version = "2.9.0" @@ -2748,18 +2808,18 @@ checksum = "f665ee40bc4a3c5590afb1e9677db74a508659dfd71e126420da8274909a0167" [[package]] name = "memoffset" -version = "0.7.1" +version = "0.8.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "5de893c32cde5f383baa4c04c5d6dbdd735cfd4a794b0debdb2bb1b421da5ff4" +checksum = "d61c719bcfbcf5d62b3a09efa6088de8c54bc0bfcd3ea7ae39fcc186108b8de1" dependencies = [ "autocfg", ] [[package]] name = "memoffset" -version = "0.8.0" +version = "0.9.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = 
"d61c719bcfbcf5d62b3a09efa6088de8c54bc0bfcd3ea7ae39fcc186108b8de1" +checksum = "5a634b1c61a95585bd15607c6ab0c4e5b226e695ff2800ba0cdccddf208c406c" dependencies = [ "autocfg", ] @@ -2775,6 +2835,27 @@ dependencies = [ "workspace_hack", ] +[[package]] +name = "migrations_internals" +version = "2.1.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0f23f71580015254b020e856feac3df5878c2c7a8812297edd6c0a485ac9dada" +dependencies = [ + "serde", + "toml", +] + +[[package]] +name = "migrations_macros" +version = "2.1.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "cce3325ac70e67bbab5bd837a31cae01f1a6db64e0e744a33cb03a543469ef08" +dependencies = [ + "migrations_internals", + "proc-macro2", + "quote", +] + [[package]] name = "mime" version = "0.3.17" @@ -2797,15 +2878,6 @@ version = "0.2.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "68354c5c6bd36d73ff3feceb05efa59b6acb7626617f4962be322a825e61f79a" -[[package]] -name = "miniz_oxide" -version = "0.6.2" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b275950c28b37e794e8c55d88aeb5e139d0ce23fdbbeda68f8d7174abdf9e8fa" -dependencies = [ - "adler", -] - [[package]] name = "miniz_oxide" version = "0.7.1" @@ -2865,16 +2937,14 @@ dependencies = [ [[package]] name = "nix" -version = "0.26.2" +version = "0.27.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "bfdda3d196821d6af13126e40375cdf7da646a96114af134d5f417a9a1dc8e1a" +checksum = "2eb04e9c688eff1c89d72b407f168cf79bb9e867a9d3323ed6c01519eb9cc053" dependencies = [ - "bitflags 1.3.2", + "bitflags 2.4.1", "cfg-if", "libc", - "memoffset 0.7.1", - "pin-utils", - "static_assertions", + "memoffset 0.9.0", ] [[package]] @@ -2889,20 +2959,21 @@ dependencies = [ [[package]] name = "notify" -version = "5.2.0" +version = "6.1.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = 
"729f63e1ca555a43fe3efa4f3efdf4801c479da85b432242a7b726f353c88486" +checksum = "6205bd8bb1e454ad2e27422015fb5e4f2bcc7e08fa8f27058670d208324a4d2d" dependencies = [ - "bitflags 1.3.2", + "bitflags 2.4.1", "crossbeam-channel", "filetime", "fsevent-sys", "inotify 0.9.6", "kqueue", "libc", + "log", "mio", "walkdir", - "windows-sys 0.45.0", + "windows-sys 0.48.0", ] [[package]] @@ -3028,9 +3099,9 @@ dependencies = [ [[package]] name = "object" -version = "0.30.3" +version = "0.32.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "ea86265d3d3dcb6a27fc51bd29a4bf387fae9d2986b823079d4986af253eb439" +checksum = "a6a622008b6e321afc04970976f62ee297fdbaa6f95318ca343e3eebb9648441" dependencies = [ "memchr", ] @@ -3102,9 +3173,9 @@ dependencies = [ [[package]] name = "opentelemetry" -version = "0.19.0" +version = "0.20.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "5f4b8347cc26099d3aeee044065ecc3ae11469796b4d65d065a23a584ed92a6f" +checksum = "9591d937bc0e6d2feb6f71a559540ab300ea49955229c347a517a28d27784c54" dependencies = [ "opentelemetry_api", "opentelemetry_sdk", @@ -3112,9 +3183,9 @@ dependencies = [ [[package]] name = "opentelemetry-http" -version = "0.8.0" +version = "0.9.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "a819b71d6530c4297b49b3cae2939ab3a8cc1b9f382826a1bc29dd0ca3864906" +checksum = "c7594ec0e11d8e33faf03530a4c49af7064ebba81c1480e01be67d90b356508b" dependencies = [ "async-trait", "bytes", @@ -3125,54 +3196,56 @@ dependencies = [ [[package]] name = "opentelemetry-otlp" -version = "0.12.0" +version = "0.13.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "8af72d59a4484654ea8eb183fea5ae4eb6a41d7ac3e3bae5f4d2a282a3a7d3ca" +checksum = "7e5e5a5c4135864099f3faafbe939eb4d7f9b80ebf68a8448da961b32a7c1275" dependencies = [ "async-trait", - "futures", - "futures-util", + "futures-core", "http", - "opentelemetry", "opentelemetry-http", 
"opentelemetry-proto", + "opentelemetry-semantic-conventions", + "opentelemetry_api", + "opentelemetry_sdk", "prost", "reqwest", "thiserror", + "tokio", + "tonic", ] [[package]] name = "opentelemetry-proto" -version = "0.2.0" +version = "0.3.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "045f8eea8c0fa19f7d48e7bc3128a39c2e5c533d5c61298c548dfefc1064474c" +checksum = "b1e3f814aa9f8c905d0ee4bde026afd3b2577a97c10e1699912e3e44f0c4cbeb" dependencies = [ - "futures", - "futures-util", - "opentelemetry", + "opentelemetry_api", + "opentelemetry_sdk", "prost", - "tonic 0.8.3", + "tonic", ] [[package]] name = "opentelemetry-semantic-conventions" -version = "0.11.0" +version = "0.12.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "24e33428e6bf08c6f7fcea4ddb8e358fab0fe48ab877a87c70c6ebe20f673ce5" +checksum = "73c9f9340ad135068800e7f1b24e9e09ed9e7143f5bf8518ded3d3ec69789269" dependencies = [ "opentelemetry", ] [[package]] name = "opentelemetry_api" -version = "0.19.0" +version = "0.20.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "ed41783a5bf567688eb38372f2b7a8530f5a607a4b49d38dd7573236c23ca7e2" +checksum = "8a81f725323db1b1206ca3da8bb19874bbd3f57c3bcd59471bfb04525b265b9b" dependencies = [ - "fnv", "futures-channel", "futures-util", "indexmap 1.9.3", + "js-sys", "once_cell", "pin-project-lite", "thiserror", @@ -3181,21 +3254,22 @@ dependencies = [ [[package]] name = "opentelemetry_sdk" -version = "0.19.0" +version = "0.20.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "8b3a2a91fdbfdd4d212c0dcc2ab540de2c2bcbbd90be17de7a7daf8822d010c1" +checksum = "fa8e705a0612d48139799fcbaba0d4a90f06277153e43dd2bdc16c6f0edd8026" dependencies = [ "async-trait", "crossbeam-channel", - "dashmap", - "fnv", "futures-channel", "futures-executor", "futures-util", "once_cell", "opentelemetry_api", + "ordered-float 3.9.2", "percent-encoding", "rand 0.8.5", + "regex", + 
"serde_json", "thiserror", "tokio", "tokio-stream", @@ -3210,6 +3284,15 @@ dependencies = [ "num-traits", ] +[[package]] +name = "ordered-float" +version = "3.9.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f1e1c390732d15f1d48471625cd92d154e66db2c56645e29a9cd26f4699f72dc" +dependencies = [ + "num-traits", +] + [[package]] name = "ordered-multimap" version = "0.7.1" @@ -3325,7 +3408,7 @@ dependencies = [ "itertools", "md5", "metrics", - "nix 0.26.2", + "nix 0.27.1", "num-traits", "num_cpus", "once_cell", @@ -3358,6 +3441,7 @@ dependencies = [ "tenant_size_model", "thiserror", "tokio", + "tokio-epoll-uring", "tokio-io-timeout", "tokio-postgres", "tokio-stream", @@ -3780,6 +3864,15 @@ version = "0.2.17" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "5b40af805b3121feab8a3c29f04d8ad262fa8e0561883e7653e024ae4479e6de" +[[package]] +name = "pq-sys" +version = "0.4.8" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "31c0052426df997c0cbd30789eb44ca097e3541717a7b8fa36b1c464ee7edebd" +dependencies = [ + "vcpkg", +] + [[package]] name = "pq_proto" version = "0.1.0" @@ -4339,9 +4432,9 @@ dependencies = [ [[package]] name = "reqwest-tracing" -version = "0.4.5" +version = "0.4.7" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "1b97ad83c2fc18113346b7158d79732242002427c30f620fa817c1f32901e0a8" +checksum = "5a0152176687dd5cfe7f507ac1cb1a491c679cfe483afd133a7db7aaea818bb3" dependencies = [ "anyhow", "async-trait", @@ -5110,9 +5203,9 @@ checksum = "62bb4feee49fdd9f707ef802e22365a35de4b7b299de4763d44bfea899442ff9" [[package]] name = "smol_str" -version = "0.2.0" +version = "0.2.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "74212e6bbe9a4352329b2f68ba3130c15a3f26fe88ff22dbdc6cdd58fa85e99c" +checksum = "e6845563ada680337a52d43bb0b29f396f2d911616f6573012645b9e3d048a49" dependencies = [ "serde", ] @@ -5195,7 +5288,7 @@ dependencies = 
[ "prost", "tokio", "tokio-stream", - "tonic 0.9.2", + "tonic", "tonic-build", "tracing", "utils", @@ -5379,18 +5472,18 @@ dependencies = [ [[package]] name = "thiserror" -version = "1.0.40" +version = "1.0.47" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "978c9a314bd8dc99be594bc3c175faaa9794be04a5a5e153caba6915336cebac" +checksum = "97a802ec30afc17eee47b2855fc72e0c4cd62be9b4efe6591edde0ec5bd68d8f" dependencies = [ "thiserror-impl", ] [[package]] name = "thiserror-impl" -version = "1.0.40" +version = "1.0.47" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "f9456a42c5b0d803c8cd86e73dd7cc9edd429499f37a3550d286d5e86720569f" +checksum = "6bb623b56e39ab7dcd4b1b98bb6c8f8d907ed255b18de254088016b27a8ee19b" dependencies = [ "proc-macro2", "quote", @@ -5415,7 +5508,7 @@ checksum = "7e54bc85fc7faa8bc175c4bab5b92ba8d9a3ce893d0e9f42cc455c8ab16a9e09" dependencies = [ "byteorder", "integer-encoding", - "ordered-float", + "ordered-float 2.10.1", ] [[package]] @@ -5514,6 +5607,21 @@ dependencies = [ "windows-sys 0.48.0", ] +[[package]] +name = "tokio-epoll-uring" +version = "0.1.0" +source = "git+https://github.com/neondatabase/tokio-epoll-uring.git?branch=main#0dd3a2f8bf3239d34a19719ef1a74146c093126f" +dependencies = [ + "futures", + "once_cell", + "scopeguard", + "thiserror", + "tokio", + "tokio-util", + "tracing", + "uring-common", +] + [[package]] name = "tokio-io-timeout" version = "1.2.0" @@ -5681,38 +5789,6 @@ dependencies = [ "winnow", ] -[[package]] -name = "tonic" -version = "0.8.3" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "8f219fad3b929bef19b1f86fbc0358d35daed8f2cac972037ac0dc10bbb8d5fb" -dependencies = [ - "async-stream", - "async-trait", - "axum", - "base64 0.13.1", - "bytes", - "futures-core", - "futures-util", - "h2", - "http", - "http-body", - "hyper", - "hyper-timeout", - "percent-encoding", - "pin-project", - "prost", - "prost-derive", - "tokio", - "tokio-stream", - 
"tokio-util", - "tower", - "tower-layer", - "tower-service", - "tracing", - "tracing-futures", -] - [[package]] name = "tonic" version = "0.9.2" @@ -5856,16 +5932,6 @@ dependencies = [ "tracing-subscriber", ] -[[package]] -name = "tracing-futures" -version = "0.2.5" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "97d095ae15e245a057c8e8451bab9b3ee1e1f68e9ba2b4fbc18d0ac5237835f2" -dependencies = [ - "pin-project", - "tracing", -] - [[package]] name = "tracing-log" version = "0.1.3" @@ -5879,9 +5945,9 @@ dependencies = [ [[package]] name = "tracing-opentelemetry" -version = "0.19.0" +version = "0.20.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "00a39dcf9bfc1742fa4d6215253b33a6e474be78275884c216fc2a06267b3600" +checksum = "fc09e402904a5261e42cf27aea09ccb7d5318c6717a9eec3d8e2e65c56b18f19" dependencies = [ "once_cell", "opentelemetry", @@ -6065,6 +6131,15 @@ dependencies = [ "webpki-roots 0.23.1", ] +[[package]] +name = "uring-common" +version = "0.1.0" +source = "git+https://github.com/neondatabase/tokio-epoll-uring.git?branch=main#0dd3a2f8bf3239d34a19719ef1a74146c093126f" +dependencies = [ + "io-uring", + "libc", +] + [[package]] name = "url" version = "2.3.1" @@ -6118,7 +6193,7 @@ dependencies = [ "hyper", "jsonwebtoken", "metrics", - "nix 0.26.2", + "nix 0.27.1", "once_cell", "pin-project-lite", "postgres_connection", @@ -6626,10 +6701,9 @@ dependencies = [ "clap", "clap_builder", "crossbeam-utils", - "dashmap", + "diesel", "either", "fail", - "futures", "futures-channel", "futures-core", "futures-executor", @@ -6674,6 +6748,7 @@ dependencies = [ "tokio-util", "toml_datetime", "toml_edit", + "tonic", "tower", "tracing", "tracing-core", diff --git a/Cargo.toml b/Cargo.toml index 5d5d2f4a55..8afab02b15 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -99,14 +99,14 @@ libc = "0.2" md5 = "0.7.0" memoffset = "0.8" native-tls = "0.2" -nix = "0.26" -notify = "5.0.0" +nix = { version = "0.27", features = ["fs", 
"process", "socket", "signal", "poll"] } +notify = "6.0.0" num_cpus = "1.15" num-traits = "0.2.15" once_cell = "1.13" -opentelemetry = "0.19.0" -opentelemetry-otlp = { version = "0.12.0", default_features=false, features = ["http-proto", "trace", "http", "reqwest-client"] } -opentelemetry-semantic-conventions = "0.11.0" +opentelemetry = "0.20.0" +opentelemetry-otlp = { version = "0.13.0", default_features=false, features = ["http-proto", "trace", "http", "reqwest-client"] } +opentelemetry-semantic-conventions = "0.12.0" parking_lot = "0.12" parquet = { version = "49.0.0", default-features = false, features = ["zstd"] } parquet_derive = "49.0.0" @@ -118,7 +118,7 @@ rand = "0.8" redis = { version = "0.24.0", features = ["tokio-rustls-comp", "keep-alive"] } regex = "1.10.2" reqwest = { version = "0.11", default-features = false, features = ["rustls-tls"] } -reqwest-tracing = { version = "0.4.0", features = ["opentelemetry_0_19"] } +reqwest-tracing = { version = "0.4.7", features = ["opentelemetry_0_20"] } reqwest-middleware = "0.2.0" reqwest-retry = "0.2.2" routerify = "3" @@ -151,6 +151,7 @@ test-context = "0.1" thiserror = "1.0" tls-listener = { version = "0.7", features = ["rustls", "hyper-h1"] } tokio = { version = "1.17", features = ["macros"] } +tokio-epoll-uring = { git = "https://github.com/neondatabase/tokio-epoll-uring.git" , branch = "main" } tokio-io-timeout = "1.2.0" tokio-postgres-rustls = "0.10.0" tokio-rustls = "0.24" @@ -162,7 +163,7 @@ toml_edit = "0.19" tonic = {version = "0.9", features = ["tls", "tls-roots"]} tracing = "0.1" tracing-error = "0.2.0" -tracing-opentelemetry = "0.19.0" +tracing-opentelemetry = "0.20.0" tracing-subscriber = { version = "0.3", default_features = false, features = ["smallvec", "fmt", "tracing-log", "std", "env-filter", "json"] } url = "2.2" uuid = { version = "1.6.1", features = ["v4", "v7", "serde"] } diff --git a/Dockerfile.compute-node b/Dockerfile.compute-node index 908460018f..299c4097e8 100644 --- 
a/Dockerfile.compute-node +++ b/Dockerfile.compute-node @@ -52,7 +52,7 @@ RUN cd postgres && \ # We need to grant EXECUTE on pg_stat_statements_reset() to neon_superuser. # In vanilla postgres this function is limited to Postgres role superuser. # In neon we have neon_superuser role that is not a superuser but replaces superuser in some cases. - # We could add the additional grant statements to the postgres repository but it would be hard to maintain, + # We could add the additional grant statements to the postgres repository but it would be hard to maintain, # whenever we need to pick up a new postgres version and we want to limit the changes in our postgres fork, # so we do it here. old_list="pg_stat_statements--1.0--1.1.sql pg_stat_statements--1.1--1.2.sql pg_stat_statements--1.2--1.3.sql pg_stat_statements--1.3--1.4.sql pg_stat_statements--1.4--1.5.sql pg_stat_statements--1.4.sql pg_stat_statements--1.5--1.6.sql"; \ @@ -63,14 +63,14 @@ RUN cd postgres && \ echo 'GRANT EXECUTE ON FUNCTION pg_stat_statements_reset() TO neon_superuser;' >> $file; \ fi; \ done; \ - # the second loop is for pg_stat_statement extension versions >= 1.7, + # the second loop is for pg_stat_statement extension versions >= 1.7, # where pg_stat_statement_reset() got 3 additional arguments for file in /usr/local/pgsql/share/extension/pg_stat_statements--*.sql; do \ filename=$(basename "$file"); \ if ! 
echo "$old_list" | grep -q -F "$filename"; then \ echo 'GRANT EXECUTE ON FUNCTION pg_stat_statements_reset(Oid, Oid, bigint) TO neon_superuser;' >> $file; \ fi; \ - done + done ######################################################################################### # @@ -143,29 +143,24 @@ RUN wget https://github.com/pgRouting/pgrouting/archive/v3.4.2.tar.gz -O pgrouti ######################################################################################### FROM build-deps AS plv8-build COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/ + RUN apt update && \ apt install -y ninja-build python3-dev libncurses5 binutils clang -RUN case "${PG_VERSION}" in \ - "v14" | "v15") \ - export PLV8_VERSION=3.1.5 \ - export PLV8_CHECKSUM=1e108d5df639e4c189e1c5bdfa2432a521c126ca89e7e5a969d46899ca7bf106 \ - ;; \ - "v16") \ - export PLV8_VERSION=3.1.8 \ - export PLV8_CHECKSUM=92b10c7db39afdae97ff748c9ec54713826af222c459084ad002571b79eb3f49 \ - ;; \ - *) \ - echo "Export the valid PG_VERSION variable" && exit 1 \ - ;; \ - esac && \ - wget https://github.com/plv8/plv8/archive/refs/tags/v${PLV8_VERSION}.tar.gz -O plv8.tar.gz && \ - echo "${PLV8_CHECKSUM} plv8.tar.gz" | sha256sum --check && \ +RUN wget https://github.com/plv8/plv8/archive/refs/tags/v3.1.10.tar.gz -O plv8.tar.gz && \ + echo "7096c3290928561f0d4901b7a52794295dc47f6303102fae3f8e42dd575ad97d plv8.tar.gz" | sha256sum --check && \ mkdir plv8-src && cd plv8-src && tar xvzf ../plv8.tar.gz --strip-components=1 -C . 
&& \ + # generate and copy upgrade scripts + mkdir -p upgrade && ./generate_upgrade.sh 3.1.10 && \ + cp upgrade/* /usr/local/pgsql/share/extension/ && \ export PATH="/usr/local/pgsql/bin:$PATH" && \ make DOCKER=1 -j $(getconf _NPROCESSORS_ONLN) install && \ rm -rf /plv8-* && \ find /usr/local/pgsql/ -name "plv8-*.so" | xargs strip && \ + # don't break computes with installed old version of plv8 + cd /usr/local/pgsql/lib/ && \ + ln -s plv8-3.1.10.so plv8-3.1.5.so && \ + ln -s plv8-3.1.10.so plv8-3.1.8.so && \ echo 'trusted = true' >> /usr/local/pgsql/share/extension/plv8.control && \ echo 'trusted = true' >> /usr/local/pgsql/share/extension/plcoffee.control && \ echo 'trusted = true' >> /usr/local/pgsql/share/extension/plls.control @@ -551,6 +546,7 @@ RUN wget https://github.com/rdkit/rdkit/archive/refs/tags/Release_2023_03_3.tar. -D PostgreSQL_TYPE_INCLUDE_DIR=`pg_config --includedir-server` \ -D PostgreSQL_LIBRARY_DIR=`pg_config --libdir` \ -D RDK_INSTALL_INTREE=OFF \ + -D RDK_INSTALL_COMIC_FONTS=OFF \ -D CMAKE_BUILD_TYPE=Release \ . && \ make -j $(getconf _NPROCESSORS_ONLN) && \ @@ -617,6 +613,7 @@ RUN wget https://github.com/theory/pg-semver/archive/refs/tags/v0.32.1.tar.gz -O FROM build-deps AS pg-embedding-pg-build COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/ +ARG PG_VERSION ENV PATH "/usr/local/pgsql/bin/:$PATH" RUN case "${PG_VERSION}" in \ "v14" | "v15") \ @@ -779,6 +776,8 @@ RUN wget https://github.com/eulerto/wal2json/archive/refs/tags/wal2json_2_5.tar. 
# ######################################################################################### FROM build-deps AS neon-pg-ext-build +ARG PG_VERSION + # Public extensions COPY --from=postgis-build /usr/local/pgsql/ /usr/local/pgsql/ COPY --from=postgis-build /sfcgal/* / diff --git a/compute_tools/src/compute.rs b/compute_tools/src/compute.rs index 5f5363105c..07e0abe6ff 100644 --- a/compute_tools/src/compute.rs +++ b/compute_tools/src/compute.rs @@ -700,13 +700,14 @@ impl ComputeNode { // In this case we need to connect with old `zenith_admin` name // and create new user. We cannot simply rename connected user, // but we can create a new one and grant it all privileges. - let mut client = match Client::connect(self.connstr.as_str(), NoTls) { + let connstr = self.connstr.clone(); + let mut client = match Client::connect(connstr.as_str(), NoTls) { Err(e) => { info!( "cannot connect to postgres: {}, retrying with `zenith_admin` username", e ); - let mut zenith_admin_connstr = self.connstr.clone(); + let mut zenith_admin_connstr = connstr.clone(); zenith_admin_connstr .set_username("zenith_admin") @@ -719,8 +720,8 @@ impl ComputeNode { client.simple_query("GRANT zenith_admin TO cloud_admin")?; drop(client); - // reconnect with connsting with expected name - Client::connect(self.connstr.as_str(), NoTls)? + // reconnect with connstring with expected name + Client::connect(connstr.as_str(), NoTls)? 
} Ok(client) => client, }; @@ -734,8 +735,8 @@ impl ComputeNode { cleanup_instance(&mut client)?; handle_roles(spec, &mut client)?; handle_databases(spec, &mut client)?; - handle_role_deletions(spec, self.connstr.as_str(), &mut client)?; - handle_grants(spec, &mut client, self.connstr.as_str())?; + handle_role_deletions(spec, connstr.as_str(), &mut client)?; + handle_grants(spec, &mut client, connstr.as_str())?; handle_extensions(spec, &mut client)?; handle_extension_neon(&mut client)?; create_availability_check_data(&mut client)?; @@ -743,6 +744,12 @@ impl ComputeNode { // 'Close' connection drop(client); + if self.has_feature(ComputeFeature::Migrations) { + thread::spawn(move || { + let mut client = Client::connect(connstr.as_str(), NoTls)?; + handle_migrations(&mut client) + }); + } Ok(()) } @@ -807,6 +814,10 @@ impl ComputeNode { handle_grants(&spec, &mut client, self.connstr.as_str())?; handle_extensions(&spec, &mut client)?; handle_extension_neon(&mut client)?; + // We can skip handle_migrations here because a new migration can only appear + // if we have a new version of the compute_ctl binary, which can only happen + // if compute got restarted, in which case we'll end up inside of apply_config + // instead of reconfigure. } // 'Close' connection diff --git a/compute_tools/src/spec.rs b/compute_tools/src/spec.rs index ef5f55622d..e87dc0b732 100644 --- a/compute_tools/src/spec.rs +++ b/compute_tools/src/spec.rs @@ -727,3 +727,79 @@ pub fn handle_extension_neon(client: &mut Client) -> Result<()> { Ok(()) } + +#[instrument(skip_all)] +pub fn handle_migrations(client: &mut Client) -> Result<()> { + info!("handle migrations"); + + // !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!! + // !BE SURE TO ONLY ADD MIGRATIONS TO THE END OF THIS ARRAY. IF YOU DO NOT, VERY VERY BAD THINGS MAY HAPPEN! + // !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!! 
+ + let migrations = [ + "ALTER ROLE neon_superuser BYPASSRLS", + r#" +DO $$ +DECLARE + role_name text; +BEGIN + FOR role_name IN SELECT rolname FROM pg_roles WHERE pg_has_role(rolname, 'neon_superuser', 'member') + LOOP + RAISE NOTICE 'EXECUTING ALTER ROLE % INHERIT', quote_ident(role_name); + EXECUTE 'ALTER ROLE ' || quote_ident(role_name) || ' INHERIT'; + END LOOP; + + FOR role_name IN SELECT rolname FROM pg_roles + WHERE + NOT pg_has_role(rolname, 'neon_superuser', 'member') AND NOT starts_with(rolname, 'pg_') + LOOP + RAISE NOTICE 'EXECUTING ALTER ROLE % NOBYPASSRLS', quote_ident(role_name); + EXECUTE 'ALTER ROLE ' || quote_ident(role_name) || ' NOBYPASSRLS'; + END LOOP; +END $$; +"#, + ]; + + let mut query = "CREATE SCHEMA IF NOT EXISTS neon_migration"; + client.simple_query(query)?; + + query = "CREATE TABLE IF NOT EXISTS neon_migration.migration_id (key INT NOT NULL PRIMARY KEY, id bigint NOT NULL DEFAULT 0)"; + client.simple_query(query)?; + + query = "INSERT INTO neon_migration.migration_id VALUES (0, 0) ON CONFLICT DO NOTHING"; + client.simple_query(query)?; + + query = "ALTER SCHEMA neon_migration OWNER TO cloud_admin"; + client.simple_query(query)?; + + query = "REVOKE ALL ON SCHEMA neon_migration FROM PUBLIC"; + client.simple_query(query)?; + + query = "SELECT id FROM neon_migration.migration_id"; + let row = client.query_one(query, &[])?; + let mut current_migration: usize = row.get::<&str, i64>("id") as usize; + let starting_migration_id = current_migration; + + query = "BEGIN"; + client.simple_query(query)?; + + while current_migration < migrations.len() { + info!("Running migration:\n{}\n", migrations[current_migration]); + client.simple_query(migrations[current_migration])?; + current_migration += 1; + } + let setval = format!( + "UPDATE neon_migration.migration_id SET id={}", + migrations.len() + ); + client.simple_query(&setval)?; + + query = "COMMIT"; + client.simple_query(query)?; + + info!( + "Ran {} migrations", + (migrations.len() - 
starting_migration_id) + ); + Ok(()) +} diff --git a/control_plane/Cargo.toml b/control_plane/Cargo.toml index 898ad05add..09c171f1d3 100644 --- a/control_plane/Cargo.toml +++ b/control_plane/Cargo.toml @@ -10,6 +10,8 @@ async-trait.workspace = true camino.workspace = true clap.workspace = true comfy-table.workspace = true +diesel = { version = "2.1.4", features = ["postgres"]} +diesel_migrations = { version = "2.1.0", features = ["postgres"]} futures.workspace = true git-version.workspace = true nix.workspace = true @@ -19,6 +21,7 @@ hex.workspace = true hyper.workspace = true regex.workspace = true reqwest = { workspace = true, features = ["blocking", "json"] } +scopeguard.workspace = true serde.workspace = true serde_json.workspace = true serde_with.workspace = true diff --git a/control_plane/attachment_service/Cargo.toml b/control_plane/attachment_service/Cargo.toml index 2e2286dbab..6fc21810bc 100644 --- a/control_plane/attachment_service/Cargo.toml +++ b/control_plane/attachment_service/Cargo.toml @@ -25,6 +25,8 @@ tracing.workspace = true # a parsing function when loading pageservers from neon_local LocalEnv postgres_backend.workspace = true +diesel = { version = "2.1.4", features = ["serde_json", "postgres"] } + utils = { path = "../../libs/utils/" } metrics = { path = "../../libs/metrics/" } control_plane = { path = ".." } diff --git a/control_plane/attachment_service/migrations/.keep b/control_plane/attachment_service/migrations/.keep new file mode 100644 index 0000000000..e69de29bb2 diff --git a/control_plane/attachment_service/migrations/00000000000000_diesel_initial_setup/down.sql b/control_plane/attachment_service/migrations/00000000000000_diesel_initial_setup/down.sql new file mode 100644 index 0000000000..a9f5260911 --- /dev/null +++ b/control_plane/attachment_service/migrations/00000000000000_diesel_initial_setup/down.sql @@ -0,0 +1,6 @@ +-- This file was automatically created by Diesel to setup helper functions +-- and other internal bookkeeping. 
This file is safe to edit, any future +-- changes will be added to existing projects as new migrations. + +DROP FUNCTION IF EXISTS diesel_manage_updated_at(_tbl regclass); +DROP FUNCTION IF EXISTS diesel_set_updated_at(); diff --git a/control_plane/attachment_service/migrations/00000000000000_diesel_initial_setup/up.sql b/control_plane/attachment_service/migrations/00000000000000_diesel_initial_setup/up.sql new file mode 100644 index 0000000000..d68895b1a7 --- /dev/null +++ b/control_plane/attachment_service/migrations/00000000000000_diesel_initial_setup/up.sql @@ -0,0 +1,36 @@ +-- This file was automatically created by Diesel to setup helper functions +-- and other internal bookkeeping. This file is safe to edit, any future +-- changes will be added to existing projects as new migrations. + + + + +-- Sets up a trigger for the given table to automatically set a column called +-- `updated_at` whenever the row is modified (unless `updated_at` was included +-- in the modified columns) +-- +-- # Example +-- +-- ```sql +-- CREATE TABLE users (id SERIAL PRIMARY KEY, updated_at TIMESTAMP NOT NULL DEFAULT NOW()); +-- +-- SELECT diesel_manage_updated_at('users'); +-- ``` +CREATE OR REPLACE FUNCTION diesel_manage_updated_at(_tbl regclass) RETURNS VOID AS $$ +BEGIN + EXECUTE format('CREATE TRIGGER set_updated_at BEFORE UPDATE ON %s + FOR EACH ROW EXECUTE PROCEDURE diesel_set_updated_at()', _tbl); +END; +$$ LANGUAGE plpgsql; + +CREATE OR REPLACE FUNCTION diesel_set_updated_at() RETURNS trigger AS $$ +BEGIN + IF ( + NEW IS DISTINCT FROM OLD AND + NEW.updated_at IS NOT DISTINCT FROM OLD.updated_at + ) THEN + NEW.updated_at := current_timestamp; + END IF; + RETURN NEW; +END; +$$ LANGUAGE plpgsql; diff --git a/control_plane/attachment_service/migrations/2024-01-07-211257_create_tenant_shards/down.sql b/control_plane/attachment_service/migrations/2024-01-07-211257_create_tenant_shards/down.sql new file mode 100644 index 0000000000..b875b91c00 --- /dev/null +++ 
b/control_plane/attachment_service/migrations/2024-01-07-211257_create_tenant_shards/down.sql @@ -0,0 +1 @@ +DROP TABLE tenant_shards; diff --git a/control_plane/attachment_service/migrations/2024-01-07-211257_create_tenant_shards/up.sql b/control_plane/attachment_service/migrations/2024-01-07-211257_create_tenant_shards/up.sql new file mode 100644 index 0000000000..585dbc79a0 --- /dev/null +++ b/control_plane/attachment_service/migrations/2024-01-07-211257_create_tenant_shards/up.sql @@ -0,0 +1,12 @@ +CREATE TABLE tenant_shards ( + tenant_id VARCHAR NOT NULL, + shard_number INTEGER NOT NULL, + shard_count INTEGER NOT NULL, + PRIMARY KEY(tenant_id, shard_number, shard_count), + shard_stripe_size INTEGER NOT NULL, + generation INTEGER NOT NULL, + generation_pageserver BIGINT NOT NULL, + placement_policy VARCHAR NOT NULL, + -- config is JSON encoded, opaque to the database. + config TEXT NOT NULL +); \ No newline at end of file diff --git a/control_plane/attachment_service/migrations/2024-01-07-212945_create_nodes/down.sql b/control_plane/attachment_service/migrations/2024-01-07-212945_create_nodes/down.sql new file mode 100644 index 0000000000..ec303bc8cf --- /dev/null +++ b/control_plane/attachment_service/migrations/2024-01-07-212945_create_nodes/down.sql @@ -0,0 +1 @@ +DROP TABLE nodes; diff --git a/control_plane/attachment_service/migrations/2024-01-07-212945_create_nodes/up.sql b/control_plane/attachment_service/migrations/2024-01-07-212945_create_nodes/up.sql new file mode 100644 index 0000000000..9be0880fa4 --- /dev/null +++ b/control_plane/attachment_service/migrations/2024-01-07-212945_create_nodes/up.sql @@ -0,0 +1,10 @@ +CREATE TABLE nodes ( + node_id BIGINT PRIMARY KEY NOT NULL, + + scheduling_policy VARCHAR NOT NULL, + + listen_http_addr VARCHAR NOT NULL, + listen_http_port INTEGER NOT NULL, + listen_pg_addr VARCHAR NOT NULL, + listen_pg_port INTEGER NOT NULL +); \ No newline at end of file diff --git a/control_plane/attachment_service/src/http.rs 
b/control_plane/attachment_service/src/http.rs index 30f6dd66ee..81f21a8e7a 100644 --- a/control_plane/attachment_service/src/http.rs +++ b/control_plane/attachment_service/src/http.rs @@ -1,5 +1,5 @@ use crate::reconciler::ReconcileError; -use crate::service::Service; +use crate::service::{Service, STARTUP_RECONCILE_TIMEOUT}; use hyper::{Body, Request, Response}; use hyper::{StatusCode, Uri}; use pageserver_api::models::{TenantCreateRequest, TimelineCreateRequest}; @@ -104,34 +104,34 @@ async fn handle_inspect(mut req: Request) -> Result, ApiErr json_response(StatusCode::OK, state.service.inspect(inspect_req)) } -async fn handle_tenant_create(mut req: Request) -> Result, ApiError> { +async fn handle_tenant_create( + service: Arc, + mut req: Request, +) -> Result, ApiError> { let create_req = json_request::(&mut req).await?; - let state = get_state(&req); - json_response( - StatusCode::OK, - state.service.tenant_create(create_req).await?, - ) + json_response(StatusCode::OK, service.tenant_create(create_req).await?) } -async fn handle_tenant_timeline_create(mut req: Request) -> Result, ApiError> { +async fn handle_tenant_timeline_create( + service: Arc, + mut req: Request, +) -> Result, ApiError> { let tenant_id: TenantId = parse_request_param(&req, "tenant_id")?; let create_req = json_request::(&mut req).await?; - - let state = get_state(&req); json_response( StatusCode::OK, - state - .service + service .tenant_timeline_create(tenant_id, create_req) .await?, ) } -async fn handle_tenant_locate(req: Request) -> Result, ApiError> { +async fn handle_tenant_locate( + service: Arc, + req: Request, +) -> Result, ApiError> { let tenant_id: TenantId = parse_request_param(&req, "tenant_id")?; - let state = get_state(&req); - - json_response(StatusCode::OK, state.service.tenant_locate(tenant_id)?) + json_response(StatusCode::OK, service.tenant_locate(tenant_id)?) 
} async fn handle_node_register(mut req: Request) -> Result, ApiError> { @@ -154,14 +154,15 @@ async fn handle_node_configure(mut req: Request) -> Result, json_response(StatusCode::OK, state.service.node_configure(config_req)?) } -async fn handle_tenant_shard_migrate(mut req: Request) -> Result, ApiError> { +async fn handle_tenant_shard_migrate( + service: Arc, + mut req: Request, +) -> Result, ApiError> { let tenant_shard_id: TenantShardId = parse_request_param(&req, "tenant_shard_id")?; let migrate_req = json_request::(&mut req).await?; - let state = get_state(&req); json_response( StatusCode::OK, - state - .service + service .tenant_shard_migrate(tenant_shard_id, migrate_req) .await?, ) @@ -178,6 +179,35 @@ impl From for ApiError { } } +/// Common wrapper for request handlers that call into Service and will operate on tenants: they must only +/// be allowed to run if Service has finished its initial reconciliation. +async fn tenant_service_handler(request: Request, handler: H) -> R::Output +where + R: std::future::Future, ApiError>> + Send + 'static, + H: FnOnce(Arc, Request) -> R + Send + Sync + 'static, +{ + let state = get_state(&request); + let service = state.service.clone(); + + let startup_complete = service.startup_complete.clone(); + if tokio::time::timeout(STARTUP_RECONCILE_TIMEOUT, startup_complete.wait()) + .await + .is_err() + { + // This shouldn't happen: it is the responsibility of [`Service::startup_reconcile`] to use appropriate + // timeouts around its remote calls, to bound its runtime.
+ return Err(ApiError::Timeout( + "Timed out waiting for service readiness".into(), + )); + } + + request_span( + request, + |request| async move { handler(service, request).await }, + ) + .await +} + pub fn make_router( service: Arc, auth: Option>, @@ -205,14 +235,20 @@ pub fn make_router( .put("/node/:node_id/config", |r| { request_span(r, handle_node_configure) }) - .post("/tenant", |r| request_span(r, handle_tenant_create)) - .post("/tenant/:tenant_id/timeline", |r| { - request_span(r, handle_tenant_timeline_create) + .post("/v1/tenant", |r| { + tenant_service_handler(r, handle_tenant_create) + }) + .post("/v1/tenant/:tenant_id/timeline", |r| { + tenant_service_handler(r, handle_tenant_timeline_create) }) .get("/tenant/:tenant_id/locate", |r| { - request_span(r, handle_tenant_locate) + tenant_service_handler(r, handle_tenant_locate) }) .put("/tenant/:tenant_shard_id/migrate", |r| { - request_span(r, handle_tenant_shard_migrate) + tenant_service_handler(r, handle_tenant_shard_migrate) }) + // Path aliases for tests_forward_compatibility + // TODO: remove these in future PR + .post("/re-attach", |r| request_span(r, handle_re_attach)) + .post("/validate", |r| request_span(r, handle_validate)) } diff --git a/control_plane/attachment_service/src/lib.rs b/control_plane/attachment_service/src/lib.rs index d8f996952a..082afb4157 100644 --- a/control_plane/attachment_service/src/lib.rs +++ b/control_plane/attachment_service/src/lib.rs @@ -7,6 +7,7 @@ mod node; pub mod persistence; mod reconciler; mod scheduler; +mod schema; pub mod service; mod tenant_state; @@ -17,6 +18,8 @@ enum PlacementPolicy { /// Production-ready way to attach a tenant: one attached pageserver and /// some number of secondaries. 
Double(usize), + /// Do not attach to any pageservers + Detached, } #[derive(Ord, PartialOrd, Eq, PartialEq, Copy, Clone)] diff --git a/control_plane/attachment_service/src/main.rs b/control_plane/attachment_service/src/main.rs index ee2a22ee53..05a3895dfa 100644 --- a/control_plane/attachment_service/src/main.rs +++ b/control_plane/attachment_service/src/main.rs @@ -12,9 +12,9 @@ use camino::Utf8PathBuf; use clap::Parser; use metrics::launch_timestamp::LaunchTimestamp; use std::sync::Arc; +use tokio::signal::unix::SignalKind; use utils::auth::{JwtAuth, SwappableJwtAuth}; use utils::logging::{self, LogFormat}; -use utils::signals::{ShutdownSignals, Signal}; use utils::{project_build_tag, project_git_version, tcp_listener}; @@ -40,6 +40,10 @@ struct Cli { /// Path to the .json file to store state (will be created if it doesn't exist) #[arg(short, long)] path: Utf8PathBuf, + + /// URL to connect to postgres, like postgresql://localhost:1234/attachment_service + #[arg(long)] + database_url: String, } #[tokio::main] @@ -66,9 +70,14 @@ async fn main() -> anyhow::Result<()> { jwt_token: args.jwt_token, }; - let persistence = Arc::new(Persistence::new(&args.path).await); + let json_path = if args.path.as_os_str().is_empty() { + None + } else { + Some(args.path) + }; + let persistence = Arc::new(Persistence::new(args.database_url, json_path.clone())); - let service = Service::spawn(config, persistence).await?; + let service = Service::spawn(config, persistence.clone()).await?; let http_listener = tcp_listener::bind(args.listen)?; @@ -81,20 +90,31 @@ async fn main() -> anyhow::Result<()> { let router = make_router(service, auth) .build() .map_err(|err| anyhow!(err))?; - let service = utils::http::RouterService::new(router).unwrap(); - let server = hyper::Server::from_tcp(http_listener)?.serve(service); + let router_service = utils::http::RouterService::new(router).unwrap(); + let server = hyper::Server::from_tcp(http_listener)?.serve(router_service); tracing::info!("Serving 
on {0}", args.listen); tokio::task::spawn(server); - ShutdownSignals::handle(|signal| match signal { - Signal::Interrupt | Signal::Terminate | Signal::Quit => { - tracing::info!("Got {}. Terminating", signal.name()); - // We're just a test helper: no graceful shutdown. - std::process::exit(0); - } - })?; + // Wait until we receive a signal + let mut sigint = tokio::signal::unix::signal(SignalKind::interrupt())?; + let mut sigquit = tokio::signal::unix::signal(SignalKind::quit())?; + let mut sigterm = tokio::signal::unix::signal(SignalKind::terminate())?; + tokio::select! { + _ = sigint.recv() => {}, + _ = sigterm.recv() => {}, + _ = sigquit.recv() => {}, + } + tracing::info!("Terminating on signal"); - Ok(()) + if json_path.is_some() { + // Write out a JSON dump on shutdown: this is used in compat tests to avoid passing + // full postgres dumps around. + if let Err(e) = persistence.write_tenants_json().await { + tracing::error!("Failed to write JSON on shutdown: {e}") + } + } + + std::process::exit(0); } diff --git a/control_plane/attachment_service/src/node.rs b/control_plane/attachment_service/src/node.rs index efd3f8f49b..47f61702d8 100644 --- a/control_plane/attachment_service/src/node.rs +++ b/control_plane/attachment_service/src/node.rs @@ -1,6 +1,8 @@ use control_plane::attachment_service::{NodeAvailability, NodeSchedulingPolicy}; use utils::id::NodeId; +use crate::persistence::NodePersistence; + #[derive(Clone)] pub(crate) struct Node { pub(crate) id: NodeId, @@ -34,4 +36,15 @@ impl Node { NodeSchedulingPolicy::Pause => false, } } + + pub(crate) fn to_persistent(&self) -> NodePersistence { + NodePersistence { + node_id: self.id.0 as i64, + scheduling_policy: self.scheduling.into(), + listen_http_addr: self.listen_http_addr.clone(), + listen_http_port: self.listen_http_port as i32, + listen_pg_addr: self.listen_pg_addr.clone(), + listen_pg_port: self.listen_pg_port as i32, + } + } } diff --git a/control_plane/attachment_service/src/persistence.rs 
b/control_plane/attachment_service/src/persistence.rs index 58708be140..b27bd2bf2e 100644 --- a/control_plane/attachment_service/src/persistence.rs +++ b/control_plane/attachment_service/src/persistence.rs @@ -1,139 +1,161 @@ -use std::{collections::HashMap, str::FromStr}; +use std::collections::HashMap; +use std::str::FromStr; -use camino::{Utf8Path, Utf8PathBuf}; -use control_plane::{ - attachment_service::{NodeAvailability, NodeSchedulingPolicy}, - local_env::LocalEnv, -}; -use pageserver_api::{ - models::TenantConfig, - shard::{ShardCount, ShardNumber, TenantShardId}, -}; +use camino::Utf8Path; +use camino::Utf8PathBuf; +use control_plane::attachment_service::{NodeAvailability, NodeSchedulingPolicy}; +use diesel::pg::PgConnection; +use diesel::prelude::*; +use diesel::Connection; +use pageserver_api::models::TenantConfig; +use pageserver_api::shard::{ShardCount, ShardNumber, TenantShardId}; use postgres_connection::parse_host_port; use serde::{Deserialize, Serialize}; -use utils::{ - generation::Generation, - id::{NodeId, TenantId}, -}; +use utils::generation::Generation; +use utils::id::{NodeId, TenantId}; -use crate::{node::Node, PlacementPolicy}; +use crate::node::Node; +use crate::PlacementPolicy; -/// Placeholder for storage. This will be replaced with a database client. +/// ## What do we store? +/// +/// The attachment service does not store most of its state durably. +/// +/// The essential things to store durably are: +/// - generation numbers, as these must always advance monotonically to ensure data safety. +/// - Tenant's PlacementPolicy and TenantConfig, as the source of truth for these is something external. +/// - Node's scheduling policies, as the source of truth for these is something external. 
+/// +/// Other things we store durably as an implementation detail: +/// - Node's host/port: this could be avoided if we made nodes emit a self-registering heartbeat, +/// but it is operationally simpler to make this service the authority for which nodes +/// it talks to. +/// +/// ## Performance/efficiency +/// +/// The attachment service does not go via the database for most things: there are +/// a couple of places where we must, and where efficiency matters: +/// - Incrementing generation numbers: the Reconciler has to wait for this to complete +/// before it can attach a tenant, so this acts as a bound on how fast things like +/// failover can happen. +/// - Pageserver re-attach: we will increment many shards' generations when this happens, +/// so it is important to avoid e.g. issuing O(N) queries. +/// +/// Database calls relating to nodes have low performance requirements, as they are very rarely +/// updated, and reads of nodes are always from memory, not the database. We only require that +/// we can UPDATE a node's scheduling mode reasonably quickly to mark a bad node offline. pub struct Persistence { - state: std::sync::Mutex, + database_url: String, + + // In test environments, we support loading+saving a JSON file. This is temporary, for the benefit of + // test_compatibility.py, so that we don't have to commit to making the database contents fully backward/forward + // compatible just yet. + json_path: Option, } -// Top level state available to all HTTP handlers +/// Legacy format, for use in JSON compat objects in test environment #[derive(Serialize, Deserialize)] -struct PersistentState { +struct JsonPersistence { tenants: HashMap, - - #[serde(skip)] - path: Utf8PathBuf, } -/// A convenience for serializing the state inside a sync lock, and then -/// writing it to disk outside of the lock. This will go away when switching -/// to a database backend.
-struct PendingWrite { - bytes: Vec, - path: Utf8PathBuf, +#[derive(thiserror::Error, Debug)] +pub(crate) enum DatabaseError { + #[error(transparent)] + Query(#[from] diesel::result::Error), + #[error(transparent)] + Connection(#[from] diesel::result::ConnectionError), + #[error("Logical error: {0}")] + Logical(String), } -impl PendingWrite { - async fn commit(&self) -> anyhow::Result<()> { - tokio::fs::write(&self.path, &self.bytes).await?; - - Ok(()) - } -} - -impl PersistentState { - fn save(&self) -> PendingWrite { - PendingWrite { - bytes: serde_json::to_vec(self).expect("Serialization error"), - path: self.path.clone(), - } - } - - async fn load(path: &Utf8Path) -> anyhow::Result { - let bytes = tokio::fs::read(path).await?; - let mut decoded = serde_json::from_slice::(&bytes)?; - decoded.path = path.to_owned(); - - for (tenant_id, tenant) in &mut decoded.tenants { - // Backward compat: an old attachments.json from before PR #6251, replace - // empty strings with proper defaults. - if tenant.tenant_id.is_empty() { - tenant.tenant_id = format!("{}", tenant_id); - tenant.config = serde_json::to_string(&TenantConfig::default())?; - tenant.placement_policy = serde_json::to_string(&PlacementPolicy::default())?; - } - } - - Ok(decoded) - } - - async fn load_or_new(path: &Utf8Path) -> Self { - match Self::load(path).await { - Ok(s) => { - tracing::info!("Loaded state file at {}", path); - s - } - Err(e) - if e.downcast_ref::() - .map(|e| e.kind() == std::io::ErrorKind::NotFound) - .unwrap_or(false) => - { - tracing::info!("Will create state file at {}", path); - Self { - tenants: HashMap::new(), - path: path.to_owned(), - } - } - Err(e) => { - panic!("Failed to load state from '{}': {e:#} (maybe your .neon/ dir was written by an older version?)", path) - } - } - } -} +pub(crate) type DatabaseResult = Result; impl Persistence { - pub async fn new(path: &Utf8Path) -> Self { - let state = PersistentState::load_or_new(path).await; + pub fn new(database_url: String, 
json_path: Option) -> Self { Self { - state: std::sync::Mutex::new(state), + database_url, + json_path, } } - /// When registering a node, persist it so that on next start we will be able to - /// iterate over known nodes to synchronize their tenant shard states with our observed state. - pub(crate) async fn insert_node(&self, _node: &Node) -> anyhow::Result<()> { - // TODO: node persitence will come with database backend - Ok(()) + /// Call the provided function in a tokio blocking thread, with a Diesel database connection. + async fn with_conn(&self, func: F) -> DatabaseResult + where + F: Fn(&mut PgConnection) -> DatabaseResult + Send + 'static, + R: Send + 'static, + { + let database_url = self.database_url.clone(); + tokio::task::spawn_blocking(move || -> DatabaseResult { + // TODO: connection pooling, such as via diesel::r2d2 + let mut conn = PgConnection::establish(&database_url)?; + func(&mut conn) + }) + .await + .expect("Task panic") } - /// At startup, we populate the service's list of nodes, and use this list to call into - /// each node to do an initial reconciliation of the state of the world with our in-memory - /// observed state. 
- pub(crate) async fn list_nodes(&self) -> anyhow::Result> { - let env = LocalEnv::load_config()?; - // TODO: node persitence will come with database backend + /// When a node is first registered, persist it before using it for anything + pub(crate) async fn insert_node(&self, node: &Node) -> DatabaseResult<()> { + let np = node.to_persistent(); + self.with_conn(move |conn| -> DatabaseResult<()> { + diesel::insert_into(crate::schema::nodes::table) + .values(&np) + .execute(conn)?; + Ok(()) + }) + .await + } - // XXX hack: enable test_backward_compatibility to work by populating our list of + /// At startup, populate the list of nodes which our shards may be placed on + pub(crate) async fn list_nodes(&self) -> DatabaseResult> { + let nodes: Vec = self + .with_conn(move |conn| -> DatabaseResult<_> { + Ok(crate::schema::nodes::table + .load::(conn)? + .into_iter() + .map(|n| Node { + id: NodeId(n.node_id as u64), + // At startup we consider a node offline until proven otherwise. + availability: NodeAvailability::Offline, + scheduling: NodeSchedulingPolicy::from_str(&n.scheduling_policy) + .expect("Bad scheduling policy in DB"), + listen_http_addr: n.listen_http_addr, + listen_http_port: n.listen_http_port as u16, + listen_pg_addr: n.listen_pg_addr, + listen_pg_port: n.listen_pg_port as u16, + }) + .collect::>()) + }) + .await?; + + if nodes.is_empty() { + return self.list_nodes_local_env().await; + } + + tracing::info!("list_nodes: loaded {} nodes", nodes.len()); + + Ok(nodes) + } + + /// Shim for automated compatibility tests: load nodes from LocalEnv instead of database + pub(crate) async fn list_nodes_local_env(&self) -> DatabaseResult> { + // Enable test_backward_compatibility to work by populating our list of // nodes from LocalEnv when it is not present in persistent storage. Otherwise at // first startup in the compat test, we may have shards but no nodes. 
- let mut result = Vec::new(); + use control_plane::local_env::LocalEnv; + let env = LocalEnv::load_config().map_err(|e| DatabaseError::Logical(format!("{e}")))?; tracing::info!( - "Loaded {} pageserver nodes from LocalEnv", + "Loading {} pageserver nodes from LocalEnv", env.pageservers.len() ); + let mut nodes = Vec::new(); for ps_conf in env.pageservers { let (pg_host, pg_port) = parse_host_port(&ps_conf.listen_pg_addr).expect("Unable to parse listen_pg_addr"); let (http_host, http_port) = parse_host_port(&ps_conf.listen_http_addr) .expect("Unable to parse listen_http_addr"); - result.push(Node { + let node = Node { id: ps_conf.id, listen_pg_addr: pg_host.to_string(), listen_pg_port: pg_port.unwrap_or(5432), @@ -141,16 +163,96 @@ impl Persistence { listen_http_port: http_port.unwrap_or(80), availability: NodeAvailability::Active, scheduling: NodeSchedulingPolicy::Active, - }); + }; + + // Synchronize database with what we learn from LocalEnv + self.insert_node(&node).await?; + + nodes.push(node); } - Ok(result) + Ok(nodes) } - /// At startup, we populate our map of tenant shards from persistent storage. - pub(crate) async fn list_tenant_shards(&self) -> anyhow::Result> { - let locked = self.state.lock().unwrap(); - Ok(locked.tenants.values().cloned().collect()) + /// At startup, load the high level state for shards, such as their config + policy. This will + /// be enriched at runtime with state discovered on pageservers. + pub(crate) async fn list_tenant_shards(&self) -> DatabaseResult> { + let loaded = self + .with_conn(move |conn| -> DatabaseResult<_> { + Ok(crate::schema::tenant_shards::table.load::(conn)?) + }) + .await?; + + if loaded.is_empty() { + if let Some(path) = &self.json_path { + if tokio::fs::try_exists(path) + .await + .map_err(|e| DatabaseError::Logical(format!("Error stat'ing JSON file: {e}")))? 
+ { + tracing::info!("Importing from legacy JSON format at {path}"); + return self.list_tenant_shards_json(path).await; + } + } + } + Ok(loaded) + } + + /// Shim for automated compatibility tests: load tenants from a JSON file instead of database + pub(crate) async fn list_tenant_shards_json( + &self, + path: &Utf8Path, + ) -> DatabaseResult> { + let bytes = tokio::fs::read(path) + .await + .map_err(|e| DatabaseError::Logical(format!("Failed to load JSON: {e}")))?; + + let mut decoded = serde_json::from_slice::(&bytes) + .map_err(|e| DatabaseError::Logical(format!("Deserialization error: {e}")))?; + for (tenant_id, tenant) in &mut decoded.tenants { + // Backward compat: an old attachments.json from before PR #6251, replace + // empty strings with proper defaults. + if tenant.tenant_id.is_empty() { + tenant.tenant_id = tenant_id.to_string(); + tenant.config = serde_json::to_string(&TenantConfig::default()) + .map_err(|e| DatabaseError::Logical(format!("Serialization error: {e}")))?; + tenant.placement_policy = serde_json::to_string(&PlacementPolicy::default()) + .map_err(|e| DatabaseError::Logical(format!("Serialization error: {e}")))?; + } + } + + let tenants: Vec = decoded.tenants.into_values().collect(); + + // Synchronize database with what is in the JSON file + self.insert_tenant_shards(tenants.clone()).await?; + + Ok(tenants) + } + + /// For use in testing environments, where we dump out JSON on shutdown. 
+ pub async fn write_tenants_json(&self) -> anyhow::Result<()> { + let Some(path) = &self.json_path else { + anyhow::bail!("Cannot write JSON if path isn't set (test environment bug)"); + }; + tracing::info!("Writing state to {path}..."); + let tenants = self.list_tenant_shards().await?; + let mut tenants_map = HashMap::new(); + for tsp in tenants { + let tenant_shard_id = TenantShardId { + tenant_id: TenantId::from_str(tsp.tenant_id.as_str())?, + shard_number: ShardNumber(tsp.shard_number as u8), + shard_count: ShardCount(tsp.shard_count as u8), + }; + + tenants_map.insert(tenant_shard_id, tsp); + } + let json = serde_json::to_string(&JsonPersistence { + tenants: tenants_map, + })?; + + tokio::fs::write(path, &json).await?; + tracing::info!("Wrote {} bytes to {path}...", json.len()); + + Ok(()) } /// Tenants must be persisted before we schedule them for the first time. This enables us @@ -158,24 +260,77 @@ impl Persistence { pub(crate) async fn insert_tenant_shards( &self, shards: Vec, - ) -> anyhow::Result<()> { - let write = { - let mut locked = self.state.lock().unwrap(); - for shard in shards { - let tenant_shard_id = TenantShardId { - tenant_id: TenantId::from_str(shard.tenant_id.as_str())?, - shard_number: ShardNumber(shard.shard_number as u8), - shard_count: ShardCount(shard.shard_count as u8), - }; + ) -> DatabaseResult<()> { + use crate::schema::tenant_shards::dsl::*; + self.with_conn(move |conn| -> DatabaseResult<()> { + conn.transaction(|conn| -> QueryResult<()> { + for tenant in &shards { + diesel::insert_into(tenant_shards) + .values(tenant) + .execute(conn)?; + } + Ok(()) + })?; + Ok(()) + }) + .await + } - locked.tenants.insert(tenant_shard_id, shard); - } - locked.save() - }; + /// Ordering: call this _after_ deleting the tenant on pageservers, but _before_ dropping state for + /// the tenant from memory on this server. 
+ #[allow(unused)] + pub(crate) async fn delete_tenant(&self, del_tenant_id: TenantId) -> DatabaseResult<()> { + use crate::schema::tenant_shards::dsl::*; + self.with_conn(move |conn| -> DatabaseResult<()> { + diesel::delete(tenant_shards) + .filter(tenant_id.eq(del_tenant_id.to_string())) + .execute(conn)?; - write.commit().await?; + Ok(()) + }) + .await + } - Ok(()) + /// When a tenant invokes the /re-attach API, this function is responsible for doing an efficient + /// batched increment of the generations of all tenants whose generation_pageserver is equal to + /// the node that called /re-attach. + #[tracing::instrument(skip_all, fields(node_id))] + pub(crate) async fn re_attach( + &self, + node_id: NodeId, + ) -> DatabaseResult> { + use crate::schema::tenant_shards::dsl::*; + let updated = self + .with_conn(move |conn| { + let rows_updated = diesel::update(tenant_shards) + .filter(generation_pageserver.eq(node_id.0 as i64)) + .set(generation.eq(generation + 1)) + .execute(conn)?; + + tracing::info!("Incremented {} tenants' generations", rows_updated); + + // TODO: UPDATE+SELECT in one query + + let updated = tenant_shards + .filter(generation_pageserver.eq(node_id.0 as i64)) + .select(TenantShardPersistence::as_select()) + .load(conn)?; + Ok(updated) + }) + .await?; + + let mut result = HashMap::new(); + for tsp in updated { + let tenant_shard_id = TenantShardId { + tenant_id: TenantId::from_str(tsp.tenant_id.as_str()) + .map_err(|e| DatabaseError::Logical(format!("Malformed tenant id: {e}")))?, + shard_number: ShardNumber(tsp.shard_number as u8), + shard_count: ShardCount(tsp.shard_count as u8), + }; + result.insert(tenant_shard_id, Generation::new(tsp.generation as u32)); + } + + Ok(result) } /// Reconciler calls this immediately before attaching to a new pageserver, to acquire a unique, monotonically @@ -184,49 +339,48 @@ impl Persistence { pub(crate) async fn increment_generation( &self, tenant_shard_id: TenantShardId, - node_id: Option, + node_id: NodeId, 
) -> anyhow::Result { - let (write, gen) = { - let mut locked = self.state.lock().unwrap(); - let Some(shard) = locked.tenants.get_mut(&tenant_shard_id) else { - anyhow::bail!("Tried to increment generation of unknown shard"); - }; + use crate::schema::tenant_shards::dsl::*; + let updated = self + .with_conn(move |conn| { + let updated = diesel::update(tenant_shards) + .filter(tenant_id.eq(tenant_shard_id.tenant_id.to_string())) + .filter(shard_number.eq(tenant_shard_id.shard_number.0 as i32)) + .filter(shard_count.eq(tenant_shard_id.shard_count.0 as i32)) + .set(( + generation.eq(generation + 1), + generation_pageserver.eq(node_id.0 as i64), + )) + // TODO: only returning() the generation column + .returning(TenantShardPersistence::as_returning()) + .get_result(conn)?; - // If we're called with a None pageserver, we need only update the generation - // record to disassociate it with this pageserver, not actually increment the number, as - // the increment is guaranteed to happen the next time this tenant is attached. 
- if node_id.is_some() { - shard.generation += 1; - } + Ok(updated) + }) + .await?; - shard.generation_pageserver = node_id; - let gen = Generation::new(shard.generation); - (locked.save(), gen) - }; - - write.commit().await?; - Ok(gen) + Ok(Generation::new(updated.generation as u32)) } - pub(crate) async fn re_attach( - &self, - node_id: NodeId, - ) -> anyhow::Result> { - let (write, result) = { - let mut result = HashMap::new(); - let mut locked = self.state.lock().unwrap(); - for (tenant_shard_id, shard) in locked.tenants.iter_mut() { - if shard.generation_pageserver == Some(node_id) { - shard.generation += 1; - result.insert(*tenant_shard_id, Generation::new(shard.generation)); - } - } + pub(crate) async fn detach(&self, tenant_shard_id: TenantShardId) -> anyhow::Result<()> { + use crate::schema::tenant_shards::dsl::*; + self.with_conn(move |conn| { + let updated = diesel::update(tenant_shards) + .filter(tenant_id.eq(tenant_shard_id.tenant_id.to_string())) + .filter(shard_number.eq(tenant_shard_id.shard_number.0 as i32)) + .filter(shard_count.eq(tenant_shard_id.shard_count.0 as i32)) + .set(( + generation_pageserver.eq(i64::MAX), + placement_policy.eq(serde_json::to_string(&PlacementPolicy::Detached).unwrap()), + )) + .execute(conn)?; - (locked.save(), result) - }; + Ok(updated) + }) + .await?; - write.commit().await?; - Ok(result) + Ok(()) } // TODO: when we start shard splitting, we must durably mark the tenant so that @@ -246,7 +400,8 @@ impl Persistence { } /// Parts of [`crate::tenant_state::TenantState`] that are stored durably -#[derive(Serialize, Deserialize, Clone)] +#[derive(Queryable, Selectable, Insertable, Serialize, Deserialize, Clone)] +#[diesel(table_name = crate::schema::tenant_shards)] pub(crate) struct TenantShardPersistence { #[serde(default)] pub(crate) tenant_id: String, @@ -257,16 +412,28 @@ pub(crate) struct TenantShardPersistence { #[serde(default)] pub(crate) shard_stripe_size: i32, - // Currently attached pageserver - #[serde(rename = 
"pageserver")] - pub(crate) generation_pageserver: Option, - // Latest generation number: next time we attach, increment this // and use the incremented number when attaching - pub(crate) generation: u32, + pub(crate) generation: i32, + + // Currently attached pageserver + #[serde(rename = "pageserver")] + pub(crate) generation_pageserver: i64, #[serde(default)] pub(crate) placement_policy: String, #[serde(default)] pub(crate) config: String, } + +/// Parts of [`crate::node::Node`] that are stored durably +#[derive(Serialize, Deserialize, Queryable, Selectable, Insertable)] +#[diesel(table_name = crate::schema::nodes)] +pub(crate) struct NodePersistence { + pub(crate) node_id: i64, + pub(crate) scheduling_policy: String, + pub(crate) listen_http_addr: String, + pub(crate) listen_http_port: i32, + pub(crate) listen_pg_addr: String, + pub(crate) listen_pg_port: i32, +} diff --git a/control_plane/attachment_service/src/reconciler.rs b/control_plane/attachment_service/src/reconciler.rs index b08339b3b4..d7f4c0406a 100644 --- a/control_plane/attachment_service/src/reconciler.rs +++ b/control_plane/attachment_service/src/reconciler.rs @@ -296,7 +296,7 @@ impl Reconciler { // Increment generation before attaching to new pageserver self.generation = self .persistence - .increment_generation(self.tenant_shard_id, Some(dest_ps_id)) + .increment_generation(self.tenant_shard_id, dest_ps_id) .await?; let dest_conf = build_location_config( @@ -395,7 +395,7 @@ impl Reconciler { // as locations with unknown (None) observed state. 
self.generation = self .persistence - .increment_generation(self.tenant_shard_id, Some(node_id)) + .increment_generation(self.tenant_shard_id, node_id) .await?; wanted_conf.generation = self.generation.into(); tracing::info!("Observed configuration requires update."); diff --git a/control_plane/attachment_service/src/schema.rs b/control_plane/attachment_service/src/schema.rs new file mode 100644 index 0000000000..de80fc8f64 --- /dev/null +++ b/control_plane/attachment_service/src/schema.rs @@ -0,0 +1,27 @@ +// @generated automatically by Diesel CLI. + +diesel::table! { + nodes (node_id) { + node_id -> Int8, + scheduling_policy -> Varchar, + listen_http_addr -> Varchar, + listen_http_port -> Int4, + listen_pg_addr -> Varchar, + listen_pg_port -> Int4, + } +} + +diesel::table! { + tenant_shards (tenant_id, shard_number, shard_count) { + tenant_id -> Varchar, + shard_number -> Int4, + shard_count -> Int4, + shard_stripe_size -> Int4, + generation -> Int4, + generation_pageserver -> Int8, + placement_policy -> Varchar, + config -> Text, + } +} + +diesel::allow_tables_to_appear_in_same_query!(nodes, tenant_shards,); diff --git a/control_plane/attachment_service/src/service.rs b/control_plane/attachment_service/src/service.rs index 5999d48fd9..ec56dc8ad4 100644 --- a/control_plane/attachment_service/src/service.rs +++ b/control_plane/attachment_service/src/service.rs @@ -11,6 +11,7 @@ use control_plane::attachment_service::{ TenantCreateResponseShard, TenantLocateResponse, TenantLocateResponseShard, TenantShardMigrateRequest, TenantShardMigrateResponse, }; +use diesel::result::DatabaseErrorKind; use hyper::StatusCode; use pageserver_api::{ control_api::{ @@ -26,6 +27,7 @@ use pageserver_api::{ }; use pageserver_client::mgmt_api; use utils::{ + completion::Barrier, generation::Generation, http::error::ApiError, id::{NodeId, TenantId}, @@ -35,7 +37,7 @@ use utils::{ use crate::{ compute_hook::ComputeHook, node::Node, - persistence::{Persistence, TenantShardPersistence}, + 
persistence::{DatabaseError, Persistence, TenantShardPersistence}, scheduler::Scheduler, tenant_state::{ IntentState, ObservedState, ObservedStateLocation, ReconcileResult, ReconcileWaitError, @@ -46,6 +48,10 @@ use crate::{ const RECONCILE_TIMEOUT: Duration = Duration::from_secs(30); +/// How long [`Service::startup_reconcile`] is allowed to take before it should give +/// up on unresponsive pageservers and proceed. +pub(crate) const STARTUP_RECONCILE_TIMEOUT: Duration = Duration::from_secs(30); + // Top level state available to all HTTP handlers struct ServiceState { tenants: BTreeMap, @@ -79,10 +85,27 @@ pub struct Config { pub jwt_token: Option, } +impl From for ApiError { + fn from(err: DatabaseError) -> ApiError { + match err { + DatabaseError::Query(e) => ApiError::InternalServerError(e.into()), + // FIXME: ApiError doesn't have an Unavailable variant, but ShuttingDown maps to 503. + DatabaseError::Connection(_e) => ApiError::ShuttingDown, + DatabaseError::Logical(reason) => { + ApiError::InternalServerError(anyhow::anyhow!(reason)) + } + } + } +} + pub struct Service { inner: Arc>, config: Config, persistence: Arc, + + /// This waits for initial reconciliation with pageservers to complete. Until this barrier + /// passes, it isn't safe to do any actions that mutate tenants. 
+ pub(crate) startup_complete: Barrier, } impl From for ApiError { @@ -96,77 +119,32 @@ impl From for ApiError { } impl Service { - pub async fn spawn(config: Config, persistence: Arc) -> anyhow::Result> { - let (result_tx, mut result_rx) = tokio::sync::mpsc::unbounded_channel(); - - tracing::info!("Loading nodes from database..."); - let mut nodes = persistence.list_nodes().await?; - tracing::info!("Loaded {} nodes from database.", nodes.len()); - - tracing::info!("Loading shards from database..."); - let tenant_shard_persistence = persistence.list_tenant_shards().await?; - tracing::info!( - "Loaded {} shards from database.", - tenant_shard_persistence.len() - ); - - let mut tenants = BTreeMap::new(); - - for tsp in tenant_shard_persistence { - let tenant_shard_id = TenantShardId { - tenant_id: TenantId::from_str(tsp.tenant_id.as_str())?, - shard_number: ShardNumber(tsp.shard_number as u8), - shard_count: ShardCount(tsp.shard_count as u8), - }; - let shard_identity = if tsp.shard_count == 0 { - ShardIdentity::unsharded() - } else { - ShardIdentity::new( - ShardNumber(tsp.shard_number as u8), - ShardCount(tsp.shard_count as u8), - ShardStripeSize(tsp.shard_stripe_size as u32), - )? - }; - let new_tenant = TenantState { - tenant_shard_id, - shard: shard_identity, - sequence: Sequence::initial(), - // Note that we load generation, but don't care about generation_pageserver. We will either end up finding - // our existing attached location and it will match generation_pageserver, or we will attach somewhere new - // and update generation_pageserver in the process. 
- generation: Generation::new(tsp.generation), - policy: serde_json::from_str(&tsp.placement_policy).unwrap(), - intent: IntentState::new(), - observed: ObservedState::new(), - config: serde_json::from_str(&tsp.config).unwrap(), - reconciler: None, - waiter: Arc::new(SeqWait::new(Sequence::initial())), - error_waiter: Arc::new(SeqWait::new(Sequence::initial())), - last_error: Arc::default(), - }; - - tenants.insert(tenant_shard_id, new_tenant); - } + pub fn get_config(&self) -> &Config { + &self.config + } + /// TODO: don't allow other API calls until this is done, don't start doing any background housekeeping + /// until this is done. + async fn startup_reconcile(&self) { // For all tenant shards, a vector of observed states on nodes (where None means // indeterminate, same as in [`ObservedStateLocation`]) let mut observed = HashMap::new(); + let nodes = { + let locked = self.inner.read().unwrap(); + locked.nodes.clone() + }; + // TODO: issue these requests concurrently - for node in &mut nodes { - let client = mgmt_api::Client::new(node.base_url(), config.jwt_token.as_deref()); + for node in nodes.values() { + let client = mgmt_api::Client::new(node.base_url(), self.config.jwt_token.as_deref()); tracing::info!("Scanning shards on node {}...", node.id); match client.list_location_config().await { Err(e) => { tracing::warn!("Could not contact pageserver {} ({e})", node.id); - // TODO: be more tolerant, apply a generous 5-10 second timeout - // TODO: setting a node to Offline is a dramatic thing to do, and can - // prevent neon_local from starting up (it starts this service before - // any pageservers are running). It may make sense to give nodes - // a Pending state to accomodate this situation, and allow (but deprioritize) - // scheduling on Pending nodes. 
- //node.availability = NodeAvailability::Offline; + // TODO: be more tolerant, apply a generous 5-10 second timeout with retries, in case + // pageserver is being restarted at the same time as we are } Ok(listing) => { tracing::info!( @@ -174,7 +152,6 @@ impl Service { listing.tenant_shards.len(), node.id ); - node.availability = NodeAvailability::Active; for (tenant_shard_id, conf_opt) in listing.tenant_shards { observed.insert(tenant_shard_id, (node.id, conf_opt)); @@ -186,41 +163,46 @@ impl Service { let mut cleanup = Vec::new(); // Populate intent and observed states for all tenants, based on reported state on pageservers - for (tenant_shard_id, (node_id, observed_loc)) in observed { - let Some(tenant_state) = tenants.get_mut(&tenant_shard_id) else { - cleanup.push((tenant_shard_id, node_id)); - continue; - }; + let shard_count = { + let mut locked = self.inner.write().unwrap(); + for (tenant_shard_id, (node_id, observed_loc)) in observed { + let Some(tenant_state) = locked.tenants.get_mut(&tenant_shard_id) else { + cleanup.push((tenant_shard_id, node_id)); + continue; + }; - tenant_state - .observed - .locations - .insert(node_id, ObservedStateLocation { conf: observed_loc }); - } - - // State of nodes is now frozen, transform to a HashMap. - let mut nodes: HashMap = nodes.into_iter().map(|n| (n.id, n)).collect(); - - // Populate each tenant's intent state - let mut scheduler = Scheduler::new(&tenants, &nodes); - for (tenant_shard_id, tenant_state) in tenants.iter_mut() { - tenant_state.intent_from_observed(); - if let Err(e) = tenant_state.schedule(&mut scheduler) { - // Non-fatal error: we are unable to properly schedule the tenant, perhaps because - // not enough pageservers are available. The tenant may well still be available - // to clients. 
- tracing::error!("Failed to schedule tenant {tenant_shard_id} at startup: {e}"); + tenant_state + .observed + .locations + .insert(node_id, ObservedStateLocation { conf: observed_loc }); } - } + + // Populate each tenant's intent state + let mut scheduler = Scheduler::new(&locked.tenants, &nodes); + for (tenant_shard_id, tenant_state) in locked.tenants.iter_mut() { + tenant_state.intent_from_observed(); + if let Err(e) = tenant_state.schedule(&mut scheduler) { + // Non-fatal error: we are unable to properly schedule the tenant, perhaps because + // not enough pageservers are available. The tenant may well still be available + // to clients. + tracing::error!("Failed to schedule tenant {tenant_shard_id} at startup: {e}"); + } + } + + locked.tenants.len() + }; + + // TODO: if any tenant's intent now differs from its loaded generation_pageserver, we should clear that + // generation_pageserver in the database. // Clean up any tenants that were found on pageservers but are not known to us. for (tenant_shard_id, node_id) in cleanup { // A node reported a tenant_shard_id which is unknown to us: detach it. let node = nodes - .get_mut(&node_id) + .get(&node_id) .expect("Always exists: only known nodes are scanned"); - let client = mgmt_api::Client::new(node.base_url(), config.jwt_token.as_deref()); + let client = mgmt_api::Client::new(node.base_url(), self.config.jwt_token.as_deref()); match client .location_config( tenant_shard_id, @@ -252,13 +234,80 @@ impl Service { } } - let shard_count = tenants.len(); + // Finally, now that the service is up and running, launch reconcile operations for any tenants + // which require it: under normal circumstances this should only include tenants that were in some + // transient state before we restarted. 
+ let reconcile_tasks = self.reconcile_all(); + tracing::info!("Startup complete, spawned {reconcile_tasks} reconciliation tasks ({shard_count} shards total)"); + } + + pub async fn spawn(config: Config, persistence: Arc) -> anyhow::Result> { + let (result_tx, mut result_rx) = tokio::sync::mpsc::unbounded_channel(); + + tracing::info!("Loading nodes from database..."); + let nodes = persistence.list_nodes().await?; + let nodes: HashMap = nodes.into_iter().map(|n| (n.id, n)).collect(); + tracing::info!("Loaded {} nodes from database.", nodes.len()); + + tracing::info!("Loading shards from database..."); + let tenant_shard_persistence = persistence.list_tenant_shards().await?; + tracing::info!( + "Loaded {} shards from database.", + tenant_shard_persistence.len() + ); + + let mut tenants = BTreeMap::new(); + + for tsp in tenant_shard_persistence { + let tenant_shard_id = TenantShardId { + tenant_id: TenantId::from_str(tsp.tenant_id.as_str())?, + shard_number: ShardNumber(tsp.shard_number as u8), + shard_count: ShardCount(tsp.shard_count as u8), + }; + let shard_identity = if tsp.shard_count == 0 { + ShardIdentity::unsharded() + } else { + ShardIdentity::new( + ShardNumber(tsp.shard_number as u8), + ShardCount(tsp.shard_count as u8), + ShardStripeSize(tsp.shard_stripe_size as u32), + )? + }; + + // We will populate intent properly later in [`Self::startup_reconcile`], initially populate + // it with what we can infer: the node for which a generation was most recently issued. 
+ let mut intent = IntentState::new(); + if tsp.generation_pageserver != i64::MAX { + intent.attached = Some(NodeId(tsp.generation_pageserver as u64)) + } + + let new_tenant = TenantState { + tenant_shard_id, + shard: shard_identity, + sequence: Sequence::initial(), + generation: Generation::new(tsp.generation as u32), + policy: serde_json::from_str(&tsp.placement_policy).unwrap(), + intent, + observed: ObservedState::new(), + config: serde_json::from_str(&tsp.config).unwrap(), + reconciler: None, + waiter: Arc::new(SeqWait::new(Sequence::initial())), + error_waiter: Arc::new(SeqWait::new(Sequence::initial())), + last_error: Arc::default(), + }; + + tenants.insert(tenant_shard_id, new_tenant); + } + + let (startup_completion, startup_complete) = utils::completion::channel(); + let this = Arc::new(Self { inner: Arc::new(std::sync::RwLock::new(ServiceState::new( result_tx, nodes, tenants, ))), config, persistence, + startup_complete, }); let result_task_this = this.clone(); @@ -316,11 +365,13 @@ impl Service { } }); - // Finally, now that the service is up and running, launch reconcile operations for any tenants - // which require it: under normal circumstances this should only include tenants that were in some - // transient state before we restarted. 
- let reconcile_tasks = this.reconcile_all(); - tracing::info!("Startup complete, spawned {reconcile_tasks} reconciliation tasks ({shard_count} shards total)"); + let startup_reconcile_this = this.clone(); + tokio::task::spawn(async move { + // Block the [`Service::startup_complete`] barrier until we're done + let _completion = startup_completion; + + startup_reconcile_this.startup_reconcile().await + }); Ok(this) } @@ -336,7 +387,6 @@ impl Service { let locked = self.inner.write().unwrap(); !locked.tenants.contains_key(&attach_req.tenant_shard_id) }; - if insert { let tsp = TenantShardPersistence { tenant_id: attach_req.tenant_shard_id.tenant_id.to_string(), @@ -344,31 +394,49 @@ impl Service { shard_count: attach_req.tenant_shard_id.shard_count.0 as i32, shard_stripe_size: 0, generation: 0, - generation_pageserver: None, + generation_pageserver: i64::MAX, placement_policy: serde_json::to_string(&PlacementPolicy::default()).unwrap(), config: serde_json::to_string(&TenantConfig::default()).unwrap(), }; - self.persistence.insert_tenant_shards(vec![tsp]).await?; + match self.persistence.insert_tenant_shards(vec![tsp]).await { + Err(e) => match e { + DatabaseError::Query(diesel::result::Error::DatabaseError( + DatabaseErrorKind::UniqueViolation, + _, + )) => { + tracing::info!( + "Raced with another request to insert tenant {}", + attach_req.tenant_shard_id + ) + } + _ => return Err(e.into()), + }, + Ok(()) => { + tracing::info!("Inserted shard {} in database", attach_req.tenant_shard_id); - let mut locked = self.inner.write().unwrap(); - locked.tenants.insert( - attach_req.tenant_shard_id, - TenantState::new( - attach_req.tenant_shard_id, - ShardIdentity::unsharded(), - PlacementPolicy::Single, - ), - ); + let mut locked = self.inner.write().unwrap(); + locked.tenants.insert( + attach_req.tenant_shard_id, + TenantState::new( + attach_req.tenant_shard_id, + ShardIdentity::unsharded(), + PlacementPolicy::Single, + ), + ); + tracing::info!("Inserted shard {} in memory", 
attach_req.tenant_shard_id); + } + } } - let new_generation = if attach_req.node_id.is_some() { + let new_generation = if let Some(req_node_id) = attach_req.node_id { Some( self.persistence - .increment_generation(attach_req.tenant_shard_id, attach_req.node_id) + .increment_generation(attach_req.tenant_shard_id, req_node_id) .await?, ) } else { + self.persistence.detach(attach_req.tenant_shard_id).await?; None }; @@ -380,6 +448,11 @@ impl Service { if let Some(new_generation) = new_generation { tenant_state.generation = new_generation; + } else { + // This is a detach notification. We must update placement policy to avoid re-attaching + // during background scheduling/reconciliation, or during attachment service restart. + assert!(attach_req.node_id.is_none()); + tenant_state.policy = PlacementPolicy::Detached; } if let Some(attaching_pageserver) = attach_req.node_id.as_ref() { @@ -407,6 +480,7 @@ impl Service { "attach_hook: tenant {} set generation {:?}, pageserver {}", attach_req.tenant_shard_id, tenant_state.generation, + // TODO: this is an odd number of 0xf's attach_req.node_id.unwrap_or(utils::id::NodeId(0xfffffff)) ); @@ -499,6 +573,14 @@ impl Service { id: req_tenant.id, valid, }); + } else { + // After tenant deletion, we may approve any validation. This avoids + // spurious warnings on the pageserver if it has pending LSN updates + // at the point a deletion happens. 
+ response.tenants.push(ValidateResponseTenant { + id: req_tenant.id, + valid: true, + }); } } response @@ -554,7 +636,7 @@ impl Service { shard_count: tenant_shard_id.shard_count.0 as i32, shard_stripe_size: create_req.shard_parameters.stripe_size.0 as i32, generation: 0, - generation_pageserver: None, + generation_pageserver: i64::MAX, placement_policy: serde_json::to_string(&placement_policy).unwrap(), config: serde_json::to_string(&create_req.config).unwrap(), }) @@ -868,7 +950,6 @@ impl Service { } else { let old_attached = shard.intent.attached; - shard.intent.attached = Some(migrate_req.node_id); match shard.policy { PlacementPolicy::Single => { shard.intent.secondary.clear(); @@ -882,7 +963,13 @@ impl Service { shard.intent.secondary.push(old_attached); } } + PlacementPolicy::Detached => { + return Err(ApiError::BadRequest(anyhow::anyhow!( + "Cannot migrate a tenant that is PlacementPolicy::Detached: configure it to an attached policy first" + ))) + } } + shard.intent.attached = Some(migrate_req.node_id); tracing::info!("Migrating: new intent {:?}", shard.intent); shard.sequence = shard.sequence.next(); @@ -955,10 +1042,7 @@ impl Service { availability: NodeAvailability::Active, }; // TODO: idempotency if the node already exists in the database - self.persistence - .insert_node(&new_node) - .await - .map_err(ApiError::InternalServerError)?; + self.persistence.insert_node(&new_node).await?; let mut locked = self.inner.write().unwrap(); let mut new_nodes = (*locked.nodes).clone(); diff --git a/control_plane/attachment_service/src/tenant_state.rs b/control_plane/attachment_service/src/tenant_state.rs index a907628eff..5290197d84 100644 --- a/control_plane/attachment_service/src/tenant_state.rs +++ b/control_plane/attachment_service/src/tenant_state.rs @@ -312,6 +312,18 @@ impl TenantState { modified = true; } } + Detached => { + // Should have no attached or secondary pageservers + if self.intent.attached.is_some() { + self.intent.attached = None; + modified = 
true; + } + + if !self.intent.secondary.is_empty() { + self.intent.secondary.clear(); + modified = true; + } + } } if modified { diff --git a/control_plane/src/attachment_service.rs b/control_plane/src/attachment_service.rs index 0a353d8b12..6602aa9a73 100644 --- a/control_plane/src/attachment_service.rs +++ b/control_plane/src/attachment_service.rs @@ -1,5 +1,11 @@ use crate::{background_process, local_env::LocalEnv}; -use camino::Utf8PathBuf; +use camino::{Utf8Path, Utf8PathBuf}; +use diesel::{ + backend::Backend, + query_builder::{AstPass, QueryFragment, QueryId}, + Connection, PgConnection, QueryResult, RunQueryDsl, +}; +use diesel_migrations::{HarnessWithOutput, MigrationHarness}; use hyper::Method; use pageserver_api::{ models::{ShardParameters, TenantCreateRequest, TimelineCreateRequest, TimelineInfo}, @@ -7,9 +13,9 @@ use pageserver_api::{ }; use pageserver_client::mgmt_api::ResponseErrorMessageExt; use postgres_backend::AuthType; -use postgres_connection::parse_host_port; use serde::{de::DeserializeOwned, Deserialize, Serialize}; -use std::{path::PathBuf, process::Child, str::FromStr}; +use std::{env, str::FromStr}; +use tokio::process::Command; use tracing::instrument; use utils::{ auth::{Claims, Scope}, @@ -19,14 +25,17 @@ use utils::{ pub struct AttachmentService { env: LocalEnv, listen: String, - path: PathBuf, + path: Utf8PathBuf, jwt_token: Option, public_key_path: Option, + postgres_port: u16, client: reqwest::Client, } const COMMAND: &str = "attachment_service"; +const ATTACHMENT_SERVICE_POSTGRES_VERSION: u32 = 16; + #[derive(Serialize, Deserialize)] pub struct AttachHookRequest { pub tenant_shard_id: TenantShardId, @@ -169,7 +178,9 @@ pub struct TenantShardMigrateResponse {} impl AttachmentService { pub fn from_env(env: &LocalEnv) -> Self { - let path = env.base_data_dir.join("attachments.json"); + let path = Utf8PathBuf::from_path_buf(env.base_data_dir.clone()) + .unwrap() + .join("attachments.json"); // Makes no sense to construct this if 
pageservers aren't going to use it: assume // pageservers have control plane API set @@ -181,6 +192,13 @@ impl AttachmentService { listen_url.port().unwrap() ); + // Convention: NeonEnv in python tests reserves the next port after the control_plane_api + // port, for use by our captive postgres. + let postgres_port = listen_url + .port() + .expect("Control plane API setting should always have a port") + + 1; + // Assume all pageservers have symmetric auth configuration: this service // expects to use one JWT token to talk to all of them. let ps_conf = env @@ -209,6 +227,7 @@ impl AttachmentService { listen, jwt_token, public_key_path, + postgres_port, client: reqwest::ClientBuilder::new() .build() .expect("Failed to construct http client"), @@ -220,13 +239,214 @@ impl AttachmentService { .expect("non-Unicode path") } - pub async fn start(&self) -> anyhow::Result { - let path_str = self.path.to_string_lossy(); + /// PIDFile for the postgres instance used to store attachment service state + fn postgres_pid_file(&self) -> Utf8PathBuf { + Utf8PathBuf::from_path_buf( + self.env + .base_data_dir + .join("attachment_service_postgres.pid"), + ) + .expect("non-Unicode path") + } - let mut args = vec!["-l", &self.listen, "-p", &path_str] - .into_iter() - .map(|s| s.to_string()) - .collect::>(); + /// In order to access database migrations, we need to find the Neon source tree + async fn find_source_root(&self) -> anyhow::Result { + // We assume that either pwd or our binary is in the source tree. The former is usually + // true for automated test runners, the latter is usually true for developer workstations. Often + // both are true, which is fine. 
+ let candidate_start_points = [ + // Current working directory + Utf8PathBuf::from_path_buf(std::env::current_dir()?).unwrap(), + // Directory containing the binary we're running inside + Utf8PathBuf::from_path_buf(env::current_exe()?.parent().unwrap().to_owned()).unwrap(), + ]; + + // For each candidate start point, search through ancestors looking for a neon.git source tree root + for start_point in &candidate_start_points { + // Start from the build dir: assumes we are running out of a built neon source tree + for path in start_point.ancestors() { + // A crude approximation: the root of the source tree is whatever contains a "control_plane" + // subdirectory. + let control_plane = path.join("control_plane"); + if tokio::fs::try_exists(&control_plane).await? { + return Ok(path.to_owned()); + } + } + } + + // Fall-through + Err(anyhow::anyhow!( + "Could not find control_plane src dir, after searching ancestors of {candidate_start_points:?}" + )) + } + + /// Find the directory containing postgres binaries, such as `initdb` and `pg_ctl` + /// + /// This usually uses ATTACHMENT_SERVICE_POSTGRES_VERSION of postgres, but will fall back + /// to other versions if that one isn't found. Some automated tests create circumstances + /// where only one version is available in pg_distrib_dir, such as `test_remote_extensions`. + pub async fn get_pg_bin_dir(&self) -> anyhow::Result { + let prefer_versions = [ATTACHMENT_SERVICE_POSTGRES_VERSION, 15, 14]; + + for v in prefer_versions { + let path = Utf8PathBuf::from_path_buf(self.env.pg_bin_dir(v)?).unwrap(); + if tokio::fs::try_exists(&path).await? 
{ + return Ok(path); + } + } + + // Fall through + anyhow::bail!( + "Postgres binaries not found in {}", + self.env.pg_distrib_dir.display() + ); + } + + /// Readiness check for our postgres process + async fn pg_isready(&self, pg_bin_dir: &Utf8Path) -> anyhow::Result { + let bin_path = pg_bin_dir.join("pg_isready"); + let args = ["-h", "localhost", "-p", &format!("{}", self.postgres_port)]; + let exitcode = Command::new(bin_path).args(args).spawn()?.wait().await?; + + Ok(exitcode.success()) + } + + /// Create our database if it doesn't exist, and run migrations. + /// + /// This function is equivalent to the `diesel setup` command in the diesel CLI. We implement + /// the same steps by hand to avoid imposing a dependency on installing diesel-cli for developers + /// who just want to run `cargo neon_local` without knowing about diesel. + /// + /// Returns the database url + pub async fn setup_database(&self) -> anyhow::Result { + let database_url = format!( + "postgresql://localhost:{}/attachment_service", + self.postgres_port + ); + println!("Running attachment service database setup..."); + fn change_database_of_url(database_url: &str, default_database: &str) -> (String, String) { + let base = ::url::Url::parse(database_url).unwrap(); + let database = base.path_segments().unwrap().last().unwrap().to_owned(); + let mut new_url = base.join(default_database).unwrap(); + new_url.set_query(base.query()); + (database, new_url.into()) + } + + #[derive(Debug, Clone)] + pub struct CreateDatabaseStatement { + db_name: String, + } + + impl CreateDatabaseStatement { + pub fn new(db_name: &str) -> Self { + CreateDatabaseStatement { + db_name: db_name.to_owned(), + } + } + } + + impl QueryFragment for CreateDatabaseStatement { + fn walk_ast<'b>(&'b self, mut out: AstPass<'_, 'b, DB>) -> QueryResult<()> { + out.push_sql("CREATE DATABASE "); + out.push_identifier(&self.db_name)?; + Ok(()) + } + } + + impl RunQueryDsl for CreateDatabaseStatement {} + + impl QueryId for 
CreateDatabaseStatement { + type QueryId = (); + + const HAS_STATIC_QUERY_ID: bool = false; + } + if PgConnection::establish(&database_url).is_err() { + let (database, postgres_url) = change_database_of_url(&database_url, "postgres"); + println!("Creating database: {database}"); + let mut conn = PgConnection::establish(&postgres_url)?; + CreateDatabaseStatement::new(&database).execute(&mut conn)?; + } + let mut conn = PgConnection::establish(&database_url)?; + + let migrations_dir = self + .find_source_root() + .await? + .join("control_plane/attachment_service/migrations"); + + let migrations = diesel_migrations::FileBasedMigrations::from_path(migrations_dir)?; + println!("Running migrations in {}", migrations.path().display()); + HarnessWithOutput::write_to_stdout(&mut conn) + .run_pending_migrations(migrations) + .map(|_| ()) + .map_err(|e| anyhow::anyhow!(e))?; + + println!("Migrations complete"); + + Ok(database_url) + } + + pub async fn start(&self) -> anyhow::Result<()> { + // Start a vanilla Postgres process used by the attachment service for persistence. + let pg_data_path = Utf8PathBuf::from_path_buf(self.env.base_data_dir.clone()) + .unwrap() + .join("attachment_service_db"); + let pg_bin_dir = self.get_pg_bin_dir().await?; + let pg_log_path = pg_data_path.join("postgres.log"); + + if !tokio::fs::try_exists(&pg_data_path).await? 
{ + // Initialize empty database + let initdb_path = pg_bin_dir.join("initdb"); + let mut child = Command::new(&initdb_path) + .args(["-D", pg_data_path.as_ref()]) + .spawn() + .expect("Failed to spawn initdb"); + let status = child.wait().await?; + if !status.success() { + anyhow::bail!("initdb failed with status {status}"); + } + + tokio::fs::write( + &pg_data_path.join("postgresql.conf"), + format!("port = {}", self.postgres_port), + ) + .await?; + }; + + println!("Starting attachment service database..."); + let db_start_args = [ + "-w", + "-D", + pg_data_path.as_ref(), + "-l", + pg_log_path.as_ref(), + "start", + ]; + + background_process::start_process( + "attachment_service_db", + &self.env.base_data_dir, + pg_bin_dir.join("pg_ctl").as_std_path(), + db_start_args, + [], + background_process::InitialPidFile::Create(self.postgres_pid_file()), + || self.pg_isready(&pg_bin_dir), + ) + .await?; + + // Run migrations on every startup, in case something changed. + let database_url = self.setup_database().await?; + + let mut args = vec![ + "-l", + &self.listen, + "-p", + self.path.as_ref(), + "--database-url", + &database_url, + ] + .into_iter() + .map(|s| s.to_string()) + .collect::>(); if let Some(jwt_token) = &self.jwt_token { args.push(format!("--jwt-token={jwt_token}")); } @@ -235,7 +455,7 @@ impl AttachmentService { args.push(format!("--public-key={public_key_path}")); } - let result = background_process::start_process( + background_process::start_process( COMMAND, &self.env.base_data_dir, &self.env.attachment_service_bin(), @@ -252,29 +472,46 @@ impl AttachmentService { } }, ) - .await; + .await?; - for ps_conf in &self.env.pageservers { - let (pg_host, pg_port) = - parse_host_port(&ps_conf.listen_pg_addr).expect("Unable to parse listen_pg_addr"); - let (http_host, http_port) = parse_host_port(&ps_conf.listen_http_addr) - .expect("Unable to parse listen_http_addr"); - self.node_register(NodeRegisterRequest { - node_id: ps_conf.id, - listen_pg_addr: 
pg_host.to_string(), - listen_pg_port: pg_port.unwrap_or(5432), - listen_http_addr: http_host.to_string(), - listen_http_port: http_port.unwrap_or(80), - }) + Ok(()) + } + + pub async fn stop(&self, immediate: bool) -> anyhow::Result<()> { + background_process::stop_process(immediate, COMMAND, &self.pid_file())?; + + let pg_data_path = self.env.base_data_dir.join("attachment_service_db"); + let pg_bin_dir = self.get_pg_bin_dir().await?; + + println!("Stopping attachment service database..."); + let pg_stop_args = ["-D", &pg_data_path.to_string_lossy(), "stop"]; + let stop_status = Command::new(pg_bin_dir.join("pg_ctl")) + .args(pg_stop_args) + .spawn()? + .wait() .await?; + if !stop_status.success() { + let pg_status_args = ["-D", &pg_data_path.to_string_lossy(), "status"]; + let status_exitcode = Command::new(pg_bin_dir.join("pg_ctl")) + .args(pg_status_args) + .spawn()? + .wait() + .await?; + + // pg_ctl status returns this exit code if postgres is not running: in this case it is + // fine that stop failed. Otherwise it is an error that stop failed. 
+ const PG_STATUS_NOT_RUNNING: i32 = 3; + if Some(PG_STATUS_NOT_RUNNING) == status_exitcode.code() { + println!("Attachment service data base is already stopped"); + return Ok(()); + } else { + anyhow::bail!("Failed to stop attachment service database: {stop_status}") + } } - result + Ok(()) } - pub fn stop(&self, immediate: bool) -> anyhow::Result<()> { - background_process::stop_process(immediate, COMMAND, &self.pid_file()) - } /// Simple HTTP request wrapper for calling into attachment service async fn dispatch( &self, @@ -356,7 +593,7 @@ impl AttachmentService { &self, req: TenantCreateRequest, ) -> anyhow::Result { - self.dispatch(Method::POST, "tenant".to_string(), Some(req)) + self.dispatch(Method::POST, "v1/tenant".to_string(), Some(req)) .await } @@ -413,7 +650,7 @@ impl AttachmentService { ) -> anyhow::Result { self.dispatch( Method::POST, - format!("tenant/{tenant_id}/timeline"), + format!("v1/tenant/{tenant_id}/timeline"), Some(req), ) .await diff --git a/control_plane/src/background_process.rs b/control_plane/src/background_process.rs index 20fa3af9b8..3ffb8734d0 100644 --- a/control_plane/src/background_process.rs +++ b/control_plane/src/background_process.rs @@ -17,7 +17,7 @@ use std::io::Write; use std::os::unix::prelude::AsRawFd; use std::os::unix::process::CommandExt; use std::path::Path; -use std::process::{Child, Command}; +use std::process::Command; use std::time::Duration; use std::{fs, io, thread}; @@ -60,7 +60,7 @@ pub async fn start_process( envs: EI, initial_pid_file: InitialPidFile, process_status_check: F, -) -> anyhow::Result +) -> anyhow::Result<()> where F: Fn() -> Fut, Fut: std::future::Future>, @@ -98,7 +98,7 @@ where InitialPidFile::Expect(path) => path, }; - let mut spawned_process = filled_cmd.spawn().with_context(|| { + let spawned_process = filled_cmd.spawn().with_context(|| { format!("Could not spawn {process_name}, see console output and log files for details.") })?; let pid = spawned_process.id(); @@ -106,12 +106,26 @@ where 
i32::try_from(pid) .with_context(|| format!("Subprocess {process_name} has invalid pid {pid}"))?, ); + // set up a scopeguard to kill & wait for the child in case we panic or bail below + let spawned_process = scopeguard::guard(spawned_process, |mut spawned_process| { + println!("SIGKILL & wait the started process"); + (|| { + // TODO: use another signal that can be caught by the child so it can clean up any children it spawned (e.g., walredo). + spawned_process.kill().context("SIGKILL child")?; + spawned_process.wait().context("wait() for child process")?; + anyhow::Ok(()) + })() + .with_context(|| format!("scopeguard kill&wait child {process_name:?}")) + .unwrap(); + }); for retries in 0..RETRIES { match process_started(pid, pid_file_to_check, &process_status_check).await { Ok(true) => { - println!("\n{process_name} started, pid: {pid}"); - return Ok(spawned_process); + println!("\n{process_name} started and passed status check, pid: {pid}"); + // leak the child process, it'll outlive this neon_local invocation + drop(scopeguard::ScopeGuard::into_inner(spawned_process)); + return Ok(()); } Ok(false) => { if retries == NOTICE_AFTER_RETRIES { @@ -126,16 +140,15 @@ where thread::sleep(Duration::from_millis(RETRY_INTERVAL_MILLIS)); } Err(e) => { - println!("{process_name} failed to start: {e:#}"); - if let Err(e) = spawned_process.kill() { - println!("Could not stop {process_name} subprocess: {e:#}") - }; + println!("error starting process {process_name:?}: {e:#}"); return Err(e); } } } println!(); - anyhow::bail!("{process_name} did not start in {RETRY_UNTIL_SECS} seconds"); + anyhow::bail!( + "{process_name} did not start+pass status checks within {RETRY_UNTIL_SECS} seconds" + ); } /// Stops the process, using the pid file given. Returns Ok also if the process is already not running. 
diff --git a/control_plane/src/bin/neon_local.rs b/control_plane/src/bin/neon_local.rs index 279c47398f..a5242e3dc7 100644 --- a/control_plane/src/bin/neon_local.rs +++ b/control_plane/src/bin/neon_local.rs @@ -135,7 +135,7 @@ fn main() -> Result<()> { "tenant" => rt.block_on(handle_tenant(sub_args, &mut env)), "timeline" => rt.block_on(handle_timeline(sub_args, &mut env)), "start" => rt.block_on(handle_start_all(sub_args, &env)), - "stop" => handle_stop_all(sub_args, &env), + "stop" => rt.block_on(handle_stop_all(sub_args, &env)), "pageserver" => rt.block_on(handle_pageserver(sub_args, &env)), "attachment_service" => rt.block_on(handle_attachment_service(sub_args, &env)), "safekeeper" => rt.block_on(handle_safekeeper(sub_args, &env)), @@ -1056,8 +1056,9 @@ fn get_pageserver(env: &local_env::LocalEnv, args: &ArgMatches) -> Result Result<()> { match sub_match.subcommand() { Some(("start", subcommand_args)) => { + let register = subcommand_args.get_one::("register").unwrap_or(&true); if let Err(e) = get_pageserver(env, subcommand_args)? - .start(&pageserver_config_overrides(subcommand_args)) + .start(&pageserver_config_overrides(subcommand_args), *register) .await { eprintln!("pageserver start failed: {e}"); @@ -1086,24 +1087,7 @@ async fn handle_pageserver(sub_match: &ArgMatches, env: &local_env::LocalEnv) -> } if let Err(e) = pageserver - .start(&pageserver_config_overrides(subcommand_args)) - .await - { - eprintln!("pageserver start failed: {e}"); - exit(1); - } - } - - Some(("migrate", subcommand_args)) => { - let pageserver = get_pageserver(env, subcommand_args)?; - //TODO what shutdown strategy should we use here? 
- if let Err(e) = pageserver.stop(false) { - eprintln!("pageserver stop failed: {}", e); - exit(1); - } - - if let Err(e) = pageserver - .start(&pageserver_config_overrides(subcommand_args)) + .start(&pageserver_config_overrides(subcommand_args), false) .await { eprintln!("pageserver start failed: {e}"); @@ -1161,7 +1145,7 @@ async fn handle_attachment_service( .map(|s| s.as_str()) == Some("immediate"); - if let Err(e) = svc.stop(immediate) { + if let Err(e) = svc.stop(immediate).await { eprintln!("stop failed: {}", e); exit(1); } @@ -1257,7 +1241,7 @@ async fn handle_start_all(sub_match: &ArgMatches, env: &local_env::LocalEnv) -> let attachment_service = AttachmentService::from_env(env); if let Err(e) = attachment_service.start().await { eprintln!("attachment_service start failed: {:#}", e); - try_stop_all(env, true); + try_stop_all(env, true).await; exit(1); } } @@ -1265,11 +1249,11 @@ async fn handle_start_all(sub_match: &ArgMatches, env: &local_env::LocalEnv) -> for ps_conf in &env.pageservers { let pageserver = PageServerNode::from_env(env, ps_conf); if let Err(e) = pageserver - .start(&pageserver_config_overrides(sub_match)) + .start(&pageserver_config_overrides(sub_match), true) .await { eprintln!("pageserver {} start failed: {:#}", ps_conf.id, e); - try_stop_all(env, true); + try_stop_all(env, true).await; exit(1); } } @@ -1278,23 +1262,23 @@ async fn handle_start_all(sub_match: &ArgMatches, env: &local_env::LocalEnv) -> let safekeeper = SafekeeperNode::from_env(env, node); if let Err(e) = safekeeper.start(vec![]).await { eprintln!("safekeeper {} start failed: {:#}", safekeeper.id, e); - try_stop_all(env, false); + try_stop_all(env, false).await; exit(1); } } Ok(()) } -fn handle_stop_all(sub_match: &ArgMatches, env: &local_env::LocalEnv) -> Result<()> { +async fn handle_stop_all(sub_match: &ArgMatches, env: &local_env::LocalEnv) -> Result<()> { let immediate = sub_match.get_one::("stop-mode").map(|s| s.as_str()) == Some("immediate"); - try_stop_all(env, 
immediate); + try_stop_all(env, immediate).await; Ok(()) } -fn try_stop_all(env: &local_env::LocalEnv, immediate: bool) { +async fn try_stop_all(env: &local_env::LocalEnv, immediate: bool) { // Stop all endpoints match ComputeControlPlane::load(env.clone()) { Ok(cplane) => { @@ -1329,7 +1313,7 @@ fn try_stop_all(env: &local_env::LocalEnv, immediate: bool) { if env.control_plane_api.is_some() { let attachment_service = AttachmentService::from_env(env); - if let Err(e) = attachment_service.stop(immediate) { + if let Err(e) = attachment_service.stop(immediate).await { eprintln!("attachment service stop failed: {e:#}"); } } @@ -1549,7 +1533,11 @@ fn cli() -> Command { .subcommand(Command::new("status")) .subcommand(Command::new("start") .about("Start local pageserver") - .arg(pageserver_config_args.clone()) + .arg(pageserver_config_args.clone()).arg(Arg::new("register") + .long("register") + .default_value("true").required(false) + .value_parser(value_parser!(bool)) + .value_name("register")) ) .subcommand(Command::new("stop") .about("Stop local pageserver") diff --git a/control_plane/src/endpoint.rs b/control_plane/src/endpoint.rs index 43f8ea3b43..dcad22b992 100644 --- a/control_plane/src/endpoint.rs +++ b/control_plane/src/endpoint.rs @@ -57,7 +57,7 @@ use crate::local_env::LocalEnv; use crate::postgresql_conf::PostgresConf; use compute_api::responses::{ComputeState, ComputeStatus}; -use compute_api::spec::{Cluster, ComputeMode, ComputeSpec}; +use compute_api::spec::{Cluster, ComputeFeature, ComputeMode, ComputeSpec}; // contents of a endpoint.json file #[derive(Serialize, Deserialize, PartialEq, Eq, Clone, Debug)] @@ -70,6 +70,7 @@ pub struct EndpointConf { http_port: u16, pg_version: u32, skip_pg_catalog_updates: bool, + features: Vec, } // @@ -140,6 +141,7 @@ impl ComputeControlPlane { // with this we basically test a case of waking up an idle compute, where // we also skip catalog updates in the cloud. 
skip_pg_catalog_updates: true, + features: vec![], }); ep.create_endpoint_dir()?; @@ -154,6 +156,7 @@ impl ComputeControlPlane { pg_port, pg_version, skip_pg_catalog_updates: true, + features: vec![], })?, )?; std::fs::write( @@ -215,6 +218,9 @@ pub struct Endpoint { // Optimizations skip_pg_catalog_updates: bool, + + // Feature flags + features: Vec, } impl Endpoint { @@ -244,6 +250,7 @@ impl Endpoint { tenant_id: conf.tenant_id, pg_version: conf.pg_version, skip_pg_catalog_updates: conf.skip_pg_catalog_updates, + features: conf.features, }) } @@ -431,7 +438,7 @@ impl Endpoint { } fn wait_for_compute_ctl_to_exit(&self, send_sigterm: bool) -> Result<()> { - // TODO use background_process::stop_process instead + // TODO use background_process::stop_process instead: https://github.com/neondatabase/neon/pull/6482 let pidfile_path = self.endpoint_path().join("compute_ctl.pid"); let pid: u32 = std::fs::read_to_string(pidfile_path)?.parse()?; let pid = nix::unistd::Pid::from_raw(pid as i32); @@ -519,7 +526,7 @@ impl Endpoint { skip_pg_catalog_updates: self.skip_pg_catalog_updates, format_version: 1.0, operation_uuid: None, - features: vec![], + features: self.features.clone(), cluster: Cluster { cluster_id: None, // project ID: not used name: None, // project name: not used @@ -576,9 +583,21 @@ impl Endpoint { } let child = cmd.spawn()?; + // set up a scopeguard to kill & wait for the child in case we panic or bail below + let child = scopeguard::guard(child, |mut child| { + println!("SIGKILL & wait the started process"); + (|| { + // TODO: use another signal that can be caught by the child so it can clean up any children it spawned + child.kill().context("SIGKILL child")?; + child.wait().context("wait() for child process")?; + anyhow::Ok(()) + })() + .with_context(|| format!("scopeguard kill&wait child {child:?}")) + .unwrap(); + }); // Write down the pid so we can wait for it when we want to stop - // TODO use background_process::start_process instead + // TODO use 
background_process::start_process instead: https://github.com/neondatabase/neon/pull/6482 let pid = child.id(); let pidfile_path = self.endpoint_path().join("compute_ctl.pid"); std::fs::write(pidfile_path, pid.to_string())?; @@ -627,6 +646,9 @@ impl Endpoint { std::thread::sleep(ATTEMPT_INTERVAL); } + // disarm the scopeguard, let the child outlive this function (and neon_local invocation) + drop(scopeguard::ScopeGuard::into_inner(child)); + Ok(()) } diff --git a/control_plane/src/local_env.rs b/control_plane/src/local_env.rs index 4460fdd3a6..aefef47da7 100644 --- a/control_plane/src/local_env.rs +++ b/control_plane/src/local_env.rs @@ -223,7 +223,11 @@ impl LocalEnv { } pub fn attachment_service_bin(&self) -> PathBuf { - self.neon_distrib_dir.join("attachment_service") + // Irrespective of configuration, attachment service binary is always + // run from the same location as neon_local. This means that for compatibility + // tests that run old pageserver/safekeeper, they still run latest attachment service. 
+ let neon_local_bin_dir = env::current_exe().unwrap().parent().unwrap().to_owned(); + neon_local_bin_dir.join("attachment_service") } pub fn safekeeper_bin(&self) -> PathBuf { diff --git a/control_plane/src/pageserver.rs b/control_plane/src/pageserver.rs index 18ccf6bd98..540d1185a2 100644 --- a/control_plane/src/pageserver.rs +++ b/control_plane/src/pageserver.rs @@ -11,7 +11,7 @@ use std::io; use std::io::Write; use std::num::NonZeroU64; use std::path::PathBuf; -use std::process::{Child, Command}; +use std::process::Command; use std::time::Duration; use anyhow::{bail, Context}; @@ -30,6 +30,7 @@ use utils::{ lsn::Lsn, }; +use crate::attachment_service::{AttachmentService, NodeRegisterRequest}; use crate::local_env::PageServerConf; use crate::{background_process, local_env::LocalEnv}; @@ -161,8 +162,8 @@ impl PageServerNode { .expect("non-Unicode path") } - pub async fn start(&self, config_overrides: &[&str]) -> anyhow::Result { - self.start_node(config_overrides, false).await + pub async fn start(&self, config_overrides: &[&str], register: bool) -> anyhow::Result<()> { + self.start_node(config_overrides, false, register).await } fn pageserver_init(&self, config_overrides: &[&str]) -> anyhow::Result<()> { @@ -207,7 +208,8 @@ impl PageServerNode { &self, config_overrides: &[&str], update_config: bool, - ) -> anyhow::Result { + register: bool, + ) -> anyhow::Result<()> { // TODO: using a thread here because start_process() is not async but we need to call check_status() let datadir = self.repo_path(); print!( @@ -244,7 +246,26 @@ impl PageServerNode { } }, ) - .await + .await?; + + if register { + let attachment_service = AttachmentService::from_env(&self.env); + let (pg_host, pg_port) = + parse_host_port(&self.conf.listen_pg_addr).expect("Unable to parse listen_pg_addr"); + let (http_host, http_port) = parse_host_port(&self.conf.listen_http_addr) + .expect("Unable to parse listen_http_addr"); + attachment_service + .node_register(NodeRegisterRequest { + node_id: 
self.conf.id, + listen_pg_addr: pg_host.to_string(), + listen_pg_port: pg_port.unwrap_or(5432), + listen_http_addr: http_host.to_string(), + listen_http_port: http_port.unwrap_or(80), + }) + .await?; + } + + Ok(()) } fn pageserver_basic_args<'a>( diff --git a/control_plane/src/safekeeper.rs b/control_plane/src/safekeeper.rs index 4026ef0eb9..6ac71dfe51 100644 --- a/control_plane/src/safekeeper.rs +++ b/control_plane/src/safekeeper.rs @@ -7,7 +7,6 @@ //! ``` use std::io::Write; use std::path::PathBuf; -use std::process::Child; use std::{io, result}; use anyhow::Context; @@ -104,7 +103,7 @@ impl SafekeeperNode { .expect("non-Unicode path") } - pub async fn start(&self, extra_opts: Vec) -> anyhow::Result { + pub async fn start(&self, extra_opts: Vec) -> anyhow::Result<()> { print!( "Starting safekeeper at '{}' in '{}'", self.pg_connection_config.raw_address(), diff --git a/diesel.toml b/diesel.toml new file mode 100644 index 0000000000..30ed4444d7 --- /dev/null +++ b/diesel.toml @@ -0,0 +1,9 @@ +# For documentation on how to configure this file, +# see https://diesel.rs/guides/configuring-diesel-cli + +[print_schema] +file = "control_plane/attachment_service/src/schema.rs" +custom_type_derives = ["diesel::query_builder::QueryId"] + +[migrations_directory] +dir = "control_plane/attachment_service/migrations" diff --git a/libs/compute_api/src/spec.rs b/libs/compute_api/src/spec.rs index 13ac18e0c5..5361d14004 100644 --- a/libs/compute_api/src/spec.rs +++ b/libs/compute_api/src/spec.rs @@ -90,6 +90,9 @@ pub enum ComputeFeature { /// track short-lived connections as user activity. ActivityMonitorExperimental, + /// Enable running migrations + Migrations, + /// This is a special feature flag that is used to represent unknown feature flags. /// Basically all unknown to enum flags are represented as this one. See unit test /// `parse_unknown_features()` for more details. 
diff --git a/libs/pageserver_api/src/key.rs b/libs/pageserver_api/src/key.rs index 6a3679292e..852670af2c 100644 --- a/libs/pageserver_api/src/key.rs +++ b/libs/pageserver_api/src/key.rs @@ -1,9 +1,11 @@ use anyhow::{bail, Result}; use byteorder::{ByteOrder, BE}; +use postgres_ffi::relfile_utils::{FSM_FORKNUM, VISIBILITYMAP_FORKNUM}; +use postgres_ffi::{Oid, TransactionId}; use serde::{Deserialize, Serialize}; -use std::fmt; +use std::{fmt, ops::Range}; -use crate::reltag::{BlockNumber, RelTag}; +use crate::reltag::{BlockNumber, RelTag, SlruKind}; /// Key used in the Repository kv-store. /// @@ -143,12 +145,390 @@ impl Key { } } +// Layout of the Key address space +// +// The Key struct, used to address the underlying key-value store, consists of +// 18 bytes, split into six fields. See 'Key' in repository.rs. We need to map +// all the data and metadata keys into those 18 bytes. +// +// Principles for the mapping: +// +// - Things that are often accessed or modified together, should be close to +// each other in the key space. For example, if a relation is extended by one +// block, we create a new key-value pair for the block data, and update the +// relation size entry. Because of that, the RelSize key comes after all the +// RelBlocks of a relation: the RelSize and the last RelBlock are always next +// to each other. 
+// +// The key space is divided into four major sections, identified by the first +// byte, and they form a hierarchy: +// +// 00 Relation data and metadata +// +// DbDir () -> (dbnode, spcnode) +// Filenodemap +// RelDir -> relnode forknum +// RelBlocks +// RelSize +// +// 01 SLRUs +// +// SlruDir kind +// SlruSegBlocks segno +// SlruSegSize +// +// 02 pg_twophase +// +// 03 misc +// Controlfile +// checkpoint +// pg_version +// +// 04 aux files +// +// Below is a full list of the keyspace allocation: +// +// DbDir: +// 00 00000000 00000000 00000000 00 00000000 +// +// Filenodemap: +// 00 SPCNODE DBNODE 00000000 00 00000000 +// +// RelDir: +// 00 SPCNODE DBNODE 00000000 00 00000001 (Postgres never uses relfilenode 0) +// +// RelBlock: +// 00 SPCNODE DBNODE RELNODE FORK BLKNUM +// +// RelSize: +// 00 SPCNODE DBNODE RELNODE FORK FFFFFFFF +// +// SlruDir: +// 01 kind 00000000 00000000 00 00000000 +// +// SlruSegBlock: +// 01 kind 00000001 SEGNO 00 BLKNUM +// +// SlruSegSize: +// 01 kind 00000001 SEGNO 00 FFFFFFFF +// +// TwoPhaseDir: +// 02 00000000 00000000 00000000 00 00000000 +// +// TwoPhaseFile: +// 02 00000000 00000000 00000000 00 XID +// +// ControlFile: +// 03 00000000 00000000 00000000 00 00000000 +// +// Checkpoint: +// 03 00000000 00000000 00000000 00 00000001 +// +// AuxFiles: +// 03 00000000 00000000 00000000 00 00000002 +// + +//-- Section 01: relation data and metadata + +pub const DBDIR_KEY: Key = Key { + field1: 0x00, + field2: 0, + field3: 0, + field4: 0, + field5: 0, + field6: 0, +}; + +#[inline(always)] +pub fn dbdir_key_range(spcnode: Oid, dbnode: Oid) -> Range { + Key { + field1: 0x00, + field2: spcnode, + field3: dbnode, + field4: 0, + field5: 0, + field6: 0, + }..Key { + field1: 0x00, + field2: spcnode, + field3: dbnode, + field4: 0xffffffff, + field5: 0xff, + field6: 0xffffffff, + } +} + +#[inline(always)] +pub fn relmap_file_key(spcnode: Oid, dbnode: Oid) -> Key { + Key { + field1: 0x00, + field2: spcnode, + field3: dbnode, + field4: 0, + 
field5: 0, + field6: 0, + } +} + +#[inline(always)] +pub fn rel_dir_to_key(spcnode: Oid, dbnode: Oid) -> Key { + Key { + field1: 0x00, + field2: spcnode, + field3: dbnode, + field4: 0, + field5: 0, + field6: 1, + } +} + +#[inline(always)] +pub fn rel_block_to_key(rel: RelTag, blknum: BlockNumber) -> Key { + Key { + field1: 0x00, + field2: rel.spcnode, + field3: rel.dbnode, + field4: rel.relnode, + field5: rel.forknum, + field6: blknum, + } +} + +#[inline(always)] +pub fn rel_size_to_key(rel: RelTag) -> Key { + Key { + field1: 0x00, + field2: rel.spcnode, + field3: rel.dbnode, + field4: rel.relnode, + field5: rel.forknum, + field6: 0xffffffff, + } +} + +#[inline(always)] +pub fn rel_key_range(rel: RelTag) -> Range { + Key { + field1: 0x00, + field2: rel.spcnode, + field3: rel.dbnode, + field4: rel.relnode, + field5: rel.forknum, + field6: 0, + }..Key { + field1: 0x00, + field2: rel.spcnode, + field3: rel.dbnode, + field4: rel.relnode, + field5: rel.forknum + 1, + field6: 0, + } +} + +//-- Section 02: SLRUs + +#[inline(always)] +pub fn slru_dir_to_key(kind: SlruKind) -> Key { + Key { + field1: 0x01, + field2: match kind { + SlruKind::Clog => 0x00, + SlruKind::MultiXactMembers => 0x01, + SlruKind::MultiXactOffsets => 0x02, + }, + field3: 0, + field4: 0, + field5: 0, + field6: 0, + } +} + +#[inline(always)] +pub fn slru_block_to_key(kind: SlruKind, segno: u32, blknum: BlockNumber) -> Key { + Key { + field1: 0x01, + field2: match kind { + SlruKind::Clog => 0x00, + SlruKind::MultiXactMembers => 0x01, + SlruKind::MultiXactOffsets => 0x02, + }, + field3: 1, + field4: segno, + field5: 0, + field6: blknum, + } +} + +#[inline(always)] +pub fn slru_segment_size_to_key(kind: SlruKind, segno: u32) -> Key { + Key { + field1: 0x01, + field2: match kind { + SlruKind::Clog => 0x00, + SlruKind::MultiXactMembers => 0x01, + SlruKind::MultiXactOffsets => 0x02, + }, + field3: 1, + field4: segno, + field5: 0, + field6: 0xffffffff, + } +} + +#[inline(always)] +pub fn 
slru_segment_key_range(kind: SlruKind, segno: u32) -> Range { + let field2 = match kind { + SlruKind::Clog => 0x00, + SlruKind::MultiXactMembers => 0x01, + SlruKind::MultiXactOffsets => 0x02, + }; + + Key { + field1: 0x01, + field2, + field3: 1, + field4: segno, + field5: 0, + field6: 0, + }..Key { + field1: 0x01, + field2, + field3: 1, + field4: segno, + field5: 1, + field6: 0, + } +} + +//-- Section 03: pg_twophase + +pub const TWOPHASEDIR_KEY: Key = Key { + field1: 0x02, + field2: 0, + field3: 0, + field4: 0, + field5: 0, + field6: 0, +}; + +#[inline(always)] +pub fn twophase_file_key(xid: TransactionId) -> Key { + Key { + field1: 0x02, + field2: 0, + field3: 0, + field4: 0, + field5: 0, + field6: xid, + } +} + +#[inline(always)] +pub fn twophase_key_range(xid: TransactionId) -> Range { + let (next_xid, overflowed) = xid.overflowing_add(1); + + Key { + field1: 0x02, + field2: 0, + field3: 0, + field4: 0, + field5: 0, + field6: xid, + }..Key { + field1: 0x02, + field2: 0, + field3: 0, + field4: 0, + field5: u8::from(overflowed), + field6: next_xid, + } +} + +//-- Section 04: Control file +pub const CONTROLFILE_KEY: Key = Key { + field1: 0x03, + field2: 0, + field3: 0, + field4: 0, + field5: 0, + field6: 0, +}; + +pub const CHECKPOINT_KEY: Key = Key { + field1: 0x03, + field2: 0, + field3: 0, + field4: 0, + field5: 0, + field6: 1, +}; + +pub const AUX_FILES_KEY: Key = Key { + field1: 0x03, + field2: 0, + field3: 0, + field4: 0, + field5: 0, + field6: 2, +}; + +// Reverse mappings for a few Keys. +// These are needed by WAL redo manager. + +// AUX_FILES currently stores only data for logical replication (slots etc), and +// we don't preserve these on a branch because safekeepers can't follow timeline +// switch (and generally it likely should be optional), so ignore these. 
+#[inline(always)] +pub fn is_inherited_key(key: Key) -> bool { + key != AUX_FILES_KEY +} + +#[inline(always)] +pub fn is_rel_fsm_block_key(key: Key) -> bool { + key.field1 == 0x00 && key.field4 != 0 && key.field5 == FSM_FORKNUM && key.field6 != 0xffffffff +} + +#[inline(always)] +pub fn is_rel_vm_block_key(key: Key) -> bool { + key.field1 == 0x00 + && key.field4 != 0 + && key.field5 == VISIBILITYMAP_FORKNUM + && key.field6 != 0xffffffff +} + +#[inline(always)] +pub fn key_to_slru_block(key: Key) -> anyhow::Result<(SlruKind, u32, BlockNumber)> { + Ok(match key.field1 { + 0x01 => { + let kind = match key.field2 { + 0x00 => SlruKind::Clog, + 0x01 => SlruKind::MultiXactMembers, + 0x02 => SlruKind::MultiXactOffsets, + _ => anyhow::bail!("unrecognized slru kind 0x{:02x}", key.field2), + }; + let segno = key.field4; + let blknum = key.field6; + + (kind, segno, blknum) + } + _ => anyhow::bail!("unexpected value kind 0x{:02x}", key.field1), + }) +} + +#[inline(always)] +pub fn is_slru_block_key(key: Key) -> bool { + key.field1 == 0x01 // SLRU-related + && key.field3 == 0x00000001 // but not SlruDir + && key.field6 != 0xffffffff // and not SlruSegSize +} + #[inline(always)] pub fn is_rel_block_key(key: &Key) -> bool { key.field1 == 0x00 && key.field4 != 0 && key.field6 != 0xffffffff } /// Guaranteed to return `Ok()` if [[is_rel_block_key]] returns `true` for `key`. 
+#[inline(always)] pub fn key_to_rel_block(key: Key) -> anyhow::Result<(RelTag, BlockNumber)> { Ok(match key.field1 { 0x00 => ( diff --git a/libs/pageserver_api/src/keyspace.rs b/libs/pageserver_api/src/keyspace.rs index cab7b3d860..2316acb616 100644 --- a/libs/pageserver_api/src/keyspace.rs +++ b/libs/pageserver_api/src/keyspace.rs @@ -104,6 +104,7 @@ pub struct KeySpaceAccum { accum: Option>, ranges: Vec>, + size: u64, } impl KeySpaceAccum { @@ -111,6 +112,7 @@ impl KeySpaceAccum { Self { accum: None, ranges: Vec::new(), + size: 0, } } @@ -121,6 +123,8 @@ impl KeySpaceAccum { #[inline(always)] pub fn add_range(&mut self, range: Range) { + self.size += key_range_size(&range) as u64; + match self.accum.as_mut() { Some(accum) => { if range.start == accum.end { @@ -146,6 +150,23 @@ impl KeySpaceAccum { ranges: self.ranges, } } + + pub fn consume_keyspace(&mut self) -> KeySpace { + if let Some(accum) = self.accum.take() { + self.ranges.push(accum); + } + + let mut prev_accum = KeySpaceAccum::new(); + std::mem::swap(self, &mut prev_accum); + + KeySpace { + ranges: prev_accum.ranges, + } + } + + pub fn size(&self) -> u64 { + self.size + } } /// @@ -254,6 +275,30 @@ mod tests { } } + #[test] + fn keyspace_consume() { + let ranges = vec![kr(0..10), kr(20..35), kr(40..45)]; + + let mut accum = KeySpaceAccum::new(); + for range in &ranges { + accum.add_range(range.clone()); + } + + let expected_size: u64 = ranges.iter().map(|r| key_range_size(r) as u64).sum(); + assert_eq!(accum.size(), expected_size); + + assert_ks_eq(&accum.consume_keyspace(), ranges.clone()); + assert_eq!(accum.size(), 0); + + assert_ks_eq(&accum.consume_keyspace(), vec![]); + assert_eq!(accum.size(), 0); + + for range in &ranges { + accum.add_range(range.clone()); + } + assert_ks_eq(&accum.to_keyspace(), ranges); + } + #[test] fn keyspace_add_range() { // two separate ranges diff --git a/libs/pageserver_api/src/reltag.rs b/libs/pageserver_api/src/reltag.rs index e3a7da2ad9..3f37af600d 100644 --- 
a/libs/pageserver_api/src/reltag.rs +++ b/libs/pageserver_api/src/reltag.rs @@ -111,7 +111,19 @@ impl RelTag { /// These files are divided into segments, which are divided into /// pages of the same BLCKSZ as used for relation files. /// -#[derive(Debug, Clone, Copy, Hash, Serialize, Deserialize, PartialEq, Eq, PartialOrd, Ord)] +#[derive( + Debug, + Clone, + Copy, + Hash, + Serialize, + Deserialize, + PartialEq, + Eq, + PartialOrd, + Ord, + strum_macros::EnumIter, +)] pub enum SlruKind { Clog, MultiXactMembers, diff --git a/libs/remote_storage/src/azure_blob.rs b/libs/remote_storage/src/azure_blob.rs index 7895a21f66..abab32470b 100644 --- a/libs/remote_storage/src/azure_blob.rs +++ b/libs/remote_storage/src/azure_blob.rs @@ -8,6 +8,7 @@ use std::pin::Pin; use std::str::FromStr; use std::sync::Arc; use std::time::Duration; +use std::time::SystemTime; use super::REMOTE_STORAGE_PREFIX_SEPARATOR; use anyhow::Result; @@ -23,6 +24,7 @@ use futures::stream::Stream; use futures_util::StreamExt; use http_types::{StatusCode, Url}; use tokio::time::Instant; +use tokio_util::sync::CancellationToken; use tracing::debug; use crate::s3_bucket::RequestKind; @@ -183,7 +185,6 @@ fn to_download_error(error: azure_core::Error) -> DownloadError { } } -#[async_trait::async_trait] impl RemoteStorage for AzureBlobStorage { async fn list( &self, @@ -371,6 +372,20 @@ impl RemoteStorage for AzureBlobStorage { copy_status = status; } } + + async fn time_travel_recover( + &self, + _prefix: Option<&RemotePath>, + _timestamp: SystemTime, + _done_if_after: SystemTime, + _cancel: CancellationToken, + ) -> anyhow::Result<()> { + // TODO use Azure point in time recovery feature for this + // https://learn.microsoft.com/en-us/azure/storage/blobs/point-in-time-restore-overview + Err(anyhow::anyhow!( + "time travel recovery for azure blob storage is not implemented" + )) + } } pin_project_lite::pin_project! 
{ diff --git a/libs/remote_storage/src/lib.rs b/libs/remote_storage/src/lib.rs index 942d0016b0..bf9c51ad1a 100644 --- a/libs/remote_storage/src/lib.rs +++ b/libs/remote_storage/src/lib.rs @@ -25,6 +25,7 @@ use bytes::Bytes; use futures::stream::Stream; use serde::{Deserialize, Serialize}; use tokio::sync::Semaphore; +use tokio_util::sync::CancellationToken; use toml_edit::Item; use tracing::info; @@ -142,7 +143,7 @@ pub struct Listing { /// Storage (potentially remote) API to manage its state. /// This storage tries to be unaware of any layered repository context, /// providing basic CRUD operations for storage files. -#[async_trait::async_trait] +#[allow(async_fn_in_trait)] pub trait RemoteStorage: Send + Sync + 'static { /// Lists all top level subdirectories for a given prefix /// Note: here we assume that if the prefix is passed it was obtained via remote_object_id @@ -210,6 +211,15 @@ pub trait RemoteStorage: Send + Sync + 'static { /// Copy a remote object inside a bucket from one path to another. async fn copy(&self, from: &RemotePath, to: &RemotePath) -> anyhow::Result<()>; + + /// Resets the content of everything with the given prefix to the given state + async fn time_travel_recover( + &self, + prefix: Option<&RemotePath>, + timestamp: SystemTime, + done_if_after: SystemTime, + cancel: CancellationToken, + ) -> anyhow::Result<()>; } pub type DownloadStream = Pin> + Unpin + Send + Sync>>; @@ -262,14 +272,15 @@ impl std::error::Error for DownloadError {} /// Every storage, currently supported. /// Serves as a simple way to pass around the [`RemoteStorage`] without dealing with generics. 
#[derive(Clone)] -pub enum GenericRemoteStorage { +// Require Clone for `Other` due to https://github.com/rust-lang/rust/issues/26925 +pub enum GenericRemoteStorage> { LocalFs(LocalFs), AwsS3(Arc), AzureBlob(Arc), - Unreliable(Arc), + Unreliable(Other), } -impl GenericRemoteStorage { +impl GenericRemoteStorage> { pub async fn list( &self, prefix: Option<&RemotePath>, @@ -386,6 +397,33 @@ impl GenericRemoteStorage { Self::Unreliable(s) => s.copy(from, to).await, } } + + pub async fn time_travel_recover( + &self, + prefix: Option<&RemotePath>, + timestamp: SystemTime, + done_if_after: SystemTime, + cancel: CancellationToken, + ) -> anyhow::Result<()> { + match self { + Self::LocalFs(s) => { + s.time_travel_recover(prefix, timestamp, done_if_after, cancel) + .await + } + Self::AwsS3(s) => { + s.time_travel_recover(prefix, timestamp, done_if_after, cancel) + .await + } + Self::AzureBlob(s) => { + s.time_travel_recover(prefix, timestamp, done_if_after, cancel) + .await + } + Self::Unreliable(s) => { + s.time_travel_recover(prefix, timestamp, done_if_after, cancel) + .await + } + } + } } impl GenericRemoteStorage { @@ -673,6 +711,7 @@ impl ConcurrencyLimiter { RequestKind::List => &self.read, RequestKind::Delete => &self.write, RequestKind::Copy => &self.write, + RequestKind::TimeTravel => &self.write, } } diff --git a/libs/remote_storage/src/local_fs.rs b/libs/remote_storage/src/local_fs.rs index bf8b6b5dde..34a6658a69 100644 --- a/libs/remote_storage/src/local_fs.rs +++ b/libs/remote_storage/src/local_fs.rs @@ -4,7 +4,7 @@ //! This storage used in tests, but can also be used in cases when a certain persistent //! volume is mounted to the local FS. 
-use std::{borrow::Cow, future::Future, io::ErrorKind, pin::Pin}; +use std::{borrow::Cow, future::Future, io::ErrorKind, pin::Pin, time::SystemTime}; use anyhow::{bail, ensure, Context}; use bytes::Bytes; @@ -14,7 +14,7 @@ use tokio::{ fs, io::{self, AsyncReadExt, AsyncSeekExt, AsyncWriteExt}, }; -use tokio_util::io::ReaderStream; +use tokio_util::{io::ReaderStream, sync::CancellationToken}; use tracing::*; use utils::{crashsafe::path_with_suffix_extension, fs_ext::is_directory_empty}; @@ -157,7 +157,6 @@ impl LocalFs { } } -#[async_trait::async_trait] impl RemoteStorage for LocalFs { async fn list( &self, @@ -423,6 +422,17 @@ impl RemoteStorage for LocalFs { })?; Ok(()) } + + #[allow(clippy::diverging_sub_expression)] + async fn time_travel_recover( + &self, + _prefix: Option<&RemotePath>, + _timestamp: SystemTime, + _done_if_after: SystemTime, + _cancel: CancellationToken, + ) -> anyhow::Result<()> { + unimplemented!() + } } fn storage_metadata_path(original_path: &Utf8Path) -> Utf8PathBuf { diff --git a/libs/remote_storage/src/s3_bucket.rs b/libs/remote_storage/src/s3_bucket.rs index d7b41edaaf..4909b8522b 100644 --- a/libs/remote_storage/src/s3_bucket.rs +++ b/libs/remote_storage/src/s3_bucket.rs @@ -6,12 +6,14 @@ use std::{ borrow::Cow, + collections::HashMap, pin::Pin, sync::Arc, task::{Context, Poll}, + time::SystemTime, }; -use anyhow::Context as _; +use anyhow::{anyhow, Context as _}; use aws_config::{ environment::credentials::EnvironmentVariableCredentialsProvider, imds::credentials::ImdsCredentialsProvider, @@ -27,17 +29,19 @@ use aws_sdk_s3::{ config::{AsyncSleep, Builder, IdentityCache, Region, SharedAsyncSleep}, error::SdkError, operation::get_object::GetObjectError, - types::{Delete, ObjectIdentifier}, + types::{Delete, DeleteMarkerEntry, ObjectIdentifier, ObjectVersion}, Client, }; use aws_smithy_async::rt::sleep::TokioSleep; -use aws_smithy_types::body::SdkBody; use aws_smithy_types::byte_stream::ByteStream; +use aws_smithy_types::{body::SdkBody, 
DateTime}; use bytes::Bytes; use futures::stream::Stream; use hyper::Body; use scopeguard::ScopeGuard; +use tokio_util::sync::CancellationToken; +use utils::backoff; use super::StorageMetadata; use crate::{ @@ -270,6 +274,59 @@ impl S3Bucket { } } } + + async fn delete_oids( + &self, + kind: RequestKind, + delete_objects: &[ObjectIdentifier], + ) -> anyhow::Result<()> { + for chunk in delete_objects.chunks(MAX_KEYS_PER_DELETE) { + let started_at = start_measuring_requests(kind); + + let resp = self + .client + .delete_objects() + .bucket(self.bucket_name.clone()) + .delete( + Delete::builder() + .set_objects(Some(chunk.to_vec())) + .build()?, + ) + .send() + .await; + + let started_at = ScopeGuard::into_inner(started_at); + metrics::BUCKET_METRICS + .req_seconds + .observe_elapsed(kind, &resp, started_at); + + let resp = resp?; + metrics::BUCKET_METRICS + .deleted_objects_total + .inc_by(chunk.len() as u64); + if let Some(errors) = resp.errors { + // Log a bounded number of the errors within the response: + // these requests can carry 1000 keys so logging each one + // would be too verbose, especially as errors may lead us + // to retry repeatedly. + const LOG_UP_TO_N_ERRORS: usize = 10; + for e in errors.iter().take(LOG_UP_TO_N_ERRORS) { + tracing::warn!( + "DeleteObjects key {} failed: {}: {}", + e.key.as_ref().map(Cow::from).unwrap_or("".into()), + e.code.as_ref().map(Cow::from).unwrap_or("".into()), + e.message.as_ref().map(Cow::from).unwrap_or("".into()) + ); + } + + return Err(anyhow::format_err!( + "Failed to delete {} objects", + errors.len() + )); + } + } + Ok(()) + } } pin_project_lite::pin_project! 
{ @@ -373,7 +430,6 @@ impl>> Stream for TimedDownload { } } -#[async_trait::async_trait] impl RemoteStorage for S3Bucket { async fn list( &self, @@ -569,64 +625,168 @@ impl RemoteStorage for S3Bucket { delete_objects.push(obj_id); } - for chunk in delete_objects.chunks(MAX_KEYS_PER_DELETE) { - let started_at = start_measuring_requests(kind); - - let resp = self - .client - .delete_objects() - .bucket(self.bucket_name.clone()) - .delete( - Delete::builder() - .set_objects(Some(chunk.to_vec())) - .build()?, - ) - .send() - .await; - - let started_at = ScopeGuard::into_inner(started_at); - metrics::BUCKET_METRICS - .req_seconds - .observe_elapsed(kind, &resp, started_at); - - match resp { - Ok(resp) => { - metrics::BUCKET_METRICS - .deleted_objects_total - .inc_by(chunk.len() as u64); - if let Some(errors) = resp.errors { - // Log a bounded number of the errors within the response: - // these requests can carry 1000 keys so logging each one - // would be too verbose, especially as errors may lead us - // to retry repeatedly. 
- const LOG_UP_TO_N_ERRORS: usize = 10; - for e in errors.iter().take(LOG_UP_TO_N_ERRORS) { - tracing::warn!( - "DeleteObjects key {} failed: {}: {}", - e.key.as_ref().map(Cow::from).unwrap_or("".into()), - e.code.as_ref().map(Cow::from).unwrap_or("".into()), - e.message.as_ref().map(Cow::from).unwrap_or("".into()) - ); - } - - return Err(anyhow::format_err!( - "Failed to delete {} objects", - errors.len() - )); - } - } - Err(e) => { - return Err(e.into()); - } - } - } - Ok(()) + self.delete_oids(kind, &delete_objects).await } async fn delete(&self, path: &RemotePath) -> anyhow::Result<()> { let paths = std::array::from_ref(path); self.delete_objects(paths).await } + + async fn time_travel_recover( + &self, + prefix: Option<&RemotePath>, + timestamp: SystemTime, + done_if_after: SystemTime, + cancel: CancellationToken, + ) -> anyhow::Result<()> { + let kind = RequestKind::TimeTravel; + let _guard = self.permit(kind).await; + + let timestamp = DateTime::from(timestamp); + let done_if_after = DateTime::from(done_if_after); + + tracing::trace!("Target time: {timestamp:?}, done_if_after {done_if_after:?}"); + + // get the passed prefix or if it is not set use prefix_in_bucket value + let prefix = prefix + .map(|p| self.relative_path_to_s3_object(p)) + .or_else(|| self.prefix_in_bucket.clone()); + + let warn_threshold = 3; + let max_retries = 10; + let is_permanent = |_e: &_| false; + + let list = backoff::retry( + || async { + Ok(self + .client + .list_object_versions() + .bucket(self.bucket_name.clone()) + .set_prefix(prefix.clone()) + .send() + .await?) 
+ }, + is_permanent, + warn_threshold, + max_retries, + "listing object versions for time_travel_recover", + backoff::Cancel::new(cancel.clone(), || anyhow!("Cancelled")), + ) + .await?; + + if list.is_truncated().unwrap_or_default() { + anyhow::bail!("Received truncated ListObjectVersions response for prefix={prefix:?}"); + } + + let mut versions_deletes = list + .versions() + .iter() + .map(VerOrDelete::Version) + .chain(list.delete_markers().iter().map(VerOrDelete::DeleteMarker)) + .collect::>(); + + versions_deletes.sort_by_key(|vd| (vd.key(), vd.last_modified())); + + let mut vds_for_key = HashMap::<_, Vec<_>>::new(); + + for vd in versions_deletes { + let last_modified = vd.last_modified(); + let version_id = vd.version_id(); + let key = vd.key(); + let (Some(last_modified), Some(version_id), Some(key)) = + (last_modified, version_id, key) + else { + anyhow::bail!( + "One (or more) of last_modified, key, and id is None. \ + Is versioning enabled in the bucket? last_modified={:?} key={:?} version_id={:?}", + last_modified, key, version_id, + ); + }; + if version_id == "null" { + anyhow::bail!("Received ListVersions response for key={key} with version_id='null', \ + indicating either disabled versioning, or legacy objects with null version id values"); + } + tracing::trace!( + "Parsing version key={key} version_id={version_id} is_delete={}", + matches!(vd, VerOrDelete::DeleteMarker(_)) + ); + + vds_for_key + .entry(key) + .or_default() + .push((vd, last_modified, version_id)); + } + for (key, versions) in vds_for_key { + let (last_vd, last_last_modified, _version_id) = versions.last().unwrap(); + if last_last_modified > &&done_if_after { + tracing::trace!("Key {key} has version later than done_if_after, skipping"); + continue; + } + // the version we want to restore to. 
+ let version_to_restore_to = + match versions.binary_search_by_key(&timestamp, |tpl| *tpl.1) { + Ok(v) => v, + Err(e) => e, + }; + if version_to_restore_to == versions.len() { + tracing::trace!("Key {key} has no changes since timestamp, skipping"); + continue; + } + let mut do_delete = false; + if version_to_restore_to == 0 { + // All versions more recent, so the key didn't exist at the specified time point. + tracing::trace!( + "All {} versions more recent for {key}, deleting", + versions.len() + ); + do_delete = true; + } else { + match &versions[version_to_restore_to - 1] { + (VerOrDelete::Version(_), _last_modified, version_id) => { + tracing::trace!("Copying old version {version_id} for {key}..."); + // Restore the state to the last version by copying + let source_id = + format!("{}/{key}?versionId={version_id}", self.bucket_name); + + backoff::retry( + || async { + Ok(self + .client + .copy_object() + .bucket(self.bucket_name.clone()) + .key(key) + .copy_source(&source_id) + .send() + .await?) + }, + is_permanent, + warn_threshold, + max_retries, + "copying object version for time_travel_recover", + backoff::Cancel::new(cancel.clone(), || anyhow!("Cancelled")), + ) + .await?; + } + (VerOrDelete::DeleteMarker(_), _last_modified, _version_id) => { + do_delete = true; + } + } + }; + if do_delete { + if matches!(last_vd, VerOrDelete::DeleteMarker(_)) { + // Key has since been deleted (but there was some history), no need to do anything + tracing::trace!("Key {key} already deleted, skipping."); + } else { + tracing::trace!("Deleting {key}..."); + + let oid = ObjectIdentifier::builder().key(key.to_owned()).build()?; + self.delete_oids(kind, &[oid]).await?; + } + } + } + Ok(()) + } } /// On drop (cancellation) count towards [`metrics::BucketMetrics::cancelled_waits`].
@@ -651,6 +811,32 @@ fn start_measuring_requests( }) } +enum VerOrDelete<'a> { + Version(&'a ObjectVersion), + DeleteMarker(&'a DeleteMarkerEntry), +} + +impl<'a> VerOrDelete<'a> { + fn last_modified(&self) -> Option<&'a DateTime> { + match self { + VerOrDelete::Version(v) => v.last_modified(), + VerOrDelete::DeleteMarker(v) => v.last_modified(), + } + } + fn version_id(&self) -> Option<&'a str> { + match self { + VerOrDelete::Version(v) => v.version_id(), + VerOrDelete::DeleteMarker(v) => v.version_id(), + } + } + fn key(&self) -> Option<&'a str> { + match self { + VerOrDelete::Version(v) => v.key(), + VerOrDelete::DeleteMarker(v) => v.key(), + } + } +} + #[cfg(test)] mod tests { use camino::Utf8Path; diff --git a/libs/remote_storage/src/s3_bucket/metrics.rs b/libs/remote_storage/src/s3_bucket/metrics.rs index 21dde14906..beca755920 100644 --- a/libs/remote_storage/src/s3_bucket/metrics.rs +++ b/libs/remote_storage/src/s3_bucket/metrics.rs @@ -12,6 +12,7 @@ pub(crate) enum RequestKind { Delete = 2, List = 3, Copy = 4, + TimeTravel = 5, } use RequestKind::*; @@ -24,6 +25,7 @@ impl RequestKind { Delete => "delete_object", List => "list_objects", Copy => "copy_object", + TimeTravel => "time_travel_recover", } } const fn as_index(&self) -> usize { @@ -31,7 +33,7 @@ impl RequestKind { } } -pub(super) struct RequestTyped([C; 5]); +pub(super) struct RequestTyped([C; 6]); impl RequestTyped { pub(super) fn get(&self, kind: RequestKind) -> &C { @@ -40,8 +42,8 @@ impl RequestTyped { fn build_with(mut f: impl FnMut(RequestKind) -> C) -> Self { use RequestKind::*; - let mut it = [Get, Put, Delete, List, Copy].into_iter(); - let arr = std::array::from_fn::(|index| { + let mut it = [Get, Put, Delete, List, Copy, TimeTravel].into_iter(); + let arr = std::array::from_fn::(|index| { let next = it.next().unwrap(); assert_eq!(index, next.as_index()); f(next) diff --git a/libs/remote_storage/src/simulate_failures.rs b/libs/remote_storage/src/simulate_failures.rs index 
7f5adcea30..fc4c4b315b 100644 --- a/libs/remote_storage/src/simulate_failures.rs +++ b/libs/remote_storage/src/simulate_failures.rs @@ -3,16 +3,19 @@ //! testing purposes. use bytes::Bytes; use futures::stream::Stream; -use std::collections::hash_map::Entry; use std::collections::HashMap; use std::sync::Mutex; +use std::time::SystemTime; +use std::{collections::hash_map::Entry, sync::Arc}; +use tokio_util::sync::CancellationToken; use crate::{ - Download, DownloadError, Listing, ListingMode, RemotePath, RemoteStorage, StorageMetadata, + Download, DownloadError, GenericRemoteStorage, Listing, ListingMode, RemotePath, RemoteStorage, + StorageMetadata, }; pub struct UnreliableWrapper { - inner: crate::GenericRemoteStorage, + inner: GenericRemoteStorage>, // This many attempts of each operation will fail, then we let it succeed. attempts_to_fail: u64, @@ -29,11 +32,21 @@ enum RemoteOp { Download(RemotePath), Delete(RemotePath), DeleteObjects(Vec), + TimeTravelRecover(Option), } impl UnreliableWrapper { pub fn new(inner: crate::GenericRemoteStorage, attempts_to_fail: u64) -> Self { assert!(attempts_to_fail > 0); + let inner = match inner { + GenericRemoteStorage::AwsS3(s) => GenericRemoteStorage::AwsS3(s), + GenericRemoteStorage::AzureBlob(s) => GenericRemoteStorage::AzureBlob(s), + GenericRemoteStorage::LocalFs(s) => GenericRemoteStorage::LocalFs(s), + // We could also make this a no-op, as in, extract the inner of the passed generic remote storage + GenericRemoteStorage::Unreliable(_s) => { + panic!("Can't wrap unreliable wrapper unreliably") + } + }; UnreliableWrapper { inner, attempts_to_fail, @@ -84,7 +97,9 @@ impl UnreliableWrapper { } } -#[async_trait::async_trait] +// We never construct this, so the type is not important, just has to not be UnreliableWrapper and impl RemoteStorage. 
+type VoidStorage = crate::LocalFs; + impl RemoteStorage for UnreliableWrapper { async fn list_prefixes( &self, @@ -169,4 +184,17 @@ impl RemoteStorage for UnreliableWrapper { self.attempt(RemoteOp::Upload(to.clone()))?; self.inner.copy_object(from, to).await } + + async fn time_travel_recover( + &self, + prefix: Option<&RemotePath>, + timestamp: SystemTime, + done_if_after: SystemTime, + cancel: CancellationToken, + ) -> anyhow::Result<()> { + self.attempt(RemoteOp::TimeTravelRecover(prefix.map(|p| p.to_owned())))?; + self.inner + .time_travel_recover(prefix, timestamp, done_if_after, cancel) + .await + } } diff --git a/libs/remote_storage/tests/test_real_s3.rs b/libs/remote_storage/tests/test_real_s3.rs index 4a999d115e..679be66bf7 100644 --- a/libs/remote_storage/tests/test_real_s3.rs +++ b/libs/remote_storage/tests/test_real_s3.rs @@ -1,15 +1,21 @@ -use std::collections::HashSet; use std::env; +use std::fmt::{Debug, Display}; use std::num::NonZeroUsize; use std::ops::ControlFlow; use std::sync::Arc; -use std::time::UNIX_EPOCH; +use std::time::{Duration, UNIX_EPOCH}; +use std::{collections::HashSet, time::SystemTime}; +use crate::common::{download_to_vec, upload_stream}; use anyhow::Context; +use camino::Utf8Path; +use futures_util::Future; use remote_storage::{ GenericRemoteStorage, RemotePath, RemoteStorageConfig, RemoteStorageKind, S3Config, }; +use test_context::test_context; use test_context::AsyncTestContext; +use tokio_util::sync::CancellationToken; use tracing::info; mod common; @@ -18,11 +24,160 @@ mod common; mod tests_s3; use common::{cleanup, ensure_logging_ready, upload_remote_data, upload_simple_remote_data}; +use utils::backoff; const ENABLE_REAL_S3_REMOTE_STORAGE_ENV_VAR_NAME: &str = "ENABLE_REAL_S3_REMOTE_STORAGE"; const BASE_PREFIX: &str = "test"; +#[test_context(MaybeEnabledStorage)] +#[tokio::test] +async fn s3_time_travel_recovery_works(ctx: &mut MaybeEnabledStorage) -> anyhow::Result<()> { + let ctx = match ctx { + 
MaybeEnabledStorage::Enabled(ctx) => ctx, + MaybeEnabledStorage::Disabled => return Ok(()), + }; + // Our test depends on discrepancies in the clock between S3 and the environment the tests + // run in. Therefore, wait a little bit before and after. The alternative would be + // to take the time from S3 response headers. + const WAIT_TIME: Duration = Duration::from_millis(3_000); + + async fn retry(op: O) -> Result + where + E: Display + Debug + 'static, + O: FnMut() -> F, + F: Future>, + { + let warn_threshold = 3; + let max_retries = 10; + backoff::retry( + op, + |_e| false, + warn_threshold, + max_retries, + "test retry", + backoff::Cancel::new(CancellationToken::new(), || unreachable!()), + ) + .await + } + + async fn time_point() -> SystemTime { + tokio::time::sleep(WAIT_TIME).await; + let ret = SystemTime::now(); + tokio::time::sleep(WAIT_TIME).await; + ret + } + + async fn list_files(client: &Arc) -> anyhow::Result> { + Ok(retry(|| client.list_files(None)) + .await + .context("list root files failure")? 
+ .into_iter() + .collect::>()) + } + + let path1 = RemotePath::new(Utf8Path::new(format!("{}/path1", ctx.base_prefix).as_str())) + .with_context(|| "RemotePath conversion")?; + + let path2 = RemotePath::new(Utf8Path::new(format!("{}/path2", ctx.base_prefix).as_str())) + .with_context(|| "RemotePath conversion")?; + + let path3 = RemotePath::new(Utf8Path::new(format!("{}/path3", ctx.base_prefix).as_str())) + .with_context(|| "RemotePath conversion")?; + + retry(|| { + let (data, len) = upload_stream("remote blob data1".as_bytes().into()); + ctx.client.upload(data, len, &path1, None) + }) + .await?; + + let t0_files = list_files(&ctx.client).await?; + let t0 = time_point().await; + println!("at t0: {t0_files:?}"); + + let old_data = "remote blob data2"; + + retry(|| { + let (data, len) = upload_stream(old_data.as_bytes().into()); + ctx.client.upload(data, len, &path2, None) + }) + .await?; + + let t1_files = list_files(&ctx.client).await?; + let t1 = time_point().await; + println!("at t1: {t1_files:?}"); + + // A little check to ensure that our clock is not too far off from the S3 clock + { + let dl = retry(|| ctx.client.download(&path2)).await?; + let last_modified = dl.last_modified.unwrap(); + let half_wt = WAIT_TIME.mul_f32(0.5); + let t0_hwt = t0 + half_wt; + let t1_hwt = t1 - half_wt; + if !(t0_hwt..=t1_hwt).contains(&last_modified) { + panic!("last_modified={last_modified:?} is not between t0_hwt={t0_hwt:?} and t1_hwt={t1_hwt:?}. 
\ + This likely means a large clock discrepancy between S3 and the local clock."); + } + } + + retry(|| { + let (data, len) = upload_stream("remote blob data3".as_bytes().into()); + ctx.client.upload(data, len, &path3, None) + }) + .await?; + + let new_data = "new remote blob data2"; + + retry(|| { + let (data, len) = upload_stream(new_data.as_bytes().into()); + ctx.client.upload(data, len, &path2, None) + }) + .await?; + + retry(|| ctx.client.delete(&path1)).await?; + let t2_files = list_files(&ctx.client).await?; + let t2 = time_point().await; + println!("at t2: {t2_files:?}"); + + // No changes after recovery to t2 (no-op) + let t_final = time_point().await; + ctx.client + .time_travel_recover(None, t2, t_final, CancellationToken::new()) + .await?; + let t2_files_recovered = list_files(&ctx.client).await?; + println!("after recovery to t2: {t2_files_recovered:?}"); + assert_eq!(t2_files, t2_files_recovered); + let path2_recovered_t2 = download_to_vec(ctx.client.download(&path2).await?).await?; + assert_eq!(path2_recovered_t2, new_data.as_bytes()); + + // after recovery to t1: path1 is back, path2 has the old content + let t_final = time_point().await; + ctx.client + .time_travel_recover(None, t1, t_final, CancellationToken::new()) + .await?; + let t1_files_recovered = list_files(&ctx.client).await?; + println!("after recovery to t1: {t1_files_recovered:?}"); + assert_eq!(t1_files, t1_files_recovered); + let path2_recovered_t1 = download_to_vec(ctx.client.download(&path2).await?).await?; + assert_eq!(path2_recovered_t1, old_data.as_bytes()); + + // after recovery to t0: everything is gone except for path1 + let t_final = time_point().await; + ctx.client + .time_travel_recover(None, t0, t_final, CancellationToken::new()) + .await?; + let t0_files_recovered = list_files(&ctx.client).await?; + println!("after recovery to t0: {t0_files_recovered:?}"); + assert_eq!(t0_files, t0_files_recovered); + + // cleanup + + let paths = &[path1, path2, path3]; + retry(||
ctx.client.delete_objects(paths)).await?; + + Ok(()) +} + struct EnabledS3 { client: Arc, base_prefix: &'static str, diff --git a/libs/utils/src/http/error.rs b/libs/utils/src/http/error.rs index 3e9281ac81..d55823b0b7 100644 --- a/libs/utils/src/http/error.rs +++ b/libs/utils/src/http/error.rs @@ -131,7 +131,9 @@ pub fn api_error_handler(api_error: ApiError) -> Response { ApiError::ResourceUnavailable(_) => info!("Error processing HTTP request: {api_error:#}"), ApiError::NotFound(_) => info!("Error processing HTTP request: {api_error:#}"), ApiError::InternalServerError(_) => error!("Error processing HTTP request: {api_error:?}"), - _ => error!("Error processing HTTP request: {api_error:#}"), + ApiError::ShuttingDown => info!("Shut down while processing HTTP request"), + ApiError::Timeout(_) => info!("Timeout while processing HTTP request: {api_error:#}"), + _ => info!("Error processing HTTP request: {api_error:#}"), } api_error.into_response() diff --git a/libs/utils/src/nonblock.rs b/libs/utils/src/nonblock.rs index 8b1fd71ae6..05e2e3af4c 100644 --- a/libs/utils/src/nonblock.rs +++ b/libs/utils/src/nonblock.rs @@ -5,10 +5,10 @@ use std::os::unix::io::RawFd; pub fn set_nonblock(fd: RawFd) -> Result<(), std::io::Error> { let bits = fcntl(fd, F_GETFL)?; - // Safety: If F_GETFL returns some unknown bits, they should be valid + // If F_GETFL returns some unknown bits, they should be valid // for passing back to F_SETFL, too. If we left them out, the F_SETFL // would effectively clear them, which is not what we want. 
- let mut flags = unsafe { OFlag::from_bits_unchecked(bits) }; + let mut flags = OFlag::from_bits_retain(bits); flags |= OFlag::O_NONBLOCK; fcntl(fd, F_SETFL(flags))?; diff --git a/libs/utils/src/tcp_listener.rs b/libs/utils/src/tcp_listener.rs index 7666ad138c..6b35d3d63a 100644 --- a/libs/utils/src/tcp_listener.rs +++ b/libs/utils/src/tcp_listener.rs @@ -1,7 +1,6 @@ use std::{ io, net::{TcpListener, ToSocketAddrs}, - os::unix::prelude::AsRawFd, }; use nix::sys::socket::{setsockopt, sockopt::ReuseAddr}; @@ -10,7 +9,7 @@ use nix::sys::socket::{setsockopt, sockopt::ReuseAddr}; pub fn bind(addr: A) -> io::Result { let listener = TcpListener::bind(addr)?; - setsockopt(listener.as_raw_fd(), ReuseAddr, &true)?; + setsockopt(&listener, ReuseAddr, &true)?; Ok(listener) } diff --git a/pageserver/Cargo.toml b/pageserver/Cargo.toml index 980fbab22e..e44501d1ed 100644 --- a/pageserver/Cargo.toml +++ b/pageserver/Cargo.toml @@ -61,6 +61,7 @@ sync_wrapper.workspace = true tokio-tar.workspace = true thiserror.workspace = true tokio = { workspace = true, features = ["process", "sync", "fs", "rt", "io-util", "time"] } +tokio-epoll-uring.workspace = true tokio-io-timeout.workspace = true tokio-postgres.workspace = true tokio-stream.workspace = true diff --git a/pageserver/ctl/src/layer_map_analyzer.rs b/pageserver/ctl/src/layer_map_analyzer.rs index 15d4eb09e0..eb5c3f15cf 100644 --- a/pageserver/ctl/src/layer_map_analyzer.rs +++ b/pageserver/ctl/src/layer_map_analyzer.rs @@ -18,7 +18,7 @@ use pageserver::tenant::block_io::FileBlockReader; use pageserver::tenant::disk_btree::{DiskBtreeReader, VisitDirection}; use pageserver::tenant::storage_layer::delta_layer::{Summary, DELTA_KEY_SIZE}; use pageserver::tenant::storage_layer::range_overlaps; -use pageserver::virtual_file::VirtualFile; +use pageserver::virtual_file::{self, VirtualFile}; use utils::{bin_ser::BeSer, lsn::Lsn}; @@ -142,7 +142,7 @@ pub(crate) async fn main(cmd: &AnalyzeLayerMapCmd) -> Result<()> { let ctx = 
RequestContext::new(TaskKind::DebugTool, DownloadBehavior::Error); // Initialize virtual_file (file desriptor cache) and page cache which are needed to access layer persistent B-Tree. - pageserver::virtual_file::init(10); + pageserver::virtual_file::init(10, virtual_file::IoEngineKind::StdFs); pageserver::page_cache::init(100); let mut total_delta_layers = 0usize; diff --git a/pageserver/ctl/src/layers.rs b/pageserver/ctl/src/layers.rs index ebf4a4bec3..dbbcfedac0 100644 --- a/pageserver/ctl/src/layers.rs +++ b/pageserver/ctl/src/layers.rs @@ -59,7 +59,7 @@ pub(crate) enum LayerCmd { async fn read_delta_file(path: impl AsRef, ctx: &RequestContext) -> Result<()> { let path = Utf8Path::from_path(path.as_ref()).expect("non-Unicode path"); - virtual_file::init(10); + virtual_file::init(10, virtual_file::IoEngineKind::StdFs); page_cache::init(100); let file = FileBlockReader::new(VirtualFile::open(path).await?); let summary_blk = file.read_blk(0, ctx).await?; @@ -187,7 +187,7 @@ pub(crate) async fn main(cmd: &LayerCmd) -> Result<()> { new_tenant_id, new_timeline_id, } => { - pageserver::virtual_file::init(10); + pageserver::virtual_file::init(10, virtual_file::IoEngineKind::StdFs); pageserver::page_cache::init(100); let ctx = RequestContext::new(TaskKind::DebugTool, DownloadBehavior::Error); diff --git a/pageserver/ctl/src/main.rs b/pageserver/ctl/src/main.rs index fb42d6d2f1..3c90933fe9 100644 --- a/pageserver/ctl/src/main.rs +++ b/pageserver/ctl/src/main.rs @@ -123,7 +123,7 @@ fn read_pg_control_file(control_file_path: &Utf8Path) -> anyhow::Result<()> { async fn print_layerfile(path: &Utf8Path) -> anyhow::Result<()> { // Basic initialization of things that don't change after startup - virtual_file::init(10); + virtual_file::init(10, virtual_file::IoEngineKind::StdFs); page_cache::init(100); let ctx = RequestContext::new(TaskKind::DebugTool, DownloadBehavior::Error); dump_layerfile_from_path(path, true, &ctx).await diff --git 
a/pageserver/pagebench/src/cmd/getpage_latest_lsn.rs b/pageserver/pagebench/src/cmd/getpage_latest_lsn.rs index 98f1852acd..400b5476b7 100644 --- a/pageserver/pagebench/src/cmd/getpage_latest_lsn.rs +++ b/pageserver/pagebench/src/cmd/getpage_latest_lsn.rs @@ -423,8 +423,8 @@ async fn client( tokio::select! { res = do_requests => { res }, _ = cancel.cancelled() => { - client.shutdown().await; - return; + // fallthrough to shutdown } } + client.shutdown().await; } diff --git a/pageserver/src/basebackup.rs b/pageserver/src/basebackup.rs index 7e5ae892ad..009deff0aa 100644 --- a/pageserver/src/basebackup.rs +++ b/pageserver/src/basebackup.rs @@ -11,8 +11,9 @@ //! from data stored in object storage. //! use anyhow::{anyhow, bail, ensure, Context}; -use bytes::{BufMut, BytesMut}; +use bytes::{BufMut, Bytes, BytesMut}; use fail::fail_point; +use pageserver_api::key::{key_to_slru_block, Key}; use postgres_ffi::pg_constants; use std::fmt::Write as FmtWrite; use std::time::SystemTime; @@ -133,6 +134,87 @@ where ctx: &'a RequestContext, } +/// A sink that accepts SLRU blocks ordered by key and forwards +/// full segments to the archive. 
+struct SlruSegmentsBuilder<'a, 'b, W> +where + W: AsyncWrite + Send + Sync + Unpin, +{ + ar: &'a mut Builder<&'b mut W>, + buf: Vec, + current_segment: Option<(SlruKind, u32)>, +} + +impl<'a, 'b, W> SlruSegmentsBuilder<'a, 'b, W> +where + W: AsyncWrite + Send + Sync + Unpin, +{ + fn new(ar: &'a mut Builder<&'b mut W>) -> Self { + Self { + ar, + buf: Vec::new(), + current_segment: None, + } + } + + async fn add_block(&mut self, key: &Key, block: Bytes) -> anyhow::Result<()> { + let (kind, segno, _) = key_to_slru_block(*key)?; + + match kind { + SlruKind::Clog => { + ensure!(block.len() == BLCKSZ as usize || block.len() == BLCKSZ as usize + 8); + } + SlruKind::MultiXactMembers | SlruKind::MultiXactOffsets => { + ensure!(block.len() == BLCKSZ as usize); + } + } + + let segment = (kind, segno); + match self.current_segment { + None => { + self.current_segment = Some(segment); + self.buf + .extend_from_slice(block.slice(..BLCKSZ as usize).as_ref()); + } + Some(current_seg) if current_seg == segment => { + self.buf + .extend_from_slice(block.slice(..BLCKSZ as usize).as_ref()); + } + Some(_) => { + self.flush().await?; + + self.current_segment = Some(segment); + self.buf + .extend_from_slice(block.slice(..BLCKSZ as usize).as_ref()); + } + } + + Ok(()) + } + + async fn flush(&mut self) -> anyhow::Result<()> { + let nblocks = self.buf.len() / BLCKSZ as usize; + let (kind, segno) = self.current_segment.take().unwrap(); + let segname = format!("{}/{:>04X}", kind.to_str(), segno); + let header = new_tar_header(&segname, self.buf.len() as u64)?; + self.ar.append(&header, self.buf.as_slice()).await?; + + trace!("Added to basebackup slru {} relsize {}", segname, nblocks); + + self.buf.clear(); + + Ok(()) + } + + async fn finish(mut self) -> anyhow::Result<()> { + if self.current_segment.is_none() || self.buf.is_empty() { + return Ok(()); + } + + self.flush().await + } +} + impl<'a, W> Basebackup<'a, W> where W: AsyncWrite + Send + Sync + Unpin, @@ -168,20 +250,27 @@ where } // 
Gather non-relational files from object storage pages. - for kind in [ - SlruKind::Clog, - SlruKind::MultiXactOffsets, - SlruKind::MultiXactMembers, - ] { - for segno in self + let slru_partitions = self + .timeline + .get_slru_keyspace(Version::Lsn(self.lsn), self.ctx) + .await? + .partition(Timeline::MAX_GET_VECTORED_KEYS * BLCKSZ as u64); + + let mut slru_builder = SlruSegmentsBuilder::new(&mut self.ar); + + for part in slru_partitions.parts { + let blocks = self .timeline - .list_slru_segments(kind, Version::Lsn(self.lsn), self.ctx) - .await? - { - self.add_slru_segment(kind, segno).await?; + .get_vectored(&part.ranges, self.lsn, self.ctx) + .await?; + + for (key, block) in blocks { + slru_builder.add_block(&key, block?).await?; } } + slru_builder.finish().await?; + let mut min_restart_lsn: Lsn = Lsn::MAX; // Create tablespace directories for ((spcnode, dbnode), has_relmap_file) in @@ -305,39 +394,6 @@ where Ok(()) } - // - // Generate SLRU segment files from repository. - // - async fn add_slru_segment(&mut self, slru: SlruKind, segno: u32) -> anyhow::Result<()> { - let nblocks = self - .timeline - .get_slru_segment_size(slru, segno, Version::Lsn(self.lsn), self.ctx) - .await?; - - let mut slru_buf: Vec = Vec::with_capacity(nblocks as usize * BLCKSZ as usize); - for blknum in 0..nblocks { - let img = self - .timeline - .get_slru_page_at_lsn(slru, segno, blknum, self.lsn, self.ctx) - .await?; - - if slru == SlruKind::Clog { - ensure!(img.len() == BLCKSZ as usize || img.len() == BLCKSZ as usize + 8); - } else { - ensure!(img.len() == BLCKSZ as usize); - } - - slru_buf.extend_from_slice(&img[..BLCKSZ as usize]); - } - - let segname = format!("{}/{:>04X}", slru.to_str(), segno); - let header = new_tar_header(&segname, slru_buf.len() as u64)?; - self.ar.append(&header, slru_buf.as_slice()).await?; - - trace!("Added to basebackup slru {} relsize {}", segname, nblocks); - Ok(()) - } - // // Include database/tablespace directories. 
// diff --git a/pageserver/src/bin/pageserver.rs b/pageserver/src/bin/pageserver.rs index 15e3359c06..84de76e55e 100644 --- a/pageserver/src/bin/pageserver.rs +++ b/pageserver/src/bin/pageserver.rs @@ -130,7 +130,7 @@ fn main() -> anyhow::Result<()> { let scenario = failpoint_support::init(); // Basic initialization of things that don't change after startup - virtual_file::init(conf.max_file_descriptors); + virtual_file::init(conf.max_file_descriptors, conf.virtual_file_io_engine); page_cache::init(conf.page_cache_size); start_pageserver(launch_ts, conf).context("Failed to start pageserver")?; diff --git a/pageserver/src/config.rs b/pageserver/src/config.rs index 52277d7f24..1989bef817 100644 --- a/pageserver/src/config.rs +++ b/pageserver/src/config.rs @@ -36,6 +36,7 @@ use crate::tenant::config::TenantConfOpt; use crate::tenant::{ TENANTS_SEGMENT_NAME, TENANT_DELETED_MARKER_FILE_NAME, TIMELINES_SEGMENT_NAME, }; +use crate::virtual_file; use crate::{ IGNORED_TENANT_FILE_NAME, METADATA_FILE_NAME, TENANT_CONFIG_NAME, TENANT_HEATMAP_BASENAME, TENANT_LOCATION_CONFIG_NAME, TIMELINE_DELETE_MARK_SUFFIX, TIMELINE_UNINIT_MARK_SUFFIX, @@ -43,6 +44,8 @@ use crate::{ use self::defaults::DEFAULT_CONCURRENT_TENANT_WARMUP; +use self::defaults::DEFAULT_VIRTUAL_FILE_IO_ENGINE; + pub mod defaults { use crate::tenant::config::defaults::*; use const_format::formatcp; @@ -79,6 +82,8 @@ pub mod defaults { pub const DEFAULT_INGEST_BATCH_SIZE: u64 = 100; + pub const DEFAULT_VIRTUAL_FILE_IO_ENGINE: &str = "std-fs"; + /// /// Default built-in configuration file. 
/// @@ -114,6 +119,8 @@ pub mod defaults { #ingest_batch_size = {DEFAULT_INGEST_BATCH_SIZE} +#virtual_file_io_engine = '{DEFAULT_VIRTUAL_FILE_IO_ENGINE}' + [tenant_config] #checkpoint_distance = {DEFAULT_CHECKPOINT_DISTANCE} # in bytes #checkpoint_timeout = {DEFAULT_CHECKPOINT_TIMEOUT} @@ -247,6 +254,8 @@ pub struct PageServerConf { /// Maximum number of WAL records to be ingested and committed at the same time pub ingest_batch_size: u64, + + pub virtual_file_io_engine: virtual_file::IoEngineKind, } /// We do not want to store this in a PageServerConf because the latter may be logged @@ -331,6 +340,8 @@ struct PageServerConfigBuilder { secondary_download_concurrency: BuilderValue, ingest_batch_size: BuilderValue, + + virtual_file_io_engine: BuilderValue, } impl Default for PageServerConfigBuilder { @@ -406,6 +417,8 @@ impl Default for PageServerConfigBuilder { secondary_download_concurrency: Set(DEFAULT_SECONDARY_DOWNLOAD_CONCURRENCY), ingest_batch_size: Set(DEFAULT_INGEST_BATCH_SIZE), + + virtual_file_io_engine: Set(DEFAULT_VIRTUAL_FILE_IO_ENGINE.parse().unwrap()), } } } @@ -562,6 +575,10 @@ impl PageServerConfigBuilder { self.ingest_batch_size = BuilderValue::Set(ingest_batch_size) } + pub fn virtual_file_io_engine(&mut self, value: virtual_file::IoEngineKind) { + self.virtual_file_io_engine = BuilderValue::Set(value); + } + pub fn build(self) -> anyhow::Result { let concurrent_tenant_warmup = self .concurrent_tenant_warmup @@ -669,6 +686,9 @@ impl PageServerConfigBuilder { ingest_batch_size: self .ingest_batch_size .ok_or(anyhow!("missing ingest_batch_size"))?, + virtual_file_io_engine: self + .virtual_file_io_engine + .ok_or(anyhow!("missing virtual_file_io_engine"))?, }) } } @@ -920,6 +940,9 @@ impl PageServerConf { builder.secondary_download_concurrency(parse_toml_u64(key, item)? 
as usize) }, "ingest_batch_size" => builder.ingest_batch_size(parse_toml_u64(key, item)?), + "virtual_file_io_engine" => { + builder.virtual_file_io_engine(parse_toml_from_str("virtual_file_io_engine", item)?) + } _ => bail!("unrecognized pageserver option '{key}'"), } } @@ -993,6 +1016,7 @@ impl PageServerConf { heatmap_upload_concurrency: defaults::DEFAULT_HEATMAP_UPLOAD_CONCURRENCY, secondary_download_concurrency: defaults::DEFAULT_SECONDARY_DOWNLOAD_CONCURRENCY, ingest_batch_size: defaults::DEFAULT_INGEST_BATCH_SIZE, + virtual_file_io_engine: DEFAULT_VIRTUAL_FILE_IO_ENGINE.parse().unwrap(), } } } @@ -1225,6 +1249,7 @@ background_task_maximum_delay = '334 s' heatmap_upload_concurrency: defaults::DEFAULT_HEATMAP_UPLOAD_CONCURRENCY, secondary_download_concurrency: defaults::DEFAULT_SECONDARY_DOWNLOAD_CONCURRENCY, ingest_batch_size: defaults::DEFAULT_INGEST_BATCH_SIZE, + virtual_file_io_engine: DEFAULT_VIRTUAL_FILE_IO_ENGINE.parse().unwrap(), }, "Correct defaults should be used when no config values are provided" ); @@ -1288,6 +1313,7 @@ background_task_maximum_delay = '334 s' heatmap_upload_concurrency: defaults::DEFAULT_HEATMAP_UPLOAD_CONCURRENCY, secondary_download_concurrency: defaults::DEFAULT_SECONDARY_DOWNLOAD_CONCURRENCY, ingest_batch_size: 100, + virtual_file_io_engine: DEFAULT_VIRTUAL_FILE_IO_ENGINE.parse().unwrap(), }, "Should be able to parse all basic config values correctly" ); diff --git a/pageserver/src/http/openapi_spec.yml b/pageserver/src/http/openapi_spec.yml index 1fbca1086f..a49eef8bb9 100644 --- a/pageserver/src/http/openapi_spec.yml +++ b/pageserver/src/http/openapi_spec.yml @@ -877,6 +877,56 @@ paths: schema: $ref: "#/components/schemas/ServiceUnavailableError" + /v1/tenant/{tenant_id}/{timeline_id}/preserve_initdb_archive: + parameters: + - name: tenant_id + in: path + required: true + schema: + type: string + - name: timeline_id + in: path + required: true + schema: + type: string + post: + description: | + Marks the initdb archive for 
preservation upon deletion of the timeline or tenant. + This is meant to be part of the disaster recovery process. + responses: + "202": + description: Initdb archive marked for preservation successfully + "404": + description: No tenant or timeline found for the specified ids + content: + application/json: + schema: + $ref: "#/components/schemas/Error" + "401": + description: Unauthorized Error + content: + application/json: + schema: + $ref: "#/components/schemas/UnauthorizedError" + "403": + description: Forbidden Error + content: + application/json: + schema: + $ref: "#/components/schemas/ForbiddenError" + "500": + description: Generic operation error + content: + application/json: + schema: + $ref: "#/components/schemas/Error" + "503": + description: Temporarily unavailable, please retry. + content: + application/json: + schema: + $ref: "#/components/schemas/ServiceUnavailableError" + /v1/tenant/{tenant_id}/synthetic_size: parameters: diff --git a/pageserver/src/http/routes.rs b/pageserver/src/http/routes.rs index 811232397c..aa56806246 100644 --- a/pageserver/src/http/routes.rs +++ b/pageserver/src/http/routes.rs @@ -187,6 +187,7 @@ impl From for ApiError { match e { InternalError(e) => ApiError::InternalServerError(anyhow::anyhow!("{e}")), MapState(e) => e.into(), + ShuttingDown(_) => ApiError::ShuttingDown, } } } @@ -495,6 +496,10 @@ async fn timeline_create_handler( .map_err(ApiError::InternalServerError)?; json_response(StatusCode::CREATED, timeline_info) } + Err(_) if tenant.cancel.is_cancelled() => { + // In case we get some ugly error type during shutdown, cast it into a clean 503.
+ json_response(StatusCode::SERVICE_UNAVAILABLE, HttpErrorBody::from_msg("Tenant shutting down".to_string())) + } Err(tenant::CreateTimelineError::Conflict | tenant::CreateTimelineError::AlreadyCreating) => { json_response(StatusCode::CONFLICT, ()) } @@ -561,6 +566,43 @@ async fn timeline_list_handler( json_response(StatusCode::OK, response_data) } +async fn timeline_preserve_initdb_handler( + request: Request, + _cancel: CancellationToken, +) -> Result, ApiError> { + let tenant_shard_id: TenantShardId = parse_request_param(&request, "tenant_shard_id")?; + let timeline_id: TimelineId = parse_request_param(&request, "timeline_id")?; + check_permission(&request, Some(tenant_shard_id.tenant_id))?; + + // Part of the process for disaster recovery from safekeeper-stored WAL: + // If we don't recover into a new timeline but want to keep the timeline ID, + // then the initdb archive is deleted. This endpoint copies it to a different + // location where timeline recreation can find it. + + async { + let tenant = mgr::get_tenant(tenant_shard_id, true)?; + + let timeline = tenant + .get_timeline(timeline_id, false) + .map_err(|e| ApiError::NotFound(e.into()))?; + + timeline + .preserve_initdb_archive() + .await + .context("preserving initdb archive") + .map_err(ApiError::InternalServerError)?; + + Ok::<_, ApiError>(()) + } + .instrument(info_span!("timeline_preserve_initdb_archive", + tenant_id = %tenant_shard_id.tenant_id, + shard_id = %tenant_shard_id.shard_slug(), + %timeline_id)) + .await?; + + json_response(StatusCode::OK, ()) +} + async fn timeline_detail_handler( request: Request, _cancel: CancellationToken, @@ -1220,19 +1262,9 @@ async fn tenant_create_handler( }; // We created the tenant. Existing API semantics are that the tenant // is Active when this function returns.
- if let res @ Err(_) = new_tenant + new_tenant .wait_to_become_active(ACTIVE_TENANT_TIMEOUT) - .await - { - // This shouldn't happen because we just created the tenant directory - // in upsert_location, and there aren't any remote timelines - // to load, so, nothing can really fail during load. - // Don't do cleanup because we don't know how we got here. - // The tenant will likely be in `Broken` state and subsequent - // calls will fail. - res.context("created tenant failed to become active") - .map_err(ApiError::InternalServerError)?; - } + .await?; json_response( StatusCode::CREATED, @@ -1943,6 +1975,10 @@ pub fn make_router( .post("/v1/tenant/:tenant_id/ignore", |r| { api_handler(r, tenant_ignore_handler) }) + .post( + "/v1/tenant/:tenant_shard_id/timeline/:timeline_id/preserve_initdb_archive", + |r| api_handler(r, timeline_preserve_initdb_handler), + ) .get("/v1/tenant/:tenant_shard_id/timeline/:timeline_id", |r| { api_handler(r, timeline_detail_handler) }) diff --git a/pageserver/src/lib.rs b/pageserver/src/lib.rs index 26070e0cc1..bcde1166b7 100644 --- a/pageserver/src/lib.rs +++ b/pageserver/src/lib.rs @@ -1,3 +1,4 @@ +#![recursion_limit = "300"] #![deny(clippy::undocumented_unsafe_blocks)] mod auth; diff --git a/pageserver/src/metrics.rs b/pageserver/src/metrics.rs index 993685db6e..9b3679e3c2 100644 --- a/pageserver/src/metrics.rs +++ b/pageserver/src/metrics.rs @@ -150,6 +150,43 @@ pub(crate) static MATERIALIZED_PAGE_CACHE_HIT: Lazy = Lazy::new(|| { .expect("failed to define a metric") }); +pub(crate) struct GetVectoredLatency { + map: EnumMap>, +} + +impl GetVectoredLatency { + // Only these task types perform vectored gets. Filter all other tasks out to reduce total + // cardinality of the metric. 
+ const TRACKED_TASK_KINDS: [TaskKind; 2] = [TaskKind::Compaction, TaskKind::PageRequestHandler]; + + pub(crate) fn for_task_kind(&self, task_kind: TaskKind) -> Option<&Histogram> { + self.map[task_kind].as_ref() + } +} + +pub(crate) static GET_VECTORED_LATENCY: Lazy = Lazy::new(|| { + let inner = register_histogram_vec!( + "pageserver_get_vectored_seconds", + "Time spent in get_vectored", + &["task_kind"], + CRITICAL_OP_BUCKETS.into(), + ) + .expect("failed to define a metric"); + + GetVectoredLatency { + map: EnumMap::from_array(std::array::from_fn(|task_kind_idx| { + let task_kind = ::from_usize(task_kind_idx); + + if GetVectoredLatency::TRACKED_TASK_KINDS.contains(&task_kind) { + let task_kind = task_kind.into(); + Some(inner.with_label_values(&[task_kind])) + } else { + None + } + })), + } +}); + pub(crate) struct PageCacheMetricsForTaskKind { pub read_accesses_materialized_page: IntCounter, pub read_accesses_immutable: IntCounter, @@ -932,6 +969,7 @@ pub(crate) static STORAGE_IO_SIZE: Lazy = Lazy::new(|| { .expect("failed to define a metric") }); +#[cfg(not(test))] pub(crate) mod virtual_file_descriptor_cache { use super::*; @@ -951,6 +989,20 @@ pub(crate) mod virtual_file_descriptor_cache { // ``` } +#[cfg(not(test))] +pub(crate) mod virtual_file_io_engine { + use super::*; + + pub(crate) static KIND: Lazy = Lazy::new(|| { + register_uint_gauge_vec!( + "pageserver_virtual_file_io_engine_kind", + "The configured io engine for VirtualFile", + &["kind"], + ) + .unwrap() + }); +} + #[derive(Debug)] struct GlobalAndPerTimelineHistogram { global: Histogram, diff --git a/pageserver/src/page_service.rs b/pageserver/src/page_service.rs index 77ce9981f0..a8a3487b4e 100644 --- a/pageserver/src/page_service.rs +++ b/pageserver/src/page_service.rs @@ -61,7 +61,7 @@ use crate::context::{DownloadBehavior, RequestContext}; use crate::import_datadir::import_wal_from_tar; use crate::metrics; use crate::metrics::LIVE_CONNECTIONS_COUNT; -use 
crate::pgdatadir_mapping::{rel_block_to_key, Version}; +use crate::pgdatadir_mapping::Version; use crate::task_mgr; use crate::task_mgr::TaskKind; use crate::tenant::debug_assert_current_span_has_tenant_and_timeline_id; @@ -75,6 +75,7 @@ use crate::tenant::PageReconstructError; use crate::tenant::Timeline; use crate::trace::Tracer; +use pageserver_api::key::rel_block_to_key; use postgres_ffi::pg_constants::DEFAULTTABLESPACE_OID; use postgres_ffi::BLCKSZ; @@ -321,8 +322,8 @@ enum PageStreamError { Shutdown, /// Something went wrong reading a page: this likely indicates a pageserver bug - #[error("Read error: {0}")] - Read(PageReconstructError), + #[error("Read error")] + Read(#[source] PageReconstructError), /// Ran out of time waiting for an LSN #[error("LSN timeout: {0}")] @@ -331,11 +332,11 @@ enum PageStreamError { /// The entity required to serve the request (tenant or timeline) is not found, /// or is not found in a suitable state to serve a request. #[error("Not found: {0}")] - NotFound(std::borrow::Cow<'static, str>), + NotFound(Cow<'static, str>), /// Request asked for something that doesn't make sense, like an invalid LSN #[error("Bad request: {0}")] - BadRequest(std::borrow::Cow<'static, str>), + BadRequest(Cow<'static, str>), } impl From for PageStreamError { @@ -666,7 +667,10 @@ impl PageServerHandler { // print the all details to the log with {:#}, but for the client the // error message is enough. Do not log if shutting down, as the anyhow::Error // here includes cancellation which is not an error. 
- span.in_scope(|| error!("error reading relation or page version: {:#}", e)); + let full = utils::error::report_compact_sources(&e); + span.in_scope(|| { + error!("error reading relation or page version: {full:#}") + }); PagestreamBeMessage::Error(PagestreamErrorResponse { message: e.to_string(), }) diff --git a/pageserver/src/pgdatadir_mapping.rs b/pageserver/src/pgdatadir_mapping.rs index d9cc85319e..b65fe1eddd 100644 --- a/pageserver/src/pgdatadir_mapping.rs +++ b/pageserver/src/pgdatadir_mapping.rs @@ -13,7 +13,12 @@ use crate::repository::*; use crate::walrecord::NeonWalRecord; use anyhow::{ensure, Context}; use bytes::{Buf, Bytes}; -use pageserver_api::key::is_rel_block_key; +use pageserver_api::key::{ + dbdir_key_range, is_rel_block_key, is_slru_block_key, rel_block_to_key, rel_dir_to_key, + rel_key_range, rel_size_to_key, relmap_file_key, slru_block_to_key, slru_dir_to_key, + slru_segment_key_range, slru_segment_size_to_key, twophase_file_key, twophase_key_range, + AUX_FILES_KEY, CHECKPOINT_KEY, CONTROLFILE_KEY, DBDIR_KEY, TWOPHASEDIR_KEY, +}; use pageserver_api::reltag::{BlockNumber, RelTag, SlruKind}; use postgres_ffi::relfile_utils::{FSM_FORKNUM, VISIBILITYMAP_FORKNUM}; use postgres_ffi::BLCKSZ; @@ -22,6 +27,7 @@ use serde::{Deserialize, Serialize}; use std::collections::{hash_map, HashMap, HashSet}; use std::ops::ControlFlow; use std::ops::Range; +use strum::IntoEnumIterator; use tokio_util::sync::CancellationToken; use tracing::{debug, trace, warn}; use utils::bin_ser::DeserializeError; @@ -528,6 +534,33 @@ impl Timeline { Ok(Default::default()) } + pub(crate) async fn get_slru_keyspace( + &self, + version: Version<'_>, + ctx: &RequestContext, + ) -> Result { + let mut accum = KeySpaceAccum::new(); + + for kind in SlruKind::iter() { + let mut segments: Vec = self + .list_slru_segments(kind, version, ctx) + .await? 
+ .into_iter() + .collect(); + segments.sort_unstable(); + + for seg in segments { + let block_count = self.get_slru_segment_size(kind, seg, version, ctx).await?; + + accum.add_range( + slru_block_to_key(kind, seg, 0)..slru_block_to_key(kind, seg, block_count), + ); + } + } + + Ok(accum.to_keyspace()) + } + /// Get a list of SLRU segments pub(crate) async fn list_slru_segments( &self, @@ -1535,366 +1568,6 @@ struct SlruSegmentDirectory { static ZERO_PAGE: Bytes = Bytes::from_static(&[0u8; BLCKSZ as usize]); -// Layout of the Key address space -// -// The Key struct, used to address the underlying key-value store, consists of -// 18 bytes, split into six fields. See 'Key' in repository.rs. We need to map -// all the data and metadata keys into those 18 bytes. -// -// Principles for the mapping: -// -// - Things that are often accessed or modified together, should be close to -// each other in the key space. For example, if a relation is extended by one -// block, we create a new key-value pair for the block data, and update the -// relation size entry. Because of that, the RelSize key comes after all the -// RelBlocks of a relation: the RelSize and the last RelBlock are always next -// to each other. 
-// -// The key space is divided into four major sections, identified by the first -// byte, and the form a hierarchy: -// -// 00 Relation data and metadata -// -// DbDir () -> (dbnode, spcnode) -// Filenodemap -// RelDir -> relnode forknum -// RelBlocks -// RelSize -// -// 01 SLRUs -// -// SlruDir kind -// SlruSegBlocks segno -// SlruSegSize -// -// 02 pg_twophase -// -// 03 misc -// Controlfile -// checkpoint -// pg_version -// -// 04 aux files -// -// Below is a full list of the keyspace allocation: -// -// DbDir: -// 00 00000000 00000000 00000000 00 00000000 -// -// Filenodemap: -// 00 SPCNODE DBNODE 00000000 00 00000000 -// -// RelDir: -// 00 SPCNODE DBNODE 00000000 00 00000001 (Postgres never uses relfilenode 0) -// -// RelBlock: -// 00 SPCNODE DBNODE RELNODE FORK BLKNUM -// -// RelSize: -// 00 SPCNODE DBNODE RELNODE FORK FFFFFFFF -// -// SlruDir: -// 01 kind 00000000 00000000 00 00000000 -// -// SlruSegBlock: -// 01 kind 00000001 SEGNO 00 BLKNUM -// -// SlruSegSize: -// 01 kind 00000001 SEGNO 00 FFFFFFFF -// -// TwoPhaseDir: -// 02 00000000 00000000 00000000 00 00000000 -// -// TwoPhaseFile: -// 02 00000000 00000000 00000000 00 XID -// -// ControlFile: -// 03 00000000 00000000 00000000 00 00000000 -// -// Checkpoint: -// 03 00000000 00000000 00000000 00 00000001 -// -// AuxFiles: -// 03 00000000 00000000 00000000 00 00000002 -// - -//-- Section 01: relation data and metadata - -const DBDIR_KEY: Key = Key { - field1: 0x00, - field2: 0, - field3: 0, - field4: 0, - field5: 0, - field6: 0, -}; - -fn dbdir_key_range(spcnode: Oid, dbnode: Oid) -> Range { - Key { - field1: 0x00, - field2: spcnode, - field3: dbnode, - field4: 0, - field5: 0, - field6: 0, - }..Key { - field1: 0x00, - field2: spcnode, - field3: dbnode, - field4: 0xffffffff, - field5: 0xff, - field6: 0xffffffff, - } -} - -fn relmap_file_key(spcnode: Oid, dbnode: Oid) -> Key { - Key { - field1: 0x00, - field2: spcnode, - field3: dbnode, - field4: 0, - field5: 0, - field6: 0, - } -} - -fn 
rel_dir_to_key(spcnode: Oid, dbnode: Oid) -> Key { - Key { - field1: 0x00, - field2: spcnode, - field3: dbnode, - field4: 0, - field5: 0, - field6: 1, - } -} - -pub(crate) fn rel_block_to_key(rel: RelTag, blknum: BlockNumber) -> Key { - Key { - field1: 0x00, - field2: rel.spcnode, - field3: rel.dbnode, - field4: rel.relnode, - field5: rel.forknum, - field6: blknum, - } -} - -fn rel_size_to_key(rel: RelTag) -> Key { - Key { - field1: 0x00, - field2: rel.spcnode, - field3: rel.dbnode, - field4: rel.relnode, - field5: rel.forknum, - field6: 0xffffffff, - } -} - -fn rel_key_range(rel: RelTag) -> Range { - Key { - field1: 0x00, - field2: rel.spcnode, - field3: rel.dbnode, - field4: rel.relnode, - field5: rel.forknum, - field6: 0, - }..Key { - field1: 0x00, - field2: rel.spcnode, - field3: rel.dbnode, - field4: rel.relnode, - field5: rel.forknum + 1, - field6: 0, - } -} - -//-- Section 02: SLRUs - -fn slru_dir_to_key(kind: SlruKind) -> Key { - Key { - field1: 0x01, - field2: match kind { - SlruKind::Clog => 0x00, - SlruKind::MultiXactMembers => 0x01, - SlruKind::MultiXactOffsets => 0x02, - }, - field3: 0, - field4: 0, - field5: 0, - field6: 0, - } -} - -fn slru_block_to_key(kind: SlruKind, segno: u32, blknum: BlockNumber) -> Key { - Key { - field1: 0x01, - field2: match kind { - SlruKind::Clog => 0x00, - SlruKind::MultiXactMembers => 0x01, - SlruKind::MultiXactOffsets => 0x02, - }, - field3: 1, - field4: segno, - field5: 0, - field6: blknum, - } -} - -fn slru_segment_size_to_key(kind: SlruKind, segno: u32) -> Key { - Key { - field1: 0x01, - field2: match kind { - SlruKind::Clog => 0x00, - SlruKind::MultiXactMembers => 0x01, - SlruKind::MultiXactOffsets => 0x02, - }, - field3: 1, - field4: segno, - field5: 0, - field6: 0xffffffff, - } -} - -fn slru_segment_key_range(kind: SlruKind, segno: u32) -> Range { - let field2 = match kind { - SlruKind::Clog => 0x00, - SlruKind::MultiXactMembers => 0x01, - SlruKind::MultiXactOffsets => 0x02, - }; - - Key { - field1: 0x01, - field2, 
- field3: 1, - field4: segno, - field5: 0, - field6: 0, - }..Key { - field1: 0x01, - field2, - field3: 1, - field4: segno, - field5: 1, - field6: 0, - } -} - -//-- Section 03: pg_twophase - -const TWOPHASEDIR_KEY: Key = Key { - field1: 0x02, - field2: 0, - field3: 0, - field4: 0, - field5: 0, - field6: 0, -}; - -fn twophase_file_key(xid: TransactionId) -> Key { - Key { - field1: 0x02, - field2: 0, - field3: 0, - field4: 0, - field5: 0, - field6: xid, - } -} - -fn twophase_key_range(xid: TransactionId) -> Range { - let (next_xid, overflowed) = xid.overflowing_add(1); - - Key { - field1: 0x02, - field2: 0, - field3: 0, - field4: 0, - field5: 0, - field6: xid, - }..Key { - field1: 0x02, - field2: 0, - field3: 0, - field4: 0, - field5: u8::from(overflowed), - field6: next_xid, - } -} - -//-- Section 03: Control file -const CONTROLFILE_KEY: Key = Key { - field1: 0x03, - field2: 0, - field3: 0, - field4: 0, - field5: 0, - field6: 0, -}; - -const CHECKPOINT_KEY: Key = Key { - field1: 0x03, - field2: 0, - field3: 0, - field4: 0, - field5: 0, - field6: 1, -}; - -const AUX_FILES_KEY: Key = Key { - field1: 0x03, - field2: 0, - field3: 0, - field4: 0, - field5: 0, - field6: 2, -}; - -// Reverse mappings for a few Keys. -// These are needed by WAL redo manager. - -// AUX_FILES currently stores only data for logical replication (slots etc), and -// we don't preserve these on a branch because safekeepers can't follow timeline -// switch (and generally it likely should be optional), so ignore these. 
-pub fn is_inherited_key(key: Key) -> bool { - key != AUX_FILES_KEY -} - -pub fn is_rel_fsm_block_key(key: Key) -> bool { - key.field1 == 0x00 && key.field4 != 0 && key.field5 == FSM_FORKNUM && key.field6 != 0xffffffff -} - -pub fn is_rel_vm_block_key(key: Key) -> bool { - key.field1 == 0x00 - && key.field4 != 0 - && key.field5 == VISIBILITYMAP_FORKNUM - && key.field6 != 0xffffffff -} - -pub fn key_to_slru_block(key: Key) -> anyhow::Result<(SlruKind, u32, BlockNumber)> { - Ok(match key.field1 { - 0x01 => { - let kind = match key.field2 { - 0x00 => SlruKind::Clog, - 0x01 => SlruKind::MultiXactMembers, - 0x02 => SlruKind::MultiXactOffsets, - _ => anyhow::bail!("unrecognized slru kind 0x{:02x}", key.field2), - }; - let segno = key.field4; - let blknum = key.field6; - - (kind, segno, blknum) - } - _ => anyhow::bail!("unexpected value kind 0x{:02x}", key.field1), - }) -} - -fn is_slru_block_key(key: Key) -> bool { - key.field1 == 0x01 // SLRU-related - && key.field3 == 0x00000001 // but not SlruDir - && key.field6 != 0xffffffff // and not SlruSegSize -} - #[allow(clippy::bool_assert_comparison)] #[cfg(test)] mod tests { diff --git a/pageserver/src/tenant.rs b/pageserver/src/tenant.rs index 1d9b91c9ce..7bb5881aab 100644 --- a/pageserver/src/tenant.rs +++ b/pageserver/src/tenant.rs @@ -91,7 +91,6 @@ use std::fs; use std::fs::File; use std::io; use std::ops::Bound::Included; -use std::process::Stdio; use std::sync::atomic::AtomicU64; use std::sync::atomic::Ordering; use std::sync::Arc; @@ -628,9 +627,15 @@ impl Tenant { deletion_queue_client, )); + // The attach task will carry a GateGuard, so that shutdown() reliably waits for it to drop out if + // we shut down while attaching. 
+ let Ok(attach_gate_guard) = tenant.gate.enter() else { + // We just created the Tenant: nothing else can have shut it down yet + unreachable!(); + }; + // Do all the hard work in the background let tenant_clone = Arc::clone(&tenant); - let ctx = ctx.detached_child(TaskKind::Attach, DownloadBehavior::Warn); task_mgr::spawn( &tokio::runtime::Handle::current(), @@ -640,6 +645,8 @@ impl Tenant { "attach tenant", false, async move { + let _gate_guard = attach_gate_guard; + // Is this tenant being spawned as part of process startup? let starting_up = init_order.is_some(); scopeguard::defer! { @@ -814,7 +821,7 @@ impl Tenant { SpawnMode::Create => None, SpawnMode::Normal => {Some(TENANT.attach.start_timer())} }; - match tenant_clone.attach(preload, &ctx).await { + match tenant_clone.attach(preload, mode, &ctx).await { Ok(()) => { info!("attach finished, activating"); if let Some(t)= attach_timer {t.observe_duration();} @@ -901,15 +908,20 @@ impl Tenant { async fn attach( self: &Arc, preload: Option, + mode: SpawnMode, ctx: &RequestContext, ) -> anyhow::Result<()> { span::debug_assert_current_span_has_tenant_id(); failpoint_support::sleep_millis_async!("before-attaching-tenant"); - let preload = match preload { - Some(p) => p, - None => { + let preload = match (preload, mode) { + (Some(p), _) => p, + (None, SpawnMode::Create) => TenantPreload { + deleting: false, + timelines: HashMap::new(), + }, + (None, SpawnMode::Normal) => { // Deprecated dev mode: load from local disk state instead of remote storage // https://github.com/neondatabase/neon/issues/5624 return self.load_local(ctx).await; @@ -1017,7 +1029,10 @@ impl Tenant { // IndexPart is the source of truth. 
self.clean_up_timelines(&existent_timelines)?; - failpoint_support::sleep_millis_async!("attach-before-activate", &self.cancel); + fail::fail_point!("attach-before-activate", |_| { + anyhow::bail!("attach-before-activate"); + }); + failpoint_support::sleep_millis_async!("attach-before-activate-sleep", &self.cancel); info!("Done"); @@ -1681,9 +1696,13 @@ impl Tenant { ctx: &RequestContext, ) -> Result, CreateTimelineError> { if !self.is_active() { - return Err(CreateTimelineError::Other(anyhow::anyhow!( - "Cannot create timelines on inactive tenant" - ))); + if matches!(self.current_state(), TenantState::Stopping { .. }) { + return Err(CreateTimelineError::ShuttingDown); + } else { + return Err(CreateTimelineError::Other(anyhow::anyhow!( + "Cannot create timelines on inactive tenant" + ))); + } } let _gate = self @@ -3759,27 +3778,25 @@ async fn run_initdb( .env_clear() .env("LD_LIBRARY_PATH", &initdb_lib_dir) .env("DYLD_LIBRARY_PATH", &initdb_lib_dir) - .stdout(Stdio::piped()) - .stderr(Stdio::piped()) - // If the `select!` below doesn't finish the `wait_with_output`, - // let the task get `wait()`ed for asynchronously by tokio. - // This means there is a slim chance we can go over the INIT_DB_SEMAPHORE. - // TODO: fix for this is non-trivial, see - // https://github.com/neondatabase/neon/pull/5921#pullrequestreview-1750858021 - // - .kill_on_drop(true) .spawn()?; - tokio::select! { - initdb_output = initdb_command.wait_with_output() => { - let initdb_output = initdb_output?; - if !initdb_output.status.success() { - return Err(InitdbError::Failed(initdb_output.status, initdb_output.stderr)); - } - } - _ = cancel.cancelled() => { - return Err(InitdbError::Cancelled); - } + // Ideally we'd select here with the cancellation token, but the problem is that + // we can't safely terminate initdb: it launches processes of its own, and killing + // initdb doesn't kill them. After we return from this function, we want the target + // directory to be able to be cleaned up. 
+ // See https://github.com/neondatabase/neon/issues/6385 + let initdb_output = initdb_command.wait_with_output().await?; + if !initdb_output.status.success() { + return Err(InitdbError::Failed( + initdb_output.status, + initdb_output.stderr, + )); + } + + // This isn't true cancellation support, see above. Still return an error to + // excercise the cancellation code path. + if cancel.is_cancelled() { + return Err(InitdbError::Cancelled); } Ok(()) @@ -4035,7 +4052,7 @@ pub(crate) mod harness { .instrument(info_span!("try_load_preload", tenant_id=%self.tenant_shard_id.tenant_id, shard_id=%self.tenant_shard_id.shard_slug())) .await?; tenant - .attach(Some(preload), ctx) + .attach(Some(preload), SpawnMode::Normal, ctx) .instrument(info_span!("try_load", tenant_id=%self.tenant_shard_id.tenant_id, shard_id=%self.tenant_shard_id.shard_slug())) .await?; } diff --git a/pageserver/src/tenant/block_io.rs b/pageserver/src/tenant/block_io.rs index 0617017528..1b6bccc120 100644 --- a/pageserver/src/tenant/block_io.rs +++ b/pageserver/src/tenant/block_io.rs @@ -5,10 +5,10 @@ use super::ephemeral_file::EphemeralFile; use super::storage_layer::delta_layer::{Adapter, DeltaLayerInner}; use crate::context::RequestContext; -use crate::page_cache::{self, PageReadGuard, ReadBufResult, PAGE_SZ}; +use crate::page_cache::{self, PageReadGuard, PageWriteGuard, ReadBufResult, PAGE_SZ}; use crate::virtual_file::VirtualFile; use bytes::Bytes; -use std::ops::{Deref, DerefMut}; +use std::ops::Deref; /// This is implemented by anything that can read 8 kB (PAGE_SZ) /// blocks, using the page cache @@ -39,6 +39,8 @@ pub enum BlockLease<'a> { EphemeralFileMutableTail(&'a [u8; PAGE_SZ]), #[cfg(test)] Arc(std::sync::Arc<[u8; PAGE_SZ]>), + #[cfg(test)] + Vec(Vec), } impl From> for BlockLease<'static> { @@ -63,6 +65,10 @@ impl<'a> Deref for BlockLease<'a> { BlockLease::EphemeralFileMutableTail(v) => v, #[cfg(test)] BlockLease::Arc(v) => v.deref(), + #[cfg(test)] + BlockLease::Vec(v) => { + 
TryFrom::try_from(&v[..]).expect("caller must ensure that v has PAGE_SZ") + } } } } @@ -169,10 +175,14 @@ impl FileBlockReader { } /// Read a page from the underlying file into given buffer. - async fn fill_buffer(&self, buf: &mut [u8], blkno: u32) -> Result<(), std::io::Error> { + async fn fill_buffer( + &self, + buf: PageWriteGuard<'static>, + blkno: u32, + ) -> Result, std::io::Error> { assert!(buf.len() == PAGE_SZ); self.file - .read_exact_at(buf, blkno as u64 * PAGE_SZ as u64) + .read_exact_at_page(buf, blkno as u64 * PAGE_SZ as u64) .await } /// Read a block. @@ -196,9 +206,9 @@ impl FileBlockReader { ) })? { ReadBufResult::Found(guard) => Ok(guard.into()), - ReadBufResult::NotFound(mut write_guard) => { + ReadBufResult::NotFound(write_guard) => { // Read the page from disk into the buffer - self.fill_buffer(write_guard.deref_mut(), blknum).await?; + let write_guard = self.fill_buffer(write_guard, blknum).await?; Ok(write_guard.mark_valid().into()) } } diff --git a/pageserver/src/tenant/delete.rs b/pageserver/src/tenant/delete.rs index ecffd4e6c1..97de0cdcf9 100644 --- a/pageserver/src/tenant/delete.rs +++ b/pageserver/src/tenant/delete.rs @@ -409,7 +409,10 @@ impl DeleteTenantFlow { .await .expect("cant be stopping or broken"); - tenant.attach(preload, ctx).await.context("attach")?; + tenant + .attach(preload, super::SpawnMode::Normal, ctx) + .await + .context("attach")?; Self::background( guard, diff --git a/pageserver/src/tenant/ephemeral_file.rs b/pageserver/src/tenant/ephemeral_file.rs index 591eacd104..6b8cd77d78 100644 --- a/pageserver/src/tenant/ephemeral_file.rs +++ b/pageserver/src/tenant/ephemeral_file.rs @@ -5,11 +5,11 @@ use crate::config::PageServerConf; use crate::context::RequestContext; use crate::page_cache::{self, PAGE_SZ}; use crate::tenant::block_io::{BlockCursor, BlockLease, BlockReader}; -use crate::virtual_file::VirtualFile; +use crate::virtual_file::{self, VirtualFile}; use camino::Utf8PathBuf; use 
pageserver_api::shard::TenantShardId; use std::cmp::min; -use std::fs::OpenOptions; + use std::io::{self, ErrorKind}; use std::ops::DerefMut; use std::sync::atomic::AtomicU64; @@ -47,7 +47,10 @@ impl EphemeralFile { let file = VirtualFile::open_with_options( &filename, - OpenOptions::new().read(true).write(true).create(true), + virtual_file::OpenOptions::new() + .read(true) + .write(true) + .create(true), ) .await?; @@ -89,11 +92,10 @@ impl EphemeralFile { page_cache::ReadBufResult::Found(guard) => { return Ok(BlockLease::PageReadGuard(guard)) } - page_cache::ReadBufResult::NotFound(mut write_guard) => { - let buf: &mut [u8] = write_guard.deref_mut(); - debug_assert_eq!(buf.len(), PAGE_SZ); - self.file - .read_exact_at(&mut buf[..], blknum as u64 * PAGE_SZ as u64) + page_cache::ReadBufResult::NotFound(write_guard) => { + let write_guard = self + .file + .read_exact_at_page(write_guard, blknum as u64 * PAGE_SZ as u64) .await?; let read_guard = write_guard.mark_valid(); return Ok(BlockLease::PageReadGuard(read_guard)); diff --git a/pageserver/src/tenant/layer_map.rs b/pageserver/src/tenant/layer_map.rs index 9b6225501f..c31d401e84 100644 --- a/pageserver/src/tenant/layer_map.rs +++ b/pageserver/src/tenant/layer_map.rs @@ -283,15 +283,15 @@ impl LayerMap { /// /// This is used for garbage collection, to determine if an old layer can /// be deleted. - pub fn image_layer_exists(&self, key: &Range, lsn: &Range) -> Result { + pub fn image_layer_exists(&self, key: &Range, lsn: &Range) -> bool { if key.is_empty() { // Vacuously true. There's a newer image for all 0 of the kerys in the range. 
- return Ok(true); + return true; } let version = match self.historic.get().unwrap().get_version(lsn.end.0 - 1) { Some(v) => v, - None => return Ok(false), + None => return false, }; let start = key.start.to_i128(); @@ -304,17 +304,17 @@ impl LayerMap { // Check the start is covered if !layer_covers(version.image_coverage.query(start)) { - return Ok(false); + return false; } // Check after all changes of coverage for (_, change_val) in version.image_coverage.range(start..end) { if !layer_covers(change_val) { - return Ok(false); + return false; } } - Ok(true) + true } pub fn iter_historic_layers(&self) -> impl '_ + Iterator> { @@ -325,18 +325,14 @@ impl LayerMap { /// Divide the whole given range of keys into sub-ranges based on the latest /// image layer that covers each range at the specified lsn (inclusive). /// This is used when creating new image layers. - /// - // FIXME: clippy complains that the result type is very complex. She's probably - // right... - #[allow(clippy::type_complexity)] pub fn image_coverage( &self, key_range: &Range, lsn: Lsn, - ) -> Result, Option>)>> { + ) -> Vec<(Range, Option>)> { let version = match self.historic.get().unwrap().get_version(lsn.0) { Some(v) => v, - None => return Ok(vec![]), + None => return vec![], }; let start = key_range.start.to_i128(); @@ -359,7 +355,7 @@ impl LayerMap { let kr = Key::from_i128(current_key)..Key::from_i128(end); coverage.push((kr, current_val.take())); - Ok(coverage) + coverage } pub fn is_l0(layer: &PersistentLayerDesc) -> bool { @@ -410,24 +406,19 @@ impl LayerMap { /// This number is used to compute the largest number of deltas that /// we'll need to visit for any page reconstruction in this region. /// We use this heuristic to decide whether to create an image layer. 
- pub fn count_deltas( - &self, - key: &Range, - lsn: &Range, - limit: Option, - ) -> Result { + pub fn count_deltas(&self, key: &Range, lsn: &Range, limit: Option) -> usize { // We get the delta coverage of the region, and for each part of the coverage // we recurse right underneath the delta. The recursion depth is limited by // the largest result this function could return, which is in practice between // 3 and 10 (since we usually try to create an image when the number gets larger). if lsn.is_empty() || key.is_empty() || limit == Some(0) { - return Ok(0); + return 0; } let version = match self.historic.get().unwrap().get_version(lsn.end.0 - 1) { Some(v) => v, - None => return Ok(0), + None => return 0, }; let start = key.start.to_i128(); @@ -448,8 +439,7 @@ impl LayerMap { if !kr.is_empty() { let base_count = Self::is_reimage_worthy(&val, key) as usize; let new_limit = limit.map(|l| l - base_count); - let max_stacked_deltas_underneath = - self.count_deltas(&kr, &lr, new_limit)?; + let max_stacked_deltas_underneath = self.count_deltas(&kr, &lr, new_limit); max_stacked_deltas = std::cmp::max( max_stacked_deltas, base_count + max_stacked_deltas_underneath, @@ -471,7 +461,7 @@ impl LayerMap { if !kr.is_empty() { let base_count = Self::is_reimage_worthy(&val, key) as usize; let new_limit = limit.map(|l| l - base_count); - let max_stacked_deltas_underneath = self.count_deltas(&kr, &lr, new_limit)?; + let max_stacked_deltas_underneath = self.count_deltas(&kr, &lr, new_limit); max_stacked_deltas = std::cmp::max( max_stacked_deltas, base_count + max_stacked_deltas_underneath, @@ -480,7 +470,7 @@ impl LayerMap { } } - Ok(max_stacked_deltas) + max_stacked_deltas } /// Count how many reimage-worthy layers we need to visit for given key-lsn pair. 
@@ -592,10 +582,7 @@ impl LayerMap { if limit == Some(difficulty) { break; } - for (img_range, last_img) in self - .image_coverage(range, lsn) - .expect("why would this err?") - { + for (img_range, last_img) in self.image_coverage(range, lsn) { if limit == Some(difficulty) { break; } @@ -606,9 +593,7 @@ impl LayerMap { }; if img_lsn < lsn { - let num_deltas = self - .count_deltas(&img_range, &(img_lsn..lsn), limit) - .expect("why would this err lol?"); + let num_deltas = self.count_deltas(&img_range, &(img_lsn..lsn), limit); difficulty = std::cmp::max(difficulty, num_deltas); } } diff --git a/pageserver/src/tenant/mgr.rs b/pageserver/src/tenant/mgr.rs index 84c7a20247..32535e0134 100644 --- a/pageserver/src/tenant/mgr.rs +++ b/pageserver/src/tenant/mgr.rs @@ -7,6 +7,7 @@ use pageserver_api::models::ShardParameters; use pageserver_api::shard::{ShardCount, ShardIdentity, ShardNumber, TenantShardId}; use rand::{distributions::Alphanumeric, Rng}; use std::borrow::Cow; +use std::cmp::Ordering; use std::collections::{BTreeMap, HashMap}; use std::ops::Deref; use std::sync::Arc; @@ -32,7 +33,8 @@ use crate::deletion_queue::DeletionQueueClient; use crate::metrics::{TENANT, TENANT_MANAGER as METRICS}; use crate::task_mgr::{self, TaskKind}; use crate::tenant::config::{ - AttachedLocationConfig, AttachmentMode, LocationConf, LocationMode, TenantConfOpt, + AttachedLocationConfig, AttachmentMode, LocationConf, LocationMode, SecondaryLocationConfig, + TenantConfOpt, }; use crate::tenant::delete::DeleteTenantFlow; use crate::tenant::span::debug_assert_current_span_has_tenant_id; @@ -466,6 +468,26 @@ pub async fn init_tenant_mgr( // We have a generation map: treat it as the authority for whether // this tenant is really attached. 
if let Some(gen) = generations.get(&tenant_shard_id) { + if let LocationMode::Attached(attached) = &location_conf.mode { + if attached.generation > *gen { + tracing::error!(tenant_id=%tenant_shard_id.tenant_id, shard_id=%tenant_shard_id.shard_slug(), + "Control plane gave decreasing generation ({gen:?}) in re-attach response for tenant that was attached in generation {:?}, demoting to secondary", + attached.generation + ); + + // We cannot safely attach this tenant given a bogus generation number, but let's avoid throwing away + // local disk content: demote to secondary rather than detaching. + tenants.insert( + tenant_shard_id, + TenantSlot::Secondary(SecondaryTenant::new( + tenant_shard_id, + location_conf.shard, + location_conf.tenant_conf, + &SecondaryLocationConfig { warm: false }, + )), + ); + } + } *gen } else { match &location_conf.mode { @@ -721,7 +743,7 @@ async fn shutdown_all_tenants0(tenants: &std::sync::RwLock) { tokio::select! { Some(joined) = join_set.join_next() => { match joined { - Ok(()) => {} + Ok(()) => {}, Err(join_error) if join_error.is_cancelled() => { unreachable!("we are not cancelling any of the tasks"); } @@ -882,7 +904,7 @@ impl TenantManager { tenant_shard_id: TenantShardId, new_location_config: LocationConf, flush: Option, - spawn_mode: SpawnMode, + mut spawn_mode: SpawnMode, ctx: &RequestContext, ) -> Result>, UpsertLocationError> { debug_assert_current_span_has_tenant_id(); @@ -902,19 +924,29 @@ impl TenantManager { tenant_map_peek_slot(&locked, &tenant_shard_id, TenantSlotPeekMode::Write)?; match (&new_location_config.mode, peek_slot) { (LocationMode::Attached(attach_conf), Some(TenantSlot::Attached(tenant))) => { - if attach_conf.generation == tenant.generation { - // A transition from Attached to Attached in the same generation, we may - // take our fast path and just provide the updated configuration - // to the tenant. 
- tenant.set_new_location_config( - AttachedTenantConf::try_from(new_location_config.clone()) - .map_err(UpsertLocationError::BadRequest)?, - ); + match attach_conf.generation.cmp(&tenant.generation) { + Ordering::Equal => { + // A transition from Attached to Attached in the same generation, we may + // take our fast path and just provide the updated configuration + // to the tenant. + tenant.set_new_location_config( + AttachedTenantConf::try_from(new_location_config.clone()) + .map_err(UpsertLocationError::BadRequest)?, + ); - Some(FastPathModified::Attached(tenant.clone())) - } else { - // Different generations, fall through to general case - None + Some(FastPathModified::Attached(tenant.clone())) + } + Ordering::Less => { + return Err(UpsertLocationError::BadRequest(anyhow::anyhow!( + "Generation {:?} is less than existing {:?}", + attach_conf.generation, + tenant.generation + ))); + } + Ordering::Greater => { + // Generation advanced, fall through to general case of replacing `Tenant` object + None + } } } ( @@ -1019,6 +1051,12 @@ impl TenantManager { } } slot_guard.drop_old_value().expect("We just shut it down"); + + // Edge case: if we were called with SpawnMode::Create, but a Tenant already existed, then + // the caller thinks they're creating but the tenant already existed. We must switch to + // Normal mode so that when starting this Tenant we properly probe remote storage for timelines, + // rather than assuming it to be empty. 
+ spawn_mode = SpawnMode::Normal; } Some(TenantSlot::Secondary(state)) => { info!("Shutting down secondary tenant"); @@ -1102,14 +1140,46 @@ impl TenantManager { None }; - slot_guard.upsert(new_slot).map_err(|e| match e { - TenantSlotUpsertError::InternalError(e) => { - UpsertLocationError::Other(anyhow::anyhow!(e)) + match slot_guard.upsert(new_slot) { + Err(TenantSlotUpsertError::InternalError(e)) => { + Err(UpsertLocationError::Other(anyhow::anyhow!(e))) } - TenantSlotUpsertError::MapState(e) => UpsertLocationError::Unavailable(e), - })?; + Err(TenantSlotUpsertError::MapState(e)) => Err(UpsertLocationError::Unavailable(e)), + Err(TenantSlotUpsertError::ShuttingDown((new_slot, _completion))) => { + // If we just called tenant_spawn() on a new tenant, and can't insert it into our map, then + // we must not leak it: this would violate the invariant that after shutdown_all_tenants, all tenants + // are shutdown. + // + // We must shut it down inline here. + match new_slot { + TenantSlot::InProgress(_) => { + // Unreachable because we never insert an InProgress + unreachable!() + } + TenantSlot::Attached(tenant) => { + let (_guard, progress) = utils::completion::channel(); + info!("Shutting down just-spawned tenant, because tenant manager is shut down"); + match tenant.shutdown(progress, false).await { + Ok(()) => { + info!("Finished shutting down just-spawned tenant"); + } + Err(barrier) => { + info!("Shutdown already in progress, waiting for it to complete"); + barrier.wait().await; + } + } + } + TenantSlot::Secondary(secondary_tenant) => { + secondary_tenant.shutdown().await; + } + } - Ok(attached_tenant) + Err(UpsertLocationError::Unavailable( + TenantMapError::ShuttingDown, + )) + } + Ok(()) => Ok(attached_tenant), + } } /// Resetting a tenant is equivalent to detaching it, then attaching it again with the same @@ -1728,14 +1798,31 @@ pub(crate) enum TenantSlotError { /// Superset of TenantMapError: issues that can occur when using a SlotGuard /// to insert a 
new value. -#[derive(Debug, thiserror::Error)] -pub enum TenantSlotUpsertError { +#[derive(thiserror::Error)] +pub(crate) enum TenantSlotUpsertError { /// An error where the slot is in an unexpected state, indicating a code bug #[error("Internal error updating Tenant")] InternalError(Cow<'static, str>), #[error(transparent)] - MapState(#[from] TenantMapError), + MapState(TenantMapError), + + // If we encounter TenantManager shutdown during upsert, we must carry the Completion + // from the SlotGuard, so that the caller can hold it while they clean up: otherwise + // TenantManager shutdown might race ahead before we're done cleaning up any Tenant that + // was protected by the SlotGuard. + #[error("Shutting down")] + ShuttingDown((TenantSlot, utils::completion::Completion)), +} + +impl std::fmt::Debug for TenantSlotUpsertError { + fn fmt(&self, f: &mut std::fmt::Formatter) -> std::fmt::Result { + match self { + Self::InternalError(reason) => write!(f, "Internal Error {reason}"), + Self::MapState(map_error) => write!(f, "Tenant map state: {map_error:?}"), + Self::ShuttingDown(_completion) => write!(f, "Tenant map shutting down"), + } + } } #[derive(Debug, thiserror::Error)] @@ -1784,7 +1871,7 @@ pub struct SlotGuard { /// [`TenantSlot::InProgress`] carries the corresponding Barrier: it will /// release any waiters as soon as this SlotGuard is dropped. 
- _completion: utils::completion::Completion, + completion: utils::completion::Completion, } impl SlotGuard { @@ -1797,7 +1884,7 @@ impl SlotGuard { tenant_shard_id, old_value, upserted: false, - _completion: completion, + completion, } } @@ -1830,9 +1917,16 @@ impl SlotGuard { } let m = match &mut *locked { - TenantsMap::Initializing => return Err(TenantMapError::StillInitializing.into()), + TenantsMap::Initializing => { + return Err(TenantSlotUpsertError::MapState( + TenantMapError::StillInitializing, + )) + } TenantsMap::ShuttingDown(_) => { - return Err(TenantMapError::ShuttingDown.into()); + return Err(TenantSlotUpsertError::ShuttingDown(( + new_value, + self.completion.clone(), + ))); } TenantsMap::Open(m) => m, }; @@ -1880,7 +1974,9 @@ impl SlotGuard { Err(TenantSlotUpsertError::InternalError(_)) => { // We already logged the error, nothing else we can do. } - Err(TenantSlotUpsertError::MapState(_)) => { + Err( + TenantSlotUpsertError::MapState(_) | TenantSlotUpsertError::ShuttingDown(_), + ) => { // If the map is shutting down, we need not replace anything } Ok(()) => {} @@ -1978,18 +2074,22 @@ fn tenant_map_peek_slot<'a>( tenant_shard_id: &TenantShardId, mode: TenantSlotPeekMode, ) -> Result, TenantMapError> { - let m = match tenants.deref() { - TenantsMap::Initializing => return Err(TenantMapError::StillInitializing), + match tenants.deref() { + TenantsMap::Initializing => Err(TenantMapError::StillInitializing), TenantsMap::ShuttingDown(m) => match mode { - TenantSlotPeekMode::Read => m, - TenantSlotPeekMode::Write => { - return Err(TenantMapError::ShuttingDown); - } + TenantSlotPeekMode::Read => Ok(Some( + // When reading in ShuttingDown state, we must translate None results + // into a ShuttingDown error, because absence of a tenant shard ID in the map + // isn't a reliable indicator of the tenant being gone: it might have been + // InProgress when shutdown started, and cleaned up from that state such + // that it's now no longer in the map. 
Callers will have to wait until + // we next start up to get a proper answer. This avoids incorrect 404 API responses. + m.get(tenant_shard_id).ok_or(TenantMapError::ShuttingDown)?, + )), + TenantSlotPeekMode::Write => Err(TenantMapError::ShuttingDown), }, - TenantsMap::Open(m) => m, - }; - - Ok(m.get(tenant_shard_id)) + TenantsMap::Open(m) => Ok(m.get(tenant_shard_id)), + } } enum TenantSlotAcquireMode { diff --git a/pageserver/src/tenant/remote_timeline_client.rs b/pageserver/src/tenant/remote_timeline_client.rs index 1b5f861c90..80ff5c9a2d 100644 --- a/pageserver/src/tenant/remote_timeline_client.rs +++ b/pageserver/src/tenant/remote_timeline_client.rs @@ -257,6 +257,8 @@ pub(crate) const FAILED_UPLOAD_WARN_THRESHOLD: u32 = 3; pub(crate) const INITDB_PATH: &str = "initdb.tar.zst"; +pub(crate) const INITDB_PRESERVED_PATH: &str = "initdb-preserved.tar.zst"; + /// Default buffer size when interfacing with [`tokio::fs::File`]. pub(crate) const BUFFER_SIZE: usize = 32 * 1024; @@ -1066,6 +1068,28 @@ impl RemoteTimelineClient { Ok(()) } + pub(crate) async fn preserve_initdb_archive( + self: &Arc, + tenant_id: &TenantId, + timeline_id: &TimelineId, + cancel: &CancellationToken, + ) -> anyhow::Result<()> { + backoff::retry( + || async { + upload::preserve_initdb_archive(&self.storage_impl, tenant_id, timeline_id, cancel) + .await + }, + |_e| false, + FAILED_DOWNLOAD_WARN_THRESHOLD, + FAILED_REMOTE_OP_RETRIES, + "preserve_initdb_tar_zst", + backoff::Cancel::new(cancel.clone(), || anyhow::anyhow!("Cancelled!")), + ) + .await + .context("backing up initdb archive")?; + Ok(()) + } + /// Prerequisites: UploadQueue should be in stopped state and deleted_at should be successfuly set. /// The function deletes layer files one by one, then lists the prefix to see if we leaked something /// deletes leaked files if any and proceeds with deletion of index file at the end. 
@@ -1101,6 +1125,14 @@ impl RemoteTimelineClient { let layer_deletion_count = layers.len(); self.deletion_queue_client.push_immediate(layers).await?; + // Delete the initdb.tar.zst, which is not always present, but deletion attempts of + // inexistant objects are not considered errors. + let initdb_path = + remote_initdb_archive_path(&self.tenant_shard_id.tenant_id, &self.timeline_id); + self.deletion_queue_client + .push_immediate(vec![initdb_path]) + .await?; + // Do not delete index part yet, it is needed for possible retry. If we remove it first // and retry will arrive to different pageserver there wont be any traces of it on remote storage let timeline_storage_path = remote_timeline_path(&self.tenant_shard_id, &self.timeline_id); @@ -1148,10 +1180,8 @@ impl RemoteTimelineClient { if p == &latest_index { return false; } - if let Some(name) = p.object_name() { - if name == INITDB_PATH { - return false; - } + if p.object_name() == Some(INITDB_PRESERVED_PATH) { + return false; } true }) @@ -1724,6 +1754,16 @@ pub fn remote_initdb_archive_path(tenant_id: &TenantId, timeline_id: &TimelineId .expect("Failed to construct path") } +pub fn remote_initdb_preserved_archive_path( + tenant_id: &TenantId, + timeline_id: &TimelineId, +) -> RemotePath { + RemotePath::from_string(&format!( + "tenants/{tenant_id}/{TIMELINES_SEGMENT_NAME}/{timeline_id}/{INITDB_PRESERVED_PATH}" + )) + .expect("Failed to construct path") +} + pub fn remote_index_path( tenant_shard_id: &TenantShardId, timeline_id: &TimelineId, diff --git a/pageserver/src/tenant/remote_timeline_client/download.rs b/pageserver/src/tenant/remote_timeline_client/download.rs index d3956163c8..4309c683e2 100644 --- a/pageserver/src/tenant/remote_timeline_client/download.rs +++ b/pageserver/src/tenant/remote_timeline_client/download.rs @@ -32,7 +32,8 @@ use utils::id::TimelineId; use super::index::{IndexPart, LayerFileMetadata}; use super::{ parse_remote_index_path, remote_index_path, remote_initdb_archive_path, - 
FAILED_DOWNLOAD_WARN_THRESHOLD, FAILED_REMOTE_OP_RETRIES, INITDB_PATH, + remote_initdb_preserved_archive_path, FAILED_DOWNLOAD_WARN_THRESHOLD, FAILED_REMOTE_OP_RETRIES, + INITDB_PATH, }; /// @@ -430,6 +431,9 @@ pub(crate) async fn download_initdb_tar_zst( let remote_path = remote_initdb_archive_path(&tenant_shard_id.tenant_id, timeline_id); + let remote_preserved_path = + remote_initdb_preserved_archive_path(&tenant_shard_id.tenant_id, timeline_id); + let timeline_path = conf.timelines_path(tenant_shard_id); if !timeline_path.exists() { @@ -456,8 +460,16 @@ pub(crate) async fn download_initdb_tar_zst( .with_context(|| format!("tempfile creation {temp_path}")) .map_err(DownloadError::Other)?; - let download = - download_cancellable(&cancel_inner, storage.download(&remote_path)).await?; + let download = match download_cancellable(&cancel_inner, storage.download(&remote_path)) + .await + { + Ok(dl) => dl, + Err(DownloadError::NotFound) => { + download_cancellable(&cancel_inner, storage.download(&remote_preserved_path)) + .await? 
+ } + Err(other) => Err(other)?, + }; let mut download = tokio_util::io::StreamReader::new(download.download_stream); let mut writer = tokio::io::BufWriter::with_capacity(8 * 1024, file); diff --git a/pageserver/src/tenant/remote_timeline_client/upload.rs b/pageserver/src/tenant/remote_timeline_client/upload.rs index 11c6956875..58d95f75c2 100644 --- a/pageserver/src/tenant/remote_timeline_client/upload.rs +++ b/pageserver/src/tenant/remote_timeline_client/upload.rs @@ -13,8 +13,8 @@ use super::Generation; use crate::{ config::PageServerConf, tenant::remote_timeline_client::{ - index::IndexPart, remote_index_path, remote_initdb_archive_path, remote_path, - upload_cancellable, + index::IndexPart, remote_index_path, remote_initdb_archive_path, + remote_initdb_preserved_archive_path, remote_path, upload_cancellable, }, }; use remote_storage::GenericRemoteStorage; @@ -144,3 +144,16 @@ pub(crate) async fn upload_initdb_dir( .await .with_context(|| format!("upload initdb dir for '{tenant_id} / {timeline_id}'")) } + +pub(crate) async fn preserve_initdb_archive( + storage: &GenericRemoteStorage, + tenant_id: &TenantId, + timeline_id: &TimelineId, + cancel: &CancellationToken, +) -> anyhow::Result<()> { + let source_path = remote_initdb_archive_path(tenant_id, timeline_id); + let dest_path = remote_initdb_preserved_archive_path(tenant_id, timeline_id); + upload_cancellable(cancel, storage.copy_object(&source_path, &dest_path)) + .await + .with_context(|| format!("backing up initdb archive for '{tenant_id} / {timeline_id}'")) +} diff --git a/pageserver/src/tenant/storage_layer/delta_layer.rs b/pageserver/src/tenant/storage_layer/delta_layer.rs index 4ded6d6a8d..3a445ef71e 100644 --- a/pageserver/src/tenant/storage_layer/delta_layer.rs +++ b/pageserver/src/tenant/storage_layer/delta_layer.rs @@ -36,7 +36,7 @@ use crate::tenant::block_io::{BlockBuf, BlockCursor, BlockLease, BlockReader, Fi use crate::tenant::disk_btree::{DiskBtreeBuilder, DiskBtreeReader, VisitDirection}; use 
crate::tenant::storage_layer::{Layer, ValueReconstructResult, ValueReconstructState}; use crate::tenant::Timeline; -use crate::virtual_file::VirtualFile; +use crate::virtual_file::{self, VirtualFile}; use crate::{walrecord, TEMP_FILE_SUFFIX}; use crate::{DELTA_FILE_MAGIC, STORAGE_FORMAT_VERSION}; use anyhow::{bail, ensure, Context, Result}; @@ -649,7 +649,7 @@ impl DeltaLayer { { let file = VirtualFile::open_with_options( path, - &*std::fs::OpenOptions::new().read(true).write(true), + virtual_file::OpenOptions::new().read(true).write(true), ) .await .with_context(|| format!("Failed to open file '{}'", path))?; diff --git a/pageserver/src/tenant/storage_layer/image_layer.rs b/pageserver/src/tenant/storage_layer/image_layer.rs index f03c7642eb..c62e6aed51 100644 --- a/pageserver/src/tenant/storage_layer/image_layer.rs +++ b/pageserver/src/tenant/storage_layer/image_layer.rs @@ -34,7 +34,7 @@ use crate::tenant::storage_layer::{ LayerAccessStats, ValueReconstructResult, ValueReconstructState, }; use crate::tenant::Timeline; -use crate::virtual_file::VirtualFile; +use crate::virtual_file::{self, VirtualFile}; use crate::{IMAGE_FILE_MAGIC, STORAGE_FORMAT_VERSION, TEMP_FILE_SUFFIX}; use anyhow::{bail, ensure, Context, Result}; use bytes::Bytes; @@ -327,7 +327,7 @@ impl ImageLayer { { let file = VirtualFile::open_with_options( path, - &*std::fs::OpenOptions::new().read(true).write(true), + virtual_file::OpenOptions::new().read(true).write(true), ) .await .with_context(|| format!("Failed to open file '{}'", path))?; @@ -492,11 +492,15 @@ impl ImageLayerWriterInner { }, ); info!("new image layer {path}"); - let mut file = VirtualFile::open_with_options( - &path, - std::fs::OpenOptions::new().write(true).create_new(true), - ) - .await?; + let mut file = { + VirtualFile::open_with_options( + &path, + virtual_file::OpenOptions::new() + .write(true) + .create_new(true), + ) + .await? 
+ }; // make room for the header block file.seek(SeekFrom::Start(PAGE_SZ as u64)).await?; let blob_writer = BlobWriter::new(file, PAGE_SZ as u64); diff --git a/pageserver/src/tenant/tasks.rs b/pageserver/src/tenant/tasks.rs index 2b2fcc7711..5f39c46a84 100644 --- a/pageserver/src/tenant/tasks.rs +++ b/pageserver/src/tenant/tasks.rs @@ -9,6 +9,7 @@ use crate::context::{DownloadBehavior, RequestContext}; use crate::metrics::TENANT_TASK_EVENTS; use crate::task_mgr; use crate::task_mgr::{TaskKind, BACKGROUND_RUNTIME}; +use crate::tenant::timeline::CompactionError; use crate::tenant::{Tenant, TenantState}; use tokio_util::sync::CancellationToken; use tracing::*; @@ -181,8 +182,11 @@ async fn compaction_loop(tenant: Arc, cancel: CancellationToken) { ); error_run_count += 1; let wait_duration = Duration::from_secs_f64(wait_duration); - error!( - "Compaction failed {error_run_count} times, retrying in {wait_duration:?}: {e:?}", + log_compaction_error( + &e, + error_run_count, + &wait_duration, + cancel.is_cancelled(), ); wait_duration } else { @@ -210,6 +214,58 @@ async fn compaction_loop(tenant: Arc, cancel: CancellationToken) { TENANT_TASK_EVENTS.with_label_values(&["stop"]).inc(); } +fn log_compaction_error( + e: &CompactionError, + error_run_count: u32, + sleep_duration: &std::time::Duration, + task_cancelled: bool, +) { + use crate::tenant::upload_queue::NotInitialized; + use crate::tenant::PageReconstructError; + use CompactionError::*; + + enum LooksLike { + Info, + Error, + } + + let decision = match e { + ShuttingDown => None, + _ if task_cancelled => Some(LooksLike::Info), + Other(e) => { + let root_cause = e.root_cause(); + + let is_stopping = { + let upload_queue = root_cause + .downcast_ref::() + .is_some_and(|e| e.is_stopping()); + + let timeline = root_cause + .downcast_ref::() + .is_some_and(|e| e.is_stopping()); + + upload_queue || timeline + }; + + if is_stopping { + Some(LooksLike::Info) + } else { + Some(LooksLike::Error) + } + } + }; + + match decision 
{ + Some(LooksLike::Info) => info!( + "Compaction failed {error_run_count} times, retrying in {sleep_duration:?}: {e:#}", + ), + Some(LooksLike::Error) => error!( + "Compaction failed {error_run_count} times, retrying in {sleep_duration:?}: {e:?}", + ), + None => {} + } +} + /// /// GC task's main loop /// diff --git a/pageserver/src/tenant/timeline.rs b/pageserver/src/tenant/timeline.rs index 0cb7cf26f2..70c6ee2042 100644 --- a/pageserver/src/tenant/timeline.rs +++ b/pageserver/src/tenant/timeline.rs @@ -14,6 +14,7 @@ use enumset::EnumSet; use fail::fail_point; use itertools::Itertools; use pageserver_api::{ + keyspace::{key_range_size, KeySpaceAccum}, models::{ DownloadRemoteLayersTaskInfo, DownloadRemoteLayersTaskSpawnRequest, EvictionPolicy, LayerMapInfo, TimelineState, @@ -32,7 +33,7 @@ use tokio_util::sync::CancellationToken; use tracing::*; use utils::sync::gate::Gate; -use std::collections::{BinaryHeap, HashMap, HashSet}; +use std::collections::{BTreeMap, BinaryHeap, HashMap, HashSet}; use std::ops::{Deref, Range}; use std::pin::pin; use std::sync::atomic::Ordering as AtomicOrdering; @@ -73,8 +74,8 @@ use crate::metrics::{ TimelineMetrics, MATERIALIZED_PAGE_CACHE_HIT, MATERIALIZED_PAGE_CACHE_HIT_DIRECT, }; use crate::pgdatadir_mapping::CalculateLogicalSizeError; -use crate::pgdatadir_mapping::{is_inherited_key, is_rel_fsm_block_key, is_rel_vm_block_key}; use crate::tenant::config::TenantConfOpt; +use pageserver_api::key::{is_inherited_key, is_rel_fsm_block_key, is_rel_vm_block_key}; use pageserver_api::reltag::RelTag; use pageserver_api::shard::ShardIndex; @@ -391,8 +392,7 @@ pub(crate) enum PageReconstructError { #[error("Ancestor LSN wait error: {0}")] AncestorLsnTimeout(#[from] WaitLsnError), - /// The operation was cancelled - #[error("Cancelled")] + #[error("timeline shutting down")] Cancelled, /// The ancestor of this is being stopped @@ -404,6 +404,34 @@ pub(crate) enum PageReconstructError { WalRedo(anyhow::Error), } +impl PageReconstructError { + 
/// Returns true if this error indicates a tenant/timeline shutdown alike situation + pub(crate) fn is_stopping(&self) -> bool { + use PageReconstructError::*; + match self { + Other(_) => false, + AncestorLsnTimeout(_) => false, + Cancelled | AncestorStopping(_) => true, + WalRedo(_) => false, + } + } +} + +#[derive(thiserror::Error, Debug)] +enum CreateImageLayersError { + #[error("timeline shutting down")] + Cancelled, + + #[error(transparent)] + GetVectoredError(GetVectoredError), + + #[error(transparent)] + PageReconstructError(PageReconstructError), + + #[error(transparent)] + Other(#[from] anyhow::Error), +} + #[derive(thiserror::Error, Debug)] enum FlushLayerError { /// Timeline cancellation token was cancelled @@ -411,12 +439,24 @@ enum FlushLayerError { Cancelled, #[error(transparent)] - PageReconstructError(#[from] PageReconstructError), + CreateImageLayersError(CreateImageLayersError), #[error(transparent)] Other(#[from] anyhow::Error), } +#[derive(thiserror::Error, Debug)] +pub(crate) enum GetVectoredError { + #[error("timeline shutting down")] + Cancelled, + + #[error("Requested too many keys: {0} > {}", Timeline::MAX_GET_VECTORED_KEYS)] + Oversized(u64), + + #[error("Requested at invalid LSN: {0}")] + InvalidLsn(Lsn), +} + #[derive(Clone, Copy)] pub enum LogicalSizeCalculationCause { Initial, @@ -456,6 +496,45 @@ pub(crate) enum WaitLsnError { Timeout(String), } +// The impls below achieve cancellation mapping for errors. +// Perhaps there's a way of achieving this with less cruft. 
+ +impl From for CompactionError { + fn from(e: CreateImageLayersError) -> Self { + match e { + CreateImageLayersError::Cancelled => CompactionError::ShuttingDown, + _ => CompactionError::Other(e.into()), + } + } +} + +impl From for FlushLayerError { + fn from(e: CreateImageLayersError) -> Self { + match e { + CreateImageLayersError::Cancelled => FlushLayerError::Cancelled, + any => FlushLayerError::CreateImageLayersError(any), + } + } +} + +impl From for CreateImageLayersError { + fn from(e: PageReconstructError) -> Self { + match e { + PageReconstructError::Cancelled => CreateImageLayersError::Cancelled, + _ => CreateImageLayersError::PageReconstructError(e), + } + } +} + +impl From for CreateImageLayersError { + fn from(e: GetVectoredError) -> Self { + match e { + GetVectoredError::Cancelled => CreateImageLayersError::Cancelled, + _ => CreateImageLayersError::GetVectoredError(e), + } + } +} + /// Public interface functions impl Timeline { /// Get the LSN where this branch was created @@ -575,6 +654,57 @@ impl Timeline { res } + pub(crate) const MAX_GET_VECTORED_KEYS: u64 = 32; + + /// Look up multiple page versions at a given LSN + /// + /// This naive implementation will be replaced with a more efficient one + /// which actually vectorizes the read path. 
+ pub(crate) async fn get_vectored( + &self, + key_ranges: &[Range], + lsn: Lsn, + ctx: &RequestContext, + ) -> Result>, GetVectoredError> { + if !lsn.is_valid() { + return Err(GetVectoredError::InvalidLsn(lsn)); + } + + let key_count = key_ranges + .iter() + .map(|range| key_range_size(range) as u64) + .sum(); + if key_count > Timeline::MAX_GET_VECTORED_KEYS { + return Err(GetVectoredError::Oversized(key_count)); + } + + let _timer = crate::metrics::GET_VECTORED_LATENCY + .for_task_kind(ctx.task_kind()) + .map(|t| t.start_timer()); + + let mut values = BTreeMap::new(); + for range in key_ranges { + let mut key = range.start; + while key != range.end { + assert!(!self.shard_identity.is_key_disposable(&key)); + + let block = self.get(key, lsn, ctx).await; + + if matches!( + block, + Err(PageReconstructError::Cancelled | PageReconstructError::AncestorStopping(_)) + ) { + return Err(GetVectoredError::Cancelled); + } + + values.insert(key, block); + key = key.next(); + } + } + + Ok(values) + } + /// Get last or prev record separately. Same as get_last_record_rlsn().last/prev. pub fn get_last_record_lsn(&self) -> Lsn { self.last_record_lsn.load().last @@ -2582,7 +2712,7 @@ impl Timeline { return; } err @ Err( - FlushLayerError::Other(_) | FlushLayerError::PageReconstructError(_), + FlushLayerError::Other(_) | FlushLayerError::CreateImageLayersError(_), ) => { error!("could not flush frozen layer: {err:?}"); break err; @@ -2859,6 +2989,21 @@ impl Timeline { Ok(()) } + pub(crate) async fn preserve_initdb_archive(&self) -> anyhow::Result<()> { + if let Some(remote_client) = &self.remote_client { + remote_client + .preserve_initdb_archive( + &self.tenant_shard_id.tenant_id, + &self.timeline_id, + &self.cancel, + ) + .await?; + } else { + bail!("No remote storage configured, but was asked to backup the initdb archive for {} / {}", self.tenant_shard_id.tenant_id, self.timeline_id); + } + Ok(()) + } + // Write out the given frozen in-memory layer as a new L0 delta file. 
This L0 file will not be tracked // in layer map immediately. The caller is responsible to put it into the layer map. async fn create_delta_layer( @@ -2950,11 +3095,7 @@ impl Timeline { } // Is it time to create a new image layer for the given partition? - async fn time_for_new_image_layer( - &self, - partition: &KeySpace, - lsn: Lsn, - ) -> anyhow::Result { + async fn time_for_new_image_layer(&self, partition: &KeySpace, lsn: Lsn) -> bool { let threshold = self.get_image_creation_threshold(); let guard = self.layers.read().await; @@ -2974,20 +3115,20 @@ impl Timeline { // but the range is already covered by image layers at more recent LSNs. Before we // create a new image layer, check if the range is already covered at more recent LSNs. if !layers - .image_layer_exists(&img_range, &(Lsn::min(lsn, *cutoff_lsn)..lsn + 1))? + .image_layer_exists(&img_range, &(Lsn::min(lsn, *cutoff_lsn)..lsn + 1)) { debug!( "Force generation of layer {}-{} wanted by GC, cutoff={}, lsn={})", img_range.start, img_range.end, cutoff_lsn, lsn ); - return Ok(true); + return true; } } } } for part_range in &partition.ranges { - let image_coverage = layers.image_coverage(part_range, lsn)?; + let image_coverage = layers.image_coverage(part_range, lsn); for (img_range, last_img) in image_coverage { let img_lsn = if let Some(last_img) = last_img { last_img.get_lsn_range().end @@ -3008,7 +3149,7 @@ impl Timeline { // after we read last_record_lsn, which is passed here in the 'lsn' argument. 
if img_lsn < lsn { let num_deltas = - layers.count_deltas(&img_range, &(img_lsn..lsn), Some(threshold))?; + layers.count_deltas(&img_range, &(img_lsn..lsn), Some(threshold)); max_deltas = max_deltas.max(num_deltas); if num_deltas >= threshold { @@ -3016,7 +3157,7 @@ impl Timeline { "key range {}-{}, has {} deltas on this timeline in LSN range {}..{}", img_range.start, img_range.end, num_deltas, img_lsn, lsn ); - return Ok(true); + return true; } } } @@ -3026,7 +3167,7 @@ impl Timeline { max_deltas, "none of the partitioned ranges had >= {threshold} deltas" ); - Ok(false) + false } #[tracing::instrument(skip_all, fields(%lsn, %force))] @@ -3036,7 +3177,7 @@ impl Timeline { lsn: Lsn, force: bool, ctx: &RequestContext, - ) -> Result, PageReconstructError> { + ) -> Result, CreateImageLayersError> { let timer = self.metrics.create_images_time_histo.start_timer(); let mut image_layers = Vec::new(); @@ -3054,7 +3195,7 @@ impl Timeline { for partition in partitioning.parts.iter() { let img_range = start..partition.ranges.last().unwrap().end; start = img_range.end; - if force || self.time_for_new_image_layer(partition, lsn).await? { + if force || self.time_for_new_image_layer(partition, lsn).await { let mut image_layer_writer = ImageLayerWriter::new( self.conf, self.timeline_id, @@ -3065,10 +3206,12 @@ impl Timeline { .await?; fail_point!("image-layer-writer-fail-before-finish", |_| { - Err(PageReconstructError::Other(anyhow::anyhow!( + Err(CreateImageLayersError::Other(anyhow::anyhow!( "failpoint image-layer-writer-fail-before-finish" ))) }); + + let mut key_request_accum = KeySpaceAccum::new(); for range in &partition.ranges { let mut key = range.start; while key < range.end { @@ -3081,34 +3224,55 @@ impl Timeline { key = key.next(); continue; } - let img = match self.get(key, lsn, ctx).await { - Ok(img) => img, - Err(err) => { - // If we fail to reconstruct a VM or FSM page, we can zero the - // page without losing any actual user data. 
That seems better - // than failing repeatedly and getting stuck. - // - // We had a bug at one point, where we truncated the FSM and VM - // in the pageserver, but the Postgres didn't know about that - // and continued to generate incremental WAL records for pages - // that didn't exist in the pageserver. Trying to replay those - // WAL records failed to find the previous image of the page. - // This special case allows us to recover from that situation. - // See https://github.com/neondatabase/neon/issues/2601. - // - // Unfortunately we cannot do this for the main fork, or for - // any metadata keys, keys, as that would lead to actual data - // loss. - if is_rel_fsm_block_key(key) || is_rel_vm_block_key(key) { - warn!("could not reconstruct FSM or VM key {key}, filling with zeros: {err:?}"); - ZERO_PAGE.clone() - } else { - return Err(err); - } - } - }; - image_layer_writer.put_image(key, &img).await?; + key_request_accum.add_key(key); + if key_request_accum.size() >= Timeline::MAX_GET_VECTORED_KEYS + || key.next() == range.end + { + let results = self + .get_vectored( + &key_request_accum.consume_keyspace().ranges, + lsn, + ctx, + ) + .await?; + + for (img_key, img) in results { + let img = match img { + Ok(img) => img, + Err(err) => { + // If we fail to reconstruct a VM or FSM page, we can zero the + // page without losing any actual user data. That seems better + // than failing repeatedly and getting stuck. + // + // We had a bug at one point, where we truncated the FSM and VM + // in the pageserver, but the Postgres didn't know about that + // and continued to generate incremental WAL records for pages + // that didn't exist in the pageserver. Trying to replay those + // WAL records failed to find the previous image of the page. + // This special case allows us to recover from that situation. + // See https://github.com/neondatabase/neon/issues/2601. 
+ // + // Unfortunately we cannot do this for the main fork, or for + // any metadata keys, keys, as that would lead to actual data + // loss. + if is_rel_fsm_block_key(img_key) + || is_rel_vm_block_key(img_key) + { + warn!("could not reconstruct FSM or VM key {img_key}, filling with zeros: {err:?}"); + ZERO_PAGE.clone() + } else { + return Err( + CreateImageLayersError::PageReconstructError(err), + ); + } + } + }; + + image_layer_writer.put_image(img_key, &img).await?; + } + } + key = key.next(); } } @@ -3484,7 +3648,7 @@ impl Timeline { // has not so much sense, because largest holes will corresponds field1/field2 changes. // But we are mostly interested to eliminate holes which cause generation of excessive image layers. // That is why it is better to measure size of hole as number of covering image layers. - let coverage_size = layers.image_coverage(&key_range, last_record_lsn)?.len(); + let coverage_size = layers.image_coverage(&key_range, last_record_lsn).len(); if coverage_size >= min_hole_coverage_size { heap.push(Hole { key_range, @@ -4110,7 +4274,7 @@ impl Timeline { // we cannot remove C, even though it's older than 2500, because // the delta layer 2000-3000 depends on it. if !layers - .image_layer_exists(&l.get_key_range(), &(l.get_lsn_range().end..new_gc_cutoff))? 
+ .image_layer_exists(&l.get_key_range(), &(l.get_lsn_range().end..new_gc_cutoff)) { debug!("keeping {} because it is the latest layer", l.filename()); // Collect delta key ranges that need image layers to allow garbage @@ -4240,7 +4404,7 @@ impl Timeline { .walredo_mgr .request_redo(key, request_lsn, data.img, data.records, self.pg_version) .await - .context("Failed to reconstruct a page image:") + .context("reconstruct a page image") { Ok(img) => img, Err(e) => return Err(PageReconstructError::WalRedo(e)), diff --git a/pageserver/src/tenant/upload_queue.rs b/pageserver/src/tenant/upload_queue.rs index 32f14f40c5..0b61bc0a10 100644 --- a/pageserver/src/tenant/upload_queue.rs +++ b/pageserver/src/tenant/upload_queue.rs @@ -126,6 +126,27 @@ pub(super) struct UploadQueueStopped { pub(super) deleted_at: SetDeletedFlagProgress, } +#[derive(thiserror::Error, Debug)] +pub(crate) enum NotInitialized { + #[error("queue is in state Uninitialized")] + Uninitialized, + #[error("queue is in state Stopping")] + Stopped, + #[error("queue is shutting down")] + ShuttingDown, +} + +impl NotInitialized { + pub(crate) fn is_stopping(&self) -> bool { + use NotInitialized::*; + match self { + Uninitialized => false, + Stopped => true, + ShuttingDown => true, + } + } +} + impl UploadQueue { pub(crate) fn initialize_empty_remote( &mut self, @@ -214,17 +235,17 @@ impl UploadQueue { } pub(crate) fn initialized_mut(&mut self) -> anyhow::Result<&mut UploadQueueInitialized> { + use UploadQueue::*; match self { - UploadQueue::Uninitialized | UploadQueue::Stopped(_) => { - anyhow::bail!("queue is in state {}", self.as_str()) - } - UploadQueue::Initialized(x) => { - if !x.shutting_down { - Ok(x) + Uninitialized => Err(NotInitialized::Uninitialized.into()), + Initialized(x) => { + if x.shutting_down { + Err(NotInitialized::ShuttingDown.into()) } else { - anyhow::bail!("queue is shutting down") + Ok(x) } } + Stopped(_) => Err(NotInitialized::Stopped.into()), } } diff --git 
a/pageserver/src/virtual_file.rs b/pageserver/src/virtual_file.rs index 9feefd8a32..066f06c88f 100644 --- a/pageserver/src/virtual_file.rs +++ b/pageserver/src/virtual_file.rs @@ -11,18 +11,28 @@ //! src/backend/storage/file/fd.c //! use crate::metrics::{StorageIoOperation, STORAGE_IO_SIZE, STORAGE_IO_TIME_METRIC}; + +use crate::page_cache::PageWriteGuard; use crate::tenant::TENANTS_SEGMENT_NAME; use camino::{Utf8Path, Utf8PathBuf}; use once_cell::sync::OnceCell; use pageserver_api::shard::TenantShardId; -use std::fs::{self, File, OpenOptions}; +use std::fs::{self, File}; use std::io::{Error, ErrorKind, Seek, SeekFrom}; +use tokio_epoll_uring::IoBufMut; + +use std::os::fd::{AsRawFd, FromRawFd, IntoRawFd, OwnedFd, RawFd}; use std::os::unix::fs::FileExt; use std::sync::atomic::{AtomicBool, AtomicUsize, Ordering}; use tokio::sync::{RwLock, RwLockReadGuard, RwLockWriteGuard}; use tokio::time::Instant; use utils::fs_ext; +mod io_engine; +mod open_options; +pub use io_engine::IoEngineKind; +pub(crate) use open_options::*; + /// /// A virtual file descriptor. You can use this just like std::fs::File, but internally /// the underlying file is closed if the system is low on file descriptors, @@ -106,7 +116,38 @@ struct SlotInner { tag: u64, /// the underlying file - file: Option, + file: Option, +} + +/// Impl of [`tokio_epoll_uring::IoBuf`] and [`tokio_epoll_uring::IoBufMut`] for [`PageWriteGuard`]. +struct PageWriteGuardBuf { + page: PageWriteGuard<'static>, + init_up_to: usize, +} +// Safety: the [`PageWriteGuard`] gives us exclusive ownership of the page cache slot, +// and the location remains stable even if [`Self`] or the [`PageWriteGuard`] is moved. 
+unsafe impl tokio_epoll_uring::IoBuf for PageWriteGuardBuf { + fn stable_ptr(&self) -> *const u8 { + self.page.as_ptr() + } + fn bytes_init(&self) -> usize { + self.init_up_to + } + fn bytes_total(&self) -> usize { + self.page.len() + } +} +// Safety: see above, plus: the ownership of [`PageWriteGuard`] means exclusive access, +// hence it's safe to hand out the `stable_mut_ptr()`. +unsafe impl tokio_epoll_uring::IoBufMut for PageWriteGuardBuf { + fn stable_mut_ptr(&mut self) -> *mut u8 { + self.page.as_mut_ptr() + } + + unsafe fn set_init(&mut self, pos: usize) { + assert!(pos <= self.page.len()); + self.init_up_to = pos; + } } impl OpenFiles { @@ -274,6 +315,10 @@ macro_rules! with_file { let $ident = $this.lock_file().await?; observe_duration!($op, $($body)*) }}; + ($this:expr, $op:expr, | mut $ident:ident | $($body:tt)*) => {{ + let mut $ident = $this.lock_file().await?; + observe_duration!($op, $($body)*) + }}; } impl VirtualFile { @@ -326,7 +371,9 @@ impl VirtualFile { // NB: there is also StorageIoOperation::OpenAfterReplace which is for the case // where our caller doesn't get to use the returned VirtualFile before its // slot gets re-used by someone else. - let file = observe_duration!(StorageIoOperation::Open, open_options.open(path))?; + let file = observe_duration!(StorageIoOperation::Open, { + open_options.open(path.as_std_path()).await? + }); // Strip all options other than read and write. // @@ -400,15 +447,13 @@ impl VirtualFile { /// Call File::sync_all() on the underlying File. 
pub async fn sync_all(&self) -> Result<(), Error> { - with_file!(self, StorageIoOperation::Fsync, |file| file - .as_ref() - .sync_all()) + with_file!(self, StorageIoOperation::Fsync, |file_guard| file_guard + .with_std_file(|std_file| std_file.sync_all())) } pub async fn metadata(&self) -> Result { - with_file!(self, StorageIoOperation::Metadata, |file| file - .as_ref() - .metadata()) + with_file!(self, StorageIoOperation::Metadata, |file_guard| file_guard + .with_std_file(|std_file| std_file.metadata())) } /// Helper function internal to `VirtualFile` that looks up the underlying File, @@ -417,7 +462,7 @@ impl VirtualFile { /// /// We are doing it via a macro as Rust doesn't support async closures that /// take on parameters with lifetimes. - async fn lock_file(&self) -> Result, Error> { + async fn lock_file(&self) -> Result { let open_files = get_open_files(); let mut handle_guard = { @@ -463,10 +508,9 @@ impl VirtualFile { // NB: we use StorageIoOperation::OpenAferReplace for this to distinguish this // case from StorageIoOperation::Open. This helps with identifying thrashing // of the virtual file descriptor cache. - let file = observe_duration!( - StorageIoOperation::OpenAfterReplace, - self.open_options.open(&self.path) - )?; + let file = observe_duration!(StorageIoOperation::OpenAfterReplace, { + self.open_options.open(self.path.as_std_path()).await? + }); // Store the File in the slot and update the handle in the VirtualFile // to point to it. @@ -491,9 +535,8 @@ impl VirtualFile { self.pos = offset; } SeekFrom::End(offset) => { - self.pos = with_file!(self, StorageIoOperation::Seek, |file| file - .as_ref() - .seek(SeekFrom::End(offset)))? + self.pos = with_file!(self, StorageIoOperation::Seek, |mut file_guard| file_guard + .with_std_file_mut(|std_file| std_file.seek(SeekFrom::End(offset))))? 
} SeekFrom::Current(offset) => { let pos = self.pos as i128 + offset as i128; @@ -512,25 +555,28 @@ impl VirtualFile { Ok(self.pos) } - // Copied from https://doc.rust-lang.org/1.72.0/src/std/os/unix/fs.rs.html#117-135 - pub async fn read_exact_at(&self, mut buf: &mut [u8], mut offset: u64) -> Result<(), Error> { - while !buf.is_empty() { - match self.read_at(buf, offset).await { - Ok(0) => { - return Err(Error::new( - std::io::ErrorKind::UnexpectedEof, - "failed to fill whole buffer", - )) - } - Ok(n) => { - buf = &mut buf[n..]; - offset += n as u64; - } - Err(ref e) if e.kind() == std::io::ErrorKind::Interrupted => {} - Err(e) => return Err(e), - } - } - Ok(()) + pub async fn read_exact_at(&self, buf: B, offset: u64) -> Result + where + B: IoBufMut + Send, + { + let (buf, res) = + read_exact_at_impl(buf, offset, |buf, offset| self.read_at(buf, offset)).await; + res.map(|()| buf) + } + + /// Like [`Self::read_exact_at`] but for [`PageWriteGuard`]. + pub async fn read_exact_at_page( + &self, + page: PageWriteGuard<'static>, + offset: u64, + ) -> Result, Error> { + let buf = PageWriteGuardBuf { + page, + init_up_to: 0, + }; + let res = self.read_exact_at(buf, offset).await; + res.map(|PageWriteGuardBuf { page, .. 
}| page) + .map_err(|e| Error::new(ErrorKind::Other, e)) } // Copied from https://doc.rust-lang.org/1.72.0/src/std/os/unix/fs.rs.html#219-235 @@ -580,22 +626,35 @@ impl VirtualFile { Ok(n) } - pub async fn read_at(&self, buf: &mut [u8], offset: u64) -> Result { - let result = with_file!(self, StorageIoOperation::Read, |file| file - .as_ref() - .read_at(buf, offset)); - if let Ok(size) = result { - STORAGE_IO_SIZE - .with_label_values(&["read", &self.tenant_id, &self.shard_id, &self.timeline_id]) - .add(size as i64); - } - result + pub(crate) async fn read_at(&self, buf: B, offset: u64) -> (B, Result) + where + B: tokio_epoll_uring::BoundedBufMut + Send, + { + let file_guard = match self.lock_file().await { + Ok(file_guard) => file_guard, + Err(e) => return (buf, Err(e)), + }; + + observe_duration!(StorageIoOperation::Read, { + let ((_file_guard, buf), res) = io_engine::get().read_at(file_guard, offset, buf).await; + if let Ok(size) = res { + STORAGE_IO_SIZE + .with_label_values(&[ + "read", + &self.tenant_id, + &self.shard_id, + &self.timeline_id, + ]) + .add(size as i64); + } + (buf, res) + }) } async fn write_at(&self, buf: &[u8], offset: u64) -> Result { - let result = with_file!(self, StorageIoOperation::Write, |file| file - .as_ref() - .write_at(buf, offset)); + let result = with_file!(self, StorageIoOperation::Write, |file_guard| { + file_guard.with_std_file(|std_file| std_file.write_at(buf, offset)) + }); if let Ok(size) = result { STORAGE_IO_SIZE .with_label_values(&["write", &self.tenant_id, &self.shard_id, &self.timeline_id]) @@ -605,18 +664,241 @@ impl VirtualFile { } } -struct FileGuard<'a> { - slot_guard: RwLockReadGuard<'a, SlotInner>, +// Adapted from https://doc.rust-lang.org/1.72.0/src/std/os/unix/fs.rs.html#117-135 +pub async fn read_exact_at_impl( + buf: B, + mut offset: u64, + mut read_at: F, +) -> (B, std::io::Result<()>) +where + B: IoBufMut + Send, + F: FnMut(tokio_epoll_uring::Slice, u64) -> Fut, + Fut: std::future::Future, 
std::io::Result)>, +{ + use tokio_epoll_uring::BoundedBuf; + let mut buf: tokio_epoll_uring::Slice = buf.slice_full(); // includes all the uninitialized memory + while buf.bytes_total() != 0 { + let res; + (buf, res) = read_at(buf, offset).await; + match res { + Ok(0) => break, + Ok(n) => { + buf = buf.slice(n..); + offset += n as u64; + } + Err(ref e) if e.kind() == std::io::ErrorKind::Interrupted => {} + Err(e) => return (buf.into_inner(), Err(e)), + } + } + // NB: don't use `buf.is_empty()` here; it is from the + // `impl Deref for Slice { Target = [u8] }`; the the &[u8] + // returned by it only covers the initialized portion of `buf`. + // Whereas we're interested in ensuring that we filled the entire + // buffer that the user passed in. + if buf.bytes_total() != 0 { + ( + buf.into_inner(), + Err(std::io::Error::new( + std::io::ErrorKind::UnexpectedEof, + "failed to fill whole buffer", + )), + ) + } else { + assert_eq!(buf.len(), buf.bytes_total()); + (buf.into_inner(), Ok(())) + } } -impl<'a> AsRef for FileGuard<'a> { - fn as_ref(&self) -> &File { +#[cfg(test)] +mod test_read_exact_at_impl { + + use std::{collections::VecDeque, sync::Arc}; + + use tokio_epoll_uring::{BoundedBuf, BoundedBufMut}; + + use super::read_exact_at_impl; + + struct Expectation { + offset: u64, + bytes_total: usize, + result: std::io::Result>, + } + struct MockReadAt { + expectations: VecDeque, + } + + impl MockReadAt { + async fn read_at( + &mut self, + mut buf: tokio_epoll_uring::Slice>, + offset: u64, + ) -> (tokio_epoll_uring::Slice>, std::io::Result) { + let exp = self + .expectations + .pop_front() + .expect("read_at called but we have no expectations left"); + assert_eq!(exp.offset, offset); + assert_eq!(exp.bytes_total, buf.bytes_total()); + match exp.result { + Ok(bytes) => { + assert!(bytes.len() <= buf.bytes_total()); + buf.put_slice(&bytes); + (buf, Ok(bytes.len())) + } + Err(e) => (buf, Err(e)), + } + } + } + + impl Drop for MockReadAt { + fn drop(&mut self) { + 
assert_eq!(self.expectations.len(), 0); + } + } + + #[tokio::test] + async fn test_basic() { + let buf = Vec::with_capacity(5); + let mock_read_at = Arc::new(tokio::sync::Mutex::new(MockReadAt { + expectations: VecDeque::from(vec![Expectation { + offset: 0, + bytes_total: 5, + result: Ok(vec![b'a', b'b', b'c', b'd', b'e']), + }]), + })); + let (buf, res) = read_exact_at_impl(buf, 0, |buf, offset| { + let mock_read_at = Arc::clone(&mock_read_at); + async move { mock_read_at.lock().await.read_at(buf, offset).await } + }) + .await; + assert!(res.is_ok()); + assert_eq!(buf, vec![b'a', b'b', b'c', b'd', b'e']); + } + + #[tokio::test] + async fn test_empty_buf_issues_no_syscall() { + let buf = Vec::new(); + let mock_read_at = Arc::new(tokio::sync::Mutex::new(MockReadAt { + expectations: VecDeque::new(), + })); + let (_buf, res) = read_exact_at_impl(buf, 0, |buf, offset| { + let mock_read_at = Arc::clone(&mock_read_at); + async move { mock_read_at.lock().await.read_at(buf, offset).await } + }) + .await; + assert!(res.is_ok()); + } + + #[tokio::test] + async fn test_two_read_at_calls_needed_until_buf_filled() { + let buf = Vec::with_capacity(4); + let mock_read_at = Arc::new(tokio::sync::Mutex::new(MockReadAt { + expectations: VecDeque::from(vec![ + Expectation { + offset: 0, + bytes_total: 4, + result: Ok(vec![b'a', b'b']), + }, + Expectation { + offset: 2, + bytes_total: 2, + result: Ok(vec![b'c', b'd']), + }, + ]), + })); + let (buf, res) = read_exact_at_impl(buf, 0, |buf, offset| { + let mock_read_at = Arc::clone(&mock_read_at); + async move { mock_read_at.lock().await.read_at(buf, offset).await } + }) + .await; + assert!(res.is_ok()); + assert_eq!(buf, vec![b'a', b'b', b'c', b'd']); + } + + #[tokio::test] + async fn test_eof_before_buffer_full() { + let buf = Vec::with_capacity(3); + let mock_read_at = Arc::new(tokio::sync::Mutex::new(MockReadAt { + expectations: VecDeque::from(vec![ + Expectation { + offset: 0, + bytes_total: 3, + result: Ok(vec![b'a']), + }, + 
Expectation { + offset: 1, + bytes_total: 2, + result: Ok(vec![b'b']), + }, + Expectation { + offset: 2, + bytes_total: 1, + result: Ok(vec![]), + }, + ]), + })); + let (_buf, res) = read_exact_at_impl(buf, 0, |buf, offset| { + let mock_read_at = Arc::clone(&mock_read_at); + async move { mock_read_at.lock().await.read_at(buf, offset).await } + }) + .await; + let Err(err) = res else { + panic!("should return an error"); + }; + assert_eq!(err.kind(), std::io::ErrorKind::UnexpectedEof); + assert_eq!(format!("{err}"), "failed to fill whole buffer"); + // buffer contents on error are unspecified + } +} + +struct FileGuard { + slot_guard: RwLockReadGuard<'static, SlotInner>, +} + +impl AsRef for FileGuard { + fn as_ref(&self) -> &OwnedFd { // This unwrap is safe because we only create `FileGuard`s // if we know that the file is Some. self.slot_guard.file.as_ref().unwrap() } } +impl FileGuard { + /// Soft deprecation: we'll move VirtualFile to async APIs and remove this function eventually. + fn with_std_file(&self, with: F) -> R + where + F: FnOnce(&File) -> R, + { + // SAFETY: + // - lifetime of the fd: `file` doesn't outlive the OwnedFd stored in `self`. + // - `&` usage below: `self` is `&`, hence Rust typesystem guarantees there are is no `&mut` + let file = unsafe { File::from_raw_fd(self.as_ref().as_raw_fd()) }; + let res = with(&file); + let _ = file.into_raw_fd(); + res + } + /// Soft deprecation: we'll move VirtualFile to async APIs and remove this function eventually. + fn with_std_file_mut(&mut self, with: F) -> R + where + F: FnOnce(&mut File) -> R, + { + // SAFETY: + // - lifetime of the fd: `file` doesn't outlive the OwnedFd stored in `self`. 
+ // - &mut usage below: `self` is `&mut`, hence this call is the only task/thread that has control over the underlying fd + let mut file = unsafe { File::from_raw_fd(self.as_ref().as_raw_fd()) }; + let res = with(&mut file); + let _ = file.into_raw_fd(); + res + } +} + +impl tokio_epoll_uring::IoFd for FileGuard { + unsafe fn as_fd(&self) -> RawFd { + let owned_fd: &OwnedFd = self.as_ref(); + owned_fd.as_raw_fd() + } +} + #[cfg(test)] impl VirtualFile { pub(crate) async fn read_blk( @@ -624,16 +906,19 @@ impl VirtualFile { blknum: u32, ) -> Result, std::io::Error> { use crate::page_cache::PAGE_SZ; - let mut buf = [0; PAGE_SZ]; - self.read_exact_at(&mut buf, blknum as u64 * (PAGE_SZ as u64)) + let buf = vec![0; PAGE_SZ]; + let buf = self + .read_exact_at(buf, blknum as u64 * (PAGE_SZ as u64)) .await?; - Ok(std::sync::Arc::new(buf).into()) + Ok(crate::tenant::block_io::BlockLease::Vec(buf)) } async fn read_to_end(&mut self, buf: &mut Vec) -> Result<(), Error> { + let mut tmp = vec![0; 128]; loop { - let mut tmp = [0; 128]; - match self.read_at(&mut tmp, self.pos).await { + let res; + (tmp, res) = self.read_at(tmp, self.pos).await; + match res { Ok(0) => return Ok(()), Ok(n) => { self.pos += n as u64; @@ -709,10 +994,12 @@ impl OpenFiles { /// Initialize the virtual file module. This must be called once at page /// server startup. 
/// -pub fn init(num_slots: usize) { +#[cfg(not(test))] +pub fn init(num_slots: usize, engine: IoEngineKind) { if OPEN_FILES.set(OpenFiles::new(num_slots)).is_err() { panic!("virtual_file::init called twice"); } + io_engine::init(engine); crate::metrics::virtual_file_descriptor_cache::SIZE_MAX.set(num_slots as u64); } @@ -757,10 +1044,10 @@ mod tests { } impl MaybeVirtualFile { - async fn read_exact_at(&self, buf: &mut [u8], offset: u64) -> Result<(), Error> { + async fn read_exact_at(&self, mut buf: Vec, offset: u64) -> Result, Error> { match self { MaybeVirtualFile::VirtualFile(file) => file.read_exact_at(buf, offset).await, - MaybeVirtualFile::File(file) => file.read_exact_at(buf, offset), + MaybeVirtualFile::File(file) => file.read_exact_at(&mut buf, offset).map(|()| buf), } } async fn write_all_at(&self, buf: &[u8], offset: u64) -> Result<(), Error> { @@ -802,14 +1089,14 @@ mod tests { // Helper function to slurp a portion of a file into a string async fn read_string_at(&mut self, pos: u64, len: usize) -> Result { - let mut buf = vec![0; len]; - self.read_exact_at(&mut buf, pos).await?; + let buf = vec![0; len]; + let buf = self.read_exact_at(buf, pos).await?; Ok(String::from_utf8(buf).unwrap()) } } #[tokio::test] - async fn test_virtual_files() -> Result<(), Error> { + async fn test_virtual_files() -> anyhow::Result<()> { // The real work is done in the test_files() helper function. This // allows us to run the same set of tests against a native File, and // VirtualFile. 
We trust the native Files and wouldn't need to test them, @@ -825,14 +1112,17 @@ mod tests { } #[tokio::test] - async fn test_physical_files() -> Result<(), Error> { + async fn test_physical_files() -> anyhow::Result<()> { test_files("physical_files", |path, open_options| async move { - Ok(MaybeVirtualFile::File(open_options.open(path)?)) + Ok(MaybeVirtualFile::File({ + let owned_fd = open_options.open(path.as_std_path()).await?; + File::from(owned_fd) + })) }) .await } - async fn test_files(testname: &str, openfunc: OF) -> Result<(), Error> + async fn test_files(testname: &str, openfunc: OF) -> anyhow::Result<()> where OF: Fn(Utf8PathBuf, OpenOptions) -> FT, FT: Future>, @@ -976,11 +1266,11 @@ mod tests { for _threadno in 0..THREADS { let files = files.clone(); let hdl = rt.spawn(async move { - let mut buf = [0u8; SIZE]; + let mut buf = vec![0u8; SIZE]; let mut rng = rand::rngs::OsRng; for _ in 1..1000 { let f = &files[rng.gen_range(0..files.len())]; - f.read_exact_at(&mut buf, 0).await.unwrap(); + buf = f.read_exact_at(buf, 0).await.unwrap(); assert!(buf == SAMPLE); } }); diff --git a/pageserver/src/virtual_file/io_engine.rs b/pageserver/src/virtual_file/io_engine.rs new file mode 100644 index 0000000000..f7b46fe653 --- /dev/null +++ b/pageserver/src/virtual_file/io_engine.rs @@ -0,0 +1,114 @@ +//! [`super::VirtualFile`] supports different IO engines. +//! +//! The [`IoEngineKind`] enum identifies them. +//! +//! The choice of IO engine is global. +//! Initialize using [`init`]. +//! +//! Then use [`get`] and [`super::OpenOptions`]. 
+ +#[derive( + Copy, + Clone, + PartialEq, + Eq, + Hash, + strum_macros::EnumString, + strum_macros::Display, + serde_with::DeserializeFromStr, + serde_with::SerializeDisplay, + Debug, +)] +#[strum(serialize_all = "kebab-case")] +pub enum IoEngineKind { + StdFs, + #[cfg(target_os = "linux")] + TokioEpollUring, +} + +static IO_ENGINE: once_cell::sync::OnceCell = once_cell::sync::OnceCell::new(); + +#[cfg(not(test))] +pub(super) fn init(engine: IoEngineKind) { + if IO_ENGINE.set(engine).is_err() { + panic!("called twice"); + } + crate::metrics::virtual_file_io_engine::KIND + .with_label_values(&[&format!("{engine}")]) + .set(1); +} + +pub(super) fn get() -> &'static IoEngineKind { + #[cfg(test)] + { + let env_var_name = "NEON_PAGESERVER_UNIT_TEST_VIRTUAL_FILE_IOENGINE"; + IO_ENGINE.get_or_init(|| match std::env::var(env_var_name) { + Ok(v) => match v.parse::() { + Ok(engine_kind) => engine_kind, + Err(e) => { + panic!("invalid VirtualFile io engine for env var {env_var_name}: {e:#}: {v:?}") + } + }, + Err(std::env::VarError::NotPresent) => { + crate::config::defaults::DEFAULT_VIRTUAL_FILE_IO_ENGINE + .parse() + .unwrap() + } + Err(std::env::VarError::NotUnicode(_)) => { + panic!("env var {env_var_name} is not unicode"); + } + }) + } + #[cfg(not(test))] + IO_ENGINE.get().unwrap() +} + +use std::os::unix::prelude::FileExt; + +use super::FileGuard; + +impl IoEngineKind { + pub(super) async fn read_at( + &self, + file_guard: FileGuard, + offset: u64, + mut buf: B, + ) -> ((FileGuard, B), std::io::Result) + where + B: tokio_epoll_uring::BoundedBufMut + Send, + { + match self { + IoEngineKind::StdFs => { + // SAFETY: `dst` only lives at most as long as this match arm, during which buf remains valid memory. 
+ let dst = unsafe { + std::slice::from_raw_parts_mut(buf.stable_mut_ptr(), buf.bytes_total()) + }; + let res = file_guard.with_std_file(|std_file| std_file.read_at(dst, offset)); + if let Ok(nbytes) = &res { + assert!(*nbytes <= buf.bytes_total()); + // SAFETY: see above assertion + unsafe { + buf.set_init(*nbytes); + } + } + #[allow(dropping_references)] + drop(dst); + ((file_guard, buf), res) + } + #[cfg(target_os = "linux")] + IoEngineKind::TokioEpollUring => { + let system = tokio_epoll_uring::thread_local_system().await; + let (resources, res) = system.read(file_guard, offset, buf).await; + ( + resources, + res.map_err(|e| match e { + tokio_epoll_uring::Error::Op(e) => e, + tokio_epoll_uring::Error::System(system) => { + std::io::Error::new(std::io::ErrorKind::Other, system) + } + }), + ) + } + } + } +} diff --git a/pageserver/src/virtual_file/open_options.rs b/pageserver/src/virtual_file/open_options.rs new file mode 100644 index 0000000000..1e5ffe15cc --- /dev/null +++ b/pageserver/src/virtual_file/open_options.rs @@ -0,0 +1,138 @@ +//! 
Enum-dispatch to the `OpenOptions` type of the respective [`super::IoEngineKind`]; + +use super::IoEngineKind; +use std::{os::fd::OwnedFd, path::Path}; + +#[derive(Debug, Clone)] +pub enum OpenOptions { + StdFs(std::fs::OpenOptions), + #[cfg(target_os = "linux")] + TokioEpollUring(tokio_epoll_uring::ops::open_at::OpenOptions), +} + +impl Default for OpenOptions { + fn default() -> Self { + match super::io_engine::get() { + IoEngineKind::StdFs => Self::StdFs(std::fs::OpenOptions::new()), + #[cfg(target_os = "linux")] + IoEngineKind::TokioEpollUring => { + Self::TokioEpollUring(tokio_epoll_uring::ops::open_at::OpenOptions::new()) + } + } + } +} + +impl OpenOptions { + pub fn new() -> OpenOptions { + Self::default() + } + + pub fn read(&mut self, read: bool) -> &mut OpenOptions { + match self { + OpenOptions::StdFs(x) => { + let _ = x.read(read); + } + #[cfg(target_os = "linux")] + OpenOptions::TokioEpollUring(x) => { + let _ = x.read(read); + } + } + self + } + + pub fn write(&mut self, write: bool) -> &mut OpenOptions { + match self { + OpenOptions::StdFs(x) => { + let _ = x.write(write); + } + #[cfg(target_os = "linux")] + OpenOptions::TokioEpollUring(x) => { + let _ = x.write(write); + } + } + self + } + + pub fn create(&mut self, create: bool) -> &mut OpenOptions { + match self { + OpenOptions::StdFs(x) => { + let _ = x.create(create); + } + #[cfg(target_os = "linux")] + OpenOptions::TokioEpollUring(x) => { + let _ = x.create(create); + } + } + self + } + + pub fn create_new(&mut self, create_new: bool) -> &mut OpenOptions { + match self { + OpenOptions::StdFs(x) => { + let _ = x.create_new(create_new); + } + #[cfg(target_os = "linux")] + OpenOptions::TokioEpollUring(x) => { + let _ = x.create_new(create_new); + } + } + self + } + + pub fn truncate(&mut self, truncate: bool) -> &mut OpenOptions { + match self { + OpenOptions::StdFs(x) => { + let _ = x.truncate(truncate); + } + #[cfg(target_os = "linux")] + OpenOptions::TokioEpollUring(x) => { + let _ = 
x.truncate(truncate); + } + } + self + } + + pub(in crate::virtual_file) async fn open(&self, path: &Path) -> std::io::Result { + match self { + OpenOptions::StdFs(x) => x.open(path).map(|file| file.into()), + #[cfg(target_os = "linux")] + OpenOptions::TokioEpollUring(x) => { + let system = tokio_epoll_uring::thread_local_system().await; + system.open(path, x).await.map_err(|e| match e { + tokio_epoll_uring::Error::Op(e) => e, + tokio_epoll_uring::Error::System(system) => { + std::io::Error::new(std::io::ErrorKind::Other, system) + } + }) + } + } + } +} + +impl std::os::unix::prelude::OpenOptionsExt for OpenOptions { + fn mode(&mut self, mode: u32) -> &mut OpenOptions { + match self { + OpenOptions::StdFs(x) => { + let _ = x.mode(mode); + } + #[cfg(target_os = "linux")] + OpenOptions::TokioEpollUring(x) => { + let _ = x.mode(mode); + } + } + self + } + + fn custom_flags(&mut self, flags: i32) -> &mut OpenOptions { + match self { + OpenOptions::StdFs(x) => { + let _ = x.custom_flags(flags); + } + #[cfg(target_os = "linux")] + OpenOptions::TokioEpollUring(x) => { + let _ = x.custom_flags(flags); + } + } + self + } +} diff --git a/pageserver/src/walingest.rs b/pageserver/src/walingest.rs index f2c35436db..93d1dcab35 100644 --- a/pageserver/src/walingest.rs +++ b/pageserver/src/walingest.rs @@ -33,11 +33,12 @@ use utils::failpoint_support; use crate::context::RequestContext; use crate::metrics::WAL_INGEST; -use crate::pgdatadir_mapping::*; +use crate::pgdatadir_mapping::{DatadirModification, Version}; use crate::tenant::PageReconstructError; use crate::tenant::Timeline; use crate::walrecord::*; use crate::ZERO_PAGE; +use pageserver_api::key::rel_block_to_key; use pageserver_api::reltag::{BlockNumber, RelTag, SlruKind}; use postgres_ffi::pg_constants; use postgres_ffi::relfile_utils::{FSM_FORKNUM, INIT_FORKNUM, MAIN_FORKNUM, VISIBILITYMAP_FORKNUM}; @@ -1032,7 +1033,23 @@ impl WalIngest { // Copy content debug!("copying rel {} to {}, {} blocks", src_rel, dst_rel, 
nblocks); for blknum in 0..nblocks { - debug!("copying block {} from {} to {}", blknum, src_rel, dst_rel); + // Sharding: + // - src and dst are always on the same shard, because they differ only by dbNode, and + // dbNode is not included in the hash inputs for sharding. + // - This WAL command is replayed on all shards, but each shard only copies the blocks + // that belong to it. + let src_key = rel_block_to_key(src_rel, blknum); + if !self.shard.is_key_local(&src_key) { + debug!( + "Skipping non-local key {} during XLOG_DBASE_CREATE", + src_key + ); + continue; + } + debug!( + "copying block {} from {} ({}) to {}", + blknum, src_rel, src_key, dst_rel + ); let content = modification .tline @@ -1346,16 +1363,22 @@ impl WalIngest { self.checkpoint.nextMultiOffset = xlrec.moff + xlrec.nmembers; self.checkpoint_modified = true; } - let max_mbr_xid = xlrec.members.iter().fold(0u32, |acc, mbr| { - if mbr.xid.wrapping_sub(acc) as i32 > 0 { - mbr.xid + let max_mbr_xid = xlrec.members.iter().fold(None, |acc, mbr| { + if let Some(max_xid) = acc { + if mbr.xid.wrapping_sub(max_xid) as i32 > 0 { + Some(mbr.xid) + } else { + acc + } } else { - acc + Some(mbr.xid) } }); - if self.checkpoint.update_next_xid(max_mbr_xid) { - self.checkpoint_modified = true; + if let Some(max_xid) = max_mbr_xid { + if self.checkpoint.update_next_xid(max_xid) { + self.checkpoint_modified = true; + } } Ok(()) } diff --git a/pageserver/src/walredo.rs b/pageserver/src/walredo.rs index b4aadb2a8c..cfb8052cf1 100644 --- a/pageserver/src/walredo.rs +++ b/pageserver/src/walredo.rs @@ -47,11 +47,10 @@ use crate::metrics::{ WAL_REDO_PROCESS_LAUNCH_DURATION_HISTOGRAM, WAL_REDO_RECORDS_HISTOGRAM, WAL_REDO_RECORD_COUNTER, WAL_REDO_TIME, }; -use crate::pgdatadir_mapping::key_to_slru_block; use crate::repository::Key; use crate::walrecord::NeonWalRecord; -use pageserver_api::key::key_to_rel_block; +use pageserver_api::key::{key_to_rel_block, key_to_slru_block}; use pageserver_api::reltag::{RelTag, SlruKind}; 
use postgres_ffi::pg_constants; use postgres_ffi::relfile_utils::VISIBILITYMAP_FORKNUM; @@ -837,9 +836,8 @@ impl WalRedoProcess { let mut proc = { input }; // TODO: remove this legacy rename, but this keep the patch small. let mut nwrite = 0usize; - let mut stdin_pollfds = [PollFd::new(proc.stdin.as_raw_fd(), PollFlags::POLLOUT)]; - while nwrite < writebuf.len() { + let mut stdin_pollfds = [PollFd::new(&proc.stdin, PollFlags::POLLOUT)]; let n = loop { match nix::poll::poll(&mut stdin_pollfds[..], wal_redo_timeout.as_millis() as i32) { Err(nix::errno::Errno::EINTR) => continue, @@ -878,7 +876,6 @@ impl WalRedoProcess { // advancing processed responses number. let mut output = self.stdout.lock().unwrap(); - let mut stdout_pollfds = [PollFd::new(output.stdout.as_raw_fd(), PollFlags::POLLIN)]; let n_processed_responses = output.n_processed_responses; while n_processed_responses + output.pending_responses.len() <= request_no { // We expect the WAL redo process to respond with an 8k page image. We read it @@ -886,6 +883,7 @@ impl WalRedoProcess { let mut resultbuf = vec![0; BLCKSZ.into()]; let mut nresult: usize = 0; // # of bytes read into 'resultbuf' so far while nresult < BLCKSZ.into() { + let mut stdout_pollfds = [PollFd::new(&output.stdout, PollFlags::POLLIN)]; // We do two things simultaneously: reading response from stdout // and forward any logging information that the child writes to its stderr to the page server's log. 
let n = loop { diff --git a/pgxn/neon/control_plane_connector.c b/pgxn/neon/control_plane_connector.c index e467a9c43a..f6f006cba4 100644 --- a/pgxn/neon/control_plane_connector.c +++ b/pgxn/neon/control_plane_connector.c @@ -637,7 +637,7 @@ HandleAlterRole(AlterRoleStmt *stmt) ListCell *option; const char *role_name = stmt->role->rolename; - if (RoleIsNeonSuperuser(role_name)) + if (RoleIsNeonSuperuser(role_name) && !superuser()) elog(ERROR, "can't ALTER neon_superuser"); foreach(option, stmt->options) diff --git a/pgxn/neon/libpagestore.c b/pgxn/neon/libpagestore.c index 3a7c0f1bb6..0eb1acbfb0 100644 --- a/pgxn/neon/libpagestore.c +++ b/pgxn/neon/libpagestore.c @@ -15,6 +15,7 @@ #include "postgres.h" #include "access/xlog.h" +#include "common/hashfn.h" #include "fmgr.h" #include "libpq-fe.h" #include "libpq/libpq.h" @@ -38,17 +39,6 @@ #define MIN_RECONNECT_INTERVAL_USEC 1000 #define MAX_RECONNECT_INTERVAL_USEC 1000000 -bool connected = false; -PGconn *pageserver_conn = NULL; - -/* - * WaitEventSet containing: - * - WL_SOCKET_READABLE on pageserver_conn, - * - WL_LATCH_SET on MyLatch, and - * - WL_EXIT_ON_PM_DEATH. - */ -WaitEventSet *pageserver_conn_wes = NULL; - /* GUCs */ char *neon_timeline; char *neon_tenant; @@ -59,16 +49,40 @@ char *neon_auth_token; int readahead_buffer_size = 128; int flush_every_n_requests = 8; -static int n_reconnect_attempts = 0; -static int max_reconnect_attempts = 60; - -#define MAX_PAGESERVER_CONNSTRING_SIZE 256 +static int n_reconnect_attempts = 0; +static int max_reconnect_attempts = 60; +static int stripe_size; typedef struct { - LWLockId lock; - pg_atomic_uint64 update_counter; - char pageserver_connstring[MAX_PAGESERVER_CONNSTRING_SIZE]; + char connstring[MAX_SHARDS][MAX_PAGESERVER_CONNSTRING_SIZE]; + size_t num_shards; +} ShardMap; + +/* + * PagestoreShmemState is kept in shared memory. It contains the connection + * strings for each shard. 
+ * + * The "neon.pageserver_connstring" GUC is marked with the PGC_SIGHUP option, + * allowing it to be changed using pg_reload_conf(). The control plane can + * update the connection string if the pageserver crashes, is relocated, or + * new shards are added. A parsed copy of the current value of the GUC is kept + * in shared memory, updated by the postmaster, because regular backends don't + * reload the config during query execution, but we might need to re-establish + * the pageserver connection with the new connection string even in the middle + * of a query. + * + * The shared memory copy is protected by a lockless algorithm using two + * atomic counters. The counters allow a backend to quickly check if the value + * has changed since last access, and to detect and retry copying the value if + * the postmaster changes the value concurrently. (Postmaster doesn't have a + * PGPROC entry and therefore cannot use LWLocks.) + */ +typedef struct +{ + pg_atomic_uint64 begin_update_counter; + pg_atomic_uint64 end_update_counter; + ShardMap shard_map; } PagestoreShmemState; #if PG_VERSION_NUM >= 150000 @@ -78,76 +92,242 @@ static void walproposer_shmem_request(void); static shmem_startup_hook_type prev_shmem_startup_hook; static PagestoreShmemState *pagestore_shared; static uint64 pagestore_local_counter = 0; -static char local_pageserver_connstring[MAX_PAGESERVER_CONNSTRING_SIZE]; -static bool pageserver_flush(void); -static void pageserver_disconnect(void); +/* This backend's per-shard connections */ +typedef struct +{ + PGconn *conn; + + /*--- + * WaitEventSet containing: + * - WL_SOCKET_READABLE on 'conn' + * - WL_LATCH_SET on MyLatch, and + * - WL_EXIT_ON_PM_DEATH. 
+ */ + WaitEventSet *wes; +} PageServer; + +static PageServer page_servers[MAX_SHARDS]; + +static bool pageserver_flush(shardno_t shard_no); +static void pageserver_disconnect(shardno_t shard_no); static bool -PagestoreShmemIsValid() +PagestoreShmemIsValid(void) { return pagestore_shared && UsedShmemSegAddr; } +/* + * Parse a comma-separated list of connection strings into a ShardMap. + * + * If 'result' is NULL, just checks that the input is valid. If the input is + * not valid, returns false. The contents of *result are undefined in + * that case, and must not be relied on. + */ +static bool +ParseShardMap(const char *connstr, ShardMap *result) +{ + const char *p; + int nshards = 0; + + if (result) + memset(result, 0, sizeof(ShardMap)); + + p = connstr; + nshards = 0; + for (;;) + { + const char *sep; + size_t connstr_len; + + sep = strchr(p, ','); + connstr_len = sep != NULL ? sep - p : strlen(p); + + if (connstr_len == 0 && sep == NULL) + break; /* ignore trailing comma */ + + if (nshards >= MAX_SHARDS) + { + neon_log(LOG, "Too many shards"); + return false; + } + if (connstr_len >= MAX_PAGESERVER_CONNSTRING_SIZE) + { + neon_log(LOG, "Connection string too long"); + return false; + } + if (result) + { + memcpy(result->connstring[nshards], p, connstr_len); + result->connstring[nshards][connstr_len] = '\0'; + } + nshards++; + + if (sep == NULL) + break; + p = sep + 1; + } + if (result) + result->num_shards = nshards; + + return true; +} + static bool CheckPageserverConnstring(char **newval, void **extra, GucSource source) { - return strlen(*newval) < MAX_PAGESERVER_CONNSTRING_SIZE; + char *p = *newval; + + return ParseShardMap(p, NULL); } static void AssignPageserverConnstring(const char *newval, void *extra) { - if (!PagestoreShmemIsValid()) + ShardMap shard_map; + + /* + * Only postmaster updates the copy in shared memory. 
+ */ + if (!PagestoreShmemIsValid() || IsUnderPostmaster) return; - LWLockAcquire(pagestore_shared->lock, LW_EXCLUSIVE); - strlcpy(pagestore_shared->pageserver_connstring, newval, MAX_PAGESERVER_CONNSTRING_SIZE); - pg_atomic_fetch_add_u64(&pagestore_shared->update_counter, 1); - LWLockRelease(pagestore_shared->lock); -} - -static bool -CheckConnstringUpdated() -{ - if (!PagestoreShmemIsValid()) - return false; - return pagestore_local_counter < pg_atomic_read_u64(&pagestore_shared->update_counter); + + if (!ParseShardMap(newval, &shard_map)) + { + /* + * shouldn't happen, because we already checked the value in + * CheckPageserverConnstring + */ + elog(ERROR, "could not parse shard map"); + } + + if (memcmp(&pagestore_shared->shard_map, &shard_map, sizeof(ShardMap)) != 0) + { + pg_atomic_add_fetch_u64(&pagestore_shared->begin_update_counter, 1); + pg_write_barrier(); + memcpy(&pagestore_shared->shard_map, &shard_map, sizeof(ShardMap)); + pg_write_barrier(); + pg_atomic_add_fetch_u64(&pagestore_shared->end_update_counter, 1); + } + else + { + /* no change */ + } } +/* + * Get the current number of shards, and/or the connection string for a + * particular shard from the shard map in shared memory. + * + * If num_shards_p is not NULL, it is set to the current number of shards. + * + * If connstr_p is not NULL, the connection string for 'shard_no' is copied to + * it. It must point to a buffer at least MAX_PAGESERVER_CONNSTRING_SIZE bytes + * long. + * + * As a side-effect, if the shard map in shared memory had changed since the + * last call, terminates all existing connections to all pageservers. 
+ */ static void -ReloadConnstring() +load_shard_map(shardno_t shard_no, char *connstr_p, shardno_t *num_shards_p) { - if (!PagestoreShmemIsValid()) - return; - LWLockAcquire(pagestore_shared->lock, LW_SHARED); - strlcpy(local_pageserver_connstring, pagestore_shared->pageserver_connstring, sizeof(local_pageserver_connstring)); - pagestore_local_counter = pg_atomic_read_u64(&pagestore_shared->update_counter); - LWLockRelease(pagestore_shared->lock); + uint64 begin_update_counter; + uint64 end_update_counter; + ShardMap *shard_map = &pagestore_shared->shard_map; + shardno_t num_shards; + + /* + * Postmaster can update the shared memory values concurrently, in which + * case we would copy a garbled mix of the old and new values. We will + * detect it because the counter's won't match, and retry. But it's + * important that we don't do anything within the retry-loop that would + * depend on the string having valid contents. + */ + do + { + begin_update_counter = pg_atomic_read_u64(&pagestore_shared->begin_update_counter); + end_update_counter = pg_atomic_read_u64(&pagestore_shared->end_update_counter); + + num_shards = shard_map->num_shards; + if (connstr_p && shard_no < MAX_SHARDS) + strlcpy(connstr_p, shard_map->connstring[shard_no], MAX_PAGESERVER_CONNSTRING_SIZE); + pg_memory_barrier(); + } + while (begin_update_counter != end_update_counter + || begin_update_counter != pg_atomic_read_u64(&pagestore_shared->begin_update_counter) + || end_update_counter != pg_atomic_read_u64(&pagestore_shared->end_update_counter)); + + if (connstr_p && shard_no >= num_shards) + neon_log(ERROR, "Shard %d is greater or equal than number of shards %d", + shard_no, num_shards); + + /* + * If any of the connection strings changed, reset all connections. 
+ */ + if (pagestore_local_counter != end_update_counter) + { + for (shardno_t i = 0; i < MAX_SHARDS; i++) + { + if (page_servers[i].conn) + pageserver_disconnect(i); + } + pagestore_local_counter = end_update_counter; + } + + if (num_shards_p) + *num_shards_p = num_shards; +} + +#define MB (1024*1024) + +shardno_t +get_shard_number(BufferTag *tag) +{ + shardno_t n_shards; + uint32 hash; + + load_shard_map(0, NULL, &n_shards); + +#if PG_MAJORVERSION_NUM < 16 + hash = murmurhash32(tag->rnode.relNode); + hash = hash_combine(hash, murmurhash32(tag->blockNum / stripe_size)); +#else + hash = murmurhash32(tag->relNumber); + hash = hash_combine(hash, murmurhash32(tag->blockNum / stripe_size)); +#endif + + return hash % n_shards; } static bool -pageserver_connect(int elevel) +pageserver_connect(shardno_t shard_no, int elevel) { char *query; int ret; const char *keywords[3]; const char *values[3]; int n; + PGconn *conn; + WaitEventSet *wes; + char connstr[MAX_PAGESERVER_CONNSTRING_SIZE]; static TimestampTz last_connect_time = 0; static uint64_t delay_us = MIN_RECONNECT_INTERVAL_USEC; TimestampTz now; - uint64_t us_since_last_connect; + uint64_t us_since_last_connect; - Assert(!connected); + Assert(page_servers[shard_no].conn == NULL); - if (CheckConnstringUpdated()) - { - ReloadConnstring(); - } + /* + * Get the connection string for this shard. If the shard map has been + * updated since we last looked, this will also disconnect any existing + * pageserver connections as a side effect. 
+ */ + load_shard_map(shard_no, connstr, NULL); now = GetCurrentTimestamp(); - us_since_last_connect = now - last_connect_time; + us_since_last_connect = now - last_connect_time; if (us_since_last_connect < delay_us) { pg_usleep(delay_us - us_since_last_connect); @@ -180,76 +360,84 @@ pageserver_connect(int elevel) n++; } keywords[n] = "dbname"; - values[n] = local_pageserver_connstring; + values[n] = connstr; n++; keywords[n] = NULL; values[n] = NULL; n++; - pageserver_conn = PQconnectdbParams(keywords, values, 1); + conn = PQconnectdbParams(keywords, values, 1); - if (PQstatus(pageserver_conn) == CONNECTION_BAD) + if (PQstatus(conn) == CONNECTION_BAD) { - char *msg = pchomp(PQerrorMessage(pageserver_conn)); + char *msg = pchomp(PQerrorMessage(conn)); - PQfinish(pageserver_conn); - pageserver_conn = NULL; + PQfinish(conn); ereport(elevel, (errcode(ERRCODE_SQLCLIENT_UNABLE_TO_ESTABLISH_SQLCONNECTION), - errmsg(NEON_TAG "could not establish connection to pageserver"), + errmsg(NEON_TAG "[shard %d] could not establish connection to pageserver", shard_no), errdetail_internal("%s", msg))); + pfree(msg); return false; } - query = psprintf("pagestream %s %s", neon_tenant, neon_timeline); - ret = PQsendQuery(pageserver_conn, query); + ret = PQsendQuery(conn, query); + pfree(query); if (ret != 1) { - PQfinish(pageserver_conn); - pageserver_conn = NULL; - neon_log(elevel, "could not send pagestream command to pageserver"); + PQfinish(conn); + neon_shard_log(shard_no, elevel, "could not send pagestream command to pageserver"); return false; } - pageserver_conn_wes = CreateWaitEventSet(TopMemoryContext, 3); - AddWaitEventToSet(pageserver_conn_wes, WL_LATCH_SET, PGINVALID_SOCKET, + wes = CreateWaitEventSet(TopMemoryContext, 3); + AddWaitEventToSet(wes, WL_LATCH_SET, PGINVALID_SOCKET, MyLatch, NULL); - AddWaitEventToSet(pageserver_conn_wes, WL_EXIT_ON_PM_DEATH, PGINVALID_SOCKET, + AddWaitEventToSet(wes, WL_EXIT_ON_PM_DEATH, PGINVALID_SOCKET, NULL, NULL); - 
AddWaitEventToSet(pageserver_conn_wes, WL_SOCKET_READABLE, PQsocket(pageserver_conn), NULL, NULL); + AddWaitEventToSet(wes, WL_SOCKET_READABLE, PQsocket(conn), NULL, NULL); - while (PQisBusy(pageserver_conn)) + PG_TRY(); { - WaitEvent event; - - /* Sleep until there's something to do */ - (void) WaitEventSetWait(pageserver_conn_wes, -1L, &event, 1, PG_WAIT_EXTENSION); - ResetLatch(MyLatch); - - CHECK_FOR_INTERRUPTS(); - - /* Data available in socket? */ - if (event.events & WL_SOCKET_READABLE) + while (PQisBusy(conn)) { - if (!PQconsumeInput(pageserver_conn)) + WaitEvent event; + + /* Sleep until there's something to do */ + (void) WaitEventSetWait(wes, -1L, &event, 1, PG_WAIT_EXTENSION); + ResetLatch(MyLatch); + + CHECK_FOR_INTERRUPTS(); + + /* Data available in socket? */ + if (event.events & WL_SOCKET_READABLE) { - char *msg = pchomp(PQerrorMessage(pageserver_conn)); + if (!PQconsumeInput(conn)) + { + char *msg = pchomp(PQerrorMessage(conn)); - PQfinish(pageserver_conn); - pageserver_conn = NULL; - FreeWaitEventSet(pageserver_conn_wes); - pageserver_conn_wes = NULL; + PQfinish(conn); + FreeWaitEventSet(wes); - neon_log(elevel, "could not complete handshake with pageserver: %s", - msg); - return false; + neon_shard_log(shard_no, elevel, "could not complete handshake with pageserver: %s", + msg); + return false; + } } } } + PG_CATCH(); + { + PQfinish(conn); + FreeWaitEventSet(wes); + PG_RE_THROW(); + } + PG_END_TRY(); - neon_log(LOG, "libpagestore: connected to '%s'", page_server_connstring); + neon_shard_log(shard_no, LOG, "libpagestore: connected to '%s'", connstr); + page_servers[shard_no].conn = conn; + page_servers[shard_no].wes = wes; - connected = true; return true; } @@ -257,9 +445,10 @@ pageserver_connect(int elevel) * A wrapper around PQgetCopyData that checks for interrupts while sleeping. 
*/ static int -call_PQgetCopyData(char **buffer) +call_PQgetCopyData(shardno_t shard_no, char **buffer) { int ret; + PGconn *pageserver_conn = page_servers[shard_no].conn; retry: ret = PQgetCopyData(pageserver_conn, buffer, 1 /* async */ ); @@ -269,7 +458,7 @@ retry: WaitEvent event; /* Sleep until there's something to do */ - (void) WaitEventSetWait(pageserver_conn_wes, -1L, &event, 1, PG_WAIT_EXTENSION); + (void) WaitEventSetWait(page_servers[shard_no].wes, -1L, &event, 1, PG_WAIT_EXTENSION); ResetLatch(MyLatch); CHECK_FOR_INTERRUPTS(); @@ -281,7 +470,7 @@ retry: { char *msg = pchomp(PQerrorMessage(pageserver_conn)); - neon_log(LOG, "could not get response from pageserver: %s", msg); + neon_shard_log(shard_no, LOG, "could not get response from pageserver: %s", msg); pfree(msg); return -1; } @@ -295,7 +484,7 @@ retry: static void -pageserver_disconnect(void) +pageserver_disconnect(shardno_t shard_no) { /* * If anything goes wrong while we were sending a request, it's not clear @@ -304,38 +493,38 @@ pageserver_disconnect(void) * time later after we have already sent a new unrelated request. Close * the connection to avoid getting confused. */ - if (connected) + if (page_servers[shard_no].conn) { - neon_log(LOG, "dropping connection to page server due to error"); - PQfinish(pageserver_conn); - pageserver_conn = NULL; - connected = false; + neon_shard_log(shard_no, LOG, "dropping connection to page server due to error"); + PQfinish(page_servers[shard_no].conn); + page_servers[shard_no].conn = NULL; + /* + * If the connection to any pageserver is lost, we throw away the + * whole prefetch queue, even for other pageservers. It should not + * cause big problems, because connection loss is supposed to be a + * rare event. 
+ */ prefetch_on_ps_disconnect(); } - if (pageserver_conn_wes != NULL) + if (page_servers[shard_no].wes != NULL) { - FreeWaitEventSet(pageserver_conn_wes); - pageserver_conn_wes = NULL; + FreeWaitEventSet(page_servers[shard_no].wes); + page_servers[shard_no].wes = NULL; } } static bool -pageserver_send(NeonRequest *request) +pageserver_send(shardno_t shard_no, NeonRequest *request) { StringInfoData req_buff; - - if (CheckConnstringUpdated()) - { - pageserver_disconnect(); - ReloadConnstring(); - } + PGconn *pageserver_conn = page_servers[shard_no].conn; /* If the connection was lost for some reason, reconnect */ - if (connected && PQstatus(pageserver_conn) == CONNECTION_BAD) + if (pageserver_conn && PQstatus(pageserver_conn) == CONNECTION_BAD) { - neon_log(LOG, "pageserver_send disconnect bad connection"); - pageserver_disconnect(); + neon_shard_log(shard_no, LOG, "pageserver_send disconnect bad connection"); + pageserver_disconnect(shard_no); } req_buff = nm_pack_request(request); @@ -349,9 +538,9 @@ pageserver_send(NeonRequest *request) * https://github.com/neondatabase/neon/issues/1138 So try to reestablish * connection in case of failure. */ - if (!connected) + if (!page_servers[shard_no].conn) { - while (!pageserver_connect(n_reconnect_attempts < max_reconnect_attempts ? LOG : ERROR)) + while (!pageserver_connect(shard_no, n_reconnect_attempts < max_reconnect_attempts ? LOG : ERROR)) { HandleMainLoopInterrupts(); n_reconnect_attempts += 1; @@ -359,6 +548,8 @@ pageserver_send(NeonRequest *request) n_reconnect_attempts = 0; } + pageserver_conn = page_servers[shard_no].conn; + /* * Send request. 
* @@ -371,8 +562,8 @@ pageserver_send(NeonRequest *request) { char *msg = pchomp(PQerrorMessage(pageserver_conn)); - pageserver_disconnect(); - neon_log(LOG, "pageserver_send disconnect because failed to send page request (try to reconnect): %s", msg); + pageserver_disconnect(shard_no); + neon_shard_log(shard_no, LOG, "pageserver_send disconnect because failed to send page request (try to reconnect): %s", msg); pfree(msg); pfree(req_buff.data); return false; @@ -384,19 +575,20 @@ pageserver_send(NeonRequest *request) { char *msg = nm_to_string((NeonMessage *) request); - neon_log(PageStoreTrace, "sent request: %s", msg); + neon_shard_log(shard_no, PageStoreTrace, "sent request: %s", msg); pfree(msg); } return true; } static NeonResponse * -pageserver_receive(void) +pageserver_receive(shardno_t shard_no) { StringInfoData resp_buff; NeonResponse *resp; + PGconn *pageserver_conn = page_servers[shard_no].conn; - if (!connected) + if (!pageserver_conn) return NULL; PG_TRY(); @@ -404,7 +596,7 @@ pageserver_receive(void) /* read response */ int rc; - rc = call_PQgetCopyData(&resp_buff.data); + rc = call_PQgetCopyData(shard_no, &resp_buff.data); if (rc >= 0) { resp_buff.len = rc; @@ -416,33 +608,33 @@ pageserver_receive(void) { char *msg = nm_to_string((NeonMessage *) resp); - neon_log(PageStoreTrace, "got response: %s", msg); + neon_shard_log(shard_no, PageStoreTrace, "got response: %s", msg); pfree(msg); } } else if (rc == -1) { - neon_log(LOG, "pageserver_receive disconnect because call_PQgetCopyData returns -1: %s", pchomp(PQerrorMessage(pageserver_conn))); - pageserver_disconnect(); + neon_shard_log(shard_no, LOG, "pageserver_receive disconnect because call_PQgetCopyData returns -1: %s", pchomp(PQerrorMessage(pageserver_conn))); + pageserver_disconnect(shard_no); resp = NULL; } else if (rc == -2) { char *msg = pchomp(PQerrorMessage(pageserver_conn)); - pageserver_disconnect(); - neon_log(ERROR, "pageserver_receive disconnect because could not read COPY data: %s", 
msg); + pageserver_disconnect(shard_no); + neon_shard_log(shard_no, ERROR, "pageserver_receive disconnect because could not read COPY data: %s", msg); } else { - pageserver_disconnect(); - neon_log(ERROR, "pageserver_receive disconnect because unexpected PQgetCopyData return value: %d", rc); + pageserver_disconnect(shard_no); + neon_shard_log(shard_no, ERROR, "pageserver_receive disconnect because unexpected PQgetCopyData return value: %d", rc); } } PG_CATCH(); { - neon_log(LOG, "pageserver_receive disconnect due to caught exception"); - pageserver_disconnect(); + neon_shard_log(shard_no, LOG, "pageserver_receive disconnect due to caught exception"); + pageserver_disconnect(shard_no); PG_RE_THROW(); } PG_END_TRY(); @@ -452,11 +644,13 @@ pageserver_receive(void) static bool -pageserver_flush(void) +pageserver_flush(shardno_t shard_no) { - if (!connected) + PGconn *pageserver_conn = page_servers[shard_no].conn; + + if (!pageserver_conn) { - neon_log(WARNING, "Tried to flush while disconnected"); + neon_shard_log(shard_no, WARNING, "Tried to flush while disconnected"); } else { @@ -464,8 +658,8 @@ pageserver_flush(void) { char *msg = pchomp(PQerrorMessage(pageserver_conn)); - pageserver_disconnect(); - neon_log(LOG, "pageserver_flush disconnect because failed to flush page requests: %s", msg); + pageserver_disconnect(shard_no); + neon_shard_log(shard_no, LOG, "pageserver_flush disconnect because failed to flush page requests: %s", msg); pfree(msg); return false; } @@ -505,8 +699,9 @@ PagestoreShmemInit(void) &found); if (!found) { - pagestore_shared->lock = &(GetNamedLWLockTranche("neon_libpagestore")->lock); - pg_atomic_init_u64(&pagestore_shared->update_counter, 0); + pg_atomic_init_u64(&pagestore_shared->begin_update_counter, 0); + pg_atomic_init_u64(&pagestore_shared->end_update_counter, 0); + memset(&pagestore_shared->shard_map, 0, sizeof(ShardMap)); AssignPageserverConnstring(page_server_connstring, NULL); } LWLockRelease(AddinShmemInitLock); @@ -531,7 +726,6 @@ 
pagestore_shmem_request(void) #endif RequestAddinShmemSpace(PagestoreShmemSize()); - RequestNamedLWLockTranche("neon_libpagestore", 1); } static void @@ -582,6 +776,15 @@ pg_init_libpagestore(void) 0, /* no flags required */ check_neon_id, NULL, NULL); + DefineCustomIntVariable("neon.stripe_size", + "sharding stripe size", + NULL, + &stripe_size, + 32768, 1, INT_MAX, + PGC_SIGHUP, + GUC_UNIT_BLOCKS, + NULL, NULL, NULL); + DefineCustomIntVariable("neon.max_cluster_size", "cluster size limit", NULL, diff --git a/pgxn/neon/pagestore_client.h b/pgxn/neon/pagestore_client.h index 3fcaab0bee..8c02f357bc 100644 --- a/pgxn/neon/pagestore_client.h +++ b/pgxn/neon/pagestore_client.h @@ -20,9 +20,13 @@ #include "lib/stringinfo.h" #include "libpq/pqformat.h" #include "storage/block.h" +#include "storage/buf_internals.h" #include "storage/smgr.h" #include "utils/memutils.h" +#define MAX_SHARDS 128 +#define MAX_PAGESERVER_CONNSTRING_SIZE 256 + typedef enum { /* pagestore_client -> pagestore */ @@ -51,6 +55,9 @@ typedef struct #define neon_log(tag, fmt, ...) ereport(tag, \ (errmsg(NEON_TAG fmt, ##__VA_ARGS__), \ errhidestmt(true), errhidecontext(true), errposition(0), internalerrposition(0))) +#define neon_shard_log(shard_no, tag, fmt, ...) 
ereport(tag, \ + (errmsg(NEON_TAG "[shard %d] " fmt, shard_no, ##__VA_ARGS__), \ + errhidestmt(true), errhidecontext(true), errposition(0), internalerrposition(0))) /* * supertype of all the Neon*Request structs below @@ -141,11 +148,13 @@ extern char *nm_to_string(NeonMessage *msg); * API */ +typedef unsigned shardno_t; + typedef struct { - bool (*send) (NeonRequest *request); - NeonResponse *(*receive) (void); - bool (*flush) (void); + bool (*send) (shardno_t shard_no, NeonRequest * request); + NeonResponse *(*receive) (shardno_t shard_no); + bool (*flush) (shardno_t shard_no); } page_server_api; extern void prefetch_on_ps_disconnect(void); @@ -159,6 +168,8 @@ extern char *neon_timeline; extern char *neon_tenant; extern int32 max_cluster_size; +extern shardno_t get_shard_number(BufferTag* tag); + extern const f_smgr *smgr_neon(BackendId backend, NRelFileInfo rinfo); extern void smgr_init_neon(void); extern void readahead_buffer_resize(int newsize, void *extra); diff --git a/pgxn/neon/pagestore_smgr.c b/pgxn/neon/pagestore_smgr.c index 0db093e5a7..1fa802e6f4 100644 --- a/pgxn/neon/pagestore_smgr.c +++ b/pgxn/neon/pagestore_smgr.c @@ -172,6 +172,7 @@ typedef struct PrefetchRequest XLogRecPtr actual_request_lsn; NeonResponse *response; /* may be null */ PrefetchStatus status; + shardno_t shard_no; uint64 my_ring_index; } PrefetchRequest; @@ -239,10 +240,17 @@ typedef struct PrefetchState * also unused */ /* the buffers */ - prfh_hash *prf_hash; + prfh_hash *prf_hash; + int max_shard_no; + /* Mark shards involved in prefetch */ + uint8 shard_bitmap[(MAX_SHARDS + 7)/8]; PrefetchRequest prf_buffer[]; /* prefetch buffers */ } PrefetchState; +#define BITMAP_ISSET(bm, bit) ((bm)[(bit) >> 3] & (1 << ((bit) & 7))) +#define BITMAP_SET(bm, bit) (bm)[(bit) >> 3] |= (1 << ((bit) & 7)) +#define BITMAP_CLR(bm, bit) (bm)[(bit) >> 3] &= ~(1 << ((bit) & 7)) + static PrefetchState *MyPState; #define GetPrfSlot(ring_index) ( \ @@ -327,6 +335,7 @@ compact_prefetch_buffers(void) 
Assert(target_slot->status == PRFS_UNUSED); target_slot->buftag = source_slot->buftag; + target_slot->shard_no = source_slot->shard_no; target_slot->status = source_slot->status; target_slot->response = source_slot->response; target_slot->effective_request_lsn = source_slot->effective_request_lsn; @@ -494,6 +503,23 @@ prefetch_cleanup_trailing_unused(void) } } + +static bool +prefetch_flush_requests(void) +{ + for (shardno_t shard_no = 0; shard_no < MyPState->max_shard_no; shard_no++) + { + if (BITMAP_ISSET(MyPState->shard_bitmap, shard_no)) + { + if (!page_server->flush(shard_no)) + return false; + BITMAP_CLR(MyPState->shard_bitmap, shard_no); + } + } + MyPState->max_shard_no = 0; + return true; +} + /* * Wait for slot of ring_index to have received its response. * The caller is responsible for making sure the request buffer is flushed. @@ -509,7 +535,7 @@ prefetch_wait_for(uint64 ring_index) if (MyPState->ring_flush <= ring_index && MyPState->ring_unused > MyPState->ring_flush) { - if (!page_server->flush()) + if (!prefetch_flush_requests()) return false; MyPState->ring_flush = MyPState->ring_unused; } @@ -547,7 +573,7 @@ prefetch_read(PrefetchRequest *slot) Assert(slot->my_ring_index == MyPState->ring_receive); old = MemoryContextSwitchTo(MyPState->errctx); - response = (NeonResponse *) page_server->receive(); + response = (NeonResponse *) page_server->receive(slot->shard_no); MemoryContextSwitchTo(old); if (response) { @@ -704,12 +730,14 @@ prefetch_do_request(PrefetchRequest *slot, bool *force_latest, XLogRecPtr *force Assert(slot->response == NULL); Assert(slot->my_ring_index == MyPState->ring_unused); - while (!page_server->send((NeonRequest *) &request)); + while (!page_server->send(slot->shard_no, (NeonRequest *) &request)); /* update prefetch state */ MyPState->n_requests_inflight += 1; MyPState->n_unused -= 1; MyPState->ring_unused += 1; + BITMAP_SET(MyPState->shard_bitmap, slot->shard_no); + MyPState->max_shard_no = Max(slot->shard_no+1, 
MyPState->max_shard_no); /* update slot state */ slot->status = PRFS_REQUESTED; @@ -880,6 +908,7 @@ Retry: * function reads the buffer tag from the slot. */ slot->buftag = tag; + slot->shard_no = get_shard_number(&tag); slot->my_ring_index = ring_index; prefetch_do_request(slot, force_latest, force_lsn); @@ -890,7 +919,7 @@ Retry: if (flush_every_n_requests > 0 && MyPState->ring_unused - MyPState->ring_flush >= flush_every_n_requests) { - if (!page_server->flush()) + if (!prefetch_flush_requests()) { /* * Prefetch set is reset in case of error, so we should try to @@ -908,13 +937,44 @@ static NeonResponse * page_server_request(void const *req) { NeonResponse *resp; + BufferTag tag = {0}; + shardno_t shard_no; + + switch (((NeonRequest *) req)->tag) + { + case T_NeonExistsRequest: + CopyNRelFileInfoToBufTag(tag, ((NeonExistsRequest *) req)->rinfo); + break; + case T_NeonNblocksRequest: + CopyNRelFileInfoToBufTag(tag, ((NeonNblocksRequest *) req)->rinfo); + break; + case T_NeonDbSizeRequest: + NInfoGetDbOid(BufTagGetNRelFileInfo(tag)) = ((NeonDbSizeRequest *) req)->dbNode; + break; + case T_NeonGetPageRequest: + CopyNRelFileInfoToBufTag(tag, ((NeonGetPageRequest *) req)->rinfo); + tag.blockNum = ((NeonGetPageRequest *) req)->blkno; + break; + default: + neon_log(ERROR, "Unexpected request tag: %d", ((NeonRequest *) req)->tag); + } + shard_no = get_shard_number(&tag); + + + /* + * Current sharding model assumes that all metadata is present only at shard 0. + * We still need to call get_shard_no() to check if shard map is up-to-date. 
+ */ + if (((NeonRequest *) req)->tag != T_NeonGetPageRequest || ((NeonGetPageRequest *) req)->forknum != MAIN_FORKNUM) + { + shard_no = 0; + } do { - while (!page_server->send((NeonRequest *) req) || !page_server->flush()); - MyPState->ring_flush = MyPState->ring_unused; + while (!page_server->send(shard_no, (NeonRequest *) req) || !page_server->flush(shard_no)); consume_prefetch_responses(); - resp = page_server->receive(); + resp = page_server->receive(shard_no); } while (resp == NULL); return resp; @@ -2098,8 +2158,8 @@ neon_read_at_lsn(NRelFileInfo rinfo, ForkNumber forkNum, BlockNumber blkno, case T_NeonErrorResponse: ereport(ERROR, (errcode(ERRCODE_IO_ERROR), - errmsg(NEON_TAG "could not read block %u in rel %u/%u/%u.%u from page server at lsn %X/%08X", - blkno, + errmsg(NEON_TAG "[shard %d] could not read block %u in rel %u/%u/%u.%u from page server at lsn %X/%08X", + slot->shard_no, blkno, RelFileInfoFmt(rinfo), forkNum, (uint32) (request_lsn >> 32), (uint32) request_lsn), diff --git a/proxy/src/auth.rs b/proxy/src/auth.rs index 0707c1331f..8d1b861a66 100644 --- a/proxy/src/auth.rs +++ b/proxy/src/auth.rs @@ -4,7 +4,9 @@ pub mod backend; pub use backend::BackendType; mod credentials; -pub use credentials::{check_peer_addr_is_in_list, endpoint_sni, ComputeUserInfoMaybeEndpoint}; +pub use credentials::{ + check_peer_addr_is_in_list, endpoint_sni, ComputeUserInfoMaybeEndpoint, IpPattern, +}; mod password_hack; pub use password_hack::parse_endpoint_param; diff --git a/proxy/src/auth/backend.rs b/proxy/src/auth/backend.rs index 34171d4d3f..b1634906c9 100644 --- a/proxy/src/auth/backend.rs +++ b/proxy/src/auth/backend.rs @@ -3,7 +3,6 @@ mod hacks; mod link; pub use link::LinkAuthError; -use smol_str::SmolStr; use tokio_postgres::config::AuthKeys; use crate::auth::credentials::check_peer_addr_is_in_list; @@ -16,7 +15,6 @@ use crate::context::RequestMonitoring; use crate::proxy::connect_compute::handle_try_wake; use crate::proxy::retry::retry_after; use 
crate::proxy::NeonOptions; -use crate::scram; use crate::stream::Stream; use crate::{ auth::{self, ComputeUserInfoMaybeEndpoint}, @@ -28,6 +26,7 @@ use crate::{ }, stream, url, }; +use crate::{scram, EndpointCacheKey, EndpointId, RoleName}; use futures::TryFutureExt; use std::borrow::Cow; use std::ops::ControlFlow; @@ -35,6 +34,8 @@ use std::sync::Arc; use tokio::io::{AsyncRead, AsyncWrite}; use tracing::{error, info, warn}; +use super::IpPattern; + /// This type serves two purposes: /// /// * When `T` is `()`, it's just a regular auth backend selector @@ -55,7 +56,7 @@ pub enum BackendType<'a, T> { pub trait TestBackend: Send + Sync + 'static { fn wake_compute(&self) -> Result; - fn get_allowed_ips(&self) -> Result, console::errors::GetAuthInfoError>; + fn get_allowed_ips(&self) -> Result, console::errors::GetAuthInfoError>; } impl std::fmt::Display for BackendType<'_, ()> { @@ -128,19 +129,19 @@ pub struct ComputeCredentials { #[derive(Debug, Clone)] pub struct ComputeUserInfoNoEndpoint { - pub user: SmolStr, + pub user: RoleName, pub options: NeonOptions, } #[derive(Debug, Clone)] pub struct ComputeUserInfo { - pub endpoint: SmolStr, - pub user: SmolStr, + pub endpoint: EndpointId, + pub user: RoleName, pub options: NeonOptions, } impl ComputeUserInfo { - pub fn endpoint_cache_key(&self) -> SmolStr { + pub fn endpoint_cache_key(&self) -> EndpointCacheKey { self.options.get_cache_key(&self.endpoint) } } @@ -156,7 +157,7 @@ impl TryFrom for ComputeUserInfo { type Error = ComputeUserInfoNoEndpoint; fn try_from(user_info: ComputeUserInfoMaybeEndpoint) -> Result { - match user_info.project { + match user_info.endpoint_id { None => Err(ComputeUserInfoNoEndpoint { user: user_info.user, options: user_info.options, @@ -202,21 +203,18 @@ async fn auth_quirks( if !check_peer_addr_is_in_list(&ctx.peer_addr, &allowed_ips) { return Err(auth::AuthError::ip_address_not_allowed()); } - let maybe_secret = api.get_role_secret(ctx, &info).await?; + let cached_secret = 
api.get_role_secret(ctx, &info).await?; - let cached_secret = maybe_secret.unwrap_or_else(|| { + let secret = cached_secret.value.clone().unwrap_or_else(|| { // If we don't have an authentication secret, we mock one to // prevent malicious probing (possible due to missing protocol steps). // This mocked secret will never lead to successful authentication. info!("authentication info not found, mocking it"); - Cached::new_uncached(AuthSecret::Scram(scram::ServerSecret::mock( - &info.user, - rand::random(), - ))) + AuthSecret::Scram(scram::ServerSecret::mock(&info.user, rand::random())) }); match authenticate_with_secret( ctx, - cached_secret.value.clone(), + secret, info, client, unauthenticated_password, @@ -318,11 +316,11 @@ async fn auth_and_wake_compute( impl<'a> BackendType<'a, ComputeUserInfoMaybeEndpoint> { /// Get compute endpoint name from the credentials. - pub fn get_endpoint(&self) -> Option { + pub fn get_endpoint(&self) -> Option { use BackendType::*; match self { - Console(_, user_info) => user_info.project.clone(), + Console(_, user_info) => user_info.endpoint_id.clone(), Link(_) => Some("link".into()), #[cfg(test)] Test(_) => Some("test".into()), @@ -356,7 +354,7 @@ impl<'a> BackendType<'a, ComputeUserInfoMaybeEndpoint> { Console(api, user_info) => { info!( user = &*user_info.user, - project = user_info.project(), + project = user_info.endpoint(), "performing authentication using the console" ); diff --git a/proxy/src/auth/credentials.rs b/proxy/src/auth/credentials.rs index ada7f3614c..5bf7667a1f 100644 --- a/proxy/src/auth/credentials.rs +++ b/proxy/src/auth/credentials.rs @@ -2,12 +2,13 @@ use crate::{ auth::password_hack::parse_endpoint_param, context::RequestMonitoring, error::UserFacingError, - metrics::NUM_CONNECTION_ACCEPTED_BY_SNI, proxy::NeonOptions, + metrics::NUM_CONNECTION_ACCEPTED_BY_SNI, proxy::NeonOptions, serverless::SERVERLESS_DRIVER_SNI, + EndpointId, RoleName, }; use itertools::Itertools; use pq_proto::StartupMessageParams; use 
smol_str::SmolStr; -use std::{collections::HashSet, net::IpAddr}; +use std::{collections::HashSet, net::IpAddr, str::FromStr}; use thiserror::Error; use tracing::{info, warn}; @@ -21,7 +22,10 @@ pub enum ComputeUserInfoParseError { SNI ('{}') and project option ('{}').", .domain, .option, )] - InconsistentProjectNames { domain: SmolStr, option: SmolStr }, + InconsistentProjectNames { + domain: EndpointId, + option: EndpointId, + }, #[error( "Common name inferred from SNI ('{}') is not known", @@ -30,7 +34,7 @@ pub enum ComputeUserInfoParseError { UnknownCommonName { cn: String }, #[error("Project name ('{0}') must contain only alphanumeric characters and hyphen.")] - MalformedProjectName(SmolStr), + MalformedProjectName(EndpointId), } impl UserFacingError for ComputeUserInfoParseError {} @@ -39,24 +43,22 @@ impl UserFacingError for ComputeUserInfoParseError {} /// Note that we don't store any kind of client key or password here. #[derive(Debug, Clone, PartialEq, Eq)] pub struct ComputeUserInfoMaybeEndpoint { - pub user: SmolStr, - // TODO: this is a severe misnomer! We should think of a new name ASAP. 
- pub project: Option, - + pub user: RoleName, + pub endpoint_id: Option, pub options: NeonOptions, } impl ComputeUserInfoMaybeEndpoint { #[inline] - pub fn project(&self) -> Option<&str> { - self.project.as_deref() + pub fn endpoint(&self) -> Option<&str> { + self.endpoint_id.as_deref() } } -pub fn endpoint_sni<'a>( - sni: &'a str, +pub fn endpoint_sni( + sni: &str, common_names: &HashSet, -) -> Result<&'a str, ComputeUserInfoParseError> { +) -> Result, ComputeUserInfoParseError> { let Some((subdomain, common_name)) = sni.split_once('.') else { return Err(ComputeUserInfoParseError::UnknownCommonName { cn: sni.into() }); }; @@ -65,7 +67,10 @@ pub fn endpoint_sni<'a>( cn: common_name.into(), }); } - Ok(subdomain) + if subdomain == SERVERLESS_DRIVER_SNI { + return Ok(None); + } + Ok(Some(EndpointId::from(subdomain))) } impl ComputeUserInfoMaybeEndpoint { @@ -79,15 +84,14 @@ impl ComputeUserInfoMaybeEndpoint { // Some parameters are stored in the startup message. let get_param = |key| params.get(key).ok_or(MissingKey(key)); - let user: SmolStr = get_param("user")?.into(); + let user: RoleName = get_param("user")?.into(); // record the values if we have them ctx.set_application(params.get("application_name").map(SmolStr::from)); ctx.set_user(user.clone()); - ctx.set_endpoint_id(sni.map(SmolStr::from)); // Project name might be passed via PG's command-line options. - let project_option = params + let endpoint_option = params .options_raw() .and_then(|options| { // We support both `project` (deprecated) and `endpoint` options for backward compatibility. @@ -100,9 +104,9 @@ impl ComputeUserInfoMaybeEndpoint { }) .map(|name| name.into()); - let project_from_domain = if let Some(sni_str) = sni { + let endpoint_from_domain = if let Some(sni_str) = sni { if let Some(cn) = common_names { - Some(SmolStr::from(endpoint_sni(sni_str, cn)?)) + endpoint_sni(sni_str, cn)? 
} else { None } @@ -110,26 +114,27 @@ impl ComputeUserInfoMaybeEndpoint { None }; - let project = match (project_option, project_from_domain) { + let endpoint = match (endpoint_option, endpoint_from_domain) { // Invariant: if we have both project name variants, they should match. (Some(option), Some(domain)) if option != domain => { Some(Err(InconsistentProjectNames { domain, option })) } // Invariant: project name may not contain certain characters. - (a, b) => a.or(b).map(|name| match project_name_valid(&name) { + (a, b) => a.or(b).map(|name| match project_name_valid(name.as_ref()) { false => Err(MalformedProjectName(name)), true => Ok(name), }), } .transpose()?; + ctx.set_endpoint_id(endpoint.clone()); - info!(%user, project = project.as_deref(), "credentials"); + info!(%user, project = endpoint.as_deref(), "credentials"); if sni.is_some() { info!("Connection with sni"); NUM_CONNECTION_ACCEPTED_BY_SNI .with_label_values(&["sni"]) .inc(); - } else if project.is_some() { + } else if endpoint.is_some() { NUM_CONNECTION_ACCEPTED_BY_SNI .with_label_values(&["no_sni"]) .inc(); @@ -145,36 +150,57 @@ impl ComputeUserInfoMaybeEndpoint { Ok(Self { user, - project, + endpoint_id: endpoint.map(EndpointId::from), options, }) } } -pub fn check_peer_addr_is_in_list(peer_addr: &IpAddr, ip_list: &Vec) -> bool { - if ip_list.is_empty() { - return true; - } - for ip in ip_list { - // We expect that all ip addresses from control plane are correct. - // However, if some of them are broken, we still can check the others. 
- match parse_ip_pattern(ip) { - Ok(pattern) => { - if check_ip(peer_addr, &pattern) { - return true; - } - } - Err(err) => warn!("Cannot parse ip: {}; err: {}", ip, err), - } - } - false +pub fn check_peer_addr_is_in_list(peer_addr: &IpAddr, ip_list: &[IpPattern]) -> bool { + ip_list.is_empty() || ip_list.iter().any(|pattern| check_ip(peer_addr, pattern)) } #[derive(Debug, Clone, Eq, PartialEq)] -enum IpPattern { +pub enum IpPattern { Subnet(ipnet::IpNet), Range(IpAddr, IpAddr), Single(IpAddr), + None, +} + +impl<'de> serde::de::Deserialize<'de> for IpPattern { + fn deserialize(deserializer: D) -> Result + where + D: serde::Deserializer<'de>, + { + struct StrVisitor; + impl<'de> serde::de::Visitor<'de> for StrVisitor { + type Value = IpPattern; + + fn expecting(&self, formatter: &mut std::fmt::Formatter) -> std::fmt::Result { + write!(formatter, "comma separated list with ip address, ip address range, or ip address subnet mask") + } + + fn visit_str(self, v: &str) -> Result + where + E: serde::de::Error, + { + Ok(parse_ip_pattern(v).unwrap_or_else(|e| { + warn!("Cannot parse ip pattern {v}: {e}"); + IpPattern::None + })) + } + } + deserializer.deserialize_str(StrVisitor) + } +} + +impl FromStr for IpPattern { + type Err = anyhow::Error; + + fn from_str(s: &str) -> Result { + parse_ip_pattern(s) + } } fn parse_ip_pattern(pattern: &str) -> anyhow::Result { @@ -196,6 +222,7 @@ fn check_ip(ip: &IpAddr, pattern: &IpPattern) -> bool { IpPattern::Subnet(subnet) => subnet.contains(ip), IpPattern::Range(start, end) => start <= ip && ip <= end, IpPattern::Single(addr) => addr == ip, + IpPattern::None => false, } } @@ -206,6 +233,7 @@ fn project_name_valid(name: &str) -> bool { #[cfg(test)] mod tests { use super::*; + use serde_json::json; use ComputeUserInfoParseError::*; #[test] @@ -215,7 +243,7 @@ mod tests { let mut ctx = RequestMonitoring::test(); let user_info = ComputeUserInfoMaybeEndpoint::parse(&mut ctx, &options, None, None)?; assert_eq!(user_info.user, 
"john_doe"); - assert_eq!(user_info.project, None); + assert_eq!(user_info.endpoint_id, None); Ok(()) } @@ -230,7 +258,7 @@ mod tests { let mut ctx = RequestMonitoring::test(); let user_info = ComputeUserInfoMaybeEndpoint::parse(&mut ctx, &options, None, None)?; assert_eq!(user_info.user, "john_doe"); - assert_eq!(user_info.project, None); + assert_eq!(user_info.endpoint_id, None); Ok(()) } @@ -246,7 +274,7 @@ mod tests { let user_info = ComputeUserInfoMaybeEndpoint::parse(&mut ctx, &options, sni, common_names.as_ref())?; assert_eq!(user_info.user, "john_doe"); - assert_eq!(user_info.project.as_deref(), Some("foo")); + assert_eq!(user_info.endpoint_id.as_deref(), Some("foo")); assert_eq!(user_info.options.get_cache_key("foo"), "foo"); Ok(()) @@ -262,7 +290,7 @@ mod tests { let mut ctx = RequestMonitoring::test(); let user_info = ComputeUserInfoMaybeEndpoint::parse(&mut ctx, &options, None, None)?; assert_eq!(user_info.user, "john_doe"); - assert_eq!(user_info.project.as_deref(), Some("bar")); + assert_eq!(user_info.endpoint_id.as_deref(), Some("bar")); Ok(()) } @@ -277,7 +305,7 @@ mod tests { let mut ctx = RequestMonitoring::test(); let user_info = ComputeUserInfoMaybeEndpoint::parse(&mut ctx, &options, None, None)?; assert_eq!(user_info.user, "john_doe"); - assert_eq!(user_info.project.as_deref(), Some("bar")); + assert_eq!(user_info.endpoint_id.as_deref(), Some("bar")); Ok(()) } @@ -295,7 +323,7 @@ mod tests { let mut ctx = RequestMonitoring::test(); let user_info = ComputeUserInfoMaybeEndpoint::parse(&mut ctx, &options, None, None)?; assert_eq!(user_info.user, "john_doe"); - assert!(user_info.project.is_none()); + assert!(user_info.endpoint_id.is_none()); Ok(()) } @@ -310,7 +338,7 @@ mod tests { let mut ctx = RequestMonitoring::test(); let user_info = ComputeUserInfoMaybeEndpoint::parse(&mut ctx, &options, None, None)?; assert_eq!(user_info.user, "john_doe"); - assert!(user_info.project.is_none()); + assert!(user_info.endpoint_id.is_none()); Ok(()) } @@ -326,7 
+354,7 @@ mod tests { let user_info = ComputeUserInfoMaybeEndpoint::parse(&mut ctx, &options, sni, common_names.as_ref())?; assert_eq!(user_info.user, "john_doe"); - assert_eq!(user_info.project.as_deref(), Some("baz")); + assert_eq!(user_info.endpoint_id.as_deref(), Some("baz")); Ok(()) } @@ -340,14 +368,14 @@ mod tests { let mut ctx = RequestMonitoring::test(); let user_info = ComputeUserInfoMaybeEndpoint::parse(&mut ctx, &options, sni, common_names.as_ref())?; - assert_eq!(user_info.project.as_deref(), Some("p1")); + assert_eq!(user_info.endpoint_id.as_deref(), Some("p1")); let common_names = Some(["a.com".into(), "b.com".into()].into()); let sni = Some("p1.b.com"); let mut ctx = RequestMonitoring::test(); let user_info = ComputeUserInfoMaybeEndpoint::parse(&mut ctx, &options, sni, common_names.as_ref())?; - assert_eq!(user_info.project.as_deref(), Some("p1")); + assert_eq!(user_info.endpoint_id.as_deref(), Some("p1")); Ok(()) } @@ -404,7 +432,7 @@ mod tests { let mut ctx = RequestMonitoring::test(); let user_info = ComputeUserInfoMaybeEndpoint::parse(&mut ctx, &options, sni, common_names.as_ref())?; - assert_eq!(user_info.project.as_deref(), Some("project")); + assert_eq!(user_info.endpoint_id.as_deref(), Some("project")); assert_eq!( user_info.options.get_cache_key("project"), "project endpoint_type:read_write lsn:0/2" @@ -415,21 +443,17 @@ mod tests { #[test] fn test_check_peer_addr_is_in_list() { - let peer_addr = IpAddr::from([127, 0, 0, 1]); - assert!(check_peer_addr_is_in_list(&peer_addr, &vec![])); - assert!(check_peer_addr_is_in_list( - &peer_addr, - &vec!["127.0.0.1".into()] - )); - assert!(!check_peer_addr_is_in_list( - &peer_addr, - &vec!["8.8.8.8".into()] - )); + fn check(v: serde_json::Value) -> bool { + let peer_addr = IpAddr::from([127, 0, 0, 1]); + let ip_list: Vec = serde_json::from_value(v).unwrap(); + check_peer_addr_is_in_list(&peer_addr, &ip_list) + } + + assert!(check(json!([]))); + assert!(check(json!(["127.0.0.1"]))); + 
assert!(!check(json!(["8.8.8.8"]))); // If there is an incorrect address, it will be skipped. - assert!(check_peer_addr_is_in_list( - &peer_addr, - &vec!["88.8.8".into(), "127.0.0.1".into()] - )); + assert!(check(json!(["88.8.8", "127.0.0.1"]))); } #[test] fn test_parse_ip_v4() -> anyhow::Result<()> { diff --git a/proxy/src/auth/password_hack.rs b/proxy/src/auth/password_hack.rs index 372b0764ee..2ddf46fe25 100644 --- a/proxy/src/auth/password_hack.rs +++ b/proxy/src/auth/password_hack.rs @@ -4,10 +4,11 @@ //! UPDATE (Mon Aug 8 13:20:34 UTC 2022): the payload format has been simplified. use bstr::ByteSlice; -use smol_str::SmolStr; + +use crate::EndpointId; pub struct PasswordHackPayload { - pub endpoint: SmolStr, + pub endpoint: EndpointId, pub password: Vec, } diff --git a/proxy/src/cache/project_info.rs b/proxy/src/cache/project_info.rs index 57d9e5289d..6f37868a8c 100644 --- a/proxy/src/cache/project_info.rs +++ b/proxy/src/cache/project_info.rs @@ -11,13 +11,16 @@ use smol_str::SmolStr; use tokio::time::Instant; use tracing::{debug, info}; -use crate::{config::ProjectInfoCacheOptions, console::AuthSecret}; +use crate::{ + auth::IpPattern, config::ProjectInfoCacheOptions, console::AuthSecret, EndpointId, ProjectId, + RoleName, +}; use super::{Cache, Cached}; pub trait ProjectInfoCache { - fn invalidate_allowed_ips_for_project(&self, project_id: &SmolStr); - fn invalidate_role_secret_for_project(&self, project_id: &SmolStr, role_name: &SmolStr); + fn invalidate_allowed_ips_for_project(&self, project_id: &ProjectId); + fn invalidate_role_secret_for_project(&self, project_id: &ProjectId, role_name: &RoleName); fn enable_ttl(&self); fn disable_ttl(&self); } @@ -44,8 +47,8 @@ impl From for Entry { #[derive(Default)] struct EndpointInfo { - secret: std::collections::HashMap>, - allowed_ips: Option>>>, + secret: std::collections::HashMap>>, + allowed_ips: Option>>>, } impl EndpointInfo { @@ -57,10 +60,10 @@ impl EndpointInfo { } pub fn get_role_secret( &self, - 
role_name: &SmolStr, + role_name: &RoleName, valid_since: Instant, ignore_cache_since: Option, - ) -> Option<(AuthSecret, bool)> { + ) -> Option<(Option, bool)> { if let Some(secret) = self.secret.get(role_name) { if valid_since < secret.created_at { return Some(( @@ -76,7 +79,7 @@ impl EndpointInfo { &self, valid_since: Instant, ignore_cache_since: Option, - ) -> Option<(Arc>, bool)> { + ) -> Option<(Arc>, bool)> { if let Some(allowed_ips) = &self.allowed_ips { if valid_since < allowed_ips.created_at { return Some(( @@ -90,7 +93,7 @@ impl EndpointInfo { pub fn invalidate_allowed_ips(&mut self) { self.allowed_ips = None; } - pub fn invalidate_role_secret(&mut self, role_name: &SmolStr) { + pub fn invalidate_role_secret(&mut self, role_name: &RoleName) { self.secret.remove(role_name); } } @@ -103,9 +106,9 @@ impl EndpointInfo { /// One may ask, why the data is stored per project, when on the user request there is only data about the endpoint available? /// On the cplane side updates are done per project (or per branch), so it's easier to invalidate the whole project cache. 
pub struct ProjectInfoCacheImpl { - cache: DashMap, + cache: DashMap, - project2ep: DashMap>, + project2ep: DashMap>, config: ProjectInfoCacheOptions, start_time: Instant, @@ -113,7 +116,7 @@ pub struct ProjectInfoCacheImpl { } impl ProjectInfoCache for ProjectInfoCacheImpl { - fn invalidate_allowed_ips_for_project(&self, project_id: &SmolStr) { + fn invalidate_allowed_ips_for_project(&self, project_id: &ProjectId) { info!("invalidating allowed ips for project `{}`", project_id); let endpoints = self .project2ep @@ -126,7 +129,7 @@ impl ProjectInfoCache for ProjectInfoCacheImpl { } } } - fn invalidate_role_secret_for_project(&self, project_id: &SmolStr, role_name: &SmolStr) { + fn invalidate_role_secret_for_project(&self, project_id: &ProjectId, role_name: &RoleName) { info!( "invalidating role secret for project_id `{}` and role_name `{}`", project_id, role_name @@ -167,9 +170,9 @@ impl ProjectInfoCacheImpl { pub fn get_role_secret( &self, - endpoint_id: &SmolStr, - role_name: &SmolStr, - ) -> Option> { + endpoint_id: &EndpointId, + role_name: &RoleName, + ) -> Option>> { let (valid_since, ignore_cache_since) = self.get_cache_times(); let endpoint_info = self.cache.get(endpoint_id)?; let (value, ignore_cache) = @@ -188,8 +191,8 @@ impl ProjectInfoCacheImpl { } pub fn get_allowed_ips( &self, - endpoint_id: &SmolStr, - ) -> Option>>> { + endpoint_id: &EndpointId, + ) -> Option>>> { let (valid_since, ignore_cache_since) = self.get_cache_times(); let endpoint_info = self.cache.get(endpoint_id)?; let value = endpoint_info.get_allowed_ips(valid_since, ignore_cache_since); @@ -205,10 +208,10 @@ impl ProjectInfoCacheImpl { } pub fn insert_role_secret( &self, - project_id: &SmolStr, - endpoint_id: &SmolStr, - role_name: &SmolStr, - secret: AuthSecret, + project_id: &ProjectId, + endpoint_id: &EndpointId, + role_name: &RoleName, + secret: Option, ) { if self.cache.len() >= self.config.size { // If there are too many entries, wait until the next gc cycle. 
@@ -222,9 +225,9 @@ impl ProjectInfoCacheImpl { } pub fn insert_allowed_ips( &self, - project_id: &SmolStr, - endpoint_id: &SmolStr, - allowed_ips: Arc>, + project_id: &ProjectId, + endpoint_id: &EndpointId, + allowed_ips: Arc>, ) { if self.cache.len() >= self.config.size { // If there are too many entries, wait until the next gc cycle. @@ -236,7 +239,7 @@ impl ProjectInfoCacheImpl { .or_default() .allowed_ips = Some(allowed_ips.into()); } - fn inser_project2endpoint(&self, project_id: &SmolStr, endpoint_id: &SmolStr) { + fn inser_project2endpoint(&self, project_id: &ProjectId, endpoint_id: &EndpointId) { if let Some(mut endpoints) = self.project2ep.get_mut(project_id) { endpoints.insert(endpoint_id.clone()); } else { @@ -297,18 +300,18 @@ impl ProjectInfoCacheImpl { /// This is used to invalidate cache entries. pub struct CachedLookupInfo { /// Search by this key. - endpoint_id: SmolStr, + endpoint_id: EndpointId, lookup_type: LookupType, } impl CachedLookupInfo { - pub(self) fn new_role_secret(endpoint_id: SmolStr, role_name: SmolStr) -> Self { + pub(self) fn new_role_secret(endpoint_id: EndpointId, role_name: RoleName) -> Self { Self { endpoint_id, lookup_type: LookupType::RoleSecret(role_name), } } - pub(self) fn new_allowed_ips(endpoint_id: SmolStr) -> Self { + pub(self) fn new_allowed_ips(endpoint_id: EndpointId) -> Self { Self { endpoint_id, lookup_type: LookupType::AllowedIps, @@ -317,7 +320,7 @@ impl CachedLookupInfo { } enum LookupType { - RoleSecret(SmolStr), + RoleSecret(RoleName), AllowedIps, } @@ -348,7 +351,6 @@ impl Cache for ProjectInfoCacheImpl { mod tests { use super::*; use crate::{console::AuthSecret, scram::ServerSecret}; - use smol_str::SmolStr; use std::{sync::Arc, time::Duration}; #[tokio::test] @@ -362,11 +364,17 @@ mod tests { }); let project_id = "project".into(); let endpoint_id = "endpoint".into(); - let user1: SmolStr = "user1".into(); - let user2: SmolStr = "user2".into(); - let secret1 = 
AuthSecret::Scram(ServerSecret::mock(user1.as_str(), [1; 32])); - let secret2 = AuthSecret::Scram(ServerSecret::mock(user2.as_str(), [2; 32])); - let allowed_ips = Arc::new(vec!["allowed_ip1".into(), "allowed_ip2".into()]); + let user1: RoleName = "user1".into(); + let user2: RoleName = "user2".into(); + let secret1 = Some(AuthSecret::Scram(ServerSecret::mock( + user1.as_str(), + [1; 32], + ))); + let secret2 = None; + let allowed_ips = Arc::new(vec![ + "127.0.0.1".parse().unwrap(), + "127.0.0.2".parse().unwrap(), + ]); cache.insert_role_secret(&project_id, &endpoint_id, &user1, secret1.clone()); cache.insert_role_secret(&project_id, &endpoint_id, &user2, secret2.clone()); cache.insert_allowed_ips(&project_id, &endpoint_id, allowed_ips.clone()); @@ -379,8 +387,11 @@ mod tests { assert_eq!(cached.value, secret2); // Shouldn't add more than 2 roles. - let user3: SmolStr = "user3".into(); - let secret3 = AuthSecret::Scram(ServerSecret::mock(user3.as_str(), [3; 32])); + let user3: RoleName = "user3".into(); + let secret3 = Some(AuthSecret::Scram(ServerSecret::mock( + user3.as_str(), + [3; 32], + ))); cache.insert_role_secret(&project_id, &endpoint_id, &user3, secret3.clone()); assert!(cache.get_role_secret(&endpoint_id, &user3).is_none()); @@ -411,11 +422,20 @@ mod tests { let project_id = "project".into(); let endpoint_id = "endpoint".into(); - let user1: SmolStr = "user1".into(); - let user2: SmolStr = "user2".into(); - let secret1 = AuthSecret::Scram(ServerSecret::mock(user1.as_str(), [1; 32])); - let secret2 = AuthSecret::Scram(ServerSecret::mock(user2.as_str(), [2; 32])); - let allowed_ips = Arc::new(vec!["allowed_ip1".into(), "allowed_ip2".into()]); + let user1: RoleName = "user1".into(); + let user2: RoleName = "user2".into(); + let secret1 = Some(AuthSecret::Scram(ServerSecret::mock( + user1.as_str(), + [1; 32], + ))); + let secret2 = Some(AuthSecret::Scram(ServerSecret::mock( + user2.as_str(), + [2; 32], + ))); + let allowed_ips = Arc::new(vec![ + 
"127.0.0.1".parse().unwrap(), + "127.0.0.2".parse().unwrap(), + ]); cache.insert_role_secret(&project_id, &endpoint_id, &user1, secret1.clone()); cache.insert_role_secret(&project_id, &endpoint_id, &user2, secret2.clone()); cache.insert_allowed_ips(&project_id, &endpoint_id, allowed_ips.clone()); @@ -457,11 +477,20 @@ mod tests { let project_id = "project".into(); let endpoint_id = "endpoint".into(); - let user1: SmolStr = "user1".into(); - let user2: SmolStr = "user2".into(); - let secret1 = AuthSecret::Scram(ServerSecret::mock(user1.as_str(), [1; 32])); - let secret2 = AuthSecret::Scram(ServerSecret::mock(user2.as_str(), [2; 32])); - let allowed_ips = Arc::new(vec!["allowed_ip1".into(), "allowed_ip2".into()]); + let user1: RoleName = "user1".into(); + let user2: RoleName = "user2".into(); + let secret1 = Some(AuthSecret::Scram(ServerSecret::mock( + user1.as_str(), + [1; 32], + ))); + let secret2 = Some(AuthSecret::Scram(ServerSecret::mock( + user2.as_str(), + [2; 32], + ))); + let allowed_ips = Arc::new(vec![ + "127.0.0.1".parse().unwrap(), + "127.0.0.2".parse().unwrap(), + ]); cache.insert_role_secret(&project_id, &endpoint_id, &user1, secret1.clone()); cache.clone().disable_ttl(); tokio::time::advance(Duration::from_millis(100)).await; diff --git a/proxy/src/console/messages.rs b/proxy/src/console/messages.rs index c02d65668f..6ef9bcf4eb 100644 --- a/proxy/src/console/messages.rs +++ b/proxy/src/console/messages.rs @@ -1,7 +1,10 @@ use serde::Deserialize; -use smol_str::SmolStr; use std::fmt; +use crate::auth::IpPattern; + +use crate::{BranchId, EndpointId, ProjectId}; + /// Generic error response with human-readable description. /// Note that we can't always present it to user as is. 
#[derive(Debug, Deserialize)] @@ -14,8 +17,8 @@ pub struct ConsoleError { #[derive(Deserialize)] pub struct GetRoleSecret { pub role_secret: Box, - pub allowed_ips: Option>>, - pub project_id: Option>, + pub allowed_ips: Option>, + pub project_id: Option, } // Manually implement debug to omit sensitive info. @@ -92,9 +95,9 @@ impl fmt::Debug for DatabaseInfo { /// Also known as `ProxyMetricsAuxInfo` in the console. #[derive(Debug, Deserialize, Clone, Default)] pub struct MetricsAuxInfo { - pub endpoint_id: SmolStr, - pub project_id: SmolStr, - pub branch_id: SmolStr, + pub endpoint_id: EndpointId, + pub project_id: ProjectId, + pub branch_id: BranchId, } impl MetricsAuxInfo { diff --git a/proxy/src/console/provider.rs b/proxy/src/console/provider.rs index 178a7a2f4c..a6dfbd79db 100644 --- a/proxy/src/console/provider.rs +++ b/proxy/src/console/provider.rs @@ -4,16 +4,15 @@ pub mod neon; use super::messages::MetricsAuxInfo; use crate::{ - auth::backend::ComputeUserInfo, + auth::{backend::ComputeUserInfo, IpPattern}, cache::{project_info::ProjectInfoCacheImpl, Cached, TimedLru}, compute, config::{CacheOptions, ProjectInfoCacheOptions}, context::RequestMonitoring, - scram, + scram, EndpointCacheKey, ProjectId, }; use async_trait::async_trait; use dashmap::DashMap; -use smol_str::SmolStr; use std::{sync::Arc, time::Duration}; use tokio::sync::{OwnedSemaphorePermit, Semaphore}; use tokio::time::Instant; @@ -212,9 +211,9 @@ pub enum AuthSecret { pub struct AuthInfo { pub secret: Option, /// List of IP addresses allowed for the autorization. - pub allowed_ips: Vec, + pub allowed_ips: Vec, /// Project ID. This is used for cache invalidation. - pub project_id: Option, + pub project_id: Option, } /// Info for establishing a connection to a compute node. 
@@ -233,10 +232,10 @@ pub struct NodeInfo { pub allow_self_signed_compute: bool, } -pub type NodeInfoCache = TimedLru; +pub type NodeInfoCache = TimedLru; pub type CachedNodeInfo = Cached<&'static NodeInfoCache>; -pub type CachedRoleSecret = Cached<&'static ProjectInfoCacheImpl, AuthSecret>; -pub type CachedAllowedIps = Cached<&'static ProjectInfoCacheImpl, Arc>>; +pub type CachedRoleSecret = Cached<&'static ProjectInfoCacheImpl, Option>; +pub type CachedAllowedIps = Cached<&'static ProjectInfoCacheImpl, Arc>>; /// This will allocate per each call, but the http requests alone /// already require a few allocations, so it should be fine. @@ -249,7 +248,7 @@ pub trait Api { &self, ctx: &mut RequestMonitoring, user_info: &ComputeUserInfo, - ) -> Result, errors::GetAuthInfoError>; + ) -> Result; async fn get_allowed_ips( &self, @@ -280,7 +279,7 @@ impl Api for ConsoleBackend { &self, ctx: &mut RequestMonitoring, user_info: &ComputeUserInfo, - ) -> Result, errors::GetAuthInfoError> { + ) -> Result { use ConsoleBackend::*; match self { Console(api) => api.get_role_secret(ctx, user_info).await, @@ -345,7 +344,7 @@ impl ApiCaches { /// Various caches for [`console`](super). 
pub struct ApiLocks { name: &'static str, - node_locks: DashMap>, + node_locks: DashMap>, permits: usize, timeout: Duration, registered: prometheus::IntCounter, @@ -413,7 +412,7 @@ impl ApiLocks { pub async fn get_wake_compute_permit( &self, - key: &SmolStr, + key: &EndpointCacheKey, ) -> Result { if self.permits == 0 { return Ok(WakeComputePermit { permit: None }); diff --git a/proxy/src/console/provider/mock.rs b/proxy/src/console/provider/mock.rs index cc35a06708..55f395a403 100644 --- a/proxy/src/console/provider/mock.rs +++ b/proxy/src/console/provider/mock.rs @@ -4,14 +4,13 @@ use super::{ errors::{ApiError, GetAuthInfoError, WakeComputeError}, AuthInfo, AuthSecret, CachedNodeInfo, NodeInfo, }; -use crate::cache::Cached; use crate::console::provider::{CachedAllowedIps, CachedRoleSecret}; use crate::context::RequestMonitoring; use crate::{auth::backend::ComputeUserInfo, compute, error::io_error, scram, url::ApiUrl}; +use crate::{auth::IpPattern, cache::Cached}; use async_trait::async_trait; use futures::TryFutureExt; -use smol_str::SmolStr; -use std::sync::Arc; +use std::{str::FromStr, sync::Arc}; use thiserror::Error; use tokio_postgres::{config::SslMode, Client}; use tracing::{error, info, info_span, warn, Instrument}; @@ -88,7 +87,9 @@ impl Api { { Some(s) => { info!("got allowed_ips: {s}"); - s.split(',').map(String::from).collect() + s.split(',') + .map(|s| IpPattern::from_str(s).unwrap()) + .collect() } None => vec![], }; @@ -100,7 +101,7 @@ impl Api { .await?; Ok(AuthInfo { secret, - allowed_ips: allowed_ips.iter().map(SmolStr::from).collect(), + allowed_ips, project_id: None, }) } @@ -150,12 +151,10 @@ impl super::Api for Api { &self, _ctx: &mut RequestMonitoring, user_info: &ComputeUserInfo, - ) -> Result, GetAuthInfoError> { - Ok(self - .do_get_auth_info(user_info) - .await? 
- .secret - .map(CachedRoleSecret::new_uncached)) + ) -> Result { + Ok(CachedRoleSecret::new_uncached( + self.do_get_auth_info(user_info).await?.secret, + )) } async fn get_allowed_ips( diff --git a/proxy/src/console/provider/neon.rs b/proxy/src/console/provider/neon.rs index e8e36815c7..33618faed8 100644 --- a/proxy/src/console/provider/neon.rs +++ b/proxy/src/console/provider/neon.rs @@ -14,8 +14,6 @@ use crate::{ }; use async_trait::async_trait; use futures::TryFutureExt; -use itertools::Itertools; -use smol_str::SmolStr; use std::sync::Arc; use tokio::time::Instant; use tokio_postgres::config::SslMode; @@ -86,20 +84,20 @@ impl Api { }, }; - let secret = scram::ServerSecret::parse(&body.role_secret) - .map(AuthSecret::Scram) - .ok_or(GetAuthInfoError::BadSecret)?; - let allowed_ips = body - .allowed_ips - .into_iter() - .flatten() - .map(SmolStr::from) - .collect_vec(); + let secret = if body.role_secret.is_empty() { + None + } else { + let secret = scram::ServerSecret::parse(&body.role_secret) + .map(AuthSecret::Scram) + .ok_or(GetAuthInfoError::BadSecret)?; + Some(secret) + }; + let allowed_ips = body.allowed_ips.unwrap_or_default(); ALLOWED_IPS_NUMBER.observe(allowed_ips.len() as f64); Ok(AuthInfo { - secret: Some(secret), + secret, allowed_ips, - project_id: body.project_id.map(SmolStr::from), + project_id: body.project_id, }) } .map_err(crate::error::log_error) @@ -172,19 +170,20 @@ impl super::Api for Api { &self, ctx: &mut RequestMonitoring, user_info: &ComputeUserInfo, - ) -> Result, GetAuthInfoError> { + ) -> Result { let ep = &user_info.endpoint; let user = &user_info.user; if let Some(role_secret) = self.caches.project_info.get_role_secret(ep, user) { - return Ok(Some(role_secret)); + return Ok(role_secret); } let auth_info = self.do_get_auth_info(ctx, user_info).await?; if let Some(project_id) = auth_info.project_id { - if let Some(secret) = &auth_info.secret { - self.caches - .project_info - .insert_role_secret(&project_id, ep, user, secret.clone()) 
- } + self.caches.project_info.insert_role_secret( + &project_id, + ep, + user, + auth_info.secret.clone(), + ); self.caches.project_info.insert_allowed_ips( &project_id, ep, @@ -192,7 +191,7 @@ impl super::Api for Api { ); } // When we just got a secret, we don't need to invalidate it. - Ok(auth_info.secret.map(Cached::new_uncached)) + Ok(Cached::new_uncached(auth_info.secret)) } async fn get_allowed_ips( @@ -214,11 +213,12 @@ impl super::Api for Api { let allowed_ips = Arc::new(auth_info.allowed_ips); let user = &user_info.user; if let Some(project_id) = auth_info.project_id { - if let Some(secret) = &auth_info.secret { - self.caches - .project_info - .insert_role_secret(&project_id, ep, user, secret.clone()) - } + self.caches.project_info.insert_role_secret( + &project_id, + ep, + user, + auth_info.secret.clone(), + ); self.caches .project_info .insert_allowed_ips(&project_id, ep, allowed_ips.clone()); @@ -238,7 +238,7 @@ impl super::Api for Api { // for some time (highly depends on the console's scale-to-zero policy); // The connection info remains the same during that period of time, // which means that we might cache it to reduce the load and latency. 
- if let Some(cached) = self.caches.node_info.get(&*key) { + if let Some(cached) = self.caches.node_info.get(&key) { info!(key = &*key, "found cached compute node info"); return Ok(cached); } diff --git a/proxy/src/context.rs b/proxy/src/context.rs index 8a1aa4aec9..9e2ea10031 100644 --- a/proxy/src/context.rs +++ b/proxy/src/context.rs @@ -7,7 +7,10 @@ use std::net::IpAddr; use tokio::sync::mpsc; use uuid::Uuid; -use crate::{console::messages::MetricsAuxInfo, error::ErrorKind, metrics::LatencyTimer}; +use crate::{ + console::messages::MetricsAuxInfo, error::ErrorKind, metrics::LatencyTimer, BranchId, + EndpointId, ProjectId, RoleName, +}; pub mod parquet; @@ -26,10 +29,10 @@ pub struct RequestMonitoring { region: &'static str, // filled in as they are discovered - project: Option, - branch: Option, - endpoint_id: Option, - user: Option, + project: Option, + branch: Option, + endpoint_id: Option, + user: Option, application: Option, error_kind: Option, success: bool, @@ -86,7 +89,7 @@ impl RequestMonitoring { self.project = Some(x.project_id); } - pub fn set_endpoint_id(&mut self, endpoint_id: Option) { + pub fn set_endpoint_id(&mut self, endpoint_id: Option) { self.endpoint_id = endpoint_id.or_else(|| self.endpoint_id.clone()); } @@ -94,7 +97,7 @@ impl RequestMonitoring { self.application = app.or_else(|| self.application.clone()); } - pub fn set_user(&mut self, user: SmolStr) { + pub fn set_user(&mut self, user: RoleName) { self.user = Some(user); } diff --git a/proxy/src/lib.rs b/proxy/src/lib.rs index a22b2459b8..a9e4a38302 100644 --- a/proxy/src/lib.rs +++ b/proxy/src/lib.rs @@ -62,3 +62,79 @@ pub async fn handle_signals(token: CancellationToken) -> anyhow::Result(r: Result, JoinError>) -> anyhow::Result { r.context("join error").and_then(|x| x) } + +macro_rules! 
smol_str_wrapper { + ($name:ident) => { + #[derive(Clone, Debug, PartialEq, Eq, PartialOrd, Ord, Hash, Default)] + pub struct $name(smol_str::SmolStr); + + impl $name { + pub fn as_str(&self) -> &str { + self.0.as_str() + } + } + + impl std::fmt::Display for $name { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + self.0.fmt(f) + } + } + + impl std::cmp::PartialEq for $name + where + smol_str::SmolStr: std::cmp::PartialEq, + { + fn eq(&self, other: &T) -> bool { + self.0.eq(other) + } + } + + impl From for $name + where + smol_str::SmolStr: From, + { + fn from(x: T) -> Self { + Self(x.into()) + } + } + + impl AsRef for $name { + fn as_ref(&self) -> &str { + self.0.as_ref() + } + } + + impl std::ops::Deref for $name { + type Target = str; + fn deref(&self) -> &str { + &*self.0 + } + } + + impl<'de> serde::de::Deserialize<'de> for $name { + fn deserialize>(d: D) -> Result { + >::deserialize(d).map(Self) + } + } + + impl serde::Serialize for $name { + fn serialize(&self, s: S) -> Result { + self.0.serialize(s) + } + } + }; +} + +// 90% of role name strings are 20 characters or less. +smol_str_wrapper!(RoleName); +// 50% of endpoint strings are 23 characters or less. +smol_str_wrapper!(EndpointId); +// 50% of branch strings are 23 characters or less. +smol_str_wrapper!(BranchId); +// 90% of project strings are 23 characters or less. 
+smol_str_wrapper!(ProjectId); + +// will usually equal endpoint ID +smol_str_wrapper!(EndpointCacheKey); + +smol_str_wrapper!(DbName); diff --git a/proxy/src/proxy.rs b/proxy/src/proxy.rs index 635d157383..087cc7f7a9 100644 --- a/proxy/src/proxy.rs +++ b/proxy/src/proxy.rs @@ -19,6 +19,7 @@ use crate::{ rate_limiter::EndpointRateLimiter, stream::{PqStream, Stream}, usage_metrics::{Ids, USAGE_METRICS}, + EndpointCacheKey, }; use anyhow::{bail, Context}; use futures::TryFutureExt; @@ -26,7 +27,7 @@ use itertools::Itertools; use once_cell::sync::OnceCell; use pq_proto::{BeMessage as Be, FeStartupPacket, StartupMessageParams}; use regex::Regex; -use smol_str::SmolStr; +use smol_str::{format_smolstr, SmolStr}; use std::sync::Arc; use tokio::io::{AsyncRead, AsyncWrite, AsyncWriteExt}; use tokio_util::sync::CancellationToken; @@ -516,20 +517,21 @@ impl NeonOptions { Self(options) } - pub fn get_cache_key(&self, prefix: &str) -> SmolStr { + pub fn get_cache_key(&self, prefix: &str) -> EndpointCacheKey { // prefix + format!(" {k}:{v}") // kinda jank because SmolStr is immutable std::iter::once(prefix) .chain(self.0.iter().flat_map(|(k, v)| [" ", &**k, ":", &**v])) - .collect() + .collect::() + .into() } /// DeepObject format /// `paramName[prop1]=value1¶mName[prop2]=value2&...` - pub fn to_deep_object(&self) -> Vec<(String, SmolStr)> { + pub fn to_deep_object(&self) -> Vec<(SmolStr, SmolStr)> { self.0 .iter() - .map(|(k, v)| (format!("options[{}]", k), v.clone())) + .map(|(k, v)| (format_smolstr!("options[{}]", k), v.clone())) .collect() } } diff --git a/proxy/src/proxy/tests.rs b/proxy/src/proxy/tests.rs index 73fde2d7d0..a552a857b9 100644 --- a/proxy/src/proxy/tests.rs +++ b/proxy/src/proxy/tests.rs @@ -6,13 +6,13 @@ use super::connect_compute::ConnectMechanism; use super::retry::ShouldRetry; use super::*; use crate::auth::backend::{ComputeUserInfo, TestBackend}; +use crate::auth::IpPattern; use crate::config::CertResolver; use crate::console::{self, CachedNodeInfo, 
NodeInfo}; use crate::proxy::retry::{retry_after, NUM_RETRIES_CONNECT}; use crate::{auth, http, sasl, scram}; use async_trait::async_trait; use rstest::rstest; -use smol_str::SmolStr; use tokio_postgres::config::SslMode; use tokio_postgres::tls::{MakeTlsConnect, NoTls}; use tokio_postgres_rustls::{MakeRustlsConnect, RustlsStream}; @@ -471,7 +471,7 @@ impl TestBackend for TestConnectMechanism { } } - fn get_allowed_ips(&self) -> Result, console::errors::GetAuthInfoError> { + fn get_allowed_ips(&self) -> Result, console::errors::GetAuthInfoError> { unimplemented!("not used in tests") } } diff --git a/proxy/src/rate_limiter/limiter.rs b/proxy/src/rate_limiter/limiter.rs index a190b2cf8f..cbae72711c 100644 --- a/proxy/src/rate_limiter/limiter.rs +++ b/proxy/src/rate_limiter/limiter.rs @@ -11,11 +11,12 @@ use anyhow::bail; use dashmap::DashMap; use itertools::Itertools; use rand::{rngs::StdRng, Rng, SeedableRng}; -use smol_str::SmolStr; use tokio::sync::{Mutex as AsyncMutex, Semaphore, SemaphorePermit}; use tokio::time::{timeout, Duration, Instant}; use tracing::info; +use crate::EndpointId; + use super::{ limit_algorithm::{LimitAlgorithm, Sample}, RateLimiterConfig, @@ -33,7 +34,7 @@ use super::{ // does not look very nice (`SSL SYSCALL error: Undefined error: 0`), so for now // I went with a more expensive way that yields user-friendlier error messages. pub struct EndpointRateLimiter { - map: DashMap, Hasher>, + map: DashMap, Hasher>, info: &'static [RateBucketInfo], access_count: AtomicUsize, rand: Mutex, @@ -146,7 +147,7 @@ impl EndpointRateLimiter { } /// Check that number of connections to the endpoint is below `max_rps` rps. - pub fn check(&self, endpoint: SmolStr) -> bool { + pub fn check(&self, endpoint: EndpointId) -> bool { // do a partial GC every 2k requests. This cleans up ~ 1/64th of the map. 
// worst case memory usage is about: // = 2 * 2048 * 64 * (48B + 72B) @@ -493,11 +494,13 @@ mod tests { use futures::{task::noop_waker_ref, Future}; use rand::SeedableRng; use rustc_hash::FxHasher; - use smol_str::SmolStr; use tokio::time; use super::{EndpointRateLimiter, Limiter, Outcome}; - use crate::rate_limiter::{RateBucketInfo, RateLimitAlgorithm}; + use crate::{ + rate_limiter::{RateBucketInfo, RateLimitAlgorithm}, + EndpointId, + }; #[tokio::test] async fn it_works() { @@ -654,7 +657,7 @@ mod tests { RateBucketInfo::validate(&mut rates).unwrap(); let limiter = EndpointRateLimiter::new(Vec::leak(rates)); - let endpoint = SmolStr::from("ep-my-endpoint-1234"); + let endpoint = EndpointId::from("ep-my-endpoint-1234"); time::pause(); diff --git a/proxy/src/redis/notifications.rs b/proxy/src/redis/notifications.rs index d28dcbd1a7..9cd70b109b 100644 --- a/proxy/src/redis/notifications.rs +++ b/proxy/src/redis/notifications.rs @@ -3,9 +3,8 @@ use std::{convert::Infallible, sync::Arc}; use futures::StreamExt; use redis::aio::PubSub; use serde::Deserialize; -use smol_str::SmolStr; -use crate::cache::project_info::ProjectInfoCache; +use crate::{cache::project_info::ProjectInfoCache, ProjectId, RoleName}; const CHANNEL_NAME: &str = "neondb-proxy-ws-updates"; const RECONNECT_TIMEOUT: std::time::Duration = std::time::Duration::from_secs(20); @@ -46,12 +45,12 @@ enum Notification { } #[derive(Clone, Debug, Deserialize, Eq, PartialEq)] struct AllowedIpsUpdate { - project_id: SmolStr, + project_id: ProjectId, } #[derive(Clone, Debug, Deserialize, Eq, PartialEq)] struct PasswordUpdate { - project_id: SmolStr, - role_name: SmolStr, + project_id: ProjectId, + role_name: RoleName, } fn deserialize_json_string<'de, D, T>(deserializer: D) -> Result where diff --git a/proxy/src/serverless.rs b/proxy/src/serverless.rs index 8af008394a..dfef4ccdfa 100644 --- a/proxy/src/serverless.rs +++ b/proxy/src/serverless.rs @@ -41,6 +41,8 @@ use tokio_util::sync::CancellationToken; use 
tracing::{error, info, info_span, warn, Instrument}; use utils::http::{error::ApiError, json::json_response}; +pub const SERVERLESS_DRIVER_SNI: &str = "api"; + pub async fn task_main( config: &'static ProxyConfig, ws_listener: TcpListener, diff --git a/proxy/src/serverless/conn_pool.rs b/proxy/src/serverless/conn_pool.rs index c07cc2816e..5a7279ae63 100644 --- a/proxy/src/serverless/conn_pool.rs +++ b/proxy/src/serverless/conn_pool.rs @@ -31,6 +31,7 @@ use crate::{ metrics::NUM_DB_CONNECTIONS_GAUGE, proxy::connect_compute::ConnectMechanism, usage_metrics::{Ids, MetricCounter, USAGE_METRICS}, + DbName, EndpointCacheKey, RoleName, }; use crate::{compute, config}; @@ -42,17 +43,17 @@ pub const APP_NAME: SmolStr = SmolStr::new_inline("/sql_over_http"); #[derive(Debug, Clone)] pub struct ConnInfo { pub user_info: ComputeUserInfo, - pub dbname: SmolStr, + pub dbname: DbName, pub password: SmolStr, } impl ConnInfo { // hm, change to hasher to avoid cloning? - pub fn db_and_user(&self) -> (SmolStr, SmolStr) { + pub fn db_and_user(&self) -> (DbName, RoleName) { (self.dbname.clone(), self.user_info.user.clone()) } - pub fn endpoint_cache_key(&self) -> SmolStr { + pub fn endpoint_cache_key(&self) -> EndpointCacheKey { self.user_info.endpoint_cache_key() } } @@ -79,14 +80,14 @@ struct ConnPoolEntry { // Per-endpoint connection pool, (dbname, username) -> DbUserConnPool // Number of open connections is limited by the `max_conns_per_endpoint`. pub struct EndpointConnPool { - pools: HashMap<(SmolStr, SmolStr), DbUserConnPool>, + pools: HashMap<(DbName, RoleName), DbUserConnPool>, total_conns: usize, max_conns: usize, _guard: IntCounterPairGuard, } impl EndpointConnPool { - fn get_conn_entry(&mut self, db_user: (SmolStr, SmolStr)) -> Option { + fn get_conn_entry(&mut self, db_user: (DbName, RoleName)) -> Option { let Self { pools, total_conns, .. 
} = self; @@ -95,7 +96,7 @@ impl EndpointConnPool { .and_then(|pool_entries| pool_entries.get_conn_entry(total_conns)) } - fn remove_client(&mut self, db_user: (SmolStr, SmolStr), conn_id: uuid::Uuid) -> bool { + fn remove_client(&mut self, db_user: (DbName, RoleName), conn_id: uuid::Uuid) -> bool { let Self { pools, total_conns, .. } = self; @@ -196,7 +197,7 @@ pub struct GlobalConnPool { // // That should be a fairly conteded map, so return reference to the per-endpoint // pool as early as possible and release the lock. - global_pool: DashMap>>, + global_pool: DashMap>>, /// Number of endpoint-connection pools /// @@ -440,7 +441,10 @@ impl GlobalConnPool { Ok(Client::new(new_client, conn_info, endpoint_pool).await) } - fn get_or_create_endpoint_pool(&self, endpoint: &SmolStr) -> Arc> { + fn get_or_create_endpoint_pool( + &self, + endpoint: &EndpointCacheKey, + ) -> Arc> { // fast path if let Some(pool) = self.global_pool.get(endpoint) { return pool.clone(); diff --git a/proxy/src/serverless/sql_over_http.rs b/proxy/src/serverless/sql_over_http.rs index 9b32ae7f25..1e2ddaa2ff 100644 --- a/proxy/src/serverless/sql_over_http.rs +++ b/proxy/src/serverless/sql_over_http.rs @@ -1,6 +1,7 @@ use std::sync::Arc; use anyhow::bail; +use anyhow::Context; use futures::pin_mut; use futures::StreamExt; use hyper::body::HttpBody; @@ -13,7 +14,6 @@ use hyper::{Body, HeaderMap, Request}; use serde_json::json; use serde_json::Map; use serde_json::Value; -use smol_str::SmolStr; use tokio_postgres::error::DbError; use tokio_postgres::error::ErrorPosition; use tokio_postgres::types::Kind; @@ -36,9 +36,11 @@ use crate::config::TlsConfig; use crate::context::RequestMonitoring; use crate::metrics::NUM_CONNECTION_REQUESTS_GAUGE; use crate::proxy::NeonOptions; +use crate::RoleName; use super::conn_pool::ConnInfo; use super::conn_pool::GlobalConnPool; +use super::SERVERLESS_DRIVER_SNI; #[derive(serde::Deserialize)] struct QueryData { @@ -60,7 +62,6 @@ enum Payload { const MAX_RESPONSE_SIZE: 
usize = 10 * 1024 * 1024; // 10 MiB const MAX_REQUEST_SIZE: u64 = 10 * 1024 * 1024; // 10 MiB -const SERVERLESS_DRIVER_SNI_HOSTNAME_FIRST_PART: &str = "api"; static RAW_TEXT_OUTPUT: HeaderName = HeaderName::from_static("neon-raw-text-output"); static ARRAY_MODE: HeaderName = HeaderName::from_static("neon-array-mode"); @@ -155,7 +156,7 @@ fn get_conn_info( .next() .ok_or(anyhow::anyhow!("invalid database name"))?; - let username = SmolStr::from(connection_url.username()); + let username = RoleName::from(connection_url.username()); if username.is_empty() { return Err(anyhow::anyhow!("missing username")); } @@ -187,9 +188,7 @@ fn get_conn_info( } } - let endpoint = endpoint_sni(hostname, &tls.common_names)?; - - let endpoint: SmolStr = endpoint.into(); + let endpoint = endpoint_sni(hostname, &tls.common_names)?.context("malformed endpoint")?; ctx.set_endpoint_id(Some(endpoint.clone())); let pairs = connection_url.query_pairs(); @@ -226,8 +225,7 @@ fn check_matches(sni_hostname: &str, hostname: &str) -> Result = Lazy::new(|| { pub static BACKED_UP_SEGMENTS: Lazy = Lazy::new(|| { register_int_counter!( "safekeeper_backed_up_segments_total", - "Number of WAL segments backed up to the broker" + "Number of WAL segments backed up to the S3" ) .expect("Failed to register safekeeper_backed_up_segments_total counter") }); @@ -337,6 +337,7 @@ pub struct TimelineCollector { flushed_wal_seconds: GaugeVec, collect_timeline_metrics: Gauge, timelines_count: IntGauge, + active_timelines_count: IntGauge, } impl Default for TimelineCollector { @@ -520,6 +521,13 @@ impl TimelineCollector { .unwrap(); descs.extend(timelines_count.desc().into_iter().cloned()); + let active_timelines_count = IntGauge::new( + "safekeeper_active_timelines", + "Total number of active timelines", + ) + .unwrap(); + descs.extend(active_timelines_count.desc().into_iter().cloned()); + TimelineCollector { descs, commit_lsn, @@ -540,6 +548,7 @@ impl TimelineCollector { flushed_wal_seconds, collect_timeline_metrics, 
timelines_count, + active_timelines_count, } } } @@ -572,6 +581,7 @@ impl Collector for TimelineCollector { let timelines = GlobalTimelines::get_all(); let timelines_count = timelines.len(); + let mut active_timelines_count = 0; // Prometheus Collector is sync, and data is stored under async lock. To // bridge the gap with a crutch, collect data in spawned thread with @@ -590,6 +600,10 @@ impl Collector for TimelineCollector { let timeline_id = tli.ttid.timeline_id.to_string(); let labels = &[tenant_id.as_str(), timeline_id.as_str()]; + if tli.timeline_is_active { + active_timelines_count += 1; + } + self.commit_lsn .with_label_values(labels) .set(tli.mem_state.commit_lsn.into()); @@ -681,6 +695,8 @@ impl Collector for TimelineCollector { // report total number of timelines self.timelines_count.set(timelines_count as i64); + self.active_timelines_count + .set(active_timelines_count as i64); mfs.extend(self.timelines_count.collect()); mfs diff --git a/safekeeper/src/remove_wal.rs b/safekeeper/src/remove_wal.rs index d96eedf401..9dce06a886 100644 --- a/safekeeper/src/remove_wal.rs +++ b/safekeeper/src/remove_wal.rs @@ -7,12 +7,21 @@ use tracing::*; use crate::{GlobalTimelines, SafeKeeperConf}; +const ALLOW_INACTIVE_TIMELINES: bool = true; + pub async fn task_main(conf: SafeKeeperConf) -> anyhow::Result<()> { let wal_removal_interval = Duration::from_millis(5000); loop { + let now = tokio::time::Instant::now(); + let mut active_timelines = 0; + let tlis = GlobalTimelines::get_all(); for tli in &tlis { - if !tli.is_active().await { + let is_active = tli.is_active().await; + if is_active { + active_timelines += 1; + } + if !ALLOW_INACTIVE_TIMELINES && !is_active { continue; } let ttid = tli.ttid; @@ -27,6 +36,17 @@ pub async fn task_main(conf: SafeKeeperConf) -> anyhow::Result<()> { .instrument(info_span!("WAL removal", ttid = %ttid)) .await; } + + let elapsed = now.elapsed(); + let total_timelines = tlis.len(); + + if elapsed > wal_removal_interval { + info!( + "WAL 
removal is too long, processed {} active timelines ({} total) in {:?}", + active_timelines, total_timelines, elapsed + ); + } + sleep(wal_removal_interval).await; } } diff --git a/scripts/flaky_tests.py b/scripts/flaky_tests.py index b07e4bea9b..61a97f520d 100755 --- a/scripts/flaky_tests.py +++ b/scripts/flaky_tests.py @@ -3,6 +3,7 @@ import argparse import json import logging +import os from collections import defaultdict from typing import DefaultDict, Dict @@ -45,6 +46,15 @@ def main(args: argparse.Namespace): logging.error("cannot fetch flaky tests from the DB due to an error", exc) rows = [] + # If a test run has non-default PAGESERVER_VIRTUAL_FILE_IO_ENGINE (i.e. not empty, not std-fs), + # use it to parametrize test name along with build_type and pg_version + # + # See test_runner/fixtures/parametrize.py for details + if (io_engine := os.getenv("PAGESERVER_VIRTUAL_FILE_IO_ENGINE", "")) not in ("", "std-fs"): + pageserver_virtual_file_io_engine_parameter = f"-{io_engine}" + else: + pageserver_virtual_file_io_engine_parameter = "" + for row in rows: # We don't want to automatically rerun tests in a performance suite if row["parent_suite"] != "test_runner.regress": @@ -53,10 +63,10 @@ def main(args: argparse.Namespace): if row["name"].endswith("]"): parametrized_test = row["name"].replace( "[", - f"[{build_type}-pg{pg_version}-", + f"[{build_type}-pg{pg_version}{pageserver_virtual_file_io_engine_parameter}-", ) else: - parametrized_test = f"{row['name']}[{build_type}-pg{pg_version}]" + parametrized_test = f"{row['name']}[{build_type}-pg{pg_version}{pageserver_virtual_file_io_engine_parameter}]" res[row["parent_suite"]][row["suite"]][parametrized_test] = True diff --git a/scripts/ps_ec2_setup_instance_store b/scripts/ps_ec2_setup_instance_store index 00d37e5f83..4cca3a9857 100755 --- a/scripts/ps_ec2_setup_instance_store +++ b/scripts/ps_ec2_setup_instance_store @@ -39,6 +39,9 @@ SETUP COMPLETE To run your local neon.git build on the instance store volume, run 
the following commands from the top of the neon.git checkout + # raise file descriptor limit of your shell and its child processes + sudo prlimit -p $$ --nofile=800000:800000 + # test suite run export TEST_OUTPUT="$TEST_OUTPUT" DEFAULT_PG_VERSION=15 BUILD_TYPE=release ./scripts/pytest test_runner/performance/test_latency.py diff --git a/test_runner/fixtures/neon_fixtures.py b/test_runner/fixtures/neon_fixtures.py index d98aedf4d0..bbabfeedf6 100644 --- a/test_runner/fixtures/neon_fixtures.py +++ b/test_runner/fixtures/neon_fixtures.py @@ -2,6 +2,7 @@ from __future__ import annotations import abc import asyncio +import concurrent.futures import filecmp import json import os @@ -10,16 +11,18 @@ import shutil import subprocess import tempfile import textwrap +import threading import time import uuid from contextlib import closing, contextmanager from dataclasses import dataclass, field from datetime import datetime +from fcntl import LOCK_EX, LOCK_UN, flock from functools import cached_property from itertools import chain, product from pathlib import Path from types import TracebackType -from typing import Any, Dict, Iterator, List, Optional, Tuple, Type, Union, cast +from typing import Any, Callable, Dict, Iterator, List, Optional, Tuple, Type, Union, cast from urllib.parse import urlparse import asyncpg @@ -49,7 +52,10 @@ from fixtures.pageserver.allowed_errors import ( ) from fixtures.pageserver.http import PageserverHttpClient from fixtures.pageserver.types import IndexPartDump -from fixtures.pageserver.utils import wait_for_last_record_lsn, wait_for_upload +from fixtures.pageserver.utils import ( + wait_for_last_record_lsn, + wait_for_upload, +) from fixtures.pg_version import PgVersion from fixtures.port_distributor import PortDistributor from fixtures.remote_storage import ( @@ -424,6 +430,7 @@ class NeonEnvBuilder: pg_distrib_dir: Path, pg_version: PgVersion, test_name: str, + top_output_dir: Path, test_output_dir: Path, test_overlay_dir: Optional[Path] = 
None, pageserver_remote_storage: Optional[RemoteStorage] = None, @@ -440,6 +447,7 @@ class NeonEnvBuilder: preserve_database_files: bool = False, initial_tenant: Optional[TenantId] = None, initial_timeline: Optional[TimelineId] = None, + pageserver_virtual_file_io_engine: Optional[str] = None, ): self.repo_dir = repo_dir self.rust_log_override = rust_log_override @@ -473,6 +481,9 @@ class NeonEnvBuilder: self.test_overlay_dir = test_overlay_dir self.overlay_mounts_created_by_us: List[Tuple[str, Path]] = [] self.config_init_force: Optional[str] = None + self.top_output_dir = top_output_dir + + self.pageserver_virtual_file_io_engine: Optional[str] = pageserver_virtual_file_io_engine assert test_name.startswith( "test_" @@ -526,6 +537,64 @@ class NeonEnvBuilder: return env + def build_and_use_snapshot( + self, global_ident: str, create_env_for_snapshot: Callable[[NeonEnvBuilder], NeonEnv] + ) -> NeonEnv: + if os.getenv("CI", "false") == "true": + log.info("do not use snapshots in ephemeral CI environment") + env = create_env_for_snapshot(self) + env.stop(immediate=True, ps_assert_metric_no_errors=False) + return env + + with shared_snapshot_dir(self.top_output_dir, global_ident) as snapshot_dir: + if not snapshot_dir.is_initialized(): + self._build_and_use_snapshot_impl(snapshot_dir, create_env_for_snapshot) + assert snapshot_dir.is_initialized() + + return self.from_repo_dir(snapshot_dir.path) + + def _build_and_use_snapshot_impl( + self, + snapshot_dir: SnapshotDirLocked, + create_env_for_snapshot: Callable[[NeonEnvBuilder], NeonEnv], + ): + if snapshot_dir.path.exists(): + shutil.rmtree(snapshot_dir.path) + + if self.test_overlay_dir is not None: + # Make repo_dir an overlayfs mount with lowerdir being the empty snapshot_dir. + # When we're done filling up repo_dir, tear everything down, unmount the overlayfs, and use + # the upperdir as the snapshot. This is equivalent to docker `FROM scratch`. 
+ assert not self.repo_dir.exists() + assert self.repo_dir.parent.exists() + snapshot_dir.path.mkdir() + self.overlay_mount("create-snapshot-repo-dir", snapshot_dir.path, self.repo_dir) + self.config_init_force = "empty-dir-ok" + + env = create_env_for_snapshot(self) + assert self.env is not None + assert self.env == env + + # shut down everything for snapshot + env.stop(immediate=True, ps_assert_metric_no_errors=True) + + # TODO: all kinds of assertions to ensure the env is unused + + if self.test_overlay_dir is None: + log.info("take snapshot by moving repo dir") + env.repo_dir.rename(snapshot_dir.path) + else: + log.info("take snapshot by using overlayfs upperdir") + self.overlay_unmount_and_move("create-snapshot-repo-dir", snapshot_dir.path) + log.info("remove empty repo_dir (previously mountpoint) for snapshot overlay_mount") + env.repo_dir.rmdir() + # TODO from here on, we should be able to reset / goto top where snapshot_dir.is_initialized() + log.info("make repo_dir an overlayfs mount of the snapshot we just created") + assert not env.repo_dir.exists(), "both branches above should remove it" + snapshot_dir.set_initialized() + + self.env = None # so that from_repo_dir works again + def from_repo_dir( self, repo_dir: Path, @@ -557,10 +626,15 @@ class NeonEnvBuilder: tenants_from_dir = ps_dir / "tenants" tenants_to_dir = self.repo_dir / ps_dir.name / "tenants" - log.info(f"Copying pageserver tenants directory {tenants_from_dir} to {tenants_to_dir}") if self.test_overlay_dir is None: + log.info( + f"Copying pageserver tenants directory {tenants_from_dir} to {tenants_to_dir}" + ) shutil.copytree(tenants_from_dir, tenants_to_dir) else: + log.info( + f"Creating overlayfs mount of pageserver tenants directory {tenants_from_dir} to {tenants_to_dir}" + ) self.overlay_mount(f"{ps_dir.name}:tenants", tenants_from_dir, tenants_to_dir) for sk_from_dir in (repo_dir / "safekeepers").glob("sk*"): @@ -571,10 +645,12 @@ class NeonEnvBuilder: shutil.rmtree(self.repo_dir / 
"local_fs_remote_storage", ignore_errors=True) if self.test_overlay_dir is None: + log.info("Copying local_fs_remote_storage directory from snapshot") shutil.copytree( repo_dir / "local_fs_remote_storage", self.repo_dir / "local_fs_remote_storage" ) else: + log.info("Creating overlayfs mount of local_fs_remote_storage directory from snapshot") self.overlay_mount( "local_fs_remote_storage", repo_dir / "local_fs_remote_storage", @@ -631,6 +707,54 @@ class NeonEnvBuilder: ) self.overlay_mounts_created_by_us.append((ident, dstdir)) + def _overlay_umount(self, mountpoint: Path): + cmd = ["sudo", "umount", str(mountpoint)] + assert mountpoint.is_mount() + subprocess_capture( + self.test_output_dir, cmd, check=True, echo_stderr=True, echo_stdout=True + ) + + def overlay_unmount_and_move(self, ident: str, dst: Path): + """ + Unmount previously established overlayfs mount at `dstdir` and move the upperdir contents to `dst`. + If `dst` is an empty directory, it gets replaced. + Caller is responsible for ensuring the unmount will succeed, i.e., that there aren't any nested mounts. 
+ + Raises exception if self.test_overlay_dir is None + """ + assert self.test_overlay_dir is not None + # not mutating state yet, make checks + ident_state_dir = self.test_overlay_dir / ident + assert ident_state_dir.is_dir() + upper = ident_state_dir / "upper" + work = ident_state_dir / "work" + assert upper.is_dir() + assert work.is_dir() + assert ( + self.test_overlay_dir not in dst.parents + ), "otherwise workdir cleanup below wouldn't work" + # find index, still not mutating state + idxmap = { + existing_ident: idx + for idx, (existing_ident, _) in enumerate(self.overlay_mounts_created_by_us) + } + idx = idxmap.get(ident) + if idx is None: + raise RuntimeError(f"cannot find mount for ident {ident}") + + if dst.is_dir(): + dst.rmdir() # raises exception if not empty, which is what we want + + _, mountpoint = self.overlay_mounts_created_by_us.pop(idx) + self._overlay_umount(mountpoint) + upper.rename(dst) + # we moved the upperdir, clean up workdir and then its parent ident_state_dir + cmd = ["sudo", "rm", "-rf", str(work)] + subprocess_capture( + self.test_output_dir, cmd, check=True, echo_stderr=True, echo_stdout=True + ) + ident_state_dir.rmdir() # should be empty since we moved `upper` out + def overlay_cleanup_teardown(self): """ Unmount the overlayfs mounts created by `self.overlay_mount()`. 
@@ -641,13 +765,10 @@ class NeonEnvBuilder: while len(self.overlay_mounts_created_by_us) > 0: (ident, mountpoint) = self.overlay_mounts_created_by_us.pop() ident_state_dir = self.test_overlay_dir / ident - cmd = ["sudo", "umount", str(mountpoint)] log.info( - f"Unmounting overlayfs mount created during setup for ident {ident} at {mountpoint}: {cmd}" - ) - subprocess_capture( - self.test_output_dir, cmd, check=True, echo_stderr=True, echo_stdout=True + f"Unmounting overlayfs mount created during setup for ident {ident} at {mountpoint}" ) + self._overlay_umount(mountpoint) log.info( f"Cleaning up overlayfs state dir (owned by root user) for ident {ident} at {ident_state_dir}" ) @@ -725,8 +846,15 @@ class NeonEnvBuilder: if self.preserve_database_files: return + overlayfs_mounts = {mountpoint for _, mountpoint in self.overlay_mounts_created_by_us} + directories_to_clean: List[Path] = [] for test_entry in Path(self.repo_dir).glob("**/*"): + if test_entry in overlayfs_mounts: + continue + for parent in test_entry.parents: + if parent in overlayfs_mounts: + continue if test_entry.is_file(): test_file = test_entry if ATTACHMENT_NAME_REGEX.fullmatch(test_file.name): @@ -775,13 +903,6 @@ class NeonEnvBuilder: log.error(f"Error during remote storage scrub: {e}") cleanup_error = e - try: - self.overlay_cleanup_teardown() - except Exception as e: - log.error(f"Error cleaning up overlay state: {e}") - if cleanup_error is not None: - cleanup_error = e - try: self.cleanup_remote_storage() except Exception as e: @@ -802,6 +923,13 @@ class NeonEnvBuilder: for pageserver in self.env.pageservers: pageserver.assert_no_errors() + try: + self.overlay_cleanup_teardown() + except Exception as e: + log.error(f"Error cleaning up overlay state: {e}") + if cleanup_error is not None: + cleanup_error = e + class NeonEnv: """ @@ -866,11 +994,18 @@ class NeonEnv: self.initial_timeline = config.initial_timeline attachment_service_port = self.port_distributor.get_port() + # Reserve the next port 
after attachment service for use by its postgres: this + # will assert out if the next port wasn't free. + attachment_service_pg_port = self.port_distributor.get_port() + assert attachment_service_pg_port == attachment_service_port + 1 + self.control_plane_api: str = f"http://127.0.0.1:{attachment_service_port}" self.attachment_service: NeonAttachmentService = NeonAttachmentService( self, config.auth_enabled ) + self.pageserver_virtual_file_io_engine = config.pageserver_virtual_file_io_engine + # Create a config file corresponding to the options cfg: Dict[str, Any] = { "default_tenant_id": str(self.initial_tenant), @@ -902,6 +1037,9 @@ class NeonEnv: "pg_auth_type": pg_auth_type, "http_auth_type": http_auth_type, } + if self.pageserver_virtual_file_io_engine is not None: + ps_cfg["virtual_file_io_engine"] = self.pageserver_virtual_file_io_engine + # Create a corresponding NeonPageserver object self.pageservers.append( NeonPageserver( @@ -939,16 +1077,27 @@ class NeonEnv: self.neon_cli.init(cfg, force=config.config_init_force) def start(self): - # Start up broker, pageserver and all safekeepers - self.broker.try_start() - + # Attachment service starts first, so that pageserver /re-attach calls don't + # bounce through retries on startup self.attachment_service.start() - for pageserver in self.pageservers: - pageserver.start() + # Start up broker, pageserver and all safekeepers + futs = [] + with concurrent.futures.ThreadPoolExecutor( + max_workers=2 + len(self.pageservers) + len(self.safekeepers) + ) as executor: + futs.append( + executor.submit(lambda: self.broker.try_start() or None) + ) # The `or None` is for the linter - for safekeeper in self.safekeepers: - safekeeper.start() + for pageserver in self.pageservers: + futs.append(executor.submit(lambda ps=pageserver: ps.start())) + + for safekeeper in self.safekeepers: + futs.append(executor.submit(lambda sk=safekeeper: sk.start())) + + for f in futs: + f.result() def stop(self, immediate=False, 
ps_assert_metric_no_errors=False): """ @@ -971,7 +1120,9 @@ class NeonEnv: assert that there is only one. Tests with multiple pageservers should always use get_pageserver with an explicit ID. """ - assert len(self.pageservers) == 1 + assert ( + len(self.pageservers) == 1 + ), "env.pageserver must only be used with single pageserver NeonEnv" return self.pageservers[0] def get_pageserver(self, id: Optional[int]) -> NeonPageserver: @@ -1065,6 +1216,7 @@ def _shared_simple_env( neon_binpath: Path, pg_distrib_dir: Path, pg_version: PgVersion, + pageserver_virtual_file_io_engine: str, ) -> Iterator[NeonEnv]: """ # Internal fixture backing the `neon_simple_env` fixture. If TEST_SHARED_FIXTURES @@ -1082,6 +1234,7 @@ def _shared_simple_env( shutil.rmtree(repo_dir, ignore_errors=True) with NeonEnvBuilder( + top_output_dir=top_output_dir, repo_dir=repo_dir, port_distributor=port_distributor, broker=default_broker, @@ -1093,6 +1246,7 @@ def _shared_simple_env( preserve_database_files=pytestconfig.getoption("--preserve-database-files"), test_name=request.node.name, test_output_dir=test_output_dir, + pageserver_virtual_file_io_engine=pageserver_virtual_file_io_engine, ) as builder: env = builder.init_start() @@ -1130,6 +1284,8 @@ def neon_env_builder( run_id: uuid.UUID, request: FixtureRequest, test_overlay_dir: Path, + top_output_dir: Path, + pageserver_virtual_file_io_engine: str, ) -> Iterator[NeonEnvBuilder]: """ Fixture to create a Neon environment for test. 
@@ -1149,6 +1305,7 @@ def neon_env_builder( # Return the builder to the caller with NeonEnvBuilder( + top_output_dir=top_output_dir, repo_dir=Path(repo_dir), port_distributor=port_distributor, mock_s3_server=mock_s3_server, @@ -1158,6 +1315,7 @@ def neon_env_builder( broker=default_broker, run_id=run_id, preserve_database_files=pytestconfig.getoption("--preserve-database-files"), + pageserver_virtual_file_io_engine=pageserver_virtual_file_io_engine, test_name=request.node.name, test_output_dir=test_output_dir, test_overlay_dir=test_overlay_dir, @@ -1511,8 +1669,10 @@ class NeonCli(AbstractNeonCli): id: int, overrides: Tuple[str, ...] = (), extra_env_vars: Optional[Dict[str, str]] = None, + register: bool = True, ) -> "subprocess.CompletedProcess[str]": - start_args = ["pageserver", "start", f"--id={id}", *overrides] + register_str = "true" if register else "false" + start_args = ["pageserver", "start", f"--id={id}", *overrides, f"--register={register_str}"] storage = self.env.pageserver_remote_storage append_pageserver_param_overrides( params_to_update=start_args, @@ -1939,6 +2099,7 @@ class NeonPageserver(PgProtocol): self, overrides: Tuple[str, ...] = (), extra_env_vars: Optional[Dict[str, str]] = None, + register: bool = True, ) -> "NeonPageserver": """ Start the page server. 
@@ -1948,7 +2109,7 @@ class NeonPageserver(PgProtocol): assert self.running is False self.env.neon_cli.pageserver_start( - self.id, overrides=overrides, extra_env_vars=extra_env_vars + self.id, overrides=overrides, extra_env_vars=extra_env_vars, register=register ) self.running = True return self @@ -2914,6 +3075,7 @@ class Endpoint(PgProtocol): # Write it back updated with open(config_path, "w") as file: + log.info(json.dumps(dict(data_dict, **kwargs))) json.dump(dict(data_dict, **kwargs), file, indent=4) # Mock the extension part of spec passed from control plane for local testing @@ -3486,6 +3648,10 @@ def get_test_overlay_dir(request: FixtureRequest, top_output_dir: Path) -> Path: return _get_test_dir(request, top_output_dir, "overlay-") +def get_shared_snapshot_dir_path(top_output_dir: Path, snapshot_name: str) -> Path: + return top_output_dir / "shared-snapshots" / snapshot_name + + def get_test_repo_dir(request: FixtureRequest, top_output_dir: Path) -> Path: return get_test_output_dir(request, top_output_dir) / "repo" @@ -3532,6 +3698,75 @@ def test_output_dir( allure_attach_from_dir(test_dir) +class FileAndThreadLock: + def __init__(self, path: Path): + self.path = path + self.thread_lock = threading.Lock() + self.fd: Optional[int] = None + + def __enter__(self): + self.fd = os.open(self.path, os.O_CREAT | os.O_WRONLY) + # lock thread lock before file lock so that there's no race + # around flocking / funlocking the file lock + self.thread_lock.acquire() + flock(self.fd, LOCK_EX) + + def __exit__(self, exc_type, exc_value, exc_traceback): + assert self.fd is not None + assert self.thread_lock.locked() # ... by us + flock(self.fd, LOCK_UN) + self.thread_lock.release() + os.close(self.fd) + self.fd = None + + +class SnapshotDirLocked: + def __init__(self, parent: SnapshotDir): + self._parent = parent + + def is_initialized(self): + # TODO: in the future, take a `tag` as argument and store it in the marker in set_initialized. 
+ # Then, in this function, compare marker file contents with the tag to invalidate the snapshot if the tag changed. + return self._parent._marker_file_path.exists() + + def set_initialized(self): + self._parent._marker_file_path.write_text("") + + @property + def path(self) -> Path: + return self._parent._path / "snapshot" + + +class SnapshotDir: + _path: Path + + def __init__(self, path: Path): + self._path = path + assert self._path.is_dir() + self._lock = FileAndThreadLock(self._lock_file_path) + + @property + def _lock_file_path(self) -> Path: + return self._path / "initializing.flock" + + @property + def _marker_file_path(self) -> Path: + return self._path / "initialized.marker" + + def __enter__(self) -> SnapshotDirLocked: + self._lock.__enter__() + return SnapshotDirLocked(self) + + def __exit__(self, exc_type, exc_value, exc_traceback): + self._lock.__exit__(exc_type, exc_value, exc_traceback) + + +def shared_snapshot_dir(top_output_dir, ident: str) -> SnapshotDir: + snapshot_dir_path = get_shared_snapshot_dir_path(top_output_dir, ident) + snapshot_dir_path.mkdir(exist_ok=True, parents=True) + return SnapshotDir(snapshot_dir_path) + + @pytest.fixture(scope="function") def test_overlay_dir(request: FixtureRequest, top_output_dir: Path) -> Optional[Path]: """ @@ -3541,7 +3776,7 @@ def test_overlay_dir(request: FixtureRequest, top_output_dir: Path) -> Optional[ The procedure cleans up after previous runs that were aborted (e.g. due to Ctrl-C, OOM kills, etc). 
""" - if os.getenv("NEON_ENV_BUILDER_FROM_REPO_DIR_USE_OVERLAYFS") is None: + if os.getenv("NEON_ENV_BUILDER_USE_OVERLAYFS_FOR_SNAPSHOTS") is None: return None overlay_dir = get_test_overlay_dir(request, top_output_dir) diff --git a/test_runner/fixtures/pageserver/http.py b/test_runner/fixtures/pageserver/http.py index cfa2a2674d..340cc9e9e3 100644 --- a/test_runner/fixtures/pageserver/http.py +++ b/test_runner/fixtures/pageserver/http.py @@ -20,6 +20,7 @@ from fixtures.utils import Fn class PageserverApiException(Exception): def __init__(self, message, status_code: int): super().__init__(message) + self.message = message self.status_code = status_code @@ -261,12 +262,18 @@ class PageserverHttpClient(requests.Session): ) self.verbose_error(res) - def tenant_detach(self, tenant_id: TenantId, detach_ignored=False): + def tenant_detach(self, tenant_id: TenantId, detach_ignored=False, timeout_secs=None): params = {} if detach_ignored: params["detach_ignored"] = "true" - res = self.post(f"http://localhost:{self.port}/v1/tenant/{tenant_id}/detach", params=params) + kwargs = {} + if timeout_secs is not None: + kwargs["timeout"] = timeout_secs + + res = self.post( + f"http://localhost:{self.port}/v1/tenant/{tenant_id}/detach", params=params, **kwargs + ) self.verbose_error(res) def tenant_reset(self, tenant_id: Union[TenantId, TenantShardId], drop_cache: bool): @@ -526,6 +533,17 @@ class PageserverHttpClient(requests.Session): res_json = res.json() assert res_json is None + def timeline_preserve_initdb_archive( + self, tenant_id: Union[TenantId, TenantShardId], timeline_id: TimelineId + ): + log.info( + f"Requesting initdb archive preservation for tenant {tenant_id} and timeline {timeline_id}" + ) + res = self.post( + f"http://localhost:{self.port}/v1/tenant/{tenant_id}/timeline/{timeline_id}/preserve_initdb_archive", + ) + self.verbose_error(res) + def timeline_get_lsn_by_timestamp( self, tenant_id: Union[TenantId, TenantShardId], diff --git 
a/test_runner/fixtures/pageserver/many_tenants.py b/test_runner/fixtures/pageserver/many_tenants.py new file mode 100644 index 0000000000..bbb4ccee5b --- /dev/null +++ b/test_runner/fixtures/pageserver/many_tenants.py @@ -0,0 +1,85 @@ +import concurrent.futures +import time +from typing import Any, Callable, Dict, Tuple + +import fixtures.pageserver.remote_storage +from fixtures.log_helper import log +from fixtures.neon_fixtures import ( + NeonEnv, + NeonEnvBuilder, +) +from fixtures.pageserver.utils import ( + wait_until_tenant_state, +) +from fixtures.remote_storage import LocalFsStorage, RemoteStorageKind +from fixtures.types import TenantId, TimelineId + + +def single_timeline( + neon_env_builder: NeonEnvBuilder, + setup_template: Callable[[NeonEnv], Tuple[TenantId, TimelineId, Dict[str, Any]]], + ncopies: int, +) -> NeonEnv: + """ + Create `ncopies` duplicates of a template tenant that has a single timeline. + """ + neon_env_builder.enable_pageserver_remote_storage(RemoteStorageKind.LOCAL_FS) + + env = neon_env_builder.init_start() + + remote_storage = env.pageserver_remote_storage + assert isinstance(remote_storage, LocalFsStorage) + + ps_http = env.pageserver.http_client() + # clean up the useless default tenant + ps_http.tenant_delete(env.initial_tenant) + + log.info("invoking callback to create template tenant") + template_tenant, template_timeline, template_config = setup_template(env) + log.info( + f"template tenant is template_tenant={template_tenant} template_timeline={template_timeline}" + ) + + log.info("detach template tenant form pageserver") + env.pageserver.tenant_detach(template_tenant) + env.pageserver.allowed_errors.append( + # tenant detach causes this because the underlying attach-hook removes the tenant from attachment_service entirely + ".*Dropped remote consistent LSN updates.*", + ) + + log.info(f"duplicating template tenant {ncopies} times in S3") + tenants = fixtures.pageserver.remote_storage.duplicate_tenant(env, template_tenant, 
ncopies) + + log.info("attach duplicated tenants to pageserver") + # In theory we could just attach all the tenants, force on-demand downloads via mgmt API, and be done. + # However, on-demand downloads are quite slow ATM. + # => do the on-demand downloads in Python. + assert ps_http.tenant_list() == [] + # make the attach fail after it created enough on-disk state to retry loading + # the tenant next startup, but before it can start background loops that would start download + ps_http.configure_failpoints(("attach-before-activate", "return")) + env.pageserver.allowed_errors.append( + ".*attach failed, setting tenant state to Broken: attach-before-activate.*" + ) + + def attach_broken(tenant): + env.pageserver.tenant_attach( + tenant, + config=template_config.copy(), + ) + time.sleep(0.1) + wait_until_tenant_state(ps_http, tenant, "Broken", 10) + + with concurrent.futures.ThreadPoolExecutor(max_workers=22) as executor: + executor.map(attach_broken, tenants) + + env.pageserver.stop( + immediate=True + ) # clears the failpoint as a side-effect; immediate to avoid hitting neon_local's timeout + tenant_timelines = list(map(lambda tenant: (tenant, template_timeline), tenants)) + log.info("python-side on-demand download the layer files into local tenant dir") + fixtures.pageserver.remote_storage.copy_all_remote_layer_files_to_local_tenant_dir( + env, tenant_timelines + ) + + return env diff --git a/test_runner/fixtures/pageserver/remote_storage.py b/test_runner/fixtures/pageserver/remote_storage.py new file mode 100644 index 0000000000..e6cd9b4614 --- /dev/null +++ b/test_runner/fixtures/pageserver/remote_storage.py @@ -0,0 +1,116 @@ +import concurrent.futures +import os +import queue +import shutil +import threading +from pathlib import Path +from typing import Any, List, Tuple + +from fixtures.neon_fixtures import NeonEnv, Pagectl +from fixtures.pageserver.types import ( + InvalidFileName, + parse_layer_file_name, +) +from fixtures.remote_storage import LocalFsStorage 
+from fixtures.types import TenantId, TimelineId + + +def duplicate_one_tenant(env: NeonEnv, template_tenant: TenantId, new_tenant: TenantId): + remote_storage = env.pageserver_remote_storage + assert isinstance(remote_storage, LocalFsStorage) + + src_timelines_dir: Path = remote_storage.tenant_path(template_tenant) / "timelines" + assert src_timelines_dir.is_dir(), f"{src_timelines_dir} is not a directory" + + assert isinstance(remote_storage, LocalFsStorage) + dst_timelines_dir: Path = remote_storage.tenant_path(new_tenant) / "timelines" + dst_timelines_dir.parent.mkdir(parents=False, exist_ok=False) + dst_timelines_dir.mkdir(parents=False, exist_ok=False) + + for tl in src_timelines_dir.iterdir(): + src_tl_dir = src_timelines_dir / tl.name + assert src_tl_dir.is_dir(), f"{src_tl_dir} is not a directory" + dst_tl_dir = dst_timelines_dir / tl.name + dst_tl_dir.mkdir(parents=False, exist_ok=False) + for file in tl.iterdir(): + shutil.copy2(file, dst_tl_dir) + if "__" in file.name: + Pagectl(env).raw_cli( + [ + "layer", + "rewrite-summary", + str(dst_tl_dir / file.name), + "--new-tenant-id", + str(new_tenant), + ] + ) + else: + # index_part etc need no patching + pass + return None + + +def duplicate_tenant(env: NeonEnv, template_tenant: TenantId, ncopies: int) -> List[TenantId]: + assert isinstance(env.pageserver_remote_storage, LocalFsStorage) + + def work(tenant_id): + duplicate_one_tenant(env, template_tenant, tenant_id) + + new_tenants: List[TenantId] = [TenantId.generate() for _ in range(0, ncopies)] + with concurrent.futures.ThreadPoolExecutor(max_workers=8) as executor: + executor.map(work, new_tenants) + return new_tenants + + +def local_layer_name_from_remote_name(remote_name: str) -> str: + try: + return parse_layer_file_name(remote_name).to_str() + except InvalidFileName as e: + comps = remote_name.rsplit("-", 1) + if len(comps) == 1: + raise InvalidFileName("no generation suffix found") from e + else: + assert len(comps) == 2 + layer_file_name, 
_generation = comps + try: + return parse_layer_file_name(layer_file_name).to_str() + except InvalidFileName: + raise + + +def copy_all_remote_layer_files_to_local_tenant_dir( + env: NeonEnv, tenant_timelines: List[Tuple[TenantId, TimelineId]] +): + remote_storage = env.pageserver_remote_storage + assert isinstance(remote_storage, LocalFsStorage) + work: queue.Queue[Any] = queue.Queue() + for tenant, timeline in tenant_timelines: + remote_timeline_path = remote_storage.timeline_path(tenant, timeline) + local_timeline_path = env.pageserver.timeline_dir(tenant, timeline) + local_timeline_path.mkdir(parents=True, exist_ok=True) + downloads = {} + for remote_layer in remote_timeline_path.glob("*__*"): + local_name = local_layer_name_from_remote_name(remote_layer.name) + assert local_name not in downloads, "remote storage must have had split brain" + downloads[local_name] = remote_layer + for local_name, remote_path in downloads.items(): + work.put((remote_path, local_timeline_path / local_name)) + + def copy_layer_worker(queue): + while True: + item = queue.get() + if item is None: + return + remote_path, local_path = item + # not copy2, so it looks like a recent download, in case that's relevant to e.g. 
eviction + shutil.copy(remote_path, local_path, follow_symlinks=False) + + workers = [] + n_threads = os.cpu_count() or 1 + for _ in range(0, n_threads): + w = threading.Thread(target=copy_layer_worker, args=[work]) + workers.append(w) + w.start() + work.put(None) + for w in workers: + w.join() diff --git a/test_runner/fixtures/pageserver/types.py b/test_runner/fixtures/pageserver/types.py index b3c1174b35..72fa30a2f2 100644 --- a/test_runner/fixtures/pageserver/types.py +++ b/test_runner/fixtures/pageserver/types.py @@ -31,10 +31,10 @@ class DeltaLayerFileName: key_start: Key key_end: Key - def is_l0(self): + def is_l0(self) -> bool: return self.key_start == KEY_MIN and self.key_end == KEY_MAX - def to_str(self): + def to_str(self) -> str: ret = f"{self.key_start.as_int():036X}-{self.key_end.as_int():036X}__{self.lsn_start.as_int():016X}-{self.lsn_end.as_int():016X}" assert self == parse_layer_file_name(ret) return ret @@ -107,7 +107,7 @@ def parse_layer_file_name(file_name: str) -> LayerFileName: except InvalidFileName: pass - raise ValueError() + raise InvalidFileName("neither image nor delta layer") def is_future_layer(layer_file_name: LayerFileName, disk_consistent_lsn: Lsn): diff --git a/test_runner/fixtures/pageserver/utils.py b/test_runner/fixtures/pageserver/utils.py index a6c4b8e930..6b2651e447 100644 --- a/test_runner/fixtures/pageserver/utils.py +++ b/test_runner/fixtures/pageserver/utils.py @@ -63,6 +63,14 @@ def wait_for_upload( ) +def _tenant_in_expected_state(tenant_info: Dict[str, Any], expected_state: str): + if tenant_info["state"]["slug"] == expected_state: + return True + if tenant_info["state"]["slug"] == "Broken": + raise RuntimeError(f"tenant became Broken, not {expected_state}") + return False + + def wait_until_tenant_state( pageserver_http: PageserverHttpClient, tenant_id: TenantId, @@ -80,10 +88,8 @@ def wait_until_tenant_state( log.debug(f"Tenant {tenant_id} state retrieval failure: {e}") else: log.debug(f"Tenant {tenant_id} data: 
{tenant}") - if tenant["state"]["slug"] == expected_state: + if _tenant_in_expected_state(tenant, expected_state): return tenant - if tenant["state"]["slug"] == "Broken": - raise RuntimeError(f"tenant became Broken, not {expected_state}") time.sleep(period) @@ -92,6 +98,34 @@ def wait_until_tenant_state( ) +def wait_until_all_tenants_state( + pageserver_http: PageserverHttpClient, + expected_state: str, + iterations: int, + period: float = 1.0, + http_error_ok: bool = True, +): + """ + Like wait_until_tenant_state, but checks all tenants. + """ + for _ in range(iterations): + try: + tenants = pageserver_http.tenant_list() + except Exception as e: + if http_error_ok: + log.debug(f"Failed to list tenants: {e}") + else: + raise + else: + if all(map(lambda tenant: _tenant_in_expected_state(tenant, expected_state), tenants)): + return + time.sleep(period) + + raise Exception( + f"Not all tenants became active {expected_state} within {iterations * period} seconds" + ) + + def wait_until_timeline_state( pageserver_http: PageserverHttpClient, tenant_id: Union[TenantId, TenantShardId], @@ -337,8 +371,24 @@ def tenant_delete_wait_completed( pageserver_http: PageserverHttpClient, tenant_id: TenantId, iterations: int, + ignore_errors: bool = False, ): - pageserver_http.tenant_delete(tenant_id=tenant_id) + if not ignore_errors: + pageserver_http.tenant_delete(tenant_id=tenant_id) + else: + interval = 0.5 + + def delete_request_sent(): + try: + pageserver_http.tenant_delete(tenant_id=tenant_id) + except PageserverApiException as e: + log.debug(e) + if e.status_code == 404: + return + except Exception as e: + log.debug(e) + + wait_until(iterations, interval=interval, func=delete_request_sent) wait_tenant_status_404(pageserver_http, tenant_id=tenant_id, iterations=iterations) diff --git a/test_runner/fixtures/parametrize.py b/test_runner/fixtures/parametrize.py index 53350138dd..d8ac92abb6 100644 --- a/test_runner/fixtures/parametrize.py +++ b/test_runner/fixtures/parametrize.py 
@@ -8,7 +8,7 @@ from _pytest.python import Metafunc from fixtures.pg_version import PgVersion """ -Dynamically parametrize tests by Postgres version and build type (debug/release/remote) +Dynamically parametrize tests by Postgres version, build type (debug/release/remote), and possibly by other parameters """ @@ -31,11 +31,12 @@ def build_type(request: FixtureRequest) -> Optional[str]: return None -def pytest_generate_tests(metafunc: Metafunc): - # Do not parametrize performance tests yet, we need to prepare grafana charts first - if "test_runner/performance" in metafunc.definition._nodeid: - return +@pytest.fixture(scope="function", autouse=True) +def pageserver_virtual_file_io_engine(request: FixtureRequest) -> Optional[str]: + return None + +def pytest_generate_tests(metafunc: Metafunc): if (v := os.environ.get("DEFAULT_PG_VERSION")) is None: pg_versions = [version for version in PgVersion if version != PgVersion.NOT_SET] else: @@ -46,5 +47,12 @@ def pytest_generate_tests(metafunc: Metafunc): else: build_types = [bt.lower()] - metafunc.parametrize("build_type", build_types) - metafunc.parametrize("pg_version", pg_versions, ids=map(lambda v: f"pg{v}", pg_versions)) + # Do not parametrize performance tests yet by Postgres version or build type, we need to prepare grafana charts first + if "test_runner/performance" not in metafunc.definition._nodeid: + metafunc.parametrize("build_type", build_types) + metafunc.parametrize("pg_version", pg_versions, ids=map(lambda v: f"pg{v}", pg_versions)) + + # A hacky way to parametrize tests only for `pageserver_virtual_file_io_engine=tokio-epoll-uring` + # And do not change test name for default `pageserver_virtual_file_io_engine=std-fs` to keep tests statistics + if (io_engine := os.environ.get("PAGESERVER_VIRTUAL_FILE_IO_ENGINE", "")) not in ("", "std-fs"): + metafunc.parametrize("pageserver_virtual_file_io_engine", [io_engine]) diff --git a/test_runner/fixtures/utils.py b/test_runner/fixtures/utils.py index 
cda788b2a4..91f33e1196 100644 --- a/test_runner/fixtures/utils.py +++ b/test_runner/fixtures/utils.py @@ -397,3 +397,36 @@ def run_pg_bench_small(pg_bin: "PgBin", connstr: str): } """ pg_bin.run(["pgbench", "-i", "-I dtGvp", "-s1", connstr]) + + +def humantime_to_ms(humantime: str) -> float: + """ + Converts Rust humantime's output string to milliseconds. + + humantime_to_ms("1h 1ms 406us") -> 3600001.406 + """ + + unit_multiplier_map = { + "ns": 1e-6, + "us": 1e-3, + "ms": 1, + "s": 1e3, + "m": 1e3 * 60, + "h": 1e3 * 60 * 60, + } + matcher = re.compile(rf"^(\d+)({'|'.join(unit_multiplier_map.keys())})$") + total_ms = 0.0 + + if humantime == "0": + return total_ms + + for item in humantime.split(): + if (match := matcher.search(item)) is not None: + n, unit = match.groups() + total_ms += int(n) * unit_multiplier_map[unit] + else: + raise ValueError( + f"can't parse '{item}' (from string '{humantime}'), known units are {', '.join(unit_multiplier_map.keys())}." + ) + + return round(total_ms, 3) diff --git a/test_runner/fixtures/workload.py b/test_runner/fixtures/workload.py index 30def1194d..f29a6cbf3c 100644 --- a/test_runner/fixtures/workload.py +++ b/test_runner/fixtures/workload.py @@ -21,12 +21,21 @@ class Workload: - reads, checking we get the right data (`validate`) """ - def __init__(self, env: NeonEnv, tenant_id: TenantId, timeline_id: TimelineId): + def __init__( + self, + env: NeonEnv, + tenant_id: TenantId, + timeline_id: TimelineId, + branch_name: Optional[str] = None, + ): self.env = env self.tenant_id = tenant_id self.timeline_id = timeline_id self.table = "foo" + # By default, use the default branch name for initial tenant in NeonEnv + self.branch_name = branch_name or "main" + self.expect_rows = 0 self.churn_cursor = 0 @@ -35,7 +44,7 @@ class Workload: def endpoint(self, pageserver_id: Optional[int] = None) -> Endpoint: if self._endpoint is None: self._endpoint = self.env.endpoints.create( - "main", + self.branch_name, tenant_id=self.tenant_id, 
pageserver_id=pageserver_id, endpoint_id="ep-workload", diff --git a/test_runner/performance/pageserver/README.md b/test_runner/performance/pageserver/README.md new file mode 100644 index 0000000000..fdd09cd946 --- /dev/null +++ b/test_runner/performance/pageserver/README.md @@ -0,0 +1,16 @@ +How to reproduce benchmark results / run these benchmarks interactively. + +1. Get an EC2 instance with Instance Store. Use the same instance type as used for the benchmark run. +2. Mount the Instance Store => `neon.git/scripts/ps_ec2_setup_instance_store` +3. Use a pytest command line (see other READMEs further up in the pytest hierarchy). + +For tests that take a long time to set up / consume a lot of storage space, +we use the test suite's repo_dir snapshotting functionality (`from_repo_dir`). +It supports mounting snapshots using overlayfs, which improves iteration time. + +Here's a full command line. + +``` +RUST_BACKTRACE=1 NEON_ENV_BUILDER_USE_OVERLAYFS_FOR_SNAPSHOTS=1 DEFAULT_PG_VERSION=15 BUILD_TYPE=release \ + ./scripts/pytest test_runner/performance/pageserver/pagebench/test_pageserver_max_throughput_getpage_at_latest_lsn.py +```` diff --git a/test_runner/performance/pageserver/__init__.py b/test_runner/performance/pageserver/__init__.py new file mode 100644 index 0000000000..e69de29bb2 diff --git a/test_runner/performance/pageserver/interactive/__init__.py b/test_runner/performance/pageserver/interactive/__init__.py new file mode 100644 index 0000000000..29644c240e --- /dev/null +++ b/test_runner/performance/pageserver/interactive/__init__.py @@ -0,0 +1,8 @@ +""" +Tests that aren't really tests or benchmarks. + +They're intended for the case where we want to standardize & automate setup, +but then debug a performance problem interactively. +It's kind of an abuse of the test framework, but, it's our only tool right +now to automate a complex test bench setup. 
+""" diff --git a/test_runner/performance/pageserver/interactive/test_many_small_tenants.py b/test_runner/performance/pageserver/interactive/test_many_small_tenants.py new file mode 100644 index 0000000000..3fb28ace46 --- /dev/null +++ b/test_runner/performance/pageserver/interactive/test_many_small_tenants.py @@ -0,0 +1,79 @@ +import os +import pdb + +import fixtures.pageserver.many_tenants as many_tenants +import pytest +from fixtures.neon_fixtures import ( + NeonEnv, + NeonEnvBuilder, + PgBin, + last_flush_lsn_upload, +) + +from performance.pageserver.util import ensure_pageserver_ready_for_benchmarking + +""" +Usage: +DEFAULT_PG_VERSION=15 BUILD_TYPE=debug NEON_ENV_BUILDER_USE_OVERLAYFS_FOR_SNAPSHOTS=1 INTERACTIVE=true \ + ./scripts/pytest --timeout 0 test_runner/performance/pageserver/interactive/test_many_small_tenants.py +""" + + +@pytest.mark.skipif( + os.environ.get("INTERACTIVE", "false") != "true", + reason="test is for interactive use only", +) +def test_many_small_tenants( + neon_env_builder: NeonEnvBuilder, + pg_bin: PgBin, +): + _env = setup_env(neon_env_builder, 2) # vary this to the desired number of tenants + _pg_bin = pg_bin + + # drop into pdb so that we can debug pageserver interactively, use pdb here + # For example, to interactively examine pageserver startup behavior, call + # _env.pageserver.stop(immediate=True) + # _env.pageserver.start() + # from the pdb shell. 
+ pdb.set_trace() + + +def setup_env( + neon_env_builder: NeonEnvBuilder, + n_tenants: int, +) -> NeonEnv: + def setup_template(env: NeonEnv): + # create our template tenant + config = { + "gc_period": "0s", + "checkpoint_timeout": "10 years", + "compaction_period": "20 s", + "compaction_threshold": 10, + "compaction_target_size": 134217728, + "checkpoint_distance": 268435456, + "image_creation_threshold": 3, + } + template_tenant, template_timeline = env.neon_cli.create_tenant(set_default=True) + env.pageserver.tenant_detach(template_tenant) + env.pageserver.allowed_errors.append( + # tenant detach causes this because the underlying attach-hook removes the tenant from attachment_service entirely + ".*Dropped remote consistent LSN updates.*", + ) + env.pageserver.tenant_attach(template_tenant, config) + ep = env.endpoints.create_start("main", tenant_id=template_tenant) + ep.safe_psql("create table foo(b text)") + for _ in range(0, 8): + ep.safe_psql("insert into foo(b) values ('some text')") + last_flush_lsn_upload(env, ep, template_tenant, template_timeline) + ep.stop_and_destroy() + return (template_tenant, template_timeline, config) + + def doit(neon_env_builder: NeonEnvBuilder) -> NeonEnv: + return many_tenants.single_timeline(neon_env_builder, setup_template, n_tenants) + + env = neon_env_builder.build_and_use_snapshot(f"many-small-tenants-{n_tenants}", doit) + + env.start() + ensure_pageserver_ready_for_benchmarking(env, n_tenants) + + return env diff --git a/test_runner/performance/pageserver/pagebench/__init__.py b/test_runner/performance/pageserver/pagebench/__init__.py new file mode 100644 index 0000000000..9f5e45c0a0 --- /dev/null +++ b/test_runner/performance/pageserver/pagebench/__init__.py @@ -0,0 +1,10 @@ +""" +Pagebench-based performance regression tests. 
+ +The defining characteristic of tests in this sub-directory is that they +are component-level tests, i.e., they exercise pageserver directly using `pagebench` +instead of benchmarking the full stack. + +See https://github.com/neondatabase/neon/issues/5771 +for the context in which this was developed. +""" diff --git a/test_runner/performance/pageserver/pagebench/test_pageserver_max_throughput_getpage_at_latest_lsn.py b/test_runner/performance/pageserver/pagebench/test_pageserver_max_throughput_getpage_at_latest_lsn.py new file mode 100644 index 0000000000..1ed7e577b9 --- /dev/null +++ b/test_runner/performance/pageserver/pagebench/test_pageserver_max_throughput_getpage_at_latest_lsn.py @@ -0,0 +1,210 @@ +import json +from pathlib import Path +from typing import Any, Dict, Tuple + +import fixtures.pageserver.many_tenants as many_tenants +import pytest +from fixtures.benchmark_fixture import MetricReport, NeonBenchmarker +from fixtures.log_helper import log +from fixtures.neon_fixtures import ( + NeonEnv, + NeonEnvBuilder, + PgBin, + wait_for_last_flush_lsn, +) +from fixtures.utils import get_scale_for_db, humantime_to_ms + +from performance.pageserver.util import ensure_pageserver_ready_for_benchmarking + + +# For reference, the space usage of the snapshots: +# admin@ip-172-31-13-23:[~/neon-main]: sudo du -hs /instance_store/test_output/shared-snapshots +# 137G /instance_store/test_output/shared-snapshots +# admin@ip-172-31-13-23:[~/neon-main]: sudo du -hs /instance_store/test_output/shared-snapshots/* +# 1.8G /instance_store/test_output/shared-snapshots/max_throughput_latest_lsn-1-13 +# 1.1G /instance_store/test_output/shared-snapshots/max_throughput_latest_lsn-1-6 +# 8.5G /instance_store/test_output/shared-snapshots/max_throughput_latest_lsn-10-13 +# 5.1G /instance_store/test_output/shared-snapshots/max_throughput_latest_lsn-10-6 +# 76G /instance_store/test_output/shared-snapshots/max_throughput_latest_lsn-100-13 +# 46G 
/instance_store/test_output/shared-snapshots/max_throughput_latest_lsn-100-6 +@pytest.mark.parametrize("duration", [30]) +@pytest.mark.parametrize("pgbench_scale", [get_scale_for_db(s) for s in [100, 200]]) +@pytest.mark.parametrize("n_tenants", [1, 10]) +@pytest.mark.timeout( + 10000 +) # TODO: this value is just "a really high number"; have this per instance type +def test_pageserver_max_throughput_getpage_at_latest_lsn( + neon_env_builder: NeonEnvBuilder, + zenbenchmark: NeonBenchmarker, + pg_bin: PgBin, + n_tenants: int, + pgbench_scale: int, + duration: int, +): + def record(metric, **kwargs): + zenbenchmark.record( + metric_name=f"pageserver_max_throughput_getpage_at_latest_lsn.{metric}", **kwargs + ) + + params: Dict[str, Tuple[Any, Dict[str, Any]]] = {} + + # params from fixtures + params.update( + { + "n_tenants": (n_tenants, {"unit": ""}), + "pgbench_scale": (pgbench_scale, {"unit": ""}), + "duration": (duration, {"unit": "s"}), + } + ) + + # configure cache sizes like in prod + page_cache_size = 16384 + max_file_descriptors = 500000 + neon_env_builder.pageserver_config_override = ( + f"page_cache_size={page_cache_size}; max_file_descriptors={max_file_descriptors}" + ) + params.update( + { + "pageserver_config_override.page_cache_size": ( + page_cache_size * 8192, + {"unit": "byte"}, + ), + "pageserver_config_override.max_file_descriptors": (max_file_descriptors, {"unit": ""}), + } + ) + + for param, (value, kwargs) in params.items(): + record(param, metric_value=value, report=MetricReport.TEST_PARAM, **kwargs) + env = setup_pageserver_with_pgbench_tenants(neon_env_builder, pg_bin, n_tenants, pgbench_scale) + run_benchmark_max_throughput_latest_lsn(env, pg_bin, record, duration) + + +def run_benchmark_max_throughput_latest_lsn( + env: NeonEnv, pg_bin: PgBin, record, duration_secs: int +): + """ + Benchmark `env.pageserver` for max throughput @ latest LSN and record results in `zenbenchmark`. 
+ """ + + ps_http = env.pageserver.http_client() + cmd = [ + str(env.neon_binpath / "pagebench"), + "get-page-latest-lsn", + "--mgmt-api-endpoint", + ps_http.base_url, + "--page-service-connstring", + env.pageserver.connstr(password=None), + "--runtime", + f"{duration_secs}s", + # don't specify the targets explicitly, let pagebench auto-discover them + ] + log.info(f"command: {' '.join(cmd)}") + basepath = pg_bin.run_capture(cmd, with_command_header=False) + results_path = Path(basepath + ".stdout") + log.info(f"Benchmark results at: {results_path}") + + with open(results_path, "r") as f: + results = json.load(f) + log.info(f"Results:\n{json.dumps(results, sort_keys=True, indent=2)}") + + total = results["total"] + + metric = "request_count" + record( + metric, + metric_value=total[metric], + unit="", + report=MetricReport.HIGHER_IS_BETTER, + ) + + metric = "latency_mean" + record( + metric, + metric_value=humantime_to_ms(total[metric]), + unit="ms", + report=MetricReport.LOWER_IS_BETTER, + ) + + metric = "latency_percentiles" + for k, v in total[metric].items(): + record( + f"{metric}.{k}", + metric_value=humantime_to_ms(v), + unit="ms", + report=MetricReport.LOWER_IS_BETTER, + ) + + +def setup_pageserver_with_pgbench_tenants( + neon_env_builder: NeonEnvBuilder, + pg_bin: PgBin, + n_tenants: int, + scale: int, +) -> NeonEnv: + """ + Utility function to set up a pageserver with a given number of identical tenants. + Each tenant is a pgbench tenant, initialize to a certain scale, and treated afterwards + with a repeat application of (pgbench simple-update workload, checkpoint, compact). + """ + + def setup_template(env: NeonEnv): + # use a config that makes production of on-disk state timing-insensitive + # as we ingest data into the tenant. 
+ config = { + "gc_period": "0s", # disable periodic gc + "checkpoint_timeout": "10 years", + "compaction_period": "0s", # disable periodic compaction + "compaction_threshold": 10, + "compaction_target_size": 134217728, + "checkpoint_distance": 268435456, + "image_creation_threshold": 3, + } + template_tenant, template_timeline = env.neon_cli.create_tenant(set_default=True) + env.pageserver.tenant_detach(template_tenant) + env.pageserver.allowed_errors.append( + # tenant detach causes this because the underlying attach-hook removes the tenant from attachment_service entirely + ".*Dropped remote consistent LSN updates.*", + ) + env.pageserver.tenant_attach(template_tenant, config) + ps_http = env.pageserver.http_client() + with env.endpoints.create_start("main", tenant_id=template_tenant) as ep: + pg_bin.run_capture(["pgbench", "-i", f"-s{scale}", "-I", "dtGvp", ep.connstr()]) + wait_for_last_flush_lsn(env, ep, template_tenant, template_timeline) + ps_http.timeline_checkpoint(template_tenant, template_timeline) + ps_http.timeline_compact(template_tenant, template_timeline) + for _ in range( + 0, 17 + ): # some prime number to avoid potential resonances with the "_threshold" variables from the config + # the L0s produced by this appear to have size ~5MiB + num_txns = 10_000 + pg_bin.run_capture( + ["pgbench", "-N", "-c1", "--transactions", f"{num_txns}", ep.connstr()] + ) + wait_for_last_flush_lsn(env, ep, template_tenant, template_timeline) + ps_http.timeline_checkpoint(template_tenant, template_timeline) + ps_http.timeline_compact(template_tenant, template_timeline) + # for reference, the output at scale=6 looked like so (306M total) + # ls -sh test_output/shared-snapshots/max_throughput_latest_lsn-2-6/snapshot/pageserver_1/tenants/35c30b88ea16a7a09f82d9c6a115551b/timelines/da902b378eebe83dc8a4e81cd3dc1c59 + # total 306M + # 188M 000000000000000000000000000000000000-030000000000000000000000000000000003__000000000149F060-0000000009E75829 + # 4.5M 
000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__0000000009E75829-000000000A21E919 + # 33M 000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__000000000A21E919-000000000C20CB71 + # 36M 000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__000000000C20CB71-000000000E470791 + # 16M 000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__000000000E470791-000000000F34AEF1 + # 8.2M 000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__000000000F34AEF1-000000000FABA8A9 + # 6.0M 000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__000000000FABA8A9-000000000FFE0639 + # 6.1M 000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__000000000FFE0639-000000001051D799 + # 4.7M 000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__000000001051D799-0000000010908F19 + # 4.6M 000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__0000000010908F19-0000000010CD3021 + + return (template_tenant, template_timeline, config) + + def doit(neon_env_builder: NeonEnvBuilder) -> NeonEnv: + return many_tenants.single_timeline(neon_env_builder, setup_template, n_tenants) + + env = neon_env_builder.build_and_use_snapshot( + f"max_throughput_latest_lsn-{n_tenants}-{scale}", doit + ) + env.start() + ensure_pageserver_ready_for_benchmarking(env, n_tenants) + return env diff --git a/test_runner/performance/pageserver/util.py b/test_runner/performance/pageserver/util.py new file mode 100644 index 0000000000..45eb652362 --- /dev/null +++ b/test_runner/performance/pageserver/util.py @@ -0,0 +1,29 @@ +""" +Utilities used by all code in this sub-directory +""" + +from fixtures.log_helper import log +from fixtures.neon_fixtures import NeonEnv +from fixtures.pageserver.utils import wait_until_all_tenants_state + + +def ensure_pageserver_ready_for_benchmarking(env: NeonEnv, n_tenants: int): + """ + Helper function. 
+ """ + ps_http = env.pageserver.http_client() + + log.info("wait for all tenants to become active") + wait_until_all_tenants_state( + ps_http, "Active", iterations=n_tenants, period=1, http_error_ok=False + ) + + # ensure all layers are resident for predictable performance + tenants = [info["id"] for info in ps_http.tenant_list()] + for tenant in tenants: + for timeline in ps_http.tenant_status(tenant)["timelines"]: + info = ps_http.layer_map_info(tenant, timeline) + for layer in info.historic_layers: + assert not layer.remote + + log.info("ready") diff --git a/test_runner/regress/test_compatibility.py b/test_runner/regress/test_compatibility.py index f9d6d0a934..d5d70951be 100644 --- a/test_runner/regress/test_compatibility.py +++ b/test_runner/regress/test_compatibility.py @@ -7,11 +7,13 @@ from typing import List, Optional import pytest import toml +from fixtures.log_helper import log from fixtures.neon_fixtures import ( NeonEnv, NeonEnvBuilder, PgBin, ) +from fixtures.pageserver.http import PageserverApiException from fixtures.pageserver.utils import ( timeline_delete_wait_completed, wait_for_last_record_lsn, @@ -136,6 +138,7 @@ def test_create_snapshot( for sk in env.safekeepers: sk.stop() env.pageserver.stop() + env.attachment_service.stop() # Directory `compatibility_snapshot_dir` is uploaded to S3 in a workflow, keep the name in sync with it compatibility_snapshot_dir = ( @@ -224,11 +227,17 @@ def test_forward_compatibility( try: neon_env_builder.num_safekeepers = 3 + neon_local_binpath = neon_env_builder.neon_binpath env = neon_env_builder.from_repo_dir( compatibility_snapshot_dir / "repo", neon_binpath=compatibility_neon_bin, pg_distrib_dir=compatibility_postgres_distrib_dir, ) + + # Use current neon_local even though we're using old binaries for + # everything else: our test code is written for latest CLI args. 
+ env.neon_local_binpath = neon_local_binpath + neon_env_builder.start() check_neon_works( @@ -269,14 +278,20 @@ def check_neon_works(env: NeonEnv, test_output_dir: Path, sql_dump_path: Path, r timeline_id = env.initial_timeline pg_version = env.pg_version - # Delete all files from local_fs_remote_storage except initdb.tar.zst, + try: + pageserver_http.timeline_preserve_initdb_archive(tenant_id, timeline_id) + except PageserverApiException as e: + # Allow the error as we might be running the old pageserver binary + log.info(f"Got allowed error: '{e}'") + + # Delete all files from local_fs_remote_storage except initdb-preserved.tar.zst, # the file is required for `timeline_create` with `existing_initdb_timeline_id`. # # TODO: switch to Path.walk() in Python 3.12 # for dirpath, _dirnames, filenames in (repo_dir / "local_fs_remote_storage").walk(): for dirpath, _dirnames, filenames in os.walk(repo_dir / "local_fs_remote_storage"): for filename in filenames: - if filename != "initdb.tar.zst": + if filename != "initdb-preserved.tar.zst" and filename != "initdb.tar.zst": (Path(dirpath) / filename).unlink() timeline_delete_wait_completed(pageserver_http, tenant_id, timeline_id) diff --git a/test_runner/regress/test_ddl_forwarding.py b/test_runner/regress/test_ddl_forwarding.py index 01aeb88bca..7174487e68 100644 --- a/test_runner/regress/test_ddl_forwarding.py +++ b/test_runner/regress/test_ddl_forwarding.py @@ -248,8 +248,15 @@ def test_ddl_forwarding(ddl: DdlForwardingContext): # We don't have compute_ctl, so here, so create neon_superuser here manually cur.execute("CREATE ROLE neon_superuser NOLOGIN CREATEDB CREATEROLE") - with pytest.raises(psycopg2.InternalError): - cur.execute("ALTER ROLE neon_superuser LOGIN") + # Contrary to popular belief, being superman does not make you superuser + cur.execute("CREATE ROLE superman LOGIN NOSUPERUSER PASSWORD 'jungle_man'") + + with ddl.pg.cursor(user="superman", password="jungle_man") as superman_cur: + # We allow real 
SUPERUSERs to ALTER neon_superuser + with pytest.raises(psycopg2.InternalError): + superman_cur.execute("ALTER ROLE neon_superuser LOGIN") + + cur.execute("ALTER ROLE neon_superuser LOGIN") with pytest.raises(psycopg2.InternalError): cur.execute("CREATE DATABASE trololobus WITH OWNER neon_superuser") diff --git a/test_runner/regress/test_disk_usage_eviction.py b/test_runner/regress/test_disk_usage_eviction.py index 70c3b77516..6a4f0edbea 100644 --- a/test_runner/regress/test_disk_usage_eviction.py +++ b/test_runner/regress/test_disk_usage_eviction.py @@ -2,7 +2,7 @@ import enum import time from collections import Counter from dataclasses import dataclass -from typing import Any, Dict, Tuple +from typing import Any, Dict, Iterable, Tuple import pytest import toml @@ -121,17 +121,7 @@ class EvictionEnv: } def count_layers_per_tenant(self, pageserver: NeonPageserver) -> Dict[TenantId, int]: - ret: Counter[TenantId] = Counter() - - for tenant_id, timeline_id in self.timelines: - timeline_dir = pageserver.timeline_dir(tenant_id, timeline_id) - assert timeline_dir.exists() - for file in timeline_dir.iterdir(): - if "__" not in file.name: - continue - ret[tenant_id] += 1 - - return dict(ret) + return count_layers_per_tenant(pageserver, self.timelines) def warm_up_tenant(self, tenant_id: TenantId): """ @@ -199,6 +189,22 @@ class EvictionEnv: wait_until(10, 1, statvfs_called) +def count_layers_per_tenant( + pageserver: NeonPageserver, timelines: Iterable[Tuple[TenantId, TimelineId]] +) -> Dict[TenantId, int]: + ret: Counter[TenantId] = Counter() + + for tenant_id, timeline_id in timelines: + timeline_dir = pageserver.timeline_dir(tenant_id, timeline_id) + assert timeline_dir.exists() + for file in timeline_dir.iterdir(): + if "__" not in file.name: + continue + ret[tenant_id] += 1 + + return dict(ret) + + def human_bytes(amt: float) -> str: suffixes = ["", "Ki", "Mi", "Gi"] @@ -243,21 +249,7 @@ def _eviction_env( timelines = [] for scale in pgbench_scales: - tenant_id, 
timeline_id = env.neon_cli.create_tenant( - conf={ - "gc_period": "0s", - "compaction_period": "0s", - "checkpoint_distance": f"{layer_size}", - "image_creation_threshold": "100", - "compaction_target_size": f"{layer_size}", - } - ) - - with env.endpoints.create_start("main", tenant_id=tenant_id) as endpoint: - pg_bin.run(["pgbench", "-i", f"-s{scale}", endpoint.connstr()]) - wait_for_last_flush_lsn(env, endpoint, tenant_id, timeline_id) - - timelines.append((tenant_id, timeline_id)) + timelines.append(pgbench_init_tenant(layer_size, scale, env, pg_bin)) # stop the safekeepers to avoid on-demand downloads caused by # initial logical size calculation triggered by walreceiver connection status @@ -266,25 +258,13 @@ def _eviction_env( # after stopping the safekeepers, we know that no new WAL will be coming in for tenant_id, timeline_id in timelines: - pageserver_http = env.get_tenant_pageserver(tenant_id).http_client() - - pageserver_http.timeline_checkpoint(tenant_id, timeline_id) - wait_for_upload_queue_empty(pageserver_http, tenant_id, timeline_id) - tl_info = pageserver_http.timeline_detail(tenant_id, timeline_id) - assert tl_info["last_record_lsn"] == tl_info["disk_consistent_lsn"] - assert tl_info["disk_consistent_lsn"] == tl_info["remote_consistent_lsn"] - pgbench_init_lsns[tenant_id] = Lsn(tl_info["last_record_lsn"]) - - layers = pageserver_http.layer_map_info(tenant_id, timeline_id) - log.info(f"{layers}") - assert ( - len(layers.historic_layers) >= 10 - ), "evictions happen at layer granularity, but we often assert at byte-granularity" + pgbench_init_lsns[tenant_id] = finish_tenant_creation(env, tenant_id, timeline_id, 10) eviction_env = EvictionEnv( timelines=timelines, neon_env=env, - pageserver_http=pageserver_http, + # this last tenant http client works for num_pageservers=1 + pageserver_http=env.get_tenant_pageserver(timelines[-1][0]).http_client(), layer_size=layer_size, pg_bin=pg_bin, pgbench_init_lsns=pgbench_init_lsns, @@ -293,6 +273,49 @@ def 
_eviction_env( return eviction_env +def pgbench_init_tenant( + layer_size: int, scale: int, env: NeonEnv, pg_bin: PgBin +) -> Tuple[TenantId, TimelineId]: + tenant_id, timeline_id = env.neon_cli.create_tenant( + conf={ + "gc_period": "0s", + "compaction_period": "0s", + "checkpoint_distance": f"{layer_size}", + "image_creation_threshold": "100", + "compaction_target_size": f"{layer_size}", + } + ) + + with env.endpoints.create_start("main", tenant_id=tenant_id) as endpoint: + pg_bin.run(["pgbench", "-i", f"-s{scale}", endpoint.connstr()]) + wait_for_last_flush_lsn(env, endpoint, tenant_id, timeline_id) + + return (tenant_id, timeline_id) + + +def finish_tenant_creation( + env: NeonEnv, + tenant_id: TenantId, + timeline_id: TimelineId, + min_expected_layers: int, +) -> Lsn: + pageserver_http = env.get_tenant_pageserver(tenant_id).http_client() + pageserver_http.timeline_checkpoint(tenant_id, timeline_id) + wait_for_upload_queue_empty(pageserver_http, tenant_id, timeline_id) + tl_info = pageserver_http.timeline_detail(tenant_id, timeline_id) + assert tl_info["last_record_lsn"] == tl_info["disk_consistent_lsn"] + assert tl_info["disk_consistent_lsn"] == tl_info["remote_consistent_lsn"] + pgbench_init_lsn = Lsn(tl_info["last_record_lsn"]) + + layers = pageserver_http.layer_map_info(tenant_id, timeline_id) + # log.info(f"{layers}") + assert ( + len(layers.historic_layers) >= min_expected_layers + ), "evictions happen at layer granularity, but we often assert at byte-granularity" + + return pgbench_init_lsn + + @pytest.fixture def eviction_env(request, neon_env_builder: NeonEnvBuilder, pg_bin: PgBin) -> EvictionEnv: return _eviction_env(request, neon_env_builder, pg_bin, num_pageservers=1) @@ -598,9 +621,82 @@ def test_partial_evict_tenant(eviction_env: EvictionEnv, order: EvictionOrder): assert abs_diff < 0.05 +@pytest.mark.parametrize( + "order", + [ + EvictionOrder.ABSOLUTE_ORDER, + EvictionOrder.RELATIVE_ORDER_EQUAL, + EvictionOrder.RELATIVE_ORDER_SPARE, + ], +) +def 
test_fast_growing_tenant(neon_env_builder: NeonEnvBuilder, pg_bin: PgBin, order: EvictionOrder): + """ + Create in order first smaller tenants and finally a single larger tenant. + Assert that with relative order modes, the disk usage based eviction is + more fair towards the smaller tenants. + """ + env = neon_env_builder.init_configs() + env.start() + env.pageserver.allowed_errors.append(r".* running disk usage based eviction due to pressure.*") + + # initial_tenant and initial_timeline do not exist + + # create N tenants the same fashion as EvictionEnv + layer_size = 5 * 1024**2 + timelines = [] + for scale in [1, 1, 1, 4]: + timelines.append((pgbench_init_tenant(layer_size, scale, env, pg_bin), scale)) + + env.neon_cli.safekeeper_stop() + + for (tenant_id, timeline_id), scale in timelines: + min_expected_layers = 4 if scale == 1 else 10 + finish_tenant_creation(env, tenant_id, timeline_id, min_expected_layers) + + tenant_layers = count_layers_per_tenant(env.pageserver, map(lambda x: x[0], timelines)) + (total_on_disk, _, _) = poor_mans_du(env, map(lambda x: x[0], timelines), env.pageserver, False) + + # cut 10 percent + response = env.pageserver.http_client().disk_usage_eviction_run( + {"evict_bytes": total_on_disk // 10, "eviction_order": order.config()} + ) + log.info(f"{response}") + + after_tenant_layers = count_layers_per_tenant(env.pageserver, map(lambda x: x[0], timelines)) + + ratios = [] + for i, ((tenant_id, _timeline_id), _scale) in enumerate(timelines): + # we expect the oldest to suffer most + originally, after = tenant_layers[tenant_id], after_tenant_layers[tenant_id] + log.info(f"{i + 1}th tenant went from {originally} -> {after}") + ratio = after / originally + ratios.append(ratio) + + assert ( + len(ratios) == 4 + ), "rest of the assertions expect 3 + 1 timelines, ratios, scales, all in order" + log.info(f"{ratios}") + + if order == EvictionOrder.ABSOLUTE_ORDER: + # first tenant loses most + assert ratios[0] <= ratios[1], "first should lose the 
most" + assert ratios[1] < ratios[2], "second should lose some" + assert ratios[1] < 1.0 + assert ratios[2] <= ratios[3], "third might not lose" + assert ratios[3] == 1.0, "tenant created last does not lose" + elif order == EvictionOrder.RELATIVE_ORDER_EQUAL: + assert all([x for x in ratios if x < 1.0]), "all tenants lose layers" + elif order == EvictionOrder.RELATIVE_ORDER_SPARE: + # with different layer sizes and pg versions, there are different combinations + assert len([x for x in ratios if x < 1.0]) >= 2, "require 2..4 tenants to lose layers" + assert ratios[3] < 1.0, "largest tenant always loses layers" + else: + raise RuntimeError(f"unimplemented {order}") + + def poor_mans_du( env: NeonEnv, - timelines: list[Tuple[TenantId, TimelineId]], + timelines: Iterable[Tuple[TenantId, TimelineId]], pageserver: NeonPageserver, verbose: bool = False, ) -> Tuple[int, int, int]: diff --git a/test_runner/regress/test_import.py b/test_runner/regress/test_import.py index faedf5d944..3519cbbaab 100644 --- a/test_runner/regress/test_import.py +++ b/test_runner/regress/test_import.py @@ -163,6 +163,8 @@ def test_import_from_vanilla(test_output_dir, pg_bin, vanilla_pg, neon_env_build endpoint = env.endpoints.create_start(endpoint_id, tenant_id=tenant) assert endpoint.safe_psql("select count(*) from t") == [(300000,)] + vanilla_pg.stop() + def test_import_from_pageserver_small( pg_bin: PgBin, neon_env_builder: NeonEnvBuilder, test_output_dir: Path diff --git a/test_runner/regress/test_migrations.py b/test_runner/regress/test_migrations.py new file mode 100644 index 0000000000..121fa91f66 --- /dev/null +++ b/test_runner/regress/test_migrations.py @@ -0,0 +1,37 @@ +import time + +from fixtures.neon_fixtures import NeonEnv + + +def test_migrations(neon_simple_env: NeonEnv): + env = neon_simple_env + env.neon_cli.create_branch("test_migrations", "empty") + + endpoint = env.endpoints.create("test_migrations") + log_path = endpoint.endpoint_path() / "compute.log" + + 
endpoint.respec(skip_pg_catalog_updates=False, features=["migrations"]) + endpoint.start() + + time.sleep(1) # Sleep to let migrations run + + with endpoint.cursor() as cur: + cur.execute("SELECT id FROM neon_migration.migration_id") + migration_id = cur.fetchall() + assert migration_id[0][0] == 2 + + with open(log_path, "r") as log_file: + logs = log_file.read() + assert "INFO handle_migrations: Ran 2 migrations" in logs + + endpoint.stop() + endpoint.start() + time.sleep(1) # Sleep to let migrations run + with endpoint.cursor() as cur: + cur.execute("SELECT id FROM neon_migration.migration_id") + migration_id = cur.fetchall() + assert migration_id[0][0] == 2 + + with open(log_path, "r") as log_file: + logs = log_file.read() + assert "INFO handle_migrations: Ran 0 migrations" in logs diff --git a/test_runner/regress/test_neon_local_cli.py b/test_runner/regress/test_neon_local_cli.py index 46b72fbca5..8edba49b8a 100644 --- a/test_runner/regress/test_neon_local_cli.py +++ b/test_runner/regress/test_neon_local_cli.py @@ -59,3 +59,5 @@ def test_neon_two_primary_endpoints_fail( env.neon_cli.endpoint_stop("ep1") # ep1 is stopped so create ep2 will succeed env.neon_cli.endpoint_start("ep2") + # cleanup + env.neon_cli.endpoint_stop("ep2") diff --git a/test_runner/regress/test_neon_superuser.py b/test_runner/regress/test_neon_superuser.py new file mode 100644 index 0000000000..6be7c114cb --- /dev/null +++ b/test_runner/regress/test_neon_superuser.py @@ -0,0 +1,34 @@ +import time + +from fixtures.neon_fixtures import NeonEnv +from fixtures.pg_version import PgVersion + + +def test_neon_superuser(neon_simple_env: NeonEnv, pg_version: PgVersion): + env = neon_simple_env + env.neon_cli.create_branch("test_neon_superuser", "empty") + endpoint = env.endpoints.create("test_neon_superuser") + endpoint.respec(skip_pg_catalog_updates=False, features=["migrations"]) + endpoint.start() + + time.sleep(1) # Sleep to let migrations run + + with endpoint.cursor() as cur: + cur.execute( + 
"CREATE ROLE mr_whiskers WITH PASSWORD 'cat' LOGIN INHERIT CREATEROLE CREATEDB BYPASSRLS REPLICATION IN ROLE neon_superuser" + ) + cur.execute("CREATE DATABASE neondb WITH OWNER mr_whiskers") + cur.execute("GRANT ALL PRIVILEGES ON DATABASE neondb TO neon_superuser") + + with endpoint.cursor(dbname="neondb", user="mr_whiskers", password="cat") as cur: + cur.execute("SELECT pg_has_role('mr_whiskers', 'neon_superuser', 'member')") + assert cur.fetchall()[0][0] + cur.execute("SELECT pg_has_role('mr_whiskers', 'neon_superuser', 'usage')") + assert cur.fetchall()[0][0] + + if pg_version == PgVersion.V16: + cur.execute("SELECT pg_has_role('mr_whiskers', 'neon_superuser', 'set')") + assert cur.fetchall()[0][0] + + cur.execute("CREATE PUBLICATION pub FOR ALL TABLES") + cur.execute("CREATE ROLE definitely_not_a_superuser WITH PASSWORD 'nope'") diff --git a/test_runner/regress/test_next_xid.py b/test_runner/regress/test_next_xid.py index da2580dbf9..e880445c4d 100644 --- a/test_runner/regress/test_next_xid.py +++ b/test_runner/regress/test_next_xid.py @@ -203,6 +203,16 @@ def test_import_at_2bil( $$; """ ) + + # Also create a multi-XID with members past the 2 billion mark + conn2 = endpoint.connect() + cur2 = conn2.cursor() + cur.execute("INSERT INTO t VALUES ('x')") + cur.execute("BEGIN; select * from t WHERE t = 'x' FOR SHARE;") + cur2.execute("BEGIN; select * from t WHERE t = 'x' FOR SHARE;") + cur.execute("COMMIT") + cur2.execute("COMMIT") + # A checkpoint writes a WAL record with xl_xid=0. Many other WAL # records would have the same effect. 
cur.execute("checkpoint") @@ -217,4 +227,4 @@ def test_import_at_2bil( conn = endpoint.connect() cur = conn.cursor() cur.execute("SELECT count(*) from t") - assert cur.fetchone() == (10000 + 1,) + assert cur.fetchone() == (10000 + 1 + 1,) diff --git a/test_runner/regress/test_pageserver_generations.py b/test_runner/regress/test_pageserver_generations.py index 63f6130af5..725ed63d1c 100644 --- a/test_runner/regress/test_pageserver_generations.py +++ b/test_runner/regress/test_pageserver_generations.py @@ -499,7 +499,8 @@ def test_emergency_mode(neon_env_builder: NeonEnvBuilder, pg_bin: PgBin): # and serve clients. env.pageserver.stop() # Non-immediate: implicitly checking that shutdown doesn't hang waiting for CP env.pageserver.start( - overrides=("--pageserver-config-override=control_plane_emergency_mode=true",) + overrides=("--pageserver-config-override=control_plane_emergency_mode=true",), + register=False, ) # The pageserver should provide service to clients diff --git a/test_runner/regress/test_pageserver_reconnect.py b/test_runner/regress/test_pageserver_reconnect.py new file mode 100644 index 0000000000..aecfcdd262 --- /dev/null +++ b/test_runner/regress/test_pageserver_reconnect.py @@ -0,0 +1,42 @@ +import threading +import time +from contextlib import closing + +from fixtures.log_helper import log +from fixtures.neon_fixtures import NeonEnv, PgBin + + +# Test updating neon.pageserver_connstring setting on the fly. +# +# This merely changes some whitespace in the connection string, so +# this doesn't prove that the new string actually takes effect. But at +# least the code gets exercised. 
+def test_pageserver_reconnect(neon_simple_env: NeonEnv, pg_bin: PgBin): + env = neon_simple_env + env.neon_cli.create_branch("test_pageserver_restarts") + endpoint = env.endpoints.create_start("test_pageserver_restarts") + n_reconnects = 1000 + timeout = 0.01 + scale = 10 + + def run_pgbench(connstr: str): + log.info(f"Start a pgbench workload on pg {connstr}") + pg_bin.run_capture(["pgbench", "-i", f"-s{scale}", connstr]) + pg_bin.run_capture(["pgbench", f"-T{int(n_reconnects*timeout)}", connstr]) + + thread = threading.Thread(target=run_pgbench, args=(endpoint.connstr(),), daemon=True) + thread.start() + + with closing(endpoint.connect()) as con: + with con.cursor() as c: + c.execute("SELECT setting FROM pg_settings WHERE name='neon.pageserver_connstring'") + connstring = c.fetchall()[0][0] + for i in range(n_reconnects): + time.sleep(timeout) + c.execute( + "alter system set neon.pageserver_connstring=%s", + (connstring + (" " * (i % 2)),), + ) + c.execute("select pg_reload_conf()") + + thread.join() diff --git a/test_runner/regress/test_pageserver_restart.py b/test_runner/regress/test_pageserver_restart.py index c4499196b5..753898f747 100644 --- a/test_runner/regress/test_pageserver_restart.py +++ b/test_runner/regress/test_pageserver_restart.py @@ -1,4 +1,6 @@ +import random from contextlib import closing +from typing import Optional import pytest from fixtures.log_helper import log @@ -141,18 +143,24 @@ def test_pageserver_restart(neon_env_builder: NeonEnvBuilder): # Test that repeatedly kills and restarts the page server, while the # safekeeper and compute node keep running. 
@pytest.mark.timeout(540) -def test_pageserver_chaos(neon_env_builder: NeonEnvBuilder, build_type: str): +@pytest.mark.parametrize("shard_count", [None, 4]) +def test_pageserver_chaos( + neon_env_builder: NeonEnvBuilder, build_type: str, shard_count: Optional[int] +): if build_type == "debug": pytest.skip("times out in debug builds") neon_env_builder.enable_pageserver_remote_storage(s3_storage()) neon_env_builder.enable_scrub_on_exit() + if shard_count is not None: + neon_env_builder.num_pageservers = shard_count - env = neon_env_builder.init_start() + env = neon_env_builder.init_start(initial_tenant_shard_count=shard_count) # these can happen, if we shutdown at a good time. to be fixed as part of #5172. message = ".*duplicated L1 layer layer=.*" - env.pageserver.allowed_errors.append(message) + for ps in env.pageservers: + ps.allowed_errors.append(message) # Use a tiny checkpoint distance, to create a lot of layers quickly. # That allows us to stress the compaction and layer flushing logic more. @@ -192,13 +200,19 @@ def test_pageserver_chaos(neon_env_builder: NeonEnvBuilder, build_type: str): log.info(f"shared_buffers is {row[0]}, table size {row[1]}") assert int(row[0]) < int(row[1]) + # We run "random" kills using a fixed seed, to improve reproducibility if a test + # failure is related to a particular order of operations. 
+ seed = 0xDEADBEEF + rng = random.Random(seed) + # Update the whole table, then immediately kill and restart the pageserver for i in range(1, 15): endpoint.safe_psql("UPDATE foo set updates = updates + 1") # This kills the pageserver immediately, to simulate a crash - env.pageserver.stop(immediate=True) - env.pageserver.start() + to_kill = rng.choice(env.pageservers) + to_kill.stop(immediate=True) + to_kill.start() # Check that all the updates are visible num_updates = endpoint.safe_psql("SELECT sum(updates) FROM foo")[0][0] diff --git a/test_runner/regress/test_pageserver_secondary.py b/test_runner/regress/test_pageserver_secondary.py index 521b96779a..293152dd62 100644 --- a/test_runner/regress/test_pageserver_secondary.py +++ b/test_runner/regress/test_pageserver_secondary.py @@ -5,7 +5,10 @@ from typing import Any, Dict, Optional import pytest from fixtures.log_helper import log from fixtures.neon_fixtures import NeonEnvBuilder, NeonPageserver, S3Scrubber -from fixtures.pageserver.utils import assert_prefix_empty, tenant_delete_wait_completed +from fixtures.pageserver.utils import ( + assert_prefix_empty, + tenant_delete_wait_completed, +) from fixtures.remote_storage import LocalFsStorage, RemoteStorageKind from fixtures.types import TenantId, TimelineId from fixtures.utils import wait_until @@ -135,6 +138,16 @@ def test_location_conf_churn(neon_env_builder: NeonEnvBuilder, seed: int): pageserver.stop() pageserver.start() if last_state_ps[0].startswith("Attached") and latest_attached == pageserver.id: + # /re-attach call will bump generation: track that in our state in case we do an + # "attach in same generation" operation later + assert last_state_ps[1] is not None # latest_attached == pageserfer.id implies this + # The re-attach API increments generation by exactly one. 
+ new_generation = last_state_ps[1] + 1 + last_state[pageserver.id] = (last_state_ps[0], new_generation) + tenants = pageserver.http_client().tenant_list() + assert len(tenants) == 1 + assert tenants[0]["generation"] == new_generation + log.info("Entering postgres...") workload.churn_rows(rng.randint(128, 256), pageserver.id) workload.validate(pageserver.id) diff --git a/test_runner/regress/test_pg_regress.py b/test_runner/regress/test_pg_regress.py index f26d04e2f3..e4219ec7a6 100644 --- a/test_runner/regress/test_pg_regress.py +++ b/test_runner/regress/test_pg_regress.py @@ -2,25 +2,40 @@ # This file runs pg_regress-based tests. # from pathlib import Path +from typing import Optional -from fixtures.neon_fixtures import NeonEnv, check_restored_datadir_content +import pytest +from fixtures.neon_fixtures import ( + NeonEnvBuilder, + check_restored_datadir_content, +) +from fixtures.remote_storage import s3_storage # Run the main PostgreSQL regression tests, in src/test/regress. # +@pytest.mark.parametrize("shard_count", [None, 4]) def test_pg_regress( - neon_simple_env: NeonEnv, + neon_env_builder: NeonEnvBuilder, test_output_dir: Path, pg_bin, capsys, base_dir: Path, pg_distrib_dir: Path, + shard_count: Optional[int], ): - env = neon_simple_env + """ + :param shard_count: if None, create an unsharded tenant. Otherwise create a tenant with this + many shards. + """ + if shard_count is not None: + neon_env_builder.num_pageservers = shard_count + neon_env_builder.enable_pageserver_remote_storage(s3_storage()) + neon_env_builder.enable_scrub_on_exit() + env = neon_env_builder.init_start(initial_tenant_shard_count=shard_count) - env.neon_cli.create_branch("test_pg_regress", "empty") # Connect to postgres and create a database called "regression". - endpoint = env.endpoints.create_start("test_pg_regress") + endpoint = env.endpoints.create_start("main") endpoint.safe_psql("CREATE DATABASE regression") # Create some local directories for pg_regress to run in. 
@@ -61,22 +76,25 @@ def test_pg_regress( # Run the PostgreSQL "isolation" tests, in src/test/isolation. # +@pytest.mark.parametrize("shard_count", [None, 4]) def test_isolation( - neon_simple_env: NeonEnv, + neon_env_builder: NeonEnvBuilder, test_output_dir: Path, pg_bin, capsys, base_dir: Path, pg_distrib_dir: Path, + shard_count: Optional[int], ): - env = neon_simple_env + if shard_count is not None: + neon_env_builder.num_pageservers = shard_count + neon_env_builder.enable_pageserver_remote_storage(s3_storage()) + neon_env_builder.enable_scrub_on_exit() + env = neon_env_builder.init_start(initial_tenant_shard_count=shard_count) - env.neon_cli.create_branch("test_isolation", "empty") # Connect to postgres and create a database called "regression". # isolation tests use prepared transactions, so enable them - endpoint = env.endpoints.create_start( - "test_isolation", config_lines=["max_prepared_transactions=100"] - ) + endpoint = env.endpoints.create_start("main", config_lines=["max_prepared_transactions=100"]) endpoint.safe_psql("CREATE DATABASE isolation_regression") # Create some local directories for pg_isolation_regress to run in. @@ -114,19 +132,24 @@ def test_isolation( # Run extra Neon-specific pg_regress-based tests. The tests and their # schedule file are in the sql_regress/ directory. +@pytest.mark.parametrize("shard_count", [None, 4]) def test_sql_regress( - neon_simple_env: NeonEnv, + neon_env_builder: NeonEnvBuilder, test_output_dir: Path, pg_bin, capsys, base_dir: Path, pg_distrib_dir: Path, + shard_count: Optional[int], ): - env = neon_simple_env + if shard_count is not None: + neon_env_builder.num_pageservers = shard_count + neon_env_builder.enable_pageserver_remote_storage(s3_storage()) + neon_env_builder.enable_scrub_on_exit() + env = neon_env_builder.init_start(initial_tenant_shard_count=shard_count) - env.neon_cli.create_branch("test_sql_regress", "empty") # Connect to postgres and create a database called "regression". 
- endpoint = env.endpoints.create_start("test_sql_regress") + endpoint = env.endpoints.create_start("main") endpoint.safe_psql("CREATE DATABASE regression") # Create some local directories for pg_regress to run in. diff --git a/test_runner/regress/test_sharding.py b/test_runner/regress/test_sharding.py new file mode 100644 index 0000000000..c16bfc2ec6 --- /dev/null +++ b/test_runner/regress/test_sharding.py @@ -0,0 +1,85 @@ +from fixtures.log_helper import log +from fixtures.neon_fixtures import ( + NeonEnvBuilder, +) +from fixtures.remote_storage import s3_storage +from fixtures.types import TimelineId +from fixtures.workload import Workload + + +def test_sharding_smoke( + neon_env_builder: NeonEnvBuilder, +): + """ + Test the basic lifecycle of a sharded tenant: + - ingested data gets split up + - page service reads + - timeline creation and deletion + - splits + """ + + shard_count = 4 + neon_env_builder.num_pageservers = shard_count + + # 1MiB stripes: enable getting some meaningful data distribution without + # writing large quantities of data in this test. The stripe size is given + # in number of 8KiB pages. + stripe_size = 128 + + # Use S3-compatible remote storage so that we can scrub: this test validates + # that the scrubber doesn't barf when it sees a sharded tenant. 
+ neon_env_builder.enable_pageserver_remote_storage(s3_storage()) + neon_env_builder.enable_scrub_on_exit() + + neon_env_builder.preserve_database_files = True + + env = neon_env_builder.init_start( + initial_tenant_shard_count=shard_count, initial_tenant_shard_stripe_size=stripe_size + ) + tenant_id = env.initial_tenant + + pageservers = dict((int(p.id), p) for p in env.pageservers) + shards = env.attachment_service.locate(tenant_id) + + def get_sizes(): + sizes = {} + for shard in shards: + node_id = int(shard["node_id"]) + pageserver = pageservers[node_id] + sizes[node_id] = pageserver.http_client().tenant_status(shard["shard_id"])[ + "current_physical_size" + ] + log.info(f"sizes = {sizes}") + return sizes + + # Test that timeline creation works on a sharded tenant + timeline_b = env.neon_cli.create_branch("branch_b", tenant_id=tenant_id) + + # Test that we can write data to a sharded tenant + workload = Workload(env, tenant_id, timeline_b, branch_name="branch_b") + workload.init() + + sizes_before = get_sizes() + workload.write_rows(256) + + # Test that we can read data back from a sharded tenant + workload.validate() + + # Validate that the data is spread across pageservers + sizes_after = get_sizes() + # Our sizes increased when we wrote data + assert sum(sizes_after.values()) > sum(sizes_before.values()) + # That increase is present on all shards + assert all(sizes_after[ps.id] > sizes_before[ps.id] for ps in env.pageservers) + + # Validate that timeline list API works properly on all shards + for shard in shards: + node_id = int(shard["node_id"]) + pageserver = pageservers[node_id] + timelines = set( + TimelineId(tl["timeline_id"]) + for tl in pageserver.http_client().timeline_list(shard["shard_id"]) + ) + assert timelines == {env.initial_timeline, timeline_b} + + # TODO: test timeline deletion and tenant deletion (depends on change in attachment_service) diff --git a/test_runner/regress/test_tenant_delete.py b/test_runner/regress/test_tenant_delete.py 
index 7a5b1c0fc2..b4e5a550f3 100644 --- a/test_runner/regress/test_tenant_delete.py +++ b/test_runner/regress/test_tenant_delete.py @@ -411,9 +411,7 @@ def test_long_timeline_create_cancelled_by_tenant_delete(neon_env_builder: NeonE pageserver_http.configure_failpoints((failpoint, "pause")) def hit_pausable_failpoint_and_later_fail(): - with pytest.raises( - PageserverApiException, match="new timeline \\S+ has invalid disk_consistent_lsn" - ): + with pytest.raises(PageserverApiException, match="NotFound: tenant"): pageserver_http.timeline_create( env.pg_version, env.initial_tenant, env.initial_timeline ) @@ -443,8 +441,8 @@ def test_long_timeline_create_cancelled_by_tenant_delete(neon_env_builder: NeonE try: wait_until(10, 1, has_hit_failpoint) - # it should start ok, sync up with the stuck creation, then fail because disk_consistent_lsn was not updated - # then deletion should fail and set the tenant broken + # it should start ok, sync up with the stuck creation, then hang waiting for the timeline + # to shut down. deletion = Thread(target=start_deletion) deletion.start() @@ -573,11 +571,16 @@ def test_tenant_delete_races_timeline_creation( ps_http = env.pageserver.http_client() tenant_id = env.initial_tenant - # Sometimes it ends with "InternalServerError(Cancelled", sometimes with "InternalServerError(Operation was cancelled" + # When timeline creation is cancelled by tenant deletion, it is during Tenant::shutdown(), and + # acting on a shutdown tenant generates a 503 response (if caller retried they would later) get + # a 404 after the tenant is fully deleted. CANCELLED_ERROR = ( - ".*POST.*Cancelled request finished with an error: InternalServerError\\(.*ancelled" + ".*POST.*Cancelled request finished successfully status=503 Service Unavailable" ) + # This can occur sometimes. + CONFLICT_MESSAGE = ".*Precondition failed: Invalid state Stopping. 
Expected Active or Broken.*" + env.pageserver.allowed_errors.extend( [ # lucky race with stopping from flushing a layer we fail to schedule any uploads @@ -586,6 +589,9 @@ def test_tenant_delete_races_timeline_creation( ".*POST.*/timeline.* request was dropped before completing", # Timeline creation runs into this error CANCELLED_ERROR, + # Timeline deletion can run into this error during deletion + CONFLICT_MESSAGE, + ".*tenant_delete_handler.*still waiting, taking longer than expected.*", ] ) @@ -621,7 +627,10 @@ def test_tenant_delete_races_timeline_creation( ps_http.configure_failpoints((DELETE_BEFORE_CLEANUP_FAILPOINT, "pause")) def tenant_delete(): - ps_http.tenant_delete(tenant_id) + def tenant_delete_inner(): + ps_http.tenant_delete(tenant_id) + + wait_until(100, 0.5, tenant_delete_inner) Thread(target=tenant_delete).start() @@ -638,10 +647,8 @@ def test_tenant_delete_races_timeline_creation( ps_http.configure_failpoints((BEFORE_INITDB_UPLOAD_FAILPOINT, "off")) iterations = poll_for_remote_storage_iterations(remote_storage_kind) - try: - tenant_delete_wait_completed(ps_http, tenant_id, iterations) - except PageserverApiException: - pass + + tenant_delete_wait_completed(ps_http, tenant_id, iterations, ignore_errors=True) # Physical deletion should have happened assert_prefix_empty( diff --git a/test_runner/regress/test_tenant_detach.py b/test_runner/regress/test_tenant_detach.py index d548e63cc1..8d5ef4e3c4 100644 --- a/test_runner/regress/test_tenant_detach.py +++ b/test_runner/regress/test_tenant_detach.py @@ -482,7 +482,7 @@ def test_detach_while_attaching( pageserver_http.tenant_detach(tenant_id) # And re-attach - pageserver_http.configure_failpoints([("attach-before-activate", "return(5000)")]) + pageserver_http.configure_failpoints([("attach-before-activate-sleep", "return(5000)")]) env.pageserver.tenant_attach(tenant_id) @@ -681,7 +681,7 @@ def test_detach_while_activating( pageserver_http.tenant_detach(tenant_id) # And re-attach, but stop attach 
task_mgr task from completing - pageserver_http.configure_failpoints([("attach-before-activate", "return(600000)")]) + pageserver_http.configure_failpoints([("attach-before-activate-sleep", "return(600000)")]) env.pageserver.tenant_attach(tenant_id) # The tenant is in the Activating state. This should not block us from @@ -695,7 +695,7 @@ def test_detach_while_activating( ), "Only ignored tenant should be missing" # Subsequently attaching it again should still work - pageserver_http.configure_failpoints([("attach-before-activate", "off")]) + pageserver_http.configure_failpoints([("attach-before-activate-sleep", "off")]) env.pageserver.tenant_attach(tenant_id) wait_until_tenant_state(pageserver_http, tenant_id, "Active", 5) diff --git a/test_runner/regress/test_tenants.py b/test_runner/regress/test_tenants.py index 2ee2d8125a..5164bda470 100644 --- a/test_runner/regress/test_tenants.py +++ b/test_runner/regress/test_tenants.py @@ -1,3 +1,4 @@ +import concurrent.futures import os import time from contextlib import closing @@ -7,6 +8,7 @@ from pathlib import Path from typing import List import pytest +import requests from fixtures.log_helper import log from fixtures.metrics import ( PAGESERVER_GLOBAL_METRICS, @@ -17,7 +19,9 @@ from fixtures.neon_fixtures import ( NeonEnv, NeonEnvBuilder, ) -from fixtures.pageserver.utils import timeline_delete_wait_completed +from fixtures.pageserver.http import PageserverApiException +from fixtures.pageserver.utils import timeline_delete_wait_completed, wait_until_tenant_active +from fixtures.pg_version import PgVersion from fixtures.remote_storage import RemoteStorageKind from fixtures.types import Lsn, TenantId from fixtures.utils import wait_until @@ -341,3 +345,78 @@ def test_pageserver_with_empty_tenants(neon_env_builder: NeonEnvBuilder): assert ( tenant_active_count == 1 ), f"Tenant {tenant_with_empty_timelines} should have metric as active" + + +def test_create_churn_during_restart(neon_env_builder: NeonEnvBuilder): + """ + 
Probabilistic stress test for the pageserver's handling of tenant requests + across a restart. This is intended to catch things like: + - Bad response codes during shutdown (e.g. returning 500 instead of 503) + - Issues where a tenant is still starting up while we receive a request for it + - Issues with interrupting/resuming tenant/timeline creation in shutdown + """ + env = neon_env_builder.init_configs() + env.start() + tenant_id: TenantId = env.initial_tenant + timeline_id = env.initial_timeline + + # Multiple creation requests which race will generate this error + env.pageserver.allowed_errors.append(".*Conflict: Tenant is already being modified.*") + + # Tenant creation requests which arrive out of order will generate complaints about + # generation nubmers out of order. + env.pageserver.allowed_errors.append(".*Generation .+ is less than existing .+") + + # Our multiple creation requests will advance generation quickly, and when we skip + # a generation number we can generate these warnings + env.pageserver.allowed_errors.append(".*Dropped remote consistent LSN updates for tenant .+") + + # Timeline::flush_and_shutdown cannot tell if it is hitting a failure because of + # an incomplete attach, or some other problem. In the field this should be rare, + # so we allow it to log at WARN, even if it is occasionally a false positive. + env.pageserver.allowed_errors.append(".*failed to freeze and flush.*") + + # When we shut down a tenant during a timeline creation, initdb is not cancelled, we wait + # for it to complete (since https://github.com/neondatabase/neon/pull/6451). This means + # that shutdown can be delayed by >=1s on debug builds where initdb takes a long time to run. + env.pageserver.allowed_errors.append(".*still waiting, taking longer than expected... 
gate.*") + + def create_bg(delay_ms): + time.sleep(delay_ms / 1000.0) + try: + env.pageserver.tenant_create(tenant_id=tenant_id) + env.pageserver.http_client().timeline_create( + PgVersion.NOT_SET, tenant_id, new_timeline_id=timeline_id + ) + except PageserverApiException as e: + if e.status_code == 409: + log.info(f"delay_ms={delay_ms} 409") + pass + elif e.status_code == 400: + if "is less than existing" in e.message: + # We send creation requests very close together in time: it is expected that these + # race, and sometimes chigher-generation'd requests arrive first. The pageserver rightly + # rejects any attempt to make a generation number go backwards. + pass + else: + raise + else: + raise + except requests.exceptions.ConnectionError: + # Our requests might arrive during shutdown and be cut off at the transport level + pass + + for _ in range(0, 10): + with concurrent.futures.ThreadPoolExecutor() as executor: + futs = [] + for delay_ms in (0, 1, 10, 50, 100, 200, 500, 800): + f = executor.submit(create_bg, delay_ms) + futs.append(f) + env.pageserver.stop() + env.pageserver.start() + + for f in futs: + f.result(timeout=10) + + # The tenant should end up active + wait_until_tenant_active(env.pageserver.http_client(), tenant_id, iterations=10, period=1) diff --git a/test_runner/regress/test_timeline_size.py b/test_runner/regress/test_timeline_size.py index 2e58a413e4..4c5cb32caa 100644 --- a/test_runner/regress/test_timeline_size.py +++ b/test_runner/regress/test_timeline_size.py @@ -868,7 +868,7 @@ def test_ondemand_activation(neon_env_builder: NeonEnvBuilder): ) assert pageserver_http.get_metric_value("pageserver_tenant_startup_complete_total") == n_tenants - # Check that tenant deletion proactively wakes tenants: this is done separately to the main + # Check that tenant deletion/detach proactively wakes tenants: this is done separately to the main # body of the test because it will disrupt tenant counts env.pageserver.stop() env.pageserver.start( @@ -876,9 
+876,22 @@ def test_ondemand_activation(neon_env_builder: NeonEnvBuilder): ) wait_until(10, 1, at_least_one_active) - delete_tenant_id = list( + + detach_tenant_id = list( [(tid, s) for (tid, s) in get_tenant_states().items() if s == "Attaching"] )[0][0] + delete_tenant_id = list( + [(tid, s) for (tid, s) in get_tenant_states().items() if s == "Attaching"] + )[1][0] + + # Detaching a stuck tenant should proceed promptly + # (reproducer for https://github.com/neondatabase/neon/pull/6430) + env.pageserver.http_client().tenant_detach(detach_tenant_id, timeout_secs=10) + tenant_ids.remove(detach_tenant_id) + # FIXME: currently the mechanism for cancelling attach is to set state to broken, which is reported spuriously at error level + env.pageserver.allowed_errors.append( + ".*attach failed, setting tenant state to Broken: Shut down while Attaching" + ) # Deleting a stuck tenant should prompt it to go active with concurrent.futures.ThreadPoolExecutor() as executor: @@ -912,9 +925,10 @@ def test_ondemand_activation(neon_env_builder: NeonEnvBuilder): wait_tenant_status_404(pageserver_http, tenant_id=delete_tenant_id, iterations=40) tenant_ids.remove(delete_tenant_id) - # Check that all the stuck tenants proceed to active (apart from the one that deletes) + # Check that all the stuck tenants proceed to active (apart from the one that deletes, and the one + # we detached) wait_until(10, 1, all_active) - assert len(get_tenant_states()) == n_tenants - 1 + assert len(get_tenant_states()) == n_tenants - 2 def test_timeline_logical_size_task_priority(neon_env_builder: NeonEnvBuilder): diff --git a/test_runner/regress/test_wal_restore.py b/test_runner/regress/test_wal_restore.py index 7d03f644d1..97db857c74 100644 --- a/test_runner/regress/test_wal_restore.py +++ b/test_runner/regress/test_wal_restore.py @@ -137,6 +137,9 @@ def test_wal_restore_http(neon_env_builder: NeonEnvBuilder): ps_client = env.pageserver.http_client() + # Mark the initdb archive for preservation + 
ps_client.timeline_preserve_initdb_archive(tenant_id, timeline_id) + # shut down the endpoint and delete the timeline from the pageserver endpoint.stop() diff --git a/vendor/postgres-v14 b/vendor/postgres-v14 index 8207291128..11e970fe2b 160000 --- a/vendor/postgres-v14 +++ b/vendor/postgres-v14 @@ -1 +1 @@ -Subproject commit 82072911287cabb32018cf92c8425fa1c744def4 +Subproject commit 11e970fe2be56804f0a786ec5fc8141ffefa4ca7 diff --git a/vendor/postgres-v15 b/vendor/postgres-v15 index c1c2272f43..731b4d1609 160000 --- a/vendor/postgres-v15 +++ b/vendor/postgres-v15 @@ -1 +1 @@ -Subproject commit c1c2272f436ed9231f6172f49de219fe71a9280d +Subproject commit 731b4d1609d6db1c953755810a41e0e67ea3db7b diff --git a/vendor/postgres-v16 b/vendor/postgres-v16 index 7be4a52d72..cf302768b2 160000 --- a/vendor/postgres-v16 +++ b/vendor/postgres-v16 @@ -1 +1 @@ -Subproject commit 7be4a52d728459b79b59343c57d338c3073059c8 +Subproject commit cf302768b2890569956641e0e5ba112ae1445351 diff --git a/vendor/revisions.json b/vendor/revisions.json index 3d626cb2bc..c7b33f8c8a 100644 --- a/vendor/revisions.json +++ b/vendor/revisions.json @@ -1,5 +1,5 @@ { - "postgres-v16": "7be4a52d728459b79b59343c57d338c3073059c8", - "postgres-v15": "c1c2272f436ed9231f6172f49de219fe71a9280d", - "postgres-v14": "82072911287cabb32018cf92c8425fa1c744def4" + "postgres-v16": "cf302768b2890569956641e0e5ba112ae1445351", + "postgres-v15": "731b4d1609d6db1c953755810a41e0e67ea3db7b", + "postgres-v14": "11e970fe2be56804f0a786ec5fc8141ffefa4ca7" } diff --git a/workspace_hack/Cargo.toml b/workspace_hack/Cargo.toml index dbd46054a4..9d0f9bfcee 100644 --- a/workspace_hack/Cargo.toml +++ b/workspace_hack/Cargo.toml @@ -29,10 +29,9 @@ chrono = { version = "0.4", default-features = false, features = ["clock", "serd clap = { version = "4", features = ["derive", "string"] } clap_builder = { version = "4", default-features = false, features = ["color", "help", "std", "string", "suggestions", "usage"] } crossbeam-utils = { 
version = "0.8" } -dashmap = { version = "5", default-features = false, features = ["raw-api"] } +diesel = { version = "2", features = ["postgres", "serde_json"] } either = { version = "1" } fail = { version = "0.5", default-features = false, features = ["failpoints"] } -futures = { version = "0.3" } futures-channel = { version = "0.3", features = ["sink"] } futures-core = { version = "0.3" } futures-executor = { version = "0.3" } @@ -74,6 +73,7 @@ tokio-rustls = { version = "0.24" } tokio-util = { version = "0.7", features = ["codec", "compat", "io", "rt"] } toml_datetime = { version = "0.6", default-features = false, features = ["serde"] } toml_edit = { version = "0.19", features = ["serde"] } +tonic = { version = "0.9", features = ["tls-roots"] } tower = { version = "0.4", default-features = false, features = ["balance", "buffer", "limit", "log", "timeout", "util"] } tracing = { version = "0.1", features = ["log"] } tracing-core = { version = "0.1" } @@ -109,8 +109,10 @@ regex-automata = { version = "0.4", default-features = false, features = ["dfa-o regex-syntax = { version = "0.8" } serde = { version = "1", features = ["alloc", "derive"] } syn-dff4ba8e3ae991db = { package = "syn", version = "1", features = ["extra-traits", "full", "visit"] } -syn-f595c2ba2a3f28df = { package = "syn", version = "2", features = ["extra-traits", "full", "visit", "visit-mut"] } +syn-f595c2ba2a3f28df = { package = "syn", version = "2", features = ["extra-traits", "fold", "full", "visit", "visit-mut"] } time-macros = { version = "0.2", default-features = false, features = ["formatting", "parsing", "serde"] } +toml_datetime = { version = "0.6", default-features = false, features = ["serde"] } +toml_edit = { version = "0.19", features = ["serde"] } zstd = { version = "0.13" } zstd-safe = { version = "7", default-features = false, features = ["arrays", "legacy", "std", "zdict_builder"] } zstd-sys = { version = "2", default-features = false, features = ["legacy", "std", "zdict_builder"] 
}