diff --git a/.github/actionlint.yml b/.github/actionlint.yml index 7a97e2ae55..aec5b4ee75 100644 --- a/.github/actionlint.yml +++ b/.github/actionlint.yml @@ -25,3 +25,4 @@ config-variables: - PGREGRESS_PG17_PROJECT_ID - SLACK_ON_CALL_QA_STAGING_STREAM - DEV_AWS_OIDC_ROLE_MANAGE_BENCHMARK_EC2_VMS_ARN + - SLACK_ON_CALL_STORAGE_STAGING_STREAM diff --git a/.github/workflows/_check-codestyle-rust.yml b/.github/workflows/_check-codestyle-rust.yml new file mode 100644 index 0000000000..cbc47c6406 --- /dev/null +++ b/.github/workflows/_check-codestyle-rust.yml @@ -0,0 +1,92 @@ +name: Check Codestyle Rust + +on: + workflow_call: + inputs: + build-tools-image: + description: "build-tools image" + required: true + type: string + archs: + description: "Json array of architectures to run on" + required: true + type: string + + +defaults: + run: + shell: bash -euxo pipefail {0} + +jobs: + check-codestyle-rust: + strategy: + matrix: + arch: ${{ fromJson(inputs.archs) }} + runs-on: ${{ fromJson(format('["self-hosted", "{0}"]', matrix.arch == 'arm64' && 'small-arm64' || 'small')) }} + + container: + image: ${{ inputs.build-tools-image }} + credentials: + username: ${{ secrets.NEON_DOCKERHUB_USERNAME }} + password: ${{ secrets.NEON_DOCKERHUB_PASSWORD }} + options: --init + + steps: + - name: Checkout + uses: actions/checkout@v4 + with: + submodules: true + + - name: Cache cargo deps + uses: actions/cache@v4 + with: + path: | + ~/.cargo/registry + !~/.cargo/registry/src + ~/.cargo/git + target + key: v1-${{ runner.os }}-${{ runner.arch }}-cargo-${{ hashFiles('./Cargo.lock') }}-${{ hashFiles('./rust-toolchain.toml') }}-rust + + # Some of our rust modules use FFI and need those to be checked + - name: Get postgres headers + run: make postgres-headers -j$(nproc) + + # cargo hack runs the given cargo subcommand (clippy in this case) for all feature combinations. + # This will catch compiler & clippy warnings in all feature combinations. 
+ # TODO: use cargo hack for build and test as well, but, that's quite expensive. + # NB: keep clippy args in sync with ./run_clippy.sh + # + # The only difference between "clippy --debug" and "clippy --release" is that in --release mode, + # #[cfg(debug_assertions)] blocks are not built. It's not worth building everything for a second + # time just for that, so skip "clippy --release". + - run: | + CLIPPY_COMMON_ARGS="$( source .neon_clippy_args; echo "$CLIPPY_COMMON_ARGS")" + if [ "$CLIPPY_COMMON_ARGS" = "" ]; then + echo "No clippy args found in .neon_clippy_args" + exit 1 + fi + echo "CLIPPY_COMMON_ARGS=${CLIPPY_COMMON_ARGS}" >> $GITHUB_ENV + - name: Run cargo clippy (debug) + run: cargo hack --features default --ignore-unknown-features --feature-powerset clippy $CLIPPY_COMMON_ARGS + + - name: Check documentation generation + run: cargo doc --workspace --no-deps --document-private-items + env: + RUSTDOCFLAGS: "-Dwarnings -Arustdoc::private_intra_doc_links" + + # Use `${{ !cancelled() }}` to run quick tests after the longer clippy run + - name: Check formatting + if: ${{ !cancelled() }} + run: cargo fmt --all -- --check + + # https://github.com/facebookincubator/cargo-guppy/tree/bec4e0eb29dcd1faac70b1b5360267fc02bf830e/tools/cargo-hakari#2-keep-the-workspace-hack-up-to-date-in-ci + - name: Check rust dependencies + if: ${{ !cancelled() }} + run: | + cargo hakari generate --diff # workspace-hack Cargo.toml is up-to-date + cargo hakari manage-deps --dry-run # all workspace crates depend on workspace-hack + + # https://github.com/EmbarkStudios/cargo-deny + - name: Check rust licenses/bans/advisories/sources + if: ${{ !cancelled() }} + run: cargo deny check --hide-inclusion-graph diff --git a/.github/workflows/build_and_test.yml b/.github/workflows/build_and_test.yml index 01f5c3ede9..9ec5273af7 100644 --- a/.github/workflows/build_and_test.yml +++ b/.github/workflows/build_and_test.yml @@ -164,77 +164,11 @@ jobs: check-codestyle-rust: needs: [ check-permissions, 
build-build-tools-image ] - strategy: - matrix: - arch: [ x64, arm64 ] - runs-on: ${{ fromJson(format('["self-hosted", "{0}"]', matrix.arch == 'arm64' && 'small-arm64' || 'small')) }} - - container: - image: ${{ needs.build-build-tools-image.outputs.image }}-bookworm - credentials: - username: ${{ secrets.NEON_DOCKERHUB_USERNAME }} - password: ${{ secrets.NEON_DOCKERHUB_PASSWORD }} - options: --init - - steps: - - name: Checkout - uses: actions/checkout@v4 - with: - submodules: true - - - name: Cache cargo deps - uses: actions/cache@v4 - with: - path: | - ~/.cargo/registry - !~/.cargo/registry/src - ~/.cargo/git - target - key: v1-${{ runner.os }}-${{ runner.arch }}-cargo-${{ hashFiles('./Cargo.lock') }}-${{ hashFiles('./rust-toolchain.toml') }}-rust - - # Some of our rust modules use FFI and need those to be checked - - name: Get postgres headers - run: make postgres-headers -j$(nproc) - - # cargo hack runs the given cargo subcommand (clippy in this case) for all feature combinations. - # This will catch compiler & clippy warnings in all feature combinations. - # TODO: use cargo hack for build and test as well, but, that's quite expensive. - # NB: keep clippy args in sync with ./run_clippy.sh - # - # The only difference between "clippy --debug" and "clippy --release" is that in --release mode, - # #[cfg(debug_assertions)] blocks are not built. It's not worth building everything for second - # time just for that, so skip "clippy --release". 
- - run: | - CLIPPY_COMMON_ARGS="$( source .neon_clippy_args; echo "$CLIPPY_COMMON_ARGS")" - if [ "$CLIPPY_COMMON_ARGS" = "" ]; then - echo "No clippy args found in .neon_clippy_args" - exit 1 - fi - echo "CLIPPY_COMMON_ARGS=${CLIPPY_COMMON_ARGS}" >> $GITHUB_ENV - - name: Run cargo clippy (debug) - run: cargo hack --features default --ignore-unknown-features --feature-powerset clippy $CLIPPY_COMMON_ARGS - - - name: Check documentation generation - run: cargo doc --workspace --no-deps --document-private-items - env: - RUSTDOCFLAGS: "-Dwarnings -Arustdoc::private_intra_doc_links" - - # Use `${{ !cancelled() }}` to run quck tests after the longer clippy run - - name: Check formatting - if: ${{ !cancelled() }} - run: cargo fmt --all -- --check - - # https://github.com/facebookincubator/cargo-guppy/tree/bec4e0eb29dcd1faac70b1b5360267fc02bf830e/tools/cargo-hakari#2-keep-the-workspace-hack-up-to-date-in-ci - - name: Check rust dependencies - if: ${{ !cancelled() }} - run: | - cargo hakari generate --diff # workspace-hack Cargo.toml is up-to-date - cargo hakari manage-deps --dry-run # all workspace crates depend on workspace-hack - - # https://github.com/EmbarkStudios/cargo-deny - - name: Check rust licenses/bans/advisories/sources - if: ${{ !cancelled() }} - run: cargo deny check --hide-inclusion-graph + uses: ./.github/workflows/_check-codestyle-rust.yml + with: + build-tools-image: ${{ needs.build-build-tools-image.outputs.image }}-bookworm + archs: '["x64", "arm64"]' + secrets: inherit build-and-test-locally: needs: [ tag, build-build-tools-image ] @@ -346,25 +280,22 @@ jobs: # XXX: no coverage data handling here, since benchmarks are run on release builds, # while coverage is currently collected for the debug ones - report-benchmarks-failures: + report-benchmarks-results-to-slack: needs: [ benchmarks, create-test-report ] - if: github.ref_name == 'main' && failure() && needs.benchmarks.result == 'failure' - permissions: - id-token: write # 
aws-actions/configure-aws-credentials - statuses: write - contents: write - pull-requests: write + if: github.ref_name == 'main' && !cancelled() && contains(fromJSON('["success", "failure"]'), needs.benchmarks.result) runs-on: ubuntu-22.04 steps: - - uses: slackapi/slack-github-action@v1 + - uses: slackapi/slack-github-action@v2 with: - channel-id: C060CNA47S9 # on-call-staging-storage-stream - slack-message: | - Benchmarks failed on main <${{ github.event.head_commit.url }}|${{ github.sha }}> - <${{ needs.create-test-report.outputs.report-url }}|Allure report> - env: - SLACK_BOT_TOKEN: ${{ secrets.SLACK_BOT_TOKEN }} + method: chat.postMessage + token: ${{ secrets.SLACK_BOT_TOKEN }} + payload: | + channel: "${{ vars.SLACK_ON_CALL_STORAGE_STAGING_STREAM }}" + text: | + Benchmarks on main: *${{ needs.benchmarks.result }}* + - <${{ needs.create-test-report.outputs.report-url }}|Allure report> + - <${{ github.event.head_commit.url }}|${{ github.sha }}> create-test-report: needs: [ check-permissions, build-and-test-locally, coverage-report, build-build-tools-image, benchmarks ] @@ -728,30 +659,6 @@ jobs: tags: | neondatabase/neon-test-extensions-${{ matrix.version.pg }}:${{needs.tag.outputs.build-tag}}-${{ matrix.version.debian }}-${{ matrix.arch }} - - name: Build compute-tools image - # compute-tools are Postgres independent, so build it only once - # We pick 16, because that builds on debian 11 with older glibc (and is - # thus compatible with newer glibc), rather than 17 on Debian 12, as - # that isn't guaranteed to be compatible with Debian 11 - if: matrix.version.pg == 'v16' - uses: docker/build-push-action@v6 - with: - target: compute-tools-image - context: . 
- build-args: | - GIT_VERSION=${{ github.event.pull_request.head.sha || github.sha }} - BUILD_TAG=${{ needs.tag.outputs.build-tag }} - TAG=${{ needs.build-build-tools-image.outputs.image-tag }}-${{ matrix.version.debian }} - DEBIAN_VERSION=${{ matrix.version.debian }} - provenance: false - push: true - pull: true - file: compute/compute-node.Dockerfile - cache-from: type=registry,ref=cache.neon.build/compute-node-${{ matrix.version.pg }}:cache-${{ matrix.version.debian }}-${{ matrix.arch }} - cache-to: ${{ github.ref_name == 'main' && format('type=registry,ref=cache.neon.build/compute-tools-{0}:cache-{1}-{2},mode=max', matrix.version.pg, matrix.version.debian, matrix.arch) || '' }} - tags: | - neondatabase/compute-tools:${{ needs.tag.outputs.build-tag }}-${{ matrix.version.debian }}-${{ matrix.arch }} - compute-node-image: needs: [ compute-node-image-arch, tag ] permissions: @@ -794,14 +701,6 @@ jobs: neondatabase/neon-test-extensions-${{ matrix.version.pg }}:${{ needs.tag.outputs.build-tag }}-${{ matrix.version.debian }}-x64 \ neondatabase/neon-test-extensions-${{ matrix.version.pg }}:${{ needs.tag.outputs.build-tag }}-${{ matrix.version.debian }}-arm64 - - name: Create multi-arch compute-tools image - if: matrix.version.pg == 'v16' - run: | - docker buildx imagetools create -t neondatabase/compute-tools:${{ needs.tag.outputs.build-tag }} \ - -t neondatabase/compute-tools:${{ needs.tag.outputs.build-tag }}-${{ matrix.version.debian }} \ - neondatabase/compute-tools:${{ needs.tag.outputs.build-tag }}-${{ matrix.version.debian }}-x64 \ - neondatabase/compute-tools:${{ needs.tag.outputs.build-tag }}-${{ matrix.version.debian }}-arm64 - - name: Configure AWS credentials uses: aws-actions/configure-aws-credentials@v4 with: @@ -817,12 +716,6 @@ jobs: docker buildx imagetools create -t 369495373322.dkr.ecr.eu-central-1.amazonaws.com/compute-node-${{ matrix.version.pg }}:${{ needs.tag.outputs.build-tag }} \ neondatabase/compute-node-${{ matrix.version.pg }}:${{ 
needs.tag.outputs.build-tag }} - - name: Push multi-arch compute-tools image to ECR - if: matrix.version.pg == 'v16' - run: | - docker buildx imagetools create -t 369495373322.dkr.ecr.eu-central-1.amazonaws.com/compute-tools:${{ needs.tag.outputs.build-tag }} \ - neondatabase/compute-tools:${{ needs.tag.outputs.build-tag }} - vm-compute-node-image: needs: [ check-permissions, tag, compute-node-image ] runs-on: [ self-hosted, large ] @@ -1001,9 +894,6 @@ jobs: docker buildx imagetools create -t $repo/neon:latest \ $repo/neon:${{ needs.tag.outputs.build-tag }} - docker buildx imagetools create -t $repo/compute-tools:latest \ - $repo/compute-tools:${{ needs.tag.outputs.build-tag }} - for version in ${VERSIONS}; do docker buildx imagetools create -t $repo/compute-node-${version}:latest \ $repo/compute-node-${version}:${{ needs.tag.outputs.build-tag }} @@ -1032,7 +922,7 @@ jobs: - name: Copy all images to prod ECR if: github.ref_name == 'release' || github.ref_name == 'release-proxy' || github.ref_name == 'release-compute' run: | - for image in neon compute-tools {vm-,}compute-node-{v14,v15,v16,v17}; do + for image in neon {vm-,}compute-node-{v14,v15,v16,v17}; do docker buildx imagetools create -t 093970136003.dkr.ecr.eu-central-1.amazonaws.com/${image}:${{ needs.tag.outputs.build-tag }} \ 369495373322.dkr.ecr.eu-central-1.amazonaws.com/${image}:${{ needs.tag.outputs.build-tag }} done @@ -1044,7 +934,7 @@ jobs: with: client_id: ${{ vars.AZURE_DEV_CLIENT_ID }} image_tag: ${{ needs.tag.outputs.build-tag }} - images: neon compute-tools vm-compute-node-v14 vm-compute-node-v15 vm-compute-node-v16 vm-compute-node-v17 compute-node-v14 compute-node-v15 compute-node-v16 compute-node-v17 + images: neon vm-compute-node-v14 vm-compute-node-v15 vm-compute-node-v16 vm-compute-node-v17 compute-node-v14 compute-node-v15 compute-node-v16 compute-node-v17 registry_name: ${{ vars.AZURE_DEV_REGISTRY_NAME }} subscription_id: ${{ vars.AZURE_DEV_SUBSCRIPTION_ID }} tenant_id: ${{ 
vars.AZURE_TENANT_ID }} @@ -1056,7 +946,7 @@ jobs: with: client_id: ${{ vars.AZURE_PROD_CLIENT_ID }} image_tag: ${{ needs.tag.outputs.build-tag }} - images: neon compute-tools vm-compute-node-v14 vm-compute-node-v15 vm-compute-node-v16 vm-compute-node-v17 compute-node-v14 compute-node-v15 compute-node-v16 compute-node-v17 + images: neon vm-compute-node-v14 vm-compute-node-v15 vm-compute-node-v16 vm-compute-node-v17 compute-node-v14 compute-node-v15 compute-node-v16 compute-node-v17 registry_name: ${{ vars.AZURE_PROD_REGISTRY_NAME }} subscription_id: ${{ vars.AZURE_PROD_SUBSCRIPTION_ID }} tenant_id: ${{ vars.AZURE_TENANT_ID }} diff --git a/.github/workflows/pre-merge-checks.yml b/.github/workflows/pre-merge-checks.yml index b2e00d94f7..e6dfbaeed8 100644 --- a/.github/workflows/pre-merge-checks.yml +++ b/.github/workflows/pre-merge-checks.yml @@ -1,6 +1,12 @@ name: Pre-merge checks on: + pull_request: + paths: + - .github/workflows/_check-codestyle-python.yml + - .github/workflows/_check-codestyle-rust.yml + - .github/workflows/build-build-tools-image.yml + - .github/workflows/pre-merge-checks.yml merge_group: branches: - main @@ -17,8 +23,10 @@ jobs: runs-on: ubuntu-22.04 outputs: python-changed: ${{ steps.python-src.outputs.any_changed }} + rust-changed: ${{ steps.rust-src.outputs.any_changed }} steps: - uses: actions/checkout@v4 + - uses: tj-actions/changed-files@4edd678ac3f81e2dc578756871e4d00c19191daf # v45.0.4 id: python-src with: @@ -30,11 +38,25 @@ jobs: poetry.lock pyproject.toml + - uses: tj-actions/changed-files@4edd678ac3f81e2dc578756871e4d00c19191daf # v45.0.4 + id: rust-src + with: + files: | + .github/workflows/_check-codestyle-rust.yml + .github/workflows/build-build-tools-image.yml + .github/workflows/pre-merge-checks.yml + **/**.rs + **/Cargo.toml + Cargo.toml + Cargo.lock + - name: PRINT ALL CHANGED FILES FOR DEBUG PURPOSES env: PYTHON_CHANGED_FILES: ${{ steps.python-src.outputs.all_changed_files }} + RUST_CHANGED_FILES: ${{ 
steps.rust-src.outputs.all_changed_files }} run: | echo "${PYTHON_CHANGED_FILES}" + echo "${RUST_CHANGED_FILES}" build-build-tools-image: if: needs.get-changed-files.outputs.python-changed == 'true' @@ -55,6 +77,16 @@ jobs: build-tools-image: ${{ needs.build-build-tools-image.outputs.image }}-bookworm-x64 secrets: inherit + check-codestyle-rust: + if: needs.get-changed-files.outputs.rust-changed == 'true' + needs: [ get-changed-files, build-build-tools-image ] + uses: ./.github/workflows/_check-codestyle-rust.yml + with: + # `-bookworm-x64` suffix should match the combination in `build-build-tools-image` + build-tools-image: ${{ needs.build-build-tools-image.outputs.image }}-bookworm-x64 + archs: '["x64"]' + secrets: inherit + # To get items from the merge queue merged into main we need to satisfy "Status checks that are required". # Currently we require 2 jobs (checks with exact name): # - conclusion @@ -67,6 +99,7 @@ jobs: needs: - get-changed-files - check-codestyle-python + - check-codestyle-rust runs-on: ubuntu-22.04 steps: - name: Create fake `neon-cloud-e2e` check diff --git a/Cargo.lock b/Cargo.lock index 44143fa0da..02b02a09c1 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -1312,6 +1312,7 @@ dependencies = [ "tracing-utils", "url", "utils", + "uuid", "vm_monitor", "workspace_hack", "zstd", @@ -1605,6 +1606,32 @@ dependencies = [ "typenum", ] +[[package]] +name = "curve25519-dalek" +version = "4.1.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "97fb8b7c4503de7d6ae7b42ab72a5a59857b4c937ec27a3d4539dba95b5ab2be" +dependencies = [ + "cfg-if", + "cpufeatures", + "curve25519-dalek-derive", + "digest", + "fiat-crypto", + "rustc_version", + "subtle", +] + +[[package]] +name = "curve25519-dalek-derive" +version = "0.1.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f46882e17999c6cc590af592290432be3bce0428cb0d5f8b6715e4dc7b383eb3" +dependencies = [ + "proc-macro2", + "quote", + "syn 2.0.90", +] + 
[[package]] name = "darling" version = "0.20.1" @@ -1653,6 +1680,20 @@ dependencies = [ "parking_lot_core 0.9.8", ] +[[package]] +name = "dashmap" +version = "6.1.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5041cc499144891f3790297212f32a74fb938e5136a14943f338ef9e0ae276cf" +dependencies = [ + "cfg-if", + "crossbeam-utils", + "hashbrown 0.14.5", + "lock_api", + "once_cell", + "parking_lot_core 0.9.8", +] + [[package]] name = "data-encoding" version = "2.4.0" @@ -1861,6 +1902,28 @@ dependencies = [ "spki 0.7.3", ] +[[package]] +name = "ed25519" +version = "2.2.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "115531babc129696a58c64a4fef0a8bf9e9698629fb97e9e40767d235cfbcd53" +dependencies = [ + "signature 2.2.0", +] + +[[package]] +name = "ed25519-dalek" +version = "2.1.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "4a3daa8e81a3963a60642bcc1f90a670680bd4a77535faa384e9d1c79d620871" +dependencies = [ + "curve25519-dalek", + "ed25519", + "rand_core 0.6.4", + "sha2", + "subtle", +] + [[package]] name = "either" version = "1.8.1" @@ -1952,6 +2015,15 @@ dependencies = [ "syn 2.0.90", ] +[[package]] +name = "env_filter" +version = "0.1.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "186e05a59d4c50738528153b83b0b0194d3a29507dfec16eccd4b342903397d0" +dependencies = [ + "log", +] + [[package]] name = "env_logger" version = "0.10.2" @@ -1965,6 +2037,16 @@ dependencies = [ "termcolor", ] +[[package]] +name = "env_logger" +version = "0.11.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "6c012a26a7f605efc424dd53697843a72be7dc86ad2d01f7814337794a12231d" +dependencies = [ + "env_filter", + "log", +] + [[package]] name = "equator" version = "0.2.2" @@ -2080,6 +2162,12 @@ dependencies = [ "subtle", ] +[[package]] +name = "fiat-crypto" +version = "0.2.9" +source = "registry+https://github.com/rust-lang/crates.io-index" 
+checksum = "28dea519a9695b9977216879a3ebfddf92f1c08c05d984f8996aecd6ecdc811d" + [[package]] name = "filetime" version = "0.2.22" @@ -2948,6 +3036,28 @@ dependencies = [ "str_stack", ] +[[package]] +name = "inferno" +version = "0.12.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "75a5d75fee4d36809e6b021e4b96b686e763d365ffdb03af2bd00786353f84fe" +dependencies = [ + "ahash", + "clap", + "crossbeam-channel", + "crossbeam-utils", + "dashmap 6.1.0", + "env_logger 0.11.2", + "indexmap 2.0.1", + "itoa", + "log", + "num-format", + "once_cell", + "quick-xml 0.37.1", + "rgb", + "str_stack", +] + [[package]] name = "inotify" version = "0.9.6" @@ -3155,7 +3265,7 @@ version = "0.7.2" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "4644821e1c3d7a560fe13d842d13f587c07348a1a05d3a797152d41c90c56df2" dependencies = [ - "dashmap", + "dashmap 5.5.0", "hashbrown 0.13.2", ] @@ -3693,23 +3803,23 @@ checksum = "ff011a302c396a5197692431fc1948019154afc178baf7d8e37367442a4601cf" [[package]] name = "opentelemetry" -version = "0.26.0" +version = "0.27.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "570074cc999d1a58184080966e5bd3bf3a9a4af650c3b05047c2621e7405cd17" +checksum = "ab70038c28ed37b97d8ed414b6429d343a8bbf44c9f79ec854f3a643029ba6d7" dependencies = [ "futures-core", "futures-sink", "js-sys", - "once_cell", "pin-project-lite", "thiserror", + "tracing", ] [[package]] name = "opentelemetry-http" -version = "0.26.0" +version = "0.27.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "6351496aeaa49d7c267fb480678d85d1cd30c5edb20b497c48c56f62a8c14b99" +checksum = "10a8a7f5f6ba7c1b286c2fbca0454eaba116f63bbe69ed250b642d36fbb04d80" dependencies = [ "async-trait", "bytes", @@ -3720,9 +3830,9 @@ dependencies = [ [[package]] name = "opentelemetry-otlp" -version = "0.26.0" +version = "0.27.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = 
"29e1f9c8b032d4f635c730c0efcf731d5e2530ea13fa8bef7939ddc8420696bd" +checksum = "91cf61a1868dacc576bf2b2a1c3e9ab150af7272909e80085c3173384fe11f76" dependencies = [ "async-trait", "futures-core", @@ -3738,9 +3848,9 @@ dependencies = [ [[package]] name = "opentelemetry-proto" -version = "0.26.1" +version = "0.27.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "c9d3968ce3aefdcca5c27e3c4ea4391b37547726a70893aab52d3de95d5f8b34" +checksum = "a6e05acbfada5ec79023c85368af14abd0b307c015e9064d249b2a950ef459a6" dependencies = [ "opentelemetry", "opentelemetry_sdk", @@ -3750,22 +3860,21 @@ dependencies = [ [[package]] name = "opentelemetry-semantic-conventions" -version = "0.26.0" +version = "0.27.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "db945c1eaea8ac6a9677185357480d215bb6999faa9f691d0c4d4d641eab7a09" +checksum = "bc1b6902ff63b32ef6c489e8048c5e253e2e4a803ea3ea7e783914536eb15c52" [[package]] name = "opentelemetry_sdk" -version = "0.26.0" +version = "0.27.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "d2c627d9f4c9cdc1f21a29ee4bfbd6028fcb8bcf2a857b43f3abdf72c9c862f3" +checksum = "231e9d6ceef9b0b2546ddf52335785ce41252bc7474ee8ba05bfad277be13ab8" dependencies = [ "async-trait", "futures-channel", "futures-executor", "futures-util", "glob", - "once_cell", "opentelemetry", "percent-encoding", "rand 0.8.5", @@ -3773,6 +3882,7 @@ dependencies = [ "thiserror", "tokio", "tokio-stream", + "tracing", ] [[package]] @@ -3872,9 +3982,11 @@ name = "pagectl" version = "0.1.0" dependencies = [ "anyhow", + "bincode", "camino", "clap", "humantime", + "itertools 0.10.5", "pageserver", "pageserver_api", "postgres_ffi", @@ -3896,6 +4008,7 @@ dependencies = [ "arc-swap", "async-compression", "async-stream", + "bincode", "bit_field", "byteorder", "bytes", @@ -3935,6 +4048,7 @@ dependencies = [ "postgres_connection", "postgres_ffi", "postgres_initdb", + "pprof", "pq_proto", "procfs", "rand 0.8.5", 
@@ -4421,7 +4535,7 @@ dependencies = [ "bytes", "crc32c", "criterion", - "env_logger", + "env_logger 0.10.2", "log", "memoffset 0.9.0", "once_cell", @@ -4462,7 +4576,7 @@ dependencies = [ "cfg-if", "criterion", "findshlibs", - "inferno", + "inferno 0.11.21", "libc", "log", "nix 0.26.4", @@ -4688,9 +4802,10 @@ dependencies = [ "clap", "compute_api", "consumption_metrics", - "dashmap", + "dashmap 5.5.0", "ecdsa 0.16.9", - "env_logger", + "ed25519-dalek", + "env_logger 0.10.2", "fallible-iterator", "flate2", "framed-websockets", @@ -4797,6 +4912,15 @@ dependencies = [ "serde", ] +[[package]] +name = "quick-xml" +version = "0.37.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f22f29bdff3987b4d8632ef95fd6424ec7e4e0a57e2f4fc63e489e75357f6a03" +dependencies = [ + "memchr", +] + [[package]] name = "quote" version = "1.0.37" @@ -5181,9 +5305,9 @@ dependencies = [ [[package]] name = "reqwest-tracing" -version = "0.5.4" +version = "0.5.5" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "ff82cf5730a1311fb9413b0bc2b8e743e0157cd73f010ab4ec374a923873b6a2" +checksum = "73e6153390585f6961341b50e5a1931d6be6dee4292283635903c26ef9d980d2" dependencies = [ "anyhow", "async-trait", @@ -5535,6 +5659,7 @@ dependencies = [ "crc32c", "criterion", "desim", + "env_logger 0.10.2", "fail", "futures", "hex", @@ -5563,6 +5688,7 @@ dependencies = [ "serde", "serde_json", "sha2", + "smallvec", "storage_broker", "strum", "strum_macros", @@ -5587,10 +5713,13 @@ dependencies = [ name = "safekeeper_api" version = "0.1.0" dependencies = [ + "anyhow", "const_format", + "pageserver_api", "postgres_ffi", "pq_proto", "serde", + "serde_json", "tokio", "utils", ] @@ -7048,9 +7177,9 @@ dependencies = [ [[package]] name = "tracing-opentelemetry" -version = "0.27.0" +version = "0.28.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "dc58af5d3f6c5811462cabb3289aec0093f7338e367e5a33d28c0433b3c7360b" +checksum = 
"97a971f6058498b5c0f1affa23e7ea202057a7301dbff68e968b2d578bcbd053" dependencies = [ "js-sys", "once_cell", @@ -7319,6 +7448,7 @@ dependencies = [ "hex-literal", "humantime", "hyper 0.14.30", + "inferno 0.12.0", "itertools 0.10.5", "jemalloc_pprof", "jsonwebtoken", @@ -7422,7 +7552,7 @@ dependencies = [ "anyhow", "camino-tempfile", "clap", - "env_logger", + "env_logger 0.10.2", "log", "postgres", "postgres_ffi", @@ -7437,12 +7567,21 @@ dependencies = [ "anyhow", "async-compression", "bytes", + "camino", + "camino-tempfile", + "criterion", + "futures", "pageserver_api", "postgres_ffi", + "pprof", "prost", + "remote_storage", "serde", + "serde_json", "thiserror", + "tikv-jemallocator", "tokio", + "tokio-util", "tonic", "tonic-build", "tracing", diff --git a/Cargo.toml b/Cargo.toml index 39898e1c8d..a4e601bb58 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -110,6 +110,7 @@ hyper-util = "0.1" tokio-tungstenite = "0.21.0" indexmap = "2" indoc = "2" +inferno = "0.12.0" ipnet = "2.10.0" itertools = "0.10" itoa = "1.0.11" @@ -126,10 +127,10 @@ notify = "6.0.0" num_cpus = "1.15" num-traits = "0.2.15" once_cell = "1.13" -opentelemetry = "0.26" -opentelemetry_sdk = "0.26" -opentelemetry-otlp = { version = "0.26", default-features=false, features = ["http-proto", "trace", "http", "reqwest-client"] } -opentelemetry-semantic-conventions = "0.26" +opentelemetry = "0.27" +opentelemetry_sdk = "0.27" +opentelemetry-otlp = { version = "0.27", default-features = false, features = ["http-proto", "trace", "http", "reqwest-client"] } +opentelemetry-semantic-conventions = "0.27" parking_lot = "0.12" parquet = { version = "53", default-features = false, features = ["zstd"] } parquet_derive = "53" @@ -143,7 +144,7 @@ rand = "0.8" redis = { version = "0.25.2", features = ["tokio-rustls-comp", "keep-alive"] } regex = "1.10.2" reqwest = { version = "0.12", default-features = false, features = ["rustls-tls"] } -reqwest-tracing = { version = "0.5", features = ["opentelemetry_0_26"] } 
+reqwest-tracing = { version = "0.5", features = ["opentelemetry_0_27"] } reqwest-middleware = "0.4" reqwest-retry = "0.7" routerify = "3" @@ -192,7 +193,7 @@ tower-http = { version = "0.6.2", features = ["request-id", "trace"] } tower-service = "0.3.3" tracing = "0.1" tracing-error = "0.2" -tracing-opentelemetry = "0.27" +tracing-opentelemetry = "0.28" tracing-subscriber = { version = "0.3", default-features = false, features = ["smallvec", "fmt", "tracing-log", "std", "env-filter", "json"] } try-lock = "0.2.5" twox-hash = { version = "1.6.3", default-features = false } diff --git a/Dockerfile b/Dockerfile index d3659f917a..2e4f8e5546 100644 --- a/Dockerfile +++ b/Dockerfile @@ -71,6 +71,7 @@ RUN set -e \ ca-certificates \ # System postgres for use with client libraries (e.g. in storage controller) postgresql-15 \ + openssl \ && rm -rf /var/lib/apt/lists/* /tmp/* /var/tmp/* \ && useradd -d /data neon \ && chown -R neon:neon /data diff --git a/Makefile b/Makefile index 9cffc74508..22ebfea7d5 100644 --- a/Makefile +++ b/Makefile @@ -3,7 +3,6 @@ ROOT_PROJECT_DIR := $(dir $(abspath $(lastword $(MAKEFILE_LIST)))) # Where to install Postgres, default is ./pg_install, maybe useful for package managers POSTGRES_INSTALL_DIR ?= $(ROOT_PROJECT_DIR)/pg_install/ -OPENSSL_PREFIX_DIR := /usr/local/openssl ICU_PREFIX_DIR := /usr/local/icu # @@ -26,11 +25,9 @@ endif ifeq ($(shell test -e /home/nonroot/.docker_build && echo -n yes),yes) # Exclude static build openssl, icu for local build (MacOS, Linux) # Only keep for build type release and debug - PG_CFLAGS += -I$(OPENSSL_PREFIX_DIR)/include PG_CONFIGURE_OPTS += --with-icu PG_CONFIGURE_OPTS += ICU_CFLAGS='-I/$(ICU_PREFIX_DIR)/include -DU_STATIC_IMPLEMENTATION' PG_CONFIGURE_OPTS += ICU_LIBS='-L$(ICU_PREFIX_DIR)/lib -L$(ICU_PREFIX_DIR)/lib64 -licui18n -licuuc -licudata -lstdc++ -Wl,-Bdynamic -lm' - PG_CONFIGURE_OPTS += LDFLAGS='-L$(OPENSSL_PREFIX_DIR)/lib -L$(OPENSSL_PREFIX_DIR)/lib64 -L$(ICU_PREFIX_DIR)/lib 
-L$(ICU_PREFIX_DIR)/lib64 -Wl,-Bstatic -lssl -lcrypto -Wl,-Bdynamic -lrt -lm -ldl -lpthread' endif UNAME_S := $(shell uname -s) diff --git a/build-tools.Dockerfile b/build-tools.Dockerfile index 79210a2e1b..7a2ec9c43e 100644 --- a/build-tools.Dockerfile +++ b/build-tools.Dockerfile @@ -115,7 +115,7 @@ RUN set -e \ # Keep the version the same as in compute/compute-node.Dockerfile and # test_runner/regress/test_compute_metrics.py. -ENV SQL_EXPORTER_VERSION=0.16.0 +ENV SQL_EXPORTER_VERSION=0.17.0 RUN curl -fsSL \ "https://github.com/burningalchemist/sql_exporter/releases/download/${SQL_EXPORTER_VERSION}/sql_exporter-${SQL_EXPORTER_VERSION}.linux-$(case "$(uname -m)" in x86_64) echo amd64;; aarch64) echo arm64;; esac).tar.gz" \ --output sql_exporter.tar.gz \ @@ -190,21 +190,6 @@ RUN for package in Capture::Tiny DateTime Devel::Cover Digest::MD5 File::Spec JS && make install \ && rm -rf ../lcov.tar.gz -# Compile and install the static OpenSSL library -ENV OPENSSL_VERSION=1.1.1w -ENV OPENSSL_PREFIX=/usr/local/openssl -RUN wget -O /tmp/openssl-${OPENSSL_VERSION}.tar.gz https://www.openssl.org/source/openssl-${OPENSSL_VERSION}.tar.gz && \ - echo "cf3098950cb4d853ad95c0841f1f9c6d3dc102dccfcacd521d93925208b76ac8 /tmp/openssl-${OPENSSL_VERSION}.tar.gz" | sha256sum --check && \ - cd /tmp && \ - tar xzvf /tmp/openssl-${OPENSSL_VERSION}.tar.gz && \ - rm /tmp/openssl-${OPENSSL_VERSION}.tar.gz && \ - cd /tmp/openssl-${OPENSSL_VERSION} && \ - ./config --prefix=${OPENSSL_PREFIX} -static --static no-shared -fPIC && \ - make -j "$(nproc)" && \ - make install && \ - cd /tmp && \ - rm -rf /tmp/openssl-${OPENSSL_VERSION} - # Use the same version of libicu as the compute nodes so that # clusters created using inidb on pageserver can be used by computes. 
# diff --git a/compute/compute-node.Dockerfile b/compute/compute-node.Dockerfile index 06aaf9e7f4..8c7200c5cb 100644 --- a/compute/compute-node.Dockerfile +++ b/compute/compute-node.Dockerfile @@ -66,6 +66,7 @@ RUN cd postgres && \ make MAKELEVEL=0 -j $(getconf _NPROCESSORS_ONLN) -s -C src/interfaces/libpq install && \ # Enable some of contrib extensions echo 'trusted = true' >> /usr/local/pgsql/share/extension/autoinc.control && \ + echo 'trusted = true' >> /usr/local/pgsql/share/extension/dblink.control && \ echo 'trusted = true' >> /usr/local/pgsql/share/extension/bloom.control && \ echo 'trusted = true' >> /usr/local/pgsql/share/extension/earthdistance.control && \ echo 'trusted = true' >> /usr/local/pgsql/share/extension/insert_username.control && \ @@ -104,16 +105,18 @@ RUN cd postgres && \ esac; \ done; +# Set PATH for all the subsequent build steps +ENV PATH="/usr/local/pgsql/bin:$PATH" + ######################################################################################### # # Layer "postgis-build" # Build PostGIS from the upstream PostGIS mirror. 
# ######################################################################################### -FROM build-deps AS postgis-build +FROM pg-build AS postgis-build ARG DEBIAN_VERSION ARG PG_VERSION -COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/ RUN apt update && \ apt install --no-install-recommends --no-install-suggests -y \ gdal-bin libboost-dev libboost-thread-dev libboost-filesystem-dev \ @@ -151,8 +154,6 @@ RUN case "${DEBIAN_VERSION}" in \ DESTDIR=/sfcgal ninja install -j $(getconf _NPROCESSORS_ONLN) && \ ninja clean && cp -R /sfcgal/* / -ENV PATH="/usr/local/pgsql/bin:$PATH" - # Postgis 3.5.0 supports v17 RUN case "${PG_VERSION}" in \ "v17") \ @@ -170,7 +171,6 @@ RUN case "${PG_VERSION}" in \ wget https://download.osgeo.org/postgis/source/postgis-${POSTGIS_VERSION}.tar.gz -O postgis.tar.gz && \ echo "${POSTGIS_CHECKSUM} postgis.tar.gz" | sha256sum --check && \ mkdir postgis-src && cd postgis-src && tar xzf ../postgis.tar.gz --strip-components=1 -C . && \ - find /usr/local/pgsql -type f | sed 's|^/usr/local/pgsql/||' > /before.txt &&\ ./autogen.sh && \ ./configure --with-sfcgal=/usr/local/bin/sfcgal-config && \ make -j $(getconf _NPROCESSORS_ONLN) && \ @@ -220,11 +220,7 @@ RUN case "${PG_VERSION}" in \ cmake -GNinja -DCMAKE_BUILD_TYPE=Release .. 
&& \ ninja -j $(getconf _NPROCESSORS_ONLN) && \ ninja -j $(getconf _NPROCESSORS_ONLN) install && \ - echo 'trusted = true' >> /usr/local/pgsql/share/extension/pgrouting.control && \ - find /usr/local/pgsql -type f | sed 's|^/usr/local/pgsql/||' > /after.txt &&\ - cp /usr/local/pgsql/share/extension/pgrouting.control /extensions/postgis && \ - sort -o /before.txt /before.txt && sort -o /after.txt /after.txt && \ - comm -13 /before.txt /after.txt | tar --directory=/usr/local/pgsql --zstd -cf /extensions/postgis.tar.zst -T - + echo 'trusted = true' >> /usr/local/pgsql/share/extension/pgrouting.control ######################################################################################### # @@ -232,9 +228,8 @@ RUN case "${PG_VERSION}" in \ # Build plv8 # ######################################################################################### -FROM build-deps AS plv8-build +FROM pg-build AS plv8-build ARG PG_VERSION -COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/ COPY compute/patches/plv8-3.1.10.patch /plv8-3.1.10.patch @@ -269,7 +264,6 @@ RUN case "${PG_VERSION}" in \ # generate and copy upgrade scripts mkdir -p upgrade && ./generate_upgrade.sh ${PLV8_TAG#v} && \ cp upgrade/* /usr/local/pgsql/share/extension/ && \ - export PATH="/usr/local/pgsql/bin:$PATH" && \ make DOCKER=1 -j $(getconf _NPROCESSORS_ONLN) install && \ rm -rf /plv8-* && \ find /usr/local/pgsql/ -name "plv8-*.so" | xargs strip && \ @@ -296,9 +290,8 @@ RUN case "${PG_VERSION}" in \ # Build h3_pg # ######################################################################################### -FROM build-deps AS h3-pg-build +FROM pg-build AS h3-pg-build ARG PG_VERSION -COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/ # not version-specific # last release v4.1.0 - Jan 18, 2023 @@ -319,7 +312,6 @@ RUN mkdir -p /h3/usr/ && \ RUN wget https://github.com/zachasme/h3-pg/archive/refs/tags/v4.1.3.tar.gz -O h3-pg.tar.gz && \ echo "5c17f09a820859ffe949f847bebf1be98511fb8f1bd86f94932512c00479e324 
h3-pg.tar.gz" | sha256sum --check && \ mkdir h3-pg-src && cd h3-pg-src && tar xzf ../h3-pg.tar.gz --strip-components=1 -C . && \ - export PATH="/usr/local/pgsql/bin:$PATH" && \ make -j $(getconf _NPROCESSORS_ONLN) && \ make -j $(getconf _NPROCESSORS_ONLN) install && \ echo 'trusted = true' >> /usr/local/pgsql/share/extension/h3.control && \ @@ -331,17 +323,16 @@ RUN wget https://github.com/zachasme/h3-pg/archive/refs/tags/v4.1.3.tar.gz -O h3 # compile unit extension # ######################################################################################### -FROM build-deps AS unit-pg-build +FROM pg-build AS unit-pg-build ARG PG_VERSION -COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/ # not version-specific # last release 7.9 - Sep 15, 2024 RUN wget https://github.com/df7cb/postgresql-unit/archive/refs/tags/7.9.tar.gz -O postgresql-unit.tar.gz && \ echo "e46de6245dcc8b2c2ecf29873dbd43b2b346773f31dd5ce4b8315895a052b456 postgresql-unit.tar.gz" | sha256sum --check && \ mkdir postgresql-unit-src && cd postgresql-unit-src && tar xzf ../postgresql-unit.tar.gz --strip-components=1 -C . && \ - make -j $(getconf _NPROCESSORS_ONLN) PG_CONFIG=/usr/local/pgsql/bin/pg_config && \ - make -j $(getconf _NPROCESSORS_ONLN) install PG_CONFIG=/usr/local/pgsql/bin/pg_config && \ + make -j $(getconf _NPROCESSORS_ONLN) && \ + make -j $(getconf _NPROCESSORS_ONLN) install && \ # unit extension's "create extension" script relies on absolute install path to fill some reference tables. # We move the extension from '/usr/local/pgsql/' to '/usr/local/' after it is build. So we need to adjust the path. # This one-liner removes pgsql/ part of the path. 
@@ -355,9 +346,8 @@ RUN wget https://github.com/df7cb/postgresql-unit/archive/refs/tags/7.9.tar.gz - # compile pgvector extension # ######################################################################################### -FROM build-deps AS vector-pg-build +FROM pg-build AS vector-pg-build ARG PG_VERSION -COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/ COPY compute/patches/pgvector.patch /pgvector.patch @@ -371,8 +361,8 @@ RUN wget https://github.com/pgvector/pgvector/archive/refs/tags/v0.8.0.tar.gz -O echo "867a2c328d4928a5a9d6f052cd3bc78c7d60228a9b914ad32aa3db88e9de27b0 pgvector.tar.gz" | sha256sum --check && \ mkdir pgvector-src && cd pgvector-src && tar xzf ../pgvector.tar.gz --strip-components=1 -C . && \ patch -p1 < /pgvector.patch && \ - make -j $(getconf _NPROCESSORS_ONLN) OPTFLAGS="" PG_CONFIG=/usr/local/pgsql/bin/pg_config && \ - make -j $(getconf _NPROCESSORS_ONLN) OPTFLAGS="" install PG_CONFIG=/usr/local/pgsql/bin/pg_config && \ + make -j $(getconf _NPROCESSORS_ONLN) OPTFLAGS="" && \ + make -j $(getconf _NPROCESSORS_ONLN) OPTFLAGS="" install && \ echo 'trusted = true' >> /usr/local/pgsql/share/extension/vector.control ######################################################################################### @@ -381,16 +371,15 @@ RUN wget https://github.com/pgvector/pgvector/archive/refs/tags/v0.8.0.tar.gz -O # compile pgjwt extension # ######################################################################################### -FROM build-deps AS pgjwt-pg-build +FROM pg-build AS pgjwt-pg-build ARG PG_VERSION -COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/ # not version-specific # doesn't use releases, last commit f3d82fd - Mar 2, 2023 RUN wget https://github.com/michelp/pgjwt/archive/f3d82fd30151e754e19ce5d6a06c71c20689ce3d.tar.gz -O pgjwt.tar.gz && \ echo "dae8ed99eebb7593b43013f6532d772b12dfecd55548d2673f2dfd0163f6d2b9 pgjwt.tar.gz" | sha256sum --check && \ mkdir pgjwt-src && cd pgjwt-src && tar xzf ../pgjwt.tar.gz --strip-components=1 
-C . && \ - make -j $(getconf _NPROCESSORS_ONLN) install PG_CONFIG=/usr/local/pgsql/bin/pg_config && \ + make -j $(getconf _NPROCESSORS_ONLN) install && \ echo 'trusted = true' >> /usr/local/pgsql/share/extension/pgjwt.control ######################################################################################### @@ -399,17 +388,16 @@ RUN wget https://github.com/michelp/pgjwt/archive/f3d82fd30151e754e19ce5d6a06c71 # compile hypopg extension # ######################################################################################### -FROM build-deps AS hypopg-pg-build +FROM pg-build AS hypopg-pg-build ARG PG_VERSION -COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/ # HypoPG 1.4.1 supports v17 # last release 1.4.1 - Apr 28, 2024 RUN wget https://github.com/HypoPG/hypopg/archive/refs/tags/1.4.1.tar.gz -O hypopg.tar.gz && \ echo "9afe6357fd389d8d33fad81703038ce520b09275ec00153c6c89282bcdedd6bc hypopg.tar.gz" | sha256sum --check && \ mkdir hypopg-src && cd hypopg-src && tar xzf ../hypopg.tar.gz --strip-components=1 -C . 
&& \ - make -j $(getconf _NPROCESSORS_ONLN) PG_CONFIG=/usr/local/pgsql/bin/pg_config && \ - make -j $(getconf _NPROCESSORS_ONLN) install PG_CONFIG=/usr/local/pgsql/bin/pg_config && \ + make -j $(getconf _NPROCESSORS_ONLN) && \ + make -j $(getconf _NPROCESSORS_ONLN) install && \ echo 'trusted = true' >> /usr/local/pgsql/share/extension/hypopg.control ######################################################################################### @@ -418,17 +406,16 @@ RUN wget https://github.com/HypoPG/hypopg/archive/refs/tags/1.4.1.tar.gz -O hypo # compile pg_hashids extension # ######################################################################################### -FROM build-deps AS pg-hashids-pg-build +FROM pg-build AS pg-hashids-pg-build ARG PG_VERSION -COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/ # not version-specific # last release v1.2.1 -Jan 12, 2018 RUN wget https://github.com/iCyberon/pg_hashids/archive/refs/tags/v1.2.1.tar.gz -O pg_hashids.tar.gz && \ echo "74576b992d9277c92196dd8d816baa2cc2d8046fe102f3dcd7f3c3febed6822a pg_hashids.tar.gz" | sha256sum --check && \ mkdir pg_hashids-src && cd pg_hashids-src && tar xzf ../pg_hashids.tar.gz --strip-components=1 -C . 
&& \ - make -j $(getconf _NPROCESSORS_ONLN) PG_CONFIG=/usr/local/pgsql/bin/pg_config USE_PGXS=1 && \ - make -j $(getconf _NPROCESSORS_ONLN) install PG_CONFIG=/usr/local/pgsql/bin/pg_config USE_PGXS=1 && \ + make -j $(getconf _NPROCESSORS_ONLN) USE_PGXS=1 && \ + make -j $(getconf _NPROCESSORS_ONLN) install USE_PGXS=1 && \ echo 'trusted = true' >> /usr/local/pgsql/share/extension/pg_hashids.control ######################################################################################### @@ -437,9 +424,8 @@ RUN wget https://github.com/iCyberon/pg_hashids/archive/refs/tags/v1.2.1.tar.gz # compile rum extension # ######################################################################################### -FROM build-deps AS rum-pg-build +FROM pg-build AS rum-pg-build ARG PG_VERSION -COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/ COPY compute/patches/rum.patch /rum.patch @@ -450,8 +436,8 @@ RUN wget https://github.com/postgrespro/rum/archive/cb1edffc57736cd2a4455f8d0fea echo "65e0a752e99f4c3226400c9b899f997049e93503db8bf5c8072efa136d32fd83 rum.tar.gz" | sha256sum --check && \ mkdir rum-src && cd rum-src && tar xzf ../rum.tar.gz --strip-components=1 -C . 
&& \ patch -p1 < /rum.patch && \ - make -j $(getconf _NPROCESSORS_ONLN) PG_CONFIG=/usr/local/pgsql/bin/pg_config USE_PGXS=1 && \ - make -j $(getconf _NPROCESSORS_ONLN) install PG_CONFIG=/usr/local/pgsql/bin/pg_config USE_PGXS=1 && \ + make -j $(getconf _NPROCESSORS_ONLN) USE_PGXS=1 && \ + make -j $(getconf _NPROCESSORS_ONLN) install USE_PGXS=1 && \ echo 'trusted = true' >> /usr/local/pgsql/share/extension/rum.control ######################################################################################### @@ -460,17 +446,16 @@ RUN wget https://github.com/postgrespro/rum/archive/cb1edffc57736cd2a4455f8d0fea # compile pgTAP extension # ######################################################################################### -FROM build-deps AS pgtap-pg-build +FROM pg-build AS pgtap-pg-build ARG PG_VERSION -COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/ # pgtap 1.3.3 supports v17 # last release v1.3.3 - Apr 8, 2024 RUN wget https://github.com/theory/pgtap/archive/refs/tags/v1.3.3.tar.gz -O pgtap.tar.gz && \ echo "325ea79d0d2515bce96bce43f6823dcd3effbd6c54cb2a4d6c2384fffa3a14c7 pgtap.tar.gz" | sha256sum --check && \ mkdir pgtap-src && cd pgtap-src && tar xzf ../pgtap.tar.gz --strip-components=1 -C . 
&& \ - make -j $(getconf _NPROCESSORS_ONLN) PG_CONFIG=/usr/local/pgsql/bin/pg_config && \ - make -j $(getconf _NPROCESSORS_ONLN) install PG_CONFIG=/usr/local/pgsql/bin/pg_config && \ + make -j $(getconf _NPROCESSORS_ONLN) && \ + make -j $(getconf _NPROCESSORS_ONLN) install && \ echo 'trusted = true' >> /usr/local/pgsql/share/extension/pgtap.control ######################################################################################### @@ -479,17 +464,16 @@ RUN wget https://github.com/theory/pgtap/archive/refs/tags/v1.3.3.tar.gz -O pgta # compile ip4r extension # ######################################################################################### -FROM build-deps AS ip4r-pg-build +FROM pg-build AS ip4r-pg-build ARG PG_VERSION -COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/ # not version-specific # last release v2.4.2 - Jul 29, 2023 RUN wget https://github.com/RhodiumToad/ip4r/archive/refs/tags/2.4.2.tar.gz -O ip4r.tar.gz && \ echo "0f7b1f159974f49a47842a8ab6751aecca1ed1142b6d5e38d81b064b2ead1b4b ip4r.tar.gz" | sha256sum --check && \ mkdir ip4r-src && cd ip4r-src && tar xzf ../ip4r.tar.gz --strip-components=1 -C . 
&& \ - make -j $(getconf _NPROCESSORS_ONLN) PG_CONFIG=/usr/local/pgsql/bin/pg_config && \ - make -j $(getconf _NPROCESSORS_ONLN) install PG_CONFIG=/usr/local/pgsql/bin/pg_config && \ + make -j $(getconf _NPROCESSORS_ONLN) && \ + make -j $(getconf _NPROCESSORS_ONLN) install && \ echo 'trusted = true' >> /usr/local/pgsql/share/extension/ip4r.control ######################################################################################### @@ -498,17 +482,16 @@ RUN wget https://github.com/RhodiumToad/ip4r/archive/refs/tags/2.4.2.tar.gz -O i # compile Prefix extension # ######################################################################################### -FROM build-deps AS prefix-pg-build +FROM pg-build AS prefix-pg-build ARG PG_VERSION -COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/ # not version-specific # last release v1.2.10 - Jul 5, 2023 RUN wget https://github.com/dimitri/prefix/archive/refs/tags/v1.2.10.tar.gz -O prefix.tar.gz && \ echo "4342f251432a5f6fb05b8597139d3ccde8dcf87e8ca1498e7ee931ca057a8575 prefix.tar.gz" | sha256sum --check && \ mkdir prefix-src && cd prefix-src && tar xzf ../prefix.tar.gz --strip-components=1 -C . 
&& \ - make -j $(getconf _NPROCESSORS_ONLN) PG_CONFIG=/usr/local/pgsql/bin/pg_config && \ - make -j $(getconf _NPROCESSORS_ONLN) install PG_CONFIG=/usr/local/pgsql/bin/pg_config && \ + make -j $(getconf _NPROCESSORS_ONLN) && \ + make -j $(getconf _NPROCESSORS_ONLN) install && \ echo 'trusted = true' >> /usr/local/pgsql/share/extension/prefix.control ######################################################################################### @@ -517,17 +500,16 @@ RUN wget https://github.com/dimitri/prefix/archive/refs/tags/v1.2.10.tar.gz -O p # compile hll extension # ######################################################################################### -FROM build-deps AS hll-pg-build +FROM pg-build AS hll-pg-build ARG PG_VERSION -COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/ # not version-specific # last release v2.18 - Aug 29, 2023 RUN wget https://github.com/citusdata/postgresql-hll/archive/refs/tags/v2.18.tar.gz -O hll.tar.gz && \ echo "e2f55a6f4c4ab95ee4f1b4a2b73280258c5136b161fe9d059559556079694f0e hll.tar.gz" | sha256sum --check && \ mkdir hll-src && cd hll-src && tar xzf ../hll.tar.gz --strip-components=1 -C . 
&& \ - make -j $(getconf _NPROCESSORS_ONLN) PG_CONFIG=/usr/local/pgsql/bin/pg_config && \ - make -j $(getconf _NPROCESSORS_ONLN) install PG_CONFIG=/usr/local/pgsql/bin/pg_config && \ + make -j $(getconf _NPROCESSORS_ONLN) && \ + make -j $(getconf _NPROCESSORS_ONLN) install && \ echo 'trusted = true' >> /usr/local/pgsql/share/extension/hll.control ######################################################################################### @@ -536,17 +518,16 @@ RUN wget https://github.com/citusdata/postgresql-hll/archive/refs/tags/v2.18.tar # compile plpgsql_check extension # ######################################################################################### -FROM build-deps AS plpgsql-check-pg-build +FROM pg-build AS plpgsql-check-pg-build ARG PG_VERSION -COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/ # plpgsql_check v2.7.11 supports v17 # last release v2.7.11 - Sep 16, 2024 RUN wget https://github.com/okbob/plpgsql_check/archive/refs/tags/v2.7.11.tar.gz -O plpgsql_check.tar.gz && \ echo "208933f8dbe8e0d2628eb3851e9f52e6892b8e280c63700c0f1ce7883625d172 plpgsql_check.tar.gz" | sha256sum --check && \ mkdir plpgsql_check-src && cd plpgsql_check-src && tar xzf ../plpgsql_check.tar.gz --strip-components=1 -C . 
&& \ - make -j $(getconf _NPROCESSORS_ONLN) PG_CONFIG=/usr/local/pgsql/bin/pg_config USE_PGXS=1 && \ - make -j $(getconf _NPROCESSORS_ONLN) install PG_CONFIG=/usr/local/pgsql/bin/pg_config USE_PGXS=1 && \ + make -j $(getconf _NPROCESSORS_ONLN) USE_PGXS=1 && \ + make -j $(getconf _NPROCESSORS_ONLN) install USE_PGXS=1 && \ echo 'trusted = true' >> /usr/local/pgsql/share/extension/plpgsql_check.control ######################################################################################### @@ -555,11 +536,8 @@ RUN wget https://github.com/okbob/plpgsql_check/archive/refs/tags/v2.7.11.tar.gz # compile timescaledb extension # ######################################################################################### -FROM build-deps AS timescaledb-pg-build -COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/ - +FROM pg-build AS timescaledb-pg-build ARG PG_VERSION -ENV PATH="/usr/local/pgsql/bin:$PATH" RUN case "${PG_VERSION}" in \ "v14" | "v15") \ @@ -590,11 +568,8 @@ RUN case "${PG_VERSION}" in \ # compile pg_hint_plan extension # ######################################################################################### -FROM build-deps AS pg-hint-plan-pg-build -COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/ - +FROM pg-build AS pg-hint-plan-pg-build ARG PG_VERSION -ENV PATH="/usr/local/pgsql/bin:$PATH" # version-specific, has separate releases for each version RUN case "${PG_VERSION}" in \ @@ -632,14 +607,12 @@ RUN case "${PG_VERSION}" in \ # compile pg_cron extension # ######################################################################################### -FROM build-deps AS pg-cron-pg-build +FROM pg-build AS pg-cron-pg-build ARG PG_VERSION -COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/ # This is an experimental extension that we do not support on prod yet. # !Do not remove! # We set it in shared_preload_libraries and computes will fail to start if library is not found. 
-ENV PATH="/usr/local/pgsql/bin/:$PATH" RUN wget https://github.com/citusdata/pg_cron/archive/refs/tags/v1.6.4.tar.gz -O pg_cron.tar.gz && \ echo "52d1850ee7beb85a4cb7185731ef4e5a90d1de216709d8988324b0d02e76af61 pg_cron.tar.gz" | sha256sum --check && \ mkdir pg_cron-src && cd pg_cron-src && tar xzf ../pg_cron.tar.gz --strip-components=1 -C . && \ @@ -653,9 +626,8 @@ RUN wget https://github.com/citusdata/pg_cron/archive/refs/tags/v1.6.4.tar.gz -O # compile rdkit extension # ######################################################################################### -FROM build-deps AS rdkit-pg-build +FROM pg-build AS rdkit-pg-build ARG PG_VERSION -COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/ RUN apt update && \ apt install --no-install-recommends --no-install-suggests -y \ @@ -673,7 +645,13 @@ RUN apt update && \ # Use new version only for v17 # because Release_2024_09_1 has some backward incompatible changes # https://github.com/rdkit/rdkit/releases/tag/Release_2024_09_1 -ENV PATH="/usr/local/pgsql/bin/:/usr/local/pgsql/:$PATH" + +# XXX: /usr/local/pgsql/bin is already in PATH, and that should be enough to find +# pg_config. For some reason the rdkit cmake script doesn't work with just that, +# however. By also adding /usr/local/pgsql, it works, which is weird because there +# are no executables in that directory. 
+ENV PATH="/usr/local/pgsql:$PATH" + RUN case "${PG_VERSION}" in \ "v17") \ export RDKIT_VERSION=Release_2024_09_1 \ @@ -726,13 +704,11 @@ RUN case "${PG_VERSION}" in \ # compile pg_uuidv7 extension # ######################################################################################### -FROM build-deps AS pg-uuidv7-pg-build +FROM pg-build AS pg-uuidv7-pg-build ARG PG_VERSION -COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/ # not version-specific # last release v1.6.0 - Oct 9, 2024 -ENV PATH="/usr/local/pgsql/bin/:$PATH" RUN wget https://github.com/fboulnois/pg_uuidv7/archive/refs/tags/v1.6.0.tar.gz -O pg_uuidv7.tar.gz && \ echo "0fa6c710929d003f6ce276a7de7a864e9d1667b2d78be3dc2c07f2409eb55867 pg_uuidv7.tar.gz" | sha256sum --check && \ mkdir pg_uuidv7-src && cd pg_uuidv7-src && tar xzf ../pg_uuidv7.tar.gz --strip-components=1 -C . && \ @@ -746,13 +722,11 @@ RUN wget https://github.com/fboulnois/pg_uuidv7/archive/refs/tags/v1.6.0.tar.gz # compile pg_roaringbitmap extension # ######################################################################################### -FROM build-deps AS pg-roaringbitmap-pg-build +FROM pg-build AS pg-roaringbitmap-pg-build ARG PG_VERSION -COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/ # not version-specific # last release v0.5.4 - Jun 28, 2022 -ENV PATH="/usr/local/pgsql/bin/:$PATH" RUN wget https://github.com/ChenHuajun/pg_roaringbitmap/archive/refs/tags/v0.5.4.tar.gz -O pg_roaringbitmap.tar.gz && \ echo "b75201efcb1c2d1b014ec4ae6a22769cc7a224e6e406a587f5784a37b6b5a2aa pg_roaringbitmap.tar.gz" | sha256sum --check && \ mkdir pg_roaringbitmap-src && cd pg_roaringbitmap-src && tar xzf ../pg_roaringbitmap.tar.gz --strip-components=1 -C . 
&& \ @@ -766,16 +740,14 @@ RUN wget https://github.com/ChenHuajun/pg_roaringbitmap/archive/refs/tags/v0.5.4 # compile pg_semver extension # ######################################################################################### -FROM build-deps AS pg-semver-pg-build +FROM pg-build AS pg-semver-pg-build ARG PG_VERSION -COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/ # Release 0.40.0 breaks backward compatibility with previous versions # see release note https://github.com/theory/pg-semver/releases/tag/v0.40.0 # Use new version only for v17 # # last release v0.40.0 - Jul 22, 2024 -ENV PATH="/usr/local/pgsql/bin/:$PATH" RUN case "${PG_VERSION}" in \ "v17") \ export SEMVER_VERSION=0.40.0 \ @@ -802,13 +774,11 @@ RUN case "${PG_VERSION}" in \ # compile pg_embedding extension # ######################################################################################### -FROM build-deps AS pg-embedding-pg-build -COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/ +FROM pg-build AS pg-embedding-pg-build # This is our extension, support stopped in favor of pgvector # TODO: deprecate it ARG PG_VERSION -ENV PATH="/usr/local/pgsql/bin/:$PATH" RUN case "${PG_VERSION}" in \ "v14" | "v15") \ export PG_EMBEDDING_VERSION=0.3.5 \ @@ -829,26 +799,19 @@ RUN case "${PG_VERSION}" in \ # compile anon extension # ######################################################################################### -FROM build-deps AS pg-anon-pg-build +FROM pg-build AS pg-anon-pg-build ARG PG_VERSION -COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/ # This is an experimental extension, never got to real production. # !Do not remove! It can be present in shared_preload_libraries and compute will fail to start if library is not found. 
-ENV PATH="/usr/local/pgsql/bin/:$PATH" RUN case "${PG_VERSION}" in "v17") \ echo "postgresql_anonymizer does not yet support PG17" && exit 0;; \ esac && \ wget https://github.com/neondatabase/postgresql_anonymizer/archive/refs/tags/neon_1.1.1.tar.gz -O pg_anon.tar.gz && \ echo "321ea8d5c1648880aafde850a2c576e4a9e7b9933a34ce272efc839328999fa9 pg_anon.tar.gz" | sha256sum --check && \ mkdir pg_anon-src && cd pg_anon-src && tar xzf ../pg_anon.tar.gz --strip-components=1 -C . && \ - find /usr/local/pgsql -type f | sed 's|^/usr/local/pgsql/||' > /before.txt &&\ - make -j $(getconf _NPROCESSORS_ONLN) install PG_CONFIG=/usr/local/pgsql/bin/pg_config && \ - echo 'trusted = true' >> /usr/local/pgsql/share/extension/anon.control && \ - find /usr/local/pgsql -type f | sed 's|^/usr/local/pgsql/||' > /after.txt &&\ - mkdir -p /extensions/anon && cp /usr/local/pgsql/share/extension/anon.control /extensions/anon && \ - sort -o /before.txt /before.txt && sort -o /after.txt /after.txt && \ - comm -13 /before.txt /after.txt | tar --directory=/usr/local/pgsql --zstd -cf /extensions/anon.tar.zst -T - + make -j $(getconf _NPROCESSORS_ONLN) install && \ + echo 'trusted = true' >> /usr/local/pgsql/share/extension/anon.control ######################################################################################### # @@ -856,9 +819,8 @@ RUN case "${PG_VERSION}" in "v17") \ # This layer is used to build `pgrx` deps # ######################################################################################### -FROM build-deps AS rust-extensions-build +FROM pg-build AS rust-extensions-build ARG PG_VERSION -COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/ RUN apt update && \ apt install --no-install-recommends --no-install-suggests -y curl libclang-dev && \ @@ -866,7 +828,7 @@ RUN apt update && \ useradd -ms /bin/bash nonroot -b /home ENV HOME=/home/nonroot -ENV PATH="/home/nonroot/.cargo/bin:/usr/local/pgsql/bin/:$PATH" +ENV PATH="/home/nonroot/.cargo/bin:$PATH" USER nonroot WORKDIR 
/home/nonroot @@ -893,9 +855,8 @@ USER root # and eventually get merged with `rust-extensions-build` # ######################################################################################### -FROM build-deps AS rust-extensions-build-pgrx12 +FROM pg-build AS rust-extensions-build-pgrx12 ARG PG_VERSION -COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/ RUN apt update && \ apt install --no-install-recommends --no-install-suggests -y curl libclang-dev && \ @@ -903,7 +864,7 @@ RUN apt update && \ useradd -ms /bin/bash nonroot -b /home ENV HOME=/home/nonroot -ENV PATH="/home/nonroot/.cargo/bin:/usr/local/pgsql/bin/:$PATH" +ENV PATH="/home/nonroot/.cargo/bin:$PATH" USER nonroot WORKDIR /home/nonroot @@ -911,7 +872,7 @@ RUN curl -sSO https://static.rust-lang.org/rustup/dist/$(uname -m)-unknown-linux chmod +x rustup-init && \ ./rustup-init -y --no-modify-path --profile minimal --default-toolchain stable && \ rm rustup-init && \ - cargo install --locked --version 0.12.6 cargo-pgrx && \ + cargo install --locked --version 0.12.9 cargo-pgrx && \ /bin/bash -c 'cargo pgrx init --pg${PG_VERSION:1}=/usr/local/pgsql/bin/pg_config' USER root @@ -948,19 +909,19 @@ RUN apt update && apt install --no-install-recommends --no-install-suggests -y p mkdir pgrag-src && cd pgrag-src && tar xzf ../pgrag.tar.gz --strip-components=1 -C . 
&& \ \ cd exts/rag && \ - sed -i 's/pgrx = "0.12.6"/pgrx = { version = "0.12.6", features = [ "unsafe-postgres" ] }/g' Cargo.toml && \ + sed -i 's/pgrx = "0.12.6"/pgrx = { version = "0.12.9", features = [ "unsafe-postgres" ] }/g' Cargo.toml && \ cargo pgrx install --release && \ echo "trusted = true" >> /usr/local/pgsql/share/extension/rag.control && \ \ cd ../rag_bge_small_en_v15 && \ - sed -i 's/pgrx = "0.12.6"/pgrx = { version = "0.12.6", features = [ "unsafe-postgres" ] }/g' Cargo.toml && \ + sed -i 's/pgrx = "0.12.6"/pgrx = { version = "0.12.9", features = [ "unsafe-postgres" ] }/g' Cargo.toml && \ ORT_LIB_LOCATION=/home/nonroot/onnxruntime-src/build/Linux \ REMOTE_ONNX_URL=http://pg-ext-s3-gateway/pgrag-data/bge_small_en_v15.onnx \ cargo pgrx install --release --features remote_onnx && \ echo "trusted = true" >> /usr/local/pgsql/share/extension/rag_bge_small_en_v15.control && \ \ cd ../rag_jina_reranker_v1_tiny_en && \ - sed -i 's/pgrx = "0.12.6"/pgrx = { version = "0.12.6", features = [ "unsafe-postgres" ] }/g' Cargo.toml && \ + sed -i 's/pgrx = "0.12.6"/pgrx = { version = "0.12.9", features = [ "unsafe-postgres" ] }/g' Cargo.toml && \ ORT_LIB_LOCATION=/home/nonroot/onnxruntime-src/build/Linux \ REMOTE_ONNX_URL=http://pg-ext-s3-gateway/pgrag-data/jina_reranker_v1_tiny_en.onnx \ cargo pgrx install --release --features remote_onnx && \ @@ -976,29 +937,17 @@ RUN apt update && apt install --no-install-recommends --no-install-suggests -y p FROM rust-extensions-build-pgrx12 AS pg-jsonschema-pg-build ARG PG_VERSION -# version 0.3.3 supports v17 # last release v0.3.3 - Oct 16, 2024 -# -# there were no breaking changes -# so we can use the same version for all postgres versions -RUN case "${PG_VERSION}" in \ - "v14" | "v15" | "v16" | "v17") \ - export PG_JSONSCHEMA_VERSION=0.3.3 \ - export PG_JSONSCHEMA_CHECKSUM=40c2cffab4187e0233cb8c3bde013be92218c282f95f4469c5282f6b30d64eac \ - ;; \ - *) \ - echo "unexpected PostgreSQL version" && exit 1 \ - ;; \ - esac && \ - wget 
https://github.com/supabase/pg_jsonschema/archive/refs/tags/v${PG_JSONSCHEMA_VERSION}.tar.gz -O pg_jsonschema.tar.gz && \ - echo "${PG_JSONSCHEMA_CHECKSUM} pg_jsonschema.tar.gz" | sha256sum --check && \ +RUN wget https://github.com/supabase/pg_jsonschema/archive/refs/tags/v0.3.3.tar.gz -O pg_jsonschema.tar.gz && \ + echo "40c2cffab4187e0233cb8c3bde013be92218c282f95f4469c5282f6b30d64eac pg_jsonschema.tar.gz" | sha256sum --check && \ mkdir pg_jsonschema-src && cd pg_jsonschema-src && tar xzf ../pg_jsonschema.tar.gz --strip-components=1 -C . && \ # see commit 252b3685a27a0f4c31a0f91e983c6314838e89e8 # `unsafe-postgres` feature allows to build pgx extensions # against postgres forks that decided to change their ABI name (like us). # With that we can build extensions without forking them and using stock # pgx. As this feature is new few manual version bumps were required. - sed -i 's/pgrx = "0.12.6"/pgrx = { version = "0.12.6", features = [ "unsafe-postgres" ] }/g' Cargo.toml && \ + sed -i 's/pgrx = "0.12.6"/pgrx = { version = "0.12.9", features = [ "unsafe-postgres" ] }/g' Cargo.toml && \ + sed -i 's/pgrx-tests = "0.12.6"/pgrx-tests = "0.12.9"/g' Cargo.toml && \ cargo pgrx install --release && \ echo "trusted = true" >> /usr/local/pgsql/share/extension/pg_jsonschema.control @@ -1012,24 +961,12 @@ RUN case "${PG_VERSION}" in \ FROM rust-extensions-build-pgrx12 AS pg-graphql-pg-build ARG PG_VERSION -# version 1.5.9 supports v17 # last release v1.5.9 - Oct 16, 2024 -# -# there were no breaking changes -# so we can use the same version for all postgres versions -RUN case "${PG_VERSION}" in \ - "v14" | "v15" | "v16" | "v17") \ - export PG_GRAPHQL_VERSION=1.5.9 \ - export PG_GRAPHQL_CHECKSUM=cf768385a41278be1333472204fc0328118644ae443182cf52f7b9b23277e497 \ - ;; \ - *) \ - echo "unexpected PostgreSQL version" && exit 1 \ - ;; \ - esac && \ - wget https://github.com/supabase/pg_graphql/archive/refs/tags/v${PG_GRAPHQL_VERSION}.tar.gz -O pg_graphql.tar.gz && \ - echo 
"${PG_GRAPHQL_CHECKSUM} pg_graphql.tar.gz" | sha256sum --check && \ +RUN wget https://github.com/supabase/pg_graphql/archive/refs/tags/v1.5.9.tar.gz -O pg_graphql.tar.gz && \ + echo "cf768385a41278be1333472204fc0328118644ae443182cf52f7b9b23277e497 pg_graphql.tar.gz" | sha256sum --check && \ mkdir pg_graphql-src && cd pg_graphql-src && tar xzf ../pg_graphql.tar.gz --strip-components=1 -C . && \ - sed -i 's/pgrx = "=0.12.6"/pgrx = { version = "0.12.6", features = [ "unsafe-postgres" ] }/g' Cargo.toml && \ + sed -i 's/pgrx = "=0.12.6"/pgrx = { version = "=0.12.9", features = [ "unsafe-postgres" ] }/g' Cargo.toml && \ + sed -i 's/pgrx-tests = "=0.12.6"/pgrx-tests = "=0.12.9"/g' Cargo.toml && \ cargo pgrx install --release && \ # it's needed to enable extension because it uses untrusted C language sed -i 's/superuser = false/superuser = true/g' /usr/local/pgsql/share/extension/pg_graphql.control && \ @@ -1050,33 +987,58 @@ ARG PG_VERSION RUN wget https://github.com/kelvich/pg_tiktoken/archive/9118dd4549b7d8c0bbc98e04322499f7bf2fa6f7.tar.gz -O pg_tiktoken.tar.gz && \ echo "a5bc447e7920ee149d3c064b8b9f0086c0e83939499753178f7d35788416f628 pg_tiktoken.tar.gz" | sha256sum --check && \ mkdir pg_tiktoken-src && cd pg_tiktoken-src && tar xzf ../pg_tiktoken.tar.gz --strip-components=1 -C . 
&& \ - # TODO update pgrx version in the pg_tiktoken repo and remove this line - sed -i 's/pgrx = { version = "=0.10.2",/pgrx = { version = "0.11.3",/g' Cargo.toml && \ - sed -i 's/pgrx-tests = "=0.10.2"/pgrx-tests = "0.11.3"/g' Cargo.toml && \ + sed -i 's/pgrx = { version = "=0.12.6",/pgrx = { version = "0.12.9",/g' Cargo.toml && \ + sed -i 's/pgrx-tests = "=0.12.6"/pgrx-tests = "0.12.9"/g' Cargo.toml && \ cargo pgrx install --release && \ echo "trusted = true" >> /usr/local/pgsql/share/extension/pg_tiktoken.control ######################################################################################### # # Layer "pg-pgx-ulid-build" -# Compile "pgx_ulid" extension +# Compile "pgx_ulid" extension for v16 and below # ######################################################################################### FROM rust-extensions-build AS pg-pgx-ulid-build ARG PG_VERSION -# doesn't support v17 yet -# https://github.com/pksunkara/pgx_ulid/pull/52 -RUN case "${PG_VERSION}" in "v17") \ - echo "pgx_ulid does not support pg17 as of the latest version (0.1.5)" && exit 0;; \ +RUN case "${PG_VERSION}" in \ + "v14" | "v15" | "v16") \ + ;; \ + *) \ + echo "skipping the version of pgx_ulid for $PG_VERSION" && exit 0 \ + ;; \ esac && \ wget https://github.com/pksunkara/pgx_ulid/archive/refs/tags/v0.1.5.tar.gz -O pgx_ulid.tar.gz && \ - echo "9d1659a2da65af0133d5451c454de31b37364e3502087dadf579f790bc8bef17 pgx_ulid.tar.gz" | sha256sum --check && \ + echo "9d1659a2da65af0133d5451c454de31b37364e3502087dadf579f790bc8bef17 pgx_ulid.tar.gz" | sha256sum --check && \ mkdir pgx_ulid-src && cd pgx_ulid-src && tar xzf ../pgx_ulid.tar.gz --strip-components=1 -C . 
&& \ - sed -i 's/pgrx = "^0.11.2"/pgrx = { version = "=0.11.3", features = [ "unsafe-postgres" ] }/g' Cargo.toml && \ + sed -i 's/pgrx = "^0.11.2"/pgrx = { version = "0.11.3", features = [ "unsafe-postgres" ] }/g' Cargo.toml && \ cargo pgrx install --release && \ - echo "trusted = true" >> /usr/local/pgsql/share/extension/ulid.control + echo 'trusted = true' >> /usr/local/pgsql/share/extension/ulid.control + +######################################################################################### +# +# Layer "pg-pgx-ulid-pgrx12-build" +# Compile "pgx_ulid" extension for v17 and up +# +######################################################################################### + +FROM rust-extensions-build-pgrx12 AS pg-pgx-ulid-pgrx12-build +ARG PG_VERSION + +RUN case "${PG_VERSION}" in \ + "v17") \ + ;; \ + *) \ + echo "skipping the version of pgx_ulid for $PG_VERSION" && exit 0 \ + ;; \ + esac && \ + wget https://github.com/pksunkara/pgx_ulid/archive/refs/tags/v0.2.0.tar.gz -O pgx_ulid.tar.gz && \ + echo "cef6a9a2e5e7bd1a10a18989286586ee9e6c1c06005a4055cff190de41bf3e9f pgx_ulid.tar.gz" | sha256sum --check && \ + mkdir pgx_ulid-src && cd pgx_ulid-src && tar xzf ../pgx_ulid.tar.gz --strip-components=1 -C . 
&& \ + sed -i 's/pgrx = "^0.12.7"/pgrx = { version = "0.12.9", features = [ "unsafe-postgres" ] }/g' Cargo.toml && \ + cargo pgrx install --release && \ + echo 'trusted = true' >> /usr/local/pgsql/share/extension/pgx_ulid.control ######################################################################################### # @@ -1091,10 +1053,14 @@ ARG PG_VERSION # NOTE: local_proxy depends on the version of pg_session_jwt # Do not update without approve from proxy team # Make sure the version is reflected in proxy/src/serverless/local_conn_pool.rs -RUN wget https://github.com/neondatabase/pg_session_jwt/archive/refs/tags/v0.1.2-v17.tar.gz -O pg_session_jwt.tar.gz && \ - echo "c8ecbed9cb8c6441bce5134a176002b043018adf9d05a08e457dda233090a86e pg_session_jwt.tar.gz" | sha256sum --check && \ +RUN wget https://github.com/neondatabase/pg_session_jwt/archive/refs/tags/v0.2.0.tar.gz -O pg_session_jwt.tar.gz && \ + echo "5ace028e591f2e000ca10afa5b1ca62203ebff014c2907c0ec3b29c36f28a1bb pg_session_jwt.tar.gz" | sha256sum --check && \ mkdir pg_session_jwt-src && cd pg_session_jwt-src && tar xzf ../pg_session_jwt.tar.gz --strip-components=1 -C . 
&& \ - sed -i 's/pgrx = "0.12.6"/pgrx = { version = "=0.12.6", features = [ "unsafe-postgres" ] }/g' Cargo.toml && \ + sed -i 's/pgrx = "0.12.6"/pgrx = { version = "0.12.9", features = [ "unsafe-postgres" ] }/g' Cargo.toml && \ + sed -i 's/version = "0.12.6"/version = "0.12.9"/g' pgrx-tests/Cargo.toml && \ + sed -i 's/pgrx = "=0.12.6"/pgrx = { version = "=0.12.9", features = [ "unsafe-postgres" ] }/g' pgrx-tests/Cargo.toml && \ + sed -i 's/pgrx-macros = "=0.12.6"/pgrx-macros = "=0.12.9"/g' pgrx-tests/Cargo.toml && \ + sed -i 's/pgrx-pg-config = "=0.12.6"/pgrx-pg-config = "=0.12.9"/g' pgrx-tests/Cargo.toml && \ cargo pgrx install --release ######################################################################################### @@ -1104,13 +1070,11 @@ RUN wget https://github.com/neondatabase/pg_session_jwt/archive/refs/tags/v0.1.2 # ######################################################################################### -FROM build-deps AS wal2json-pg-build +FROM pg-build AS wal2json-pg-build ARG PG_VERSION -COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/ # wal2json wal2json_2_6 supports v17 # last release wal2json_2_6 - Apr 25, 2024 -ENV PATH="/usr/local/pgsql/bin/:$PATH" RUN wget https://github.com/eulerto/wal2json/archive/refs/tags/wal2json_2_6.tar.gz -O wal2json.tar.gz && \ echo "18b4bdec28c74a8fc98a11c72de38378a760327ef8e5e42e975b0029eb96ba0d wal2json.tar.gz" | sha256sum --check && \ mkdir wal2json-src && cd wal2json-src && tar xzf ../wal2json.tar.gz --strip-components=1 -C . && \ @@ -1123,13 +1087,11 @@ RUN wget https://github.com/eulerto/wal2json/archive/refs/tags/wal2json_2_6.tar. 
# compile pg_ivm extension # ######################################################################################### -FROM build-deps AS pg-ivm-build +FROM pg-build AS pg-ivm-build ARG PG_VERSION -COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/ # pg_ivm v1.9 supports v17 # last release v1.9 - Jul 31 -ENV PATH="/usr/local/pgsql/bin/:$PATH" RUN wget https://github.com/sraoss/pg_ivm/archive/refs/tags/v1.9.tar.gz -O pg_ivm.tar.gz && \ echo "59e15722939f274650abf637f315dd723c87073496ca77236b044cb205270d8b pg_ivm.tar.gz" | sha256sum --check && \ mkdir pg_ivm-src && cd pg_ivm-src && tar xzf ../pg_ivm.tar.gz --strip-components=1 -C . && \ @@ -1143,13 +1105,11 @@ RUN wget https://github.com/sraoss/pg_ivm/archive/refs/tags/v1.9.tar.gz -O pg_iv # compile pg_partman extension # ######################################################################################### -FROM build-deps AS pg-partman-build +FROM pg-build AS pg-partman-build ARG PG_VERSION -COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/ # should support v17 https://github.com/pgpartman/pg_partman/discussions/693 # last release 5.1.0 Apr 2, 2024 -ENV PATH="/usr/local/pgsql/bin/:$PATH" RUN wget https://github.com/pgpartman/pg_partman/archive/refs/tags/v5.1.0.tar.gz -O pg_partman.tar.gz && \ echo "3e3a27d7ff827295d5c55ef72f07a49062d6204b3cb0b9a048645d6db9f3cb9f pg_partman.tar.gz" | sha256sum --check && \ mkdir pg_partman-src && cd pg_partman-src && tar xzf ../pg_partman.tar.gz --strip-components=1 -C . 
&& \ @@ -1165,24 +1125,12 @@ RUN wget https://github.com/pgpartman/pg_partman/archive/refs/tags/v5.1.0.tar.gz ######################################################################################### FROM rust-extensions-build AS pg-mooncake-build ARG PG_VERSION -COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/ -# The topmost commit in the `neon` branch at the time of writing this -# https://github.com/Mooncake-Labs/pg_mooncake/commits/neon/ -# https://github.com/Mooncake-Labs/pg_mooncake/commit/077c92c452bb6896a7b7776ee95f039984f076af -ENV PG_MOONCAKE_VERSION=077c92c452bb6896a7b7776ee95f039984f076af -ENV PATH="/usr/local/pgsql/bin/:$PATH" - -RUN case "${PG_VERSION}" in \ - 'v14') \ - echo "pg_mooncake is not supported on Postgres ${PG_VERSION}" && exit 0;; \ - esac && \ - git clone --depth 1 --branch neon https://github.com/Mooncake-Labs/pg_mooncake.git pg_mooncake-src && \ - cd pg_mooncake-src && \ - git checkout "${PG_MOONCAKE_VERSION}" && \ - git submodule update --init --depth 1 --recursive && \ - make BUILD_TYPE=release -j $(getconf _NPROCESSORS_ONLN) && \ - make BUILD_TYPE=release -j $(getconf _NPROCESSORS_ONLN) install && \ +RUN wget https://github.com/Mooncake-Labs/pg_mooncake/releases/download/v0.1.0/pg_mooncake-0.1.0.tar.gz -O pg_mooncake.tar.gz && \ + echo "eafd059b77f541f11525eb8affcd66a176968cbd8fe7c0d436e733f2aa4da59f pg_mooncake.tar.gz" | sha256sum --check && \ + mkdir pg_mooncake-src && cd pg_mooncake-src && tar xzf ../pg_mooncake.tar.gz --strip-components=1 -C . 
&& \ + make release -j $(getconf _NPROCESSORS_ONLN) && \ + make install -j $(getconf _NPROCESSORS_ONLN) && \ echo 'trusted = true' >> /usr/local/pgsql/share/extension/pg_mooncake.control ######################################################################################### @@ -1192,11 +1140,8 @@ RUN case "${PG_VERSION}" in \ # ######################################################################################### -FROM build-deps AS pg-repack-build +FROM pg-build AS pg-repack-build ARG PG_VERSION -COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/ - -ENV PATH="/usr/local/pgsql/bin/:$PATH" RUN wget https://github.com/reorg/pg_repack/archive/refs/tags/ver_1.5.2.tar.gz -O pg_repack.tar.gz && \ echo '4516cad42251ed3ad53ff619733004db47d5755acac83f75924cd94d1c4fb681 pg_repack.tar.gz' | sha256sum --check && \ @@ -1238,6 +1183,7 @@ COPY --from=timescaledb-pg-build /usr/local/pgsql/ /usr/local/pgsql/ COPY --from=pg-hint-plan-pg-build /usr/local/pgsql/ /usr/local/pgsql/ COPY --from=pg-cron-pg-build /usr/local/pgsql/ /usr/local/pgsql/ COPY --from=pg-pgx-ulid-build /usr/local/pgsql/ /usr/local/pgsql/ +COPY --from=pg-pgx-ulid-pgrx12-build /usr/local/pgsql/ /usr/local/pgsql/ COPY --from=pg-session-jwt-build /usr/local/pgsql/ /usr/local/pgsql/ COPY --from=rdkit-pg-build /usr/local/pgsql/ /usr/local/pgsql/ COPY --from=pg-uuidv7-pg-build /usr/local/pgsql/ /usr/local/pgsql/ @@ -1267,20 +1213,6 @@ RUN make -j $(getconf _NPROCESSORS_ONLN) \ make -j $(getconf _NPROCESSORS_ONLN) \ PG_CONFIG=/usr/local/pgsql/bin/pg_config \ -C pgxn/neon_rmgr \ - -s install && \ - case "${PG_VERSION}" in \ - "v14" | "v15") \ - ;; \ - "v16" | "v17") \ - echo "Skipping HNSW for PostgreSQL ${PG_VERSION}" && exit 0 \ - ;; \ - *) \ - echo "unexpected PostgreSQL version" && exit 1 \ - ;; \ - esac && \ - make -j $(getconf _NPROCESSORS_ONLN) \ - PG_CONFIG=/usr/local/pgsql/bin/pg_config \ - -C pgxn/hnsw \ -s install 
######################################################################################### @@ -1297,17 +1229,6 @@ USER nonroot COPY --chown=nonroot . . RUN mold -run cargo build --locked --profile release-line-debug-size-lto --bin compute_ctl --bin fast_import --bin local_proxy -######################################################################################### -# -# Final compute-tools image -# -######################################################################################### - -FROM debian:$DEBIAN_FLAVOR AS compute-tools-image - -COPY --from=compute-tools /home/nonroot/target/release-line-debug-size-lto/compute_ctl /usr/local/bin/compute_ctl -COPY --from=compute-tools /home/nonroot/target/release-line-debug-size-lto/fast_import /usr/local/bin/fast_import - ######################################################################################### # # Layer "pgbouncer" @@ -1344,11 +1265,11 @@ RUN set -e \ # ######################################################################################### -FROM quay.io/prometheuscommunity/postgres-exporter:v0.12.1 AS postgres-exporter +FROM quay.io/prometheuscommunity/postgres-exporter:v0.16.0 AS postgres-exporter # Keep the version the same as in build-tools.Dockerfile and # test_runner/regress/test_compute_metrics.py. 
-FROM burningalchemist/sql_exporter:0.16.0 AS sql-exporter +FROM burningalchemist/sql_exporter:0.17.0 AS sql-exporter ######################################################################################### # diff --git a/compute_tools/Cargo.toml b/compute_tools/Cargo.toml index 33892813c4..b04f364cbb 100644 --- a/compute_tools/Cargo.toml +++ b/compute_tools/Cargo.toml @@ -51,6 +51,7 @@ tracing-subscriber.workspace = true tracing-utils.workspace = true thiserror.workspace = true url.workspace = true +uuid.workspace = true prometheus.workspace = true postgres_initdb.workspace = true diff --git a/compute_tools/src/bin/compute_ctl.rs b/compute_tools/src/bin/compute_ctl.rs index 04432ad0f3..b98cf706d3 100644 --- a/compute_tools/src/bin/compute_ctl.rs +++ b/compute_tools/src/bin/compute_ctl.rs @@ -111,11 +111,6 @@ fn main() -> Result<()> { fn init() -> Result<(String, clap::ArgMatches)> { init_tracing_and_logging(DEFAULT_LOG_LEVEL)?; - opentelemetry::global::set_error_handler(|err| { - tracing::info!("OpenTelemetry error: {err}"); - }) - .expect("global error handler lock poisoned"); - let mut signals = Signals::new([SIGINT, SIGTERM, SIGQUIT])?; thread::spawn(move || { for sig in signals.forever() { diff --git a/compute_tools/src/bin/fast_import.rs b/compute_tools/src/bin/fast_import.rs index 793ec4cf10..5b008f8182 100644 --- a/compute_tools/src/bin/fast_import.rs +++ b/compute_tools/src/bin/fast_import.rs @@ -17,7 +17,7 @@ //! //! # Local Testing //! -//! - Comment out most of the pgxns in The Dockerfile.compute-tools to speed up the build. +//! - Comment out most of the pgxns in compute-node.Dockerfile to speed up the build. //! - Build the image with the following command: //! //! 
```bash @@ -31,7 +31,7 @@ use camino::{Utf8Path, Utf8PathBuf}; use clap::Parser; use compute_tools::extension_server::{get_pg_version, PostgresMajorVersion}; use nix::unistd::Pid; -use tracing::{info, info_span, warn, Instrument}; +use tracing::{error, info, info_span, warn, Instrument}; use utils::fs_ext::is_directory_empty; #[path = "fast_import/aws_s3_sync.rs"] @@ -41,12 +41,19 @@ mod child_stdio_to_log; #[path = "fast_import/s3_uri.rs"] mod s3_uri; +const PG_WAIT_TIMEOUT: std::time::Duration = std::time::Duration::from_secs(600); +const PG_WAIT_RETRY_INTERVAL: std::time::Duration = std::time::Duration::from_millis(300); + #[derive(clap::Parser)] struct Args { #[clap(long)] working_directory: Utf8PathBuf, #[clap(long, env = "NEON_IMPORTER_S3_PREFIX")] - s3_prefix: s3_uri::S3Uri, + s3_prefix: Option, + #[clap(long)] + source_connection_string: Option, + #[clap(short, long)] + interactive: bool, #[clap(long)] pg_bin_dir: Utf8PathBuf, #[clap(long)] @@ -77,30 +84,70 @@ pub(crate) async fn main() -> anyhow::Result<()> { info!("starting"); - let Args { - working_directory, - s3_prefix, - pg_bin_dir, - pg_lib_dir, - } = Args::parse(); + let args = Args::parse(); - let aws_config = aws_config::load_defaults(BehaviorVersion::v2024_03_28()).await; + // Validate arguments + if args.s3_prefix.is_none() && args.source_connection_string.is_none() { + anyhow::bail!("either s3_prefix or source_connection_string must be specified"); + } + if args.s3_prefix.is_some() && args.source_connection_string.is_some() { + anyhow::bail!("only one of s3_prefix or source_connection_string can be specified"); + } - let spec: Spec = { - let spec_key = s3_prefix.append("/spec.json"); - let s3_client = aws_sdk_s3::Client::new(&aws_config); - let object = s3_client - .get_object() - .bucket(&spec_key.bucket) - .key(spec_key.key) - .send() - .await - .context("get spec from s3")? 
- .body - .collect() - .await - .context("download spec body")?; - serde_json::from_slice(&object.into_bytes()).context("parse spec as json")? + let working_directory = args.working_directory; + let pg_bin_dir = args.pg_bin_dir; + let pg_lib_dir = args.pg_lib_dir; + + // Initialize AWS clients only if s3_prefix is specified + let (aws_config, kms_client) = if args.s3_prefix.is_some() { + let config = aws_config::load_defaults(BehaviorVersion::v2024_03_28()).await; + let kms = aws_sdk_kms::Client::new(&config); + (Some(config), Some(kms)) + } else { + (None, None) + }; + + // Get source connection string either from S3 spec or direct argument + let source_connection_string = if let Some(s3_prefix) = &args.s3_prefix { + let spec: Spec = { + let spec_key = s3_prefix.append("/spec.json"); + let s3_client = aws_sdk_s3::Client::new(aws_config.as_ref().unwrap()); + let object = s3_client + .get_object() + .bucket(&spec_key.bucket) + .key(spec_key.key) + .send() + .await + .context("get spec from s3")? + .body + .collect() + .await + .context("download spec body")?; + serde_json::from_slice(&object.into_bytes()).context("parse spec as json")? + }; + + match spec.encryption_secret { + EncryptionSecret::KMS { key_id } => { + let mut output = kms_client + .unwrap() + .decrypt() + .key_id(key_id) + .ciphertext_blob(aws_sdk_s3::primitives::Blob::new( + spec.source_connstring_ciphertext_base64, + )) + .send() + .await + .context("decrypt source connection string")?; + let plaintext = output + .plaintext + .take() + .context("get plaintext source connection string")?; + String::from_utf8(plaintext.into_inner()) + .context("parse source connection string as utf8")? 
+ } + } + } else { + args.source_connection_string.unwrap() }; match tokio::fs::create_dir(&working_directory).await { @@ -123,15 +170,6 @@ pub(crate) async fn main() -> anyhow::Result<()> { .await .context("create pgdata directory")?; - // - // Setup clients - // - let aws_config = aws_config::load_defaults(BehaviorVersion::v2024_03_28()).await; - let kms_client = aws_sdk_kms::Client::new(&aws_config); - - // - // Initialize pgdata - // let pgbin = pg_bin_dir.join("postgres"); let pg_version = match get_pg_version(pgbin.as_ref()) { PostgresMajorVersion::V14 => 14, @@ -170,7 +208,13 @@ pub(crate) async fn main() -> anyhow::Result<()> { .args(["-c", &format!("max_parallel_workers={nproc}")]) .args(["-c", &format!("max_parallel_workers_per_gather={nproc}")]) .args(["-c", &format!("max_worker_processes={nproc}")]) - .args(["-c", "effective_io_concurrency=100"]) + .args([ + "-c", + &format!( + "effective_io_concurrency={}", + if cfg!(target_os = "macos") { 0 } else { 100 } + ), + ]) .env_clear() .stdout(std::process::Stdio::piped()) .stderr(std::process::Stdio::piped()) @@ -185,44 +229,58 @@ pub(crate) async fn main() -> anyhow::Result<()> { ) .instrument(info_span!("postgres")), ); + + // Create neondb database in the running postgres let restore_pg_connstring = format!("host=localhost port=5432 user={superuser} dbname=postgres"); + + let start_time = std::time::Instant::now(); + loop { - let res = tokio_postgres::connect(&restore_pg_connstring, tokio_postgres::NoTls).await; - if res.is_ok() { - info!("postgres is ready, could connect to it"); - break; + if start_time.elapsed() > PG_WAIT_TIMEOUT { + error!( + "timeout exceeded: failed to poll postgres and create database within 10 minutes" + ); + std::process::exit(1); + } + + match tokio_postgres::connect(&restore_pg_connstring, tokio_postgres::NoTls).await { + Ok((client, connection)) => { + // Spawn the connection handling task to maintain the connection + tokio::spawn(async move { + if let Err(e) = 
connection.await { + warn!("connection error: {}", e); + } + }); + + match client.simple_query("CREATE DATABASE neondb;").await { + Ok(_) => { + info!("created neondb database"); + break; + } + Err(e) => { + warn!( + "failed to create database: {}, retying in {}s", + e, + PG_WAIT_RETRY_INTERVAL.as_secs_f32() + ); + tokio::time::sleep(PG_WAIT_RETRY_INTERVAL).await; + continue; + } + } + } + Err(_) => { + info!( + "postgres not ready yet, retrying in {}s", + PG_WAIT_RETRY_INTERVAL.as_secs_f32() + ); + tokio::time::sleep(PG_WAIT_RETRY_INTERVAL).await; + continue; + } } } - // - // Decrypt connection string - // - let source_connection_string = { - match spec.encryption_secret { - EncryptionSecret::KMS { key_id } => { - let mut output = kms_client - .decrypt() - .key_id(key_id) - .ciphertext_blob(aws_sdk_s3::primitives::Blob::new( - spec.source_connstring_ciphertext_base64, - )) - .send() - .await - .context("decrypt source connection string")?; - let plaintext = output - .plaintext - .take() - .context("get plaintext source connection string")?; - String::from_utf8(plaintext.into_inner()) - .context("parse source connection string as utf8")? - } - } - }; - - // - // Start the work - // + let restore_pg_connstring = restore_pg_connstring.replace("dbname=postgres", "dbname=neondb"); let dumpdir = working_directory.join("dumpdir"); @@ -310,6 +368,12 @@ pub(crate) async fn main() -> anyhow::Result<()> { } } + // If interactive mode, wait for Ctrl+C + if args.interactive { + info!("Running in interactive mode. 
Press Ctrl+C to shut down."); + tokio::signal::ctrl_c().await.context("wait for ctrl-c")?; + } + info!("shutdown postgres"); { nix::sys::signal::kill( @@ -325,21 +389,24 @@ pub(crate) async fn main() -> anyhow::Result<()> { .context("wait for postgres to shut down")?; } - info!("upload pgdata"); - aws_s3_sync::sync(Utf8Path::new(&pgdata_dir), &s3_prefix.append("/pgdata/")) - .await - .context("sync dump directory to destination")?; - - info!("write status"); - { - let status_dir = working_directory.join("status"); - std::fs::create_dir(&status_dir).context("create status directory")?; - let status_file = status_dir.join("pgdata"); - std::fs::write(&status_file, serde_json::json!({"done": true}).to_string()) - .context("write status file")?; - aws_s3_sync::sync(&status_dir, &s3_prefix.append("/status/")) + // Only sync if s3_prefix was specified + if let Some(s3_prefix) = args.s3_prefix { + info!("upload pgdata"); + aws_s3_sync::sync(Utf8Path::new(&pgdata_dir), &s3_prefix.append("/pgdata/")) .await - .context("sync status directory to destination")?; + .context("sync dump directory to destination")?; + + info!("write status"); + { + let status_dir = working_directory.join("status"); + std::fs::create_dir(&status_dir).context("create status directory")?; + let status_file = status_dir.join("pgdata"); + std::fs::write(&status_file, serde_json::json!({"done": true}).to_string()) + .context("write status file")?; + aws_s3_sync::sync(&status_dir, &s3_prefix.append("/status/")) + .await + .context("sync status directory to destination")?; + } } Ok(()) diff --git a/compute_tools/src/http/routes/extension_server.rs b/compute_tools/src/http/routes/extension_server.rs index ee5bc675ba..5cc9b6d277 100644 --- a/compute_tools/src/http/routes/extension_server.rs +++ b/compute_tools/src/http/routes/extension_server.rs @@ -17,7 +17,8 @@ use crate::{ #[derive(Debug, Clone, Deserialize)] pub(in crate::http) struct ExtensionServerParams { - is_library: Option, + #[serde(default)] + 
is_library: bool, } /// Download a remote extension. @@ -51,7 +52,7 @@ pub(in crate::http) async fn download_extension( remote_extensions.get_ext( &filename, - params.is_library.unwrap_or(false), + params.is_library, &compute.build_tag, &compute.pgversion, ) diff --git a/compute_tools/src/http/server.rs b/compute_tools/src/http/server.rs index 33d4b489a0..40fb1f4b4d 100644 --- a/compute_tools/src/http/server.rs +++ b/compute_tools/src/http/server.rs @@ -1,15 +1,14 @@ use std::{ net::{IpAddr, Ipv6Addr, SocketAddr}, - sync::{ - atomic::{AtomicU64, Ordering}, - Arc, - }, + sync::Arc, thread, time::Duration, }; use anyhow::Result; use axum::{ + extract::Request, + middleware::{self, Next}, response::{IntoResponse, Response}, routing::{get, post}, Router, @@ -17,11 +16,9 @@ use axum::{ use http::StatusCode; use tokio::net::TcpListener; use tower::ServiceBuilder; -use tower_http::{ - request_id::{MakeRequestId, PropagateRequestIdLayer, RequestId, SetRequestIdLayer}, - trace::TraceLayer, -}; +use tower_http::{request_id::PropagateRequestIdLayer, trace::TraceLayer}; use tracing::{debug, error, info, Span}; +use uuid::Uuid; use super::routes::{ check_writability, configure, database_schema, dbs_and_roles, extension_server, extensions, @@ -34,30 +31,24 @@ async fn handle_404() -> Response { StatusCode::NOT_FOUND.into_response() } -#[derive(Clone, Default)] -struct ComputeMakeRequestId(Arc); +const X_REQUEST_ID: &str = "x-request-id"; -impl MakeRequestId for ComputeMakeRequestId { - fn make_request_id( - &mut self, - _request: &http::Request, - ) -> Option { - let request_id = self - .0 - .fetch_add(1, Ordering::SeqCst) - .to_string() - .parse() - .unwrap(); +/// This middleware function allows compute_ctl to generate its own request ID +/// if one isn't supplied. The control plane will always send one as a UUID. The +/// neon Postgres extension on the other hand does not send one. 
+async fn maybe_add_request_id_header(mut request: Request, next: Next) -> Response { + let headers = request.headers_mut(); - Some(RequestId::new(request_id)) + if headers.get(X_REQUEST_ID).is_none() { + headers.append(X_REQUEST_ID, Uuid::new_v4().to_string().parse().unwrap()); } + + next.run(request).await } /// Run the HTTP server and wait on it forever. #[tokio::main] async fn serve(port: u16, compute: Arc) { - const X_REQUEST_ID: &str = "x-request-id"; - let mut app = Router::new() .route("/check_writability", post(check_writability::is_writable)) .route("/configure", post(configure::configure)) @@ -82,9 +73,8 @@ async fn serve(port: u16, compute: Arc) { .fallback(handle_404) .layer( ServiceBuilder::new() - .layer(SetRequestIdLayer::x_request_id( - ComputeMakeRequestId::default(), - )) + // Add this middleware since we assume the request ID exists + .layer(middleware::from_fn(maybe_add_request_id_header)) .layer( TraceLayer::new_for_http() .on_request(|request: &http::Request<_>, _span: &Span| { diff --git a/control_plane/src/local_env.rs b/control_plane/src/local_env.rs index 5b82acb3a5..2fe4cd5202 100644 --- a/control_plane/src/local_env.rs +++ b/control_plane/src/local_env.rs @@ -483,7 +483,6 @@ impl LocalEnv { .iter() .find(|(mapped_tenant_id, _)| mapped_tenant_id == &tenant_id) .map(|&(_, timeline_id)| timeline_id) - .map(TimelineId::from) } pub fn timeline_name_mappings(&self) -> HashMap { diff --git a/control_plane/src/storage_controller.rs b/control_plane/src/storage_controller.rs index 22d2420ed4..c41ff22d15 100644 --- a/control_plane/src/storage_controller.rs +++ b/control_plane/src/storage_controller.rs @@ -822,10 +822,7 @@ impl StorageController { self.dispatch( Method::PUT, format!("control/v1/tenant/{tenant_shard_id}/migrate"), - Some(TenantShardMigrateRequest { - tenant_shard_id, - node_id, - }), + Some(TenantShardMigrateRequest { node_id }), ) .await } diff --git a/control_plane/storcon_cli/src/main.rs b/control_plane/storcon_cli/src/main.rs 
index 617b2cd1ba..d9b76b9600 100644 --- a/control_plane/storcon_cli/src/main.rs +++ b/control_plane/storcon_cli/src/main.rs @@ -1,12 +1,17 @@ use futures::StreamExt; -use std::{str::FromStr, time::Duration}; +use std::{ + collections::{HashMap, HashSet}, + str::FromStr, + time::Duration, +}; use clap::{Parser, Subcommand}; use pageserver_api::{ controller_api::{ AvailabilityZone, NodeAvailabilityWrapper, NodeDescribeResponse, NodeShardResponse, - SafekeeperDescribeResponse, ShardSchedulingPolicy, TenantCreateRequest, - TenantDescribeResponse, TenantPolicyRequest, + SafekeeperDescribeResponse, SafekeeperSchedulingPolicyRequest, ShardSchedulingPolicy, + ShardsPreferredAzsRequest, SkSchedulingPolicy, TenantCreateRequest, TenantDescribeResponse, + TenantPolicyRequest, }, models::{ EvictionPolicy, EvictionPolicyLayerAccessThreshold, LocationConfigSecondary, @@ -112,6 +117,13 @@ enum Command { #[arg(long)] node: NodeId, }, + /// Migrate the secondary location for a tenant shard to a specific pageserver. + TenantShardMigrateSecondary { + #[arg(long)] + tenant_shard_id: TenantShardId, + #[arg(long)] + node: NodeId, + }, /// Cancel any ongoing reconciliation for this shard TenantShardCancelReconcile { #[arg(long)] @@ -146,6 +158,12 @@ enum Command { #[arg(long)] tenant_id: TenantId, }, + TenantSetPreferredAz { + #[arg(long)] + tenant_id: TenantId, + #[arg(long)] + preferred_az: Option, + }, /// Uncleanly drop a tenant from the storage controller: this doesn't delete anything from pageservers. Appropriate /// if you e.g. used `tenant-warmup` by mistake on a tenant ID that doesn't really exist, or is in some other region. 
TenantDrop { @@ -214,6 +232,13 @@ enum Command { }, /// List safekeepers known to the storage controller Safekeepers {}, + /// Set the scheduling policy of the specified safekeeper + SafekeeperScheduling { + #[arg(long)] + node_id: NodeId, + #[arg(long)] + scheduling_policy: SkSchedulingPolicyArg, + }, } #[derive(Parser)] @@ -266,6 +291,17 @@ impl FromStr for PlacementPolicyArg { } } +#[derive(Debug, Clone)] +struct SkSchedulingPolicyArg(SkSchedulingPolicy); + +impl FromStr for SkSchedulingPolicyArg { + type Err = anyhow::Error; + + fn from_str(s: &str) -> Result { + SkSchedulingPolicy::from_str(s).map(Self) + } +} + #[derive(Debug, Clone)] struct ShardSchedulingPolicyArg(ShardSchedulingPolicy); @@ -395,11 +431,12 @@ async fn main() -> anyhow::Result<()> { resp.sort_by(|a, b| a.listen_http_addr.cmp(&b.listen_http_addr)); let mut table = comfy_table::Table::new(); - table.set_header(["Id", "Hostname", "Scheduling", "Availability"]); + table.set_header(["Id", "Hostname", "AZ", "Scheduling", "Availability"]); for node in resp { table.add_row([ format!("{}", node.id), node.listen_http_addr, + node.availability_zone_id, format!("{:?}", node.scheduling), format!("{:?}", node.availability), ]); @@ -459,33 +496,65 @@ async fn main() -> anyhow::Result<()> { println!("{table}"); } Command::Tenants { node_id: None } => { - let mut resp = storcon_client - .dispatch::<(), Vec>( - Method::GET, - "control/v1/tenant".to_string(), - None, - ) - .await?; - - resp.sort_by(|a, b| a.tenant_id.cmp(&b.tenant_id)); - + // Set up output formatting let mut table = comfy_table::Table::new(); table.set_header([ "TenantId", + "Preferred AZ", "ShardCount", "StripeSize", "Placement", "Scheduling", ]); - for tenant in resp { - let shard_zero = tenant.shards.into_iter().next().unwrap(); - table.add_row([ - format!("{}", tenant.tenant_id), - format!("{}", shard_zero.tenant_shard_id.shard_count.literal()), - format!("{:?}", tenant.stripe_size), - format!("{:?}", tenant.policy), - format!("{:?}", 
shard_zero.scheduling_policy), - ]); + + // Pagination loop over listing API + let mut start_after = None; + const LIMIT: usize = 1000; + loop { + let path = match start_after { + None => format!("control/v1/tenant?limit={LIMIT}"), + Some(start_after) => { + format!("control/v1/tenant?limit={LIMIT}&start_after={start_after}") + } + }; + + let resp = storcon_client + .dispatch::<(), Vec>(Method::GET, path, None) + .await?; + + if resp.is_empty() { + // End of data reached + break; + } + + // Give some visual feedback while we're building up the table (comfy_table doesn't have + // streaming output) + if resp.len() >= LIMIT { + eprint!("."); + } + + start_after = Some(resp.last().unwrap().tenant_id); + + for tenant in resp { + let shard_zero = tenant.shards.into_iter().next().unwrap(); + table.add_row([ + format!("{}", tenant.tenant_id), + shard_zero + .preferred_az_id + .as_ref() + .cloned() + .unwrap_or("".to_string()), + format!("{}", shard_zero.tenant_shard_id.shard_count.literal()), + format!("{:?}", tenant.stripe_size), + format!("{:?}", tenant.policy), + format!("{:?}", shard_zero.scheduling_policy), + ]); + } + } + + // Terminate progress dots + if table.row_count() > LIMIT { + eprint!(""); } println!("{table}"); @@ -540,10 +609,7 @@ async fn main() -> anyhow::Result<()> { tenant_shard_id, node, } => { - let req = TenantShardMigrateRequest { - tenant_shard_id, - node_id: node, - }; + let req = TenantShardMigrateRequest { node_id: node }; storcon_client .dispatch::( @@ -553,6 +619,20 @@ async fn main() -> anyhow::Result<()> { ) .await?; } + Command::TenantShardMigrateSecondary { + tenant_shard_id, + node, + } => { + let req = TenantShardMigrateRequest { node_id: node }; + + storcon_client + .dispatch::( + Method::PUT, + format!("control/v1/tenant/{tenant_shard_id}/migrate_secondary"), + Some(req), + ) + .await?; + } Command::TenantShardCancelReconcile { tenant_shard_id } => { storcon_client .dispatch::<(), ()>( @@ -596,6 +676,19 @@ async fn main() -> 
anyhow::Result<()> { None, ) .await?; + + let nodes = storcon_client + .dispatch::<(), Vec>( + Method::GET, + "control/v1/node".to_string(), + None, + ) + .await?; + let nodes = nodes + .into_iter() + .map(|n| (n.id, n)) + .collect::>(); + println!("Tenant {tenant_id}"); let mut table = comfy_table::Table::new(); table.add_row(["Policy", &format!("{:?}", policy)]); @@ -604,7 +697,14 @@ async fn main() -> anyhow::Result<()> { println!("{table}"); println!("Shards:"); let mut table = comfy_table::Table::new(); - table.set_header(["Shard", "Attached", "Secondary", "Last error", "status"]); + table.set_header([ + "Shard", + "Attached", + "Attached AZ", + "Secondary", + "Last error", + "status", + ]); for shard in shards { let secondary = shard .node_secondary @@ -627,11 +727,18 @@ async fn main() -> anyhow::Result<()> { } let status = status_parts.join(","); + let attached_node = shard + .node_attached + .as_ref() + .map(|id| nodes.get(id).expect("Shard references nonexistent node")); + table.add_row([ format!("{}", shard.tenant_shard_id), - shard - .node_attached - .map(|n| format!("{}", n)) + attached_node + .map(|n| format!("{} ({})", n.listen_http_addr, n.id)) + .unwrap_or(String::new()), + attached_node + .map(|n| n.availability_zone_id.clone()) .unwrap_or(String::new()), secondary, shard.last_error, @@ -640,6 +747,66 @@ async fn main() -> anyhow::Result<()> { } println!("{table}"); } + Command::TenantSetPreferredAz { + tenant_id, + preferred_az, + } => { + // First learn about the tenant's shards + let describe_response = storcon_client + .dispatch::<(), TenantDescribeResponse>( + Method::GET, + format!("control/v1/tenant/{tenant_id}"), + None, + ) + .await?; + + // Learn about nodes to validate the AZ ID + let nodes = storcon_client + .dispatch::<(), Vec>( + Method::GET, + "control/v1/node".to_string(), + None, + ) + .await?; + + if let Some(preferred_az) = &preferred_az { + let azs = nodes + .into_iter() + .map(|n| (n.availability_zone_id)) + .collect::>(); + 
if !azs.contains(preferred_az) { + anyhow::bail!( + "AZ {} not found on any node: known AZs are: {:?}", + preferred_az, + azs + ); + } + } else { + // Make it obvious to the user that since they've omitted an AZ, we're clearing it + eprintln!("Clearing preferred AZ for tenant {}", tenant_id); + } + + // Construct a request that modifies all the tenant's shards + let req = ShardsPreferredAzsRequest { + preferred_az_ids: describe_response + .shards + .into_iter() + .map(|s| { + ( + s.tenant_shard_id, + preferred_az.clone().map(AvailabilityZone), + ) + }) + .collect(), + }; + storcon_client + .dispatch::( + Method::PUT, + "control/v1/preferred_azs".to_string(), + Some(req), + ) + .await?; + } Command::TenantWarmup { tenant_id } => { let describe_response = storcon_client .dispatch::<(), TenantDescribeResponse>( @@ -915,10 +1082,7 @@ async fn main() -> anyhow::Result<()> { .dispatch::( Method::PUT, format!("control/v1/tenant/{}/migrate", mv.tenant_shard_id), - Some(TenantShardMigrateRequest { - tenant_shard_id: mv.tenant_shard_id, - node_id: mv.to, - }), + Some(TenantShardMigrateRequest { node_id: mv.to }), ) .await .map_err(|e| (mv.tenant_shard_id, mv.from, mv.to, e)) @@ -1057,6 +1221,23 @@ async fn main() -> anyhow::Result<()> { } println!("{table}"); } + Command::SafekeeperScheduling { + node_id, + scheduling_policy, + } => { + let scheduling_policy = scheduling_policy.0; + storcon_client + .dispatch::( + Method::POST, + format!("control/v1/safekeeper/{node_id}/scheduling_policy"), + Some(SafekeeperSchedulingPolicyRequest { scheduling_policy }), + ) + .await?; + println!( + "Scheduling policy of {node_id} set to {}", + String::from(scheduling_policy) + ); + } } Ok(()) diff --git a/docs/docker.md b/docs/docker.md index 0914a00082..ae74c2b2ab 100644 --- a/docs/docker.md +++ b/docs/docker.md @@ -7,15 +7,11 @@ Currently we build two main images: - [neondatabase/neon](https://hub.docker.com/repository/docker/neondatabase/neon) — image with pre-built `pageserver`, 
`safekeeper` and `proxy` binaries and all the required runtime dependencies. Built from [/Dockerfile](/Dockerfile). - [neondatabase/compute-node-v16](https://hub.docker.com/repository/docker/neondatabase/compute-node-v16) — compute node image with pre-built Postgres binaries from [neondatabase/postgres](https://github.com/neondatabase/postgres). Similar images exist for v15 and v14. Built from [/compute-node/Dockerfile](/compute/compute-node.Dockerfile). -And additional intermediate image: - -- [neondatabase/compute-tools](https://hub.docker.com/repository/docker/neondatabase/compute-tools) — compute node configuration management tools. - ## Build pipeline We build all images after a successful `release` tests run and push automatically to Docker Hub with two parallel CI jobs -1. `neondatabase/compute-tools` and `neondatabase/compute-node-v16` (and -v15 and -v14) +1. `neondatabase/compute-node-v17` (and -v16, -v15, -v14) 2. `neondatabase/neon` diff --git a/docs/rfcs/035-safekeeper-dynamic-membership-change.md b/docs/rfcs/035-safekeeper-dynamic-membership-change.md index 239ec58186..cea9af34ab 100644 --- a/docs/rfcs/035-safekeeper-dynamic-membership-change.md +++ b/docs/rfcs/035-safekeeper-dynamic-membership-change.md @@ -81,7 +81,7 @@ configuration generation in them is less than its current one. Namely, it refuses to vote, to truncate WAL in `handle_elected` and to accept WAL. In response it sends its current configuration generation to let walproposer know. -Safekeeper gets `PUT /v1/tenants/{tenant_id}/timelines/{timeline_id}/configuration` +Safekeeper gets `PUT /v1/tenants/{tenant_id}/timelines/{timeline_id}/configuration` accepting `Configuration`. Safekeeper switches to the given conf it is higher than its current one and ignores it otherwise. In any case it replies with ``` @@ -103,7 +103,7 @@ currently and tries to communicate with all of them. However, the list does not define consensus members.
Instead, on start walproposer tracks highest configuration it receives from `AcceptorGreeting`s. Once it assembles greetings from majority of `sk_set` and majority of `new_sk_set` (if it is present), it -establishes this configuration as its own and moves to voting. +establishes this configuration as its own and moves to voting. It should stop talking to safekeepers not listed in the configuration at this point, though it is not unsafe to continue doing so. @@ -119,7 +119,7 @@ refusal to accept due to configuration change) it simply restarts. The following algorithm can be executed anywhere having access to configuration storage and safekeepers. It is safe to interrupt / restart it and run multiple instances of it concurrently, though likely one of them won't make -progress then. It accepts `desired_set: Vec` as input. +progress then. It accepts `desired_set: Vec` as input. Algorithm will refuse to make the change if it encounters previous interrupted change attempt, but in this case it will try to finish it. @@ -140,7 +140,7 @@ storage are reachable. safe. Failed CAS aborts the procedure. 4) Call `PUT` `configuration` on safekeepers from the current set, delivering them `joint_conf`. Collecting responses from majority is required - to proceed. If any response returned generation higher than + to proceed. If any response returned generation higher than `joint_conf.generation`, abort (another switch raced us). Otherwise, choose max `` among responses and establish it as (in memory) `sync_position`. Also choose max `term` and establish it as (in @@ -149,49 +149,49 @@ storage are reachable. without ack from the new set. Similarly, we'll bump term on new majority to `sync_term` so that two computes with the same term are never elected. 4) Initialize timeline on safekeeper(s) from `new_sk_set` where it - doesn't exist yet by doing `pull_timeline` from the majority of the + doesn't exist yet by doing `pull_timeline` from the majority of the current set. 
Doing that on majority of `new_sk_set` is enough to proceed, but it is reasonable to ensure that all `new_sk_set` members are initialized -- if some of them are down why are we migrating there? -5) Call `POST` `bump_term(sync_term)` on safekeepers from the new set. +5) Call `POST` `bump_term(sync_term)` on safekeepers from the new set. Success on majority is enough. 6) Repeatedly call `PUT` `configuration` on safekeepers from the new set, delivering them `joint_conf` and collecting their positions. This will - switch them to the `joint_conf` which generally won't be needed + switch them to the `joint_conf` which generally won't be needed because `pull_timeline` already includes it and plus additionally would be broadcast by compute. More importantly, we may proceed to the next step - only when `` on the majority of the new set reached - `sync_position`. Similarly, on the happy path no waiting is not needed because + only when `` on the majority of the new set reached + `sync_position`. Similarly, on the happy path no waiting is not needed because `pull_timeline` already includes it. However, we should double check to be safe. For example, timeline could have been created earlier e.g. - manually or after try-to-migrate, abort, try-to-migrate-again sequence. -7) Create `new_conf: Configuration` incrementing `join_conf` generation and having new - safekeeper set as `sk_set` and None `new_sk_set`. Write it to configuration + manually or after try-to-migrate, abort, try-to-migrate-again sequence. +7) Create `new_conf: Configuration` incrementing `join_conf` generation and having new + safekeeper set as `sk_set` and None `new_sk_set`. Write it to configuration storage under one more CAS. 8) Call `PUT` `configuration` on safekeepers from the new set, - delivering them `new_conf`. It is enough to deliver it to the majority + delivering them `new_conf`. It is enough to deliver it to the majority of the new set; the rest can be updated by compute. 
I haven't put huge effort to make the description above very precise, because it is natural language prone to interpretations anyway. Instead I'd like to make TLA+ spec of it. -Description above focuses on safety. To make the flow practical and live, here a few more +Description above focuses on safety. To make the flow practical and live, here a few more considerations. -1) It makes sense to ping new set to ensure it we are migrating to live node(s) before +1) It makes sense to ping new set to ensure it we are migrating to live node(s) before step 3. -2) If e.g. accidentally wrong new sk set has been specified, before CAS in step `6` is completed +2) If e.g. accidentally wrong new sk set has been specified, before CAS in step `6` is completed it is safe to rollback to the old conf with one more CAS. -3) On step 4 timeline might be already created on members of the new set for various reasons; +3) On step 4 timeline might be already created on members of the new set for various reasons; the simplest is the procedure restart. There are more complicated scenarious like mentioned - in step 5. Deleting and re-doing `pull_timeline` is generally unsafe without involving - generations, so seems simpler to treat existing timeline as success. However, this also + in step 5. Deleting and re-doing `pull_timeline` is generally unsafe without involving + generations, so seems simpler to treat existing timeline as success. However, this also has a disadvantage: you might imagine an surpassingly unlikely schedule where condition in the step 5 is never reached until compute is (re)awaken up to synchronize new member(s). I don't think we'll observe this in practice, but can add waking up compute if needed. 4) In the end timeline should be locally deleted on the safekeeper(s) which are in the old set but not in the new one, unless they are unreachable. 
To be - safe this also should be done under generation number (deletion proceeds only if + safe this also should be done under generation number (deletion proceeds only if current configuration is <= than one in request and safekeeper is not memeber of it). 5) If current conf fetched on step 1 is already not joint and members equal to `desired_set`, jump to step 7, using it as `new_conf`. @@ -202,47 +202,87 @@ The procedure ought to be driven from somewhere. Obvious candidates are control plane and storage_controller; and as each of them already has db we don't want yet another storage. I propose to manage safekeepers in storage_controller because 1) since it is in rust it simplifies simulation testing (more on this -below) 2) it already manages pageservers. +below) 2) it already manages pageservers. This assumes that migration will be fully usable only after we migrate all tenants/timelines to storage_controller. It is discussible whether we want also to manage pageserver attachments for all of these, but likely we do. -This requires us to define storcon <-> cplane interface. +This requires us to define storcon <-> cplane interface and changes. -### storage_controller <-> control plane interface +### storage_controller <-> control plane interface and changes First of all, control plane should [change](https://neondb.slack.com/archives/C03438W3FLZ/p1719226543199829) storing safekeepers per timeline instead of per tenant because we can't migrate -tenants atomically. +tenants atomically. The important question is how updated configuration is delivered from storage_controller to control plane to provide it to computes. As always, there are two options, pull and push. 
Let's do it the same push as with pageserver `/notify-attach` because 1) it keeps storage_controller out of critical compute -start path 2) provides easier upgrade: there won't be such a thing as 'timeline -managed by control plane / storcon', cplane just takes the value out of its db -when needed 3) uniformity. It makes storage_controller responsible for retrying notifying -control plane until it succeeds. +start path 2) uniformity. It makes storage_controller responsible for retrying +notifying control plane until it succeeds. -So, cplane `/notify-safekeepers` for the timeline accepts `Configuration` and -updates it in the db if the provided conf generation is higher (the cplane db -should also store generations for this). Similarly to [`/notify-attach`](https://www.notion.so/neondatabase/Storage-Controller-Control-Plane-interface-6de56dd310a043bfa5c2f5564fa98365), it -should update db which makes the call successful, and then try to schedule -`apply_config` if possible, it is ok if not. storage_controller -should rate limit calling the endpoint, but likely this won't be needed, as migration +It is not needed for the control plane to fully know the `Configuration`. It is +enough for it only to be aware of the list of safekeepers in the latest +configuration to supply it to compute, plus associated generation number to +protect from stale update requests and to also pass it to compute. + +So, cplane `/notify-safekeepers` for the timeline can accept JSON like +``` +{ + tenant_id: String, + timeline_id: String, + generation: u32, + safekeepers: Vec<SafekeeperId>, +} +``` +where `SafekeeperId` is +``` +{ + node_id: u64, + host: String +} +``` +In principle `host` is redundant, but may be useful for observability. + +The request updates list of safekeepers in the db if the provided conf +generation is higher (the cplane db should also store generations for this).
+Similarly to +[`/notify-attach`](https://www.notion.so/neondatabase/Storage-Controller-Control-Plane-interface-6de56dd310a043bfa5c2f5564fa98365), +it should update db which makes the call successful, and then try to schedule +`apply_config` if possible, it is ok if not. storage_controller should rate +limit calling the endpoint, but likely this won't be needed, as migration throughput is limited by `pull_timeline`. Timeline (branch) creation in cplane should call storage_controller POST `tenant/:tenant_id/timeline` like it currently does for sharded tenants. -Response should be augmented with `safekeeper_conf: Configuration`. The call -should be retried until succeeds. +Response should be augmented with `safekeepers_generation` and `safekeepers` +fields like described in `/notify-safekeepers` above. Initially (currently) +these fields may be absent; in this case cplane chooses safekeepers on its own +like it currently does. The call should be retried until it succeeds. Timeline deletion and tenant deletion in cplane should call appropriate storage_controller endpoints like it currently does for sharded tenants. The calls should be retried until they succeed. +When compute receives safekeepers list from control plane it needs to know the +generation to check whether it should be updated (note that compute may get +safekeeper list from either cplane or safekeepers). Currently `neon.safekeepers` +GUC is just a comma-separated list of `host:port`. Let's prefix it with +`g#<generation>:` to this end, so it will look like +``` +g#42:safekeeper-0.eu-central-1.aws.neon.tech:6401,safekeeper-2.eu-central-1.aws.neon.tech:6401,safekeeper-1.eu-central-1.aws.neon.tech:6401 +``` + +To summarize, list of cplane changes: +- per tenant -> per timeline safekeepers management and addition of int `safekeeper_generation` field. +- `/notify-safekeepers` endpoint.
+- Branch creation call may return list of safekeepers and when it is + present cplane should adopt it instead of choosing on its own like it does currently. +- `neon.safekeepers` GUC should be prefixed with `g#<generation>:`. + ### storage_controller implementation Current 'load everything on startup and keep in memory' easy design is fine. @@ -360,10 +400,10 @@ source safekeeper might fail, which is not a problem if we are going to decomission the node but leaves garbage otherwise. I'd propose in the first version 1) Don't attempt deletion at all if node status is `offline`. 2) If it failed, just issue warning. -And add PUT `/control/v1/safekeepers/:node_id/scrub` endpoint which would find and -remove garbage timelines for manual use. It will 1) list all timelines on the -safekeeper 2) compare each one against configuration storage: if timeline -doesn't exist at all (had been deleted), it can be deleted. Otherwise, it can +And add PUT `/control/v1/safekeepers/:node_id/scrub` endpoint which would find and +remove garbage timelines for manual use. It will 1) list all timelines on the +safekeeper 2) compare each one against configuration storage: if timeline +doesn't exist at all (had been deleted), it can be deleted. Otherwise, it can be deleted under generation number if node is not member of current generation. Automating this is untrivial; we'd need to register all potential missing @@ -412,8 +452,8 @@ There should be following layers of tests: 3) Since simulation testing injects at relatively high level points (not syscalls), it omits some code, in particular `pull_timeline`. Thus it is better to have basic tests covering whole system as well. Extended version of - `test_restarts_under_load` would do: start background load and do migration - under it, then restart endpoint and check that no reported commits + `test_restarts_under_load` would do: start background load and do migration + under it, then restart endpoint and check that no reported commits had been lost.
I'd also add one more creating classic network split scenario, with one compute talking to AC and another to BD while migration from nodes ABC to ABD happens. @@ -422,35 +462,51 @@ There should be following layers of tests: ## Order of implementation and rollout -Note that +Note that - Control plane parts and integration with it is fully independent from everything else (tests would use simulation and neon_local). +- It is reasonable to make compute <-> safekeepers protocol change + independent of enabling generations. - There is a lot of infra work making storage_controller aware of timelines and safekeepers and its impl/rollout should be separate from migration itself. -- Initially walproposer can just stop working while it observers joint configuration. +- Initially walproposer can just stop working while it observes joint configuration. Such window would be typically very short anyway. +- Obviously we want to test the whole thing thoroughly on staging and only then + gradually enable in prod. -To rollout smoothly, both walproposer and safekeeper should have flag -`configurations_enabled`; when set to false, they would work as currently, i.e. -walproposer is able to commit on whatever safekeeper set it is provided. Until -all timelines are managed by storcon we'd need to use current script to migrate -and update/drop entries in the storage_controller database if it has any. +Let's have the following implementation bits for gradual rollout: +- compute gets `neon.safekeepers_proto_version` flag. + Initially both compute and safekeepers will be able to talk both + versions so that we can delay force restart of them and for + simplicity of rollback in case it is needed. +- storcon gets `--set-safekeepers` config option disabled by + default. Timeline creation request chooses safekeepers + (and returns them in response to cplane) only when it is set to + true.
+- control_plane [see above](#storage_controller---control-plane-interface-and-changes) + prefixes `neon.safekeepers` GUC with generation number. When it is 0 + (or prefix not present at all), walproposer behaves as currently, committing on + the provided safekeeper list -- generations are disabled. + If it is non-zero, it follows this RFC's rules. +- We provide a script for manual migration to storage controller. + It selects timeline(s) from control plane (specified or all of them) db + and calls special import endpoint on storage controller which is very + similar to timeline creation: it inserts into the db, sets + configuration to initial on the safekeepers, calls cplane + `notify-safekeepers`. -Safekeepers would need to be able to talk both current and new protocol version -with compute to reduce number of computes restarted in prod once v2 protocol is -deployed (though before completely switching we'd need to force this). - -Let's have the following rollout order: -- storage_controller becomes aware of safekeepers; -- storage_controller gets timeline creation for new timelines and deletion requests, but - doesn't manage all timelines yet. Migration can be tested on these new timelines. - To keep control plane and storage_controller databases in sync while control - plane still chooses the safekeepers initially (until all timelines are imported - it can choose better), `TimelineCreateRequest` can get optional safekeepers - field with safekeepers chosen by cplane. -- Then we can import all existing timelines from control plane to - storage_controller and gradually enable configurations region by region. +Then the rollout for a region would be: +- Current situation: safekeepers are chosen by control_plane. +- We manually migrate some timelines, test moving them around. +- Then we enable `--set-safekeepers` so that all new timelines + are on storage controller.
+- Finally migrate all existing timelines using the script (no + compute should be speaking old proto version at this point). +Until all timelines are managed by storcon we'd need to use current ad hoc +script to migrate if needed. To keep state clean, all storage controller managed +timelines must be migrated before that, or controller db and configurations +state of safekeepers dropped manually. Very rough implementation order: - Add concept of configurations to safekeepers (including control file), @@ -458,10 +514,10 @@ Very rough implementation order: - Implement walproposer changes, including protocol. - Implement storconn part. Use it in neon_local (and pytest). - Make cplane store safekeepers per timeline instead of per tenant. -- Implement cplane/storcon integration. Route branch creation/deletion +- Implement cplane/storcon integration. Route branch creation/deletion through storcon. Then we can test migration of new branches. -- Finally import existing branches. Then we can drop cplane - safekeeper selection code. Gradually enable configurations at +- Finally import existing branches. Then we can drop cplane + safekeeper selection code. Gradually enable configurations at computes and safekeepers. Before that, all computes must talk only v3 protocol version. diff --git a/docs/rfcs/040-profiling.md b/docs/rfcs/040-profiling.md new file mode 100644 index 0000000000..8da9e50774 --- /dev/null +++ b/docs/rfcs/040-profiling.md @@ -0,0 +1,247 @@ +# CPU and Memory Profiling + +Created 2025-01-12 by Erik Grinaker. + +See also [internal user guide](https://www.notion.so/neondatabase/Storage-CPU-Memory-Profiling-14bf189e004780228ec7d04442742324?pvs=4). + +## Summary + +This document proposes a standard cross-team pattern for CPU and memory profiling across +applications and languages, using the [pprof](https://github.com/google/pprof) profile format. 
+ +It enables both ad hoc profiles via HTTP endpoints, and continuous profiling across the fleet via +[Grafana Cloud Profiles](https://grafana.com/docs/grafana-cloud/monitor-applications/profiles/). +Continuous profiling incurs an overhead of about 0.1% CPU usage and 3% slower heap allocations. + +## Motivation + +CPU and memory profiles are crucial observability tools for understanding performance issues, +resource exhaustion, and resource costs. They allow answering questions like: + +* Why is this process using 100% CPU? +* How do I make this go faster? +* Why did this process run out of memory? +* Why are we paying for all these CPU cores and memory chips? + +Go has [first-class support](https://pkg.go.dev/net/http/pprof) for profiling included in its +standard library, using the [pprof](https://github.com/google/pprof) profile format and associated +tooling. + +This is not the case for Rust and C, where obtaining profiles can be rather cumbersome. It requires +installing and running additional tools like `perf` as root on production nodes, with analysis tools +that can be hard to use and often don't give good results. This is not only annoying, but can also +significantly affect the resolution time of production incidents. + +This proposal will: + +* Provide CPU and heap profiles in pprof format via HTTP API. +* Record continuous profiles in Grafana for aggregate historical analysis. +* Make it easy for anyone to see a flamegraph in less than one minute. +* Be reasonably consistent across teams and services (Rust, Go, C). + +## Non Goals (For Now) + +* [Additional profile types](https://grafana.com/docs/pyroscope/next/configure-client/profile-types/) + like mutexes, locks, goroutines, etc. +* [Runtime trace integration](https://grafana.com/docs/pyroscope/next/configure-client/trace-span-profiles/). +* [Profile-guided optimization](https://en.wikipedia.org/wiki/Profile-guided_optimization). 
+ +## Using Profiles + +Ready-to-use profiles can be obtained using e.g. `curl`. For Rust services: + +``` +$ curl localhost:9898/profile/cpu >profile.pb.gz +``` + +pprof profiles can be explored using the [`pprof`](https://github.com/google/pprof) web UI, which +provides flamegraphs, call graphs, plain text listings, and more: + +``` +$ pprof -http :6060 profile.pb.gz +``` + +Some endpoints (e.g. Rust-based ones) can also generate flamegraph SVGs directly: + +``` +$ curl localhost:9898/profile/cpu?format=svg >profile.svg +$ open profile.svg +``` + +Continuous profiles are available in Grafana under Explore → Profiles → Explore Profiles +(currently only in [staging](https://neonstaging.grafana.net/a/grafana-pyroscope-app/profiles-explorer)). + +## API Requirements + +* HTTP endpoints that return a profile in pprof format (with symbols). + * CPU: records a profile over the request time interval (`seconds` query parameter). + * Memory: returns the current in-use heap allocations. +* Unauthenticated, as it should not expose user data or pose a denial-of-service risk. +* Default sample frequency should not impact service (maximum 5% CPU overhead). +* Linux-compatibility. + +Nice to have: + +* Return flamegraph SVG directly from the HTTP endpoint if requested. +* Configurable sample frequency for CPU profiles. +* Historical heap allocations, by count and bytes. +* macOS-compatibility. + +## Rust Profiling + +[`libs/utils/src/http/endpoint.rs`](https://github.com/neondatabase/neon/blob/8327f68043e692c77f70d6a6dafa463636c01578/libs/utils/src/http/endpoint.rs) +contains ready-to-use HTTP endpoints for CPU and memory profiling: +[`profile_cpu_handler`](https://github.com/neondatabase/neon/blob/8327f68043e692c77f70d6a6dafa463636c01578/libs/utils/src/http/endpoint.rs#L338) and [`profile_heap_handler`](https://github.com/neondatabase/neon/blob/8327f68043e692c77f70d6a6dafa463636c01578/libs/utils/src/http/endpoint.rs#L416).
+ +### CPU + +CPU profiles are provided by [pprof-rs](https://github.com/tikv/pprof-rs) via +[`profile_cpu_handler`](https://github.com/neondatabase/neon/blob/8327f68043e692c77f70d6a6dafa463636c01578/libs/utils/src/http/endpoint.rs#L338). +Expose it unauthenticated at `/profile/cpu`. + +Parameters: + +* `format`: profile output format (`pprof` or `svg`; default `pprof`). +* `seconds`: duration to collect profile over, in seconds (default `5`). +* `frequency`: how often to sample thread stacks, in Hz (default `99`). +* `force`: if `true`, cancel a running profile and start a new one (default `false`). + +Works on Linux and macOS. + +### Memory + +Use the jemalloc allocator via [`tikv-jemallocator`](https://github.com/tikv/jemallocator), +and enable profiling with samples every 2 MB allocated: + +```rust +#[global_allocator] +static GLOBAL: tikv_jemallocator::Jemalloc = tikv_jemallocator::Jemalloc; + +#[allow(non_upper_case_globals)] +#[export_name = "malloc_conf"] +pub static malloc_conf: &[u8] = b"prof:true,prof_active:true,lg_prof_sample:21\0"; +``` + +pprof profiles are generated by +[`jemalloc-pprof`](https://github.com/polarsignals/rust-jemalloc-pprof) via +[`profile_heap_handler`](https://github.com/neondatabase/neon/blob/8327f68043e692c77f70d6a6dafa463636c01578/libs/utils/src/http/endpoint.rs#L416). +Expose it unauthenticated at `/profile/heap`. + +Parameters: + +* `format`: profile output format (`pprof`, `svg`, or `jemalloc`; default `pprof`). + +Works on Linux only, due to [jemalloc limitations](https://github.com/jemalloc/jemalloc/issues/26). + +## Go Profiling + +The Go standard library includes pprof profiling via HTTP API in +[`net/http/pprof`](https://pkg.go.dev/net/http/pprof). Expose it unauthenticated at +`/debug/pprof`. + +Works on Linux and macOS. + +### CPU + +Via `/debug/pprof/profile`. Parameters: + +* `debug`: profile output format (`0` is pprof, `1` or above is plaintext; default `0`). 
+* `seconds`: duration to collect profile over, in seconds (default `30`). + +Does not support a frequency parameter (see [#57488](https://github.com/golang/go/issues/57488)), +and defaults to 100 Hz. A lower frequency can be hardcoded via `SetCPUProfileRate`, but the default +is likely ok (estimated 1% overhead). + +### Memory + +Via `/debug/pprof/heap`. Parameters: + +* `seconds`: take a delta profile over the given duration, in seconds (default `0`). +* `gc`: if `1`, garbage collect before taking profile. + +## C Profiling + +[gperftools](https://github.com/gperftools/gperftools) provides in-process CPU and heap profiling +with pprof output. + +However, continuous profiling of PostgreSQL is expensive (many computes), and has limited value +since we don't own the internals anyway. + +Ad hoc profiling might still be useful, but the compute team considers existing tooling sufficient, +so this is not a priority at the moment. + +## Grafana Continuous Profiling + +[Grafana Alloy](https://grafana.com/docs/alloy/latest/) continually scrapes CPU and memory profiles +across the fleet, and archives them as time series. This can be used to analyze resource usage over +time, either in aggregate or zoomed in to specific events and nodes. + +Profiles are retained for 30 days. Profile ingestion volume for CPU+heap at 60-second intervals +is about 0.5 GB/node/day, or about $0.25/node/day = $7.5/node/month ($0.50/GB). + +It is currently enabled in [staging](https://neonstaging.grafana.net/a/grafana-pyroscope-app/profiles-explorer) +for Pageserver and Safekeeper. + +### Scraping + +* CPU profiling: 59 seconds at 19 Hz every 60 seconds. +* Heap profiling: heap snapshot with 2 MB frequency every 60 seconds. + +There are two main approaches that can be taken for CPU profiles: + +* Continuous low-frequency profiles (e.g. 19 Hz for 60 seconds every 60 seconds). +* Occasional high-frequency profiles (e.g. 99 Hz for 5 seconds every 60 seconds). 
+ +We choose continuous low-frequency profiles where possible. This has a fixed low overhead, instead +of a spiky high overhead. It likely also gives a more representative view of resource usage. +However, a 19 Hz rate gives a minimum resolution of 52.6 ms per sample, which may be larger than the +actual runtime of small functions. Note that Go does not support a frequency parameter, so we must +use a fixed frequency for all profiles via `SetCPUProfileRate()` (default 100 Hz). + +Only one CPU profile can be taken at a time. With continuous profiling, one will always be running. +To allow also taking an ad hoc CPU profile, the Rust endpoint supports a `force` query parameter to +cancel a running profile and start a new one. + +### Overhead + +With Rust: + +* CPU profiles at 19 Hz frequency: 0.1% overhead. +* Heap profiles at 2 MB frequency: 3% allocation overhead. +* Profile call/encoding/symbolization: 20 ms every 60 seconds, or 0.03% of 1 CPU (for Pageserver). +* Profile symbolization caches: 125 MB memory, or 0.4% of 32 GB (for Pageserver). + +Benchmarks with pprof-rs showed that the CPU time for taking a stack trace of a 40-frame stack was +11 µs using the `frame-pointer` feature, and 1.4 µs using `libunwind` with DWARF. `libunwind` saw +frequent seg faults, so we use `frame-pointer` and build binaries with frame pointers (negligible +overhead). + +CPU profiles work by installing an `ITIMER_PROF` for the process, which triggers a `SIGPROF` signal +after a given amount of cumulative CPU time across all CPUs. The signal handler will run for one +of the currently executing threads and take a stack trace. Thus, a 19 Hz profile will take 1 stack +trace every 52.6 ms CPU time -- assuming 11 µs for a stack trace, this is 0.02% overhead, but +likely 0.1% in practice (given e.g. context switches). + +Heap profiles work by probabilistically taking a stack trace on allocations, adjusted for the +allocation size. 
A 1 MB allocation takes about 15 µs in benchmarks, and a stack trace about 1 µs, +so we can estimate that a 2 MB sampling frequency has about 3% allocation overhead -- this is +consistent with benchmarks. This is significantly larger than CPU profiles, but mitigated by the +fact that performance-sensitive code will avoid allocations as far as possible. + +Profile symbolization uses in-memory caches for symbol lookups. These take about 125 MB for +Pageserver. + +## Alternatives Considered + +* eBPF profiles. + * Don't require instrumenting the binary. + * Use fewer resources. + * Can profile in kernel space too. + * Supported by Grafana. + * Less information about stack frames and spans. + * Limited tooling for local analysis. + * Does not support heap profiles. + * Does not work on macOS. + +* [Polar Signals](https://www.polarsignals.com) instead of Grafana. + * We already use Grafana for everything else. Appears good enough. diff --git a/libs/pageserver_api/src/controller_api.rs b/libs/pageserver_api/src/controller_api.rs index 7eb3547183..78e080981a 100644 --- a/libs/pageserver_api/src/controller_api.rs +++ b/libs/pageserver_api/src/controller_api.rs @@ -87,7 +87,7 @@ impl Display for AvailabilityZone { #[derive(Serialize, Deserialize)] pub struct ShardsPreferredAzsRequest { #[serde(flatten)] - pub preferred_az_ids: HashMap, + pub preferred_az_ids: HashMap>, } #[derive(Serialize, Deserialize)] @@ -144,6 +144,8 @@ pub struct NodeDescribeResponse { pub availability: NodeAvailabilityWrapper, pub scheduling: NodeSchedulingPolicy, + pub availability_zone_id: String, + pub listen_http_addr: String, pub listen_http_port: u16, @@ -179,7 +181,6 @@ pub struct TenantDescribeResponseShard { /// specifies some constraints, e.g.
asking it to get off particular node(s) #[derive(Serialize, Deserialize, Debug)] pub struct TenantShardMigrateRequest { - pub tenant_shard_id: TenantShardId, pub node_id: NodeId, } @@ -323,7 +324,7 @@ impl From for String { #[derive(Serialize, Deserialize, Clone, Copy, Eq, PartialEq, Debug)] pub enum SkSchedulingPolicy { Active, - Disabled, + Pause, Decomissioned, } @@ -333,9 +334,13 @@ impl FromStr for SkSchedulingPolicy { fn from_str(s: &str) -> Result { Ok(match s { "active" => Self::Active, - "disabled" => Self::Disabled, + "pause" => Self::Pause, "decomissioned" => Self::Decomissioned, - _ => return Err(anyhow::anyhow!("Unknown scheduling state '{s}'")), + _ => { + return Err(anyhow::anyhow!( + "Unknown scheduling policy '{s}', try active,pause,decomissioned" + )) + } }) } } @@ -345,7 +350,7 @@ impl From for String { use SkSchedulingPolicy::*; match value { Active => "active", - Disabled => "disabled", + Pause => "pause", Decomissioned => "decomissioned", } .to_string() @@ -368,6 +373,16 @@ pub enum PlacementPolicy { Detached, } +impl PlacementPolicy { + pub fn want_secondaries(&self) -> usize { + match self { + PlacementPolicy::Attached(secondary_count) => *secondary_count, + PlacementPolicy::Secondary => 1, + PlacementPolicy::Detached => 0, + } + } +} + #[derive(Serialize, Deserialize, Debug)] pub struct TenantShardMigrateResponse {} @@ -405,8 +420,6 @@ pub struct MetadataHealthListOutdatedResponse { } /// Publicly exposed safekeeper description -/// -/// The `active` flag which we have in the DB is not included on purpose: it is deprecated. 
#[derive(Serialize, Deserialize, Clone)] pub struct SafekeeperDescribeResponse { pub id: NodeId, @@ -422,6 +435,11 @@ pub struct SafekeeperDescribeResponse { pub scheduling_policy: SkSchedulingPolicy, } +#[derive(Serialize, Deserialize, Clone)] +pub struct SafekeeperSchedulingPolicyRequest { + pub scheduling_policy: SkSchedulingPolicy, +} + #[cfg(test)] mod test { use super::*; diff --git a/libs/pageserver_api/src/key.rs b/libs/pageserver_api/src/key.rs index f0cd713c38..dbd45da314 100644 --- a/libs/pageserver_api/src/key.rs +++ b/libs/pageserver_api/src/key.rs @@ -24,7 +24,9 @@ pub struct Key { /// When working with large numbers of Keys in-memory, it is more efficient to handle them as i128 than as /// a struct of fields. -#[derive(Clone, Copy, Hash, PartialEq, Eq, Ord, PartialOrd, Serialize, Deserialize, Debug)] +#[derive( + Clone, Copy, Default, Hash, PartialEq, Eq, Ord, PartialOrd, Serialize, Deserialize, Debug, +)] pub struct CompactKey(i128); /// The storage key size. @@ -706,7 +708,7 @@ pub fn repl_origin_key_range() -> Range { /// Non inherited range for vectored get. pub const NON_INHERITED_RANGE: Range = AUX_FILES_KEY..AUX_FILES_KEY.next(); /// Sparse keyspace range for vectored get. Missing key error will be ignored for this range. -pub const NON_INHERITED_SPARSE_RANGE: Range = Key::metadata_key_range(); +pub const SPARSE_RANGE: Range = Key::metadata_key_range(); impl Key { // AUX_FILES currently stores only data for logical replication (slots etc), and @@ -714,7 +716,42 @@ impl Key { // switch (and generally it likely should be optional), so ignore these. 
#[inline(always)] pub fn is_inherited_key(self) -> bool { - !NON_INHERITED_RANGE.contains(&self) && !NON_INHERITED_SPARSE_RANGE.contains(&self) + if self.is_sparse() { + self.is_inherited_sparse_key() + } else { + !NON_INHERITED_RANGE.contains(&self) + } + } + + #[inline(always)] + pub fn is_sparse(self) -> bool { + self.field1 >= METADATA_KEY_BEGIN_PREFIX && self.field1 < METADATA_KEY_END_PREFIX + } + + /// Check if the key belongs to the inherited keyspace. + fn is_inherited_sparse_key(self) -> bool { + debug_assert!(self.is_sparse()); + self.field1 == RELATION_SIZE_PREFIX + } + + pub fn sparse_non_inherited_keyspace() -> Range { + // The two keys are adjacent; if we have non-adjacent keys in the future, we should return a keyspace + debug_assert_eq!(AUX_KEY_PREFIX + 1, REPL_ORIGIN_KEY_PREFIX); + Key { + field1: AUX_KEY_PREFIX, + field2: 0, + field3: 0, + field4: 0, + field5: 0, + field6: 0, + }..Key { + field1: REPL_ORIGIN_KEY_PREFIX + 1, + field2: 0, + field3: 0, + field4: 0, + field5: 0, + field6: 0, + } } #[inline(always)] diff --git a/libs/pageserver_api/src/models.rs b/libs/pageserver_api/src/models.rs index 39390d7647..c38af9cb80 100644 --- a/libs/pageserver_api/src/models.rs +++ b/libs/pageserver_api/src/models.rs @@ -29,11 +29,10 @@ use utils::{ }; use crate::{ - key::Key, + key::{CompactKey, Key}, reltag::RelTag, shard::{ShardCount, ShardStripeSize, TenantShardId}, }; -use anyhow::bail; use bytes::{Buf, BufMut, Bytes, BytesMut}; /// The state of a tenant in this pageserver.
@@ -272,6 +271,8 @@ pub struct CompactInfoResponse { pub compact_key_range: Option, pub compact_lsn_range: Option, pub sub_compaction: bool, + pub running: bool, + pub job_id: usize, } #[derive(Serialize, Deserialize, Clone)] @@ -1398,6 +1399,8 @@ pub enum PagestreamFeMessage { GetPage(PagestreamGetPageRequest), DbSize(PagestreamDbSizeRequest), GetSlruSegment(PagestreamGetSlruSegmentRequest), + #[cfg(feature = "testing")] + Test(PagestreamTestRequest), } // Wrapped in libpq CopyData @@ -1409,6 +1412,22 @@ pub enum PagestreamBeMessage { Error(PagestreamErrorResponse), DbSize(PagestreamDbSizeResponse), GetSlruSegment(PagestreamGetSlruSegmentResponse), + #[cfg(feature = "testing")] + Test(PagestreamTestResponse), +} + +// Keep in sync with `pagestore_client.h` +#[repr(u8)] +enum PagestreamFeMessageTag { + Exists = 0, + Nblocks = 1, + GetPage = 2, + DbSize = 3, + GetSlruSegment = 4, + /* future tags above this line */ + /// For testing purposes, not available in production. + #[cfg(feature = "testing")] + Test = 99, } // Keep in sync with `pagestore_client.h` @@ -1420,7 +1439,28 @@ enum PagestreamBeMessageTag { Error = 103, DbSize = 104, GetSlruSegment = 105, + /* future tags above this line */ + /// For testing purposes, not available in production. 
+ #[cfg(feature = "testing")] + Test = 199, } + +impl TryFrom for PagestreamFeMessageTag { + type Error = u8; + fn try_from(value: u8) -> Result { + match value { + 0 => Ok(PagestreamFeMessageTag::Exists), + 1 => Ok(PagestreamFeMessageTag::Nblocks), + 2 => Ok(PagestreamFeMessageTag::GetPage), + 3 => Ok(PagestreamFeMessageTag::DbSize), + 4 => Ok(PagestreamFeMessageTag::GetSlruSegment), + #[cfg(feature = "testing")] + 99 => Ok(PagestreamFeMessageTag::Test), + _ => Err(value), + } + } +} + impl TryFrom for PagestreamBeMessageTag { type Error = u8; fn try_from(value: u8) -> Result { @@ -1431,6 +1471,8 @@ impl TryFrom for PagestreamBeMessageTag { 103 => Ok(PagestreamBeMessageTag::Error), 104 => Ok(PagestreamBeMessageTag::DbSize), 105 => Ok(PagestreamBeMessageTag::GetSlruSegment), + #[cfg(feature = "testing")] + 199 => Ok(PagestreamBeMessageTag::Test), _ => Err(value), } } @@ -1548,6 +1590,20 @@ pub struct PagestreamDbSizeResponse { pub db_size: i64, } +#[cfg(feature = "testing")] +#[derive(Debug, PartialEq, Eq, Clone)] +pub struct PagestreamTestRequest { + pub hdr: PagestreamRequest, + pub batch_key: u64, + pub message: String, +} + +#[cfg(feature = "testing")] +#[derive(Debug)] +pub struct PagestreamTestResponse { + pub req: PagestreamTestRequest, +} + // This is a cut-down version of TenantHistorySize from the pageserver crate, omitting fields // that require pageserver-internal types. It is sufficient to get the total size. 
#[derive(Serialize, Deserialize, Debug)] @@ -1567,7 +1623,7 @@ impl PagestreamFeMessage { match self { Self::Exists(req) => { - bytes.put_u8(0); + bytes.put_u8(PagestreamFeMessageTag::Exists as u8); bytes.put_u64(req.hdr.reqid); bytes.put_u64(req.hdr.request_lsn.0); bytes.put_u64(req.hdr.not_modified_since.0); @@ -1578,7 +1634,7 @@ impl PagestreamFeMessage { } Self::Nblocks(req) => { - bytes.put_u8(1); + bytes.put_u8(PagestreamFeMessageTag::Nblocks as u8); bytes.put_u64(req.hdr.reqid); bytes.put_u64(req.hdr.request_lsn.0); bytes.put_u64(req.hdr.not_modified_since.0); @@ -1589,7 +1645,7 @@ impl PagestreamFeMessage { } Self::GetPage(req) => { - bytes.put_u8(2); + bytes.put_u8(PagestreamFeMessageTag::GetPage as u8); bytes.put_u64(req.hdr.reqid); bytes.put_u64(req.hdr.request_lsn.0); bytes.put_u64(req.hdr.not_modified_since.0); @@ -1601,7 +1657,7 @@ impl PagestreamFeMessage { } Self::DbSize(req) => { - bytes.put_u8(3); + bytes.put_u8(PagestreamFeMessageTag::DbSize as u8); bytes.put_u64(req.hdr.reqid); bytes.put_u64(req.hdr.request_lsn.0); bytes.put_u64(req.hdr.not_modified_since.0); @@ -1609,13 +1665,24 @@ impl PagestreamFeMessage { } Self::GetSlruSegment(req) => { - bytes.put_u8(4); + bytes.put_u8(PagestreamFeMessageTag::GetSlruSegment as u8); bytes.put_u64(req.hdr.reqid); bytes.put_u64(req.hdr.request_lsn.0); bytes.put_u64(req.hdr.not_modified_since.0); bytes.put_u8(req.kind); bytes.put_u32(req.segno); } + #[cfg(feature = "testing")] + Self::Test(req) => { + bytes.put_u8(PagestreamFeMessageTag::Test as u8); + bytes.put_u64(req.hdr.reqid); + bytes.put_u64(req.hdr.request_lsn.0); + bytes.put_u64(req.hdr.not_modified_since.0); + bytes.put_u64(req.batch_key); + let message = req.message.as_bytes(); + bytes.put_u64(message.len() as u64); + bytes.put_slice(message); + } } bytes.into() @@ -1643,56 +1710,66 @@ impl PagestreamFeMessage { ), }; - match msg_tag { - 0 => Ok(PagestreamFeMessage::Exists(PagestreamExistsRequest { - hdr: PagestreamRequest { - reqid, - request_lsn, - 
not_modified_since, - }, - rel: RelTag { - spcnode: body.read_u32::()?, + match PagestreamFeMessageTag::try_from(msg_tag) + .map_err(|tag: u8| anyhow::anyhow!("invalid tag {tag}"))? + { + PagestreamFeMessageTag::Exists => { + Ok(PagestreamFeMessage::Exists(PagestreamExistsRequest { + hdr: PagestreamRequest { + reqid, + request_lsn, + not_modified_since, + }, + rel: RelTag { + spcnode: body.read_u32::()?, + dbnode: body.read_u32::()?, + relnode: body.read_u32::()?, + forknum: body.read_u8()?, + }, + })) + } + PagestreamFeMessageTag::Nblocks => { + Ok(PagestreamFeMessage::Nblocks(PagestreamNblocksRequest { + hdr: PagestreamRequest { + reqid, + request_lsn, + not_modified_since, + }, + rel: RelTag { + spcnode: body.read_u32::()?, + dbnode: body.read_u32::()?, + relnode: body.read_u32::()?, + forknum: body.read_u8()?, + }, + })) + } + PagestreamFeMessageTag::GetPage => { + Ok(PagestreamFeMessage::GetPage(PagestreamGetPageRequest { + hdr: PagestreamRequest { + reqid, + request_lsn, + not_modified_since, + }, + rel: RelTag { + spcnode: body.read_u32::()?, + dbnode: body.read_u32::()?, + relnode: body.read_u32::()?, + forknum: body.read_u8()?, + }, + blkno: body.read_u32::()?, + })) + } + PagestreamFeMessageTag::DbSize => { + Ok(PagestreamFeMessage::DbSize(PagestreamDbSizeRequest { + hdr: PagestreamRequest { + reqid, + request_lsn, + not_modified_since, + }, dbnode: body.read_u32::()?, - relnode: body.read_u32::()?, - forknum: body.read_u8()?, - }, - })), - 1 => Ok(PagestreamFeMessage::Nblocks(PagestreamNblocksRequest { - hdr: PagestreamRequest { - reqid, - request_lsn, - not_modified_since, - }, - rel: RelTag { - spcnode: body.read_u32::()?, - dbnode: body.read_u32::()?, - relnode: body.read_u32::()?, - forknum: body.read_u8()?, - }, - })), - 2 => Ok(PagestreamFeMessage::GetPage(PagestreamGetPageRequest { - hdr: PagestreamRequest { - reqid, - request_lsn, - not_modified_since, - }, - rel: RelTag { - spcnode: body.read_u32::()?, - dbnode: body.read_u32::()?, - relnode: 
body.read_u32::()?, - forknum: body.read_u8()?, - }, - blkno: body.read_u32::()?, - })), - 3 => Ok(PagestreamFeMessage::DbSize(PagestreamDbSizeRequest { - hdr: PagestreamRequest { - reqid, - request_lsn, - not_modified_since, - }, - dbnode: body.read_u32::()?, - })), - 4 => Ok(PagestreamFeMessage::GetSlruSegment( + })) + } + PagestreamFeMessageTag::GetSlruSegment => Ok(PagestreamFeMessage::GetSlruSegment( PagestreamGetSlruSegmentRequest { hdr: PagestreamRequest { reqid, @@ -1703,7 +1780,21 @@ impl PagestreamFeMessage { segno: body.read_u32::()?, }, )), - _ => bail!("unknown smgr message tag: {:?}", msg_tag), + #[cfg(feature = "testing")] + PagestreamFeMessageTag::Test => Ok(PagestreamFeMessage::Test(PagestreamTestRequest { + hdr: PagestreamRequest { + reqid, + request_lsn, + not_modified_since, + }, + batch_key: body.read_u64::()?, + message: { + let len = body.read_u64::()?; + let mut buf = vec![0; len as usize]; + body.read_exact(&mut buf)?; + String::from_utf8(buf)? + }, + })), } } } @@ -1746,6 +1837,15 @@ impl PagestreamBeMessage { bytes.put_u32((resp.segment.len() / BLCKSZ as usize) as u32); bytes.put(&resp.segment[..]); } + + #[cfg(feature = "testing")] + Self::Test(resp) => { + bytes.put_u8(Tag::Test as u8); + bytes.put_u64(resp.req.batch_key); + let message = resp.req.message.as_bytes(); + bytes.put_u64(message.len() as u64); + bytes.put_slice(message); + } } } PagestreamProtocolVersion::V3 => { @@ -1814,6 +1914,18 @@ impl PagestreamBeMessage { bytes.put_u32((resp.segment.len() / BLCKSZ as usize) as u32); bytes.put(&resp.segment[..]); } + + #[cfg(feature = "testing")] + Self::Test(resp) => { + bytes.put_u8(Tag::Test as u8); + bytes.put_u64(resp.req.hdr.reqid); + bytes.put_u64(resp.req.hdr.request_lsn.0); + bytes.put_u64(resp.req.hdr.not_modified_since.0); + bytes.put_u64(resp.req.batch_key); + let message = resp.req.message.as_bytes(); + bytes.put_u64(message.len() as u64); + bytes.put_slice(message); + } } } } @@ -1956,6 +2068,28 @@ impl 
PagestreamBeMessage { segment: segment.into(), }) } + #[cfg(feature = "testing")] + Tag::Test => { + let reqid = buf.read_u64::()?; + let request_lsn = Lsn(buf.read_u64::()?); + let not_modified_since = Lsn(buf.read_u64::()?); + let batch_key = buf.read_u64::()?; + let len = buf.read_u64::()?; + let mut msg = vec![0; len as usize]; + buf.read_exact(&mut msg)?; + let message = String::from_utf8(msg)?; + Self::Test(PagestreamTestResponse { + req: PagestreamTestRequest { + hdr: PagestreamRequest { + reqid, + request_lsn, + not_modified_since, + }, + batch_key, + message, + }, + }) + } }; let remaining = buf.into_inner(); if !remaining.is_empty() { @@ -1975,6 +2109,25 @@ impl PagestreamBeMessage { Self::Error(_) => "Error", Self::DbSize(_) => "DbSize", Self::GetSlruSegment(_) => "GetSlruSegment", + #[cfg(feature = "testing")] + Self::Test(_) => "Test", + } + } +} + +#[derive(Debug, Serialize, Deserialize)] +pub struct PageTraceEvent { + pub key: CompactKey, + pub effective_lsn: Lsn, + pub time: SystemTime, +} + +impl Default for PageTraceEvent { + fn default() -> Self { + Self { + key: Default::default(), + effective_lsn: Default::default(), + time: std::time::UNIX_EPOCH, } } } diff --git a/libs/pageserver_api/src/shard.rs b/libs/pageserver_api/src/shard.rs index 4cc0a739e8..e03df02afb 100644 --- a/libs/pageserver_api/src/shard.rs +++ b/libs/pageserver_api/src/shard.rs @@ -31,6 +31,8 @@ //! - In a tenant with 4 shards, each shard has ShardCount(N), ShardNumber(i) where i in 0..N-1 (inclusive), //! and their slugs are 0004, 0104, 0204, and 0304. +use std::hash::{Hash, Hasher}; + use crate::{key::Key, models::ShardParameters}; use postgres_ffi::relfile_utils::INIT_FORKNUM; use serde::{Deserialize, Serialize}; @@ -48,6 +50,23 @@ pub struct ShardIdentity { layout: ShardLayout, } +/// Hash implementation +/// +/// The stripe size cannot change dynamically, so it can be ignored for efficiency reasons. 
+impl Hash for ShardIdentity { + fn hash(&self, state: &mut H) { + let ShardIdentity { + number, + count, + stripe_size: _, + layout: _, + } = self; + + number.0.hash(state); + count.0.hash(state); + } +} + /// Stripe size in number of pages #[derive(Clone, Copy, Serialize, Deserialize, Eq, PartialEq, Debug)] pub struct ShardStripeSize(pub u32); @@ -59,7 +78,7 @@ impl Default for ShardStripeSize { } /// Layout version: for future upgrades where we might change how the key->shard mapping works -#[derive(Clone, Copy, Serialize, Deserialize, Eq, PartialEq, Debug)] +#[derive(Clone, Copy, Serialize, Deserialize, Eq, PartialEq, Hash, Debug)] pub struct ShardLayout(u8); const LAYOUT_V1: ShardLayout = ShardLayout(1); diff --git a/libs/postgres_ffi/src/walrecord.rs b/libs/postgres_ffi/src/walrecord.rs index b32106632a..fce37e2fdd 100644 --- a/libs/postgres_ffi/src/walrecord.rs +++ b/libs/postgres_ffi/src/walrecord.rs @@ -16,7 +16,7 @@ use utils::bin_ser::DeserializeError; use utils::lsn::Lsn; #[repr(C)] -#[derive(Debug, Serialize, Deserialize)] +#[derive(Clone, Debug, Serialize, Deserialize)] pub struct XlMultiXactCreate { pub mid: MultiXactId, /* new MultiXact's ID */ @@ -46,7 +46,7 @@ impl XlMultiXactCreate { } #[repr(C)] -#[derive(Debug, Serialize, Deserialize)] +#[derive(Clone, Debug, Serialize, Deserialize)] pub struct XlMultiXactTruncate { pub oldest_multi_db: Oid, /* to-be-truncated range of multixact offsets */ @@ -72,7 +72,7 @@ impl XlMultiXactTruncate { } #[repr(C)] -#[derive(Debug, Serialize, Deserialize)] +#[derive(Clone, Debug, Serialize, Deserialize)] pub struct XlRelmapUpdate { pub dbid: Oid, /* database ID, or 0 for shared map */ pub tsid: Oid, /* database's tablespace, or pg_global */ @@ -90,7 +90,7 @@ impl XlRelmapUpdate { } #[repr(C)] -#[derive(Debug, Serialize, Deserialize)] +#[derive(Clone, Debug, Serialize, Deserialize)] pub struct XlReploriginDrop { pub node_id: RepOriginId, } @@ -104,7 +104,7 @@ impl XlReploriginDrop { } #[repr(C)] -#[derive(Debug, 
Serialize, Deserialize)] +#[derive(Clone, Debug, Serialize, Deserialize)] pub struct XlReploriginSet { pub remote_lsn: Lsn, pub node_id: RepOriginId, @@ -911,7 +911,7 @@ impl XlSmgrCreate { } #[repr(C)] -#[derive(Debug, Serialize, Deserialize)] +#[derive(Clone, Debug, Serialize, Deserialize)] pub struct XlSmgrTruncate { pub blkno: BlockNumber, pub rnode: RelFileNode, @@ -984,7 +984,7 @@ impl XlDropDatabase { /// xl_xact_parsed_abort structs in PostgreSQL, but we use the same /// struct for commits and aborts. /// -#[derive(Debug, Serialize, Deserialize)] +#[derive(Clone, Debug, Serialize, Deserialize)] pub struct XlXactParsedRecord { pub xid: TransactionId, pub info: u8, diff --git a/libs/pq_proto/src/lib.rs b/libs/pq_proto/src/lib.rs index 94714359a3..50b2c69d24 100644 --- a/libs/pq_proto/src/lib.rs +++ b/libs/pq_proto/src/lib.rs @@ -44,7 +44,7 @@ pub struct ProtocolVersion(u32); impl ProtocolVersion { pub const fn new(major: u16, minor: u16) -> Self { - Self((major as u32) << 16 | minor as u32) + Self(((major as u32) << 16) | minor as u32) } pub const fn minor(self) -> u16 { self.0 as u16 diff --git a/libs/remote_storage/src/config.rs b/libs/remote_storage/src/config.rs index 49b1d9dc87..dae141bf77 100644 --- a/libs/remote_storage/src/config.rs +++ b/libs/remote_storage/src/config.rs @@ -43,6 +43,17 @@ impl RemoteStorageKind { } } +impl RemoteStorageConfig { + /// Helper to fetch the configured concurrency limit. + pub fn concurrency_limit(&self) -> Option { + match &self.storage { + RemoteStorageKind::LocalFs { .. 
} => None, + RemoteStorageKind::AwsS3(c) => Some(c.concurrency_limit.into()), + RemoteStorageKind::AzureContainer(c) => Some(c.concurrency_limit.into()), + } + } +} + fn default_timeout() -> Duration { RemoteStorageConfig::DEFAULT_TIMEOUT } diff --git a/libs/safekeeper_api/Cargo.toml b/libs/safekeeper_api/Cargo.toml index 4234ec6779..6b72ace019 100644 --- a/libs/safekeeper_api/Cargo.toml +++ b/libs/safekeeper_api/Cargo.toml @@ -5,9 +5,12 @@ edition.workspace = true license.workspace = true [dependencies] +anyhow.workspace = true const_format.workspace = true serde.workspace = true +serde_json.workspace = true postgres_ffi.workspace = true pq_proto.workspace = true tokio.workspace = true utils.workspace = true +pageserver_api.workspace = true diff --git a/libs/safekeeper_api/src/lib.rs b/libs/safekeeper_api/src/lib.rs index be6923aca9..fa86523ad7 100644 --- a/libs/safekeeper_api/src/lib.rs +++ b/libs/safekeeper_api/src/lib.rs @@ -4,12 +4,15 @@ use const_format::formatcp; use pq_proto::SystemId; use serde::{Deserialize, Serialize}; +pub mod membership; /// Public API types pub mod models; /// Consensus logical timestamp. Note: it is a part of sk control file. pub type Term = u64; -pub const INVALID_TERM: Term = 0; +/// With this term timeline is created initially. It +/// is a normal term except wp is never elected with it. +pub const INITIAL_TERM: Term = 0; /// Information about Postgres. Safekeeper gets it once and then verifies all /// further connections from computes match. Note: it is a part of sk control diff --git a/libs/safekeeper_api/src/membership.rs b/libs/safekeeper_api/src/membership.rs new file mode 100644 index 0000000000..a39fda526f --- /dev/null +++ b/libs/safekeeper_api/src/membership.rs @@ -0,0 +1,166 @@ +//! Types defining safekeeper membership, see +//! rfcs/035-safekeeper-dynamic-membership-change.md +//! for details. 
+ +use std::{collections::HashSet, fmt::Display}; + +use anyhow; +use anyhow::bail; +use serde::{Deserialize, Serialize}; +use utils::id::NodeId; + +/// Number uniquely identifying safekeeper configuration. +/// Note: it is a part of sk control file. +pub type Generation = u32; +/// 1 is the first valid generation, 0 is used as +/// a placeholder before we fully migrate to generations. +pub const INVALID_GENERATION: Generation = 0; +pub const INITIAL_GENERATION: Generation = 1; + +/// Membership is defined by ids so e.g. walproposer uses them to figure out +/// quorums, but we also carry host and port to give wp idea where to connect. +#[derive(Debug, Clone, Serialize, Deserialize, PartialEq)] +pub struct SafekeeperId { + pub id: NodeId, + pub host: String, + /// We include here only port for computes -- that is, pg protocol tenant + /// only port, or wide pg protocol port if the former is not configured. + pub pg_port: u16, +} + +impl Display for SafekeeperId { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + write!(f, "[id={}, ep={}:{}]", self.id, self.host, self.pg_port) + } +} + +/// Set of safekeepers. 
+#[derive(Debug, Clone, Serialize, Deserialize, PartialEq)] +#[serde(transparent)] +pub struct MemberSet { + pub members: Vec, +} + +impl MemberSet { + pub fn empty() -> Self { + MemberSet { + members: Vec::new(), + } + } + + pub fn new(members: Vec) -> anyhow::Result { + let hs: HashSet = HashSet::from_iter(members.iter().map(|sk| sk.id)); + if hs.len() != members.len() { + bail!("duplicate safekeeper id in the set {:?}", members); + } + Ok(MemberSet { members }) + } + + pub fn contains(&self, sk: &SafekeeperId) -> bool { + self.members.iter().any(|m| m.id == sk.id) + } + + pub fn add(&mut self, sk: SafekeeperId) -> anyhow::Result<()> { + if self.contains(&sk) { + bail!(format!( + "sk {} is already member of the set {}", + sk.id, self + )); + } + self.members.push(sk); + Ok(()) + } +} + +impl Display for MemberSet { + /// Display as a comma separated list of members. + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + let sks_str = self + .members + .iter() + .map(|m| m.to_string()) + .collect::>(); + write!(f, "({})", sks_str.join(", ")) + } +} + +/// Safekeeper membership configuration. +/// Note: it is a part of both control file and http API. +#[derive(Debug, Clone, Serialize, Deserialize, PartialEq)] +pub struct Configuration { + /// Unique id. + pub generation: Generation, + /// Current members of the configuration. + pub members: MemberSet, + /// Some means it is a joint conf. + pub new_members: Option, +} + +impl Configuration { + /// Used for pre-generations timelines, will be removed eventually. 
+ pub fn empty() -> Self { + Configuration { + generation: INVALID_GENERATION, + members: MemberSet::empty(), + new_members: None, + } + } +} + +impl Display for Configuration { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + write!( + f, + "gen={}, members={}, new_members={}", + self.generation, + self.members, + self.new_members + .as_ref() + .map(ToString::to_string) + .unwrap_or(String::from("none")) + ) + } +} + +#[cfg(test)] +mod tests { + use super::{MemberSet, SafekeeperId}; + use utils::id::NodeId; + + #[test] + fn test_member_set() { + let mut members = MemberSet::empty(); + members + .add(SafekeeperId { + id: NodeId(42), + host: String::from("lala.org"), + pg_port: 5432, + }) + .unwrap(); + + members + .add(SafekeeperId { + id: NodeId(42), + host: String::from("lala.org"), + pg_port: 5432, + }) + .expect_err("duplicate must not be allowed"); + + members + .add(SafekeeperId { + id: NodeId(43), + host: String::from("bubu.org"), + pg_port: 5432, + }) + .unwrap(); + + println!("members: {}", members); + + let j = serde_json::to_string(&members).expect("failed to serialize"); + println!("members json: {}", j); + assert_eq!( + j, + r#"[{"id":42,"host":"lala.org","pg_port":5432},{"id":43,"host":"bubu.org","pg_port":5432}]"# + ); + } +} diff --git a/libs/safekeeper_api/src/models.rs b/libs/safekeeper_api/src/models.rs index 3e424a792c..b5fa903820 100644 --- a/libs/safekeeper_api/src/models.rs +++ b/libs/safekeeper_api/src/models.rs @@ -1,5 +1,6 @@ //! Types used in safekeeper http API. Many of them are also reused internally. 
+use pageserver_api::shard::ShardIdentity; use postgres_ffi::TimestampTz; use serde::{Deserialize, Serialize}; use std::net::SocketAddr; @@ -11,7 +12,7 @@ use utils::{ pageserver_feedback::PageserverFeedback, }; -use crate::{ServerInfo, Term}; +use crate::{membership::Configuration, ServerInfo, Term}; #[derive(Debug, Serialize)] pub struct SafekeeperStatus { @@ -22,13 +23,16 @@ pub struct SafekeeperStatus { pub struct TimelineCreateRequest { pub tenant_id: TenantId, pub timeline_id: TimelineId, - pub peer_ids: Option>, + pub mconf: Configuration, pub pg_version: u32, pub system_id: Option, + // By default WAL_SEGMENT_SIZE pub wal_seg_size: Option, - pub commit_lsn: Lsn, - // If not passed, it is assigned to the beginning of commit_lsn segment. - pub local_start_lsn: Option, + pub start_lsn: Lsn, + // Normal creation should omit this field (start_lsn initializes all LSNs). + // However, we allow specifying custom value higher than start_lsn for + // manual recovery case, see test_s3_wal_replay. + pub commit_lsn: Option, } /// Same as TermLsn, but serializes LSN using display serializer @@ -143,7 +147,13 @@ pub type ConnectionId = u32; /// Serialize is used only for json'ing in API response. Also used internally. 
#[derive(Debug, Clone, Serialize, Deserialize)] -pub struct WalSenderState { +pub enum WalSenderState { + Vanilla(VanillaWalSenderState), + Interpreted(InterpretedWalSenderState), +} + +#[derive(Debug, Clone, Serialize, Deserialize)] +pub struct VanillaWalSenderState { pub ttid: TenantTimelineId, pub addr: SocketAddr, pub conn_id: ConnectionId, @@ -152,6 +162,17 @@ pub struct WalSenderState { pub feedback: ReplicationFeedback, } +#[derive(Debug, Clone, Serialize, Deserialize)] +pub struct InterpretedWalSenderState { + pub ttid: TenantTimelineId, + pub shard: ShardIdentity, + pub addr: SocketAddr, + pub conn_id: ConnectionId, + // postgres application_name + pub appname: Option, + pub feedback: ReplicationFeedback, +} + #[derive(Debug, Clone, Serialize, Deserialize)] pub struct WalReceiverState { /// None means it is recovery initiated by us (this safekeeper). @@ -172,6 +193,7 @@ pub enum WalReceiverStatus { pub struct TimelineStatus { pub tenant_id: TenantId, pub timeline_id: TimelineId, + pub mconf: Configuration, pub acceptor_state: AcceptorStateStatus, pub pg_info: ServerInfo, pub flush_lsn: Lsn, @@ -186,6 +208,20 @@ pub struct TimelineStatus { pub walreceivers: Vec, } +/// Request to switch membership configuration. +#[derive(Serialize, Deserialize)] +#[serde(transparent)] +pub struct TimelineMembershipSwitchRequest { + pub mconf: Configuration, +} + +/// In response both previous and current configuration are sent. 
+#[derive(Serialize, Deserialize)] +pub struct TimelineMembershipSwitchResponse { + pub previous_conf: Configuration, + pub current_conf: Configuration, +} + fn lsn_invalid() -> Lsn { Lsn::INVALID } diff --git a/libs/tracing-utils/src/lib.rs b/libs/tracing-utils/src/lib.rs index c4aad53cdb..818d759eac 100644 --- a/libs/tracing-utils/src/lib.rs +++ b/libs/tracing-utils/src/lib.rs @@ -38,7 +38,6 @@ pub mod http; use opentelemetry::trace::TracerProvider; use opentelemetry::KeyValue; -use opentelemetry_sdk::Resource; use tracing::Subscriber; use tracing_subscriber::registry::LookupSpan; use tracing_subscriber::Layer; @@ -121,7 +120,10 @@ where S: Subscriber + for<'span> LookupSpan<'span>, { // Sets up exporter from the OTEL_EXPORTER_* environment variables. - let exporter = opentelemetry_otlp::new_exporter().http(); + let exporter = opentelemetry_otlp::SpanExporter::builder() + .with_http() + .build() + .expect("could not initialize opentelemetry exporter"); // TODO: opentelemetry::global::set_error_handler() with custom handler that // bypasses default tracing layers, but logs regular looking log @@ -132,17 +134,13 @@ where opentelemetry_sdk::propagation::TraceContextPropagator::new(), ); - let tracer = opentelemetry_otlp::new_pipeline() - .tracing() - .with_exporter(exporter) - .with_trace_config(opentelemetry_sdk::trace::Config::default().with_resource( - Resource::new(vec![KeyValue::new( - opentelemetry_semantic_conventions::resource::SERVICE_NAME, - service_name, - )]), - )) - .install_batch(opentelemetry_sdk::runtime::Tokio) - .expect("could not initialize opentelemetry exporter") + let tracer = opentelemetry_sdk::trace::TracerProvider::builder() + .with_batch_exporter(exporter, opentelemetry_sdk::runtime::Tokio) + .with_resource(opentelemetry_sdk::Resource::new(vec![KeyValue::new( + opentelemetry_semantic_conventions::resource::SERVICE_NAME, + service_name, + )])) + .build() .tracer("global"); tracing_opentelemetry::layer().with_tracer(tracer) diff --git 
a/libs/utils/Cargo.toml b/libs/utils/Cargo.toml index 02bf77760a..edb451a02c 100644 --- a/libs/utils/Cargo.toml +++ b/libs/utils/Cargo.toml @@ -26,6 +26,7 @@ git-version.workspace = true hex = { workspace = true, features = ["serde"] } humantime.workspace = true hyper0 = { workspace = true, features = ["full"] } +inferno.workspace = true itertools.workspace = true fail.workspace = true futures = { workspace = true } diff --git a/libs/utils/src/generation.rs b/libs/utils/src/generation.rs index 5970836033..44565ee6a2 100644 --- a/libs/utils/src/generation.rs +++ b/libs/utils/src/generation.rs @@ -112,9 +112,9 @@ impl Serialize for Generation { // We should never be asked to serialize a None. Structures // that include an optional generation should convert None to an // Option::None - Err(serde::ser::Error::custom( - "Tried to serialize invalid generation ({self})", - )) + Err(serde::ser::Error::custom(format!( + "Tried to serialize invalid generation ({self:?})" + ))) } } } diff --git a/libs/utils/src/guard_arc_swap.rs b/libs/utils/src/guard_arc_swap.rs new file mode 100644 index 0000000000..cec5202460 --- /dev/null +++ b/libs/utils/src/guard_arc_swap.rs @@ -0,0 +1,54 @@ +//! A wrapper around `ArcSwap` that ensures there is only one writer at a time and writes +//! don't block reads. 
+ +use arc_swap::ArcSwap; +use std::sync::Arc; +use tokio::sync::TryLockError; + +pub struct GuardArcSwap { + inner: ArcSwap, + guard: tokio::sync::Mutex<()>, +} + +pub struct Guard<'a, T> { + _guard: tokio::sync::MutexGuard<'a, ()>, + inner: &'a ArcSwap, +} + +impl GuardArcSwap { + pub fn new(inner: T) -> Self { + Self { + inner: ArcSwap::new(Arc::new(inner)), + guard: tokio::sync::Mutex::new(()), + } + } + + pub fn read(&self) -> Arc { + self.inner.load_full() + } + + pub async fn write_guard(&self) -> Guard<'_, T> { + Guard { + _guard: self.guard.lock().await, + inner: &self.inner, + } + } + + pub fn try_write_guard(&self) -> Result, TryLockError> { + let guard = self.guard.try_lock()?; + Ok(Guard { + _guard: guard, + inner: &self.inner, + }) + } +} + +impl Guard<'_, T> { + pub fn read(&self) -> Arc { + self.inner.load_full() + } + + pub fn write(&mut self, value: T) { + self.inner.store(Arc::new(value)); + } +} diff --git a/libs/utils/src/http/endpoint.rs b/libs/utils/src/http/endpoint.rs index 9b37b69939..9f38373ca0 100644 --- a/libs/utils/src/http/endpoint.rs +++ b/libs/utils/src/http/endpoint.rs @@ -15,7 +15,7 @@ use once_cell::sync::Lazy; use regex::Regex; use routerify::ext::RequestExt; use routerify::{Middleware, RequestInfo, Router, RouterBuilder}; -use tokio::sync::{mpsc, Mutex}; +use tokio::sync::{mpsc, Mutex, Notify}; use tokio_stream::wrappers::ReceiverStream; use tokio_util::io::ReaderStream; use tracing::{debug, info, info_span, warn, Instrument}; @@ -350,33 +350,53 @@ pub async fn profile_cpu_handler(req: Request) -> Result, A }; let seconds = match parse_query_param(&req, "seconds")? { None => 5, - Some(seconds @ 1..=30) => seconds, - Some(_) => return Err(ApiError::BadRequest(anyhow!("duration must be 1-30 secs"))), + Some(seconds @ 1..=60) => seconds, + Some(_) => return Err(ApiError::BadRequest(anyhow!("duration must be 1-60 secs"))), }; let frequency_hz = match parse_query_param(&req, "frequency")? { None => 99, Some(1001..) 
=> return Err(ApiError::BadRequest(anyhow!("frequency must be <=1000 Hz"))), Some(frequency) => frequency, }; - - // Only allow one profiler at a time. - static PROFILE_LOCK: Lazy> = Lazy::new(|| Mutex::new(())); - let _lock = PROFILE_LOCK - .try_lock() - .map_err(|_| ApiError::Conflict("profiler already running".into()))?; + let force: bool = parse_query_param(&req, "force")?.unwrap_or_default(); // Take the profile. - let report = tokio::task::spawn_blocking(move || { + static PROFILE_LOCK: Lazy> = Lazy::new(|| Mutex::new(())); + static PROFILE_CANCEL: Lazy = Lazy::new(Notify::new); + + let report = { + // Only allow one profiler at a time. If force is true, cancel a running profile (e.g. a + // Grafana continuous profile). We use a try_lock() loop when cancelling instead of waiting + // for a lock(), to avoid races where the notify isn't currently awaited. + let _lock = loop { + match PROFILE_LOCK.try_lock() { + Ok(lock) => break lock, + Err(_) if force => PROFILE_CANCEL.notify_waiters(), + Err(_) => { + return Err(ApiError::Conflict( + "profiler already running (use ?force=true to cancel it)".into(), + )) + } + } + tokio::time::sleep(Duration::from_millis(1)).await; // don't busy-wait + }; + let guard = ProfilerGuardBuilder::default() .frequency(frequency_hz) .blocklist(&["libc", "libgcc", "pthread", "vdso"]) - .build()?; - std::thread::sleep(Duration::from_secs(seconds)); - guard.report().build() - }) - .await - .map_err(|join_err| ApiError::InternalServerError(join_err.into()))? - .map_err(|pprof_err| ApiError::InternalServerError(pprof_err.into()))?; + .build() + .map_err(|err| ApiError::InternalServerError(err.into()))?; + + tokio::select! { + _ = tokio::time::sleep(Duration::from_secs(seconds)) => {}, + _ = PROFILE_CANCEL.notified() => {}, + }; + + guard + .report() + .build() + .map_err(|err| ApiError::InternalServerError(err.into()))? + }; // Return the report in the requested format. 
match format { @@ -417,6 +437,7 @@ pub async fn profile_heap_handler(req: Request) -> Result, enum Format { Jemalloc, Pprof, + Svg, } // Parameters. @@ -424,9 +445,24 @@ pub async fn profile_heap_handler(req: Request) -> Result, None => Format::Pprof, Some("jemalloc") => Format::Jemalloc, Some("pprof") => Format::Pprof, + Some("svg") => Format::Svg, Some(format) => return Err(ApiError::BadRequest(anyhow!("invalid format {format}"))), }; + // Functions and mappings to strip when symbolizing pprof profiles. If true, + // also remove child frames. + static STRIP_FUNCTIONS: Lazy> = Lazy::new(|| { + vec![ + (Regex::new("^__rust").unwrap(), false), + (Regex::new("^_start$").unwrap(), false), + (Regex::new("^irallocx_prof").unwrap(), true), + (Regex::new("^prof_alloc_prep").unwrap(), true), + (Regex::new("^std::rt::lang_start").unwrap(), false), + (Regex::new("^std::sys::backtrace::__rust").unwrap(), false), + ] + }); + const STRIP_MAPPINGS: &[&str] = &["libc", "libgcc", "pthread", "vdso"]; + // Obtain profiler handle. let mut prof_ctl = jemalloc_pprof::PROF_CTL .as_ref() @@ -464,24 +500,9 @@ pub async fn profile_heap_handler(req: Request) -> Result, // Symbolize the profile. // TODO: consider moving this upstream to jemalloc_pprof and avoiding the // serialization roundtrip. - static STRIP_FUNCTIONS: Lazy> = Lazy::new(|| { - // Functions to strip from profiles. If true, also remove child frames. 
- vec![ - (Regex::new("^__rust").unwrap(), false), - (Regex::new("^_start$").unwrap(), false), - (Regex::new("^irallocx_prof").unwrap(), true), - (Regex::new("^prof_alloc_prep").unwrap(), true), - (Regex::new("^std::rt::lang_start").unwrap(), false), - (Regex::new("^std::sys::backtrace::__rust").unwrap(), false), - ] - }); let profile = pprof::decode(&bytes)?; let profile = pprof::symbolize(profile)?; - let profile = pprof::strip_locations( - profile, - &["libc", "libgcc", "pthread", "vdso"], - &STRIP_FUNCTIONS, - ); + let profile = pprof::strip_locations(profile, STRIP_MAPPINGS, &STRIP_FUNCTIONS); pprof::encode(&profile) }) .await @@ -494,6 +515,27 @@ pub async fn profile_heap_handler(req: Request) -> Result, .body(Body::from(data)) .map_err(|err| ApiError::InternalServerError(err.into())) } + + Format::Svg => { + let body = tokio::task::spawn_blocking(move || { + let bytes = prof_ctl.dump_pprof()?; + let profile = pprof::decode(&bytes)?; + let profile = pprof::symbolize(profile)?; + let profile = pprof::strip_locations(profile, STRIP_MAPPINGS, &STRIP_FUNCTIONS); + let mut opts = inferno::flamegraph::Options::default(); + opts.title = "Heap inuse".to_string(); + opts.count_name = "bytes".to_string(); + pprof::flamegraph(profile, &mut opts) + }) + .await + .map_err(|join_err| ApiError::InternalServerError(join_err.into()))? + .map_err(ApiError::InternalServerError)?; + Response::builder() + .status(200) + .header(CONTENT_TYPE, "image/svg+xml") + .body(Body::from(body)) + .map_err(|err| ApiError::InternalServerError(err.into())) + } } } diff --git a/libs/utils/src/lib.rs b/libs/utils/src/lib.rs index 2c56dd750f..1fb18e9e9a 100644 --- a/libs/utils/src/lib.rs +++ b/libs/utils/src/lib.rs @@ -98,6 +98,8 @@ pub mod try_rcu; pub mod pprof; +pub mod guard_arc_swap; + // Re-export used in macro. Avoids adding git-version as dep in target crates. 
#[doc(hidden)] pub use git_version; diff --git a/libs/utils/src/lsn.rs b/libs/utils/src/lsn.rs index f188165600..c874fa30ff 100644 --- a/libs/utils/src/lsn.rs +++ b/libs/utils/src/lsn.rs @@ -260,7 +260,7 @@ impl FromStr for Lsn { { let left_num = u32::from_str_radix(left, 16).map_err(|_| LsnParseError)?; let right_num = u32::from_str_radix(right, 16).map_err(|_| LsnParseError)?; - Ok(Lsn((left_num as u64) << 32 | right_num as u64)) + Ok(Lsn(((left_num as u64) << 32) | right_num as u64)) } else { Err(LsnParseError) } diff --git a/libs/utils/src/pprof.rs b/libs/utils/src/pprof.rs index 90910897bf..dd57f9ed4b 100644 --- a/libs/utils/src/pprof.rs +++ b/libs/utils/src/pprof.rs @@ -1,8 +1,9 @@ +use anyhow::bail; use flate2::write::{GzDecoder, GzEncoder}; use flate2::Compression; use itertools::Itertools as _; use once_cell::sync::Lazy; -use pprof::protos::{Function, Line, Message as _, Profile}; +use pprof::protos::{Function, Line, Location, Message as _, Profile}; use regex::Regex; use std::borrow::Cow; @@ -188,3 +189,59 @@ pub fn strip_locations( profile } + +/// Generates an SVG flamegraph from a symbolized pprof profile. +pub fn flamegraph( + profile: Profile, + opts: &mut inferno::flamegraph::Options, +) -> anyhow::Result> { + if profile.mapping.iter().any(|m| !m.has_functions) { + bail!("profile not symbolized"); + } + + // Index locations, functions, and strings. + let locations: HashMap = + profile.location.into_iter().map(|l| (l.id, l)).collect(); + let functions: HashMap = + profile.function.into_iter().map(|f| (f.id, f)).collect(); + let strings = profile.string_table; + + // Resolve stacks as function names, and sum sample values per stack. Also reverse the stack, + // since inferno expects it bottom-up. 
+ let mut stacks: HashMap, i64> = HashMap::new(); + for sample in profile.sample { + let mut stack = Vec::with_capacity(sample.location_id.len()); + for location in sample.location_id.into_iter().rev() { + let Some(location) = locations.get(&location) else { + bail!("missing location {location}"); + }; + for line in location.line.iter().rev() { + let Some(function) = functions.get(&line.function_id) else { + bail!("missing function {}", line.function_id); + }; + let Some(name) = strings.get(function.name as usize) else { + bail!("missing string {}", function.name); + }; + stack.push(name.as_str()); + } + } + let Some(&value) = sample.value.first() else { + bail!("missing value"); + }; + *stacks.entry(stack).or_default() += value; + } + + // Construct stack lines for inferno. + let lines = stacks + .into_iter() + .map(|(stack, value)| (stack.into_iter().join(";"), value)) + .map(|(stack, value)| format!("{stack} {value}")) + .sorted() + .collect_vec(); + + // Construct the flamegraph. + let mut bytes = Vec::new(); + let lines = lines.iter().map(|line| line.as_str()); + inferno::flamegraph::from_lines(opts, lines, &mut bytes)?; + Ok(bytes) +} diff --git a/libs/utils/src/sync/spsc_fold.rs b/libs/utils/src/sync/spsc_fold.rs index b44f766ef0..0cab291d51 100644 --- a/libs/utils/src/sync/spsc_fold.rs +++ b/libs/utils/src/sync/spsc_fold.rs @@ -96,7 +96,11 @@ impl Sender { } } State::SenderWaitsForReceiverToConsume(_data) => { - // Really, we shouldn't be polled until receiver has consumed and wakes us. + // SAFETY: send is single threaded due to `&mut self` requirement, + // therefore register is not concurrent. 
+ unsafe { + self.state.wake_sender.register(cx.waker()); + } Poll::Pending } State::ReceiverGone => Poll::Ready(Err(SendError::ReceiverGone)), @@ -449,4 +453,38 @@ mod tests { let err = recv_task.await.unwrap().expect_err("should error"); assert!(matches!(err, RecvError::SenderGone)); } + + #[tokio::test(start_paused = true)] + async fn test_receiver_drop_while_waiting_for_receiver_to_consume_unblocks_sender() { + let (mut sender, receiver) = channel(); + + let state = receiver.state.clone(); + + sender.send((), |_, _| unreachable!()).await.unwrap(); + + assert!(matches!(&*state.value.lock().unwrap(), &State::HasData(_))); + + let unmergeable = sender.send((), |_, _| Err(())); + let mut unmergeable = std::pin::pin!(unmergeable); + tokio::select! { + _ = tokio::time::sleep(FOREVER) => {}, + _ = &mut unmergeable => { + panic!("unmergeable should not complete"); + }, + } + + assert!(matches!( + &*state.value.lock().unwrap(), + &State::SenderWaitsForReceiverToConsume(_) + )); + + drop(receiver); + + assert!(matches!( + &*state.value.lock().unwrap(), + &State::ReceiverGone + )); + + unmergeable.await.unwrap_err(); + } } diff --git a/libs/wal_decoder/Cargo.toml b/libs/wal_decoder/Cargo.toml index 8fac4e38ca..09c4afb18a 100644 --- a/libs/wal_decoder/Cargo.toml +++ b/libs/wal_decoder/Cargo.toml @@ -24,3 +24,18 @@ workspace_hack = { version = "0.1", path = "../../workspace_hack" } [build-dependencies] tonic-build.workspace = true + +[dev-dependencies] +criterion.workspace = true +camino.workspace = true +camino-tempfile.workspace = true +remote_storage.workspace = true +tokio-util.workspace = true +serde_json.workspace = true +futures.workspace = true +tikv-jemallocator.workspace = true +pprof.workspace = true + +[[bench]] +name = "bench_interpret_wal" +harness = false diff --git a/libs/wal_decoder/benches/README.md b/libs/wal_decoder/benches/README.md new file mode 100644 index 0000000000..14885afecf --- /dev/null +++ b/libs/wal_decoder/benches/README.md @@ -0,0 +1,34 @@ 
+## WAL Decoding and Interpretation Benchmarks + +Note that these benchmarks pull WAL from a public bucket in S3 +as a preparation step. Hence, you need a way to authenticate with AWS. +You can achieve this by copying the `~/.aws/config` file from +the AWS SSO Notion page and exporting `AWS_PROFILE=dev` when invoking +the benchmarks. + +To run benchmarks: + +```sh +aws sso login --profile dev + +# All benchmarks. +AWS_PROFILE=dev cargo bench --package wal_decoder + +# Specific file. +AWS_PROFILE=dev cargo bench --package wal_decoder --bench bench_interpret_wal + +# Specific benchmark. +AWS_PROFILE=dev cargo bench --package wal_decoder --bench bench_interpret_wal unsharded + +# List available benchmarks. +cargo bench --package wal_decoder --benches -- --list + +# Generate flamegraph profiles using pprof-rs, profiling for 10 seconds. +# Output in target/criterion/*/profile/flamegraph.svg. +AWS_PROFILE=dev cargo bench --package wal_decoder --bench bench_interpret_wal unsharded -- --profile-time 10 +``` + +Additional charts and statistics are available in `target/criterion/report/index.html`. + +Benchmarks are automatically compared against the previous run. To compare against other runs, see +`--baseline` and `--save-baseline`. 
diff --git a/libs/wal_decoder/benches/bench_interpret_wal.rs b/libs/wal_decoder/benches/bench_interpret_wal.rs new file mode 100644 index 0000000000..846904cf87 --- /dev/null +++ b/libs/wal_decoder/benches/bench_interpret_wal.rs @@ -0,0 +1,250 @@ +use anyhow::Context; +use criterion::{criterion_group, criterion_main, Criterion}; +use futures::{stream::FuturesUnordered, StreamExt}; +use pageserver_api::shard::{ShardIdentity, ShardStripeSize}; +use postgres_ffi::{waldecoder::WalStreamDecoder, MAX_SEND_SIZE, WAL_SEGMENT_SIZE}; +use pprof::criterion::{Output, PProfProfiler}; +use serde::Deserialize; +use std::{env, num::NonZeroUsize, sync::Arc}; + +use camino::{Utf8Path, Utf8PathBuf}; +use camino_tempfile::Utf8TempDir; +use remote_storage::{ + DownloadOpts, GenericRemoteStorage, ListingMode, RemoteStorageConfig, RemoteStorageKind, + S3Config, +}; +use tokio_util::sync::CancellationToken; +use utils::{ + lsn::Lsn, + shard::{ShardCount, ShardNumber}, +}; +use wal_decoder::models::InterpretedWalRecord; + +const S3_BUCKET: &str = "neon-github-public-dev"; +const S3_REGION: &str = "eu-central-1"; +const BUCKET_PREFIX: &str = "wal-snapshots/bulk-insert/"; +const METADATA_FILENAME: &str = "metadata.json"; + +/// Use jemalloc, and configure it to sample allocations for profiles every 1 MB. +/// This mirrors the configuration in bin/safekeeper.rs. 
+#[global_allocator] +static GLOBAL: tikv_jemallocator::Jemalloc = tikv_jemallocator::Jemalloc; + +#[allow(non_upper_case_globals)] +#[export_name = "malloc_conf"] +pub static malloc_conf: &[u8] = b"prof:true,prof_active:true,lg_prof_sample:20\0"; + +async fn create_s3_client() -> anyhow::Result> { + let remote_storage_config = RemoteStorageConfig { + storage: RemoteStorageKind::AwsS3(S3Config { + bucket_name: S3_BUCKET.to_string(), + bucket_region: S3_REGION.to_string(), + prefix_in_bucket: Some(BUCKET_PREFIX.to_string()), + endpoint: None, + concurrency_limit: NonZeroUsize::new(100).unwrap(), + max_keys_per_list_response: None, + upload_storage_class: None, + }), + timeout: RemoteStorageConfig::DEFAULT_TIMEOUT, + small_timeout: RemoteStorageConfig::DEFAULT_SMALL_TIMEOUT, + }; + Ok(Arc::new( + GenericRemoteStorage::from_config(&remote_storage_config) + .await + .context("remote storage init")?, + )) +} + +async fn download_bench_data( + client: Arc, + cancel: &CancellationToken, +) -> anyhow::Result { + let temp_dir_parent: Utf8PathBuf = env::current_dir().unwrap().try_into()?; + let temp_dir = camino_tempfile::tempdir_in(temp_dir_parent)?; + + eprintln!("Downloading benchmark data to {:?}", temp_dir); + + let listing = client + .list(None, ListingMode::NoDelimiter, None, cancel) + .await?; + + let mut downloads = listing + .keys + .into_iter() + .map(|obj| { + let client = client.clone(); + let temp_dir_path = temp_dir.path().to_owned(); + + async move { + let remote_path = obj.key; + let download = client + .download(&remote_path, &DownloadOpts::default(), cancel) + .await?; + let mut body = tokio_util::io::StreamReader::new(download.download_stream); + + let file_name = remote_path.object_name().unwrap(); + let file_path = temp_dir_path.join(file_name); + let file = tokio::fs::OpenOptions::new() + .create(true) + .truncate(true) + .write(true) + .open(&file_path) + .await?; + + let mut writer = tokio::io::BufWriter::new(file); + tokio::io::copy_buf(&mut body, 
&mut writer).await?; + + Ok::<(), anyhow::Error>(()) + } + }) + .collect::>(); + + while let Some(download) = downloads.next().await { + download?; + } + + Ok(temp_dir) +} + +struct BenchmarkData { + wal: Vec, + meta: BenchmarkMetadata, +} + +#[derive(Deserialize)] +struct BenchmarkMetadata { + pg_version: u32, + start_lsn: Lsn, +} + +async fn load_bench_data(path: &Utf8Path, input_size: usize) -> anyhow::Result { + eprintln!("Loading benchmark data from {:?}", path); + + let mut entries = tokio::fs::read_dir(path).await?; + let mut ordered_segment_paths = Vec::new(); + let mut metadata = None; + + while let Some(entry) = entries.next_entry().await? { + if entry.file_name() == METADATA_FILENAME { + let bytes = tokio::fs::read(entry.path()).await?; + metadata = Some( + serde_json::from_slice::(&bytes) + .context("failed to deserialize metadata.json")?, + ); + } else { + ordered_segment_paths.push(entry.path()); + } + } + + ordered_segment_paths.sort(); + + let mut buffer = Vec::new(); + for path in ordered_segment_paths { + if buffer.len() >= input_size { + break; + } + + use async_compression::tokio::bufread::ZstdDecoder; + let file = tokio::fs::File::open(path).await?; + let reader = tokio::io::BufReader::new(file); + let decoder = ZstdDecoder::new(reader); + let mut reader = tokio::io::BufReader::new(decoder); + tokio::io::copy_buf(&mut reader, &mut buffer).await?; + } + + buffer.truncate(input_size); + + Ok(BenchmarkData { + wal: buffer, + meta: metadata.unwrap(), + }) +} + +fn criterion_benchmark(c: &mut Criterion) { + const INPUT_SIZE: usize = 128 * 1024 * 1024; + + let setup_runtime = tokio::runtime::Builder::new_current_thread() + .enable_all() + .build() + .unwrap(); + + let (_temp_dir, bench_data) = setup_runtime.block_on(async move { + let cancel = CancellationToken::new(); + let client = create_s3_client().await.unwrap(); + let temp_dir = download_bench_data(client, &cancel).await.unwrap(); + let bench_data = load_bench_data(temp_dir.path(), 
INPUT_SIZE).await.unwrap(); + + (temp_dir, bench_data) + }); + + eprintln!( + "Benchmarking against {} MiB of WAL", + INPUT_SIZE / 1024 / 1024 + ); + + let mut group = c.benchmark_group("decode-interpret-wal"); + group.throughput(criterion::Throughput::Bytes(bench_data.wal.len() as u64)); + group.sample_size(10); + + group.bench_function("unsharded", |b| { + b.iter(|| decode_interpret_main(&bench_data, &[ShardIdentity::unsharded()])) + }); + + let eight_shards = (0..8) + .map(|i| ShardIdentity::new(ShardNumber(i), ShardCount(8), ShardStripeSize(8)).unwrap()) + .collect::>(); + + group.bench_function("8/8-shards", |b| { + b.iter(|| decode_interpret_main(&bench_data, &eight_shards)) + }); + + let four_shards = eight_shards + .into_iter() + .filter(|s| s.number.0 % 2 == 0) + .collect::>(); + group.bench_function("4/8-shards", |b| { + b.iter(|| decode_interpret_main(&bench_data, &four_shards)) + }); + + let two_shards = four_shards + .into_iter() + .filter(|s| s.number.0 % 4 == 0) + .collect::>(); + group.bench_function("2/8-shards", |b| { + b.iter(|| decode_interpret_main(&bench_data, &two_shards)) + }); +} + +fn decode_interpret_main(bench: &BenchmarkData, shards: &[ShardIdentity]) { + let r = decode_interpret(bench, shards); + if let Err(e) = r { + panic!("{e:?}"); + } +} + +fn decode_interpret(bench: &BenchmarkData, shard: &[ShardIdentity]) -> anyhow::Result<()> { + let mut decoder = WalStreamDecoder::new(bench.meta.start_lsn, bench.meta.pg_version); + let xlogoff: usize = bench.meta.start_lsn.segment_offset(WAL_SEGMENT_SIZE); + + for chunk in bench.wal[xlogoff..].chunks(MAX_SEND_SIZE) { + decoder.feed_bytes(chunk); + while let Some((lsn, recdata)) = decoder.poll_decode().unwrap() { + assert!(lsn.is_aligned()); + let _ = InterpretedWalRecord::from_bytes_filtered( + recdata, + shard, + lsn, + bench.meta.pg_version, + ) + .unwrap(); + } + } + + Ok(()) +} +criterion_group!( + name=benches; + config=Criterion::default().with_profiler(PProfProfiler::new(100, 
Output::Flamegraph(None))); + targets=criterion_benchmark +); +criterion_main!(benches); diff --git a/libs/wal_decoder/src/decoder.rs b/libs/wal_decoder/src/decoder.rs index aa50c62911..ebb38ceb52 100644 --- a/libs/wal_decoder/src/decoder.rs +++ b/libs/wal_decoder/src/decoder.rs @@ -1,6 +1,8 @@ //! This module contains logic for decoding and interpreting //! raw bytes which represent a raw Postgres WAL record. +use std::collections::HashMap; + use crate::models::*; use crate::serialized_batch::SerializedValueBatch; use bytes::{Buf, Bytes}; @@ -14,15 +16,15 @@ use utils::lsn::Lsn; impl InterpretedWalRecord { /// Decode and interpreted raw bytes which represent one Postgres WAL record. - /// Data blocks which do not match the provided shard identity are filtered out. + /// Data blocks which do not match any of the provided shard identities are filtered out. /// Shard 0 is a special case since it tracks all relation sizes. We only give it /// the keys that are being written as that is enough for updating relation sizes. 
pub fn from_bytes_filtered( buf: Bytes, - shard: &ShardIdentity, + shards: &[ShardIdentity], next_record_lsn: Lsn, pg_version: u32, - ) -> anyhow::Result { + ) -> anyhow::Result> { let mut decoded = DecodedWALRecord::default(); decode_wal_record(buf, &mut decoded, pg_version)?; let xid = decoded.xl_xid; @@ -33,43 +35,57 @@ impl InterpretedWalRecord { FlushUncommittedRecords::No }; - let metadata_record = - MetadataRecord::from_decoded_filtered(&decoded, shard, next_record_lsn, pg_version)?; - let batch = SerializedValueBatch::from_decoded_filtered( + let mut shard_records: HashMap = + HashMap::with_capacity(shards.len()); + for shard in shards { + shard_records.insert( + *shard, + InterpretedWalRecord { + metadata_record: None, + batch: SerializedValueBatch::default(), + next_record_lsn, + flush_uncommitted, + xid, + }, + ); + } + + MetadataRecord::from_decoded_filtered( + &decoded, + &mut shard_records, + next_record_lsn, + pg_version, + )?; + SerializedValueBatch::from_decoded_filtered( decoded, - shard, + &mut shard_records, next_record_lsn, pg_version, )?; - Ok(InterpretedWalRecord { - metadata_record, - batch, - next_record_lsn, - flush_uncommitted, - xid, - }) + Ok(shard_records) } } impl MetadataRecord { - /// Builds a metadata record for this WAL record, if any. + /// Populates the given `shard_records` with metadata records from this WAL record, if any, + /// discarding those belonging to other shards. /// - /// Only metadata records relevant for the given shard are emitted. Currently, most metadata + /// Only metadata records relevant for the given shards are emitted. Currently, most metadata /// records are broadcast to all shards for simplicity, but this should be improved. 
fn from_decoded_filtered( decoded: &DecodedWALRecord, - shard: &ShardIdentity, + shard_records: &mut HashMap, next_record_lsn: Lsn, pg_version: u32, - ) -> anyhow::Result> { + ) -> anyhow::Result<()> { // Note: this doesn't actually copy the bytes since // the [`Bytes`] type implements it via a level of indirection. let mut buf = decoded.record.clone(); buf.advance(decoded.main_data_offset); // First, generate metadata records from the decoded WAL record. - let mut metadata_record = match decoded.xl_rmid { + let metadata_record = match decoded.xl_rmid { pg_constants::RM_HEAP_ID | pg_constants::RM_HEAP2_ID => { Self::decode_heapam_record(&mut buf, decoded, pg_version)? } @@ -112,41 +128,65 @@ impl MetadataRecord { }; // Next, filter the metadata record by shard. - match metadata_record { - Some( - MetadataRecord::Heapam(HeapamRecord::ClearVmBits(ref mut clear_vm_bits)) - | MetadataRecord::Neonrmgr(NeonrmgrRecord::ClearVmBits(ref mut clear_vm_bits)), - ) => { - // Route VM page updates to the shards that own them. VM pages are stored in the VM fork - // of the main relation. These are sharded and managed just like regular relation pages. - // See: https://github.com/neondatabase/neon/issues/9855 - let is_local_vm_page = |heap_blk| { - let vm_blk = pg_constants::HEAPBLK_TO_MAPBLOCK(heap_blk); - shard.is_key_local(&rel_block_to_key(clear_vm_bits.vm_rel, vm_blk)) - }; - // Send the old and new VM page updates to their respective shards. - clear_vm_bits.old_heap_blkno = clear_vm_bits - .old_heap_blkno - .filter(|&blkno| is_local_vm_page(blkno)); - clear_vm_bits.new_heap_blkno = clear_vm_bits - .new_heap_blkno - .filter(|&blkno| is_local_vm_page(blkno)); - // If neither VM page belongs to this shard, discard the record. 
- if clear_vm_bits.old_heap_blkno.is_none() && clear_vm_bits.new_heap_blkno.is_none() - { - metadata_record = None + for (shard, record) in shard_records.iter_mut() { + match metadata_record { + Some( + MetadataRecord::Heapam(HeapamRecord::ClearVmBits(ref clear_vm_bits)) + | MetadataRecord::Neonrmgr(NeonrmgrRecord::ClearVmBits(ref clear_vm_bits)), + ) => { + // Route VM page updates to the shards that own them. VM pages are stored in the VM fork + // of the main relation. These are sharded and managed just like regular relation pages. + // See: https://github.com/neondatabase/neon/issues/9855 + let is_local_vm_page = |heap_blk| { + let vm_blk = pg_constants::HEAPBLK_TO_MAPBLOCK(heap_blk); + shard.is_key_local(&rel_block_to_key(clear_vm_bits.vm_rel, vm_blk)) + }; + // Send the old and new VM page updates to their respective shards. + let updated_old_heap_blkno = clear_vm_bits + .old_heap_blkno + .filter(|&blkno| is_local_vm_page(blkno)); + let updated_new_heap_blkno = clear_vm_bits + .new_heap_blkno + .filter(|&blkno| is_local_vm_page(blkno)); + // If neither VM page belongs to this shard, discard the record. + if updated_old_heap_blkno.is_some() || updated_new_heap_blkno.is_some() { + // Clone the record and update it for the current shard. 
+ let mut for_shard = metadata_record.clone(); + match for_shard { + Some( + MetadataRecord::Heapam(HeapamRecord::ClearVmBits( + ref mut clear_vm_bits, + )) + | MetadataRecord::Neonrmgr(NeonrmgrRecord::ClearVmBits( + ref mut clear_vm_bits, + )), + ) => { + clear_vm_bits.old_heap_blkno = updated_old_heap_blkno; + clear_vm_bits.new_heap_blkno = updated_new_heap_blkno; + record.metadata_record = for_shard; + } + _ => { + unreachable!("for_shard is a clone of what we checked above") + } + } + } + } + Some(MetadataRecord::LogicalMessage(LogicalMessageRecord::Put(_))) => { + // Filter LogicalMessage records (AUX files) to only be stored on shard zero + if shard.is_shard_zero() { + record.metadata_record = metadata_record; + // No other shards should receive this record, so we stop traversing shards early. + break; + } + } + _ => { + // All other metadata records are sent to all shards. + record.metadata_record = metadata_record.clone(); } } - Some(MetadataRecord::LogicalMessage(LogicalMessageRecord::Put(_))) => { - // Filter LogicalMessage records (AUX files) to only be stored on shard zero - if !shard.is_shard_zero() { - metadata_record = None; - } - } - _ => {} } - Ok(metadata_record) + Ok(()) } fn decode_heapam_record( diff --git a/libs/wal_decoder/src/models.rs b/libs/wal_decoder/src/models.rs index 6576dd0eba..c2f9125b21 100644 --- a/libs/wal_decoder/src/models.rs +++ b/libs/wal_decoder/src/models.rs @@ -48,7 +48,7 @@ pub mod proto { tonic::include_proto!("interpreted_wal"); } -#[derive(Serialize, Deserialize)] +#[derive(Copy, Clone, Serialize, Deserialize)] pub enum FlushUncommittedRecords { Yes, No, @@ -64,7 +64,7 @@ pub struct InterpretedWalRecords { } /// An interpreted Postgres WAL record, ready to be handled by the pageserver -#[derive(Serialize, Deserialize)] +#[derive(Serialize, Deserialize, Clone)] pub struct InterpretedWalRecord { /// Optional metadata record - may cause writes to metadata keys /// in the storage engine @@ -107,7 +107,7 @@ impl 
InterpretedWalRecord { /// The interpreted part of the Postgres WAL record which requires metadata /// writes to the underlying storage engine. -#[derive(Serialize, Deserialize)] +#[derive(Clone, Serialize, Deserialize)] pub enum MetadataRecord { Heapam(HeapamRecord), Neonrmgr(NeonrmgrRecord), @@ -123,12 +123,12 @@ pub enum MetadataRecord { Replorigin(ReploriginRecord), } -#[derive(Serialize, Deserialize)] +#[derive(Clone, Serialize, Deserialize)] pub enum HeapamRecord { ClearVmBits(ClearVmBits), } -#[derive(Serialize, Deserialize)] +#[derive(Clone, Serialize, Deserialize)] pub struct ClearVmBits { pub new_heap_blkno: Option, pub old_heap_blkno: Option, @@ -136,29 +136,29 @@ pub struct ClearVmBits { pub flags: u8, } -#[derive(Serialize, Deserialize)] +#[derive(Clone, Serialize, Deserialize)] pub enum NeonrmgrRecord { ClearVmBits(ClearVmBits), } -#[derive(Serialize, Deserialize)] +#[derive(Clone, Serialize, Deserialize)] pub enum SmgrRecord { Create(SmgrCreate), Truncate(XlSmgrTruncate), } -#[derive(Serialize, Deserialize)] +#[derive(Clone, Serialize, Deserialize)] pub struct SmgrCreate { pub rel: RelTag, } -#[derive(Serialize, Deserialize)] +#[derive(Clone, Serialize, Deserialize)] pub enum DbaseRecord { Create(DbaseCreate), Drop(DbaseDrop), } -#[derive(Serialize, Deserialize)] +#[derive(Clone, Serialize, Deserialize)] pub struct DbaseCreate { pub db_id: Oid, pub tablespace_id: Oid, @@ -166,32 +166,32 @@ pub struct DbaseCreate { pub src_tablespace_id: Oid, } -#[derive(Serialize, Deserialize)] +#[derive(Clone, Serialize, Deserialize)] pub struct DbaseDrop { pub db_id: Oid, pub tablespace_ids: Vec, } -#[derive(Serialize, Deserialize)] +#[derive(Clone, Serialize, Deserialize)] pub enum ClogRecord { ZeroPage(ClogZeroPage), Truncate(ClogTruncate), } -#[derive(Serialize, Deserialize)] +#[derive(Clone, Serialize, Deserialize)] pub struct ClogZeroPage { pub segno: u32, pub rpageno: u32, } -#[derive(Serialize, Deserialize)] +#[derive(Clone, Serialize, Deserialize)] pub 
struct ClogTruncate { pub pageno: u32, pub oldest_xid: TransactionId, pub oldest_xid_db: Oid, } -#[derive(Serialize, Deserialize)] +#[derive(Clone, Serialize, Deserialize)] pub enum XactRecord { Commit(XactCommon), Abort(XactCommon), @@ -200,7 +200,7 @@ pub enum XactRecord { Prepare(XactPrepare), } -#[derive(Serialize, Deserialize)] +#[derive(Clone, Serialize, Deserialize)] pub struct XactCommon { pub parsed: XlXactParsedRecord, pub origin_id: u16, @@ -209,73 +209,73 @@ pub struct XactCommon { pub lsn: Lsn, } -#[derive(Serialize, Deserialize)] +#[derive(Clone, Serialize, Deserialize)] pub struct XactPrepare { pub xl_xid: TransactionId, pub data: Bytes, } -#[derive(Serialize, Deserialize)] +#[derive(Clone, Serialize, Deserialize)] pub enum MultiXactRecord { ZeroPage(MultiXactZeroPage), Create(XlMultiXactCreate), Truncate(XlMultiXactTruncate), } -#[derive(Serialize, Deserialize)] +#[derive(Clone, Serialize, Deserialize)] pub struct MultiXactZeroPage { pub slru_kind: SlruKind, pub segno: u32, pub rpageno: u32, } -#[derive(Serialize, Deserialize)] +#[derive(Clone, Serialize, Deserialize)] pub enum RelmapRecord { Update(RelmapUpdate), } -#[derive(Serialize, Deserialize)] +#[derive(Clone, Serialize, Deserialize)] pub struct RelmapUpdate { pub update: XlRelmapUpdate, pub buf: Bytes, } -#[derive(Serialize, Deserialize)] +#[derive(Clone, Serialize, Deserialize)] pub enum XlogRecord { Raw(RawXlogRecord), } -#[derive(Serialize, Deserialize)] +#[derive(Clone, Serialize, Deserialize)] pub struct RawXlogRecord { pub info: u8, pub lsn: Lsn, pub buf: Bytes, } -#[derive(Serialize, Deserialize)] +#[derive(Clone, Serialize, Deserialize)] pub enum LogicalMessageRecord { Put(PutLogicalMessage), #[cfg(feature = "testing")] Failpoint, } -#[derive(Serialize, Deserialize)] +#[derive(Clone, Serialize, Deserialize)] pub struct PutLogicalMessage { pub path: String, pub buf: Bytes, } -#[derive(Serialize, Deserialize)] +#[derive(Clone, Serialize, Deserialize)] pub enum StandbyRecord { 
RunningXacts(StandbyRunningXacts), } -#[derive(Serialize, Deserialize)] +#[derive(Clone, Serialize, Deserialize)] pub struct StandbyRunningXacts { pub oldest_running_xid: TransactionId, } -#[derive(Serialize, Deserialize)] +#[derive(Clone, Serialize, Deserialize)] pub enum ReploriginRecord { Set(XlReploriginSet), Drop(XlReploriginDrop), diff --git a/libs/wal_decoder/src/serialized_batch.rs b/libs/wal_decoder/src/serialized_batch.rs index af2b179e05..d76f75f51f 100644 --- a/libs/wal_decoder/src/serialized_batch.rs +++ b/libs/wal_decoder/src/serialized_batch.rs @@ -5,7 +5,7 @@ //! Such batches are created from decoded PG wal records and ingested //! by the pageserver by writing directly to the ephemeral file. -use std::collections::BTreeSet; +use std::collections::{BTreeSet, HashMap}; use bytes::{Bytes, BytesMut}; use pageserver_api::key::rel_block_to_key; @@ -22,6 +22,8 @@ use utils::lsn::Lsn; use pageserver_api::key::Key; +use crate::models::InterpretedWalRecord; + static ZERO_PAGE: Bytes = Bytes::from_static(&[0u8; BLCKSZ as usize]); /// Accompanying metadata for the batch @@ -30,7 +32,7 @@ static ZERO_PAGE: Bytes = Bytes::from_static(&[0u8; BLCKSZ as usize]); /// relation sizes. In the case of "observed" values, we only need to know /// the key and LSN, so two types of metadata are supported to save on network /// bandwidth. -#[derive(Serialize, Deserialize)] +#[derive(Serialize, Deserialize, Clone)] pub enum ValueMeta { Serialized(SerializedValueMeta), Observed(ObservedValueMeta), @@ -77,7 +79,7 @@ impl PartialEq for OrderedValueMeta { impl Eq for OrderedValueMeta {} /// Metadata for a [`Value`] serialized into the batch. 
-#[derive(Serialize, Deserialize)] +#[derive(Serialize, Deserialize, Clone)] pub struct SerializedValueMeta { pub key: CompactKey, pub lsn: Lsn, @@ -89,14 +91,14 @@ pub struct SerializedValueMeta { } /// Metadata for a [`Value`] observed by the batch -#[derive(Serialize, Deserialize)] +#[derive(Serialize, Deserialize, Clone)] pub struct ObservedValueMeta { pub key: CompactKey, pub lsn: Lsn, } /// Batch of serialized [`Value`]s. -#[derive(Serialize, Deserialize)] +#[derive(Serialize, Deserialize, Clone)] pub struct SerializedValueBatch { /// [`Value`]s serialized in EphemeralFile's native format, /// ready for disk write by the pageserver @@ -128,7 +130,8 @@ impl Default for SerializedValueBatch { } impl SerializedValueBatch { - /// Build a batch of serialized values from a decoded PG WAL record + /// Populates the given `shard_records` with value batches from this WAL record, if any, + /// discarding those belonging to other shards. /// /// The batch will only contain values for keys targeting the specifiec /// shard. Shard 0 is a special case, where any keys that don't belong to @@ -136,21 +139,20 @@ impl SerializedValueBatch { /// but absent from the raw buffer [`SerializedValueBatch::raw`]). pub(crate) fn from_decoded_filtered( decoded: DecodedWALRecord, - shard: &ShardIdentity, + shard_records: &mut HashMap, next_record_lsn: Lsn, pg_version: u32, - ) -> anyhow::Result { - // First determine how big the buffer needs to be and allocate it up-front. + ) -> anyhow::Result<()> { + // First determine how big the buffers need to be and allocate it up-front. // This duplicates some of the work below, but it's empirically much faster. 
- let estimated_buffer_size = Self::estimate_buffer_size(&decoded, shard, pg_version); - let mut buf = Vec::::with_capacity(estimated_buffer_size); + for (shard, record) in shard_records.iter_mut() { + assert!(record.batch.is_empty()); + + let estimate = Self::estimate_buffer_size(&decoded, shard, pg_version); + record.batch.raw = Vec::with_capacity(estimate); + } - let mut metadata: Vec = Vec::with_capacity(decoded.blocks.len()); - let mut max_lsn: Lsn = Lsn(0); - let mut len: usize = 0; for blk in decoded.blocks.iter() { - let relative_off = buf.len() as u64; - let rel = RelTag { spcnode: blk.rnode_spcnode, dbnode: blk.rnode_dbnode, @@ -168,99 +170,98 @@ impl SerializedValueBatch { ); } - let key_is_local = shard.is_key_local(&key); + for (shard, record) in shard_records.iter_mut() { + let key_is_local = shard.is_key_local(&key); - tracing::debug!( - lsn=%next_record_lsn, - key=%key, - "ingest: shard decision {}", - if !key_is_local { "drop" } else { "keep" }, - ); + tracing::debug!( + lsn=%next_record_lsn, + key=%key, + "ingest: shard decision {}", + if !key_is_local { "drop" } else { "keep" }, + ); - if !key_is_local { - if shard.is_shard_zero() { - // Shard 0 tracks relation sizes. Although we will not store this block, we will observe - // its blkno in case it implicitly extends a relation. - metadata.push(ValueMeta::Observed(ObservedValueMeta { + if !key_is_local { + if shard.is_shard_zero() { + // Shard 0 tracks relation sizes. Although we will not store this block, we will observe + // its blkno in case it implicitly extends a relation. + record + .batch + .metadata + .push(ValueMeta::Observed(ObservedValueMeta { + key: key.to_compact(), + lsn: next_record_lsn, + })) + } + + continue; + } + + // Instead of storing full-page-image WAL record, + // it is better to store extracted image: we can skip wal-redo + // in this case. Also some FPI records may contain multiple (up to 32) pages, + // so them have to be copied multiple times. 
+ // + let val = if Self::block_is_image(&decoded, blk, pg_version) { + // Extract page image from FPI record + let img_len = blk.bimg_len as usize; + let img_offs = blk.bimg_offset as usize; + let mut image = BytesMut::with_capacity(BLCKSZ as usize); + // TODO(vlad): skip the copy + image.extend_from_slice(&decoded.record[img_offs..img_offs + img_len]); + + if blk.hole_length != 0 { + let tail = image.split_off(blk.hole_offset as usize); + image.resize(image.len() + blk.hole_length as usize, 0u8); + image.unsplit(tail); + } + // + // Match the logic of XLogReadBufferForRedoExtended: + // The page may be uninitialized. If so, we can't set the LSN because + // that would corrupt the page. + // + if !page_is_new(&image) { + page_set_lsn(&mut image, next_record_lsn) + } + assert_eq!(image.len(), BLCKSZ as usize); + + Value::Image(image.freeze()) + } else { + Value::WalRecord(NeonWalRecord::Postgres { + will_init: blk.will_init || blk.apply_image, + rec: decoded.record.clone(), + }) + }; + + let relative_off = record.batch.raw.len() as u64; + + val.ser_into(&mut record.batch.raw) + .expect("Writing into in-memory buffer is infallible"); + + let val_ser_size = record.batch.raw.len() - relative_off as usize; + + record + .batch + .metadata + .push(ValueMeta::Serialized(SerializedValueMeta { key: key.to_compact(), lsn: next_record_lsn, - })) - } - - continue; + batch_offset: relative_off, + len: val_ser_size, + will_init: val.will_init(), + })); + record.batch.max_lsn = std::cmp::max(record.batch.max_lsn, next_record_lsn); + record.batch.len += 1; } - - // Instead of storing full-page-image WAL record, - // it is better to store extracted image: we can skip wal-redo - // in this case. Also some FPI records may contain multiple (up to 32) pages, - // so them have to be copied multiple times. 
- // - let val = if Self::block_is_image(&decoded, blk, pg_version) { - // Extract page image from FPI record - let img_len = blk.bimg_len as usize; - let img_offs = blk.bimg_offset as usize; - let mut image = BytesMut::with_capacity(BLCKSZ as usize); - // TODO(vlad): skip the copy - image.extend_from_slice(&decoded.record[img_offs..img_offs + img_len]); - - if blk.hole_length != 0 { - let tail = image.split_off(blk.hole_offset as usize); - image.resize(image.len() + blk.hole_length as usize, 0u8); - image.unsplit(tail); - } - // - // Match the logic of XLogReadBufferForRedoExtended: - // The page may be uninitialized. If so, we can't set the LSN because - // that would corrupt the page. - // - if !page_is_new(&image) { - page_set_lsn(&mut image, next_record_lsn) - } - assert_eq!(image.len(), BLCKSZ as usize); - - Value::Image(image.freeze()) - } else { - Value::WalRecord(NeonWalRecord::Postgres { - will_init: blk.will_init || blk.apply_image, - rec: decoded.record.clone(), - }) - }; - - val.ser_into(&mut buf) - .expect("Writing into in-memory buffer is infallible"); - - let val_ser_size = buf.len() - relative_off as usize; - - metadata.push(ValueMeta::Serialized(SerializedValueMeta { - key: key.to_compact(), - lsn: next_record_lsn, - batch_offset: relative_off, - len: val_ser_size, - will_init: val.will_init(), - })); - max_lsn = std::cmp::max(max_lsn, next_record_lsn); - len += 1; } if cfg!(any(debug_assertions, test)) { - let batch = Self { - raw: buf, - metadata, - max_lsn, - len, - }; - - batch.validate_lsn_order(); - - return Ok(batch); + // Validate that the batches are correct + for record in shard_records.values() { + record.batch.validate_lsn_order(); + } } - Ok(Self { - raw: buf, - metadata, - max_lsn, - len, - }) + Ok(()) } /// Look into the decoded PG WAL record and determine diff --git a/pageserver/Cargo.toml b/pageserver/Cargo.toml index 140b287ccc..9c835c956b 100644 --- a/pageserver/Cargo.toml +++ b/pageserver/Cargo.toml @@ -8,7 +8,7 @@ 
license.workspace = true default = [] # Enables test-only APIs, incuding failpoints. In particular, enables the `fail_point!` macro, # which adds some runtime cost to run tests on outage conditions -testing = ["fail/failpoints", "pageserver_api/testing", "wal_decoder/testing"] +testing = ["fail/failpoints", "pageserver_api/testing", "wal_decoder/testing", "pageserver_client/testing"] [dependencies] anyhow.workspace = true @@ -16,6 +16,7 @@ arc-swap.workspace = true async-compression.workspace = true async-stream.workspace = true bit_field.workspace = true +bincode.workspace = true byteorder.workspace = true bytes.workspace = true camino.workspace = true @@ -44,6 +45,7 @@ postgres_backend.workspace = true postgres-protocol.workspace = true postgres-types.workspace = true postgres_initdb.workspace = true +pprof.workspace = true rand.workspace = true range-set-blaze = { version = "0.1.16", features = ["alloc"] } regex.workspace = true @@ -108,3 +110,11 @@ harness = false [[bench]] name = "bench_ingest" harness = false + +[[bench]] +name = "upload_queue" +harness = false + +[[bin]] +name = "test_helper_slow_client_reads" +required-features = [ "testing" ] diff --git a/pageserver/benches/upload_queue.rs b/pageserver/benches/upload_queue.rs new file mode 100644 index 0000000000..ed5daa8ae1 --- /dev/null +++ b/pageserver/benches/upload_queue.rs @@ -0,0 +1,87 @@ +//! Upload queue benchmarks. 
+ +use std::str::FromStr as _; +use std::sync::atomic::AtomicU32; +use std::sync::Arc; + +use criterion::{criterion_group, criterion_main, Bencher, Criterion}; +use pageserver::tenant::metadata::TimelineMetadata; +use pageserver::tenant::remote_timeline_client::index::LayerFileMetadata; +use pageserver::tenant::storage_layer::LayerName; +use pageserver::tenant::upload_queue::{Delete, UploadOp, UploadQueue, UploadTask}; +use pageserver::tenant::IndexPart; +use pprof::criterion::{Output, PProfProfiler}; +use utils::generation::Generation; +use utils::shard::{ShardCount, ShardIndex, ShardNumber}; + +// Register benchmarks with Criterion. +criterion_group!( + name = benches; + config = Criterion::default().with_profiler(PProfProfiler::new(100, Output::Flamegraph(None))); + targets = bench_upload_queue_next_ready, +); +criterion_main!(benches); + +/// Benchmarks the cost of UploadQueue::next_ready() with the given number of in-progress tasks +/// (which is equivalent to tasks ahead of it in the queue). This has linear cost, and the upload +/// queue as a whole is thus quadratic. +/// +/// UploadOp::UploadLayer requires an entire tenant and timeline to construct, so we just test +/// Delete and UploadMetadata instead. This is incidentally the most expensive case. +fn bench_upload_queue_next_ready(c: &mut Criterion) { + let mut g = c.benchmark_group("upload_queue_next_ready"); + for inprogress in [0, 1, 10, 100, 1_000, 10_000, 100_000, 1_000_000] { + g.bench_function(format!("inprogress={inprogress}"), |b| { + run_bench(b, inprogress).unwrap() + }); + } + + fn run_bench(b: &mut Bencher, inprogress: usize) -> anyhow::Result<()> { + // Construct two layers. layer0 is in the indexes, layer1 will be deleted. 
+ let layer0 = LayerName::from_str("000000000000000000000000000000000000-100000000000000000000000000000000000__00000000016B59D8-00000000016B5A51").expect("invalid name"); + let layer1 = LayerName::from_str("100000000000000000000000000000000001-200000000000000000000000000000000000__00000000016B59D8-00000000016B5A51").expect("invalid name"); + + let metadata = LayerFileMetadata { + shard: ShardIndex::new(ShardNumber(1), ShardCount(2)), + generation: Generation::Valid(1), + file_size: 0, + }; + + // Construct the (initial and uploaded) index with layer0. + let mut index = IndexPart::empty(TimelineMetadata::example()); + index.layer_metadata.insert(layer0, metadata.clone()); + + // Construct the queue. + let mut queue = UploadQueue::Uninitialized; + let queue = queue.initialize_with_current_remote_index_part(&index, 0)?; + + // Populate inprogress_tasks with a bunch of layer1 deletions. + let delete = UploadOp::Delete(Delete { + layers: vec![(layer1, metadata)], + }); + + for task_id in 0..(inprogress as u64) { + queue.inprogress_tasks.insert( + task_id, + Arc::new(UploadTask { + task_id, + retries: AtomicU32::new(0), + op: delete.clone(), + coalesced_ops: Vec::new(), + }), + ); + } + + // Benchmark index upload scheduling. 
+ let index_upload = UploadOp::UploadMetadata { + uploaded: Box::new(index), + }; + + b.iter(|| { + queue.queued_operations.push_front(index_upload.clone()); + assert!(queue.next_ready().is_some()); + }); + + Ok(()) + } +} diff --git a/pageserver/client/Cargo.toml b/pageserver/client/Cargo.toml index d9b36bf3d4..f582d307a7 100644 --- a/pageserver/client/Cargo.toml +++ b/pageserver/client/Cargo.toml @@ -4,6 +4,9 @@ version = "0.1.0" edition.workspace = true license.workspace = true +[features] +testing = [ "pageserver_api/testing" ] + [dependencies] pageserver_api.workspace = true thiserror.workspace = true diff --git a/pageserver/client/src/page_service.rs b/pageserver/client/src/page_service.rs index 207ec4166c..27280912b4 100644 --- a/pageserver/client/src/page_service.rs +++ b/pageserver/client/src/page_service.rs @@ -1,6 +1,9 @@ -use std::pin::Pin; +use std::sync::{Arc, Mutex}; -use futures::SinkExt; +use futures::{ + stream::{SplitSink, SplitStream}, + SinkExt, StreamExt, +}; use pageserver_api::{ models::{ PagestreamBeMessage, PagestreamFeMessage, PagestreamGetPageRequest, @@ -10,7 +13,6 @@ use pageserver_api::{ }; use tokio::task::JoinHandle; use tokio_postgres::CopyOutStream; -use tokio_stream::StreamExt; use tokio_util::sync::CancellationToken; use utils::{ id::{TenantId, TimelineId}, @@ -62,15 +64,28 @@ impl Client { .client .copy_both_simple(&format!("pagestream_v3 {tenant_id} {timeline_id}")) .await?; + let (sink, stream) = copy_both.split(); // TODO: actually support splitting of the CopyBothDuplex so the lock inside this split adaptor goes away. 
let Client { cancel_on_client_drop, conn_task, client: _, } = self; + let shared = Arc::new(Mutex::new(PagestreamShared::ConnTaskRunning( + ConnTaskRunning { + cancel_on_client_drop, + conn_task, + }, + ))); Ok(PagestreamClient { - copy_both: Box::pin(copy_both), - conn_task, - cancel_on_client_drop, + sink: PagestreamSender { + shared: shared.clone(), + sink, + }, + stream: PagestreamReceiver { + shared: shared.clone(), + stream, + }, + shared, }) } @@ -97,7 +112,28 @@ impl Client { /// Create using [`Client::pagestream`]. pub struct PagestreamClient { - copy_both: Pin>>, + shared: Arc>, + sink: PagestreamSender, + stream: PagestreamReceiver, +} + +pub struct PagestreamSender { + #[allow(dead_code)] + shared: Arc>, + sink: SplitSink, bytes::Bytes>, +} + +pub struct PagestreamReceiver { + #[allow(dead_code)] + shared: Arc>, + stream: SplitStream>, +} + +enum PagestreamShared { + ConnTaskRunning(ConnTaskRunning), + ConnTaskCancelledJoinHandleReturnedOrDropped, +} +struct ConnTaskRunning { cancel_on_client_drop: Option, conn_task: JoinHandle<()>, } @@ -110,11 +146,11 @@ pub struct RelTagBlockNo { impl PagestreamClient { pub async fn shutdown(self) { let Self { - copy_both, - cancel_on_client_drop: cancel_conn_task, - conn_task, - } = self; - // The `copy_both` contains internal channel sender, the receiver of which is polled by `conn_task`. + shared, + sink, + stream, + } = { self }; + // The `copy_both` split into `sink` and `stream` contains internal channel sender, the receiver of which is polled by `conn_task`. // When `conn_task` observes the sender has been dropped, it sends a `FeMessage::CopyFail` into the connection. // (see https://github.com/neondatabase/rust-postgres/blob/2005bf79573b8add5cf205b52a2b208e356cc8b0/tokio-postgres/src/copy_both.rs#L56). // @@ -131,27 +167,77 @@ impl PagestreamClient { // // NB: page_service doesn't have a use case to exit the `pagestream` mode currently. 
// => https://github.com/neondatabase/neon/issues/6390 - let _ = cancel_conn_task.unwrap(); + let ConnTaskRunning { + cancel_on_client_drop, + conn_task, + } = { + let mut guard = shared.lock().unwrap(); + match std::mem::replace( + &mut *guard, + PagestreamShared::ConnTaskCancelledJoinHandleReturnedOrDropped, + ) { + PagestreamShared::ConnTaskRunning(conn_task_running) => conn_task_running, + PagestreamShared::ConnTaskCancelledJoinHandleReturnedOrDropped => unreachable!(), + } + }; + let _ = cancel_on_client_drop.unwrap(); conn_task.await.unwrap(); - drop(copy_both); + + // Now drop the split copy_both. + drop(sink); + drop(stream); + } + + pub fn split(self) -> (PagestreamSender, PagestreamReceiver) { + let Self { + shared: _, + sink, + stream, + } = self; + (sink, stream) } pub async fn getpage( &mut self, req: PagestreamGetPageRequest, ) -> anyhow::Result { - let req = PagestreamFeMessage::GetPage(req); - let req: bytes::Bytes = req.serialize(); - // let mut req = tokio_util::io::ReaderStream::new(&req); - let mut req = tokio_stream::once(Ok(req)); + self.getpage_send(req).await?; + self.getpage_recv().await + } - self.copy_both.send_all(&mut req).await?; + pub async fn getpage_send(&mut self, req: PagestreamGetPageRequest) -> anyhow::Result<()> { + self.sink.getpage_send(req).await + } - let next: Option> = self.copy_both.next().await; + pub async fn getpage_recv(&mut self) -> anyhow::Result { + self.stream.getpage_recv().await + } +} + +impl PagestreamSender { + // TODO: maybe make this impl Sink instead for better composability? 
+ pub async fn send(&mut self, msg: PagestreamFeMessage) -> anyhow::Result<()> { + let msg = msg.serialize(); + self.sink.send_all(&mut tokio_stream::once(Ok(msg))).await?; + Ok(()) + } + + pub async fn getpage_send(&mut self, req: PagestreamGetPageRequest) -> anyhow::Result<()> { + self.send(PagestreamFeMessage::GetPage(req)).await + } +} + +impl PagestreamReceiver { + // TODO: maybe make this impl Stream instead for better composability? + pub async fn recv(&mut self) -> anyhow::Result { + let next: Option> = self.stream.next().await; let next: bytes::Bytes = next.unwrap()?; + PagestreamBeMessage::deserialize(next) + } - let msg = PagestreamBeMessage::deserialize(next)?; - match msg { + pub async fn getpage_recv(&mut self) -> anyhow::Result { + let next: PagestreamBeMessage = self.recv().await?; + match next { PagestreamBeMessage::GetPage(p) => Ok(p), PagestreamBeMessage::Error(e) => anyhow::bail!("Error: {:?}", e), PagestreamBeMessage::Exists(_) @@ -160,7 +246,14 @@ impl PagestreamClient { | PagestreamBeMessage::GetSlruSegment(_) => { anyhow::bail!( "unexpected be message kind in response to getpage request: {}", - msg.kind() + next.kind() + ) + } + #[cfg(feature = "testing")] + PagestreamBeMessage::Test(_) => { + anyhow::bail!( + "unexpected be message kind in response to getpage request: {}", + next.kind() ) } } diff --git a/pageserver/ctl/Cargo.toml b/pageserver/ctl/Cargo.toml index 39ca47568c..7b70f0dc87 100644 --- a/pageserver/ctl/Cargo.toml +++ b/pageserver/ctl/Cargo.toml @@ -8,9 +8,11 @@ license.workspace = true [dependencies] anyhow.workspace = true +bincode.workspace = true camino.workspace = true clap = { workspace = true, features = ["string"] } humantime.workspace = true +itertools.workspace = true pageserver = { path = ".." 
} pageserver_api.workspace = true remote_storage = { path = "../../libs/remote_storage" } diff --git a/pageserver/ctl/src/main.rs b/pageserver/ctl/src/main.rs index a0aac89dc8..353b4bd2f9 100644 --- a/pageserver/ctl/src/main.rs +++ b/pageserver/ctl/src/main.rs @@ -9,7 +9,9 @@ mod index_part; mod key; mod layer_map_analyzer; mod layers; +mod page_trace; +use page_trace::PageTraceCmd; use std::{ str::FromStr, time::{Duration, SystemTime}, @@ -64,6 +66,7 @@ enum Commands { Layer(LayerCmd), /// Debug print a hex key found from logs Key(key::DescribeKeyCommand), + PageTrace(PageTraceCmd), } /// Read and update pageserver metadata file @@ -183,6 +186,7 @@ async fn main() -> anyhow::Result<()> { .await?; } Commands::Key(dkc) => dkc.execute(), + Commands::PageTrace(cmd) => page_trace::main(&cmd)?, }; Ok(()) } diff --git a/pageserver/ctl/src/page_trace.rs b/pageserver/ctl/src/page_trace.rs new file mode 100644 index 0000000000..da0de72fd9 --- /dev/null +++ b/pageserver/ctl/src/page_trace.rs @@ -0,0 +1,73 @@ +use std::collections::HashMap; +use std::io::BufReader; + +use camino::Utf8PathBuf; +use clap::Parser; +use itertools::Itertools as _; +use pageserver_api::key::{CompactKey, Key}; +use pageserver_api::models::PageTraceEvent; +use pageserver_api::reltag::RelTag; + +/// Parses a page trace (as emitted by the `page_trace` timeline API), and outputs stats. +#[derive(Parser)] +pub(crate) struct PageTraceCmd { + /// Trace input file. 
+ path: Utf8PathBuf, +} + +pub(crate) fn main(cmd: &PageTraceCmd) -> anyhow::Result<()> { + let mut file = BufReader::new(std::fs::OpenOptions::new().read(true).open(&cmd.path)?); + let mut events: Vec = Vec::new(); + loop { + match bincode::deserialize_from(&mut file) { + Ok(event) => events.push(event), + Err(err) => { + if let bincode::ErrorKind::Io(ref err) = *err { + if err.kind() == std::io::ErrorKind::UnexpectedEof { + break; + } + } + return Err(err.into()); + } + } + } + + let mut reads_by_relation: HashMap = HashMap::new(); + let mut reads_by_key: HashMap = HashMap::new(); + + for event in events { + let key = Key::from_compact(event.key); + let reltag = RelTag { + spcnode: key.field2, + dbnode: key.field3, + relnode: key.field4, + forknum: key.field5, + }; + + *reads_by_relation.entry(reltag).or_default() += 1; + *reads_by_key.entry(event.key).or_default() += 1; + } + + let multi_read_keys = reads_by_key + .into_iter() + .filter(|(_, count)| *count > 1) + .sorted_by_key(|(key, count)| (-*count, *key)) + .collect_vec(); + + println!("Multi-read keys: {}", multi_read_keys.len()); + for (key, count) in multi_read_keys { + println!(" {key}: {count}"); + } + + let reads_by_relation = reads_by_relation + .into_iter() + .sorted_by_key(|(rel, count)| (-*count, *rel)) + .collect_vec(); + + println!("Reads by relation:"); + for (reltag, count) in reads_by_relation { + println!(" {reltag}: {count}"); + } + + Ok(()) +} diff --git a/pageserver/src/bin/pageserver.rs b/pageserver/src/bin/pageserver.rs index b92ff4ebf9..921c6a5092 100644 --- a/pageserver/src/bin/pageserver.rs +++ b/pageserver/src/bin/pageserver.rs @@ -53,12 +53,12 @@ project_build_tag!(BUILD_TAG); #[global_allocator] static GLOBAL: tikv_jemallocator::Jemalloc = tikv_jemallocator::Jemalloc; -// Configure jemalloc to sample allocations for profiles every 1 MB (1 << 20). -// TODO: disabled because concurrent CPU profiles cause seg faults. See: -// https://github.com/neondatabase/neon/issues/10225. 
-//#[allow(non_upper_case_globals)] -//#[export_name = "malloc_conf"] -//pub static malloc_conf: &[u8] = b"prof:true,prof_active:true,lg_prof_sample:20\0"; +/// Configure jemalloc to profile heap allocations by sampling stack traces every 2 MB (1 << 21). +/// This adds roughly 3% overhead for allocations on average, which is acceptable considering +/// performance-sensitive code will avoid allocations as far as possible anyway. +#[allow(non_upper_case_globals)] +#[export_name = "malloc_conf"] +pub static malloc_conf: &[u8] = b"prof:true,prof_active:true,lg_prof_sample:21\0"; const PID_FILE_NAME: &str = "pageserver.pid"; diff --git a/pageserver/src/bin/test_helper_slow_client_reads.rs b/pageserver/src/bin/test_helper_slow_client_reads.rs new file mode 100644 index 0000000000..c1ce332b6c --- /dev/null +++ b/pageserver/src/bin/test_helper_slow_client_reads.rs @@ -0,0 +1,65 @@ +use std::{ + io::{stdin, stdout, Read, Write}, + time::Duration, +}; + +use clap::Parser; +use pageserver_api::models::{PagestreamRequest, PagestreamTestRequest}; +use utils::{ + id::{TenantId, TimelineId}, + lsn::Lsn, +}; + +#[derive(clap::Parser)] +struct Args { + connstr: String, + tenant_id: TenantId, + timeline_id: TimelineId, +} + +#[tokio::main] +async fn main() -> anyhow::Result<()> { + let Args { + connstr, + tenant_id, + timeline_id, + } = Args::parse(); + let client = pageserver_client::page_service::Client::new(connstr).await?; + let client = client.pagestream(tenant_id, timeline_id).await?; + let (mut sender, _receiver) = client.split(); + + eprintln!("filling the pipe"); + let mut msg = 0; + loop { + msg += 1; + let fut = sender.send(pageserver_api::models::PagestreamFeMessage::Test( + PagestreamTestRequest { + hdr: PagestreamRequest { + reqid: 0, + request_lsn: Lsn(23), + not_modified_since: Lsn(23), + }, + batch_key: 42, + message: format!("message {}", msg), + }, + )); + let Ok(res) = tokio::time::timeout(Duration::from_secs(10), fut).await else { + eprintln!("pipe seems full"); 
+ break; + }; + let _: () = res?; + } + + let n = stdout().write(b"R")?; + assert_eq!(n, 1); + stdout().flush()?; + + eprintln!("waiting for signal to tell us to exit"); + + let mut buf = [0u8; 1]; + stdin().read_exact(&mut buf)?; + + eprintln!("termination signal received, exiting"); + + anyhow::Ok(()) +} diff --git a/pageserver/src/http/routes.rs b/pageserver/src/http/routes.rs index 60ef4c3702..33b2d04588 100644 --- a/pageserver/src/http/routes.rs +++ b/pageserver/src/http/routes.rs @@ -27,6 +27,7 @@ use pageserver_api::models::LocationConfigMode; use pageserver_api::models::LsnLease; use pageserver_api::models::LsnLeaseRequest; use pageserver_api::models::OffloadedTimelineInfo; +use pageserver_api::models::PageTraceEvent; use pageserver_api::models::ShardParameters; use pageserver_api::models::TenantConfigPatchRequest; use pageserver_api::models::TenantDetails; @@ -51,7 +52,9 @@ use pageserver_api::shard::TenantShardId; use remote_storage::DownloadError; use remote_storage::GenericRemoteStorage; use remote_storage::TimeTravelError; +use scopeguard::defer; use tenant_size_model::{svg::SvgBranchKind, SizeResult, StorageModel}; +use tokio::time::Instant; use tokio_util::io::StreamReader; use tokio_util::sync::CancellationToken; use tracing::*; @@ -97,8 +100,8 @@ use crate::tenant::{LogicalSizeCalculationCause, PageReconstructError}; use crate::DEFAULT_PG_VERSION; use crate::{disk_usage_eviction_task, tenant}; use pageserver_api::models::{ - CompactInfoResponse, StatusResponse, TenantConfigRequest, TenantInfo, TimelineCreateRequest, - TimelineGcRequest, TimelineInfo, + StatusResponse, TenantConfigRequest, TenantInfo, TimelineCreateRequest, TimelineGcRequest, + TimelineInfo, }; use utils::{ auth::SwappableJwtAuth, @@ -1521,6 +1524,71 @@ async fn timeline_gc_unblocking_handler( block_or_unblock_gc(request, false).await } +/// Traces GetPage@LSN requests for a timeline, and emits metadata in an efficient binary encoding. 
+/// Use the `pagectl page-trace` command to decode and analyze the output. +async fn timeline_page_trace_handler( + request: Request, + cancel: CancellationToken, +) -> Result, ApiError> { + let tenant_shard_id: TenantShardId = parse_request_param(&request, "tenant_shard_id")?; + let timeline_id: TimelineId = parse_request_param(&request, "timeline_id")?; + let state = get_state(&request); + check_permission(&request, None)?; + + let size_limit: usize = parse_query_param(&request, "size_limit_bytes")?.unwrap_or(1024 * 1024); + let time_limit_secs: u64 = parse_query_param(&request, "time_limit_secs")?.unwrap_or(5); + + // Convert size limit to event limit based on the serialized size of an event. The event size is + // fixed, as the default bincode serializer uses fixed-width integer encoding. + let event_size = bincode::serialize(&PageTraceEvent::default()) + .map_err(|err| ApiError::InternalServerError(err.into()))? + .len(); + let event_limit = size_limit / event_size; + + let timeline = + active_timeline_of_active_tenant(&state.tenant_manager, tenant_shard_id, timeline_id) + .await?; + + // Install a page trace, unless one is already in progress. We just use a buffered channel, + // which may 2x the memory usage in the worst case, but it's still bounded. + let (trace_tx, mut trace_rx) = tokio::sync::mpsc::channel(event_limit); + let cur = timeline.page_trace.load(); + let installed = cur.is_none() + && timeline + .page_trace + .compare_and_swap(cur, Some(Arc::new(trace_tx))) + .is_none(); + if !installed { + return Err(ApiError::Conflict("page trace already active".to_string())); + } + defer!(timeline.page_trace.store(None)); // uninstall on return + + // Collect the trace and return it to the client. We could stream the response, but this is + // simple and fine. + let mut body = Vec::with_capacity(size_limit); + let deadline = Instant::now() + Duration::from_secs(time_limit_secs); + + while body.len() < size_limit { + tokio::select! 
{ + event = trace_rx.recv() => { + let Some(event) = event else { + break; // shouldn't happen (sender doesn't close, unless timeline dropped) + }; + bincode::serialize_into(&mut body, &event) + .map_err(|err| ApiError::InternalServerError(err.into()))?; + } + _ = tokio::time::sleep_until(deadline) => break, // time limit reached + _ = cancel.cancelled() => return Err(ApiError::Cancelled), + } + } + + Ok(Response::builder() + .status(StatusCode::OK) + .header(header::CONTENT_TYPE, "application/octet-stream") + .body(hyper::Body::from(body)) + .unwrap()) +} + /// Adding a block is `POST ../block_gc`, removing a block is `POST ../unblock_gc`. /// /// Both are technically unsafe because they might fire off index uploads, thus they are POST. @@ -2052,15 +2120,7 @@ async fn timeline_compact_info_handler( let tenant = state .tenant_manager .get_attached_tenant_shard(tenant_shard_id)?; - let res = tenant.get_scheduled_compaction_tasks(timeline_id); - let mut resp = Vec::new(); - for item in res { - resp.push(CompactInfoResponse { - compact_key_range: item.compact_key_range, - compact_lsn_range: item.compact_lsn_range, - sub_compaction: item.sub_compaction, - }); - } + let resp = tenant.get_scheduled_compaction_tasks(timeline_id); json_response(StatusCode::OK, resp) } .instrument(info_span!("timeline_compact_info", tenant_id = %tenant_shard_id.tenant_id, shard_id = %tenant_shard_id.shard_slug(), %timeline_id)) @@ -3487,6 +3547,10 @@ pub fn make_router( "/v1/tenant/:tenant_shard_id/timeline/:timeline_id/unblock_gc", |r| api_handler(r, timeline_gc_unblocking_handler), ) + .get( + "/v1/tenant/:tenant_shard_id/timeline/:timeline_id/page_trace", + |r| api_handler(r, timeline_page_trace_handler), + ) .post("/v1/tenant/:tenant_shard_id/heatmap_upload", |r| { api_handler(r, secondary_upload_handler) }) diff --git a/pageserver/src/import_datadir.rs b/pageserver/src/import_datadir.rs index c061714010..a73fa5cec8 100644 --- a/pageserver/src/import_datadir.rs +++ 
b/pageserver/src/import_datadir.rs @@ -278,6 +278,8 @@ async fn import_wal( let mut walingest = WalIngest::new(tline, startpoint, ctx).await?; + let shard = vec![*tline.get_shard_identity()]; + while last_lsn <= endpoint { // FIXME: assume postgresql tli 1 for now let filename = XLogFileName(1, segno, WAL_SEGMENT_SIZE); @@ -314,10 +316,12 @@ async fn import_wal( if let Some((lsn, recdata)) = waldecoder.poll_decode()? { let interpreted = InterpretedWalRecord::from_bytes_filtered( recdata, - tline.get_shard_identity(), + &shard, lsn, tline.pg_version, - )?; + )? + .remove(tline.get_shard_identity()) + .unwrap(); walingest .ingest_record(interpreted, &mut modification, ctx) @@ -411,6 +415,7 @@ pub async fn import_wal_from_tar( let mut offset = start_lsn.segment_offset(WAL_SEGMENT_SIZE); let mut last_lsn = start_lsn; let mut walingest = WalIngest::new(tline, start_lsn, ctx).await?; + let shard = vec![*tline.get_shard_identity()]; // Ingest wal until end_lsn info!("importing wal until {}", end_lsn); @@ -459,10 +464,12 @@ pub async fn import_wal_from_tar( if let Some((lsn, recdata)) = waldecoder.poll_decode()? { let interpreted = InterpretedWalRecord::from_bytes_filtered( recdata, - tline.get_shard_identity(), + &shard, lsn, tline.pg_version, - )?; + )? 
+ .remove(tline.get_shard_identity()) + .unwrap(); walingest .ingest_record(interpreted, &mut modification, ctx) diff --git a/pageserver/src/metrics.rs b/pageserver/src/metrics.rs index a313a64080..3c4830e3cd 100644 --- a/pageserver/src/metrics.rs +++ b/pageserver/src/metrics.rs @@ -91,15 +91,6 @@ pub(crate) static STORAGE_TIME_GLOBAL: Lazy = Lazy::new(|| { .expect("failed to define a metric") }); -pub(crate) static READ_NUM_LAYERS_VISITED: Lazy = Lazy::new(|| { - register_histogram!( - "pageserver_layers_visited_per_read_global", - "Number of layers visited to reconstruct one key", - vec![1.0, 4.0, 8.0, 16.0, 32.0, 64.0, 128.0, 256.0, 512.0, 1024.0], - ) - .expect("failed to define a metric") -}); - pub(crate) static VEC_READ_NUM_LAYERS_VISITED: Lazy = Lazy::new(|| { register_histogram!( "pageserver_layers_visited_per_vectored_read_global", @@ -1233,117 +1224,189 @@ pub(crate) struct SmgrOpTimerInner { global_flush_in_progress_micros: IntCounter, per_timeline_flush_in_progress_micros: IntCounter, + throttling: Arc, + timings: SmgrOpTimerState, } +/// The stages of request processing are represented by the enum variants. +/// Used as part of [`SmgrOpTimerInner::timings`]. +/// +/// Request processing calls into the `SmgrOpTimer::observe_*` methods at the +/// transition points. +/// These methods bump relevant counters and then update [`SmgrOpTimerInner::timings`] +/// to the next state. +/// +/// Each request goes through every stage, in all configurations. +/// #[derive(Debug)] enum SmgrOpTimerState { Received { + // In the future, we may want to track the full time the request spent + // inside pageserver process (time spent in kernel buffers can't be tracked). + // `received_at` would be used for that. 
+ #[allow(dead_code)] received_at: Instant, }, - ThrottleDoneExecutionStarting { - received_at: Instant, + Throttling { throttle_started_at: Instant, - started_execution_at: Instant, }, + Batching { + throttle_done_at: Instant, + }, + Executing { + execution_started_at: Instant, + }, + Flushing, + // NB: when adding observation points, remember to update the Drop impl. } +// NB: when adding observation points, remember to update the Drop impl. +impl SmgrOpTimer { + /// See [`SmgrOpTimerState`] for more context. + pub(crate) fn observe_throttle_start(&mut self, at: Instant) { + let Some(inner) = self.0.as_mut() else { + return; + }; + let SmgrOpTimerState::Received { received_at: _ } = &mut inner.timings else { + return; + }; + inner.throttling.count_accounted_start.inc(); + inner.timings = SmgrOpTimerState::Throttling { + throttle_started_at: at, + }; + } + + /// See [`SmgrOpTimerState`] for more context. + pub(crate) fn observe_throttle_done(&mut self, throttle: ThrottleResult) { + let Some(inner) = self.0.as_mut() else { + return; + }; + let SmgrOpTimerState::Throttling { + throttle_started_at, + } = &inner.timings + else { + return; + }; + inner.throttling.count_accounted_finish.inc(); + match throttle { + ThrottleResult::NotThrottled { end } => { + inner.timings = SmgrOpTimerState::Batching { + throttle_done_at: end, + }; + } + ThrottleResult::Throttled { end } => { + // update metrics + inner.throttling.count_throttled.inc(); + inner + .throttling + .wait_time + .inc_by((end - *throttle_started_at).as_micros().try_into().unwrap()); + // state transition + inner.timings = SmgrOpTimerState::Batching { + throttle_done_at: end, + }; + } + } + } + + /// See [`SmgrOpTimerState`] for more context. 
+ pub(crate) fn observe_execution_start(&mut self, at: Instant) { + let Some(inner) = self.0.as_mut() else { + return; + }; + let SmgrOpTimerState::Batching { throttle_done_at } = &inner.timings else { + return; + }; + // update metrics + let batch = at - *throttle_done_at; + inner.global_batch_wait_time.observe(batch.as_secs_f64()); + inner + .per_timeline_batch_wait_time + .observe(batch.as_secs_f64()); + // state transition + inner.timings = SmgrOpTimerState::Executing { + execution_started_at: at, + } + } + + /// For all but the first caller, this is a no-op. + /// The first callers receives Some, subsequent ones None. + /// + /// See [`SmgrOpTimerState`] for more context. + pub(crate) fn observe_execution_end_flush_start( + &mut self, + at: Instant, + ) -> Option { + // NB: unlike the other observe_* methods, this one take()s. + #[allow(clippy::question_mark)] // maintain similar code pattern. + let Some(mut inner) = self.0.take() else { + return None; + }; + let SmgrOpTimerState::Executing { + execution_started_at, + } = &inner.timings + else { + return None; + }; + // update metrics + let execution = at - *execution_started_at; + inner + .global_execution_latency_histo + .observe(execution.as_secs_f64()); + if let Some(per_timeline_execution_latency_histo) = + &inner.per_timeline_execution_latency_histo + { + per_timeline_execution_latency_histo.observe(execution.as_secs_f64()); + } + + // state transition + inner.timings = SmgrOpTimerState::Flushing; + + // return the flush in progress object which + // will do the remaining metrics updates + let SmgrOpTimerInner { + global_flush_in_progress_micros, + per_timeline_flush_in_progress_micros, + .. + } = inner; + Some(SmgrOpFlushInProgress { + flush_started_at: at, + global_micros: global_flush_in_progress_micros, + per_timeline_micros: per_timeline_flush_in_progress_micros, + }) + } +} + +/// The last stage of request processing is serializing and flushing the request +/// into the TCP connection. 
We want to make slow flushes observable +/// _while they are occuring_, so this struct provides a wrapper method [`Self::measure`] +/// to periodically bump the metric. +/// +/// If in the future we decide that we're not interested in live updates, we can +/// add another `observe_*` method to [`SmgrOpTimer`], follow the existing pattern there, +/// and remove this struct from the code base. pub(crate) struct SmgrOpFlushInProgress { flush_started_at: Instant, global_micros: IntCounter, per_timeline_micros: IntCounter, } -impl SmgrOpTimer { - pub(crate) fn observe_throttle_done_execution_starting(&mut self, throttle: &ThrottleResult) { - let inner = self.0.as_mut().expect("other public methods consume self"); - match (&mut inner.timings, throttle) { - (SmgrOpTimerState::Received { received_at }, throttle) => match throttle { - ThrottleResult::NotThrottled { start } => { - inner.timings = SmgrOpTimerState::ThrottleDoneExecutionStarting { - received_at: *received_at, - throttle_started_at: *start, - started_execution_at: *start, - }; - } - ThrottleResult::Throttled { start, end } => { - inner.timings = SmgrOpTimerState::ThrottleDoneExecutionStarting { - received_at: *start, - throttle_started_at: *start, - started_execution_at: *end, - }; - } - }, - (x, _) => panic!("called in unexpected state: {x:?}"), - } - } - - pub(crate) fn observe_smgr_op_completion_and_start_flushing(mut self) -> SmgrOpFlushInProgress { - let (flush_start, inner) = self - .smgr_op_end() - .expect("this method consume self, and the only other caller is drop handler"); - let SmgrOpTimerInner { - global_flush_in_progress_micros, - per_timeline_flush_in_progress_micros, - .. - } = inner; - SmgrOpFlushInProgress { - flush_started_at: flush_start, - global_micros: global_flush_in_progress_micros, - per_timeline_micros: per_timeline_flush_in_progress_micros, - } - } - - /// Returns `None`` if this method has already been called, `Some` otherwise. 
- fn smgr_op_end(&mut self) -> Option<(Instant, SmgrOpTimerInner)> { - let inner = self.0.take()?; - - let now = Instant::now(); - - let batch; - let execution; - let throttle; - match inner.timings { - SmgrOpTimerState::Received { received_at } => { - batch = (now - received_at).as_secs_f64(); - // TODO: use label for dropped requests. - // This is quite rare in practice, only during tenant/pageservers shutdown. - throttle = Duration::ZERO; - execution = Duration::ZERO.as_secs_f64(); - } - SmgrOpTimerState::ThrottleDoneExecutionStarting { - received_at, - throttle_started_at, - started_execution_at, - } => { - batch = (throttle_started_at - received_at).as_secs_f64(); - throttle = started_execution_at - throttle_started_at; - execution = (now - started_execution_at).as_secs_f64(); - } - } - - // update time spent in batching - inner.global_batch_wait_time.observe(batch); - inner.per_timeline_batch_wait_time.observe(batch); - - // time spent in throttle metric is updated by throttle impl - let _ = throttle; - - // update metrics for execution latency - inner.global_execution_latency_histo.observe(execution); - if let Some(per_timeline_execution_latency_histo) = - &inner.per_timeline_execution_latency_histo - { - per_timeline_execution_latency_histo.observe(execution); - } - - Some((now, inner)) - } -} - impl Drop for SmgrOpTimer { fn drop(&mut self) { - self.smgr_op_end(); + // In case of early drop, update any of the remaining metrics with + // observations so that (started,finished) counter pairs balance out + // and all counters on the latency path have the the same number of + // observations. + // It's technically lying and it would be better if each metric had + // a separate label or similar for cancelled requests. + // But we don't have that right now and counter pairs balancing + // out is useful when using the metrics in panels and whatnot. 
+ let now = Instant::now(); + self.observe_throttle_start(now); + self.observe_throttle_done(ThrottleResult::NotThrottled { end: now }); + self.observe_execution_start(now); + self.observe_execution_end_flush_start(now); } } @@ -1354,12 +1417,12 @@ impl SmgrOpFlushInProgress { { let mut fut = std::pin::pin!(fut); - let now = Instant::now(); // Whenever observe_guard gets called, or dropped, // it adds the time elapsed since its last call to metrics. // Last call is tracked in `now`. let mut observe_guard = scopeguard::guard( || { + let now = Instant::now(); let elapsed = now - self.flush_started_at; self.global_micros .inc_by(u64::try_from(elapsed.as_micros()).unwrap()); @@ -1400,9 +1463,10 @@ pub enum SmgrQueryType { GetPageAtLsn, GetDbSize, GetSlruSegment, + #[cfg(feature = "testing")] + Test, } -#[derive(Debug)] pub(crate) struct SmgrQueryTimePerTimeline { global_started: [IntCounter; SmgrQueryType::COUNT], global_latency: [Histogram; SmgrQueryType::COUNT], @@ -1414,6 +1478,7 @@ pub(crate) struct SmgrQueryTimePerTimeline { per_timeline_flush_in_progress_micros: IntCounter, global_batch_wait_time: Histogram, per_timeline_batch_wait_time: Histogram, + throttling: Arc, } static SMGR_QUERY_STARTED_GLOBAL: Lazy = Lazy::new(|| { @@ -1619,7 +1684,11 @@ static PAGE_SERVICE_SMGR_BATCH_WAIT_TIME_GLOBAL: Lazy = Lazy::new(|| }); impl SmgrQueryTimePerTimeline { - pub(crate) fn new(tenant_shard_id: &TenantShardId, timeline_id: &TimelineId) -> Self { + pub(crate) fn new( + tenant_shard_id: &TenantShardId, + timeline_id: &TimelineId, + pagestream_throttle_metrics: Arc, + ) -> Self { let tenant_id = tenant_shard_id.tenant_id.to_string(); let shard_slug = format!("{}", tenant_shard_id.shard_slug()); let timeline_id = timeline_id.to_string(); @@ -1680,6 +1749,7 @@ impl SmgrQueryTimePerTimeline { per_timeline_flush_in_progress_micros, global_batch_wait_time, per_timeline_batch_wait_time, + throttling: pagestream_throttle_metrics, } } pub(crate) fn start_smgr_op(&self, op: 
SmgrQueryType, received_at: Instant) -> SmgrOpTimer { @@ -1695,88 +1765,24 @@ impl SmgrQueryTimePerTimeline { SmgrOpTimer(Some(SmgrOpTimerInner { global_execution_latency_histo: self.global_latency[op as usize].clone(), per_timeline_execution_latency_histo: per_timeline_latency_histo, - timings: SmgrOpTimerState::Received { received_at }, global_flush_in_progress_micros: self.global_flush_in_progress_micros.clone(), per_timeline_flush_in_progress_micros: self .per_timeline_flush_in_progress_micros .clone(), global_batch_wait_time: self.global_batch_wait_time.clone(), per_timeline_batch_wait_time: self.per_timeline_batch_wait_time.clone(), + throttling: self.throttling.clone(), + timings: SmgrOpTimerState::Received { received_at }, })) } + /// TODO: do something about this? seems odd, we have a similar call on SmgrOpTimer pub(crate) fn observe_getpage_batch_start(&self, batch_size: usize) { self.global_batch_size.observe(batch_size as f64); self.per_timeline_batch_size.observe(batch_size as f64); } } -#[cfg(test)] -mod smgr_query_time_tests { - use std::time::Instant; - - use pageserver_api::shard::TenantShardId; - use strum::IntoEnumIterator; - use utils::id::{TenantId, TimelineId}; - - // Regression test, we used hard-coded string constants before using an enum. 
- #[test] - fn op_label_name() { - use super::SmgrQueryType::*; - let expect: [(super::SmgrQueryType, &'static str); 5] = [ - (GetRelExists, "get_rel_exists"), - (GetRelSize, "get_rel_size"), - (GetPageAtLsn, "get_page_at_lsn"), - (GetDbSize, "get_db_size"), - (GetSlruSegment, "get_slru_segment"), - ]; - for (op, expect) in expect { - let actual: &'static str = op.into(); - assert_eq!(actual, expect); - } - } - - #[test] - fn basic() { - let ops: Vec<_> = super::SmgrQueryType::iter().collect(); - - for op in &ops { - let tenant_id = TenantId::generate(); - let timeline_id = TimelineId::generate(); - let metrics = super::SmgrQueryTimePerTimeline::new( - &TenantShardId::unsharded(tenant_id), - &timeline_id, - ); - - let get_counts = || { - let global: u64 = ops - .iter() - .map(|op| metrics.global_latency[*op as usize].get_sample_count()) - .sum(); - ( - global, - metrics.per_timeline_getpage_latency.get_sample_count(), - ) - }; - - let (pre_global, pre_per_tenant_timeline) = get_counts(); - assert_eq!(pre_per_tenant_timeline, 0); - - let timer = metrics.start_smgr_op(*op, Instant::now()); - drop(timer); - - let (post_global, post_per_tenant_timeline) = get_counts(); - if matches!(op, super::SmgrQueryType::GetPageAtLsn) { - // getpage ops are tracked per-timeline, others aren't - assert_eq!(post_per_tenant_timeline, 1); - } else { - assert_eq!(post_per_tenant_timeline, 0); - } - assert!(post_global > pre_global); - } - } -} - // keep in sync with control plane Go code so that we can validate // compute's basebackup_ms metric with our perspective in the context of SLI/SLO. 
static COMPUTE_STARTUP_BUCKETS: Lazy<[f64; 28]> = Lazy::new(|| { @@ -3572,9 +3578,7 @@ pub(crate) mod tenant_throttling { use once_cell::sync::Lazy; use utils::shard::TenantShardId; - use crate::tenant::{self}; - - struct GlobalAndPerTenantIntCounter { + pub(crate) struct GlobalAndPerTenantIntCounter { global: IntCounter, per_tenant: IntCounter, } @@ -3592,10 +3596,10 @@ pub(crate) mod tenant_throttling { } pub(crate) struct Metrics { - count_accounted_start: GlobalAndPerTenantIntCounter, - count_accounted_finish: GlobalAndPerTenantIntCounter, - wait_time: GlobalAndPerTenantIntCounter, - count_throttled: GlobalAndPerTenantIntCounter, + pub(super) count_accounted_start: GlobalAndPerTenantIntCounter, + pub(super) count_accounted_finish: GlobalAndPerTenantIntCounter, + pub(super) wait_time: GlobalAndPerTenantIntCounter, + pub(super) count_throttled: GlobalAndPerTenantIntCounter, } static COUNT_ACCOUNTED_START: Lazy = Lazy::new(|| { @@ -3730,26 +3734,6 @@ pub(crate) mod tenant_throttling { } } } - - impl tenant::throttle::Metric for Metrics { - #[inline(always)] - fn accounting_start(&self) { - self.count_accounted_start.inc(); - } - #[inline(always)] - fn accounting_finish(&self) { - self.count_accounted_finish.inc(); - } - #[inline(always)] - fn observe_throttling( - &self, - tenant::throttle::Observation { wait_time }: &tenant::throttle::Observation, - ) { - let val = u64::try_from(wait_time.as_micros()).unwrap(); - self.wait_time.inc_by(val); - self.count_throttled.inc(); - } - } } pub(crate) mod disk_usage_based_eviction { @@ -3894,7 +3878,6 @@ pub fn preinitialize_metrics(conf: &'static PageServerConf) { // histograms [ - &READ_NUM_LAYERS_VISITED, &VEC_READ_NUM_LAYERS_VISITED, &WAIT_LSN_TIME, &WAL_REDO_TIME, diff --git a/pageserver/src/page_service.rs b/pageserver/src/page_service.rs index 0c4a1b18f5..b14a44f9e3 100644 --- a/pageserver/src/page_service.rs +++ b/pageserver/src/page_service.rs @@ -67,6 +67,7 @@ use crate::tenant::PageReconstructError; use 
crate::tenant::Timeline; use crate::{basebackup, timed_after_cancellation}; use pageserver_api::key::rel_block_to_key; +use pageserver_api::models::PageTraceEvent; use pageserver_api::reltag::SlruKind; use postgres_ffi::pg_constants::DEFAULTTABLESPACE_OID; use postgres_ffi::BLCKSZ; @@ -554,37 +555,52 @@ struct BatchedGetPageRequest { timer: SmgrOpTimer, } +#[cfg(feature = "testing")] +struct BatchedTestRequest { + req: models::PagestreamTestRequest, + timer: SmgrOpTimer, +} + +/// NB: we only hold [`timeline::handle::WeakHandle`] inside this enum, +/// so that we don't keep the [`Timeline::gate`] open while the batch +/// is being built up inside the [`spsc_fold`] (pagestream pipelining). enum BatchedFeMessage { Exists { span: Span, timer: SmgrOpTimer, - shard: timeline::handle::Handle, + shard: timeline::handle::WeakHandle, req: models::PagestreamExistsRequest, }, Nblocks { span: Span, timer: SmgrOpTimer, - shard: timeline::handle::Handle, + shard: timeline::handle::WeakHandle, req: models::PagestreamNblocksRequest, }, GetPage { span: Span, - shard: timeline::handle::Handle, + shard: timeline::handle::WeakHandle, effective_request_lsn: Lsn, pages: smallvec::SmallVec<[BatchedGetPageRequest; 1]>, }, DbSize { span: Span, timer: SmgrOpTimer, - shard: timeline::handle::Handle, + shard: timeline::handle::WeakHandle, req: models::PagestreamDbSizeRequest, }, GetSlruSegment { span: Span, timer: SmgrOpTimer, - shard: timeline::handle::Handle, + shard: timeline::handle::WeakHandle, req: models::PagestreamGetSlruSegmentRequest, }, + #[cfg(feature = "testing")] + Test { + span: Span, + shard: timeline::handle::WeakHandle, + requests: Vec, + }, RespondError { span: Span, error: BatchedPageStreamError, @@ -592,40 +608,27 @@ enum BatchedFeMessage { } impl BatchedFeMessage { - async fn throttle_and_record_start_processing( - &mut self, - cancel: &CancellationToken, - ) -> Result<(), QueryError> { - let (shard, tokens, timers) = match self { - BatchedFeMessage::Exists { shard, 
timer, .. } - | BatchedFeMessage::Nblocks { shard, timer, .. } - | BatchedFeMessage::DbSize { shard, timer, .. } - | BatchedFeMessage::GetSlruSegment { shard, timer, .. } => { - ( - shard, - // 1 token is probably under-estimating because these - // request handlers typically do several Timeline::get calls. - 1, - itertools::Either::Left(std::iter::once(timer)), - ) + fn observe_execution_start(&mut self, at: Instant) { + match self { + BatchedFeMessage::Exists { timer, .. } + | BatchedFeMessage::Nblocks { timer, .. } + | BatchedFeMessage::DbSize { timer, .. } + | BatchedFeMessage::GetSlruSegment { timer, .. } => { + timer.observe_execution_start(at); } - BatchedFeMessage::GetPage { shard, pages, .. } => ( - shard, - pages.len(), - itertools::Either::Right(pages.iter_mut().map(|p| &mut p.timer)), - ), - BatchedFeMessage::RespondError { .. } => return Ok(()), - }; - let throttled = tokio::select! { - throttled = shard.pagestream_throttle.throttle(tokens) => { throttled } - _ = cancel.cancelled() => { - return Err(QueryError::Shutdown); + BatchedFeMessage::GetPage { pages, .. } => { + for page in pages { + page.timer.observe_execution_start(at); + } } - }; - for timer in timers { - timer.observe_throttle_done_execution_starting(&throttled); + #[cfg(feature = "testing")] + BatchedFeMessage::Test { requests, .. } => { + for req in requests { + req.timer.observe_execution_start(at); + } + } + BatchedFeMessage::RespondError { .. 
} => {} } - Ok(()) } } @@ -717,6 +720,26 @@ impl PageServerHandler { let neon_fe_msg = PagestreamFeMessage::parse(&mut copy_data_bytes.reader(), protocol_version)?; + // TODO: turn in to async closure once available to avoid repeating received_at + async fn record_op_start_and_throttle( + shard: &timeline::handle::Handle, + op: metrics::SmgrQueryType, + received_at: Instant, + ) -> Result { + // It's important to start the smgr op metric recorder as early as possible + // so that the _started counters are incremented before we do + // any serious waiting, e.g., for throttle, batching, or actual request handling. + let mut timer = shard.query_metrics.start_smgr_op(op, received_at); + let now = Instant::now(); + timer.observe_throttle_start(now); + let throttled = tokio::select! { + res = shard.pagestream_throttle.throttle(1, now) => res, + _ = shard.cancel.cancelled() => return Err(QueryError::Shutdown), + }; + timer.observe_throttle_done(throttled); + Ok(timer) + } + let batched_msg = match neon_fe_msg { PagestreamFeMessage::Exists(req) => { let span = tracing::info_span!(parent: parent_span, "handle_get_rel_exists_request", rel = %req.rel, req_lsn = %req.hdr.request_lsn); @@ -724,13 +747,16 @@ impl PageServerHandler { .get(tenant_id, timeline_id, ShardSelector::Zero) .instrument(span.clone()) // sets `shard_id` field .await?; - let timer = shard - .query_metrics - .start_smgr_op(metrics::SmgrQueryType::GetRelExists, received_at); + let timer = record_op_start_and_throttle( + &shard, + metrics::SmgrQueryType::GetRelExists, + received_at, + ) + .await?; BatchedFeMessage::Exists { span, timer, - shard, + shard: shard.downgrade(), req, } } @@ -740,13 +766,16 @@ impl PageServerHandler { .get(tenant_id, timeline_id, ShardSelector::Zero) .instrument(span.clone()) // sets `shard_id` field .await?; - let timer = shard - .query_metrics - .start_smgr_op(metrics::SmgrQueryType::GetRelSize, received_at); + let timer = record_op_start_and_throttle( + &shard, + 
metrics::SmgrQueryType::GetRelSize, + received_at, + ) + .await?; BatchedFeMessage::Nblocks { span, timer, - shard, + shard: shard.downgrade(), req, } } @@ -756,13 +785,16 @@ impl PageServerHandler { .get(tenant_id, timeline_id, ShardSelector::Zero) .instrument(span.clone()) // sets `shard_id` field .await?; - let timer = shard - .query_metrics - .start_smgr_op(metrics::SmgrQueryType::GetDbSize, received_at); + let timer = record_op_start_and_throttle( + &shard, + metrics::SmgrQueryType::GetDbSize, + received_at, + ) + .await?; BatchedFeMessage::DbSize { span, timer, - shard, + shard: shard.downgrade(), req, } } @@ -772,13 +804,16 @@ impl PageServerHandler { .get(tenant_id, timeline_id, ShardSelector::Zero) .instrument(span.clone()) // sets `shard_id` field .await?; - let timer = shard - .query_metrics - .start_smgr_op(metrics::SmgrQueryType::GetSlruSegment, received_at); + let timer = record_op_start_and_throttle( + &shard, + metrics::SmgrQueryType::GetSlruSegment, + received_at, + ) + .await?; BatchedFeMessage::GetSlruSegment { span, timer, - shard, + shard: shard.downgrade(), req, } } @@ -823,13 +858,14 @@ impl PageServerHandler { } }; - // It's important to start the timer before waiting for the LSN - // so that the _started counters are incremented before we do - // any serious waiting, e.g., for LSNs. 
- let timer = shard - .query_metrics - .start_smgr_op(metrics::SmgrQueryType::GetPageAtLsn, received_at); + let timer = record_op_start_and_throttle( + &shard, + metrics::SmgrQueryType::GetPageAtLsn, + received_at, + ) + .await?; + // We're holding the Handle let effective_request_lsn = match Self::wait_or_get_last_lsn( &shard, req.hdr.request_lsn, @@ -847,11 +883,27 @@ impl PageServerHandler { }; BatchedFeMessage::GetPage { span, - shard, + shard: shard.downgrade(), effective_request_lsn, pages: smallvec::smallvec![BatchedGetPageRequest { req, timer }], } } + #[cfg(feature = "testing")] + PagestreamFeMessage::Test(req) => { + let span = tracing::info_span!(parent: parent_span, "handle_test_request"); + let shard = timeline_handles + .get(tenant_id, timeline_id, ShardSelector::Zero) + .instrument(span.clone()) // sets `shard_id` field + .await?; + let timer = + record_op_start_and_throttle(&shard, metrics::SmgrQueryType::Test, received_at) + .await?; + BatchedFeMessage::Test { + span, + shard: shard.downgrade(), + requests: vec![BatchedTestRequest { req, timer }], + } + } }; Ok(Some(batched_msg)) } @@ -893,9 +945,7 @@ impl PageServerHandler { assert_eq!(accum_pages.len(), max_batch_size.get()); return false; } - if (accum_shard.tenant_shard_id, accum_shard.timeline_id) - != (this_shard.tenant_shard_id, this_shard.timeline_id) - { + if !accum_shard.is_same_handle_as(&this_shard) { trace!(%accum_lsn, %this_lsn, "stopping batching because timeline object mismatch"); // TODO: we _could_ batch & execute each shard seperately (and in parallel). // But the current logic for keeping responses in order does not support that. @@ -914,6 +964,44 @@ impl PageServerHandler { accum_pages.extend(this_pages); Ok(()) } + #[cfg(feature = "testing")] + ( + Ok(BatchedFeMessage::Test { + shard: accum_shard, + requests: accum_requests, + .. + }), + BatchedFeMessage::Test { + shard: this_shard, + requests: this_requests, + .. 
+ }, + ) if (|| { + assert!(this_requests.len() == 1); + if accum_requests.len() >= max_batch_size.get() { + trace!(%max_batch_size, "stopping batching because of batch size"); + assert_eq!(accum_requests.len(), max_batch_size.get()); + return false; + } + if !accum_shard.is_same_handle_as(&this_shard) { + trace!("stopping batching because timeline object mismatch"); + // TODO: we _could_ batch & execute each shard seperately (and in parallel). + // But the current logic for keeping responses in order does not support that. + return false; + } + let this_batch_key = this_requests[0].req.batch_key; + let accum_batch_key = accum_requests[0].req.batch_key; + if this_requests[0].req.batch_key != accum_requests[0].req.batch_key { + trace!(%accum_batch_key, %this_batch_key, "stopping batching because batch key changed"); + return false; + } + true + })() => + { + // ok to batch + accum_requests.extend(this_requests); + Ok(()) + } // something batched already but this message is unbatchable (_, this_msg) => { // by default, don't continue batching @@ -934,6 +1022,13 @@ impl PageServerHandler { where IO: AsyncRead + AsyncWrite + Send + Sync + Unpin, { + let started_at = Instant::now(); + let batch = { + let mut batch = batch; + batch.observe_execution_start(started_at); + batch + }; + // invoke handler function let (handler_results, span): ( Vec>, @@ -948,7 +1043,7 @@ impl PageServerHandler { fail::fail_point!("ps::handle-pagerequest-message::exists"); ( vec![self - .handle_get_rel_exists_request(&shard, &req, ctx) + .handle_get_rel_exists_request(&*shard.upgrade()?, &req, ctx) .instrument(span.clone()) .await .map(|msg| (msg, timer)) @@ -965,7 +1060,7 @@ impl PageServerHandler { fail::fail_point!("ps::handle-pagerequest-message::nblocks"); ( vec![self - .handle_get_nblocks_request(&shard, &req, ctx) + .handle_get_nblocks_request(&*shard.upgrade()?, &req, ctx) .instrument(span.clone()) .await .map(|msg| (msg, timer)) @@ -986,7 +1081,7 @@ impl PageServerHandler { 
trace!(npages, "handling getpage request"); let res = self .handle_get_page_at_lsn_request_batched( - &shard, + &*shard.upgrade()?, effective_request_lsn, pages, ctx, @@ -1008,7 +1103,7 @@ impl PageServerHandler { fail::fail_point!("ps::handle-pagerequest-message::dbsize"); ( vec![self - .handle_db_size_request(&shard, &req, ctx) + .handle_db_size_request(&*shard.upgrade()?, &req, ctx) .instrument(span.clone()) .await .map(|msg| (msg, timer)) @@ -1025,7 +1120,7 @@ impl PageServerHandler { fail::fail_point!("ps::handle-pagerequest-message::slrusegment"); ( vec![self - .handle_get_slru_segment_request(&shard, &req, ctx) + .handle_get_slru_segment_request(&*shard.upgrade()?, &req, ctx) .instrument(span.clone()) .await .map(|msg| (msg, timer)) @@ -1033,6 +1128,27 @@ impl PageServerHandler { span, ) } + #[cfg(feature = "testing")] + BatchedFeMessage::Test { + span, + shard, + requests, + } => { + fail::fail_point!("ps::handle-pagerequest-message::test"); + ( + { + let npages = requests.len(); + trace!(npages, "handling getpage request"); + let res = self + .handle_test_request_batch(&*shard.upgrade()?, requests, ctx) + .instrument(span.clone()) + .await; + assert_eq!(res.len(), npages); + res + }, + span, + ) + } BatchedFeMessage::RespondError { span, error } => { // We've already decided to respond with an error, so we don't need to // call the handler. @@ -1100,8 +1216,11 @@ impl PageServerHandler { // The timer's underlying metric is used for a storage-internal latency SLO and // we don't want to include latency in it that we can't control. // And as pointed out above, in this case, we don't control the time that flush will take. 
- let flushing_timer = - timer.map(|timer| timer.observe_smgr_op_completion_and_start_flushing()); + let flushing_timer = timer.map(|mut timer| { + timer + .observe_execution_end_flush_start(Instant::now()) + .expect("we are the first caller") + }); // what we want to do let flush_fut = pgb_writer.flush(); @@ -1255,7 +1374,7 @@ impl PageServerHandler { Ok(msg) => msg, Err(e) => break e, }; - let mut msg = match msg { + let msg = match msg { Some(msg) => msg, None => { debug!("pagestream subprotocol end observed"); @@ -1263,10 +1382,6 @@ impl PageServerHandler { } }; - if let Err(cancelled) = msg.throttle_and_record_start_processing(&self.cancel).await { - break cancelled; - } - let err = self .pagesteam_handle_batched_message(pgb_writer, msg, &cancel, protocol_version, ctx) .await; @@ -1426,15 +1541,12 @@ impl PageServerHandler { return Ok(()); } }; - let mut batch = match batch { + let batch = match batch { Ok(batch) => batch, Err(e) => { return Err(e); } }; - batch - .throttle_and_record_start_processing(&self.cancel) - .await?; self.pagesteam_handle_batched_message( pgb_writer, batch, @@ -1702,6 +1814,20 @@ impl PageServerHandler { .query_metrics .observe_getpage_batch_start(requests.len()); + // If a page trace is running, submit an event for this request. + if let Some(page_trace) = timeline.page_trace.load().as_ref() { + let time = SystemTime::now(); + for batch in &requests { + let key = rel_block_to_key(batch.req.rel, batch.req.blkno).to_compact(); + // Ignore error (trace buffer may be full or tracer may have disconnected). + _ = page_trace.try_send(PageTraceEvent { + key, + effective_lsn, + time, + }); + } + } + let results = timeline .get_rel_page_at_lsn_batched( requests.iter().map(|p| (&p.req.rel, &p.req.blkno)), @@ -1760,6 +1886,51 @@ impl PageServerHandler { )) } + // NB: this impl mimics what we do for batched getpage requests. 
+ #[cfg(feature = "testing")] + #[instrument(skip_all, fields(shard_id))] + async fn handle_test_request_batch( + &mut self, + timeline: &Timeline, + requests: Vec, + _ctx: &RequestContext, + ) -> Vec> { + // real requests would do something with the timeline + let mut results = Vec::with_capacity(requests.len()); + for _req in requests.iter() { + tokio::task::yield_now().await; + + results.push({ + if timeline.cancel.is_cancelled() { + Err(PageReconstructError::Cancelled) + } else { + Ok(()) + } + }); + } + + // TODO: avoid creating the new Vec here + Vec::from_iter( + requests + .into_iter() + .zip(results.into_iter()) + .map(|(req, res)| { + res.map(|()| { + ( + PagestreamBeMessage::Test(models::PagestreamTestResponse { + req: req.req.clone(), + }), + req.timer, + ) + }) + .map_err(|e| BatchedPageStreamError { + err: PageStreamError::from(e), + req: req.req.hdr, + }) + }), + ) + } + /// Note on "fullbackup": /// Full basebackups should only be used for debugging purposes. /// Originally, it was introduced to enable breaking storage format changes, @@ -2375,6 +2546,14 @@ impl From for QueryError { } } +impl From for QueryError { + fn from(e: crate::tenant::timeline::handle::HandleUpgradeError) -> Self { + match e { + crate::tenant::timeline::handle::HandleUpgradeError::ShutDown => QueryError::Shutdown, + } + } +} + fn set_tracing_field_shard_id(timeline: &Timeline) { debug_assert_current_span_has_tenant_and_timeline_id_no_shard_id(); tracing::Span::current().record( diff --git a/pageserver/src/tenant.rs b/pageserver/src/tenant.rs index 8e61d09de7..f6d758ad22 100644 --- a/pageserver/src/tenant.rs +++ b/pageserver/src/tenant.rs @@ -21,6 +21,7 @@ use enumset::EnumSet; use futures::stream::FuturesUnordered; use futures::StreamExt; use pageserver_api::models; +use pageserver_api::models::CompactInfoResponse; use pageserver_api::models::LsnLease; use pageserver_api::models::TimelineArchivalState; use pageserver_api::models::TimelineState; @@ -37,21 +38,17 @@ use 
remote_timeline_client::manifest::{ }; use remote_timeline_client::UploadQueueNotReadyError; use std::collections::BTreeMap; -use std::collections::VecDeque; use std::fmt; use std::future::Future; use std::sync::atomic::AtomicBool; use std::sync::Weak; use std::time::SystemTime; use storage_broker::BrokerClientChannel; -use timeline::compaction::GcCompactJob; -use timeline::compaction::ScheduledCompactionTask; +use timeline::compaction::GcCompactionQueue; use timeline::import_pgdata; use timeline::offload::offload_timeline; use timeline::offload::OffloadError; -use timeline::CompactFlags; use timeline::CompactOptions; -use timeline::CompactionError; use timeline::ShutdownMode; use tokio::io::BufReader; use tokio::sync::watch; @@ -347,10 +344,8 @@ pub struct Tenant { /// Overhead of mutex is acceptable because compaction is done with a multi-second period. compaction_circuit_breaker: std::sync::Mutex, - /// Scheduled compaction tasks. Currently, this can only be populated by triggering - /// a manual gc-compaction from the manual compaction API. - scheduled_compaction_tasks: - std::sync::Mutex>>, + /// Scheduled gc-compaction tasks. + scheduled_compaction_tasks: std::sync::Mutex>>, /// If the tenant is in Activating state, notify this to encourage it /// to proceed to Active as soon as possible, rather than waiting for lazy @@ -370,8 +365,9 @@ pub struct Tenant { /// Throttle applied at the top of [`Timeline::get`]. /// All [`Tenant::timelines`] of a given [`Tenant`] instance share the same [`throttle::Throttle`] instance. - pub(crate) pagestream_throttle: - Arc>, + pub(crate) pagestream_throttle: Arc, + + pub(crate) pagestream_throttle_metrics: Arc, /// An ongoing timeline detach concurrency limiter. 
/// @@ -1692,6 +1688,7 @@ impl Tenant { TimelineResources { remote_client, pagestream_throttle: self.pagestream_throttle.clone(), + pagestream_throttle_metrics: self.pagestream_throttle_metrics.clone(), l0_flush_global_state: self.l0_flush_global_state.clone(), }, LoadTimelineCause::Attach, @@ -2997,104 +2994,18 @@ impl Tenant { if has_pending_l0_compaction_task { Some(true) } else { - let mut has_pending_scheduled_compaction_task; - let next_scheduled_compaction_task = { - let mut guard = self.scheduled_compaction_tasks.lock().unwrap(); - if let Some(tline_pending_tasks) = guard.get_mut(timeline_id) { - if !tline_pending_tasks.is_empty() { - info!( - "{} tasks left in the compaction schedule queue", - tline_pending_tasks.len() - ); - } - let next_task = tline_pending_tasks.pop_front(); - has_pending_scheduled_compaction_task = !tline_pending_tasks.is_empty(); - next_task - } else { - has_pending_scheduled_compaction_task = false; - None - } + let queue = { + let guard = self.scheduled_compaction_tasks.lock().unwrap(); + guard.get(timeline_id).cloned() }; - if let Some(mut next_scheduled_compaction_task) = next_scheduled_compaction_task - { - if !next_scheduled_compaction_task - .options - .flags - .contains(CompactFlags::EnhancedGcBottomMostCompaction) - { - warn!("ignoring scheduled compaction task: scheduled task must be gc compaction: {:?}", next_scheduled_compaction_task.options); - } else if next_scheduled_compaction_task.options.sub_compaction { - info!("running scheduled enhanced gc bottom-most compaction with sub-compaction, splitting compaction jobs"); - let jobs: Vec = timeline - .gc_compaction_split_jobs( - GcCompactJob::from_compact_options( - next_scheduled_compaction_task.options.clone(), - ), - next_scheduled_compaction_task - .options - .sub_compaction_max_job_size_mb, - ) - .await - .map_err(CompactionError::Other)?; - if jobs.is_empty() { - info!("no jobs to run, skipping scheduled compaction task"); - } else { - 
has_pending_scheduled_compaction_task = true; - let jobs_len = jobs.len(); - let mut guard = self.scheduled_compaction_tasks.lock().unwrap(); - let tline_pending_tasks = guard.entry(*timeline_id).or_default(); - for (idx, job) in jobs.into_iter().enumerate() { - // Unfortunately we need to convert the `GcCompactJob` back to `CompactionOptions` - // until we do further refactors to allow directly call `compact_with_gc`. - let mut flags: EnumSet = EnumSet::default(); - flags |= CompactFlags::EnhancedGcBottomMostCompaction; - if job.dry_run { - flags |= CompactFlags::DryRun; - } - let options = CompactOptions { - flags, - sub_compaction: false, - compact_key_range: Some(job.compact_key_range.into()), - compact_lsn_range: Some(job.compact_lsn_range.into()), - sub_compaction_max_job_size_mb: None, - }; - tline_pending_tasks.push_back(if idx == jobs_len - 1 { - ScheduledCompactionTask { - options, - // The last job in the queue sends the signal and releases the gc guard - result_tx: next_scheduled_compaction_task - .result_tx - .take(), - gc_block: next_scheduled_compaction_task - .gc_block - .take(), - } - } else { - ScheduledCompactionTask { - options, - result_tx: None, - gc_block: None, - } - }); - } - info!("scheduled enhanced gc bottom-most compaction with sub-compaction, split into {} jobs", jobs_len); - } - } else { - let _ = timeline - .compact_with_options( - cancel, - next_scheduled_compaction_task.options, - ctx, - ) - .instrument(info_span!("scheduled_compact_timeline", %timeline_id)) - .await?; - if let Some(tx) = next_scheduled_compaction_task.result_tx.take() { - // TODO: we can send compaction statistics in the future - tx.send(()).ok(); - } - } + if let Some(queue) = queue { + let has_pending_tasks = queue + .iteration(cancel, ctx, &self.gc_block, timeline) + .await?; + Some(has_pending_tasks) + } else { + Some(false) } - Some(has_pending_scheduled_compaction_task) } } else { None @@ -3124,34 +3035,32 @@ impl Tenant { } /// Cancel scheduled compaction 
tasks - pub(crate) fn cancel_scheduled_compaction( - &self, - timeline_id: TimelineId, - ) -> Vec { + pub(crate) fn cancel_scheduled_compaction(&self, timeline_id: TimelineId) { let mut guard = self.scheduled_compaction_tasks.lock().unwrap(); - if let Some(tline_pending_tasks) = guard.get_mut(&timeline_id) { - let current_tline_pending_tasks = std::mem::take(tline_pending_tasks); - current_tline_pending_tasks.into_iter().collect() - } else { - Vec::new() + if let Some(q) = guard.get_mut(&timeline_id) { + q.cancel_scheduled(); } } pub(crate) fn get_scheduled_compaction_tasks( &self, timeline_id: TimelineId, - ) -> Vec { - use itertools::Itertools; - let guard = self.scheduled_compaction_tasks.lock().unwrap(); - guard - .get(&timeline_id) - .map(|tline_pending_tasks| { - tline_pending_tasks - .iter() - .map(|x| x.options.clone()) - .collect_vec() - }) - .unwrap_or_default() + ) -> Vec { + let res = { + let guard = self.scheduled_compaction_tasks.lock().unwrap(); + guard.get(&timeline_id).map(|q| q.remaining_jobs()) + }; + let Some((running, remaining)) = res else { + return Vec::new(); + }; + let mut result = Vec::new(); + if let Some((id, running)) = running { + result.extend(running.into_compact_info_resp(id, true)); + } + for (id, job) in remaining { + result.extend(job.into_compact_info_resp(id, false)); + } + result } /// Schedule a compaction task for a timeline. 
@@ -3160,20 +3069,12 @@ impl Tenant { timeline_id: TimelineId, options: CompactOptions, ) -> anyhow::Result> { - let gc_guard = match self.gc_block.start().await { - Ok(guard) => guard, - Err(e) => { - bail!("cannot run gc-compaction because gc is blocked: {}", e); - } - }; let (tx, rx) = tokio::sync::oneshot::channel(); let mut guard = self.scheduled_compaction_tasks.lock().unwrap(); - let tline_pending_tasks = guard.entry(timeline_id).or_default(); - tline_pending_tasks.push_back(ScheduledCompactionTask { - options, - result_tx: Some(tx), - gc_block: Some(gc_guard), - }); + let q = guard + .entry(timeline_id) + .or_insert_with(|| Arc::new(GcCompactionQueue::new())); + q.schedule_manual_compaction(options, Some(tx)); Ok(rx) } @@ -4093,6 +3994,9 @@ impl Tenant { Ok(timeline) } + /// [`Tenant::shutdown`] must be called before dropping the returned [`Tenant`] object + /// to ensure proper cleanup of background tasks and metrics. + // // Allow too_many_arguments because a constructor's argument list naturally grows with the // number of attributes in the struct: breaking these out into a builder wouldn't be helpful. 
#[allow(clippy::too_many_arguments)] @@ -4201,8 +4105,10 @@ impl Tenant { gate: Gate::default(), pagestream_throttle: Arc::new(throttle::Throttle::new( Tenant::get_pagestream_throttle_config(conf, &attached_conf.tenant_conf), - crate::metrics::tenant_throttling::Metrics::new(&tenant_shard_id), )), + pagestream_throttle_metrics: Arc::new( + crate::metrics::tenant_throttling::Pagestream::new(&tenant_shard_id), + ), tenant_conf: Arc::new(ArcSwap::from_pointee(attached_conf)), ongoing_timeline_detach: std::sync::Mutex::default(), gc_block: Default::default(), @@ -5109,6 +5015,7 @@ impl Tenant { TimelineResources { remote_client: self.build_timeline_remote_client(timeline_id), pagestream_throttle: self.pagestream_throttle.clone(), + pagestream_throttle_metrics: self.pagestream_throttle_metrics.clone(), l0_flush_global_state: self.l0_flush_global_state.clone(), } } @@ -5783,7 +5690,7 @@ mod tests { use bytes::{Bytes, BytesMut}; use hex_literal::hex; use itertools::Itertools; - use pageserver_api::key::{Key, AUX_KEY_PREFIX, NON_INHERITED_RANGE}; + use pageserver_api::key::{Key, AUX_KEY_PREFIX, NON_INHERITED_RANGE, RELATION_SIZE_PREFIX}; use pageserver_api::keyspace::KeySpace; use pageserver_api::models::{CompactionAlgorithm, CompactionAlgorithmSettings}; use pageserver_api::value::Value; @@ -7842,7 +7749,18 @@ mod tests { let base_key = Key::from_hex("620000000033333333444444445500000000").unwrap(); let base_key_child = Key::from_hex("620000000033333333444444445500000001").unwrap(); let base_key_nonexist = Key::from_hex("620000000033333333444444445500000002").unwrap(); + let base_key_overwrite = Key::from_hex("620000000033333333444444445500000003").unwrap(); + + let base_inherited_key = Key::from_hex("610000000033333333444444445500000000").unwrap(); + let base_inherited_key_child = + Key::from_hex("610000000033333333444444445500000001").unwrap(); + let base_inherited_key_nonexist = + Key::from_hex("610000000033333333444444445500000002").unwrap(); + let 
base_inherited_key_overwrite = + Key::from_hex("610000000033333333444444445500000003").unwrap(); + assert_eq!(base_key.field1, AUX_KEY_PREFIX); // in case someone accidentally changed the prefix... + assert_eq!(base_inherited_key.field1, RELATION_SIZE_PREFIX); let tline = tenant .create_test_timeline_with_layers( @@ -7851,7 +7769,18 @@ mod tests { DEFAULT_PG_VERSION, &ctx, Vec::new(), // delta layers - vec![(Lsn(0x20), vec![(base_key, test_img("metadata key 1"))])], // image layers + vec![( + Lsn(0x20), + vec![ + (base_inherited_key, test_img("metadata inherited key 1")), + ( + base_inherited_key_overwrite, + test_img("metadata key overwrite 1a"), + ), + (base_key, test_img("metadata key 1")), + (base_key_overwrite, test_img("metadata key overwrite 1b")), + ], + )], // image layers Lsn(0x20), // it's fine to not advance LSN to 0x30 while using 0x30 to get below because `get_vectored_impl` does not wait for LSN ) .await?; @@ -7865,7 +7794,18 @@ mod tests { Vec::new(), // delta layers vec![( Lsn(0x30), - vec![(base_key_child, test_img("metadata key 2"))], + vec![ + ( + base_inherited_key_child, + test_img("metadata inherited key 2"), + ), + ( + base_inherited_key_overwrite, + test_img("metadata key overwrite 2a"), + ), + (base_key_child, test_img("metadata key 2")), + (base_key_overwrite, test_img("metadata key overwrite 2b")), + ], )], // image layers Lsn(0x30), ) @@ -7887,6 +7827,26 @@ mod tests { get_vectored_impl_wrapper(&tline, base_key_nonexist, lsn, &ctx).await?, None ); + assert_eq!( + get_vectored_impl_wrapper(&tline, base_key_overwrite, lsn, &ctx).await?, + Some(test_img("metadata key overwrite 1b")) + ); + assert_eq!( + get_vectored_impl_wrapper(&tline, base_inherited_key, lsn, &ctx).await?, + Some(test_img("metadata inherited key 1")) + ); + assert_eq!( + get_vectored_impl_wrapper(&tline, base_inherited_key_child, lsn, &ctx).await?, + None + ); + assert_eq!( + get_vectored_impl_wrapper(&tline, base_inherited_key_nonexist, lsn, &ctx).await?, + None + ); + 
assert_eq!( + get_vectored_impl_wrapper(&tline, base_inherited_key_overwrite, lsn, &ctx).await?, + Some(test_img("metadata key overwrite 1a")) + ); // test vectored get on child timeline assert_eq!( @@ -7901,6 +7861,82 @@ mod tests { get_vectored_impl_wrapper(&child, base_key_nonexist, lsn, &ctx).await?, None ); + assert_eq!( + get_vectored_impl_wrapper(&child, base_inherited_key, lsn, &ctx).await?, + Some(test_img("metadata inherited key 1")) + ); + assert_eq!( + get_vectored_impl_wrapper(&child, base_inherited_key_child, lsn, &ctx).await?, + Some(test_img("metadata inherited key 2")) + ); + assert_eq!( + get_vectored_impl_wrapper(&child, base_inherited_key_nonexist, lsn, &ctx).await?, + None + ); + assert_eq!( + get_vectored_impl_wrapper(&child, base_key_overwrite, lsn, &ctx).await?, + Some(test_img("metadata key overwrite 2b")) + ); + assert_eq!( + get_vectored_impl_wrapper(&child, base_inherited_key_overwrite, lsn, &ctx).await?, + Some(test_img("metadata key overwrite 2a")) + ); + + // test vectored scan on parent timeline + let mut reconstruct_state = ValuesReconstructState::new(); + let res = tline + .get_vectored_impl( + KeySpace::single(Key::metadata_key_range()), + lsn, + &mut reconstruct_state, + &ctx, + ) + .await?; + + assert_eq!( + res.into_iter() + .map(|(k, v)| (k, v.unwrap())) + .collect::>(), + vec![ + (base_inherited_key, test_img("metadata inherited key 1")), + ( + base_inherited_key_overwrite, + test_img("metadata key overwrite 1a") + ), + (base_key, test_img("metadata key 1")), + (base_key_overwrite, test_img("metadata key overwrite 1b")), + ] + ); + + // test vectored scan on child timeline + let mut reconstruct_state = ValuesReconstructState::new(); + let res = child + .get_vectored_impl( + KeySpace::single(Key::metadata_key_range()), + lsn, + &mut reconstruct_state, + &ctx, + ) + .await?; + + assert_eq!( + res.into_iter() + .map(|(k, v)| (k, v.unwrap())) + .collect::>(), + vec![ + (base_inherited_key, test_img("metadata inherited key 1")), + 
( + base_inherited_key_child, + test_img("metadata inherited key 2") + ), + ( + base_inherited_key_overwrite, + test_img("metadata key overwrite 2a") + ), + (base_key_child, test_img("metadata key 2")), + (base_key_overwrite, test_img("metadata key overwrite 2b")), + ] + ); Ok(()) } diff --git a/pageserver/src/tenant/config.rs b/pageserver/src/tenant/config.rs index d54dded778..edf2e6a3aa 100644 --- a/pageserver/src/tenant/config.rs +++ b/pageserver/src/tenant/config.rs @@ -11,7 +11,7 @@ pub(crate) use pageserver_api::config::TenantConfigToml as TenantConf; use pageserver_api::models::CompactionAlgorithmSettings; use pageserver_api::models::EvictionPolicy; -use pageserver_api::models::{self, TenantConfigPatch, ThrottleConfig}; +use pageserver_api::models::{self, TenantConfigPatch}; use pageserver_api::shard::{ShardCount, ShardIdentity, ShardNumber, ShardStripeSize}; use serde::de::IntoDeserializer; use serde::{Deserialize, Serialize}; @@ -597,7 +597,7 @@ impl From for models::TenantConfig { .map(humantime), heatmap_period: value.heatmap_period.map(humantime), lazy_slru_download: value.lazy_slru_download, - timeline_get_throttle: value.timeline_get_throttle.map(ThrottleConfig::from), + timeline_get_throttle: value.timeline_get_throttle, image_layer_creation_check_threshold: value.image_layer_creation_check_threshold, lsn_lease_length: value.lsn_lease_length.map(humantime), lsn_lease_length_for_ts: value.lsn_lease_length_for_ts.map(humantime), diff --git a/pageserver/src/tenant/disk_btree.rs b/pageserver/src/tenant/disk_btree.rs index c77342b144..bb9df020b5 100644 --- a/pageserver/src/tenant/disk_btree.rs +++ b/pageserver/src/tenant/disk_btree.rs @@ -84,17 +84,17 @@ impl Value { fn to_u64(self) -> u64 { let b = &self.0; - (b[0] as u64) << 32 - | (b[1] as u64) << 24 - | (b[2] as u64) << 16 - | (b[3] as u64) << 8 + ((b[0] as u64) << 32) + | ((b[1] as u64) << 24) + | ((b[2] as u64) << 16) + | ((b[3] as u64) << 8) | b[4] as u64 } fn to_blknum(self) -> u32 { let b = 
&self.0; assert!(b[0] == 0x80); - (b[1] as u32) << 24 | (b[2] as u32) << 16 | (b[3] as u32) << 8 | b[4] as u32 + ((b[1] as u32) << 24) | ((b[2] as u32) << 16) | ((b[3] as u32) << 8) | b[4] as u32 } } diff --git a/pageserver/src/tenant/metadata.rs b/pageserver/src/tenant/metadata.rs index 24440d4b35..d281eb305f 100644 --- a/pageserver/src/tenant/metadata.rs +++ b/pageserver/src/tenant/metadata.rs @@ -320,7 +320,6 @@ impl TimelineMetadata { // Checksums make it awkward to build a valid instance by hand. This helper // provides a TimelineMetadata with a valid checksum in its header. - #[cfg(test)] pub fn example() -> Self { let instance = Self::new( "0/16960E8".parse::().unwrap(), diff --git a/pageserver/src/tenant/remote_timeline_client.rs b/pageserver/src/tenant/remote_timeline_client.rs index 813111245d..a006647785 100644 --- a/pageserver/src/tenant/remote_timeline_client.rs +++ b/pageserver/src/tenant/remote_timeline_client.rs @@ -63,22 +63,18 @@ //! The contract between client and its user is that the user is responsible of //! scheduling operations in an order that keeps the remote consistent as //! described above. +//! //! From the user's perspective, the operations are executed sequentially. //! Internally, the client knows which operations can be performed in parallel, //! and which operations act like a "barrier" that require preceding operations //! to finish. The calling code just needs to call the schedule-functions in the //! correct order, and the client will parallelize the operations in a way that -//! is safe. -//! -//! The caller should be careful with deletion, though. They should not delete -//! local files that have been scheduled for upload but not yet finished uploading. -//! Otherwise the upload will fail. To wait for an upload to finish, use -//! the 'wait_completion' function (more on that later.) +//! is safe. For more details, see `UploadOp::can_bypass`. //! //! All of this relies on the following invariants: //! //! 
- We rely on read-after write consistency in the remote storage. -//! - Layer files are immutable +//! - Layer files are immutable. //! //! NB: Pageserver assumes that it has exclusive write access to the tenant in remote //! storage. Different tenants can be attached to different pageservers, but if the @@ -386,6 +382,12 @@ pub(crate) struct RemoteTimelineClient { cancel: CancellationToken, } +impl Drop for RemoteTimelineClient { + fn drop(&mut self) { + debug!("dropping RemoteTimelineClient"); + } +} + impl RemoteTimelineClient { /// /// Create a remote storage client for given timeline @@ -429,8 +431,16 @@ impl RemoteTimelineClient { /// an index file upload, i.e., it's not empty. /// The given `index_part` must be the one on the remote. pub fn init_upload_queue(&self, index_part: &IndexPart) -> anyhow::Result<()> { + // Set the maximum number of inprogress tasks to the remote storage concurrency. There's + // certainly no point in starting more upload tasks than this. + let inprogress_limit = self + .conf + .remote_storage_config + .as_ref() + .and_then(|r| r.concurrency_limit()) + .unwrap_or(0); let mut upload_queue = self.upload_queue.lock().unwrap(); - upload_queue.initialize_with_current_remote_index_part(index_part)?; + upload_queue.initialize_with_current_remote_index_part(index_part, inprogress_limit)?; self.update_remote_physical_size_gauge(Some(index_part)); info!( "initialized upload queue from remote index with {} layer files", @@ -445,8 +455,16 @@ impl RemoteTimelineClient { &self, local_metadata: &TimelineMetadata, ) -> anyhow::Result<()> { + // Set the maximum number of inprogress tasks to the remote storage concurrency. There's + // certainly no point in starting more upload tasks than this. 
+ let inprogress_limit = self + .conf + .remote_storage_config + .as_ref() + .and_then(|r| r.concurrency_limit()) + .unwrap_or(0); let mut upload_queue = self.upload_queue.lock().unwrap(); - upload_queue.initialize_empty_remote(local_metadata)?; + upload_queue.initialize_empty_remote(local_metadata, inprogress_limit)?; self.update_remote_physical_size_gauge(None); info!("initialized upload queue as empty"); Ok(()) @@ -462,9 +480,15 @@ impl RemoteTimelineClient { let deleted_at = index_part.deleted_at.ok_or(anyhow::anyhow!( "bug: it is responsibility of the caller to provide index part from MaybeDeletedIndexPart::Deleted" ))?; + let inprogress_limit = self + .conf + .remote_storage_config + .as_ref() + .and_then(|r| r.concurrency_limit()) + .unwrap_or(0); let mut upload_queue = self.upload_queue.lock().unwrap(); - upload_queue.initialize_with_current_remote_index_part(index_part)?; + upload_queue.initialize_with_current_remote_index_part(index_part, inprogress_limit)?; self.update_remote_physical_size_gauge(Some(index_part)); self.stop_impl(&mut upload_queue); @@ -1855,57 +1879,17 @@ impl RemoteTimelineClient { Ok(()) } - /// /// Pick next tasks from the queue, and start as many of them as possible without violating /// the ordering constraints. /// - /// The caller needs to already hold the `upload_queue` lock. + /// TODO: consider limiting the number of in-progress tasks, beyond what remote_storage does. + /// This can launch an unbounded number of queued tasks. `UploadQueue::next_ready()` also has + /// worst-case quadratic cost in the number of tasks, and may struggle beyond 10,000 tasks. fn launch_queued_tasks(self: &Arc, upload_queue: &mut UploadQueueInitialized) { - while let Some(next_op) = upload_queue.queued_operations.front() { - // Can we run this task now? - let can_run_now = match next_op { - UploadOp::UploadLayer(..) => { - // Can always be scheduled. - true - } - UploadOp::UploadMetadata { .. 
} => { - // These can only be performed after all the preceding operations - // have finished. - upload_queue.inprogress_tasks.is_empty() - } - UploadOp::Delete(..) => { - // Wait for preceding uploads to finish. Concurrent deletions are OK, though. - upload_queue.num_inprogress_deletions == upload_queue.inprogress_tasks.len() - } + while let Some((mut next_op, coalesced_ops)) = upload_queue.next_ready() { + debug!("starting op: {next_op}"); - UploadOp::Barrier(_) | UploadOp::Shutdown => { - upload_queue.inprogress_tasks.is_empty() - } - }; - - // If we cannot launch this task, don't look any further. - // - // In some cases, we could let some non-frontmost tasks to "jump the queue" and launch - // them now, but we don't try to do that currently. For example, if the frontmost task - // is an index-file upload that cannot proceed until preceding uploads have finished, we - // could still start layer uploads that were scheduled later. - if !can_run_now { - break; - } - - if let UploadOp::Shutdown = next_op { - // leave the op in the queue but do not start more tasks; it will be dropped when - // the stop is called. - upload_queue.shutdown_ready.close(); - break; - } - - // We can launch this task. Remove it from the queue first. - let mut next_op = upload_queue.queued_operations.pop_front().unwrap(); - - debug!("starting op: {}", next_op); - - // Update the counters and prepare + // Prepare upload. match &mut next_op { UploadOp::UploadLayer(layer, meta, mode) => { if upload_queue @@ -1916,18 +1900,14 @@ impl RemoteTimelineClient { } else { *mode = Some(OpType::MayReorder) } - upload_queue.num_inprogress_layer_uploads += 1; - } - UploadOp::UploadMetadata { .. } => { - upload_queue.num_inprogress_metadata_uploads += 1; } + UploadOp::UploadMetadata { .. 
} => {} UploadOp::Delete(Delete { layers }) => { for (name, meta) in layers { upload_queue .recently_deleted .insert((name.clone(), meta.generation)); } - upload_queue.num_inprogress_deletions += 1; } UploadOp::Barrier(sender) => { sender.send_replace(()); @@ -1944,6 +1924,7 @@ impl RemoteTimelineClient { let task = Arc::new(UploadTask { task_id: upload_task_id, op: next_op, + coalesced_ops, retries: AtomicU32::new(0), }); upload_queue @@ -2027,6 +2008,8 @@ impl RemoteTimelineClient { let upload_result: anyhow::Result<()> = match &task.op { UploadOp::UploadLayer(ref layer, ref layer_metadata, mode) => { + // TODO: check if this mechanism can be removed now that can_bypass() performs + // conflict checks during scheduling. if let Some(OpType::FlushDeletion) = mode { if self.config.read().unwrap().block_deletions { // Of course, this is not efficient... but usually the queue should be empty. @@ -2249,13 +2232,8 @@ impl RemoteTimelineClient { upload_queue.inprogress_tasks.remove(&task.task_id); let lsn_update = match task.op { - UploadOp::UploadLayer(_, _, _) => { - upload_queue.num_inprogress_layer_uploads -= 1; - None - } + UploadOp::UploadLayer(_, _, _) => None, UploadOp::UploadMetadata { ref uploaded } => { - upload_queue.num_inprogress_metadata_uploads -= 1; - // the task id is reused as a monotonicity check for storing the "clean" // IndexPart. let last_updater = upload_queue.clean.1; @@ -2289,10 +2267,7 @@ impl RemoteTimelineClient { None } } - UploadOp::Delete(_) => { - upload_queue.num_inprogress_deletions -= 1; - None - } + UploadOp::Delete(_) => None, UploadOp::Barrier(..) | UploadOp::Shutdown => unreachable!(), }; @@ -2317,6 +2292,9 @@ impl RemoteTimelineClient { } self.metric_end(&task.op); + for coalesced_op in &task.coalesced_ops { + self.metric_end(coalesced_op); + } } fn metric_impl( @@ -2409,6 +2387,7 @@ impl RemoteTimelineClient { // but for this use case it doesnt really makes sense to bring unsafe code only for this usage point. 
// Deletion is not really perf sensitive so there shouldnt be any problems with cloning a fraction of it. let upload_queue_for_deletion = UploadQueueInitialized { + inprogress_limit: initialized.inprogress_limit, task_counter: 0, dirty: initialized.dirty.clone(), clean: initialized.clean.clone(), @@ -2416,9 +2395,6 @@ impl RemoteTimelineClient { visible_remote_consistent_lsn: initialized .visible_remote_consistent_lsn .clone(), - num_inprogress_layer_uploads: 0, - num_inprogress_metadata_uploads: 0, - num_inprogress_deletions: 0, inprogress_tasks: HashMap::default(), queued_operations: VecDeque::default(), #[cfg(feature = "testing")] @@ -2445,14 +2421,6 @@ impl RemoteTimelineClient { } }; - // consistency check - assert_eq!( - qi.num_inprogress_layer_uploads - + qi.num_inprogress_metadata_uploads - + qi.num_inprogress_deletions, - qi.inprogress_tasks.len() - ); - // We don't need to do anything here for in-progress tasks. They will finish // on their own, decrement the unfinished-task counter themselves, and observe // that the queue is Stopped. 
@@ -2899,8 +2867,8 @@ mod tests { let mut guard = client.upload_queue.lock().unwrap(); let upload_queue = guard.initialized_mut().unwrap(); assert!(upload_queue.queued_operations.is_empty()); - assert!(upload_queue.inprogress_tasks.len() == 2); - assert!(upload_queue.num_inprogress_layer_uploads == 2); + assert_eq!(upload_queue.inprogress_tasks.len(), 2); + assert_eq!(upload_queue.num_inprogress_layer_uploads(), 2); // also check that `latest_file_changes` was updated assert!(upload_queue.latest_files_changes_since_metadata_upload_scheduled == 2); @@ -2970,8 +2938,8 @@ mod tests { // Deletion schedules upload of the index file, and the file deletion itself assert_eq!(upload_queue.queued_operations.len(), 2); assert_eq!(upload_queue.inprogress_tasks.len(), 1); - assert_eq!(upload_queue.num_inprogress_layer_uploads, 1); - assert_eq!(upload_queue.num_inprogress_deletions, 0); + assert_eq!(upload_queue.num_inprogress_layer_uploads(), 1); + assert_eq!(upload_queue.num_inprogress_deletions(), 0); assert_eq!( upload_queue.latest_files_changes_since_metadata_upload_scheduled, 0 diff --git a/pageserver/src/tenant/remote_timeline_client/index.rs b/pageserver/src/tenant/remote_timeline_client/index.rs index 51f093cb87..244be5bbb7 100644 --- a/pageserver/src/tenant/remote_timeline_client/index.rs +++ b/pageserver/src/tenant/remote_timeline_client/index.rs @@ -104,7 +104,7 @@ impl IndexPart { pub const FILE_NAME: &'static str = "index_part.json"; - pub(crate) fn empty(metadata: TimelineMetadata) -> Self { + pub fn empty(metadata: TimelineMetadata) -> Self { IndexPart { version: Self::LATEST_VERSION, layer_metadata: Default::default(), diff --git a/pageserver/src/tenant/storage_layer.rs b/pageserver/src/tenant/storage_layer.rs index b8206fca5a..3913637ca0 100644 --- a/pageserver/src/tenant/storage_layer.rs +++ b/pageserver/src/tenant/storage_layer.rs @@ -12,7 +12,7 @@ pub mod merge_iterator; use crate::context::{AccessStatsBehavior, RequestContext}; use bytes::Bytes; -use 
pageserver_api::key::{Key, NON_INHERITED_SPARSE_RANGE}; +use pageserver_api::key::Key; use pageserver_api::keyspace::{KeySpace, KeySpaceRandomAccum}; use pageserver_api::record::NeonWalRecord; use pageserver_api::value::Value; @@ -209,7 +209,7 @@ impl ValuesReconstructState { .keys .entry(*key) .or_insert(Ok(VectoredValueReconstructState::default())); - let is_sparse_key = NON_INHERITED_SPARSE_RANGE.contains(key); + let is_sparse_key = key.is_sparse(); if let Ok(state) = state { let key_done = match state.situation { ValueReconstructSituation::Complete => { diff --git a/pageserver/src/tenant/storage_layer/inmemory_layer.rs b/pageserver/src/tenant/storage_layer/inmemory_layer.rs index 71e53da20f..2b67f55a17 100644 --- a/pageserver/src/tenant/storage_layer/inmemory_layer.rs +++ b/pageserver/src/tenant/storage_layer/inmemory_layer.rs @@ -112,8 +112,8 @@ const MAX_SUPPORTED_BLOB_LEN_BITS: usize = { /// /// Layout: /// - 1 bit: `will_init` -/// - [`MAX_SUPPORTED_BLOB_LEN_BITS`]: `len` -/// - [`MAX_SUPPORTED_POS_BITS`]: `pos` +/// - [`MAX_SUPPORTED_BLOB_LEN_BITS`][]: `len` +/// - [`MAX_SUPPORTED_POS_BITS`](IndexEntry::MAX_SUPPORTED_POS_BITS): `pos` #[derive(Debug, Clone, Copy, PartialEq, Eq)] pub struct IndexEntry(u64); diff --git a/pageserver/src/tenant/storage_layer/layer.rs b/pageserver/src/tenant/storage_layer/layer.rs index 8933e8ceb1..2b06c88e8b 100644 --- a/pageserver/src/tenant/storage_layer/layer.rs +++ b/pageserver/src/tenant/storage_layer/layer.rs @@ -1812,7 +1812,7 @@ enum LayerKind { /// Guard for forcing a layer be resident while it exists. 
#[derive(Clone)] -pub(crate) struct ResidentLayer { +pub struct ResidentLayer { owner: Layer, downloaded: Arc, } diff --git a/pageserver/src/tenant/throttle.rs b/pageserver/src/tenant/throttle.rs index 8ab6a0e060..300d779125 100644 --- a/pageserver/src/tenant/throttle.rs +++ b/pageserver/src/tenant/throttle.rs @@ -3,7 +3,7 @@ use std::{ atomic::{AtomicU64, Ordering}, Arc, }, - time::{Duration, Instant}, + time::Instant, }; use arc_swap::ArcSwap; @@ -16,9 +16,8 @@ use utils::leaky_bucket::{LeakyBucketConfig, RateLimiter}; /// To share a throttle among multiple entities, wrap it in an [`Arc`]. /// /// The intial use case for this is tenant-wide throttling of getpage@lsn requests. -pub struct Throttle { +pub struct Throttle { inner: ArcSwap, - metric: M, /// will be turned into [`Stats::count_accounted_start`] count_accounted_start: AtomicU64, /// will be turned into [`Stats::count_accounted_finish`] @@ -36,15 +35,6 @@ pub struct Inner { pub type Config = pageserver_api::models::ThrottleConfig; -pub struct Observation { - pub wait_time: Duration, -} -pub trait Metric { - fn accounting_start(&self); - fn accounting_finish(&self); - fn observe_throttling(&self, observation: &Observation); -} - /// See [`Throttle::reset_stats`]. pub struct Stats { /// Number of requests that started [`Throttle::throttle`] calls. 
@@ -59,18 +49,14 @@ pub struct Stats { } pub enum ThrottleResult { - NotThrottled { start: Instant }, - Throttled { start: Instant, end: Instant }, + NotThrottled { end: Instant }, + Throttled { end: Instant }, } -impl Throttle -where - M: Metric, -{ - pub fn new(config: Config, metric: M) -> Self { +impl Throttle { + pub fn new(config: Config) -> Self { Self { inner: ArcSwap::new(Arc::new(Self::new_inner(config))), - metric, count_accounted_start: AtomicU64::new(0), count_accounted_finish: AtomicU64::new(0), count_throttled: AtomicU64::new(0), @@ -127,32 +113,27 @@ where self.inner.load().rate_limiter.steady_rps() } - pub async fn throttle(&self, key_count: usize) -> ThrottleResult { + /// `start` must be [`Instant::now`] or earlier. + pub async fn throttle(&self, key_count: usize, start: Instant) -> ThrottleResult { let inner = self.inner.load_full(); // clones the `Inner` Arc - let start = std::time::Instant::now(); - if !inner.enabled { - return ThrottleResult::NotThrottled { start }; + return ThrottleResult::NotThrottled { end: start }; } - self.metric.accounting_start(); self.count_accounted_start.fetch_add(1, Ordering::Relaxed); let did_throttle = inner.rate_limiter.acquire(key_count).await; self.count_accounted_finish.fetch_add(1, Ordering::Relaxed); - self.metric.accounting_finish(); if did_throttle { self.count_throttled.fetch_add(1, Ordering::Relaxed); - let now = Instant::now(); - let wait_time = now - start; + let end = Instant::now(); + let wait_time = end - start; self.sum_throttled_usecs .fetch_add(wait_time.as_micros() as u64, Ordering::Relaxed); - let observation = Observation { wait_time }; - self.metric.observe_throttling(&observation); - ThrottleResult::Throttled { start, end: now } + ThrottleResult::Throttled { end } } else { - ThrottleResult::NotThrottled { start } + ThrottleResult::NotThrottled { end: start } } } } diff --git a/pageserver/src/tenant/timeline.rs b/pageserver/src/tenant/timeline.rs index c1b71262e0..2ba71416b8 100644 --- 
a/pageserver/src/tenant/timeline.rs +++ b/pageserver/src/tenant/timeline.rs @@ -14,7 +14,7 @@ pub mod uninit; mod walreceiver; use anyhow::{anyhow, bail, ensure, Context, Result}; -use arc_swap::ArcSwap; +use arc_swap::{ArcSwap, ArcSwapOption}; use bytes::Bytes; use camino::Utf8Path; use chrono::{DateTime, Utc}; @@ -23,11 +23,12 @@ use fail::fail_point; use handle::ShardTimelineId; use offload::OffloadError; use once_cell::sync::Lazy; +use pageserver_api::models::PageTraceEvent; use pageserver_api::{ config::tenant_conf_defaults::DEFAULT_COMPACTION_THRESHOLD, key::{ KEY_SIZE, METADATA_KEY_BEGIN_PREFIX, METADATA_KEY_END_PREFIX, NON_INHERITED_RANGE, - NON_INHERITED_SPARSE_RANGE, + SPARSE_RANGE, }, keyspace::{KeySpaceAccum, KeySpaceRandomAccum, SparseKeyPartitioning}, models::{ @@ -42,6 +43,7 @@ use rand::Rng; use remote_storage::DownloadError; use serde_with::serde_as; use storage_broker::BrokerClientChannel; +use tokio::sync::mpsc::Sender; use tokio::{ runtime::Handle, sync::{oneshot, watch}, @@ -49,7 +51,9 @@ use tokio::{ use tokio_util::sync::CancellationToken; use tracing::*; use utils::{ - fs_ext, pausable_failpoint, + fs_ext, + guard_arc_swap::GuardArcSwap, + pausable_failpoint, postgres_client::PostgresClientProtocol, sync::gate::{Gate, GateGuard}, }; @@ -72,6 +76,7 @@ use std::{pin::pin, sync::OnceLock}; use crate::{ aux_file::AuxFileSizeEstimator, + page_service::TenantManagerTypes, tenant::{ config::AttachmentMode, layer_map::{LayerMap, SearchResult}, @@ -208,8 +213,8 @@ fn drop_wlock(rlock: tokio::sync::RwLockWriteGuard<'_, T>) { /// The outward-facing resources required to build a Timeline pub struct TimelineResources { pub remote_client: RemoteTimelineClient, - pub pagestream_throttle: - Arc>, + pub pagestream_throttle: Arc, + pub pagestream_throttle_metrics: Arc, pub l0_flush_global_state: l0_flush::L0FlushGlobalState, } @@ -351,8 +356,8 @@ pub struct Timeline { // though let's keep them both for better error visibility. 
pub initdb_lsn: Lsn, - /// When did we last calculate the partitioning? Make it pub to test cases. - pub(super) partitioning: tokio::sync::Mutex<((KeyPartitioning, SparseKeyPartitioning), Lsn)>, + /// The repartitioning result. Allows a single writer and multiple readers. + pub(crate) partitioning: GuardArcSwap<((KeyPartitioning, SparseKeyPartitioning), Lsn)>, /// Configuration: how often should the partitioning be recalculated. repartition_threshold: u64, @@ -412,8 +417,7 @@ pub struct Timeline { gc_lock: tokio::sync::Mutex<()>, /// Cloned from [`super::Tenant::pagestream_throttle`] on construction. - pub(crate) pagestream_throttle: - Arc>, + pub(crate) pagestream_throttle: Arc, /// Size estimator for aux file v2 pub(crate) aux_file_size_estimator: AuxFileSizeEstimator, @@ -428,12 +432,15 @@ pub struct Timeline { pub(crate) l0_flush_global_state: L0FlushGlobalState, - pub(crate) handles: handle::PerTimelineState, + pub(crate) handles: handle::PerTimelineState, pub(crate) attach_wal_lag_cooldown: Arc>, /// Cf. [`crate::tenant::CreateTimelineIdempotency`]. pub(crate) create_idempotency: crate::tenant::CreateTimelineIdempotency, + + /// If Some, collects GetPage metadata for an ongoing PageTrace. + pub(crate) page_trace: ArcSwapOption>, } pub type TimelineDeleteProgress = Arc>; @@ -2310,6 +2317,7 @@ impl Timeline { query_metrics: crate::metrics::SmgrQueryTimePerTimeline::new( &tenant_shard_id, &timeline_id, + resources.pagestream_throttle_metrics, ), directory_metrics: array::from_fn(|_| AtomicU64::new(0)), @@ -2335,7 +2343,8 @@ impl Timeline { // initial logical size is 0. 
LogicalSize::empty_initial() }, - partitioning: tokio::sync::Mutex::new(( + + partitioning: GuardArcSwap::new(( (KeyPartitioning::new(), KeyPartitioning::new().into_sparse()), Lsn(0), )), @@ -2380,6 +2389,8 @@ impl Timeline { attach_wal_lag_cooldown, create_idempotency, + + page_trace: Default::default(), }; result.repartition_threshold = @@ -3221,7 +3232,7 @@ impl Timeline { // We don't return a blanket [`GetVectoredError::MissingKey`] to avoid // stalling compaction. keyspace.remove_overlapping_with(&KeySpace { - ranges: vec![NON_INHERITED_RANGE, NON_INHERITED_SPARSE_RANGE], + ranges: vec![NON_INHERITED_RANGE, Key::sparse_non_inherited_keyspace()], }); // Keyspace is fully retrieved @@ -3242,7 +3253,11 @@ impl Timeline { // keys from `keyspace`, we expect there to be no overlap between it and the image covered key // space. If that's not the case, we had at least one key encounter a gap in the image layer // and stop the search as a result of that. - let removed = keyspace.remove_overlapping_with(&image_covered_keyspace); + let mut removed = keyspace.remove_overlapping_with(&image_covered_keyspace); + // Do not fire missing key error for sparse keys. + removed.remove_overlapping_with(&KeySpace { + ranges: vec![SPARSE_RANGE], + }); if !removed.is_empty() { break Some(removed); } @@ -3257,6 +3272,21 @@ impl Timeline { timeline = &*timeline_owned; }; + // Remove sparse keys from the keyspace so that it doesn't fire errors. 
+ let missing_keyspace = if let Some(missing_keyspace) = missing_keyspace { + let mut missing_keyspace = missing_keyspace; + missing_keyspace.remove_overlapping_with(&KeySpace { + ranges: vec![SPARSE_RANGE], + }); + if missing_keyspace.is_empty() { + None + } else { + Some(missing_keyspace) + } + } else { + None + }; + if let Some(missing_keyspace) = missing_keyspace { return Err(GetVectoredError::MissingKey(MissingKeyError { key: missing_keyspace.start().unwrap(), /* better if we can store the full keyspace */ @@ -3762,36 +3792,35 @@ impl Timeline { return Err(FlushLayerError::Cancelled); } - let mut layers_to_upload = Vec::new(); - layers_to_upload.extend( - self.create_image_layers( - &rel_partition, - self.initdb_lsn, - ImageLayerCreationMode::Initial, - ctx, - ) - .await?, - ); + // Ensure that we have a single call to `create_image_layers` with a combined dense keyspace. + // So that the key ranges don't overlap. + let mut partitions = KeyPartitioning::default(); + partitions.parts.extend(rel_partition.parts); if !metadata_partition.parts.is_empty() { assert_eq!( metadata_partition.parts.len(), 1, "currently sparse keyspace should only contain a single metadata keyspace" ); - layers_to_upload.extend( - self.create_image_layers( - // Safety: create_image_layers treat sparse keyspaces differently that it does not scan - // every single key within the keyspace, and therefore, it's safe to force converting it - // into a dense keyspace before calling this function. - &metadata_partition.into_dense(), - self.initdb_lsn, - ImageLayerCreationMode::Initial, - ctx, - ) - .await?, - ); + // Safety: create_image_layers treat sparse keyspaces differently that it does not scan + // every single key within the keyspace, and therefore, it's safe to force converting it + // into a dense keyspace before calling this function. 
+ partitions + .parts + .extend(metadata_partition.into_dense().parts); } + let mut layers_to_upload = Vec::new(); + layers_to_upload.extend( + self.create_image_layers( + &partitions, + self.initdb_lsn, + ImageLayerCreationMode::Initial, + ctx, + ) + .await?, + ); + (layers_to_upload, None) } else { // Normal case, write out a L0 delta layer file. @@ -4003,18 +4032,15 @@ impl Timeline { flags: EnumSet, ctx: &RequestContext, ) -> Result<((KeyPartitioning, SparseKeyPartitioning), Lsn), CompactionError> { - let Ok(mut partitioning_guard) = self.partitioning.try_lock() else { + let Ok(mut guard) = self.partitioning.try_write_guard() else { // NB: there are two callers, one is the compaction task, of which there is only one per struct Tenant and hence Timeline. // The other is the initdb optimization in flush_frozen_layer, used by `boostrap_timeline`, which runs before `.activate()` // and hence before the compaction task starts. - // Note that there are a third "caller" that will take the `partitioning` lock. It is `gc_compaction_split_jobs` for - // gc-compaction where it uses the repartition data to determine the split jobs. In the future, it might use its own - // heuristics, but for now, we should allow concurrent access to it and let the caller retry compaction. 
return Err(CompactionError::Other(anyhow!( - "repartition() called concurrently, this is rare and a retry should be fine" + "repartition() called concurrently" ))); }; - let ((dense_partition, sparse_partition), partition_lsn) = &*partitioning_guard; + let ((dense_partition, sparse_partition), partition_lsn) = &*guard.read(); if lsn < *partition_lsn { return Err(CompactionError::Other(anyhow!( "repartition() called with LSN going backwards, this should not happen" @@ -4042,9 +4068,9 @@ impl Timeline { let sparse_partitioning = SparseKeyPartitioning { parts: vec![sparse_ks], }; // no partitioning for metadata keys for now - *partitioning_guard = ((dense_partitioning, sparse_partitioning), lsn); - - Ok((partitioning_guard.0.clone(), partitioning_guard.1)) + let result = ((dense_partitioning, sparse_partitioning), lsn); + guard.write(result.clone()); + Ok(result) } // Is it time to create a new image layer for the given partition? @@ -4600,6 +4626,10 @@ impl Drop for Timeline { } } } + info!( + "Timeline {} for tenant {} is being dropped", + self.timeline_id, self.tenant_shard_id.tenant_id + ); } } diff --git a/pageserver/src/tenant/timeline/compaction.rs b/pageserver/src/tenant/timeline/compaction.rs index 55cde8603e..06a21f6b3c 100644 --- a/pageserver/src/tenant/timeline/compaction.rs +++ b/pageserver/src/tenant/timeline/compaction.rs @@ -4,7 +4,7 @@ //! //! The old legacy algorithm is implemented directly in `timeline.rs`. 
-use std::collections::{BinaryHeap, HashMap, HashSet}; +use std::collections::{BinaryHeap, HashMap, HashSet, VecDeque}; use std::ops::{Deref, Range}; use std::sync::Arc; @@ -16,10 +16,12 @@ use super::{ use anyhow::{anyhow, bail, Context}; use bytes::Bytes; +use enumset::EnumSet; use fail::fail_point; use itertools::Itertools; use pageserver_api::key::KEY_SIZE; use pageserver_api::keyspace::ShardedRange; +use pageserver_api::models::CompactInfoResponse; use pageserver_api::shard::{ShardCount, ShardIdentity, TenantShardId}; use serde::Serialize; use tokio_util::sync::CancellationToken; @@ -30,6 +32,7 @@ use crate::context::{AccessStatsBehavior, RequestContext, RequestContextBuilder} use crate::page_cache; use crate::statvfs::Statvfs; use crate::tenant::checks::check_valid_layermap; +use crate::tenant::gc_block::GcBlock; use crate::tenant::remote_timeline_client::WaitCompletionError; use crate::tenant::storage_layer::batch_split_writer::{ BatchWriterResult, SplitDeltaLayerWriter, SplitImageLayerWriter, @@ -63,16 +66,284 @@ use super::CompactionError; /// Maximum number of deltas before generating an image layer in bottom-most compaction. const COMPACTION_DELTA_THRESHOLD: usize = 5; -/// A scheduled compaction task. -pub(crate) struct ScheduledCompactionTask { - /// It's unfortunate that we need to store a compact options struct here because the only outer - /// API we can call here is `compact_with_options` which does a few setup calls before starting the - /// actual compaction job... We should refactor this to store `GcCompactionJob` in the future. - pub options: CompactOptions, - /// The channel to send the compaction result. If this is a subcompaction, the last compaction job holds the sender. - pub result_tx: Option>, - /// Hold the GC block. If this is a subcompaction, the last compaction job holds the gc block guard. 
- pub gc_block: Option, +#[derive(Debug, Clone, Copy, Hash, PartialEq, Eq)] +pub struct GcCompactionJobId(pub usize); + +impl std::fmt::Display for GcCompactionJobId { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + write!(f, "{}", self.0) + } +} + +#[derive(Debug, Clone)] +pub enum GcCompactionQueueItem { + Manual(CompactOptions), + SubCompactionJob(CompactOptions), + #[allow(dead_code)] + UpdateL2Lsn(Lsn), + Notify(GcCompactionJobId), +} + +impl GcCompactionQueueItem { + pub fn into_compact_info_resp( + self, + id: GcCompactionJobId, + running: bool, + ) -> Option { + match self { + GcCompactionQueueItem::Manual(options) => Some(CompactInfoResponse { + compact_key_range: options.compact_key_range, + compact_lsn_range: options.compact_lsn_range, + sub_compaction: options.sub_compaction, + running, + job_id: id.0, + }), + GcCompactionQueueItem::SubCompactionJob(options) => Some(CompactInfoResponse { + compact_key_range: options.compact_key_range, + compact_lsn_range: options.compact_lsn_range, + sub_compaction: options.sub_compaction, + running, + job_id: id.0, + }), + GcCompactionQueueItem::UpdateL2Lsn(_) => None, + GcCompactionQueueItem::Notify(_) => None, + } + } +} + +struct GcCompactionQueueInner { + running: Option<(GcCompactionJobId, GcCompactionQueueItem)>, + queued: VecDeque<(GcCompactionJobId, GcCompactionQueueItem)>, + notify: HashMap>, + gc_guards: HashMap, + last_id: GcCompactionJobId, +} + +impl GcCompactionQueueInner { + fn next_id(&mut self) -> GcCompactionJobId { + let id = self.last_id; + self.last_id = GcCompactionJobId(id.0 + 1); + id + } +} + +/// A structure to store gc_compaction jobs. +pub struct GcCompactionQueue { + /// All items in the queue, and the currently-running job. + inner: std::sync::Mutex, + /// Ensure only one thread is consuming the queue. 
+ consumer_lock: tokio::sync::Mutex<()>, +} + +impl GcCompactionQueue { + pub fn new() -> Self { + GcCompactionQueue { + inner: std::sync::Mutex::new(GcCompactionQueueInner { + running: None, + queued: VecDeque::new(), + notify: HashMap::new(), + gc_guards: HashMap::new(), + last_id: GcCompactionJobId(0), + }), + consumer_lock: tokio::sync::Mutex::new(()), + } + } + + pub fn cancel_scheduled(&self) { + let mut guard = self.inner.lock().unwrap(); + guard.queued.clear(); + guard.notify.clear(); + guard.gc_guards.clear(); + } + + /// Schedule a manual compaction job. + pub fn schedule_manual_compaction( + &self, + options: CompactOptions, + notify: Option>, + ) -> GcCompactionJobId { + let mut guard = self.inner.lock().unwrap(); + let id = guard.next_id(); + guard + .queued + .push_back((id, GcCompactionQueueItem::Manual(options))); + if let Some(notify) = notify { + guard.notify.insert(id, notify); + } + info!("scheduled compaction job id={}", id); + id + } + + /// Trigger an auto compaction. + #[allow(dead_code)] + pub fn trigger_auto_compaction(&self, _: &Arc) {} + + /// Notify the caller the job has finished and unblock GC. 
+ fn notify_and_unblock(&self, id: GcCompactionJobId) { + info!("compaction job id={} finished", id); + let mut guard = self.inner.lock().unwrap(); + if let Some(blocking) = guard.gc_guards.remove(&id) { + drop(blocking) + } + if let Some(tx) = guard.notify.remove(&id) { + let _ = tx.send(()); + } + } + + async fn handle_sub_compaction( + &self, + id: GcCompactionJobId, + options: CompactOptions, + timeline: &Arc, + gc_block: &GcBlock, + ) -> Result<(), CompactionError> { + info!("running scheduled enhanced gc bottom-most compaction with sub-compaction, splitting compaction jobs"); + let jobs: Vec = timeline + .gc_compaction_split_jobs( + GcCompactJob::from_compact_options(options.clone()), + options.sub_compaction_max_job_size_mb, + ) + .await + .map_err(CompactionError::Other)?; + if jobs.is_empty() { + info!("no jobs to run, skipping scheduled compaction task"); + self.notify_and_unblock(id); + } else { + let gc_guard = match gc_block.start().await { + Ok(guard) => guard, + Err(e) => { + return Err(CompactionError::Other(anyhow!( + "cannot run gc-compaction because gc is blocked: {}", + e + ))); + } + }; + + let jobs_len = jobs.len(); + let mut pending_tasks = Vec::new(); + for job in jobs { + // Unfortunately we need to convert the `GcCompactJob` back to `CompactionOptions` + // until we do further refactors to allow directly call `compact_with_gc`. 
+ let mut flags: EnumSet = EnumSet::default(); + flags |= CompactFlags::EnhancedGcBottomMostCompaction; + if job.dry_run { + flags |= CompactFlags::DryRun; + } + let options = CompactOptions { + flags, + sub_compaction: false, + compact_key_range: Some(job.compact_key_range.into()), + compact_lsn_range: Some(job.compact_lsn_range.into()), + sub_compaction_max_job_size_mb: None, + }; + pending_tasks.push(GcCompactionQueueItem::SubCompactionJob(options)); + } + pending_tasks.push(GcCompactionQueueItem::Notify(id)); + { + let mut guard = self.inner.lock().unwrap(); + guard.gc_guards.insert(id, gc_guard); + let mut tasks = Vec::new(); + for task in pending_tasks { + let id = guard.next_id(); + tasks.push((id, task)); + } + tasks.reverse(); + for item in tasks { + guard.queued.push_front(item); + } + } + info!("scheduled enhanced gc bottom-most compaction with sub-compaction, split into {} jobs", jobs_len); + } + Ok(()) + } + + /// Take a job from the queue and process it. Returns if there are still pending tasks. 
+ pub async fn iteration( + &self, + cancel: &CancellationToken, + ctx: &RequestContext, + gc_block: &GcBlock, + timeline: &Arc, + ) -> Result { + let _one_op_at_a_time_guard = self.consumer_lock.lock().await; + let has_pending_tasks; + let (id, item) = { + let mut guard = self.inner.lock().unwrap(); + let Some((id, item)) = guard.queued.pop_front() else { + return Ok(false); + }; + guard.running = Some((id, item.clone())); + has_pending_tasks = !guard.queued.is_empty(); + (id, item) + }; + + match item { + GcCompactionQueueItem::Manual(options) => { + if !options + .flags + .contains(CompactFlags::EnhancedGcBottomMostCompaction) + { + warn!("ignoring scheduled compaction task: scheduled task must be gc compaction: {:?}", options); + } else if options.sub_compaction { + self.handle_sub_compaction(id, options, timeline, gc_block) + .await?; + } else { + let gc_guard = match gc_block.start().await { + Ok(guard) => guard, + Err(e) => { + return Err(CompactionError::Other(anyhow!( + "cannot run gc-compaction because gc is blocked: {}", + e + ))); + } + }; + { + let mut guard = self.inner.lock().unwrap(); + guard.gc_guards.insert(id, gc_guard); + } + let _ = timeline + .compact_with_options(cancel, options, ctx) + .instrument(info_span!("scheduled_compact_timeline", %timeline.timeline_id)) + .await?; + self.notify_and_unblock(id); + } + } + GcCompactionQueueItem::SubCompactionJob(options) => { + let _ = timeline + .compact_with_options(cancel, options, ctx) + .instrument(info_span!("scheduled_compact_timeline", %timeline.timeline_id)) + .await?; + } + GcCompactionQueueItem::Notify(id) => { + self.notify_and_unblock(id); + } + GcCompactionQueueItem::UpdateL2Lsn(_) => { + unreachable!() + } + } + { + let mut guard = self.inner.lock().unwrap(); + guard.running = None; + } + Ok(has_pending_tasks) + } + + #[allow(clippy::type_complexity)] + pub fn remaining_jobs( + &self, + ) -> ( + Option<(GcCompactionJobId, GcCompactionQueueItem)>, + VecDeque<(GcCompactionJobId, 
GcCompactionQueueItem)>, + ) { + let guard = self.inner.lock().unwrap(); + (guard.running.clone(), guard.queued.clone()) + } + + #[allow(dead_code)] + pub fn remaining_jobs_num(&self) -> usize { + let guard = self.inner.lock().unwrap(); + guard.queued.len() + if guard.running.is_some() { 1 } else { 0 } + } } /// A job description for the gc-compaction job. This structure describes the rectangle range that the job will @@ -1505,7 +1776,10 @@ impl Timeline { base_img_from_ancestor: Option<(Key, Lsn, Bytes)>, ) -> anyhow::Result { // Pre-checks for the invariants - if cfg!(debug_assertions) { + + let debug_mode = cfg!(debug_assertions) || cfg!(feature = "testing"); + + if debug_mode { for (log_key, _, _) in full_history { assert_eq!(log_key, &key, "mismatched key"); } @@ -1651,15 +1925,19 @@ impl Timeline { output } + let mut key_exists = false; for (i, split_for_lsn) in split_history.into_iter().enumerate() { // TODO: there could be image keys inside the splits, and we can compute records_since_last_image accordingly. 
records_since_last_image += split_for_lsn.len(); - let generate_image = if i == 0 && !has_ancestor { + // Whether to produce an image into the final layer files + let produce_image = if i == 0 && !has_ancestor { // We always generate images for the first batch (below horizon / lowest retain_lsn) true } else if i == batch_cnt - 1 { // Do not generate images for the last batch (above horizon) false + } else if records_since_last_image == 0 { + false } else if records_since_last_image >= delta_threshold_cnt { // Generate images when there are too many records true @@ -1674,29 +1952,45 @@ impl Timeline { break; } } - if let Some((_, _, val)) = replay_history.first() { - if !val.will_init() { - return Err(anyhow::anyhow!("invalid history, no base image")).with_context( - || { - generate_debug_trace( - Some(&replay_history), - full_history, - retain_lsn_below_horizon, - horizon, - ) - }, - ); - } + if replay_history.is_empty() && !key_exists { + // The key does not exist at earlier LSN, we can skip this iteration. + retention.push(Vec::new()); + continue; + } else { + key_exists = true; } - if generate_image && records_since_last_image > 0 { + let Some((_, _, val)) = replay_history.first() else { + unreachable!("replay history should not be empty once it exists") + }; + if !val.will_init() { + return Err(anyhow::anyhow!("invalid history, no base image")).with_context(|| { + generate_debug_trace( + Some(&replay_history), + full_history, + retain_lsn_below_horizon, + horizon, + ) + }); + } + // Whether to reconstruct the image. In debug mode, we will generate an image + // at every retain_lsn to ensure data is not corrupted, but we won't put the + // image into the final layer. 
+ let generate_image = produce_image || debug_mode; + if produce_image { records_since_last_image = 0; - let replay_history_for_debug = if cfg!(debug_assertions) { + } + let img_and_lsn = if generate_image { + let replay_history_for_debug = if debug_mode { Some(replay_history.clone()) } else { None }; let replay_history_for_debug_ref = replay_history_for_debug.as_deref(); - let history = std::mem::take(&mut replay_history); + let history = if produce_image { + std::mem::take(&mut replay_history) + } else { + replay_history.clone() + }; let mut img = None; let mut records = Vec::with_capacity(history.len()); if let (_, lsn, Value::Image(val)) = history.first().as_ref().unwrap() { @@ -1733,8 +2027,20 @@ impl Timeline { } records.reverse(); let state = ValueReconstructState { img, records }; - let request_lsn = lsn_split_points[i]; // last batch does not generate image so i is always in range + // last batch does not generate image so i is always in range, unless we force generate + // an image during testing + let request_lsn = if i >= lsn_split_points.len() { + Lsn::MAX + } else { + lsn_split_points[i] + }; let img = self.reconstruct_value(key, request_lsn, state).await?; + Some((request_lsn, img)) + } else { + None + }; + if produce_image { + let (request_lsn, img) = img_and_lsn.unwrap(); replay_history.push((key, request_lsn, Value::Image(img.clone()))); retention.push(vec![(request_lsn, Value::Image(img))]); } else { @@ -1840,12 +2146,7 @@ impl Timeline { let mut compact_jobs = Vec::new(); // For now, we simply use the key partitioning information; we should do a more fine-grained partitioning // by estimating the amount of files read for a compaction job. We should also partition on LSN. 
- let ((dense_ks, sparse_ks), _) = { - let Ok(partition) = self.partitioning.try_lock() else { - bail!("failed to acquire partition lock during gc-compaction"); - }; - partition.clone() - }; + let ((dense_ks, sparse_ks), _) = self.partitioning.read().as_ref().clone(); // Truncate the key range to be within user specified compaction range. fn truncate_to( source_start: &Key, @@ -2002,6 +2303,8 @@ impl Timeline { let compact_key_range = job.compact_key_range; let compact_lsn_range = job.compact_lsn_range; + let debug_mode = cfg!(debug_assertions) || cfg!(feature = "testing"); + info!("running enhanced gc bottom-most compaction, dry_run={dry_run}, compact_key_range={}..{}, compact_lsn_range={}..{}", compact_key_range.start, compact_key_range.end, compact_lsn_range.start, compact_lsn_range.end); scopeguard::defer! { @@ -2127,7 +2430,7 @@ impl Timeline { .first() .copied() .unwrap_or(job_desc.gc_cutoff); - if cfg!(debug_assertions) { + if debug_mode { assert_eq!( res, job_desc diff --git a/pageserver/src/tenant/timeline/delete.rs b/pageserver/src/tenant/timeline/delete.rs index ae44af3fad..bdc315d985 100644 --- a/pageserver/src/tenant/timeline/delete.rs +++ b/pageserver/src/tenant/timeline/delete.rs @@ -301,6 +301,7 @@ impl DeleteTimelineFlow { TimelineResources { remote_client, pagestream_throttle: tenant.pagestream_throttle.clone(), + pagestream_throttle_metrics: tenant.pagestream_throttle_metrics.clone(), l0_flush_global_state: tenant.l0_flush_global_state.clone(), }, // Important. We dont pass ancestor above because it can be missing. diff --git a/pageserver/src/tenant/timeline/handle.rs b/pageserver/src/tenant/timeline/handle.rs index e82559b8b3..35d8c75ce1 100644 --- a/pageserver/src/tenant/timeline/handle.rs +++ b/pageserver/src/tenant/timeline/handle.rs @@ -32,54 +32,151 @@ //! //! # Design //! +//! ## Data Structures +//! //! There are three user-facing data structures: //! - `PerTimelineState`: a struct embedded into each Timeline struct. 
Lifetime == Timeline lifetime. //! - `Cache`: a struct private to each connection handler; Lifetime == connection lifetime. //! - `Handle`: a smart pointer that holds the Timeline gate open and derefs to `&Timeline`. -//! Lifetime: for a single request dispatch on the Timeline (i.e., one getpage request) +//! - `WeakHandle`: downgrade of a `Handle` that does not keep the gate open, but allows +//! trying to upgrade back to a `Handle`, guaranteeing it's the same `Timeline` *object*. //! -//! The `Handle` is just a wrapper around an `Arc`. +//! Internally, there is 0 or 1 `HandleInner` per `(Cache,Timeline)`. +//! Since Cache:Connection is 1:1, there is 0 or 1 `HandleInner` per `(Connection,Timeline)`. //! -//! There is one long-lived `Arc`, which is stored in the `PerTimelineState`. -//! The `Cache` stores a `Weak` for each cached Timeline. +//! The `HandleInner` is allocated as an `Arc>` and +//! referenced weakly and strongly from various places which we are now illustrating. +//! For brevity, we will omit the `Arc>` part in the following and instead +//! use `strong ref` and `weak ref` when referring to the `Arc>` +//! or `Weak>`, respectively. +//! +//! - The `Handle` is a strong ref. +//! - The `WeakHandle` is a weak ref. +//! - The `PerTimelineState` contains a `HashMap`. +//! - The `Cache` is a `HashMap`. +//! +//! Lifetimes: +//! - `WeakHandle` and `Handle`: single pagestream request. +//! - `Cache`: single page service connection. +//! - `PerTimelineState`: lifetime of the Timeline object (i.e., till `Timeline::shutdown`). +//! +//! ## Request Handling Flow (= filling and using the `Cache`) //! //! To dispatch a request, the page service connection calls `Cache::get`. //! //! A cache miss means we consult the tenant manager for shard routing, -//! resulting in an `Arc`. We enter its gate _once_ and construct an -//! `Arc`. We store a `Weak` in the cache -//! and the `Arc` in the `PerTimelineState`. +//! resulting in an `Arc`.
We enter its gate _once_ and store it in the +//! `Arc>>`. A weak ref is stored in the `Cache` +//! and a strong ref in the `PerTimelineState`. +//! A strong ref is returned wrapped in a `Handle`. //! //! For subsequent requests, `Cache::get` will perform a "fast path" shard routing -//! and find the `Weak` in the cache. -//! We upgrade the `Weak` to an `Arc` and wrap it in the user-facing `Handle` type. +//! and find the weak ref in the cache. +//! We upgrade the weak ref to a strong ref and return it wrapped in a `Handle`. //! -//! The request handler dispatches the request to the right `>::$request_method`. +//! The pagestream processing is pipelined and involves a batching step. +//! While a request is batching, the `Handle` is downgraded to a `WeakHandle`. +//! When the batch is ready to be executed, the `WeakHandle` is upgraded back to a `Handle` +//! and the request handler dispatches the request to the right `>::$request_method`. //! It then drops the `Handle`, which drops the `Arc`. //! -//! # Memory Management / How The Reference Cycle Is Broken +//! # Performance //! -//! The attentive reader may have noticed the strong reference cycle -//! from `Arc` to `PerTimelineState` to `Arc`. +//! Remember from the introductory section: //! -//! This cycle is intentional: while it exists, the `Cache` can upgrade its -//! `Weak` to an `Arc` in a single atomic operation. +//! > However, we want to avoid the overhead of entering the gate for every +//! > method invocation. +//! +//! Why do we want to avoid that? +//! Because the gate is a shared location in memory and entering it involves +//! bumping refcounts, which leads to cache contention if done frequently +//! from multiple cores in parallel. +//! +//! So, we only acquire the `GateGuard` once on `Cache` miss, and wrap it in an `Arc`. +//! That `Arc` is private to the `HandleInner` and hence to the connection. +//! (Review the "Data Structures" section if that is unclear to you.) +//! +//!
A `WeakHandle` is a weak ref to the `HandleInner`. +//! When upgrading a `WeakHandle`, we upgrade to a strong ref to the `HandleInner` and +//! further acquire an additional strong ref to the `Arc` inside it. +//! Again, this manipulation of ref counts is cheap because `Arc` is private to the connection. +//! +//! When downgrading a `Handle` to a `WeakHandle`, we drop the `Arc`. +//! Again, this is cheap because the `Arc` is private to the connection. +//! +//! In addition to the GateGuard, we need to provide a `Deref` impl. +//! For this, both `Handle` need infallible access to an `Arc`. +//! We could clone the `Arc` when upgrading a `WeakHandle`, but that would cause contention +//! on the shared memory location that tracks the refcount of the `Arc`. +//! Instead, we wrap the `Arc` into another `Arc`, +//! so that we can clone it cheaply when upgrading a `WeakHandle`. +//! +//! # Shutdown +//! +//! The attentive reader may have noticed the following reference cycle around the `Arc`: +//! +//! ```text +//! Timeline --owns--> PerTimelineState --strong--> HandleInner --strong--> Timeline +//! ``` +//! +//! Further, there is this cycle: +//! +//! ```text +//! Timeline --owns--> PerTimelineState --strong--> HandleInner --strong--> GateGuard --keepalive--> Timeline +//! ``` +//! +//! The former cycle is a memory leak if not broken. +//! The latter cycle further prevents the Timeline from shutting down +//! because we certainly won't drop the Timeline while the GateGuard is alive. +//! Preventing shutdown is the whole point of this handle/cache system, +//! but when the Timeline needs to shut down, we need to break the cycle. //! //! The cycle is broken by either -//! - `PerTimelineState::shutdown` or -//! - dropping the `Cache`. +//! - Timeline shutdown (=> `PerTimelineState::shutdown`) +//! - Connection shutdown (=> dropping the `Cache`). //! -//! Concurrently existing `Handle`s will extend the existence of the cycle. +//!
Both transition the `HandleInner` from [`HandleInner::KeepingTimelineGateOpen`] to +//! [`HandleInner::ShutDown`], which drops the only long-lived strong ref to the +//! `Arc`. +//! +//! `PerTimelineState::shutdown` drops all the `HandleInner`s it contains, +//! thereby breaking the cycle. +//! It also initiates draining of already existing `Handle`s by +//! poisoning things so that no new `HandleInner`s can be added +//! to the `PerTimelineState`, which will make subsequent `Cache::get` fail. +//! +//! Concurrently existing / already upgraded `Handle`s will extend the +//! lifetime of the `Arc>` and hence cycles. //! However, since `Handle`s are short-lived and new `Handle`s are not -//! handed out after either `PerTimelineState::shutdown` or `Cache` drop, -//! that extension of the cycle is bounded. +//! handed out from `Cache::get` or `WeakHandle::upgrade` after +//! `PerTimelineState::shutdown`, that extension of the cycle is bounded. +//! +//! Concurrently existing `WeakHandle`s will fail to `upgrade()`: +//! while they will succeed in upgrading `Weak>`, +//! they will find the inner in the `HandleInner::ShutDown` state where the +//! `Arc` and Timeline have already been dropped. +//! +//! Dropping the `Cache` undoes the registration of this `Cache`'s +//! `HandleInner`s from all the `PerTimelineState`s, i.e., it +//! removes the strong ref to each of its `HandleInner`s +//! from all the `PerTimelineState`s. +//! +//! # Locking Rules +//! +//! To prevent deadlocks we: +//! +//! 1. Only ever hold one of the locks at a time. +//! 2. Don't add more than one Drop impl that locks on the +//! cycles above. +//! +//! As per (2), that impl is in `Drop for Cache`. //! //! # Fast Path for Shard Routing //! //! The `Cache` has a fast path for shard routing to avoid calling into //! the tenant manager for every request. //! -//! The `Cache` maintains a hash map of `ShardTimelineId` to `Weak`. +//! The `Cache` maintains a hash map of `ShardTimelineId` to `WeakHandle`s.
//! //! The current implementation uses the first entry in the hash map //! to determine the `ShardParameters` and derive the correct @@ -87,18 +184,18 @@ //! //! It then looks up the hash map for that `ShardTimelineId := {ShardIndex,TimelineId}`. //! -//! If the lookup is successful and the `Weak` can be upgraded, +//! If the lookup is successful and the `WeakHandle` can be upgraded, //! it's a hit. //! //! ## Cache invalidation //! -//! The insight is that cache invalidation is sufficient and most efficiently done lazily. +//! The insight is that cache invalidation is sufficient and most efficiently if done lazily. //! The only reasons why an entry in the cache can become stale are: //! 1. The `PerTimelineState` / Timeline is shutting down e.g. because the shard is //! being detached, timeline or shard deleted, or pageserver is shutting down. //! 2. We're doing a shard split and new traffic should be routed to the child shards. //! -//! Regarding (1), we will eventually fail to upgrade the `Weak` once the +//! Regarding (1), we will eventually fail to upgrade the `WeakHandle` once the //! timeline has shut down, and when that happens, we remove the entry from the cache. //! //! Regarding (2), the insight is that it is toally fine to keep dispatching requests @@ -107,8 +204,6 @@ use std::collections::hash_map; use std::collections::HashMap; -use std::sync::atomic::AtomicBool; -use std::sync::atomic::Ordering; use std::sync::Arc; use std::sync::Mutex; use std::sync::Weak; @@ -152,7 +247,7 @@ pub(crate) struct Cache { map: Map, } -type Map = HashMap>>; +type Map = HashMap>; impl Default for Cache { fn default() -> Self { @@ -170,12 +265,22 @@ pub(crate) struct ShardTimelineId { } /// See module-level comment. -pub(crate) struct Handle(Arc>); -struct HandleInner { - shut_down: AtomicBool, - timeline: T::Timeline, - // The timeline's gate held open. 
- _gate_guard: utils::sync::gate::GateGuard, +pub(crate) struct Handle { + timeline: Arc, + #[allow(dead_code)] // the field exists to keep the gate open + gate_guard: Arc, + inner: Arc>>, +} +pub(crate) struct WeakHandle { + inner: Weak>>, +} +enum HandleInner { + KeepingTimelineGateOpen { + #[allow(dead_code)] + gate_guard: Arc, + timeline: Arc, + }, + ShutDown, } /// Embedded in each [`Types::Timeline`] as the anchor for the only long-lived strong ref to `HandleInner`. @@ -183,7 +288,8 @@ struct HandleInner { /// See module-level comment for details. pub struct PerTimelineState { // None = shutting down - handles: Mutex>>>>, + #[allow(clippy::type_complexity)] + handles: Mutex>>>>>, } impl Default for PerTimelineState { @@ -243,49 +349,24 @@ impl Cache { shard_selector: ShardSelector, tenant_manager: &T::TenantManager, ) -> Result, GetError> { - // terminates because each iteration removes an element from the map - loop { - let handle = self - .get_impl(timeline_id, shard_selector, tenant_manager) - .await?; - if handle.0.shut_down.load(Ordering::Relaxed) { - let removed = self - .map - .remove(&handle.0.timeline.shard_timeline_id()) - .expect("invariant of get_impl is that the returned handle is in the map"); - assert!( - Weak::ptr_eq(&removed, &Arc::downgrade(&handle.0)), - "shard_timeline_id() incorrect?" 
- ); - } else { - return Ok(handle); - } - } - } - - #[instrument(level = "trace", skip_all)] - async fn get_impl( - &mut self, - timeline_id: TimelineId, - shard_selector: ShardSelector, - tenant_manager: &T::TenantManager, - ) -> Result, GetError> { - let miss: ShardSelector = { + // terminates because when every iteration we remove an element from the map + let miss: ShardSelector = loop { let routing_state = self.shard_routing(timeline_id, shard_selector); match routing_state { RoutingResult::FastPath(handle) => return Ok(handle), RoutingResult::SlowPath(key) => match self.map.get(&key) { Some(cached) => match cached.upgrade() { - Some(upgraded) => return Ok(Handle(upgraded)), - None => { + Ok(upgraded) => return Ok(upgraded), + Err(HandleUpgradeError::ShutDown) => { + // TODO: dedup with shard_routing() trace!("handle cache stale"); self.map.remove(&key).unwrap(); - ShardSelector::Known(key.shard_index) + continue; } }, - None => ShardSelector::Known(key.shard_index), + None => break ShardSelector::Known(key.shard_index), }, - RoutingResult::NeedConsultTenantManager => shard_selector, + RoutingResult::NeedConsultTenantManager => break shard_selector, } }; self.get_miss(timeline_id, miss, tenant_manager).await @@ -302,7 +383,7 @@ impl Cache { let Some((first_key, first_handle)) = self.map.iter().next() else { return RoutingResult::NeedConsultTenantManager; }; - let Some(first_handle) = first_handle.upgrade() else { + let Ok(first_handle) = first_handle.upgrade() else { // TODO: dedup with get() trace!("handle cache stale"); let first_key_owned = *first_key; @@ -310,7 +391,7 @@ impl Cache { continue; }; - let first_handle_shard_identity = first_handle.timeline.get_shard_identity(); + let first_handle_shard_identity = first_handle.get_shard_identity(); let make_shard_index = |shard_num: ShardNumber| ShardIndex { shard_number: shard_num, shard_count: first_handle_shard_identity.count, @@ -329,11 +410,11 @@ impl Cache { }; let first_handle_shard_timeline_id = 
ShardTimelineId { shard_index: first_handle_shard_identity.shard_index(), - timeline_id: first_handle.timeline.shard_timeline_id().timeline_id, + timeline_id: first_handle.shard_timeline_id().timeline_id, }; if need_shard_timeline_id == first_handle_shard_timeline_id { - return RoutingResult::FastPath(Handle(first_handle)); + return RoutingResult::FastPath(first_handle); } else { return RoutingResult::SlowPath(need_shard_timeline_id); } @@ -357,23 +438,30 @@ impl Cache { ShardSelector::Known(idx) => assert_eq!(idx, &key.shard_index), } - let gate_guard = match timeline.gate().enter() { - Ok(guard) => guard, - Err(_) => { - return Err(GetError::TimelineGateClosed); - } - }; trace!("creating new HandleInner"); - let handle = Arc::new( - // TODO: global metric that keeps track of the number of live HandlerTimeline instances - // so we can identify reference cycle bugs. - HandleInner { - shut_down: AtomicBool::new(false), - _gate_guard: gate_guard, - timeline: timeline.clone(), - }, - ); - let handle = { + let handle_inner_arc = Arc::new(Mutex::new(HandleInner::KeepingTimelineGateOpen { + gate_guard: Arc::new( + // this enter() is expensive in production code because + // it hits the global Arc::gate refcounts + match timeline.gate().enter() { + Ok(guard) => guard, + Err(_) => { + return Err(GetError::TimelineGateClosed); + } + }, + ), + // this clone is expensive in production code because + // it hits the global Arc::clone refcounts + timeline: Arc::new(timeline.clone()), + })); + let handle_weak = WeakHandle { + inner: Arc::downgrade(&handle_inner_arc), + }; + let handle = handle_weak + .upgrade() + .ok() + .expect("we just created it and it's not linked anywhere yet"); + { let mut lock_guard = timeline .per_timeline_state() .handles @@ -381,7 +469,8 @@ impl Cache { .expect("mutex poisoned"); match &mut *lock_guard { Some(per_timeline_state) => { - let replaced = per_timeline_state.insert(self.id, Arc::clone(&handle)); + let replaced = + 
per_timeline_state.insert(self.id, Arc::clone(&handle_inner_arc)); assert!(replaced.is_none(), "some earlier code left a stale handle"); match self.map.entry(key) { hash_map::Entry::Occupied(_o) => { @@ -392,8 +481,7 @@ impl Cache { unreachable!() } hash_map::Entry::Vacant(v) => { - v.insert(Arc::downgrade(&handle)); - handle + v.insert(handle_weak); } } } @@ -401,14 +489,62 @@ impl Cache { return Err(GetError::PerTimelineStateShutDown); } } - }; - Ok(Handle(handle)) + } + Ok(handle) } Err(e) => Err(GetError::TenantManager(e)), } } } +pub(crate) enum HandleUpgradeError { + ShutDown, +} + +impl WeakHandle { + pub(crate) fn upgrade(&self) -> Result, HandleUpgradeError> { + let Some(inner) = Weak::upgrade(&self.inner) else { + return Err(HandleUpgradeError::ShutDown); + }; + let lock_guard = inner.lock().expect("poisoned"); + match &*lock_guard { + HandleInner::KeepingTimelineGateOpen { + timeline, + gate_guard, + } => { + let gate_guard = Arc::clone(gate_guard); + let timeline = Arc::clone(timeline); + drop(lock_guard); + Ok(Handle { + timeline, + gate_guard, + inner, + }) + } + HandleInner::ShutDown => Err(HandleUpgradeError::ShutDown), + } + } + + pub(crate) fn is_same_handle_as(&self, other: &WeakHandle) -> bool { + Weak::ptr_eq(&self.inner, &other.inner) + } +} + +impl std::ops::Deref for Handle { + type Target = T::Timeline; + fn deref(&self) -> &Self::Target { + &self.timeline + } +} + +impl Handle { + pub(crate) fn downgrade(&self) -> WeakHandle { + WeakHandle { + inner: Arc::downgrade(&self.inner), + } + } +} + impl PerTimelineState { /// After this method returns, [`Cache::get`] will never again return a [`Handle`] /// to the [`Types::Timeline`] that embeds this per-timeline state. @@ -430,43 +566,54 @@ impl PerTimelineState { trace!("already shut down"); return; }; - for handle in handles.values() { + for handle_inner_arc in handles.values() { // Make hits fail. 
- handle.shut_down.store(true, Ordering::Relaxed); + let mut lock_guard = handle_inner_arc.lock().expect("poisoned"); + lock_guard.shutdown(); } drop(handles); } } -impl std::ops::Deref for Handle { - type Target = T::Timeline; - fn deref(&self) -> &Self::Target { - &self.0.timeline - } -} - -#[cfg(test)] -impl Drop for HandleInner { - fn drop(&mut self) { - trace!("HandleInner dropped"); - } -} - // When dropping a [`Cache`], prune its handles in the [`PerTimelineState`] to break the reference cycle. impl Drop for Cache { fn drop(&mut self) { - for (_, weak) in self.map.drain() { - if let Some(strong) = weak.upgrade() { - // handle is still being kept alive in PerTimelineState - let timeline = strong.timeline.per_timeline_state(); - let mut handles = timeline.handles.lock().expect("mutex poisoned"); - if let Some(handles) = &mut *handles { - let Some(removed) = handles.remove(&self.id) else { - // There could have been a shutdown inbetween us upgrading the weak and locking the mutex. - continue; - }; - assert!(Arc::ptr_eq(&removed, &strong)); - } + for ( + _, + WeakHandle { + inner: handle_inner_weak, + }, + ) in self.map.drain() + { + let Some(handle_inner_arc) = handle_inner_weak.upgrade() else { + continue; + }; + let handle_timeline = handle_inner_arc + // locking rules: drop lock before acquiring other lock below + .lock() + .expect("poisoned") + .shutdown(); + let per_timeline_state = handle_timeline.per_timeline_state(); + let mut handles_lock_guard = per_timeline_state.handles.lock().expect("mutex poisoned"); + let Some(handles) = &mut *handles_lock_guard else { + continue; + }; + let Some(removed_handle_inner_arc) = handles.remove(&self.id) else { + // There could have been a shutdown inbetween us upgrading the weak and locking the mutex. + continue; + }; + drop(handles_lock_guard); // locking rules: remember them when! 
+ assert!(Arc::ptr_eq(&removed_handle_inner_arc, &handle_inner_arc,)); + } + } +} + +impl HandleInner { + fn shutdown(&mut self) -> Arc { + match std::mem::replace(self, HandleInner::ShutDown) { + HandleInner::KeepingTimelineGateOpen { timeline, .. } => timeline, + HandleInner::ShutDown => { + unreachable!("handles are only shut down once in their lifetime"); } } } @@ -474,6 +621,8 @@ impl Drop for Cache { #[cfg(test)] mod tests { + use std::sync::Weak; + use pageserver_api::{ key::{rel_block_to_key, Key, DBDIR_KEY}, models::ShardParameters, @@ -583,39 +732,13 @@ mod tests { // // fill the cache // - assert_eq!( - (Arc::strong_count(&shard0), Arc::weak_count(&shard0)), - (2, 1), - "strong: shard0, mgr; weak: myself" - ); - let handle: Handle<_> = cache .get(timeline_id, ShardSelector::Page(key), &mgr) .await .expect("we have the timeline"); - let handle_inner_weak = Arc::downgrade(&handle.0); assert!(Weak::ptr_eq(&handle.myself, &shard0.myself)); - assert_eq!( - ( - Weak::strong_count(&handle_inner_weak), - Weak::weak_count(&handle_inner_weak) - ), - (2, 2), - "strong: handle, per_timeline_state, weak: handle_inner_weak, cache" - ); assert_eq!(cache.map.len(), 1); - - assert_eq!( - (Arc::strong_count(&shard0), Arc::weak_count(&shard0)), - (3, 1), - "strong: handleinner(per_timeline_state), shard0, mgr; weak: myself" - ); drop(handle); - assert_eq!( - (Arc::strong_count(&shard0), Arc::weak_count(&shard0)), - (3, 1), - "strong: handleinner(per_timeline_state), shard0, mgr; weak: myself" - ); // // demonstrate that Handle holds up gate closure @@ -640,21 +763,11 @@ mod tests { // SHUTDOWN shard0.per_timeline_state.shutdown(); // keeping handle alive across shutdown - assert_eq!( - 1, - Weak::strong_count(&handle_inner_weak), - "through local var handle" - ); assert_eq!( cache.map.len(), 1, "this is an implementation detail but worth pointing out: we can't clear the cache from shutdown(), it's cleared on first access after" ); - assert_eq!( - 
(Arc::strong_count(&shard0), Arc::weak_count(&shard0)), - (3, 1), - "strong: handleinner(via handle), shard0, mgr; weak: myself" - ); // this handle is perfectly usable handle.getpage(); @@ -678,16 +791,6 @@ mod tests { } drop(handle); - assert_eq!( - 0, - Weak::strong_count(&handle_inner_weak), - "the HandleInner destructor already ran" - ); - assert_eq!( - (Arc::strong_count(&shard0), Arc::weak_count(&shard0)), - (2, 1), - "strong: shard0, mgr; weak: myself" - ); // closing gate succeeds after dropping handle tokio::select! { @@ -706,10 +809,8 @@ mod tests { assert_eq!(cache.map.len(), 0); // ensure all refs to shard0 are gone and we're not leaking anything - let myself = Weak::clone(&shard0.myself); drop(shard0); drop(mgr); - assert_eq!(Weak::strong_count(&myself), 0); } #[tokio::test] @@ -948,15 +1049,11 @@ mod tests { handle }; handle.getpage(); - used_handles.push(Arc::downgrade(&handle.0)); + used_handles.push(Arc::downgrade(&handle.timeline)); } - // No handles exist, thus gates are closed and don't require shutdown - assert!(used_handles - .iter() - .all(|weak| Weak::strong_count(weak) == 0)); - - // ... thus the gate should close immediately, even without shutdown + // No handles exist, thus gates are closed and don't require shutdown. + // Thus the gate should close immediately, even without shutdown. tokio::select! 
{ _ = shard0.gate.close() => { } _ = tokio::time::sleep(FOREVER) => { @@ -964,4 +1061,75 @@ mod tests { } } } + + #[tokio::test(start_paused = true)] + async fn test_weak_handles() { + crate::tenant::harness::setup_logging(); + let timeline_id = TimelineId::generate(); + let shard0 = Arc::new_cyclic(|myself| StubTimeline { + gate: Default::default(), + id: timeline_id, + shard: ShardIdentity::unsharded(), + per_timeline_state: PerTimelineState::default(), + myself: myself.clone(), + }); + let mgr = StubManager { + shards: vec![shard0.clone()], + }; + + let refcount_start = Arc::strong_count(&shard0); + + let key = DBDIR_KEY; + + let mut cache = Cache::::default(); + + let handle = cache + .get(timeline_id, ShardSelector::Page(key), &mgr) + .await + .expect("we have the timeline"); + assert!(Weak::ptr_eq(&handle.myself, &shard0.myself)); + + let weak_handle = handle.downgrade(); + + drop(handle); + + let upgraded_handle = weak_handle.upgrade().ok().expect("we can upgrade it"); + + // Start shutdown + shard0.per_timeline_state.shutdown(); + + // Upgrades during shutdown don't work, even if upgraded_handle exists. + weak_handle + .upgrade() + .err() + .expect("can't upgrade weak handle as soon as shutdown started"); + + // But upgraded_handle is still alive, so the gate won't close. + tokio::select! { + _ = shard0.gate.close() => { + panic!("handle is keeping gate open"); + } + _ = tokio::time::sleep(FOREVER) => { } + } + + // Drop the last handle. + drop(upgraded_handle); + + // The gate should close now, despite there still being a weak_handle. + tokio::select! { + _ = shard0.gate.close() => { } + _ = tokio::time::sleep(FOREVER) => { + panic!("only strong handle is dropped and we shut down per-timeline-state") + } + } + + // The weak handle still can't be upgraded. + weak_handle + .upgrade() + .err() + .expect("still shouldn't be able to upgrade the weak handle"); + + // There should be no strong references to the timeline object except the one on "stack". 
+ assert_eq!(Arc::strong_count(&shard0), refcount_start); + } } diff --git a/pageserver/src/tenant/timeline/walreceiver/walreceiver_connection.rs b/pageserver/src/tenant/timeline/walreceiver/walreceiver_connection.rs index d74faa1af5..01c272633c 100644 --- a/pageserver/src/tenant/timeline/walreceiver/walreceiver_connection.rs +++ b/pageserver/src/tenant/timeline/walreceiver/walreceiver_connection.rs @@ -140,7 +140,7 @@ pub(super) async fn handle_walreceiver_connection( let (replication_client, connection) = { let mut config = wal_source_connconf.to_tokio_postgres_config(); - config.application_name("pageserver"); + config.application_name(format!("pageserver-{}", node.0).as_str()); config.replication_mode(tokio_postgres::config::ReplicationMode::Physical); match time::timeout(connect_timeout, config.connect(postgres::NoTls)).await { Ok(client_and_conn) => client_and_conn?, @@ -264,6 +264,8 @@ pub(super) async fn handle_walreceiver_connection( let mut walingest = WalIngest::new(timeline.as_ref(), startpoint, &ctx).await?; + let shard = vec![*timeline.get_shard_identity()]; + let interpreted_proto_config = match protocol { PostgresClientProtocol::Vanilla => None, PostgresClientProtocol::Interpreted { @@ -403,7 +405,7 @@ pub(super) async fn handle_walreceiver_connection( // need to advance last record LSN on all shards. If we've not ingested the latest // record, then set the LSN of the modification past it. This way all shards // advance their last record LSN at the same time. 
- let needs_last_record_lsn_advance = match next_record_lsn.map(Lsn::from) { + let needs_last_record_lsn_advance = match next_record_lsn { Some(lsn) if lsn > modification.get_lsn() => { modification.set_lsn(lsn).unwrap(); true @@ -476,10 +478,12 @@ pub(super) async fn handle_walreceiver_connection( // Deserialize and interpret WAL record let interpreted = InterpretedWalRecord::from_bytes_filtered( recdata, - modification.tline.get_shard_identity(), + &shard, next_record_lsn, modification.tline.pg_version, - )?; + )? + .remove(timeline.get_shard_identity()) + .unwrap(); if matches!(interpreted.flush_uncommitted, FlushUncommittedRecords::Yes) && uncommitted_records > 0 diff --git a/pageserver/src/tenant/upload_queue.rs b/pageserver/src/tenant/upload_queue.rs index ef3aa759f3..d302205ffe 100644 --- a/pageserver/src/tenant/upload_queue.rs +++ b/pageserver/src/tenant/upload_queue.rs @@ -1,28 +1,38 @@ +use std::collections::{HashMap, HashSet, VecDeque}; +use std::fmt::Debug; +use std::sync::atomic::AtomicU32; +use std::sync::Arc; + +use super::remote_timeline_client::is_same_remote_layer_path; +use super::storage_layer::AsLayerDesc as _; use super::storage_layer::LayerName; use super::storage_layer::ResidentLayer; use crate::tenant::metadata::TimelineMetadata; use crate::tenant::remote_timeline_client::index::IndexPart; use crate::tenant::remote_timeline_client::index::LayerFileMetadata; -use std::collections::HashSet; -use std::collections::{HashMap, VecDeque}; -use std::fmt::Debug; +use utils::generation::Generation; +use utils::lsn::{AtomicLsn, Lsn}; use chrono::NaiveDateTime; -use std::sync::Arc; +use once_cell::sync::Lazy; use tracing::info; -use utils::lsn::AtomicLsn; -use std::sync::atomic::AtomicU32; -use utils::lsn::Lsn; +/// Kill switch for upload queue reordering in case it causes problems. +/// TODO: remove this once we have confidence in it. 
+static DISABLE_UPLOAD_QUEUE_REORDERING: Lazy = + Lazy::new(|| std::env::var("DISABLE_UPLOAD_QUEUE_REORDERING").as_deref() == Ok("true")); -use utils::generation::Generation; +/// Kill switch for index upload coalescing in case it causes problems. +/// TODO: remove this once we have confidence in it. +static DISABLE_UPLOAD_QUEUE_INDEX_COALESCING: Lazy = + Lazy::new(|| std::env::var("DISABLE_UPLOAD_QUEUE_INDEX_COALESCING").as_deref() == Ok("true")); // clippy warns that Uninitialized is much smaller than Initialized, which wastes // memory for Uninitialized variants. Doesn't matter in practice, there are not // that many upload queues in a running pageserver, and most of them are initialized // anyway. #[allow(clippy::large_enum_variant)] -pub(super) enum UploadQueue { +pub enum UploadQueue { Uninitialized, Initialized(UploadQueueInitialized), Stopped(UploadQueueStopped), @@ -39,13 +49,16 @@ impl UploadQueue { } #[derive(Copy, Clone, PartialEq, Eq, Hash, Debug)] -pub(crate) enum OpType { +pub enum OpType { MayReorder, FlushDeletion, } /// This keeps track of queued and in-progress tasks. -pub(crate) struct UploadQueueInitialized { +pub struct UploadQueueInitialized { + /// Maximum number of inprogress tasks to schedule. 0 is no limit. + pub(crate) inprogress_limit: usize, + /// Counter to assign task IDs pub(crate) task_counter: u64, @@ -70,21 +83,16 @@ pub(crate) struct UploadQueueInitialized { /// we skip validation) pub(crate) visible_remote_consistent_lsn: Arc, - // Breakdown of different kinds of tasks currently in-progress - pub(crate) num_inprogress_layer_uploads: usize, - pub(crate) num_inprogress_metadata_uploads: usize, - pub(crate) num_inprogress_deletions: usize, - /// Tasks that are currently in-progress. In-progress means that a tokio Task /// has been launched for it. An in-progress task can be busy uploading, but it can /// also be waiting on the `concurrency_limiter` Semaphore in S3Bucket, or it can /// be waiting for retry in `exponential_backoff`. 
- pub(crate) inprogress_tasks: HashMap>, + pub inprogress_tasks: HashMap>, /// Queued operations that have not been launched yet. They might depend on previous /// tasks to finish. For example, metadata upload cannot be performed before all /// preceding layer file uploads have completed. - pub(crate) queued_operations: VecDeque, + pub queued_operations: VecDeque, /// Files which have been unlinked but not yet had scheduled a deletion for. Only kept around /// for error logging. @@ -122,6 +130,167 @@ impl UploadQueueInitialized { let lsn = self.clean.0.metadata.disk_consistent_lsn(); self.clean.1.map(|_| lsn) } + + /// Returns and removes the next ready operation from the queue, if any. This isn't necessarily + /// the first operation in the queue, to avoid head-of-line blocking -- an operation can jump + /// the queue if it doesn't conflict with operations ahead of it. + /// + /// Also returns any operations that were coalesced into this one, e.g. multiple index uploads. + /// + /// None may be returned even if the queue isn't empty, if no operations are ready yet. + /// + /// NB: this is quadratic, but queues are expected to be small, and bounded by inprogress_limit. + pub fn next_ready(&mut self) -> Option<(UploadOp, Vec)> { + // If inprogress_tasks is already at limit, don't schedule anything more. + if self.inprogress_limit > 0 && self.inprogress_tasks.len() >= self.inprogress_limit { + return None; + } + + for (i, candidate) in self.queued_operations.iter().enumerate() { + // If this candidate is ready, go for it. Otherwise, try the next one. + if self.is_ready(i) { + // Shutdown operations are left at the head of the queue, to prevent further + // operations from starting. Signal that we're ready to shut down. 
+ if matches!(candidate, UploadOp::Shutdown) { + assert!(self.inprogress_tasks.is_empty(), "shutdown with tasks"); + assert_eq!(i, 0, "shutdown not at head of queue"); + self.shutdown_ready.close(); + return None; + } + + let mut op = self.queued_operations.remove(i).expect("i can't disappear"); + + // Coalesce any back-to-back index uploads by only uploading the newest one that's + // ready. This typically happens with layer/index/layer/index/... sequences, where + // the layers bypass the indexes, leaving the indexes queued. + // + // If other operations are interleaved between index uploads we don't try to + // coalesce them, since we may as well update the index concurrently with them. + // This keeps the index fresh and avoids starvation. + // + // NB: we assume that all uploaded indexes have the same remote path. This + // is true at the time of writing: the path only depends on the tenant, + // timeline and generation, all of which are static for a timeline instance. + // Otherwise, we must be careful not to coalesce different paths. + let mut coalesced_ops = Vec::new(); + if matches!(op, UploadOp::UploadMetadata { .. }) { + while let Some(UploadOp::UploadMetadata { .. }) = self.queued_operations.get(i) + { + if *DISABLE_UPLOAD_QUEUE_INDEX_COALESCING { + break; + } + if !self.is_ready(i) { + break; + } + coalesced_ops.push(op); + op = self.queued_operations.remove(i).expect("i can't disappear"); + } + } + + return Some((op, coalesced_ops)); + } + + // Nothing can bypass a barrier or shutdown. If it wasn't scheduled above, give up. + if matches!(candidate, UploadOp::Barrier(_) | UploadOp::Shutdown) { + return None; + } + + // If upload queue reordering is disabled, bail out after the first operation. + if *DISABLE_UPLOAD_QUEUE_REORDERING { + return None; + } + } + None + } + + /// Returns true if the queued operation at the given position is ready to be uploaded, i.e. if + /// it doesn't conflict with any in-progress or queued operations ahead of it. 
Operations are + /// allowed to skip the queue when it's safe to do so, to increase parallelism. + /// + /// The position must be valid for the queue size. + fn is_ready(&self, pos: usize) -> bool { + let candidate = self.queued_operations.get(pos).expect("invalid position"); + self + // Look at in-progress operations, in random order. + .inprogress_tasks + .values() + .map(|task| &task.op) + // Then queued operations ahead of the candidate, front-to-back. + .chain(self.queued_operations.iter().take(pos)) + // Keep track of the active index ahead of each operation. This is used to ensure that + // an upload doesn't skip the queue too far, such that it modifies a layer that's + // referenced by an active index. + // + // It's okay that in-progress operations are emitted in random order above, since at + // most one of them can be an index upload (enforced by can_bypass). + .scan(&self.clean.0, |next_active_index, op| { + let active_index = *next_active_index; + if let UploadOp::UploadMetadata { ref uploaded } = op { + *next_active_index = uploaded; // stash index for next operation after this + } + Some((op, active_index)) + }) + // Check if the candidate can bypass all of them. + .all(|(op, active_index)| candidate.can_bypass(op, active_index)) + } + + /// Returns the number of in-progress deletion operations. + #[cfg(test)] + pub(crate) fn num_inprogress_deletions(&self) -> usize { + self.inprogress_tasks + .iter() + .filter(|(_, t)| matches!(t.op, UploadOp::Delete(_))) + .count() + } + + /// Returns the number of in-progress layer uploads. + #[cfg(test)] + pub(crate) fn num_inprogress_layer_uploads(&self) -> usize { + self.inprogress_tasks + .iter() + .filter(|(_, t)| matches!(t.op, UploadOp::UploadLayer(_, _, _))) + .count() + } + + /// Test helper that schedules all ready operations into inprogress_tasks, and returns + /// references to them. 
+ /// + /// TODO: the corresponding production logic should be moved from RemoteTimelineClient into + /// UploadQueue, so we can use the same code path. + #[cfg(test)] + fn schedule_ready(&mut self) -> Vec> { + let mut tasks = Vec::new(); + // NB: schedule operations one by one, to handle conflicts with inprogress_tasks. + while let Some((op, coalesced_ops)) = self.next_ready() { + self.task_counter += 1; + let task = Arc::new(UploadTask { + task_id: self.task_counter, + op, + coalesced_ops, + retries: 0.into(), + }); + self.inprogress_tasks.insert(task.task_id, task.clone()); + tasks.push(task); + } + tasks + } + + /// Test helper that marks an operation as completed, removing it from inprogress_tasks. + /// + /// TODO: the corresponding production logic should be moved from RemoteTimelineClient into + /// UploadQueue, so we can use the same code path. + #[cfg(test)] + fn complete(&mut self, task_id: u64) { + let Some(task) = self.inprogress_tasks.remove(&task_id) else { + return; + }; + // Update the clean index on uploads. 
+ if let UploadOp::UploadMetadata { ref uploaded } = task.op { + if task.task_id > self.clean.1.unwrap_or_default() { + self.clean = (*uploaded.clone(), Some(task.task_id)); + } + } + } } #[derive(Clone, Copy)] @@ -131,12 +300,12 @@ pub(super) enum SetDeletedFlagProgress { Successful(NaiveDateTime), } -pub(super) struct UploadQueueStoppedDeletable { +pub struct UploadQueueStoppedDeletable { pub(super) upload_queue_for_deletion: UploadQueueInitialized, pub(super) deleted_at: SetDeletedFlagProgress, } -pub(super) enum UploadQueueStopped { +pub enum UploadQueueStopped { Deletable(UploadQueueStoppedDeletable), Uninitialized, } @@ -163,9 +332,10 @@ impl NotInitialized { } impl UploadQueue { - pub(crate) fn initialize_empty_remote( + pub fn initialize_empty_remote( &mut self, metadata: &TimelineMetadata, + inprogress_limit: usize, ) -> anyhow::Result<&mut UploadQueueInitialized> { match self { UploadQueue::Uninitialized => (), @@ -179,15 +349,13 @@ impl UploadQueue { let index_part = IndexPart::empty(metadata.clone()); let state = UploadQueueInitialized { + inprogress_limit, dirty: index_part.clone(), clean: (index_part, None), latest_files_changes_since_metadata_upload_scheduled: 0, visible_remote_consistent_lsn: Arc::new(AtomicLsn::new(0)), // what follows are boring default initializations task_counter: 0, - num_inprogress_layer_uploads: 0, - num_inprogress_metadata_uploads: 0, - num_inprogress_deletions: 0, inprogress_tasks: HashMap::new(), queued_operations: VecDeque::new(), #[cfg(feature = "testing")] @@ -202,9 +370,10 @@ impl UploadQueue { Ok(self.initialized_mut().expect("we just set it")) } - pub(crate) fn initialize_with_current_remote_index_part( + pub fn initialize_with_current_remote_index_part( &mut self, index_part: &IndexPart, + inprogress_limit: usize, ) -> anyhow::Result<&mut UploadQueueInitialized> { match self { UploadQueue::Uninitialized => (), @@ -219,6 +388,7 @@ impl UploadQueue { ); let state = UploadQueueInitialized { + inprogress_limit, dirty: 
index_part.clone(), clean: (index_part.clone(), None), latest_files_changes_since_metadata_upload_scheduled: 0, @@ -227,9 +397,6 @@ impl UploadQueue { ), // what follows are boring default initializations task_counter: 0, - num_inprogress_layer_uploads: 0, - num_inprogress_metadata_uploads: 0, - num_inprogress_deletions: 0, inprogress_tasks: HashMap::new(), queued_operations: VecDeque::new(), #[cfg(feature = "testing")] @@ -244,9 +411,7 @@ impl UploadQueue { Ok(self.initialized_mut().expect("we just set it")) } - pub(crate) fn initialized_mut( - &mut self, - ) -> Result<&mut UploadQueueInitialized, NotInitialized> { + pub fn initialized_mut(&mut self) -> Result<&mut UploadQueueInitialized, NotInitialized> { use UploadQueue::*; match self { Uninitialized => Err(NotInitialized::Uninitialized), @@ -276,23 +441,27 @@ impl UploadQueue { /// An in-progress upload or delete task. #[derive(Debug)] -pub(crate) struct UploadTask { +pub struct UploadTask { /// Unique ID of this task. Used as the key in `inprogress_tasks` above. - pub(crate) task_id: u64, - pub(crate) retries: AtomicU32, - - pub(crate) op: UploadOp, + pub task_id: u64, + /// Number of task retries. + pub retries: AtomicU32, + /// The upload operation. + pub op: UploadOp, + /// Any upload operations that were coalesced into this operation. This typically happens with + /// back-to-back index uploads, see `UploadQueueInitialized::next_ready()`. + pub coalesced_ops: Vec, } /// A deletion of some layers within the lifetime of a timeline. This is not used /// for timeline deletion, which skips this queue and goes directly to DeletionQueue. #[derive(Debug, Clone)] -pub(crate) struct Delete { - pub(crate) layers: Vec<(LayerName, LayerFileMetadata)>, +pub struct Delete { + pub layers: Vec<(LayerName, LayerFileMetadata)>, } -#[derive(Debug)] -pub(crate) enum UploadOp { +#[derive(Clone, Debug)] +pub enum UploadOp { /// Upload a layer file. The last field indicates the last operation for thie file. 
UploadLayer(ResidentLayer, LayerFileMetadata, Option), @@ -338,3 +507,796 @@ impl std::fmt::Display for UploadOp { } } } + +impl UploadOp { + /// Returns true if self can bypass other, i.e. if the operations don't conflict. index is the + /// active index when other would be uploaded -- if we allow self to bypass other, this would + /// be the active index when self is uploaded. + pub fn can_bypass(&self, other: &UploadOp, index: &IndexPart) -> bool { + match (self, other) { + // Nothing can bypass a barrier or shutdown, and it can't bypass anything. + (UploadOp::Barrier(_), _) | (_, UploadOp::Barrier(_)) => false, + (UploadOp::Shutdown, _) | (_, UploadOp::Shutdown) => false, + + // Uploads and deletes can bypass each other unless they're for the same file. + (UploadOp::UploadLayer(a, ameta, _), UploadOp::UploadLayer(b, bmeta, _)) => { + let aname = &a.layer_desc().layer_name(); + let bname = &b.layer_desc().layer_name(); + !is_same_remote_layer_path(aname, ameta, bname, bmeta) + } + (UploadOp::UploadLayer(u, umeta, _), UploadOp::Delete(d)) + | (UploadOp::Delete(d), UploadOp::UploadLayer(u, umeta, _)) => { + d.layers.iter().all(|(dname, dmeta)| { + !is_same_remote_layer_path(&u.layer_desc().layer_name(), umeta, dname, dmeta) + }) + } + + // Deletes are idempotent and can always bypass each other. + (UploadOp::Delete(_), UploadOp::Delete(_)) => true, + + // Uploads and deletes can bypass an index upload as long as neither the uploaded index + // nor the active index below it references the file. A layer can't be modified or + // deleted while referenced by an index. + // + // Similarly, index uploads can bypass uploads and deletes as long as neither the + // uploaded index nor the active index references the file (the latter would be + // incorrect use by the caller). 
+ (UploadOp::UploadLayer(u, umeta, _), UploadOp::UploadMetadata { uploaded: i }) + | (UploadOp::UploadMetadata { uploaded: i }, UploadOp::UploadLayer(u, umeta, _)) => { + let uname = u.layer_desc().layer_name(); + !i.references(&uname, umeta) && !index.references(&uname, umeta) + } + (UploadOp::Delete(d), UploadOp::UploadMetadata { uploaded: i }) + | (UploadOp::UploadMetadata { uploaded: i }, UploadOp::Delete(d)) => { + d.layers.iter().all(|(dname, dmeta)| { + !i.references(dname, dmeta) && !index.references(dname, dmeta) + }) + } + + // Indexes can never bypass each other. They can coalesce though, and + // `UploadQueue::next_ready()` currently does this when possible. + (UploadOp::UploadMetadata { .. }, UploadOp::UploadMetadata { .. }) => false, + } + } +} + +#[cfg(test)] +mod tests { + use super::*; + use crate::tenant::harness::{TenantHarness, TIMELINE_ID}; + use crate::tenant::storage_layer::layer::local_layer_path; + use crate::tenant::storage_layer::Layer; + use crate::tenant::Timeline; + use crate::DEFAULT_PG_VERSION; + use itertools::Itertools as _; + use std::str::FromStr as _; + use utils::shard::{ShardCount, ShardIndex, ShardNumber}; + + /// Test helper which asserts that two operations are the same, in lieu of UploadOp PartialEq. + #[track_caller] + fn assert_same_op(a: &UploadOp, b: &UploadOp) { + use UploadOp::*; + match (a, b) { + (UploadLayer(a, ameta, atype), UploadLayer(b, bmeta, btype)) => { + assert_eq!(a.layer_desc().layer_name(), b.layer_desc().layer_name()); + assert_eq!(ameta, bmeta); + assert_eq!(atype, btype); + } + (Delete(a), Delete(b)) => assert_eq!(a.layers, b.layers), + (UploadMetadata { uploaded: a }, UploadMetadata { uploaded: b }) => assert_eq!(a, b), + (Barrier(_), Barrier(_)) => {} + (Shutdown, Shutdown) => {} + (a, b) => panic!("{a:?} != {b:?}"), + } + } + + /// Test helper which asserts that two sets of operations are the same. 
+ #[track_caller] + fn assert_same_ops<'a>( + a: impl IntoIterator, + b: impl IntoIterator, + ) { + a.into_iter() + .zip_eq(b) + .for_each(|(a, b)| assert_same_op(a, b)) + } + + /// Test helper to construct a test timeline. + /// + /// TODO: it really shouldn't be necessary to construct an entire tenant and timeline just to + /// test the upload queue -- decouple ResidentLayer from Timeline. + /// + /// TODO: the upload queue uses TimelineMetadata::example() instead, because there's no way to + /// obtain a TimelineMetadata from a Timeline. + fn make_timeline() -> Arc { + // Grab the current test name from the current thread name. + // TODO: TenantHarness shouldn't take a &'static str, but just leak the test name for now. + let test_name = std::thread::current().name().unwrap().to_string(); + let test_name = Box::leak(test_name.into_boxed_str()); + + let runtime = tokio::runtime::Builder::new_current_thread() + .enable_all() + .build() + .expect("failed to create runtime"); + + runtime + .block_on(async { + let harness = TenantHarness::create(test_name).await?; + let (tenant, ctx) = harness.load().await; + tenant + .create_test_timeline(TIMELINE_ID, Lsn(8), DEFAULT_PG_VERSION, &ctx) + .await + }) + .expect("failed to create timeline") + } + + /// Test helper to construct an (empty) resident layer. + fn make_layer(timeline: &Arc, name: &str) -> ResidentLayer { + make_layer_with_size(timeline, name, 0) + } + + /// Test helper to construct a resident layer with the given size. + fn make_layer_with_size(timeline: &Arc, name: &str, size: usize) -> ResidentLayer { + let metadata = LayerFileMetadata { + generation: timeline.generation, + shard: timeline.get_shard_index(), + file_size: size as u64, + }; + make_layer_with_metadata(timeline, name, metadata) + } + + /// Test helper to construct a layer with the given metadata. 
+ fn make_layer_with_metadata( + timeline: &Arc, + name: &str, + metadata: LayerFileMetadata, + ) -> ResidentLayer { + let name = LayerName::from_str(name).expect("invalid name"); + let local_path = local_layer_path( + timeline.conf, + &timeline.tenant_shard_id, + &timeline.timeline_id, + &name, + &metadata.generation, + ); + std::fs::write(&local_path, vec![0; metadata.file_size as usize]) + .expect("failed to write file"); + Layer::for_resident(timeline.conf, timeline, local_path, name, metadata) + } + + /// Test helper to add a layer to an index and return a new index. + fn index_with(index: &IndexPart, layer: &ResidentLayer) -> Box { + let mut index = index.clone(); + index + .layer_metadata + .insert(layer.layer_desc().layer_name(), layer.metadata()); + Box::new(index) + } + + /// Test helper to remove a layer from an index and return a new index. + fn index_without(index: &IndexPart, layer: &ResidentLayer) -> Box { + let mut index = index.clone(); + index + .layer_metadata + .remove(&layer.layer_desc().layer_name()); + Box::new(index) + } + + /// Nothing can bypass a barrier, and it can't bypass inprogress tasks. 
+ #[test] + fn schedule_barrier() -> anyhow::Result<()> { + let mut queue = UploadQueue::Uninitialized; + let queue = queue.initialize_empty_remote(&TimelineMetadata::example(), 0)?; + let tli = make_timeline(); + + let index = Box::new(queue.clean.0.clone()); // empty, doesn't matter + let layer0 = make_layer(&tli, "000000000000000000000000000000000000-100000000000000000000000000000000000__00000000016B59D8-00000000016B5A51"); + let layer1 = make_layer(&tli, "100000000000000000000000000000000000-200000000000000000000000000000000000__00000000016B59D8-00000000016B5A51"); + let layer2 = make_layer(&tli, "200000000000000000000000000000000000-300000000000000000000000000000000000__00000000016B59D8-00000000016B5A51"); + let layer3 = make_layer(&tli, "300000000000000000000000000000000000-400000000000000000000000000000000000__00000000016B59D8-00000000016B5A51"); + let (barrier, _) = tokio::sync::watch::channel(()); + + // Enqueue non-conflicting upload, delete, and index before and after a barrier. + let ops = [ + UploadOp::UploadLayer(layer0.clone(), layer0.metadata(), None), + UploadOp::Delete(Delete { + layers: vec![(layer1.layer_desc().layer_name(), layer1.metadata())], + }), + UploadOp::UploadMetadata { + uploaded: index.clone(), + }, + UploadOp::Barrier(barrier), + UploadOp::UploadLayer(layer2.clone(), layer2.metadata(), None), + UploadOp::Delete(Delete { + layers: vec![(layer3.layer_desc().layer_name(), layer3.metadata())], + }), + UploadOp::UploadMetadata { + uploaded: index.clone(), + }, + ]; + + queue.queued_operations.extend(ops.clone()); + + // Schedule the initial operations ahead of the barrier. + let tasks = queue.schedule_ready(); + + assert_same_ops(tasks.iter().map(|t| &t.op), &ops[0..3]); + assert!(matches!( + queue.queued_operations.front(), + Some(&UploadOp::Barrier(_)) + )); + + // Complete the initial operations. The barrier isn't scheduled while they're pending. 
+ for task in tasks { + assert!(queue.schedule_ready().is_empty()); + queue.complete(task.task_id); + } + + // Schedule the barrier. The later tasks won't schedule until it completes. + let tasks = queue.schedule_ready(); + + assert_eq!(tasks.len(), 1); + assert!(matches!(tasks[0].op, UploadOp::Barrier(_))); + assert_eq!(queue.queued_operations.len(), 3); + + // Complete the barrier. The rest of the tasks schedule immediately. + queue.complete(tasks[0].task_id); + + let tasks = queue.schedule_ready(); + assert_same_ops(tasks.iter().map(|t| &t.op), &ops[4..]); + assert!(queue.queued_operations.is_empty()); + + Ok(()) + } + + /// Deletes can be scheduled in parallel, even if they're for the same file. + #[test] + fn schedule_delete_parallel() -> anyhow::Result<()> { + let mut queue = UploadQueue::Uninitialized; + let queue = queue.initialize_empty_remote(&TimelineMetadata::example(), 0)?; + let tli = make_timeline(); + + // Enqueue a bunch of deletes, some with conflicting names. + let layer0 = make_layer(&tli, "000000000000000000000000000000000000-100000000000000000000000000000000000__00000000016B59D8-00000000016B5A51"); + let layer1 = make_layer(&tli, "100000000000000000000000000000000000-200000000000000000000000000000000000__00000000016B59D8-00000000016B5A51"); + let layer2 = make_layer(&tli, "200000000000000000000000000000000000-300000000000000000000000000000000000__00000000016B59D8-00000000016B5A51"); + let layer3 = make_layer(&tli, "300000000000000000000000000000000000-400000000000000000000000000000000000__00000000016B59D8-00000000016B5A51"); + + let ops = [ + UploadOp::Delete(Delete { + layers: vec![(layer0.layer_desc().layer_name(), layer0.metadata())], + }), + UploadOp::Delete(Delete { + layers: vec![(layer1.layer_desc().layer_name(), layer1.metadata())], + }), + UploadOp::Delete(Delete { + layers: vec![ + (layer1.layer_desc().layer_name(), layer1.metadata()), + (layer2.layer_desc().layer_name(), layer2.metadata()), + ], + }), + UploadOp::Delete(Delete { + 
layers: vec![(layer2.layer_desc().layer_name(), layer2.metadata())], + }), + UploadOp::Delete(Delete { + layers: vec![(layer3.layer_desc().layer_name(), layer3.metadata())], + }), + ]; + + queue.queued_operations.extend(ops.clone()); + + // Schedule all ready operations. Since deletes don't conflict, they're all scheduled. + let tasks = queue.schedule_ready(); + + assert_same_ops(tasks.iter().map(|t| &t.op), &ops); + assert!(queue.queued_operations.is_empty()); + + Ok(()) + } + + /// Conflicting uploads are serialized. + #[test] + fn schedule_upload_conflicts() -> anyhow::Result<()> { + let mut queue = UploadQueue::Uninitialized; + let queue = queue.initialize_with_current_remote_index_part(&IndexPart::example(), 0)?; + let tli = make_timeline(); + + // Enqueue three versions of the same layer, with different file sizes. + let layer0a = make_layer_with_size(&tli, "000000000000000000000000000000000000-100000000000000000000000000000000000__00000000016B59D8-00000000016B5A51", 1); + let layer0b = make_layer_with_size(&tli, "000000000000000000000000000000000000-100000000000000000000000000000000000__00000000016B59D8-00000000016B5A51", 2); + let layer0c = make_layer_with_size(&tli, "000000000000000000000000000000000000-100000000000000000000000000000000000__00000000016B59D8-00000000016B5A51", 3); + + let ops = [ + UploadOp::UploadLayer(layer0a.clone(), layer0a.metadata(), None), + UploadOp::UploadLayer(layer0b.clone(), layer0b.metadata(), None), + UploadOp::UploadLayer(layer0c.clone(), layer0c.metadata(), None), + ]; + + queue.queued_operations.extend(ops.clone()); + + // Only one version should be scheduled and uploaded at a time. + for op in ops { + let tasks = queue.schedule_ready(); + assert_eq!(tasks.len(), 1); + assert_same_op(&tasks[0].op, &op); + queue.complete(tasks[0].task_id); + } + assert!(queue.schedule_ready().is_empty()); + assert!(queue.queued_operations.is_empty()); + + Ok(()) + } + + /// Conflicting uploads and deletes are serialized. 
+ #[test] + fn schedule_upload_delete_conflicts() -> anyhow::Result<()> { + let mut queue = UploadQueue::Uninitialized; + let queue = queue.initialize_with_current_remote_index_part(&IndexPart::example(), 0)?; + let tli = make_timeline(); + + // Enqueue two layer uploads, with a delete of both layers in between them. These should be + // scheduled one at a time, since deletes can't bypass uploads and vice versa. + let layer0 = make_layer(&tli, "000000000000000000000000000000000000-100000000000000000000000000000000000__00000000016B59D8-00000000016B5A51"); + let layer1 = make_layer(&tli, "100000000000000000000000000000000000-200000000000000000000000000000000000__00000000016B59D8-00000000016B5A51"); + + let ops = [ + UploadOp::UploadLayer(layer0.clone(), layer0.metadata(), None), + UploadOp::Delete(Delete { + layers: vec![ + (layer0.layer_desc().layer_name(), layer0.metadata()), + (layer1.layer_desc().layer_name(), layer1.metadata()), + ], + }), + UploadOp::UploadLayer(layer1.clone(), layer1.metadata(), None), + ]; + + queue.queued_operations.extend(ops.clone()); + + // Only one version should be scheduled and uploaded at a time. + for op in ops { + let tasks = queue.schedule_ready(); + assert_eq!(tasks.len(), 1); + assert_same_op(&tasks[0].op, &op); + queue.complete(tasks[0].task_id); + } + assert!(queue.schedule_ready().is_empty()); + assert!(queue.queued_operations.is_empty()); + + Ok(()) + } + + /// Non-conflicting uploads and deletes can bypass the queue, avoiding the conflicting + /// delete/upload operations at the head of the queue. + #[test] + fn schedule_upload_delete_conflicts_bypass() -> anyhow::Result<()> { + let mut queue = UploadQueue::Uninitialized; + let queue = queue.initialize_with_current_remote_index_part(&IndexPart::example(), 0)?; + let tli = make_timeline(); + + // Enqueue two layer uploads, with a delete of both layers in between them. These should be + // scheduled one at a time, since deletes can't bypass uploads and vice versa. 
+ // + // Also enqueue non-conflicting uploads and deletes at the end. These can bypass the queue + // and run immediately. + let layer0 = make_layer(&tli, "000000000000000000000000000000000000-100000000000000000000000000000000000__00000000016B59D8-00000000016B5A51"); + let layer1 = make_layer(&tli, "100000000000000000000000000000000000-200000000000000000000000000000000000__00000000016B59D8-00000000016B5A51"); + let layer2 = make_layer(&tli, "200000000000000000000000000000000000-300000000000000000000000000000000000__00000000016B59D8-00000000016B5A51"); + let layer3 = make_layer(&tli, "300000000000000000000000000000000000-400000000000000000000000000000000000__00000000016B59D8-00000000016B5A51"); + + let ops = [ + UploadOp::UploadLayer(layer0.clone(), layer0.metadata(), None), + UploadOp::Delete(Delete { + layers: vec![ + (layer0.layer_desc().layer_name(), layer0.metadata()), + (layer1.layer_desc().layer_name(), layer1.metadata()), + ], + }), + UploadOp::UploadLayer(layer1.clone(), layer1.metadata(), None), + UploadOp::UploadLayer(layer2.clone(), layer2.metadata(), None), + UploadOp::Delete(Delete { + layers: vec![(layer3.layer_desc().layer_name(), layer3.metadata())], + }), + ]; + + queue.queued_operations.extend(ops.clone()); + + // Operations 0, 3, and 4 are scheduled immediately. + let tasks = queue.schedule_ready(); + assert_same_ops(tasks.iter().map(|t| &t.op), [&ops[0], &ops[3], &ops[4]]); + assert_eq!(queue.queued_operations.len(), 2); + + Ok(()) + } + + /// Non-conflicting uploads are parallelized. + #[test] + fn schedule_upload_parallel() -> anyhow::Result<()> { + let mut queue = UploadQueue::Uninitialized; + let queue = queue.initialize_with_current_remote_index_part(&IndexPart::example(), 0)?; + let tli = make_timeline(); + + // Enqueue three different layer uploads. 
+ let layer0 = make_layer(&tli, "000000000000000000000000000000000000-100000000000000000000000000000000000__00000000016B59D8-00000000016B5A51"); + let layer1 = make_layer(&tli, "100000000000000000000000000000000000-200000000000000000000000000000000000__00000000016B59D8-00000000016B5A51"); + let layer2 = make_layer(&tli, "200000000000000000000000000000000000-300000000000000000000000000000000000__00000000016B59D8-00000000016B5A51"); + + let ops = [ + UploadOp::UploadLayer(layer0.clone(), layer0.metadata(), None), + UploadOp::UploadLayer(layer1.clone(), layer1.metadata(), None), + UploadOp::UploadLayer(layer2.clone(), layer2.metadata(), None), + ]; + + queue.queued_operations.extend(ops.clone()); + + // All uploads should be scheduled concurrently. + let tasks = queue.schedule_ready(); + + assert_same_ops(tasks.iter().map(|t| &t.op), &ops); + assert!(queue.queued_operations.is_empty()); + + Ok(()) + } + + /// Index uploads are coalesced. + #[test] + fn schedule_index_coalesce() -> anyhow::Result<()> { + let mut queue = UploadQueue::Uninitialized; + let queue = queue.initialize_with_current_remote_index_part(&IndexPart::example(), 0)?; + + // Enqueue three uploads of the current empty index. + let index = Box::new(queue.clean.0.clone()); + + let ops = [ + UploadOp::UploadMetadata { + uploaded: index.clone(), + }, + UploadOp::UploadMetadata { + uploaded: index.clone(), + }, + UploadOp::UploadMetadata { + uploaded: index.clone(), + }, + ]; + + queue.queued_operations.extend(ops.clone()); + + // The index uploads are coalesced into a single operation. + let tasks = queue.schedule_ready(); + assert_eq!(tasks.len(), 1); + assert_same_op(&tasks[0].op, &ops[2]); + assert_same_ops(&tasks[0].coalesced_ops, &ops[0..2]); + + assert!(queue.queued_operations.is_empty()); + + Ok(()) + } + + /// Chains of upload/index operations lead to parallel layer uploads and serial index uploads. + /// This is the common case with layer flushes. 
+ #[test] + fn schedule_index_upload_chain() -> anyhow::Result<()> { + let mut queue = UploadQueue::Uninitialized; + let queue = queue.initialize_with_current_remote_index_part(&IndexPart::example(), 0)?; + let tli = make_timeline(); + + // Enqueue three uploads of the current empty index. + let index = Box::new(queue.clean.0.clone()); + let layer0 = make_layer(&tli, "000000000000000000000000000000000000-100000000000000000000000000000000000__00000000016B59D8-00000000016B5A51"); + let index0 = index_with(&index, &layer0); + let layer1 = make_layer(&tli, "100000000000000000000000000000000000-200000000000000000000000000000000000__00000000016B59D8-00000000016B5A51"); + let index1 = index_with(&index0, &layer1); + let layer2 = make_layer(&tli, "200000000000000000000000000000000000-300000000000000000000000000000000000__00000000016B59D8-00000000016B5A51"); + let index2 = index_with(&index1, &layer2); + + let ops = [ + UploadOp::UploadLayer(layer0.clone(), layer0.metadata(), None), + UploadOp::UploadMetadata { + uploaded: index0.clone(), + }, + UploadOp::UploadLayer(layer1.clone(), layer1.metadata(), None), + UploadOp::UploadMetadata { + uploaded: index1.clone(), + }, + UploadOp::UploadLayer(layer2.clone(), layer2.metadata(), None), + UploadOp::UploadMetadata { + uploaded: index2.clone(), + }, + ]; + + queue.queued_operations.extend(ops.clone()); + + // The layer uploads should be scheduled immediately. The indexes must wait. + let upload_tasks = queue.schedule_ready(); + assert_same_ops( + upload_tasks.iter().map(|t| &t.op), + [&ops[0], &ops[2], &ops[4]], + ); + + // layer2 completes first. None of the indexes can upload yet. + queue.complete(upload_tasks[2].task_id); + assert!(queue.schedule_ready().is_empty()); + + // layer0 completes. index0 can upload. It completes. 
+ queue.complete(upload_tasks[0].task_id); + let index_tasks = queue.schedule_ready(); + assert_eq!(index_tasks.len(), 1); + assert_same_op(&index_tasks[0].op, &ops[1]); + queue.complete(index_tasks[0].task_id); + + // layer 1 completes. This unblocks index 1 and 2, which coalesce into + // a single upload for index 2. + queue.complete(upload_tasks[1].task_id); + + let index_tasks = queue.schedule_ready(); + assert_eq!(index_tasks.len(), 1); + assert_same_op(&index_tasks[0].op, &ops[5]); + assert_same_ops(&index_tasks[0].coalesced_ops, &ops[3..4]); + + assert!(queue.queued_operations.is_empty()); + + Ok(()) + } + + /// A delete can't bypass an index upload if an index ahead of it still references it. + #[test] + fn schedule_index_delete_dereferenced() -> anyhow::Result<()> { + let mut queue = UploadQueue::Uninitialized; + let queue = queue.initialize_with_current_remote_index_part(&IndexPart::example(), 0)?; + let tli = make_timeline(); + + // Create a layer to upload. + let layer = make_layer(&tli, "000000000000000000000000000000000000-100000000000000000000000000000000000__00000000016B59D8-00000000016B5A51"); + let index_upload = index_with(&queue.clean.0, &layer); + + // Remove the layer reference in a new index, then delete the layer. + let index_deref = index_without(&index_upload, &layer); + + let ops = [ + // Initial upload, with a barrier to prevent index coalescing. + UploadOp::UploadLayer(layer.clone(), layer.metadata(), None), + UploadOp::UploadMetadata { + uploaded: index_upload.clone(), + }, + UploadOp::Barrier(tokio::sync::watch::channel(()).0), + // Dereference the layer and delete it. + UploadOp::UploadMetadata { + uploaded: index_deref.clone(), + }, + UploadOp::Delete(Delete { + layers: vec![(layer.layer_desc().layer_name(), layer.metadata())], + }), + ]; + + queue.queued_operations.extend(ops.clone()); + + // Operations are serialized. 
+ for op in ops { + let tasks = queue.schedule_ready(); + assert_eq!(tasks.len(), 1); + assert_same_op(&tasks[0].op, &op); + queue.complete(tasks[0].task_id); + } + assert!(queue.queued_operations.is_empty()); + + Ok(()) + } + + /// An upload with a reused layer name doesn't clobber the previous layer. Specifically, a + /// dereference/upload/reference cycle can't allow the upload to bypass the reference. + #[test] + fn schedule_index_upload_dereferenced() -> anyhow::Result<()> { + let mut queue = UploadQueue::Uninitialized; + let queue = queue.initialize_with_current_remote_index_part(&IndexPart::example(), 0)?; + let tli = make_timeline(); + + // Create a layer to upload. + let layer = make_layer(&tli, "000000000000000000000000000000000000-100000000000000000000000000000000000__00000000016B59D8-00000000016B5A51"); + + // Upload the layer. Then dereference the layer, and upload/reference it again. + let index_upload = index_with(&queue.clean.0, &layer); + let index_deref = index_without(&index_upload, &layer); + let index_ref = index_with(&index_deref, &layer); + + let ops = [ + // Initial upload, with a barrier to prevent index coalescing. + UploadOp::UploadLayer(layer.clone(), layer.metadata(), None), + UploadOp::UploadMetadata { + uploaded: index_upload.clone(), + }, + UploadOp::Barrier(tokio::sync::watch::channel(()).0), + // Dereference the layer. + UploadOp::UploadMetadata { + uploaded: index_deref.clone(), + }, + // Replace and reference the layer. + UploadOp::UploadLayer(layer.clone(), layer.metadata(), None), + UploadOp::UploadMetadata { + uploaded: index_ref.clone(), + }, + ]; + + queue.queued_operations.extend(ops.clone()); + + // Operations are serialized. + for op in ops { + let tasks = queue.schedule_ready(); + assert_eq!(tasks.len(), 1); + assert_same_op(&tasks[0].op, &op); + queue.complete(tasks[0].task_id); + } + assert!(queue.queued_operations.is_empty()); + + Ok(()) + } + + /// Nothing can bypass a shutdown, and it waits for inprogress tasks. 
It's never returned from + /// next_ready(), but is left at the head of the queue. + #[test] + fn schedule_shutdown() -> anyhow::Result<()> { + let mut queue = UploadQueue::Uninitialized; + let queue = queue.initialize_empty_remote(&TimelineMetadata::example(), 0)?; + let tli = make_timeline(); + + let index = Box::new(queue.clean.0.clone()); // empty, doesn't matter + let layer0 = make_layer(&tli, "000000000000000000000000000000000000-100000000000000000000000000000000000__00000000016B59D8-00000000016B5A51"); + let layer1 = make_layer(&tli, "100000000000000000000000000000000000-200000000000000000000000000000000000__00000000016B59D8-00000000016B5A51"); + let layer2 = make_layer(&tli, "200000000000000000000000000000000000-300000000000000000000000000000000000__00000000016B59D8-00000000016B5A51"); + let layer3 = make_layer(&tli, "300000000000000000000000000000000000-400000000000000000000000000000000000__00000000016B59D8-00000000016B5A51"); + + // Enqueue non-conflicting upload, delete, and index before and after a shutdown. + let ops = [ + UploadOp::UploadLayer(layer0.clone(), layer0.metadata(), None), + UploadOp::Delete(Delete { + layers: vec![(layer1.layer_desc().layer_name(), layer1.metadata())], + }), + UploadOp::UploadMetadata { + uploaded: index.clone(), + }, + UploadOp::Shutdown, + UploadOp::UploadLayer(layer2.clone(), layer2.metadata(), None), + UploadOp::Delete(Delete { + layers: vec![(layer3.layer_desc().layer_name(), layer3.metadata())], + }), + UploadOp::UploadMetadata { + uploaded: index.clone(), + }, + ]; + + queue.queued_operations.extend(ops.clone()); + + // Schedule the initial operations ahead of the shutdown. + let tasks = queue.schedule_ready(); + + assert_same_ops(tasks.iter().map(|t| &t.op), &ops[0..3]); + assert!(matches!( + queue.queued_operations.front(), + Some(&UploadOp::Shutdown) + )); + + // Complete the initial operations. The shutdown isn't triggered while they're pending. 
+ for task in tasks { + assert!(queue.schedule_ready().is_empty()); + queue.complete(task.task_id); + } + + // The shutdown is triggered the next time we try to pull an operation. It isn't returned, + // but is left in the queue. + assert!(!queue.shutdown_ready.is_closed()); + assert!(queue.next_ready().is_none()); + assert!(queue.shutdown_ready.is_closed()); + + Ok(()) + } + + /// Scheduling respects inprogress_limit. + #[test] + fn schedule_inprogress_limit() -> anyhow::Result<()> { + // Create a queue with inprogress_limit=2. + let mut queue = UploadQueue::Uninitialized; + let queue = queue.initialize_empty_remote(&TimelineMetadata::example(), 2)?; + let tli = make_timeline(); + + // Enqueue a bunch of uploads. + let layer0 = make_layer(&tli, "000000000000000000000000000000000000-100000000000000000000000000000000000__00000000016B59D8-00000000016B5A51"); + let layer1 = make_layer(&tli, "100000000000000000000000000000000000-200000000000000000000000000000000000__00000000016B59D8-00000000016B5A51"); + let layer2 = make_layer(&tli, "200000000000000000000000000000000000-300000000000000000000000000000000000__00000000016B59D8-00000000016B5A51"); + let layer3 = make_layer(&tli, "300000000000000000000000000000000000-400000000000000000000000000000000000__00000000016B59D8-00000000016B5A51"); + + let ops = [ + UploadOp::UploadLayer(layer0.clone(), layer0.metadata(), None), + UploadOp::UploadLayer(layer1.clone(), layer1.metadata(), None), + UploadOp::UploadLayer(layer2.clone(), layer2.metadata(), None), + UploadOp::UploadLayer(layer3.clone(), layer3.metadata(), None), + ]; + + queue.queued_operations.extend(ops.clone()); + + // Schedule all ready operations. Only 2 are scheduled. + let tasks = queue.schedule_ready(); + assert_same_ops(tasks.iter().map(|t| &t.op), &ops[0..2]); + assert!(queue.next_ready().is_none()); + + // When one completes, another is scheduled. 
+ queue.complete(tasks[0].task_id); + let tasks = queue.schedule_ready(); + assert_same_ops(tasks.iter().map(|t| &t.op), &ops[2..3]); + + Ok(()) + } + + /// Tests that can_bypass takes name, generation and shard index into account for all operations. + #[test] + fn can_bypass_path() -> anyhow::Result<()> { + let tli = make_timeline(); + + let name0 = &"000000000000000000000000000000000000-100000000000000000000000000000000000__00000000016B59D8-00000000016B5A51"; + let name1 = &"100000000000000000000000000000000000-200000000000000000000000000000000000__00000000016B59D8-00000000016B5A51"; + + // Asserts that layers a and b either can or can't bypass each other, for all combinations + // of operations (except Delete and UploadMetadata which are special-cased). + #[track_caller] + fn assert_can_bypass(a: ResidentLayer, b: ResidentLayer, can_bypass: bool) { + let index = IndexPart::empty(TimelineMetadata::example()); + for (a, b) in make_ops(a).into_iter().zip(make_ops(b)) { + match (&a, &b) { + // Deletes can always bypass each other. + (UploadOp::Delete(_), UploadOp::Delete(_)) => assert!(a.can_bypass(&b, &index)), + // Indexes can never bypass each other. + (UploadOp::UploadMetadata { .. }, UploadOp::UploadMetadata { .. }) => { + assert!(!a.can_bypass(&b, &index)) + } + // For other operations, assert as requested. + (a, b) => assert_eq!(a.can_bypass(b, &index), can_bypass), + } + } + } + + fn make_ops(layer: ResidentLayer) -> Vec { + let mut index = IndexPart::empty(TimelineMetadata::example()); + index + .layer_metadata + .insert(layer.layer_desc().layer_name(), layer.metadata()); + vec![ + UploadOp::UploadLayer(layer.clone(), layer.metadata(), None), + UploadOp::Delete(Delete { + layers: vec![(layer.layer_desc().layer_name(), layer.metadata())], + }), + UploadOp::UploadMetadata { + uploaded: Box::new(index), + }, + ] + } + + // Makes a ResidentLayer. 
+ let layer = |name: &'static str, shard: Option, generation: u32| -> ResidentLayer { + let shard = shard + .map(|n| ShardIndex::new(ShardNumber(n), ShardCount(8))) + .unwrap_or(ShardIndex::unsharded()); + let metadata = LayerFileMetadata { + shard, + generation: Generation::Valid(generation), + file_size: 0, + }; + make_layer_with_metadata(&tli, name, metadata) + }; + + // Same name and metadata can't bypass. This goes both for unsharded and sharded, as well as + // 0 or >0 generation. + assert_can_bypass(layer(name0, None, 0), layer(name0, None, 0), false); + assert_can_bypass(layer(name0, Some(0), 0), layer(name0, Some(0), 0), false); + assert_can_bypass(layer(name0, None, 1), layer(name0, None, 1), false); + + // Different names can bypass. + assert_can_bypass(layer(name0, None, 0), layer(name1, None, 0), true); + + // Different shards can bypass. Shard 0 is different from unsharded. + assert_can_bypass(layer(name0, Some(0), 0), layer(name0, Some(1), 0), true); + assert_can_bypass(layer(name0, Some(0), 0), layer(name0, None, 0), true); + + // Different generations can bypass, both sharded and unsharded. 
+ assert_can_bypass(layer(name0, None, 0), layer(name0, None, 1), true); + assert_can_bypass(layer(name0, Some(1), 0), layer(name0, Some(1), 1), true); + + Ok(()) + } +} diff --git a/pageserver/src/walingest.rs b/pageserver/src/walingest.rs index e5b23fed51..ad7bcc0714 100644 --- a/pageserver/src/walingest.rs +++ b/pageserver/src/walingest.rs @@ -308,7 +308,7 @@ impl WalIngest { epoch -= 1; } - Ok((epoch as u64) << 32 | xid as u64) + Ok(((epoch as u64) << 32) | xid as u64) } async fn ingest_clear_vm_bits( @@ -2163,10 +2163,12 @@ mod tests { while let Some((lsn, recdata)) = decoder.poll_decode().unwrap() { let interpreted = InterpretedWalRecord::from_bytes_filtered( recdata, - modification.tline.get_shard_identity(), + &[*modification.tline.get_shard_identity()], lsn, modification.tline.pg_version, ) + .unwrap() + .remove(modification.tline.get_shard_identity()) .unwrap(); walingest diff --git a/pgxn/hnsw/Makefile b/pgxn/hnsw/Makefile deleted file mode 100644 index 66436b5920..0000000000 --- a/pgxn/hnsw/Makefile +++ /dev/null @@ -1,26 +0,0 @@ -EXTENSION = hnsw -EXTVERSION = 0.1.0 - -MODULE_big = hnsw -DATA = $(wildcard *--*.sql) -OBJS = hnsw.o hnswalg.o - -TESTS = $(wildcard test/sql/*.sql) -REGRESS = $(patsubst test/sql/%.sql,%,$(TESTS)) -REGRESS_OPTS = --inputdir=test --load-extension=hnsw - -# For auto-vectorization: -# - GCC (needs -ftree-vectorize OR -O3) - https://gcc.gnu.org/projects/tree-ssa/vectorization.html -PG_CFLAGS += -O3 -PG_CXXFLAGS += -O3 -std=c++11 -PG_LDFLAGS += -lstdc++ - -all: $(EXTENSION)--$(EXTVERSION).sql - -PG_CONFIG ?= pg_config -PGXS := $(shell $(PG_CONFIG) --pgxs) -include $(PGXS) - -dist: - mkdir -p dist - git archive --format zip --prefix=$(EXTENSION)-$(EXTVERSION)/ --output dist/$(EXTENSION)-$(EXTVERSION).zip master diff --git a/pgxn/hnsw/README.md b/pgxn/hnsw/README.md deleted file mode 100644 index bc9c8d571c..0000000000 --- a/pgxn/hnsw/README.md +++ /dev/null @@ -1,25 +0,0 @@ -# Revisiting the Inverted Indices for Billion-Scale 
Approximate Nearest Neighbors - -This ANN extension of Postgres is based -on [ivf-hnsw](https://github.com/dbaranchuk/ivf-hnsw.git) implementation of [HNSW](https://www.pinecone.io/learn/hnsw), -the code for the current state-of-the-art billion-scale nearest neighbor search system presented in the paper: - -[Revisiting the Inverted Indices for Billion-Scale Approximate Nearest Neighbors](http://openaccess.thecvf.com/content_ECCV_2018/html/Dmitry_Baranchuk_Revisiting_the_Inverted_ECCV_2018_paper.html), -
-Dmitry Baranchuk, Artem Babenko, Yury Malkov - -# Postgres extension - -HNSW index is hold in memory (built on demand) and it's maxial size is limited -by `maxelements` index parameter. Another required parameter is nubmer of dimensions (if it is not specified in column type). -Optional parameter `ef` specifies number of neighbors which are considered during index construction and search (corresponds `efConstruction` and `efSearch` parameters -described in the article). - -# Example of usage: - -``` -create extension hnsw; -create table embeddings(id integer primary key, payload real[]); -create index on embeddings using hnsw(payload) with (maxelements=1000000, dims=100, m=32); -select id from embeddings order by payload <-> array[1.0, 2.0,...] limit 100; -``` \ No newline at end of file diff --git a/pgxn/hnsw/hnsw--0.1.0.sql b/pgxn/hnsw/hnsw--0.1.0.sql deleted file mode 100644 index ebf424326d..0000000000 --- a/pgxn/hnsw/hnsw--0.1.0.sql +++ /dev/null @@ -1,29 +0,0 @@ --- complain if script is sourced in psql, rather than via CREATE EXTENSION -\echo Use "CREATE EXTENSION hnsw" to load this file. 
\quit - --- functions - -CREATE FUNCTION l2_distance(real[], real[]) RETURNS real - AS 'MODULE_PATHNAME' LANGUAGE C IMMUTABLE STRICT PARALLEL SAFE; - --- operators - -CREATE OPERATOR <-> ( - LEFTARG = real[], RIGHTARG = real[], PROCEDURE = l2_distance, - COMMUTATOR = '<->' -); - --- access method - -CREATE FUNCTION hnsw_handler(internal) RETURNS index_am_handler - AS 'MODULE_PATHNAME' LANGUAGE C; - -CREATE ACCESS METHOD hnsw TYPE INDEX HANDLER hnsw_handler; - -COMMENT ON ACCESS METHOD hnsw IS 'hnsw index access method'; - --- opclasses - -CREATE OPERATOR CLASS knn_ops - DEFAULT FOR TYPE real[] USING hnsw AS - OPERATOR 1 <-> (real[], real[]) FOR ORDER BY float_ops; diff --git a/pgxn/hnsw/hnsw.c b/pgxn/hnsw/hnsw.c deleted file mode 100644 index e624cb831f..0000000000 --- a/pgxn/hnsw/hnsw.c +++ /dev/null @@ -1,590 +0,0 @@ -#include "postgres.h" - -#include "access/amapi.h" -#include "access/generic_xlog.h" -#include "access/relation.h" -#include "access/reloptions.h" -#include "access/tableam.h" -#include "catalog/index.h" -#include "commands/vacuum.h" -#include "nodes/execnodes.h" -#include "storage/bufmgr.h" -#include "utils/guc.h" -#include "utils/selfuncs.h" - -#include -#include - -#include "hnsw.h" - -PG_MODULE_MAGIC; - -typedef struct { - int32 vl_len_; /* varlena header (do not touch directly!) 
*/ - int dims; - int maxelements; - int efConstruction; - int efSearch; - int M; -} HnswOptions; - -static relopt_kind hnsw_relopt_kind; - -typedef struct { - HierarchicalNSW* hnsw; - size_t curr; - size_t n_results; - ItemPointer results; -} HnswScanOpaqueData; - -typedef HnswScanOpaqueData* HnswScanOpaque; - -typedef struct { - Oid relid; - uint32 status; - HierarchicalNSW* hnsw; -} HnswHashEntry; - - -#define SH_PREFIX hnsw_index -#define SH_ELEMENT_TYPE HnswHashEntry -#define SH_KEY_TYPE Oid -#define SH_KEY relid -#define SH_STORE_HASH -#define SH_GET_HASH(tb, a) ((a)->relid) -#define SH_HASH_KEY(tb, key) (key) -#define SH_EQUAL(tb, a, b) ((a) == (b)) -#define SH_SCOPE static inline -#define SH_DEFINE -#define SH_DECLARE -#include "lib/simplehash.h" - -#define INDEX_HASH_SIZE 11 - -#define DEFAULT_EF_SEARCH 64 - -PGDLLEXPORT void _PG_init(void); - -static hnsw_index_hash *hnsw_indexes; - -/* - * Initialize index options and variables - */ -void -_PG_init(void) -{ - hnsw_relopt_kind = add_reloption_kind(); - add_int_reloption(hnsw_relopt_kind, "dims", "Number of dimensions", - 0, 0, INT_MAX, AccessExclusiveLock); - add_int_reloption(hnsw_relopt_kind, "maxelements", "Maximal number of elements", - 0, 0, INT_MAX, AccessExclusiveLock); - add_int_reloption(hnsw_relopt_kind, "m", "Number of neighbors of each vertex", - 100, 0, INT_MAX, AccessExclusiveLock); - add_int_reloption(hnsw_relopt_kind, "efconstruction", "Number of inspected neighbors during index construction", - 16, 1, INT_MAX, AccessExclusiveLock); - add_int_reloption(hnsw_relopt_kind, "efsearch", "Number of inspected neighbors during index search", - 64, 1, INT_MAX, AccessExclusiveLock); - hnsw_indexes = hnsw_index_create(TopMemoryContext, INDEX_HASH_SIZE, NULL); -} - - -static void -hnsw_build_callback(Relation index, ItemPointer tid, Datum *values, - bool *isnull, bool tupleIsAlive, void *state) -{ - HierarchicalNSW* hnsw = (HierarchicalNSW*) state; - ArrayType* array; - int n_items; - label_t label = 
0; - - /* Skip nulls */ - if (isnull[0]) - return; - - array = DatumGetArrayTypeP(values[0]); - n_items = ArrayGetNItems(ARR_NDIM(array), ARR_DIMS(array)); - if (n_items != hnsw_dimensions(hnsw)) - { - elog(ERROR, "Wrong number of dimensions: %d instead of %d expected", - n_items, hnsw_dimensions(hnsw)); - } - - memcpy(&label, tid, sizeof(*tid)); - hnsw_add_point(hnsw, (coord_t*)ARR_DATA_PTR(array), label); -} - -static void -hnsw_populate(HierarchicalNSW* hnsw, Relation indexRel, Relation heapRel) -{ - IndexInfo* indexInfo = BuildIndexInfo(indexRel); - Assert(indexInfo->ii_NumIndexAttrs == 1); - table_index_build_scan(heapRel, indexRel, indexInfo, - true, true, hnsw_build_callback, (void *) hnsw, NULL); -} - -#ifdef __APPLE__ - -#include -#include - -static void -hnsw_check_available_memory(Size requested) -{ - size_t total; - if (sysctlbyname("hw.memsize", NULL, &total, NULL, 0) < 0) - elog(ERROR, "Failed to get amount of RAM: %m"); - - if ((Size)NBuffers*BLCKSZ + requested >= total) - elog(ERROR, "HNSW index requeries %ld bytes while only %ld are available", - requested, total - (Size)NBuffers*BLCKSZ); -} - -#else - -#include - -static void -hnsw_check_available_memory(Size requested) -{ - struct sysinfo si; - Size total; - if (sysinfo(&si) < 0) - elog(ERROR, "Failed to get amount of RAM: %m"); - - total = si.totalram*si.mem_unit; - if ((Size)NBuffers*BLCKSZ + requested >= total) - elog(ERROR, "HNSW index requeries %ld bytes while only %ld are available", - requested, total - (Size)NBuffers*BLCKSZ); -} - -#endif - -static HierarchicalNSW* -hnsw_get_index(Relation indexRel, Relation heapRel) -{ - HierarchicalNSW* hnsw; - Oid indexoid = RelationGetRelid(indexRel); - HnswHashEntry* entry = hnsw_index_lookup(hnsw_indexes, indexoid); - if (entry == NULL) - { - size_t dims, maxelements; - size_t M; - size_t maxM; - size_t size_links_level0; - size_t size_data_per_element; - size_t data_size; - dsm_handle handle = indexoid << 1; /* make it even */ - void* impl_private 
= NULL; - void* mapped_address = NULL; - Size mapped_size = 0; - Size shmem_size; - bool exists = true; - bool found; - HnswOptions *opts = (HnswOptions *) indexRel->rd_options; - if (opts == NULL || opts->maxelements == 0 || opts->dims == 0) { - elog(ERROR, "HNSW index requires 'maxelements' and 'dims' to be specified"); - } - dims = opts->dims; - maxelements = opts->maxelements; - M = opts->M; - maxM = M * 2; - data_size = dims * sizeof(coord_t); - size_links_level0 = (maxM + 1) * sizeof(idx_t); - size_data_per_element = size_links_level0 + data_size + sizeof(label_t); - shmem_size = hnsw_sizeof() + maxelements * size_data_per_element; - - hnsw_check_available_memory(shmem_size); - - /* first try to attach to existed index */ - if (!dsm_impl_op(DSM_OP_ATTACH, handle, 0, &impl_private, - &mapped_address, &mapped_size, DEBUG1)) - { - /* index doesn't exists: try to create it */ - if (!dsm_impl_op(DSM_OP_CREATE, handle, shmem_size, &impl_private, - &mapped_address, &mapped_size, DEBUG1)) - { - /* We can do it under shared lock, so some other backend may - * try to initialize index. 
If create is failed because index already - * created by somebody else, then try to attach to it once again - */ - if (!dsm_impl_op(DSM_OP_ATTACH, handle, 0, &impl_private, - &mapped_address, &mapped_size, ERROR)) - { - return NULL; - } - } - else - { - exists = false; - } - } - Assert(mapped_size == shmem_size); - hnsw = (HierarchicalNSW*)mapped_address; - - if (!exists) - { - hnsw_init(hnsw, dims, maxelements, M, maxM, opts->efConstruction); - hnsw_populate(hnsw, indexRel, heapRel); - } - entry = hnsw_index_insert(hnsw_indexes, indexoid, &found); - Assert(!found); - entry->hnsw = hnsw; - } - else - { - hnsw = entry->hnsw; - } - return hnsw; -} - -/* - * Start or restart an index scan - */ -static IndexScanDesc -hnsw_beginscan(Relation index, int nkeys, int norderbys) -{ - IndexScanDesc scan = RelationGetIndexScan(index, nkeys, norderbys); - HnswScanOpaque so = (HnswScanOpaque) palloc(sizeof(HnswScanOpaqueData)); - Relation heap = relation_open(index->rd_index->indrelid, NoLock); - so->hnsw = hnsw_get_index(index, heap); - relation_close(heap, NoLock); - so->curr = 0; - so->n_results = 0; - so->results = NULL; - scan->opaque = so; - return scan; -} - -/* - * Start or restart an index scan - */ -static void -hnsw_rescan(IndexScanDesc scan, ScanKey keys, int nkeys, ScanKey orderbys, int norderbys) -{ - HnswScanOpaque so = (HnswScanOpaque) scan->opaque; - if (so->results) - { - pfree(so->results); - so->results = NULL; - } - so->curr = 0; - if (orderbys && scan->numberOfOrderBys > 0) - memmove(scan->orderByData, orderbys, scan->numberOfOrderBys * sizeof(ScanKeyData)); -} - -/* - * Fetch the next tuple in the given scan - */ -static bool -hnsw_gettuple(IndexScanDesc scan, ScanDirection dir) -{ - HnswScanOpaque so = (HnswScanOpaque) scan->opaque; - - /* - * Index can be used to scan backward, but Postgres doesn't support - * backward scan on operators - */ - Assert(ScanDirectionIsForward(dir)); - - if (so->curr == 0) - { - Datum value; - ArrayType* array; - int 
n_items; - size_t n_results; - label_t* results; - HnswOptions *opts = (HnswOptions *) scan->indexRelation->rd_options; - size_t efSearch = opts ? opts->efSearch : DEFAULT_EF_SEARCH; - - /* Safety check */ - if (scan->orderByData == NULL) - elog(ERROR, "cannot scan HNSW index without order"); - - /* No items will match if null */ - if (scan->orderByData->sk_flags & SK_ISNULL) - return false; - - value = scan->orderByData->sk_argument; - array = DatumGetArrayTypeP(value); - n_items = ArrayGetNItems(ARR_NDIM(array), ARR_DIMS(array)); - if (n_items != hnsw_dimensions(so->hnsw)) - { - elog(ERROR, "Wrong number of dimensions: %d instead of %d expected", - n_items, hnsw_dimensions(so->hnsw)); - } - - if (!hnsw_search(so->hnsw, (coord_t*)ARR_DATA_PTR(array), efSearch, &n_results, &results)) - elog(ERROR, "HNSW index search failed"); - so->results = (ItemPointer)palloc(n_results*sizeof(ItemPointerData)); - so->n_results = n_results; - for (size_t i = 0; i < n_results; i++) - { - memcpy(&so->results[i], &results[i], sizeof(so->results[i])); - } - free(results); - } - if (so->curr >= so->n_results) - { - return false; - } - else - { - scan->xs_heaptid = so->results[so->curr++]; - scan->xs_recheckorderby = false; - return true; - } -} - -/* - * End a scan and release resources - */ -static void -hnsw_endscan(IndexScanDesc scan) -{ - HnswScanOpaque so = (HnswScanOpaque) scan->opaque; - if (so->results) - pfree(so->results); - pfree(so); - scan->opaque = NULL; -} - - -/* - * Estimate the cost of an index scan - */ -static void -hnsw_costestimate(PlannerInfo *root, IndexPath *path, double loop_count, - Cost *indexStartupCost, Cost *indexTotalCost, - Selectivity *indexSelectivity, double *indexCorrelation - ,double *indexPages -) -{ - GenericCosts costs; - - /* Never use index without order */ - if (path->indexorderbys == NULL) - { - *indexStartupCost = DBL_MAX; - *indexTotalCost = DBL_MAX; - *indexSelectivity = 0; - *indexCorrelation = 0; - *indexPages = 0; - return; - } - - 
MemSet(&costs, 0, sizeof(costs)); - - genericcostestimate(root, path, loop_count, &costs); - - /* Startup cost and total cost are same */ - *indexStartupCost = costs.indexTotalCost; - *indexTotalCost = costs.indexTotalCost; - *indexSelectivity = costs.indexSelectivity; - *indexCorrelation = costs.indexCorrelation; - *indexPages = costs.numIndexPages; -} - -/* - * Parse and validate the reloptions - */ -static bytea * -hnsw_options(Datum reloptions, bool validate) -{ - static const relopt_parse_elt tab[] = { - {"dims", RELOPT_TYPE_INT, offsetof(HnswOptions, dims)}, - {"maxelements", RELOPT_TYPE_INT, offsetof(HnswOptions, maxelements)}, - {"efconstruction", RELOPT_TYPE_INT, offsetof(HnswOptions, efConstruction)}, - {"efsearch", RELOPT_TYPE_INT, offsetof(HnswOptions, efSearch)}, - {"m", RELOPT_TYPE_INT, offsetof(HnswOptions, M)} - }; - - return (bytea *) build_reloptions(reloptions, validate, - hnsw_relopt_kind, - sizeof(HnswOptions), - tab, lengthof(tab)); -} - -/* - * Validate catalog entries for the specified operator class - */ -static bool -hnsw_validate(Oid opclassoid) -{ - return true; -} - -/* - * Build the index for a logged table - */ -static IndexBuildResult * -hnsw_build(Relation heap, Relation index, IndexInfo *indexInfo) -{ - HierarchicalNSW* hnsw = hnsw_get_index(index, heap); - IndexBuildResult* result = (IndexBuildResult *) palloc(sizeof(IndexBuildResult)); - result->heap_tuples = result->index_tuples = hnsw_count(hnsw); - - return result; -} - -/* - * Insert a tuple into the index - */ -static bool -hnsw_insert(Relation index, Datum *values, bool *isnull, ItemPointer heap_tid, - Relation heap, IndexUniqueCheck checkUnique, - bool indexUnchanged, - IndexInfo *indexInfo) -{ - HierarchicalNSW* hnsw = hnsw_get_index(index, heap); - Datum value; - ArrayType* array; - int n_items; - label_t label = 0; - - /* Skip nulls */ - if (isnull[0]) - return false; - - /* Detoast value */ - value = PointerGetDatum(PG_DETOAST_DATUM(values[0])); - array = 
DatumGetArrayTypeP(value); - n_items = ArrayGetNItems(ARR_NDIM(array), ARR_DIMS(array)); - if (n_items != hnsw_dimensions(hnsw)) - { - elog(ERROR, "Wrong number of dimensions: %d instead of %d expected", - n_items, hnsw_dimensions(hnsw)); - } - memcpy(&label, heap_tid, sizeof(*heap_tid)); - if (!hnsw_add_point(hnsw, (coord_t*)ARR_DATA_PTR(array), label)) - elog(ERROR, "HNSW index insert failed"); - return true; -} - -/* - * Build the index for an unlogged table - */ -static void -hnsw_buildempty(Relation index) -{ - /* index will be constructed on dema nd when accessed */ -} - -/* - * Clean up after a VACUUM operation - */ -static IndexBulkDeleteResult * -hnsw_vacuumcleanup(IndexVacuumInfo *info, IndexBulkDeleteResult *stats) -{ - Relation rel = info->index; - - if (stats == NULL) - return NULL; - - stats->num_pages = RelationGetNumberOfBlocks(rel); - - return stats; -} - -/* - * Bulk delete tuples from the index - */ -static IndexBulkDeleteResult * -hnsw_bulkdelete(IndexVacuumInfo *info, IndexBulkDeleteResult *stats, - IndexBulkDeleteCallback callback, void *callback_state) -{ - if (stats == NULL) - stats = (IndexBulkDeleteResult *) palloc0(sizeof(IndexBulkDeleteResult)); - return stats; -} - -/* - * Define index handler - * - * See https://www.postgresql.org/docs/current/index-api.html - */ -PGDLLEXPORT PG_FUNCTION_INFO_V1(hnsw_handler); -Datum -hnsw_handler(PG_FUNCTION_ARGS) -{ - IndexAmRoutine *amroutine = makeNode(IndexAmRoutine); - - amroutine->amstrategies = 0; - amroutine->amsupport = 0; - amroutine->amoptsprocnum = 0; - amroutine->amcanorder = false; - amroutine->amcanorderbyop = true; - amroutine->amcanbackward = false; /* can change direction mid-scan */ - amroutine->amcanunique = false; - amroutine->amcanmulticol = false; - amroutine->amoptionalkey = true; - amroutine->amsearcharray = false; - amroutine->amsearchnulls = false; - amroutine->amstorage = false; - amroutine->amclusterable = false; - amroutine->ampredlocks = false; - amroutine->amcanparallel 
= false; - amroutine->amcaninclude = false; - amroutine->amusemaintenanceworkmem = false; /* not used during VACUUM */ - amroutine->amparallelvacuumoptions = VACUUM_OPTION_PARALLEL_BULKDEL; - amroutine->amkeytype = InvalidOid; - - /* Interface functions */ - amroutine->ambuild = hnsw_build; - amroutine->ambuildempty = hnsw_buildempty; - amroutine->aminsert = hnsw_insert; - amroutine->ambulkdelete = hnsw_bulkdelete; - amroutine->amvacuumcleanup = hnsw_vacuumcleanup; - amroutine->amcanreturn = NULL; /* tuple not included in heapsort */ - amroutine->amcostestimate = hnsw_costestimate; - amroutine->amoptions = hnsw_options; - amroutine->amproperty = NULL; /* TODO AMPROP_DISTANCE_ORDERABLE */ - amroutine->ambuildphasename = NULL; - amroutine->amvalidate = hnsw_validate; - amroutine->amadjustmembers = NULL; - amroutine->ambeginscan = hnsw_beginscan; - amroutine->amrescan = hnsw_rescan; - amroutine->amgettuple = hnsw_gettuple; - amroutine->amgetbitmap = NULL; - amroutine->amendscan = hnsw_endscan; - amroutine->ammarkpos = NULL; - amroutine->amrestrpos = NULL; - - /* Interface functions to support parallel index scans */ - amroutine->amestimateparallelscan = NULL; - amroutine->aminitparallelscan = NULL; - amroutine->amparallelrescan = NULL; - - PG_RETURN_POINTER(amroutine); -} - -/* - * Get the L2 distance between vectors - */ -PGDLLEXPORT PG_FUNCTION_INFO_V1(l2_distance); -Datum -l2_distance(PG_FUNCTION_ARGS) -{ - ArrayType *a = PG_GETARG_ARRAYTYPE_P(0); - ArrayType *b = PG_GETARG_ARRAYTYPE_P(1); - int a_dim = ArrayGetNItems(ARR_NDIM(a), ARR_DIMS(a)); - int b_dim = ArrayGetNItems(ARR_NDIM(b), ARR_DIMS(b)); - dist_t distance = 0.0; - dist_t diff; - coord_t *ax = (coord_t*)ARR_DATA_PTR(a); - coord_t *bx = (coord_t*)ARR_DATA_PTR(b); - - if (a_dim != b_dim) - { - ereport(ERROR, - (errcode(ERRCODE_DATA_EXCEPTION), - errmsg("different array dimensions %d and %d", a_dim, b_dim))); - } - - for (int i = 0; i < a_dim; i++) - { - diff = ax[i] - bx[i]; - distance += diff * diff; - } 
- - PG_RETURN_FLOAT4((dist_t)sqrt(distance)); -} diff --git a/pgxn/hnsw/hnsw.control b/pgxn/hnsw/hnsw.control deleted file mode 100644 index fbfa1a5b47..0000000000 --- a/pgxn/hnsw/hnsw.control +++ /dev/null @@ -1,4 +0,0 @@ -comment = '** Deprecated ** Please use pg_embedding instead' -default_version = '0.1.0' -module_pathname = '$libdir/hnsw' -relocatable = true diff --git a/pgxn/hnsw/hnsw.h b/pgxn/hnsw/hnsw.h deleted file mode 100644 index d4065ab8fe..0000000000 --- a/pgxn/hnsw/hnsw.h +++ /dev/null @@ -1,15 +0,0 @@ -#pragma once - -typedef float coord_t; -typedef float dist_t; -typedef uint32_t idx_t; -typedef uint64_t label_t; - -typedef struct HierarchicalNSW HierarchicalNSW; - -bool hnsw_search(HierarchicalNSW* hnsw, const coord_t *point, size_t efSearch, size_t* n_results, label_t** results); -bool hnsw_add_point(HierarchicalNSW* hnsw, const coord_t *point, label_t label); -void hnsw_init(HierarchicalNSW* hnsw, size_t dim, size_t maxelements, size_t M, size_t maxM, size_t efConstruction); -int hnsw_dimensions(HierarchicalNSW* hnsw); -size_t hnsw_count(HierarchicalNSW* hnsw); -size_t hnsw_sizeof(void); diff --git a/pgxn/hnsw/hnswalg.cpp b/pgxn/hnsw/hnswalg.cpp deleted file mode 100644 index f6de3b8314..0000000000 --- a/pgxn/hnsw/hnswalg.cpp +++ /dev/null @@ -1,379 +0,0 @@ -#include "hnswalg.h" - -#if defined(__GNUC__) -#define PORTABLE_ALIGN32 __attribute__((aligned(32))) -#define PREFETCH(addr,hint) __builtin_prefetch(addr, 0, hint) -#else -#define PORTABLE_ALIGN32 __declspec(align(32)) -#define PREFETCH(addr,hint) -#endif - -HierarchicalNSW::HierarchicalNSW(size_t dim_, size_t maxelements_, size_t M_, size_t maxM_, size_t efConstruction_) -{ - dim = dim_; - data_size = dim * sizeof(coord_t); - - efConstruction = efConstruction_; - - maxelements = maxelements_; - M = M_; - maxM = maxM_; - size_links_level0 = (maxM + 1) * sizeof(idx_t); - size_data_per_element = size_links_level0 + data_size + sizeof(label_t); - offset_data = size_links_level0; - offset_label 
= offset_data + data_size; - - enterpoint_node = 0; - cur_element_count = 0; -#ifdef __x86_64__ - use_avx2 = __builtin_cpu_supports("avx2"); -#endif -} - -std::priority_queue> HierarchicalNSW::searchBaseLayer(const coord_t *point, size_t ef) -{ - std::vector visited; - visited.resize((cur_element_count + 31) >> 5); - - std::priority_queue> topResults; - std::priority_queue> candidateSet; - - dist_t dist = fstdistfunc(point, getDataByInternalId(enterpoint_node)); - - topResults.emplace(dist, enterpoint_node); - candidateSet.emplace(-dist, enterpoint_node); - visited[enterpoint_node >> 5] = 1 << (enterpoint_node & 31); - dist_t lowerBound = dist; - - while (!candidateSet.empty()) - { - std::pair curr_el_pair = candidateSet.top(); - if (-curr_el_pair.first > lowerBound) - break; - - candidateSet.pop(); - idx_t curNodeNum = curr_el_pair.second; - - idx_t* data = get_linklist0(curNodeNum); - size_t size = *data++; - - PREFETCH(getDataByInternalId(*data), 0); - - for (size_t j = 0; j < size; ++j) { - size_t tnum = *(data + j); - - PREFETCH(getDataByInternalId(*(data + j + 1)), 0); - - if (!(visited[tnum >> 5] & (1 << (tnum & 31)))) { - visited[tnum >> 5] |= 1 << (tnum & 31); - - dist = fstdistfunc(point, getDataByInternalId(tnum)); - - if (topResults.top().first > dist || topResults.size() < ef) { - candidateSet.emplace(-dist, tnum); - - PREFETCH(get_linklist0(candidateSet.top().second), 0); - topResults.emplace(dist, tnum); - - if (topResults.size() > ef) - topResults.pop(); - - lowerBound = topResults.top().first; - } - } - } - } - return topResults; -} - - -void HierarchicalNSW::getNeighborsByHeuristic(std::priority_queue> &topResults, size_t NN) -{ - if (topResults.size() < NN) - return; - - std::priority_queue> resultSet; - std::vector> returnlist; - - while (topResults.size() > 0) { - resultSet.emplace(-topResults.top().first, topResults.top().second); - topResults.pop(); - } - - while (resultSet.size()) { - if (returnlist.size() >= NN) - break; - std::pair curen = 
resultSet.top(); - dist_t dist_to_query = -curen.first; - resultSet.pop(); - bool good = true; - for (std::pair curen2 : returnlist) { - dist_t curdist = fstdistfunc(getDataByInternalId(curen2.second), - getDataByInternalId(curen.second)); - if (curdist < dist_to_query) { - good = false; - break; - } - } - if (good) returnlist.push_back(curen); - } - for (std::pair elem : returnlist) - topResults.emplace(-elem.first, elem.second); -} - -void HierarchicalNSW::mutuallyConnectNewElement(const coord_t *point, idx_t cur_c, - std::priority_queue> topResults) -{ - getNeighborsByHeuristic(topResults, M); - - std::vector res; - res.reserve(M); - while (topResults.size() > 0) { - res.push_back(topResults.top().second); - topResults.pop(); - } - { - idx_t* data = get_linklist0(cur_c); - if (*data) - throw std::runtime_error("Should be blank"); - - *data++ = res.size(); - - for (size_t idx = 0; idx < res.size(); idx++) { - if (data[idx]) - throw std::runtime_error("Should be blank"); - data[idx] = res[idx]; - } - } - for (size_t idx = 0; idx < res.size(); idx++) { - if (res[idx] == cur_c) - throw std::runtime_error("Connection to the same element"); - - size_t resMmax = maxM; - idx_t *ll_other = get_linklist0(res[idx]); - idx_t sz_link_list_other = *ll_other; - - if (sz_link_list_other > resMmax || sz_link_list_other < 0) - throw std::runtime_error("Bad sz_link_list_other"); - - if (sz_link_list_other < resMmax) { - idx_t *data = ll_other + 1; - data[sz_link_list_other] = cur_c; - *ll_other = sz_link_list_other + 1; - } else { - // finding the "weakest" element to replace it with the new one - idx_t *data = ll_other + 1; - dist_t d_max = fstdistfunc(getDataByInternalId(cur_c), getDataByInternalId(res[idx])); - // Heuristic: - std::priority_queue> candidates; - candidates.emplace(d_max, cur_c); - - for (size_t j = 0; j < sz_link_list_other; j++) - candidates.emplace(fstdistfunc(getDataByInternalId(data[j]), getDataByInternalId(res[idx])), data[j]); - - 
getNeighborsByHeuristic(candidates, resMmax); - - size_t indx = 0; - while (!candidates.empty()) { - data[indx] = candidates.top().second; - candidates.pop(); - indx++; - } - *ll_other = indx; - } - } -} - -void HierarchicalNSW::addPoint(const coord_t *point, label_t label) -{ - if (cur_element_count >= maxelements) { - throw std::runtime_error("The number of elements exceeds the specified limit"); - } - idx_t cur_c = cur_element_count++; - memset((char *) get_linklist0(cur_c), 0, size_data_per_element); - memcpy(getDataByInternalId(cur_c), point, data_size); - memcpy(getExternalLabel(cur_c), &label, sizeof label); - - // Do nothing for the first element - if (cur_c != 0) { - std::priority_queue > topResults = searchBaseLayer(point, efConstruction); - mutuallyConnectNewElement(point, cur_c, topResults); - } -}; - -std::priority_queue> HierarchicalNSW::searchKnn(const coord_t *query, size_t k) -{ - std::priority_queue> topResults; - auto topCandidates = searchBaseLayer(query, k); - while (topCandidates.size() > k) { - topCandidates.pop(); - } - while (!topCandidates.empty()) { - std::pair rez = topCandidates.top(); - label_t label; - memcpy(&label, getExternalLabel(rez.second), sizeof(label)); - topResults.push(std::pair(rez.first, label)); - topCandidates.pop(); - } - - return topResults; -}; - -dist_t fstdistfunc_scalar(const coord_t *x, const coord_t *y, size_t n) -{ - dist_t distance = 0.0; - - for (size_t i = 0; i < n; i++) - { - dist_t diff = x[i] - y[i]; - distance += diff * diff; - } - return distance; - -} - -#ifdef __x86_64__ -#include - -__attribute__((target("avx2"))) -dist_t fstdistfunc_avx2(const coord_t *x, const coord_t *y, size_t n) -{ - const size_t TmpResSz = sizeof(__m256) / sizeof(float); - float PORTABLE_ALIGN32 TmpRes[TmpResSz]; - size_t qty16 = n / 16; - const float *pEnd1 = x + (qty16 * 16); - __m256 diff, v1, v2; - __m256 sum = _mm256_set1_ps(0); - - while (x < pEnd1) { - v1 = _mm256_loadu_ps(x); - x += 8; - v2 = _mm256_loadu_ps(y); - y += 
8; - diff = _mm256_sub_ps(v1, v2); - sum = _mm256_add_ps(sum, _mm256_mul_ps(diff, diff)); - - v1 = _mm256_loadu_ps(x); - x += 8; - v2 = _mm256_loadu_ps(y); - y += 8; - diff = _mm256_sub_ps(v1, v2); - sum = _mm256_add_ps(sum, _mm256_mul_ps(diff, diff)); - } - _mm256_store_ps(TmpRes, sum); - float res = TmpRes[0] + TmpRes[1] + TmpRes[2] + TmpRes[3] + TmpRes[4] + TmpRes[5] + TmpRes[6] + TmpRes[7]; - return (res); -} - -dist_t fstdistfunc_sse(const coord_t *x, const coord_t *y, size_t n) -{ - const size_t TmpResSz = sizeof(__m128) / sizeof(float); - float PORTABLE_ALIGN32 TmpRes[TmpResSz]; - size_t qty16 = n / 16; - const float *pEnd1 = x + (qty16 * 16); - - __m128 diff, v1, v2; - __m128 sum = _mm_set1_ps(0); - - while (x < pEnd1) { - v1 = _mm_loadu_ps(x); - x += 4; - v2 = _mm_loadu_ps(y); - y += 4; - diff = _mm_sub_ps(v1, v2); - sum = _mm_add_ps(sum, _mm_mul_ps(diff, diff)); - - v1 = _mm_loadu_ps(x); - x += 4; - v2 = _mm_loadu_ps(y); - y += 4; - diff = _mm_sub_ps(v1, v2); - sum = _mm_add_ps(sum, _mm_mul_ps(diff, diff)); - - v1 = _mm_loadu_ps(x); - x += 4; - v2 = _mm_loadu_ps(y); - y += 4; - diff = _mm_sub_ps(v1, v2); - sum = _mm_add_ps(sum, _mm_mul_ps(diff, diff)); - - v1 = _mm_loadu_ps(x); - x += 4; - v2 = _mm_loadu_ps(y); - y += 4; - diff = _mm_sub_ps(v1, v2); - sum = _mm_add_ps(sum, _mm_mul_ps(diff, diff)); - } - _mm_store_ps(TmpRes, sum); - float res = TmpRes[0] + TmpRes[1] + TmpRes[2] + TmpRes[3]; - return res; -} -#endif - -dist_t HierarchicalNSW::fstdistfunc(const coord_t *x, const coord_t *y) -{ -#ifndef __x86_64__ - return fstdistfunc_scalar(x, y, dim); -#else - if(use_avx2) - return fstdistfunc_avx2(x, y, dim); - - return fstdistfunc_sse(x, y, dim); -#endif -} - -bool hnsw_search(HierarchicalNSW* hnsw, const coord_t *point, size_t efSearch, size_t* n_results, label_t** results) -{ - try - { - auto result = hnsw->searchKnn(point, efSearch); - size_t nResults = result.size(); - *results = (label_t*)malloc(nResults*sizeof(label_t)); - for (size_t i = nResults; 
i-- != 0;) - { - (*results)[i] = result.top().second; - result.pop(); - } - *n_results = nResults; - return true; - } - catch (std::exception& x) - { - return false; - } -} - -bool hnsw_add_point(HierarchicalNSW* hnsw, const coord_t *point, label_t label) -{ - try - { - hnsw->addPoint(point, label); - return true; - } - catch (std::exception& x) - { - fprintf(stderr, "Catch %s\n", x.what()); - return false; - } -} - -void hnsw_init(HierarchicalNSW* hnsw, size_t dims, size_t maxelements, size_t M, size_t maxM, size_t efConstruction) -{ - new ((void*)hnsw) HierarchicalNSW(dims, maxelements, M, maxM, efConstruction); -} - - -int hnsw_dimensions(HierarchicalNSW* hnsw) -{ - return (int)hnsw->dim; -} - -size_t hnsw_count(HierarchicalNSW* hnsw) -{ - return hnsw->cur_element_count; -} - -size_t hnsw_sizeof(void) -{ - return sizeof(HierarchicalNSW); -} diff --git a/pgxn/hnsw/hnswalg.h b/pgxn/hnsw/hnswalg.h deleted file mode 100644 index f38aeac362..0000000000 --- a/pgxn/hnsw/hnswalg.h +++ /dev/null @@ -1,69 +0,0 @@ -#pragma once - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -extern "C" { -#include "hnsw.h" -} - -struct HierarchicalNSW -{ - size_t maxelements; - size_t cur_element_count; - - idx_t enterpoint_node; - - size_t dim; - size_t data_size; - size_t offset_data; - size_t offset_label; - size_t size_data_per_element; - size_t M; - size_t maxM; - size_t size_links_level0; - size_t efConstruction; - -#ifdef __x86_64__ - bool use_avx2; -#endif - - char data_level0_memory[0]; // varying size - - public: - HierarchicalNSW(size_t dim, size_t maxelements, size_t M, size_t maxM, size_t efConstruction); - ~HierarchicalNSW(); - - - inline coord_t *getDataByInternalId(idx_t internal_id) const { - return (coord_t *)&data_level0_memory[internal_id * size_data_per_element + offset_data]; - } - - inline idx_t *get_linklist0(idx_t internal_id) const { - return (idx_t*)&data_level0_memory[internal_id * size_data_per_element]; 
- } - - inline label_t *getExternalLabel(idx_t internal_id) const { - return (label_t *)&data_level0_memory[internal_id * size_data_per_element + offset_label]; - } - - std::priority_queue> searchBaseLayer(const coord_t *x, size_t ef); - - void getNeighborsByHeuristic(std::priority_queue> &topResults, size_t NN); - - void mutuallyConnectNewElement(const coord_t *x, idx_t id, std::priority_queue> topResults); - - void addPoint(const coord_t *point, label_t label); - - std::priority_queue> searchKnn(const coord_t *query_data, size_t k); - - dist_t fstdistfunc(const coord_t *x, const coord_t *y); -}; diff --git a/pgxn/hnsw/test/expected/knn.out b/pgxn/hnsw/test/expected/knn.out deleted file mode 100644 index a1cee4525e..0000000000 --- a/pgxn/hnsw/test/expected/knn.out +++ /dev/null @@ -1,28 +0,0 @@ -SET enable_seqscan = off; -CREATE TABLE t (val real[]); -INSERT INTO t (val) VALUES ('{0,0,0}'), ('{1,2,3}'), ('{1,1,1}'), (NULL); -CREATE INDEX ON t USING hnsw (val) WITH (maxelements = 10, dims=3, m=3); -INSERT INTO t (val) VALUES (array[1,2,4]); -explain SELECT * FROM t ORDER BY val <-> array[3,3,3]; - QUERY PLAN --------------------------------------------------------------------- - Index Scan using t_val_idx on t (cost=4.02..8.06 rows=3 width=36) - Order By: (val <-> '{3,3,3}'::real[]) -(2 rows) - -SELECT * FROM t ORDER BY val <-> array[3,3,3]; - val ---------- - {1,2,3} - {1,2,4} - {1,1,1} - {0,0,0} -(4 rows) - -SELECT COUNT(*) FROM t; - count -------- - 5 -(1 row) - -DROP TABLE t; diff --git a/pgxn/hnsw/test/sql/knn.sql b/pgxn/hnsw/test/sql/knn.sql deleted file mode 100644 index 0635bda4a2..0000000000 --- a/pgxn/hnsw/test/sql/knn.sql +++ /dev/null @@ -1,13 +0,0 @@ -SET enable_seqscan = off; - -CREATE TABLE t (val real[]); -INSERT INTO t (val) VALUES ('{0,0,0}'), ('{1,2,3}'), ('{1,1,1}'), (NULL); -CREATE INDEX ON t USING hnsw (val) WITH (maxelements = 10, dims=3, m=3); - -INSERT INTO t (val) VALUES (array[1,2,4]); - -explain SELECT * FROM t ORDER BY val <-> 
array[3,3,3]; -SELECT * FROM t ORDER BY val <-> array[3,3,3]; -SELECT COUNT(*) FROM t; - -DROP TABLE t; diff --git a/pgxn/neon/libpagestore.c b/pgxn/neon/libpagestore.c index 769befb4e5..4460e3b40c 100644 --- a/pgxn/neon/libpagestore.c +++ b/pgxn/neon/libpagestore.c @@ -911,7 +911,74 @@ pageserver_receive(shardno_t shard_no) } PG_CATCH(); { - neon_shard_log(shard_no, LOG, "pageserver_receive: disconnect due malformatted response"); + neon_shard_log(shard_no, LOG, "pageserver_receive: disconnect due to failure while parsing response"); + pageserver_disconnect(shard_no); + PG_RE_THROW(); + } + PG_END_TRY(); + + if (message_level_is_interesting(PageStoreTrace)) + { + char *msg = nm_to_string((NeonMessage *) resp); + + neon_shard_log(shard_no, PageStoreTrace, "got response: %s", msg); + pfree(msg); + } + } + else if (rc == -1) + { + neon_shard_log(shard_no, LOG, "pageserver_receive disconnect: psql end of copy data: %s", pchomp(PQerrorMessage(pageserver_conn))); + pageserver_disconnect(shard_no); + resp = NULL; + } + else if (rc == -2) + { + char *msg = pchomp(PQerrorMessage(pageserver_conn)); + + pageserver_disconnect(shard_no); + neon_shard_log(shard_no, ERROR, "pageserver_receive disconnect: could not read COPY data: %s", msg); + } + else + { + pageserver_disconnect(shard_no); + neon_shard_log(shard_no, ERROR, "pageserver_receive disconnect: unexpected PQgetCopyData return value: %d", rc); + } + + shard->nresponses_received++; + return (NeonResponse *) resp; +} + +static NeonResponse * +pageserver_try_receive(shardno_t shard_no) +{ + StringInfoData resp_buff; + NeonResponse *resp; + PageServer *shard = &page_servers[shard_no]; + PGconn *pageserver_conn = shard->conn; + /* read response */ + int rc; + + if (shard->state != PS_Connected) + return NULL; + + Assert(pageserver_conn); + + rc = PQgetCopyData(shard->conn, &resp_buff.data, 1 /* async = true */); + + if (rc == 0) + return NULL; + else if (rc > 0) + { + PG_TRY(); + { + resp_buff.len = rc; + resp_buff.cursor = 
0; + resp = nm_unpack_response(&resp_buff); + PQfreemem(resp_buff.data); + } + PG_CATCH(); + { + neon_shard_log(shard_no, LOG, "pageserver_receive: disconnect due to failure while parsing response"); pageserver_disconnect(shard_no); PG_RE_THROW(); } @@ -980,6 +1047,7 @@ page_server_api api = .send = pageserver_send, .flush = pageserver_flush, .receive = pageserver_receive, + .try_receive = pageserver_try_receive, .disconnect = pageserver_disconnect_shard }; diff --git a/pgxn/neon/pagestore_client.h b/pgxn/neon/pagestore_client.h index 37bc4f7886..7b748d7252 100644 --- a/pgxn/neon/pagestore_client.h +++ b/pgxn/neon/pagestore_client.h @@ -34,6 +34,8 @@ typedef enum T_NeonGetPageRequest, T_NeonDbSizeRequest, T_NeonGetSlruSegmentRequest, + /* future tags above this line */ + T_NeonTestRequest = 99, /* only in cfg(feature = "testing") */ /* pagestore -> pagestore_client */ T_NeonExistsResponse = 100, @@ -42,6 +44,8 @@ typedef enum T_NeonErrorResponse, T_NeonDbSizeResponse, T_NeonGetSlruSegmentResponse, + /* future tags above this line */ + T_NeonTestResponse = 199, /* only in cfg(feature = "testing") */ } NeonMessageTag; typedef uint64 NeonRequestId; @@ -192,9 +196,29 @@ typedef uint16 shardno_t; typedef struct { + /* + * Send this request to the PageServer associated with this shard. + */ bool (*send) (shardno_t shard_no, NeonRequest * request); + /* + * Blocking read for the next response of this shard. + * + * When a CANCEL signal is handled, the connection state will be + * unmodified. + */ NeonResponse *(*receive) (shardno_t shard_no); + /* + * Try get the next response from the TCP buffers, if any. + * Returns NULL when the data is not yet available. + */ + NeonResponse *(*try_receive) (shardno_t shard_no); + /* + * Make sure all requests are sent to PageServer. + */ bool (*flush) (shardno_t shard_no); + /* + * Disconnect from this pageserver shard. 
+ */ void (*disconnect) (shardno_t shard_no); } page_server_api; diff --git a/pgxn/neon/pagestore_smgr.c b/pgxn/neon/pagestore_smgr.c index 7a4c0ef487..54cacea984 100644 --- a/pgxn/neon/pagestore_smgr.c +++ b/pgxn/neon/pagestore_smgr.c @@ -405,6 +405,56 @@ compact_prefetch_buffers(void) return false; } +/* + * If there might be responses still in the TCP buffer, then + * we should try to use those, so as to reduce any TCP backpressure + * on the OS/PS side. + * + * This procedure handles that. + * + * Note that this is only valid as long as the only pipelined + * operations in the TCP buffer are getPage@Lsn requests. + */ +static void +prefetch_pump_state(void) +{ + while (MyPState->ring_receive != MyPState->ring_flush) + { + NeonResponse *response; + PrefetchRequest *slot; + MemoryContext old; + + slot = GetPrfSlot(MyPState->ring_receive); + + old = MemoryContextSwitchTo(MyPState->errctx); + response = page_server->try_receive(slot->shard_no); + MemoryContextSwitchTo(old); + + if (response == NULL) + break; + + /* The slot should still be valid */ + if (slot->status != PRFS_REQUESTED || + slot->response != NULL || + slot->my_ring_index != MyPState->ring_receive) + neon_shard_log(slot->shard_no, ERROR, + "Incorrect prefetch slot state after receive: status=%d response=%p my=%lu receive=%lu", + slot->status, slot->response, + (long) slot->my_ring_index, (long) MyPState->ring_receive); + + /* update prefetch state */ + MyPState->n_responses_buffered += 1; + MyPState->n_requests_inflight -= 1; + MyPState->ring_receive += 1; + MyNeonCounters->getpage_prefetches_buffered = + MyPState->n_responses_buffered; + + /* update slot state */ + slot->status = PRFS_RECEIVED; + slot->response = response; + } +} + void readahead_buffer_resize(int newsize, void *extra) { @@ -2808,6 +2858,8 @@ neon_prefetch(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, MyPState->ring_last <= ring_index); } + prefetch_pump_state(); + return false; } @@ -2849,6 +2901,8 @@ 
neon_prefetch(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum) Assert(ring_index < MyPState->ring_unused && MyPState->ring_last <= ring_index); + prefetch_pump_state(); + return false; } #endif /* PG_MAJORVERSION_NUM < 17 */ @@ -2891,6 +2945,8 @@ neon_writeback(SMgrRelation reln, ForkNumber forknum, */ neon_log(SmgrTrace, "writeback noop"); + prefetch_pump_state(); + #ifdef DEBUG_COMPARE_LOCAL if (IS_LOCAL_REL(reln)) mdwriteback(reln, forknum, blocknum, nblocks); @@ -3145,6 +3201,8 @@ neon_read(SMgrRelation reln, ForkNumber forkNum, BlockNumber blkno, void *buffer neon_get_request_lsns(InfoFromSMgrRel(reln), forkNum, blkno, &request_lsns, 1, NULL); neon_read_at_lsn(InfoFromSMgrRel(reln), forkNum, blkno, request_lsns, buffer); + prefetch_pump_state(); + #ifdef DEBUG_COMPARE_LOCAL if (forkNum == MAIN_FORKNUM && IS_LOCAL_REL(reln)) { @@ -3282,6 +3340,8 @@ neon_readv(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, neon_read_at_lsnv(InfoFromSMgrRel(reln), forknum, blocknum, request_lsns, buffers, nblocks, read); + prefetch_pump_state(); + #ifdef DEBUG_COMPARE_LOCAL if (forkNum == MAIN_FORKNUM && IS_LOCAL_REL(reln)) { @@ -3450,6 +3510,8 @@ neon_write(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, const vo lfc_write(InfoFromSMgrRel(reln), forknum, blocknum, buffer); + prefetch_pump_state(); + #ifdef DEBUG_COMPARE_LOCAL if (IS_LOCAL_REL(reln)) #if PG_MAJORVERSION_NUM >= 17 @@ -3503,6 +3565,8 @@ neon_writev(SMgrRelation reln, ForkNumber forknum, BlockNumber blkno, lfc_writev(InfoFromSMgrRel(reln), forknum, blkno, buffers, nblocks); + prefetch_pump_state(); + #ifdef DEBUG_COMPARE_LOCAL if (IS_LOCAL_REL(reln)) mdwritev(reln, forknum, blocknum, &buffer, 1, skipFsync); @@ -3792,6 +3856,8 @@ neon_immedsync(SMgrRelation reln, ForkNumber forknum) neon_log(SmgrTrace, "[NEON_SMGR] immedsync noop"); + prefetch_pump_state(); + #ifdef DEBUG_COMPARE_LOCAL if (IS_LOCAL_REL(reln)) mdimmedsync(reln, forknum); diff --git a/poetry.lock b/poetry.lock 
index 5f15223dca..2cd2bc6383 100644 --- a/poetry.lock +++ b/poetry.lock @@ -2028,13 +2028,13 @@ openapi-schema-validator = ">=0.4.2,<0.5.0" [[package]] name = "packaging" -version = "23.0" +version = "24.2" description = "Core utilities for Python packages" optional = false -python-versions = ">=3.7" +python-versions = ">=3.8" files = [ - {file = "packaging-23.0-py3-none-any.whl", hash = "sha256:714ac14496c3e68c99c29b00845f7a2b85f3bb6f1078fd9f72fd20f0570002b2"}, - {file = "packaging-23.0.tar.gz", hash = "sha256:b6ad297f8907de0fa2fe1ccbd26fdaf387f5f47c7275fedf8cce89f99446cf97"}, + {file = "packaging-24.2-py3-none-any.whl", hash = "sha256:09abb1bccd265c01f4a3aa3f7a7db064b36514d2cba19a2f694fe6150451a759"}, + {file = "packaging-24.2.tar.gz", hash = "sha256:c228a6dc5e932d346bc5739379109d49e8853dd8223571c7c5b55260edc0b97f"}, ] [[package]] diff --git a/proxy/Cargo.toml b/proxy/Cargo.toml index 2f63ee3acc..f362a45035 100644 --- a/proxy/Cargo.toml +++ b/proxy/Cargo.toml @@ -106,6 +106,7 @@ jose-jwk = { version = "0.1.2", features = ["p256", "p384", "rsa"] } signature = "2" ecdsa = "0.16" p256 = { version = "0.13", features = ["jwk"] } +ed25519-dalek = { version = "2", default-features = false, features = ["rand_core"] } rsa = "0.9" workspace_hack.workspace = true diff --git a/proxy/src/auth/backend/console_redirect.rs b/proxy/src/auth/backend/console_redirect.rs index dbfda588cc..1cbf91d3ae 100644 --- a/proxy/src/auth/backend/console_redirect.rs +++ b/proxy/src/auth/backend/console_redirect.rs @@ -1,7 +1,8 @@ +use std::fmt; + use async_trait::async_trait; use postgres_client::config::SslMode; use pq_proto::BeMessage as Be; -use std::fmt; use thiserror::Error; use tokio::io::{AsyncRead, AsyncWrite}; use tracing::{info, info_span}; @@ -12,10 +13,13 @@ use crate::auth::IpPattern; use crate::cache::Cached; use crate::config::AuthenticationConfig; use crate::context::RequestContext; -use crate::control_plane::{self, client::cplane_proxy_v1, CachedNodeInfo, NodeInfo}; +use 
crate::control_plane::client::cplane_proxy_v1; +use crate::control_plane::{self, CachedNodeInfo, NodeInfo}; use crate::error::{ReportableError, UserFacingError}; use crate::proxy::connect_compute::ComputeConnectBackend; +use crate::proxy::NeonOptions; use crate::stream::PqStream; +use crate::types::RoleName; use crate::{auth, compute, waiters}; #[derive(Debug, Error)] @@ -105,10 +109,16 @@ impl ConsoleRedirectBackend { ctx: &RequestContext, auth_config: &'static AuthenticationConfig, client: &mut PqStream, - ) -> auth::Result<(ConsoleRedirectNodeInfo, Option>)> { + ) -> auth::Result<( + ConsoleRedirectNodeInfo, + ComputeUserInfo, + Option>, + )> { authenticate(ctx, auth_config, &self.console_uri, client) .await - .map(|(node_info, ip_allowlist)| (ConsoleRedirectNodeInfo(node_info), ip_allowlist)) + .map(|(node_info, user_info, ip_allowlist)| { + (ConsoleRedirectNodeInfo(node_info), user_info, ip_allowlist) + }) } } @@ -133,7 +143,7 @@ async fn authenticate( auth_config: &'static AuthenticationConfig, link_uri: &reqwest::Url, client: &mut PqStream, -) -> auth::Result<(NodeInfo, Option>)> { +) -> auth::Result<(NodeInfo, ComputeUserInfo, Option>)> { ctx.set_auth_method(crate::context::AuthMethod::ConsoleRedirect); // registering waiter can fail if we get unlucky with rng. 
@@ -188,8 +198,15 @@ async fn authenticate( let mut config = compute::ConnCfg::new(db_info.host.to_string(), db_info.port); config.dbname(&db_info.dbname).user(&db_info.user); + let user: RoleName = db_info.user.into(); + let user_info = ComputeUserInfo { + endpoint: db_info.aux.endpoint_id.as_str().into(), + user: user.clone(), + options: NeonOptions::default(), + }; + ctx.set_dbname(db_info.dbname.into()); - ctx.set_user(db_info.user.into()); + ctx.set_user(user); ctx.set_project(db_info.aux.clone()); info!("woken up a compute node"); @@ -212,6 +229,7 @@ async fn authenticate( config, aux: db_info.aux, }, + user_info, db_info.allowed_ips, )) } diff --git a/proxy/src/compute.rs b/proxy/src/compute.rs index 788bd63fee..aff796bbab 100644 --- a/proxy/src/compute.rs +++ b/proxy/src/compute.rs @@ -24,10 +24,8 @@ use crate::control_plane::messages::MetricsAuxInfo; use crate::error::{ReportableError, UserFacingError}; use crate::metrics::{Metrics, NumDbConnectionsGuard}; use crate::proxy::neon_option; -use crate::proxy::NeonOptions; use crate::tls::postgres_rustls::MakeRustlsConnect; use crate::types::Host; -use crate::types::{EndpointId, RoleName}; pub const COULD_NOT_CONNECT: &str = "Couldn't connect to compute node"; @@ -253,6 +251,7 @@ impl ConnCfg { ctx: &RequestContext, aux: MetricsAuxInfo, config: &ComputeConfig, + user_info: ComputeUserInfo, ) -> Result { let pause = ctx.latency_timer_pause(crate::metrics::Waiting::Compute); let (socket_addr, stream, host) = self.connect_raw(config.timeout).await?; @@ -287,28 +286,6 @@ impl ConnCfg { self.0.get_ssl_mode() ); - let compute_info = match parameters.get("user") { - Some(user) => { - match parameters.get("database") { - Some(database) => { - ComputeUserInfo { - user: RoleName::from(user), - options: NeonOptions::default(), // just a shim, we don't need options - endpoint: EndpointId::from(database), - } - } - None => { - warn!("compute node didn't return database name"); - ComputeUserInfo::default() - } - } - } - None 
=> { - warn!("compute node didn't return user name"); - ComputeUserInfo::default() - } - }; - // NB: CancelToken is supposed to hold socket_addr, but we use connect_raw. // Yet another reason to rework the connection establishing code. let cancel_closure = CancelClosure::new( @@ -321,7 +298,7 @@ impl ConnCfg { }, vec![], // TODO: deprecated, will be removed host.to_string(), - compute_info, + user_info, ); let connection = PostgresConnection { diff --git a/proxy/src/console_redirect_proxy.rs b/proxy/src/console_redirect_proxy.rs index 846f55f9e1..0c6755063f 100644 --- a/proxy/src/console_redirect_proxy.rs +++ b/proxy/src/console_redirect_proxy.rs @@ -195,7 +195,7 @@ pub(crate) async fn handle_client( ctx.set_db_options(params.clone()); - let (user_info, ip_allowlist) = match backend + let (node_info, user_info, ip_allowlist) = match backend .authenticate(ctx, &config.authentication_config, &mut stream) .await { @@ -208,11 +208,12 @@ pub(crate) async fn handle_client( let mut node = connect_to_compute( ctx, &TcpMechanism { + user_info, params_compat: true, params: ¶ms, locks: &config.connect_compute_locks, }, - &user_info, + &node_info, config.wake_compute_retry_config, &config.connect_to_compute, ) diff --git a/proxy/src/context/parquet.rs b/proxy/src/context/parquet.rs index 5f65b17374..d7ffff0483 100644 --- a/proxy/src/context/parquet.rs +++ b/proxy/src/context/parquet.rs @@ -187,10 +187,6 @@ pub async fn worker( let rx = futures::stream::poll_fn(move |cx| rx.poll_recv(cx)); let rx = rx.map(RequestData::from); - let storage = GenericRemoteStorage::from_config(&remote_storage_config) - .await - .context("remote storage init")?; - let properties = WriterProperties::builder() .set_data_page_size_limit(config.parquet_upload_page_size) .set_compression(config.parquet_upload_compression); @@ -224,18 +220,18 @@ pub async fn worker( let rx_disconnect = futures::stream::poll_fn(move |cx| rx_disconnect.poll_recv(cx)); let rx_disconnect = 
rx_disconnect.map(RequestData::from); - let storage_disconnect = - GenericRemoteStorage::from_config(&disconnect_events_storage_config) - .await - .context("remote storage for disconnect events init")?; let parquet_config_disconnect = parquet_config.clone(); tokio::try_join!( - worker_inner(storage, rx, parquet_config), - worker_inner(storage_disconnect, rx_disconnect, parquet_config_disconnect) + worker_inner(remote_storage_config, rx, parquet_config), + worker_inner( + disconnect_events_storage_config, + rx_disconnect, + parquet_config_disconnect + ) ) .map(|_| ()) } else { - worker_inner(storage, rx, parquet_config).await + worker_inner(remote_storage_config, rx, parquet_config).await } } @@ -251,18 +247,32 @@ struct ParquetConfig { test_remote_failures: u64, } +impl ParquetConfig { + async fn storage( + &self, + storage_config: &RemoteStorageConfig, + ) -> anyhow::Result { + let storage = GenericRemoteStorage::from_config(storage_config) + .await + .context("remote storage init")?; + + #[cfg(any(test, feature = "testing"))] + if self.test_remote_failures > 0 { + return Ok(GenericRemoteStorage::unreliable_wrapper( + storage, + self.test_remote_failures, + )); + } + + Ok(storage) + } +} + async fn worker_inner( - storage: GenericRemoteStorage, + storage_config: RemoteStorageConfig, rx: impl Stream, config: ParquetConfig, ) -> anyhow::Result<()> { - #[cfg(any(test, feature = "testing"))] - let storage = if config.test_remote_failures > 0 { - GenericRemoteStorage::unreliable_wrapper(storage, config.test_remote_failures) - } else { - storage - }; - let mut rx = std::pin::pin!(rx); let mut rows = Vec::with_capacity(config.rows_per_group); @@ -285,7 +295,7 @@ async fn worker_inner( } if len > config.file_size || force { last_upload = time::Instant::now(); - let file = upload_parquet(w, len, &storage).await?; + let file = upload_parquet(w, len, &storage_config, &config).await?; w = SerializedFileWriter::new(file, schema.clone(), config.propeties.clone())?; len = 0; } 
@@ -298,7 +308,7 @@ async fn worker_inner( } if !w.flushed_row_groups().is_empty() { - let _rtchk: Writer = upload_parquet(w, len, &storage).await?; + let _rtchk: Writer = upload_parquet(w, len, &storage_config, &config).await?; } Ok(()) @@ -340,7 +350,8 @@ where async fn upload_parquet( mut w: SerializedFileWriter>, len: i64, - storage: &GenericRemoteStorage, + storage_config: &RemoteStorageConfig, + config: &ParquetConfig, ) -> anyhow::Result> { let len_uncompressed = w .flushed_row_groups() @@ -377,6 +388,15 @@ async fn upload_parquet( size, compression, "uploading request parquet file" ); + // A bug in azure-sdk means that the identity-token-file that expires after + // 1 hour is not refreshed. This identity-token is used to fetch the actual azure storage + // tokens that last for 24 hours. After this 24 hour period, azure-sdk tries to refresh + // the storage token, but the identity token has now expired. + // + // + // To work around this, we recreate the storage every time. + let storage = config.storage(storage_config).await?; + let year = now.year(); let month = now.month(); let day = now.day(); @@ -431,8 +451,8 @@ mod tests { use rand::rngs::StdRng; use rand::{Rng, SeedableRng}; use remote_storage::{ - GenericRemoteStorage, RemoteStorageConfig, RemoteStorageKind, S3Config, - DEFAULT_MAX_KEYS_PER_LIST_RESPONSE, DEFAULT_REMOTE_STORAGE_S3_CONCURRENCY_LIMIT, + RemoteStorageConfig, RemoteStorageKind, S3Config, DEFAULT_MAX_KEYS_PER_LIST_RESPONSE, + DEFAULT_REMOTE_STORAGE_S3_CONCURRENCY_LIMIT, }; use tokio::sync::mpsc; use tokio::time; @@ -559,12 +579,11 @@ mod tests { timeout: std::time::Duration::from_secs(120), small_timeout: std::time::Duration::from_secs(30), }; - let storage = GenericRemoteStorage::from_config(&remote_storage_config) + + worker_inner(remote_storage_config, rx, config) .await .unwrap(); - worker_inner(storage, rx, config).await.unwrap(); - let mut files = WalkDir::new(tmpdir.as_std_path()) .into_iter() .filter_map(|entry| entry.ok()) diff 
--git a/proxy/src/control_plane/mod.rs b/proxy/src/control_plane/mod.rs index c65041df0e..1dca26d686 100644 --- a/proxy/src/control_plane/mod.rs +++ b/proxy/src/control_plane/mod.rs @@ -74,8 +74,11 @@ impl NodeInfo { &self, ctx: &RequestContext, config: &ComputeConfig, + user_info: ComputeUserInfo, ) -> Result { - self.config.connect(ctx, self.aux.clone(), config).await + self.config + .connect(ctx, self.aux.clone(), config, user_info) + .await } pub(crate) fn reuse_settings(&mut self, other: Self) { diff --git a/proxy/src/proxy/connect_compute.rs b/proxy/src/proxy/connect_compute.rs index 8a80494860..dd145e6bb2 100644 --- a/proxy/src/proxy/connect_compute.rs +++ b/proxy/src/proxy/connect_compute.rs @@ -4,7 +4,7 @@ use tokio::time; use tracing::{debug, info, warn}; use super::retry::ShouldRetryWakeCompute; -use crate::auth::backend::ComputeCredentialKeys; +use crate::auth::backend::{ComputeCredentialKeys, ComputeUserInfo}; use crate::compute::{self, PostgresConnection, COULD_NOT_CONNECT}; use crate::config::{ComputeConfig, RetryConfig}; use crate::context::RequestContext; @@ -71,6 +71,8 @@ pub(crate) struct TcpMechanism<'a> { /// connect_to_compute concurrency lock pub(crate) locks: &'static ApiLocks, + + pub(crate) user_info: ComputeUserInfo, } #[async_trait] @@ -88,7 +90,7 @@ impl ConnectMechanism for TcpMechanism<'_> { ) -> Result { let host = node_info.config.get_host(); let permit = self.locks.get_permit(&host).await?; - permit.release_result(node_info.connect(ctx, config).await) + permit.release_result(node_info.connect(ctx, config, self.user_info.clone()).await) } fn update_connect_config(&self, config: &mut compute::ConnCfg) { diff --git a/proxy/src/proxy/mod.rs b/proxy/src/proxy/mod.rs index 1f7dba2f9a..63f93f0a91 100644 --- a/proxy/src/proxy/mod.rs +++ b/proxy/src/proxy/mod.rs @@ -332,16 +332,19 @@ pub(crate) async fn handle_client( } }; - let params_compat = match &user_info { - auth::Backend::ControlPlane(_, info) => { - 
info.info.options.get(NeonOptions::PARAMS_COMPAT).is_some() - } - auth::Backend::Local(_) => false, + let compute_user_info = match &user_info { + auth::Backend::ControlPlane(_, info) => &info.info, + auth::Backend::Local(_) => unreachable!("local proxy does not run tcp proxy service"), }; + let params_compat = compute_user_info + .options + .get(NeonOptions::PARAMS_COMPAT) + .is_some(); let mut node = connect_to_compute( ctx, &TcpMechanism { + user_info: compute_user_info.clone(), params_compat, params: ¶ms, locks: &config.connect_compute_locks, diff --git a/proxy/src/redis/notifications.rs b/proxy/src/redis/notifications.rs index bf9d61ded3..63cdf6176c 100644 --- a/proxy/src/redis/notifications.rs +++ b/proxy/src/redis/notifications.rs @@ -74,7 +74,11 @@ pub(crate) enum Notification { #[serde(rename = "/cancel_session")] Cancel(CancelSession), - #[serde(other, skip_serializing)] + #[serde( + other, + deserialize_with = "deserialize_unknown_topic", + skip_serializing + )] UnknownTopic, } @@ -123,6 +127,15 @@ where serde_json::from_str(&s).map_err(::custom) } +// https://github.com/serde-rs/serde/issues/1714 +fn deserialize_unknown_topic<'de, D>(deserializer: D) -> Result<(), D::Error> +where + D: serde::Deserializer<'de>, +{ + deserializer.deserialize_any(serde::de::IgnoredAny)?; + Ok(()) +} + struct MessageHandler { cache: Arc, cancellation_handler: Arc>, @@ -458,4 +471,30 @@ mod tests { Ok(()) } + + #[test] + fn parse_unknown_topic() -> anyhow::Result<()> { + let with_data = json!({ + "type": "message", + "topic": "/doesnotexist", + "data": { + "payload": "ignored" + }, + "extra_fields": "something" + }) + .to_string(); + let result: Notification = serde_json::from_str(&with_data)?; + assert_eq!(result, Notification::UnknownTopic); + + let without_data = json!({ + "type": "message", + "topic": "/doesnotexist", + "extra_fields": "something" + }) + .to_string(); + let result: Notification = serde_json::from_str(&without_data)?; + assert_eq!(result, 
Notification::UnknownTopic); + + Ok(()) + } } diff --git a/proxy/src/serverless/backend.rs b/proxy/src/serverless/backend.rs index b398c3ddd0..6d5fb13681 100644 --- a/proxy/src/serverless/backend.rs +++ b/proxy/src/serverless/backend.rs @@ -3,9 +3,9 @@ use std::sync::Arc; use std::time::Duration; use async_trait::async_trait; +use ed25519_dalek::SigningKey; use hyper_util::rt::{TokioExecutor, TokioIo, TokioTimer}; -use p256::ecdsa::SigningKey; -use p256::elliptic_curve::JwkEcKey; +use jose_jwk::jose_b64; use rand::rngs::OsRng; use tokio::net::{lookup_host, TcpStream}; use tracing::field::display; @@ -354,9 +354,15 @@ impl PoolingBackend { } } -fn create_random_jwk() -> (SigningKey, JwkEcKey) { - let key = SigningKey::random(&mut OsRng); - let jwk = p256::PublicKey::from(key.verifying_key()).to_jwk(); +fn create_random_jwk() -> (SigningKey, jose_jwk::Key) { + let key = SigningKey::generate(&mut OsRng); + + let jwk = jose_jwk::Key::Okp(jose_jwk::Okp { + crv: jose_jwk::OkpCurves::Ed25519, + x: jose_b64::serde::Bytes::from(key.verifying_key().to_bytes().to_vec()), + d: None, + }); + (key, jwk) } diff --git a/proxy/src/serverless/local_conn_pool.rs b/proxy/src/serverless/local_conn_pool.rs index c51a2bc9ba..fe33f0ff65 100644 --- a/proxy/src/serverless/local_conn_pool.rs +++ b/proxy/src/serverless/local_conn_pool.rs @@ -16,17 +16,16 @@ use std::sync::Arc; use std::task::{ready, Poll}; use std::time::Duration; +use ed25519_dalek::{Signature, Signer, SigningKey}; use futures::future::poll_fn; use futures::Future; use indexmap::IndexMap; use jose_jwk::jose_b64::base64ct::{Base64UrlUnpadded, Encoding}; -use p256::ecdsa::{Signature, SigningKey}; use parking_lot::RwLock; use postgres_client::tls::NoTlsStream; use postgres_client::types::ToSql; use postgres_client::AsyncMessage; use serde_json::value::RawValue; -use signature::Signer; use tokio::net::TcpStream; use tokio::time::Instant; use tokio_util::sync::CancellationToken; @@ -42,7 +41,7 @@ use 
crate::control_plane::messages::{ColdStartInfo, MetricsAuxInfo}; use crate::metrics::Metrics; pub(crate) const EXT_NAME: &str = "pg_session_jwt"; -pub(crate) const EXT_VERSION: &str = "0.1.2"; +pub(crate) const EXT_VERSION: &str = "0.2.0"; pub(crate) const EXT_SCHEMA: &str = "auth"; #[derive(Clone)] @@ -339,8 +338,8 @@ fn sign_jwt(sk: &SigningKey, payload: &[u8]) -> String { let cap = jwt.capacity(); // we only need an empty header with the alg specified. - // base64url(r#"{"alg":"ES256"}"#) == "eyJhbGciOiJFUzI1NiJ9" - jwt.push_str("eyJhbGciOiJFUzI1NiJ9."); + // base64url(r#"{"alg":"EdDSA"}"#) == "eyJhbGciOiJFZERTQSJ9" + jwt.push_str("eyJhbGciOiJFZERTQSJ9."); // encode the jwt payload in-place base64::encode_config_buf(payload, base64::URL_SAFE_NO_PAD, &mut jwt); @@ -366,14 +365,14 @@ fn sign_jwt(sk: &SigningKey, payload: &[u8]) -> String { #[cfg(test)] #[expect(clippy::unwrap_used)] mod tests { - use p256::ecdsa::SigningKey; + use ed25519_dalek::SigningKey; use typed_json::json; use super::resign_jwt; #[test] fn jwt_token_snapshot() { - let key = SigningKey::from_bytes(&[1; 32].into()).unwrap(); + let key = SigningKey::from_bytes(&[1; 32]); let data = json!({"foo":"bar","jti":"foo\nbar","nested":{"jti":"tricky nesting"}}).to_string(); @@ -381,12 +380,17 @@ mod tests { // To validate the JWT, copy the JWT string and paste it into https://jwt.io/. 
// In the public-key box, paste the following jwk public key - // `{"kty":"EC","crv":"P-256","x":"b_A7lJJBzh2t1DUZ5pYOCoW0GmmgXDKBA6orzhWUyhY","y":"PE91OlW_AdxT9sCwx-7ni0DG_30lqW4igrmJzvccFEo"}` + // `{"kty":"OKP","crv":"Ed25519","x":"iojj3XQJ8ZX9UtstPLpdcspnCb8dlBIb83SIAbQPb1w"}` + // Note - jwt.io doesn't support EdDSA :( + // https://github.com/jsonwebtoken/jsonwebtoken.github.io/issues/509 - // let pub_key = p256::ecdsa::VerifyingKey::from(&key); - // let pub_key = p256::PublicKey::from(pub_key); - // println!("{}", pub_key.to_jwk_string()); + // let jwk = jose_jwk::Key::Okp(jose_jwk::Okp { + // crv: jose_jwk::OkpCurves::Ed25519, + // x: jose_jwk::jose_b64::serde::Bytes::from(key.verifying_key().to_bytes().to_vec()), + // d: None, + // }); + // println!("{}", serde_json::to_string(&jwk).unwrap()); - assert_eq!(jwt, "eyJhbGciOiJFUzI1NiJ9.eyJmb28iOiJiYXIiLCJqdGkiOjIsIm5lc3RlZCI6eyJqdGkiOiJ0cmlja3kgbmVzdGluZyJ9fQ.pYf0LxoJ8sDgpmsYOgrbNecOSipnPBEGwnZzB-JhW2cONrKlqRsgXwK8_cOsyolGy-hTTe8GXbWTl_UdpF5RyA"); + assert_eq!(jwt, "eyJhbGciOiJFZERTQSJ9.eyJmb28iOiJiYXIiLCJqdGkiOjIsIm5lc3RlZCI6eyJqdGkiOiJ0cmlja3kgbmVzdGluZyJ9fQ.Cvyc2By33KI0f0obystwdy8PN111L3Sc9_Mr2CU3XshtSqSdxuRxNEZGbb_RvyJf2IzheC_s7aBZ-jLeQ9N0Bg"); } } diff --git a/safekeeper/Cargo.toml b/safekeeper/Cargo.toml index 3ebb7097f2..0eb511f1cc 100644 --- a/safekeeper/Cargo.toml +++ b/safekeeper/Cargo.toml @@ -26,6 +26,7 @@ hex.workspace = true humantime.workspace = true http.workspace = true hyper0.workspace = true +itertools.workspace = true futures.workspace = true once_cell.workspace = true parking_lot.workspace = true @@ -39,6 +40,7 @@ scopeguard.workspace = true reqwest = { workspace = true, features = ["json"] } serde.workspace = true serde_json.workspace = true +smallvec.workspace = true strum.workspace = true strum_macros.workspace = true thiserror.workspace = true @@ -63,6 +65,7 @@ storage_broker.workspace = true tokio-stream.workspace = true utils.workspace = true wal_decoder.workspace = true 
+env_logger.workspace = true workspace_hack.workspace = true diff --git a/safekeeper/benches/receive_wal.rs b/safekeeper/benches/receive_wal.rs index 996c4d9b8c..19c6662e74 100644 --- a/safekeeper/benches/receive_wal.rs +++ b/safekeeper/benches/receive_wal.rs @@ -21,14 +21,13 @@ const KB: usize = 1024; const MB: usize = 1024 * KB; const GB: usize = 1024 * MB; -/// Use jemalloc, and configure it to sample allocations for profiles every 1 MB. -/// This mirrors the configuration in bin/safekeeper.rs. +/// Use jemalloc and enable profiling, to mirror bin/safekeeper.rs. #[global_allocator] static GLOBAL: tikv_jemallocator::Jemalloc = tikv_jemallocator::Jemalloc; #[allow(non_upper_case_globals)] #[export_name = "malloc_conf"] -pub static malloc_conf: &[u8] = b"prof:true,prof_active:true,lg_prof_sample:20\0"; +pub static malloc_conf: &[u8] = b"prof:true,prof_active:true,lg_prof_sample:21\0"; // Register benchmarks with Criterion. criterion_group!( diff --git a/safekeeper/src/bin/safekeeper.rs b/safekeeper/src/bin/safekeeper.rs index e0ba38d638..6cc53e0d23 100644 --- a/safekeeper/src/bin/safekeeper.rs +++ b/safekeeper/src/bin/safekeeper.rs @@ -51,12 +51,12 @@ use utils::{ #[global_allocator] static GLOBAL: tikv_jemallocator::Jemalloc = tikv_jemallocator::Jemalloc; -// Configure jemalloc to sample allocations for profiles every 1 MB (1 << 20). -// TODO: disabled because concurrent CPU profiles cause seg faults. See: -// https://github.com/neondatabase/neon/issues/10225. -//#[allow(non_upper_case_globals)] -//#[export_name = "malloc_conf"] -//pub static malloc_conf: &[u8] = b"prof:true,prof_active:true,lg_prof_sample:20\0"; +/// Configure jemalloc to profile heap allocations by sampling stack traces every 2 MB (1 << 21). +/// This adds roughly 3% overhead for allocations on average, which is acceptable considering +/// performance-sensitive code will avoid allocations as far as possible anyway. 
+#[allow(non_upper_case_globals)] +#[export_name = "malloc_conf"] +pub static malloc_conf: &[u8] = b"prof:true,prof_active:true,lg_prof_sample:21\0"; const PID_FILE_NAME: &str = "safekeeper.pid"; const ID_FILE_NAME: &str = "safekeeper.id"; @@ -207,6 +207,13 @@ struct Args { /// Also defines interval for eviction retries. #[arg(long, value_parser = humantime::parse_duration, default_value = DEFAULT_EVICTION_MIN_RESIDENT)] eviction_min_resident: Duration, + /// Enable fanning out WAL to different shards from the same reader + #[arg(long)] + wal_reader_fanout: bool, + /// Only fan out the WAL reader if the absolute delta between the new requested position + /// and the current position of the reader is smaller than this value. + #[arg(long)] + max_delta_for_fanout: Option, } // Like PathBufValueParser, but allows empty string. @@ -370,6 +377,8 @@ async fn main() -> anyhow::Result<()> { control_file_save_interval: args.control_file_save_interval, partial_backup_concurrency: args.partial_backup_concurrency, eviction_min_resident: args.eviction_min_resident, + wal_reader_fanout: args.wal_reader_fanout, + max_delta_for_fanout: args.max_delta_for_fanout, }); // initialize sentry if SENTRY_DSN is provided diff --git a/safekeeper/src/control_file.rs b/safekeeper/src/control_file.rs index 06e5afbf74..e92ca881e1 100644 --- a/safekeeper/src/control_file.rs +++ b/safekeeper/src/control_file.rs @@ -3,6 +3,7 @@ use anyhow::{bail, ensure, Context, Result}; use byteorder::{LittleEndian, ReadBytesExt, WriteBytesExt}; use camino::{Utf8Path, Utf8PathBuf}; +use safekeeper_api::membership::INVALID_GENERATION; use tokio::fs::File; use tokio::io::AsyncWriteExt; use utils::crashsafe::durable_rename; @@ -13,14 +14,14 @@ use std::ops::Deref; use std::path::Path; use std::time::Instant; -use crate::control_file_upgrade::downgrade_v9_to_v8; +use crate::control_file_upgrade::downgrade_v10_to_v9; use crate::control_file_upgrade::upgrade_control_file; use 
crate::metrics::PERSIST_CONTROL_FILE_SECONDS; use crate::state::{EvictionState, TimelinePersistentState}; use utils::bin_ser::LeSer; pub const SK_MAGIC: u32 = 0xcafeceefu32; -pub const SK_FORMAT_VERSION: u32 = 9; +pub const SK_FORMAT_VERSION: u32 = 10; // contains persistent metadata for safekeeper pub const CONTROL_FILE_NAME: &str = "safekeeper.control"; @@ -169,10 +170,11 @@ impl TimelinePersistentState { let mut buf: Vec = Vec::new(); WriteBytesExt::write_u32::(&mut buf, SK_MAGIC)?; - if self.eviction_state == EvictionState::Present { - // temp hack for forward compatibility - const PREV_FORMAT_VERSION: u32 = 8; - let prev = downgrade_v9_to_v8(self); + if self.mconf.generation == INVALID_GENERATION { + // Temp hack for forward compatibility test: in case of none + // configuration save cfile in previous v9 format. + const PREV_FORMAT_VERSION: u32 = 9; + let prev = downgrade_v10_to_v9(self); WriteBytesExt::write_u32::(&mut buf, PREV_FORMAT_VERSION)?; prev.ser_into(&mut buf)?; } else { @@ -233,6 +235,7 @@ impl Storage for FileStorage { #[cfg(test)] mod test { use super::*; + use safekeeper_api::membership::{Configuration, MemberSet}; use tokio::fs; use utils::lsn::Lsn; @@ -242,6 +245,11 @@ mod test { async fn test_read_write_safekeeper_state() -> anyhow::Result<()> { let tempdir = camino_tempfile::tempdir()?; let mut state = TimelinePersistentState::empty(); + state.mconf = Configuration { + generation: 42, + members: MemberSet::empty(), + new_members: None, + }; let mut storage = FileStorage::create_new(tempdir.path(), state.clone(), NO_SYNC).await?; // Make a change. diff --git a/safekeeper/src/control_file_upgrade.rs b/safekeeper/src/control_file_upgrade.rs index dd152fd4cc..904e79f976 100644 --- a/safekeeper/src/control_file_upgrade.rs +++ b/safekeeper/src/control_file_upgrade.rs @@ -1,17 +1,22 @@ //! 
Code to deal with safekeeper control file upgrades +use std::vec; + use crate::{ safekeeper::{AcceptorState, PgUuid, TermHistory, TermLsn}, - state::{EvictionState, PersistedPeers, TimelinePersistentState}, + state::{EvictionState, TimelinePersistentState}, wal_backup_partial, }; use anyhow::{bail, Result}; use pq_proto::SystemId; -use safekeeper_api::{ServerInfo, Term}; +use safekeeper_api::{ + membership::{Configuration, INVALID_GENERATION}, + ServerInfo, Term, +}; use serde::{Deserialize, Serialize}; use tracing::*; use utils::{ bin_ser::LeSer, - id::{TenantId, TimelineId}, + id::{NodeId, TenantId, TimelineId}, lsn::Lsn, }; @@ -233,6 +238,90 @@ pub struct SafeKeeperStateV8 { pub partial_backup: wal_backup_partial::State, } +#[derive(Debug, Clone, Serialize, Deserialize, PartialEq)] +pub struct PersistedPeers(pub Vec<(NodeId, PersistedPeerInfo)>); + +#[derive(Debug, Clone, Serialize, Deserialize, PartialEq)] +pub struct PersistedPeerInfo { + /// LSN up to which safekeeper offloaded WAL to s3. + pub backup_lsn: Lsn, + /// Term of the last entry. + pub term: Term, + /// LSN of the last record. + pub flush_lsn: Lsn, + /// Up to which LSN safekeeper regards its WAL as committed. + pub commit_lsn: Lsn, +} + +impl PersistedPeerInfo { + pub fn new() -> Self { + Self { + backup_lsn: Lsn::INVALID, + term: safekeeper_api::INITIAL_TERM, + flush_lsn: Lsn(0), + commit_lsn: Lsn(0), + } + } +} + +// make clippy happy +impl Default for PersistedPeerInfo { + fn default() -> Self { + Self::new() + } +} + +/// Note: SafekeeperStateVn is old name for TimelinePersistentStateVn. +#[derive(Debug, Clone, Serialize, Deserialize, PartialEq)] +pub struct TimelinePersistentStateV9 { + #[serde(with = "hex")] + pub tenant_id: TenantId, + #[serde(with = "hex")] + pub timeline_id: TimelineId, + /// persistent acceptor state + pub acceptor_state: AcceptorState, + /// information about server + pub server: ServerInfo, + /// Unique id of the last *elected* proposer we dealt with. 
Not needed + /// for correctness, exists for monitoring purposes. + #[serde(with = "hex")] + pub proposer_uuid: PgUuid, + /// Since which LSN this timeline generally starts. Safekeeper might have + /// joined later. + pub timeline_start_lsn: Lsn, + /// Since which LSN safekeeper has (had) WAL for this timeline. + /// All WAL segments next to one containing local_start_lsn are + /// filled with data from the beginning. + pub local_start_lsn: Lsn, + /// Part of WAL acknowledged by quorum *and available locally*. Always points + /// to record boundary. + pub commit_lsn: Lsn, + /// LSN that points to the end of the last backed up segment. Useful to + /// persist to avoid finding out offloading progress on boot. + pub backup_lsn: Lsn, + /// Minimal LSN which may be needed for recovery of some safekeeper (end_lsn + /// of last record streamed to everyone). Persisting it helps skipping + /// recovery in walproposer, generally we compute it from peers. In + /// walproposer proto called 'truncate_lsn'. Updates are currently driven + /// only by walproposer. + pub peer_horizon_lsn: Lsn, + /// LSN of the oldest known checkpoint made by pageserver and successfully + /// pushed to s3. We don't remove WAL beyond it. Persisted only for + /// informational purposes, we receive it from pageserver (or broker). + pub remote_consistent_lsn: Lsn, + /// Peers and their state as we remember it. Knowing peers themselves is + /// fundamental; but state is saved here only for informational purposes and + /// obviously can be stale. (Currently not saved at all, but let's provision + /// place to have fewer file version upgrades). + pub peers: PersistedPeers, + /// Holds names of partial segments uploaded to remote storage. Used to + /// clean up old objects without leaving garbage in remote storage. + pub partial_backup: wal_backup_partial::State, + /// Eviction state of the timeline. If it's Offloaded, we should download + /// WAL files from remote storage to serve the timeline.
+ pub eviction_state: EvictionState, +} + pub fn upgrade_control_file(buf: &[u8], version: u32) -> Result { // migrate to storing full term history if version == 1 { @@ -248,6 +337,7 @@ pub fn upgrade_control_file(buf: &[u8], version: u32) -> Result Result Result Result Result Result Result Result Result Result Result Result Result SafeKeeperStateV8 { - assert!(state.eviction_state == EvictionState::Present); - SafeKeeperStateV8 { +// Used as a temp hack to make forward compatibility test work. Should be +// removed after PR adding v10 is merged. +pub fn downgrade_v10_to_v9(state: &TimelinePersistentState) -> TimelinePersistentStateV9 { + assert!(state.mconf.generation == INVALID_GENERATION); + TimelinePersistentStateV9 { tenant_id: state.tenant_id, timeline_id: state.timeline_id, acceptor_state: state.acceptor_state.clone(), @@ -426,8 +542,9 @@ pub fn downgrade_v9_to_v8(state: &TimelinePersistentState) -> SafeKeeperStateV8 backup_lsn: state.backup_lsn, peer_horizon_lsn: state.peer_horizon_lsn, remote_consistent_lsn: state.remote_consistent_lsn, - peers: state.peers.clone(), + peers: PersistedPeers(vec![]), partial_backup: state.partial_backup.clone(), + eviction_state: state.eviction_state, } } @@ -437,7 +554,7 @@ mod tests { use utils::{id::NodeId, Hex}; - use crate::safekeeper::PersistedPeerInfo; + use crate::control_file_upgrade::PersistedPeerInfo; use super::*; diff --git a/safekeeper/src/copy_timeline.rs b/safekeeper/src/copy_timeline.rs index 28ef2b1d23..10a761e1f5 100644 --- a/safekeeper/src/copy_timeline.rs +++ b/safekeeper/src/copy_timeline.rs @@ -1,6 +1,7 @@ use anyhow::{bail, Result}; use camino::Utf8PathBuf; use postgres_ffi::{MAX_SEND_SIZE, WAL_SEGMENT_SIZE}; +use safekeeper_api::membership::Configuration; use std::sync::Arc; use tokio::{ fs::OpenOptions, @@ -147,10 +148,10 @@ pub async fn handle_request( let mut new_state = TimelinePersistentState::new( &request.destination_ttid, + Configuration::empty(), state.server.clone(), - vec![], - 
request.until_lsn, start_lsn, + request.until_lsn, )?; new_state.timeline_start_lsn = start_lsn; new_state.peer_horizon_lsn = request.until_lsn; diff --git a/safekeeper/src/handler.rs b/safekeeper/src/handler.rs index bb639bfb32..e77eeb4130 100644 --- a/safekeeper/src/handler.rs +++ b/safekeeper/src/handler.rs @@ -52,16 +52,70 @@ pub struct SafekeeperPostgresHandler { /// Parsed Postgres command. enum SafekeeperPostgresCommand { - StartWalPush, - StartReplication { start_lsn: Lsn, term: Option }, + StartWalPush { + proto_version: u32, + // Eventually timelines will be always created explicitly by storcon. + // This option allows legacy behaviour for compute to do that until we + // fully migrate. + allow_timeline_creation: bool, + }, + StartReplication { + start_lsn: Lsn, + term: Option, + }, IdentifySystem, TimelineStatus, - JSONCtrl { cmd: AppendLogicalMessage }, + JSONCtrl { + cmd: AppendLogicalMessage, + }, } fn parse_cmd(cmd: &str) -> anyhow::Result { if cmd.starts_with("START_WAL_PUSH") { - Ok(SafekeeperPostgresCommand::StartWalPush) + // Allow additional options in postgres START_REPLICATION style like + // START_WAL_PUSH (proto_version '3', allow_timeline_creation 'false'). + // Parsing here is very naive and breaks in case of commas or + // whitespaces in values, but enough for our purposes. 
+ let re = Regex::new(r"START_WAL_PUSH(\s+?\((.*)\))?").unwrap(); + let caps = re + .captures(cmd) + .context(format!("failed to parse START_WAL_PUSH command {}", cmd))?; + // capture () content + let options = caps.get(2).map(|m| m.as_str()).unwrap_or(""); + // default values + let mut proto_version = 2; + let mut allow_timeline_creation = true; + for kvstr in options.split(",") { + if kvstr.is_empty() { + continue; + } + let mut kvit = kvstr.split_whitespace(); + let key = kvit.next().context(format!( + "failed to parse key in kv {} in command {}", + kvstr, cmd + ))?; + let value = kvit.next().context(format!( + "failed to parse value in kv {} in command {}", + kvstr, cmd + ))?; + let value_trimmed = value.trim_matches('\''); + if key == "proto_version" { + proto_version = value_trimmed.parse::().context(format!( + "failed to parse proto_version value {} in command {}", + value, cmd + ))?; + } + if key == "allow_timeline_creation" { + allow_timeline_creation = value_trimmed.parse::().context(format!( + "failed to parse allow_timeline_creation value {} in command {}", + value, cmd + ))?; + } + } + Ok(SafekeeperPostgresCommand::StartWalPush { + proto_version, + allow_timeline_creation, + }) } else if cmd.starts_with("START_REPLICATION") { let re = Regex::new( // We follow postgres START_REPLICATION LOGICAL options to pass term. @@ -95,7 +149,7 @@ fn parse_cmd(cmd: &str) -> anyhow::Result { fn cmd_to_string(cmd: &SafekeeperPostgresCommand) -> &str { match cmd { - SafekeeperPostgresCommand::StartWalPush => "START_WAL_PUSH", + SafekeeperPostgresCommand::StartWalPush { .. } => "START_WAL_PUSH", SafekeeperPostgresCommand::StartReplication { .. 
} => "START_REPLICATION", SafekeeperPostgresCommand::TimelineStatus => "TIMELINE_STATUS", SafekeeperPostgresCommand::IdentifySystem => "IDENTIFY_SYSTEM", @@ -293,8 +347,11 @@ impl postgres_backend::Handler self.ttid = TenantTimelineId::new(tenant_id, timeline_id); match cmd { - SafekeeperPostgresCommand::StartWalPush => { - self.handle_start_wal_push(pgb) + SafekeeperPostgresCommand::StartWalPush { + proto_version, + allow_timeline_creation, + } => { + self.handle_start_wal_push(pgb, proto_version, allow_timeline_creation) .instrument(info_span!("WAL receiver")) .await } @@ -467,3 +524,39 @@ impl SafekeeperPostgresHandler { } } } + +#[cfg(test)] +mod tests { + use super::SafekeeperPostgresCommand; + + /// Test parsing of START_WAL_PUSH command + #[test] + fn test_start_wal_push_parse() { + let cmd = "START_WAL_PUSH"; + let parsed = super::parse_cmd(cmd).expect("failed to parse"); + match parsed { + SafekeeperPostgresCommand::StartWalPush { + proto_version, + allow_timeline_creation, + } => { + assert_eq!(proto_version, 2); + assert!(allow_timeline_creation); + } + _ => panic!("unexpected command"), + } + + let cmd = + "START_WAL_PUSH (proto_version '3', allow_timeline_creation 'false', unknown 'hoho')"; + let parsed = super::parse_cmd(cmd).expect("failed to parse"); + match parsed { + SafekeeperPostgresCommand::StartWalPush { + proto_version, + allow_timeline_creation, + } => { + assert_eq!(proto_version, 3); + assert!(!allow_timeline_creation); + } + _ => panic!("unexpected command"), + } + } +} diff --git a/safekeeper/src/http/routes.rs b/safekeeper/src/http/routes.rs index 6186f4c3ba..4b9fb9eb67 100644 --- a/safekeeper/src/http/routes.rs +++ b/safekeeper/src/http/routes.rs @@ -1,4 +1,5 @@ use hyper::{Body, Request, Response, StatusCode}; +use safekeeper_api::models; use safekeeper_api::models::AcceptorStateStatus; use safekeeper_api::models::SafekeeperStatus; use safekeeper_api::models::TermSwitchApiEntry; @@ -111,14 +112,15 @@ async fn 
timeline_create_handler(mut request: Request) -> Result) -> Result) -> Result) -> Result, +) -> Result, ApiError> { + let ttid = TenantTimelineId::new( + parse_request_param(&request, "tenant_id")?, + parse_request_param(&request, "timeline_id")?, + ); + check_permission(&request, Some(ttid.tenant_id))?; + + let global_timelines = get_global_timelines(&request); + let tli = global_timelines.get(ttid).map_err(ApiError::from)?; + + let data: models::TimelineMembershipSwitchRequest = json_request(&mut request).await?; + let response = tli + .membership_switch(data.mconf) + .await + .map_err(ApiError::InternalServerError)?; + + json_response(StatusCode::OK, response) +} + async fn timeline_copy_handler(mut request: Request) -> Result, ApiError> { check_permission(&request, None)?; @@ -618,6 +643,10 @@ pub fn make_router( "/v1/tenant/:tenant_id/timeline/:timeline_id/snapshot/:destination_id", |r| request_span(r, timeline_snapshot_handler), ) + .post( + "/v1/tenant/:tenant_id/timeline/:timeline_id/membership", + |r| request_span(r, timeline_membership_handler), + ) .post( "/v1/tenant/:tenant_id/timeline/:source_timeline_id/copy", |r| request_span(r, timeline_copy_handler), diff --git a/safekeeper/src/json_ctrl.rs b/safekeeper/src/json_ctrl.rs index 256e350ceb..19e17c4a75 100644 --- a/safekeeper/src/json_ctrl.rs +++ b/safekeeper/src/json_ctrl.rs @@ -8,6 +8,7 @@ use anyhow::Context; use postgres_backend::QueryError; +use safekeeper_api::membership::Configuration; use safekeeper_api::{ServerInfo, Term}; use serde::{Deserialize, Serialize}; use tokio::io::{AsyncRead, AsyncWrite}; @@ -105,6 +106,7 @@ async fn prepare_safekeeper( .global_timelines .create( spg.ttid, + Configuration::empty(), ServerInfo { pg_version, wal_seg_size: WAL_SEGMENT_SIZE as u32, diff --git a/safekeeper/src/lib.rs b/safekeeper/src/lib.rs index 7acf355e6a..e0090c638a 100644 --- a/safekeeper/src/lib.rs +++ b/safekeeper/src/lib.rs @@ -108,6 +108,8 @@ pub struct SafeKeeperConf { pub 
control_file_save_interval: Duration, pub partial_backup_concurrency: usize, pub eviction_min_resident: Duration, + pub wal_reader_fanout: bool, + pub max_delta_for_fanout: Option, } impl SafeKeeperConf { @@ -150,6 +152,8 @@ impl SafeKeeperConf { control_file_save_interval: Duration::from_secs(1), partial_backup_concurrency: 1, eviction_min_resident: Duration::ZERO, + wal_reader_fanout: false, + max_delta_for_fanout: None, } } } diff --git a/safekeeper/src/metrics.rs b/safekeeper/src/metrics.rs index 5883f402c7..3ea9e3d674 100644 --- a/safekeeper/src/metrics.rs +++ b/safekeeper/src/metrics.rs @@ -12,9 +12,9 @@ use metrics::{ pow2_buckets, proto::MetricFamily, register_histogram, register_histogram_vec, register_int_counter, register_int_counter_pair, - register_int_counter_pair_vec, register_int_counter_vec, register_int_gauge, Gauge, GaugeVec, - Histogram, HistogramVec, IntCounter, IntCounterPair, IntCounterPairVec, IntCounterVec, - IntGauge, IntGaugeVec, DISK_FSYNC_SECONDS_BUCKETS, + register_int_counter_pair_vec, register_int_counter_vec, register_int_gauge, + register_int_gauge_vec, Gauge, GaugeVec, Histogram, HistogramVec, IntCounter, IntCounterPair, + IntCounterPairVec, IntCounterVec, IntGauge, IntGaugeVec, DISK_FSYNC_SECONDS_BUCKETS, }; use once_cell::sync::Lazy; use postgres_ffi::XLogSegNo; @@ -211,6 +211,14 @@ pub static WAL_RECEIVERS: Lazy = Lazy::new(|| { ) .expect("Failed to register safekeeper_wal_receivers") }); +pub static WAL_READERS: Lazy = Lazy::new(|| { + register_int_gauge_vec!( + "safekeeper_wal_readers", + "Number of active WAL readers (may serve pageservers or other safekeepers)", + &["kind", "target"] + ) + .expect("Failed to register safekeeper_wal_receivers") +}); pub static WAL_RECEIVER_QUEUE_DEPTH: Lazy = Lazy::new(|| { // Use powers of two buckets, but add a bucket at 0 and the max queue size to track empty and // full queues respectively. 
@@ -443,6 +451,7 @@ pub struct FullTimelineInfo { pub timeline_is_active: bool, pub num_computes: u32, pub last_removed_segno: XLogSegNo, + pub interpreted_wal_reader_tasks: usize, pub epoch_start_lsn: Lsn, pub mem_state: TimelineMemState, @@ -472,6 +481,7 @@ pub struct TimelineCollector { disk_usage: GenericGaugeVec, acceptor_term: GenericGaugeVec, written_wal_bytes: GenericGaugeVec, + interpreted_wal_reader_tasks: GenericGaugeVec, written_wal_seconds: GaugeVec, flushed_wal_seconds: GaugeVec, collect_timeline_metrics: Gauge, @@ -670,6 +680,16 @@ impl TimelineCollector { .unwrap(); descs.extend(active_timelines_count.desc().into_iter().cloned()); + let interpreted_wal_reader_tasks = GenericGaugeVec::new( + Opts::new( + "safekeeper_interpreted_wal_reader_tasks", + "Number of active interpreted wal reader tasks, grouped by timeline", + ), + &["tenant_id", "timeline_id"], + ) + .unwrap(); + descs.extend(interpreted_wal_reader_tasks.desc().into_iter().cloned()); + TimelineCollector { global_timelines, descs, @@ -693,6 +713,7 @@ impl TimelineCollector { collect_timeline_metrics, timelines_count, active_timelines_count, + interpreted_wal_reader_tasks, } } } @@ -721,6 +742,7 @@ impl Collector for TimelineCollector { self.disk_usage.reset(); self.acceptor_term.reset(); self.written_wal_bytes.reset(); + self.interpreted_wal_reader_tasks.reset(); self.written_wal_seconds.reset(); self.flushed_wal_seconds.reset(); @@ -782,6 +804,9 @@ impl Collector for TimelineCollector { self.written_wal_bytes .with_label_values(labels) .set(tli.wal_storage.write_wal_bytes); + self.interpreted_wal_reader_tasks + .with_label_values(labels) + .set(tli.interpreted_wal_reader_tasks as u64); self.written_wal_seconds .with_label_values(labels) .set(tli.wal_storage.write_wal_seconds); @@ -834,6 +859,7 @@ impl Collector for TimelineCollector { mfs.extend(self.disk_usage.collect()); mfs.extend(self.acceptor_term.collect()); mfs.extend(self.written_wal_bytes.collect()); + 
mfs.extend(self.interpreted_wal_reader_tasks.collect()); mfs.extend(self.written_wal_seconds.collect()); mfs.extend(self.flushed_wal_seconds.collect()); diff --git a/safekeeper/src/receive_wal.rs b/safekeeper/src/receive_wal.rs index 3e9ce1da8e..cb42f6f414 100644 --- a/safekeeper/src/receive_wal.rs +++ b/safekeeper/src/receive_wal.rs @@ -21,6 +21,7 @@ use postgres_backend::PostgresBackend; use postgres_backend::PostgresBackendReader; use postgres_backend::QueryError; use pq_proto::BeMessage; +use safekeeper_api::membership::Configuration; use safekeeper_api::models::{ConnectionId, WalReceiverState, WalReceiverStatus}; use safekeeper_api::ServerInfo; use std::future; @@ -199,9 +200,14 @@ impl SafekeeperPostgresHandler { pub async fn handle_start_wal_push( &mut self, pgb: &mut PostgresBackend, + proto_version: u32, + allow_timeline_creation: bool, ) -> Result<(), QueryError> { let mut tli: Option = None; - if let Err(end) = self.handle_start_wal_push_guts(pgb, &mut tli).await { + if let Err(end) = self + .handle_start_wal_push_guts(pgb, &mut tli, proto_version, allow_timeline_creation) + .await + { // Log the result and probably send it to the client, closing the stream. let handle_end_fut = pgb.handle_copy_stream_end(end); // If we managed to create the timeline, augment logging with current LSNs etc. @@ -221,6 +227,8 @@ impl SafekeeperPostgresHandler { &mut self, pgb: &mut PostgresBackend, tli: &mut Option, + proto_version: u32, + allow_timeline_creation: bool, ) -> Result<(), CopyStreamHandlerEnd> { // The `tli` parameter is only used for passing _out_ a timeline, one should // not have been passed in. @@ -249,12 +257,17 @@ impl SafekeeperPostgresHandler { conn_id: self.conn_id, pgb_reader: &mut pgb_reader, peer_addr, + proto_version, acceptor_handle: &mut acceptor_handle, global_timelines: self.global_timelines.clone(), }; - // Read first message and create timeline if needed. 
- let res = network_reader.read_first_message().await; + // Read first message and create timeline if needed and allowed. This + // won't be when timelines will be always created by storcon and + // allow_timeline_creation becomes false. + let res = network_reader + .read_first_message(allow_timeline_creation) + .await; let network_res = if let Ok((timeline, next_msg)) = res { let pageserver_feedback_rx: tokio::sync::broadcast::Receiver = @@ -312,6 +325,7 @@ struct NetworkReader<'a, IO> { conn_id: ConnectionId, pgb_reader: &'a mut PostgresBackendReader, peer_addr: SocketAddr, + proto_version: u32, // WalAcceptor is spawned when we learn server info from walproposer and // create timeline; handle is put here. acceptor_handle: &'a mut Option>>, @@ -321,9 +335,10 @@ struct NetworkReader<'a, IO> { impl NetworkReader<'_, IO> { async fn read_first_message( &mut self, + allow_timeline_creation: bool, ) -> Result<(WalResidentTimeline, ProposerAcceptorMessage), CopyStreamHandlerEnd> { // Receive information about server to create timeline, if not yet. - let next_msg = read_message(self.pgb_reader).await?; + let next_msg = read_message(self.pgb_reader, self.proto_version).await?; let tli = match next_msg { ProposerAcceptorMessage::Greeting(ref greeting) => { info!( @@ -335,11 +350,22 @@ impl NetworkReader<'_, IO> { system_id: greeting.system_id, wal_seg_size: greeting.wal_seg_size, }; - let tli = self - .global_timelines - .create(self.ttid, server_info, Lsn::INVALID, Lsn::INVALID) - .await - .context("create timeline")?; + let tli = if allow_timeline_creation { + self.global_timelines + .create( + self.ttid, + Configuration::empty(), + server_info, + Lsn::INVALID, + Lsn::INVALID, + ) + .await + .context("create timeline")? + } else { + self.global_timelines + .get(self.ttid) + .context("get timeline")? + }; tli.wal_residence_guard().await? 
} _ => { @@ -368,7 +394,7 @@ impl NetworkReader<'_, IO> { )); // Forward all messages to WalAcceptor - read_network_loop(self.pgb_reader, msg_tx, next_msg).await + read_network_loop(self.pgb_reader, msg_tx, next_msg, self.proto_version).await } } @@ -376,9 +402,10 @@ impl NetworkReader<'_, IO> { /// TODO: Return Ok(None) on graceful termination. async fn read_message( pgb_reader: &mut PostgresBackendReader, + proto_version: u32, ) -> Result { let copy_data = pgb_reader.read_copy_message().await?; - let msg = ProposerAcceptorMessage::parse(copy_data)?; + let msg = ProposerAcceptorMessage::parse(copy_data, proto_version)?; Ok(msg) } @@ -386,6 +413,7 @@ async fn read_network_loop( pgb_reader: &mut PostgresBackendReader, msg_tx: Sender, mut next_msg: ProposerAcceptorMessage, + proto_version: u32, ) -> Result<(), CopyStreamHandlerEnd> { /// Threshold for logging slow WalAcceptor sends. const SLOW_THRESHOLD: Duration = Duration::from_secs(5); @@ -418,7 +446,7 @@ async fn read_network_loop( WAL_RECEIVER_QUEUE_DEPTH_TOTAL.inc(); WAL_RECEIVER_QUEUE_SIZE_TOTAL.add(size as i64); - next_msg = read_message(pgb_reader).await?; + next_msg = read_message(pgb_reader, proto_version).await?; } } diff --git a/safekeeper/src/safekeeper.rs b/safekeeper/src/safekeeper.rs index 6ceaf325b0..45e19c31b6 100644 --- a/safekeeper/src/safekeeper.rs +++ b/safekeeper/src/safekeeper.rs @@ -7,7 +7,6 @@ use bytes::{Buf, BufMut, Bytes, BytesMut}; use postgres_ffi::{TimeLineID, MAX_SEND_SIZE}; use safekeeper_api::models::HotStandbyFeedback; use safekeeper_api::Term; -use safekeeper_api::INVALID_TERM; use serde::{Deserialize, Serialize}; use std::cmp::max; use std::cmp::min; @@ -30,7 +29,7 @@ use utils::{ lsn::Lsn, }; -const SK_PROTOCOL_VERSION: u32 = 2; +pub const SK_PROTOCOL_VERSION: u32 = 2; pub const UNKNOWN_SERVER_VERSION: u32 = 0; #[derive(Debug, Clone, Copy, Serialize, Deserialize, PartialEq, Eq, PartialOrd, Ord)] @@ -193,36 +192,6 @@ impl AcceptorState { } } -#[derive(Debug, Clone, Serialize, 
Deserialize, PartialEq)] -pub struct PersistedPeerInfo { - /// LSN up to which safekeeper offloaded WAL to s3. - pub backup_lsn: Lsn, - /// Term of the last entry. - pub term: Term, - /// LSN of the last record. - pub flush_lsn: Lsn, - /// Up to which LSN safekeeper regards its WAL as committed. - pub commit_lsn: Lsn, -} - -impl PersistedPeerInfo { - pub fn new() -> Self { - Self { - backup_lsn: Lsn::INVALID, - term: INVALID_TERM, - flush_lsn: Lsn(0), - commit_lsn: Lsn(0), - } - } -} - -// make clippy happy -impl Default for PersistedPeerInfo { - fn default() -> Self { - Self::new() - } -} - // protocol messages /// Initial Proposer -> Acceptor message @@ -348,7 +317,14 @@ pub enum ProposerAcceptorMessage { impl ProposerAcceptorMessage { /// Parse proposer message. - pub fn parse(msg_bytes: Bytes) -> Result { + pub fn parse(msg_bytes: Bytes, proto_version: u32) -> Result { + if proto_version != SK_PROTOCOL_VERSION { + bail!( + "incompatible protocol version {}, expected {}", + proto_version, + SK_PROTOCOL_VERSION + ); + } // xxx using Reader is inefficient but easy to work with bincode let mut stream = msg_bytes.reader(); // u64 is here to avoid padding; it will be removed once we stop packing C structs into the wire as is @@ -1010,7 +986,7 @@ where /// Update commit_lsn from peer safekeeper data. pub async fn record_safekeeper_info(&mut self, sk_info: &SafekeeperTimelineInfo) -> Result<()> { - if (Lsn(sk_info.commit_lsn) != Lsn::INVALID) && (sk_info.last_log_term != INVALID_TERM) { + if Lsn(sk_info.commit_lsn) != Lsn::INVALID { // Note: the check is too restrictive, generally we can update local // commit_lsn if our history matches (is part of) history of advanced // commit_lsn provider. 
@@ -1025,12 +1001,20 @@ where #[cfg(test)] mod tests { use futures::future::BoxFuture; + use postgres_ffi::{XLogSegNo, WAL_SEGMENT_SIZE}; - use safekeeper_api::ServerInfo; + use safekeeper_api::{ + membership::{Configuration, MemberSet, SafekeeperId}, + ServerInfo, + }; use super::*; - use crate::state::{EvictionState, PersistedPeers, TimelinePersistentState}; - use std::{ops::Deref, str::FromStr, time::Instant}; + use crate::state::{EvictionState, TimelinePersistentState}; + use std::{ + ops::Deref, + str::FromStr, + time::{Instant, UNIX_EPOCH}, + }; // fake storage for tests struct InMemoryState { @@ -1313,12 +1297,21 @@ mod tests { #[test] fn test_sk_state_bincode_serde_roundtrip() { - use utils::Hex; let tenant_id = TenantId::from_str("cf0480929707ee75372337efaa5ecf96").unwrap(); let timeline_id = TimelineId::from_str("112ded66422aa5e953e5440fa5427ac4").unwrap(); let state = TimelinePersistentState { tenant_id, timeline_id, + mconf: Configuration { + generation: 42, + members: MemberSet::new(vec![SafekeeperId { + id: NodeId(1), + host: "hehe.org".to_owned(), + pg_port: 5432, + }]) + .expect("duplicate member"), + new_members: None, + }, acceptor_state: AcceptorState { term: 42, term_history: TermHistory(vec![TermLsn { @@ -1342,70 +1335,13 @@ mod tests { backup_lsn: Lsn(1234567300), peer_horizon_lsn: Lsn(9999999), remote_consistent_lsn: Lsn(1234560000), - peers: PersistedPeers(vec![( - NodeId(1), - PersistedPeerInfo { - backup_lsn: Lsn(1234567000), - term: 42, - flush_lsn: Lsn(1234567800 - 8), - commit_lsn: Lsn(1234567600), - }, - )]), partial_backup: crate::wal_backup_partial::State::default(), eviction_state: EvictionState::Present, + creation_ts: UNIX_EPOCH, }; let ser = state.ser().unwrap(); - #[rustfmt::skip] - let expected = [ - // tenant_id as length prefixed hex - 0x20, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, - 0x63, 0x66, 0x30, 0x34, 0x38, 0x30, 0x39, 0x32, 0x39, 0x37, 0x30, 0x37, 0x65, 0x65, 0x37, 0x35, 0x33, 0x37, 0x32, 0x33, 0x33, 0x37, 0x65, 0x66, 
0x61, 0x61, 0x35, 0x65, 0x63, 0x66, 0x39, 0x36, - // timeline_id as length prefixed hex - 0x20, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, - 0x31, 0x31, 0x32, 0x64, 0x65, 0x64, 0x36, 0x36, 0x34, 0x32, 0x32, 0x61, 0x61, 0x35, 0x65, 0x39, 0x35, 0x33, 0x65, 0x35, 0x34, 0x34, 0x30, 0x66, 0x61, 0x35, 0x34, 0x32, 0x37, 0x61, 0x63, 0x34, - // term - 0x2a, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, - // length prefix - 0x01, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, - // unsure why this order is swapped - 0x29, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, - 0x01, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, - // pg_version - 0x0e, 0x00, 0x00, 0x00, - // systemid - 0x21, 0x43, 0x65, 0x87, 0x78, 0x56, 0x34, 0x12, - // wal_seg_size - 0x78, 0x56, 0x34, 0x12, - // pguuid as length prefixed hex - 0x20, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, - 0x63, 0x34, 0x37, 0x61, 0x34, 0x32, 0x61, 0x35, 0x30, 0x66, 0x34, 0x34, 0x65, 0x35, 0x35, 0x33, 0x65, 0x39, 0x61, 0x35, 0x32, 0x61, 0x34, 0x32, 0x36, 0x36, 0x65, 0x64, 0x32, 0x64, 0x31, 0x31, - - // timeline_start_lsn - 0x00, 0x56, 0x34, 0x12, 0x00, 0x00, 0x00, 0x00, - 0x12, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, - 0x78, 0x02, 0x96, 0x49, 0x00, 0x00, 0x00, 0x00, - 0x84, 0x00, 0x96, 0x49, 0x00, 0x00, 0x00, 0x00, - 0x7f, 0x96, 0x98, 0x00, 0x00, 0x00, 0x00, 0x00, - 0x00, 0xe4, 0x95, 0x49, 0x00, 0x00, 0x00, 0x00, - // length prefix for persistentpeers - 0x01, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, - // nodeid - 0x01, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, - // backuplsn - 0x58, 0xff, 0x95, 0x49, 0x00, 0x00, 0x00, 0x00, - 0x2a, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, - 0x70, 0x02, 0x96, 0x49, 0x00, 0x00, 0x00, 0x00, - 0xb0, 0x01, 0x96, 0x49, 0x00, 0x00, 0x00, 0x00, - // partial_backup - 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, - // eviction_state - 0x00, 0x00, 0x00, 0x00, - ]; - - assert_eq!(Hex(&ser), Hex(&expected)); - let deser = TimelinePersistentState::des(&ser).unwrap(); assert_eq!(deser, state); diff --git 
a/safekeeper/src/send_interpreted_wal.rs b/safekeeper/src/send_interpreted_wal.rs index 7d215176dd..ea09ce364d 100644 --- a/safekeeper/src/send_interpreted_wal.rs +++ b/safekeeper/src/send_interpreted_wal.rs @@ -1,96 +1,330 @@ +use std::collections::HashMap; +use std::fmt::Display; +use std::sync::Arc; use std::time::Duration; -use anyhow::Context; +use anyhow::{anyhow, Context}; +use futures::future::Either; use futures::StreamExt; use pageserver_api::shard::ShardIdentity; use postgres_backend::{CopyStreamHandlerEnd, PostgresBackend}; -use postgres_ffi::MAX_SEND_SIZE; +use postgres_ffi::waldecoder::WalDecodeError; use postgres_ffi::{get_current_timestamp, waldecoder::WalStreamDecoder}; use pq_proto::{BeMessage, InterpretedWalRecordsBody, WalSndKeepAlive}; use tokio::io::{AsyncRead, AsyncWrite}; +use tokio::sync::mpsc::error::SendError; +use tokio::task::JoinHandle; use tokio::time::MissedTickBehavior; +use tracing::{info_span, Instrument}; use utils::lsn::Lsn; use utils::postgres_client::Compression; use utils::postgres_client::InterpretedFormat; use wal_decoder::models::{InterpretedWalRecord, InterpretedWalRecords}; use wal_decoder::wire_format::ToWireFormat; -use crate::send_wal::EndWatchView; -use crate::wal_reader_stream::{WalBytes, WalReaderStreamBuilder}; +use crate::metrics::WAL_READERS; +use crate::send_wal::{EndWatchView, WalSenderGuard}; +use crate::timeline::WalResidentTimeline; +use crate::wal_reader_stream::{StreamingWalReader, WalBytes}; -/// Shard-aware interpreted record sender. -/// This is used for sending WAL to the pageserver. Said WAL -/// is pre-interpreted and filtered for the shard. 
-pub(crate) struct InterpretedWalSender<'a, IO> { - pub(crate) format: InterpretedFormat, - pub(crate) compression: Option, - pub(crate) pgb: &'a mut PostgresBackend, - pub(crate) wal_stream_builder: WalReaderStreamBuilder, - pub(crate) end_watch_view: EndWatchView, - pub(crate) shard: ShardIdentity, - pub(crate) pg_version: u32, - pub(crate) appname: Option, +/// Identifier used to differentiate between senders of the same +/// shard. +/// +/// In the steady state there's only one, but two pageservers may +/// temporarily have the same shard attached and attempt to ingest +/// WAL for it. See also [`ShardSenderId`]. +#[derive(Hash, Eq, PartialEq, Copy, Clone)] +struct SenderId(u8); + +impl SenderId { + fn first() -> Self { + SenderId(0) + } + + fn next(&self) -> Self { + SenderId(self.0.checked_add(1).expect("few senders")) + } } -struct Batch { +#[derive(Hash, Eq, PartialEq)] +struct ShardSenderId { + shard: ShardIdentity, + sender_id: SenderId, +} + +impl Display for ShardSenderId { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + write!(f, "{}{}", self.sender_id.0, self.shard.shard_slug()) + } +} + +impl ShardSenderId { + fn new(shard: ShardIdentity, sender_id: SenderId) -> Self { + ShardSenderId { shard, sender_id } + } + + fn shard(&self) -> ShardIdentity { + self.shard + } +} + +/// Shard-aware fan-out interpreted record reader. +/// Reads WAL from disk, decodes it, interprets it, and sends +/// it to any [`InterpretedWalSender`] connected to it. +/// Each [`InterpretedWalSender`] corresponds to one shard +/// and gets interpreted records concerning that shard only. +pub(crate) struct InterpretedWalReader { + wal_stream: StreamingWalReader, + shard_senders: HashMap>, + shard_notification_rx: Option>, + state: Arc>, + pg_version: u32, +} + +/// A handle for [`InterpretedWalReader`] which allows for interacting with it +/// when it runs as a separate tokio task.
+#[derive(Debug)] +pub(crate) struct InterpretedWalReaderHandle { + join_handle: JoinHandle>, + state: Arc>, + shard_notification_tx: tokio::sync::mpsc::UnboundedSender, +} + +struct ShardSenderState { + sender_id: SenderId, + tx: tokio::sync::mpsc::Sender, + next_record_lsn: Lsn, +} + +/// State of [`InterpretedWalReader`] visible outside of the task running it. +#[derive(Debug)] +pub(crate) enum InterpretedWalReaderState { + Running { current_position: Lsn }, + Done, +} + +pub(crate) struct Batch { wal_end_lsn: Lsn, available_wal_end_lsn: Lsn, records: InterpretedWalRecords, } -impl InterpretedWalSender<'_, IO> { - /// Send interpreted WAL to a receiver. - /// Stops when an error occurs or the receiver is caught up and there's no active compute. - /// - /// Err(CopyStreamHandlerEnd) is always returned; Result is used only for ? - /// convenience. - pub(crate) async fn run(self) -> Result<(), CopyStreamHandlerEnd> { - let mut wal_position = self.wal_stream_builder.start_pos(); - let mut wal_decoder = - WalStreamDecoder::new(self.wal_stream_builder.start_pos(), self.pg_version); +#[derive(thiserror::Error, Debug)] +pub enum InterpretedWalReaderError { + /// Handler initiates the end of streaming. + #[error("decode error: {0}")] + Decode(#[from] WalDecodeError), + #[error("read or interpret error: {0}")] + ReadOrInterpret(#[from] anyhow::Error), + #[error("wal stream closed")] + WalStreamClosed, +} - let stream = self.wal_stream_builder.build(MAX_SEND_SIZE).await?; - let mut stream = std::pin::pin!(stream); +impl InterpretedWalReaderState { + fn current_position(&self) -> Option { + match self { + InterpretedWalReaderState::Running { + current_position, .. 
+ } => Some(*current_position), + InterpretedWalReaderState::Done => None, + } + } +} - let mut keepalive_ticker = tokio::time::interval(Duration::from_secs(1)); - keepalive_ticker.set_missed_tick_behavior(MissedTickBehavior::Skip); - keepalive_ticker.reset(); +pub(crate) struct AttachShardNotification { + shard_id: ShardIdentity, + sender: tokio::sync::mpsc::Sender, + start_pos: Lsn, +} - let (tx, mut rx) = tokio::sync::mpsc::channel::(2); +impl InterpretedWalReader { + /// Spawn the reader in a separate tokio task and return a handle + pub(crate) fn spawn( + wal_stream: StreamingWalReader, + start_pos: Lsn, + tx: tokio::sync::mpsc::Sender, + shard: ShardIdentity, + pg_version: u32, + appname: &Option, + ) -> InterpretedWalReaderHandle { + let state = Arc::new(std::sync::RwLock::new(InterpretedWalReaderState::Running { + current_position: start_pos, + })); + + let (shard_notification_tx, shard_notification_rx) = tokio::sync::mpsc::unbounded_channel(); + + let reader = InterpretedWalReader { + wal_stream, + shard_senders: HashMap::from([( + shard, + smallvec::smallvec![ShardSenderState { + sender_id: SenderId::first(), + tx, + next_record_lsn: start_pos, + }], + )]), + shard_notification_rx: Some(shard_notification_rx), + state: state.clone(), + pg_version, + }; + + let metric = WAL_READERS + .get_metric_with_label_values(&["task", appname.as_deref().unwrap_or("safekeeper")]) + .unwrap(); + + let join_handle = tokio::task::spawn( + async move { + metric.inc(); + scopeguard::defer! { + metric.dec(); + } + + let res = reader.run_impl(start_pos).await; + if let Err(ref err) = res { + tracing::error!("Task finished with error: {err}"); + } + res + } + .instrument(info_span!("interpreted wal reader")), + ); + + InterpretedWalReaderHandle { + join_handle, + state, + shard_notification_tx, + } + } + + /// Construct the reader without spawning anything + /// Callers should drive the future returned by [`Self::run`]. 
+ pub(crate) fn new( + wal_stream: StreamingWalReader, + start_pos: Lsn, + tx: tokio::sync::mpsc::Sender, + shard: ShardIdentity, + pg_version: u32, + ) -> InterpretedWalReader { + let state = Arc::new(std::sync::RwLock::new(InterpretedWalReaderState::Running { + current_position: start_pos, + })); + + InterpretedWalReader { + wal_stream, + shard_senders: HashMap::from([( + shard, + smallvec::smallvec![ShardSenderState { + sender_id: SenderId::first(), + tx, + next_record_lsn: start_pos, + }], + )]), + shard_notification_rx: None, + state: state.clone(), + pg_version, + } + } + + /// Entry point for future (polling) based wal reader. + pub(crate) async fn run( + self, + start_pos: Lsn, + appname: &Option, + ) -> Result<(), CopyStreamHandlerEnd> { + let metric = WAL_READERS + .get_metric_with_label_values(&["future", appname.as_deref().unwrap_or("safekeeper")]) + .unwrap(); + + metric.inc(); + scopeguard::defer! { + metric.dec(); + } + + let res = self.run_impl(start_pos).await; + if let Err(err) = res { + tracing::error!("Interpreted wal reader encountered error: {err}"); + } else { + tracing::info!("Interpreted wal reader exiting"); + } + + Err(CopyStreamHandlerEnd::Other(anyhow!( + "interpreted wal reader finished" + ))) + } + + /// Send interpreted WAL to one or more [`InterpretedWalSender`]s + /// Stops when an error is encountered or when the [`InterpretedWalReaderHandle`] + /// goes out of scope. + async fn run_impl(mut self, start_pos: Lsn) -> Result<(), InterpretedWalReaderError> { + let defer_state = self.state.clone(); + scopeguard::defer! { + *defer_state.write().unwrap() = InterpretedWalReaderState::Done; + } + + let mut wal_decoder = WalStreamDecoder::new(start_pos, self.pg_version); loop { tokio::select! { - // Get some WAL from the stream and then: decode, interpret and push it down the - // pipeline. 
- wal = stream.next(), if tx.capacity() > 0 => { - let WalBytes { wal, wal_start_lsn: _, wal_end_lsn, available_wal_end_lsn } = match wal { - Some(some) => some?, - None => { break; } + // Main branch for reading WAL and forwarding it + wal_or_reset = self.wal_stream.next() => { + let wal = wal_or_reset.map(|wor| wor.get_wal().expect("reset handled in select branch below")); + let WalBytes { + wal, + wal_start_lsn: _, + wal_end_lsn, + available_wal_end_lsn, + } = match wal { + Some(some) => some.map_err(InterpretedWalReaderError::ReadOrInterpret)?, + None => { + // [`StreamingWalReader::next`] is an endless stream of WAL. + // It shouldn't ever finish unless it panicked or became internally + // inconsistent. + return Result::Err(InterpretedWalReaderError::WalStreamClosed); + } }; - wal_position = wal_end_lsn; wal_decoder.feed_bytes(&wal); - let mut records = Vec::new(); + // Deserialize and interpret WAL records from this batch of WAL. + // Interpreted records for each shard are collected separately. + let shard_ids = self.shard_senders.keys().copied().collect::>(); + let mut records_by_sender: HashMap> = HashMap::new(); let mut max_next_record_lsn = None; - while let Some((next_record_lsn, recdata)) = wal_decoder - .poll_decode() - .with_context(|| "Failed to decode WAL")? + while let Some((next_record_lsn, recdata)) = wal_decoder.poll_decode()? 
{ assert!(next_record_lsn.is_aligned()); max_next_record_lsn = Some(next_record_lsn); - // Deserialize and interpret WAL record let interpreted = InterpretedWalRecord::from_bytes_filtered( recdata, - &self.shard, + &shard_ids, next_record_lsn, self.pg_version, ) .with_context(|| "Failed to interpret WAL")?; - if !interpreted.is_empty() { - records.push(interpreted); + for (shard, record) in interpreted { + if record.is_empty() { + continue; + } + + let mut states_iter = self.shard_senders + .get(&shard) + .expect("keys collected above") + .iter() + .filter(|state| record.next_record_lsn > state.next_record_lsn) + .peekable(); + while let Some(state) = states_iter.next() { + let shard_sender_id = ShardSenderId::new(shard, state.sender_id); + + // The most commont case is one sender per shard. Peek and break to avoid the + // clone in that situation. + if states_iter.peek().is_none() { + records_by_sender.entry(shard_sender_id).or_default().push(record); + break; + } else { + records_by_sender.entry(shard_sender_id).or_default().push(record.clone()); + } + } } } @@ -99,20 +333,170 @@ impl InterpretedWalSender<'_, IO> { None => { continue; } }; - let batch = InterpretedWalRecords { - records, - next_record_lsn: Some(max_next_record_lsn), - }; + // Update the current position such that new receivers can decide + // whether to attach to us or spawn a new WAL reader. + match &mut *self.state.write().unwrap() { + InterpretedWalReaderState::Running { current_position, .. } => { + *current_position = max_next_record_lsn; + }, + InterpretedWalReaderState::Done => { + unreachable!() + } + } - tx.send(Batch {wal_end_lsn, available_wal_end_lsn, records: batch}).await.unwrap(); + // Send interpreted records downstream. Anything that has already been seen + // by a shard is filtered out. 
+ let mut shard_senders_to_remove = Vec::new(); + for (shard, states) in &mut self.shard_senders { + for state in states { + if max_next_record_lsn <= state.next_record_lsn { + continue; + } + + let shard_sender_id = ShardSenderId::new(*shard, state.sender_id); + let records = records_by_sender.remove(&shard_sender_id).unwrap_or_default(); + + let batch = InterpretedWalRecords { + records, + next_record_lsn: Some(max_next_record_lsn), + }; + + let res = state.tx.send(Batch { + wal_end_lsn, + available_wal_end_lsn, + records: batch, + }).await; + + if res.is_err() { + shard_senders_to_remove.push(shard_sender_id); + } else { + state.next_record_lsn = max_next_record_lsn; + } + } + } + + // Clean up any shard senders that have dropped out. + // This is inefficient, but such events are rare (connection to PS termination) + // and the number of subscriptions on the same shards very small (only one + // for the steady state). + for to_remove in shard_senders_to_remove { + let shard_senders = self.shard_senders.get_mut(&to_remove.shard()).expect("saw it above"); + if let Some(idx) = shard_senders.iter().position(|s| s.sender_id == to_remove.sender_id) { + shard_senders.remove(idx); + tracing::info!("Removed shard sender {}", to_remove); + } + + if shard_senders.is_empty() { + self.shard_senders.remove(&to_remove.shard()); + } + } }, - // For a previously interpreted batch, serialize it and push it down the wire. - batch = rx.recv() => { + // Listen for new shards that want to attach to this reader. + // If the reader is not running as a task, then this is not supported + // (see the pending branch below). + notification = match self.shard_notification_rx.as_mut() { + Some(rx) => Either::Left(rx.recv()), + None => Either::Right(std::future::pending()) + } => { + if let Some(n) = notification { + let AttachShardNotification { shard_id, sender, start_pos } = n; + + // Update internal and external state, then reset the WAL stream + // if required. 
+ let senders = self.shard_senders.entry(shard_id).or_default(); + let new_sender_id = match senders.last() { + Some(sender) => sender.sender_id.next(), + None => SenderId::first() + }; + + senders.push(ShardSenderState { sender_id: new_sender_id, tx: sender, next_record_lsn: start_pos}); + let current_pos = self.state.read().unwrap().current_position().unwrap(); + if start_pos < current_pos { + self.wal_stream.reset(start_pos).await; + wal_decoder = WalStreamDecoder::new(start_pos, self.pg_version); + } + + tracing::info!( + "Added shard sender {} with start_pos={} current_pos={}", + ShardSenderId::new(shard_id, new_sender_id), start_pos, current_pos + ); + } + } + } + } + } +} + +impl InterpretedWalReaderHandle { + /// Fan-out the reader by attaching a new shard to it + pub(crate) fn fanout( + &self, + shard_id: ShardIdentity, + sender: tokio::sync::mpsc::Sender, + start_pos: Lsn, + ) -> Result<(), SendError> { + self.shard_notification_tx.send(AttachShardNotification { + shard_id, + sender, + start_pos, + }) + } + + /// Get the current WAL position of the reader + pub(crate) fn current_position(&self) -> Option { + self.state.read().unwrap().current_position() + } + + pub(crate) fn abort(&self) { + self.join_handle.abort() + } +} + +impl Drop for InterpretedWalReaderHandle { + fn drop(&mut self) { + tracing::info!("Aborting interpreted wal reader"); + self.abort() + } +} + +pub(crate) struct InterpretedWalSender<'a, IO> { + pub(crate) format: InterpretedFormat, + pub(crate) compression: Option, + pub(crate) appname: Option, + + pub(crate) tli: WalResidentTimeline, + pub(crate) start_lsn: Lsn, + + pub(crate) pgb: &'a mut PostgresBackend, + pub(crate) end_watch_view: EndWatchView, + pub(crate) wal_sender_guard: Arc, + pub(crate) rx: tokio::sync::mpsc::Receiver, +} + +impl InterpretedWalSender<'_, IO> { + /// Send interpreted WAL records over the network. + /// Also manages keep-alives if nothing was sent for a while. 
+ pub(crate) async fn run(mut self) -> Result<(), CopyStreamHandlerEnd> { + let mut keepalive_ticker = tokio::time::interval(Duration::from_secs(1)); + keepalive_ticker.set_missed_tick_behavior(MissedTickBehavior::Skip); + keepalive_ticker.reset(); + + let mut wal_position = self.start_lsn; + + loop { + tokio::select! { + batch = self.rx.recv() => { let batch = match batch { Some(b) => b, - None => { break; } + None => { + return Result::Err( + CopyStreamHandlerEnd::Other(anyhow!("Interpreted WAL reader exited early")) + ); + } }; + wal_position = batch.wal_end_lsn; + let buf = batch .records .to_wire(self.format, self.compression) @@ -132,7 +516,21 @@ impl InterpretedWalSender<'_, IO> { })).await?; } // Send a periodic keep alive when the connection has been idle for a while. + // Since we've been idle, also check if we can stop streaming. _ = keepalive_ticker.tick() => { + if let Some(remote_consistent_lsn) = self.wal_sender_guard + .walsenders() + .get_ws_remote_consistent_lsn(self.wal_sender_guard.id()) + { + if self.tli.should_walsender_stop(remote_consistent_lsn).await { + // Stop streaming if the receivers are caught up and + // there's no active compute. This causes the loop in + // [`crate::send_interpreted_wal::InterpretedWalSender::run`] + // to exit and terminate the WAL stream. + break; + } + } + self.pgb .write_message(&BeMessage::KeepAlive(WalSndKeepAlive { wal_end: self.end_watch_view.get().0, @@ -140,14 +538,259 @@ impl InterpretedWalSender<'_, IO> { request_reply: true, })) .await?; - } + }, } } - // The loop above ends when the receiver is caught up and there's no more WAL to send. 
Err(CopyStreamHandlerEnd::ServerInitiated(format!( "ending streaming to {:?} at {}, receiver is caughtup and there is no computes", self.appname, wal_position, ))) } } +#[cfg(test)] +mod tests { + use std::{collections::HashMap, str::FromStr, time::Duration}; + + use pageserver_api::shard::{ShardIdentity, ShardStripeSize}; + use postgres_ffi::MAX_SEND_SIZE; + use tokio::sync::mpsc::error::TryRecvError; + use utils::{ + id::{NodeId, TenantTimelineId}, + lsn::Lsn, + shard::{ShardCount, ShardNumber}, + }; + + use crate::{ + send_interpreted_wal::{Batch, InterpretedWalReader}, + test_utils::Env, + wal_reader_stream::StreamingWalReader, + }; + + #[tokio::test] + async fn test_interpreted_wal_reader_fanout() { + let _ = env_logger::builder().is_test(true).try_init(); + + const SIZE: usize = 8 * 1024; + const MSG_COUNT: usize = 200; + const PG_VERSION: u32 = 17; + const SHARD_COUNT: u8 = 2; + + let start_lsn = Lsn::from_str("0/149FD18").unwrap(); + let env = Env::new(true).unwrap(); + let tli = env + .make_timeline(NodeId(1), TenantTimelineId::generate(), start_lsn) + .await + .unwrap(); + + let resident_tli = tli.wal_residence_guard().await.unwrap(); + let end_watch = Env::write_wal(tli, start_lsn, SIZE, MSG_COUNT) + .await + .unwrap(); + let end_pos = end_watch.get(); + + tracing::info!("Doing first round of reads ..."); + + let streaming_wal_reader = StreamingWalReader::new( + resident_tli, + None, + start_lsn, + end_pos, + end_watch, + MAX_SEND_SIZE, + ); + + let shard_0 = ShardIdentity::new( + ShardNumber(0), + ShardCount(SHARD_COUNT), + ShardStripeSize::default(), + ) + .unwrap(); + + let shard_1 = ShardIdentity::new( + ShardNumber(1), + ShardCount(SHARD_COUNT), + ShardStripeSize::default(), + ) + .unwrap(); + + let mut shards = HashMap::new(); + + for shard_number in 0..SHARD_COUNT { + let shard_id = ShardIdentity::new( + ShardNumber(shard_number), + ShardCount(SHARD_COUNT), + ShardStripeSize::default(), + ) + .unwrap(); + let (tx, rx) = 
tokio::sync::mpsc::channel::(MSG_COUNT * 2); + shards.insert(shard_id, (Some(tx), Some(rx))); + } + + let shard_0_tx = shards.get_mut(&shard_0).unwrap().0.take().unwrap(); + let mut shard_0_rx = shards.get_mut(&shard_0).unwrap().1.take().unwrap(); + + let handle = InterpretedWalReader::spawn( + streaming_wal_reader, + start_lsn, + shard_0_tx, + shard_0, + PG_VERSION, + &Some("pageserver".to_string()), + ); + + tracing::info!("Reading all WAL with only shard 0 attached ..."); + + let mut shard_0_interpreted_records = Vec::new(); + while let Some(batch) = shard_0_rx.recv().await { + shard_0_interpreted_records.push(batch.records); + if batch.wal_end_lsn == batch.available_wal_end_lsn { + break; + } + } + + let shard_1_tx = shards.get_mut(&shard_1).unwrap().0.take().unwrap(); + let mut shard_1_rx = shards.get_mut(&shard_1).unwrap().1.take().unwrap(); + + tracing::info!("Attaching shard 1 to the reader at start of WAL"); + handle.fanout(shard_1, shard_1_tx, start_lsn).unwrap(); + + tracing::info!("Reading all WAL with shard 0 and shard 1 attached ..."); + + let mut shard_1_interpreted_records = Vec::new(); + while let Some(batch) = shard_1_rx.recv().await { + shard_1_interpreted_records.push(batch.records); + if batch.wal_end_lsn == batch.available_wal_end_lsn { + break; + } + } + + // This test uses logical messages. Those only go to shard 0. Check that the + // filtering worked and shard 1 did not get any. + assert!(shard_1_interpreted_records + .iter() + .all(|recs| recs.records.is_empty())); + + // Shard 0 should not receive anything more since the reader is + // going through wal that it has already processed. + let res = shard_0_rx.try_recv(); + if let Ok(ref ok) = res { + tracing::error!( + "Shard 0 received batch: wal_end_lsn={} available_wal_end_lsn={}", + ok.wal_end_lsn, + ok.available_wal_end_lsn + ); + } + assert!(matches!(res, Err(TryRecvError::Empty))); + + // Check that the next records lsns received by the two shards match up. 
+ let shard_0_next_lsns = shard_0_interpreted_records + .iter() + .map(|recs| recs.next_record_lsn) + .collect::>(); + let shard_1_next_lsns = shard_1_interpreted_records + .iter() + .map(|recs| recs.next_record_lsn) + .collect::>(); + assert_eq!(shard_0_next_lsns, shard_1_next_lsns); + + handle.abort(); + let mut done = false; + for _ in 0..5 { + if handle.current_position().is_none() { + done = true; + break; + } + tokio::time::sleep(Duration::from_millis(1)).await; + } + + assert!(done); + } + + #[tokio::test] + async fn test_interpreted_wal_reader_same_shard_fanout() { + let _ = env_logger::builder().is_test(true).try_init(); + + const SIZE: usize = 8 * 1024; + const MSG_COUNT: usize = 200; + const PG_VERSION: u32 = 17; + const SHARD_COUNT: u8 = 2; + const ATTACHED_SHARDS: u8 = 4; + + let start_lsn = Lsn::from_str("0/149FD18").unwrap(); + let env = Env::new(true).unwrap(); + let tli = env + .make_timeline(NodeId(1), TenantTimelineId::generate(), start_lsn) + .await + .unwrap(); + + let resident_tli = tli.wal_residence_guard().await.unwrap(); + let end_watch = Env::write_wal(tli, start_lsn, SIZE, MSG_COUNT) + .await + .unwrap(); + let end_pos = end_watch.get(); + + let streaming_wal_reader = StreamingWalReader::new( + resident_tli, + None, + start_lsn, + end_pos, + end_watch, + MAX_SEND_SIZE, + ); + + let shard_0 = ShardIdentity::new( + ShardNumber(0), + ShardCount(SHARD_COUNT), + ShardStripeSize::default(), + ) + .unwrap(); + + let (tx, rx) = tokio::sync::mpsc::channel::(MSG_COUNT * 2); + let mut batch_receivers = vec![rx]; + + let handle = InterpretedWalReader::spawn( + streaming_wal_reader, + start_lsn, + tx, + shard_0, + PG_VERSION, + &Some("pageserver".to_string()), + ); + + for _ in 0..(ATTACHED_SHARDS - 1) { + let (tx, rx) = tokio::sync::mpsc::channel::(MSG_COUNT * 2); + handle.fanout(shard_0, tx, start_lsn).unwrap(); + batch_receivers.push(rx); + } + + loop { + let batch = batch_receivers.first_mut().unwrap().recv().await.unwrap(); + for rx in 
batch_receivers.iter_mut().skip(1) { + let other_batch = rx.recv().await.unwrap(); + + assert_eq!(batch.wal_end_lsn, other_batch.wal_end_lsn); + assert_eq!( + batch.available_wal_end_lsn, + other_batch.available_wal_end_lsn + ); + } + + if batch.wal_end_lsn == batch.available_wal_end_lsn { + break; + } + } + + handle.abort(); + let mut done = false; + for _ in 0..5 { + if handle.current_position().is_none() { + done = true; + break; + } + tokio::time::sleep(Duration::from_millis(1)).await; + } + + assert!(done); + } +} diff --git a/safekeeper/src/send_wal.rs b/safekeeper/src/send_wal.rs index 8463221998..4a4a74a0fd 100644 --- a/safekeeper/src/send_wal.rs +++ b/safekeeper/src/send_wal.rs @@ -2,16 +2,18 @@ //! with the "START_REPLICATION" message, and registry of walsenders. use crate::handler::SafekeeperPostgresHandler; -use crate::metrics::RECEIVED_PS_FEEDBACKS; +use crate::metrics::{RECEIVED_PS_FEEDBACKS, WAL_READERS}; use crate::receive_wal::WalReceivers; use crate::safekeeper::TermLsn; -use crate::send_interpreted_wal::InterpretedWalSender; +use crate::send_interpreted_wal::{ + Batch, InterpretedWalReader, InterpretedWalReaderHandle, InterpretedWalSender, +}; use crate::timeline::WalResidentTimeline; -use crate::wal_reader_stream::WalReaderStreamBuilder; +use crate::wal_reader_stream::StreamingWalReader; use crate::wal_storage::WalReader; use anyhow::{bail, Context as AnyhowContext}; use bytes::Bytes; -use futures::future::Either; +use futures::FutureExt; use parking_lot::Mutex; use postgres_backend::PostgresBackend; use postgres_backend::{CopyStreamHandlerEnd, PostgresBackendReader, QueryError}; @@ -19,16 +21,16 @@ use postgres_ffi::get_current_timestamp; use postgres_ffi::{TimestampTz, MAX_SEND_SIZE}; use pq_proto::{BeMessage, WalSndKeepAlive, XLogDataBody}; use safekeeper_api::models::{ - ConnectionId, HotStandbyFeedback, ReplicationFeedback, StandbyFeedback, StandbyReply, - WalSenderState, INVALID_FULL_TRANSACTION_ID, + HotStandbyFeedback, 
ReplicationFeedback, StandbyFeedback, StandbyReply, + INVALID_FULL_TRANSACTION_ID, }; use safekeeper_api::Term; use tokio::io::{AsyncRead, AsyncWrite}; use utils::failpoint_support; -use utils::id::TenantTimelineId; use utils::pageserver_feedback::PageserverFeedback; use utils::postgres_client::PostgresClientProtocol; +use itertools::Itertools; use std::cmp::{max, min}; use std::net::SocketAddr; use std::sync::Arc; @@ -50,6 +52,12 @@ pub struct WalSenders { walreceivers: Arc, } +pub struct WalSendersTimelineMetricValues { + pub ps_feedback_counter: u64, + pub last_ps_feedback: PageserverFeedback, + pub interpreted_wal_reader_tasks: usize, +} + impl WalSenders { pub fn new(walreceivers: Arc) -> Arc { Arc::new(WalSenders { @@ -60,21 +68,8 @@ impl WalSenders { /// Register new walsender. Returned guard provides access to the slot and /// automatically deregisters in Drop. - fn register( - self: &Arc, - ttid: TenantTimelineId, - addr: SocketAddr, - conn_id: ConnectionId, - appname: Option, - ) -> WalSenderGuard { + fn register(self: &Arc, walsender_state: WalSenderState) -> WalSenderGuard { let slots = &mut self.mutex.lock().slots; - let walsender_state = WalSenderState { - ttid, - addr, - conn_id, - appname, - feedback: ReplicationFeedback::Pageserver(PageserverFeedback::empty()), - }; // find empty slot or create new one let pos = if let Some(pos) = slots.iter().position(|s| s.is_none()) { slots[pos] = Some(walsender_state); @@ -90,9 +85,79 @@ impl WalSenders { } } + fn create_or_update_interpreted_reader< + FUp: FnOnce(&Arc) -> anyhow::Result<()>, + FNew: FnOnce() -> InterpretedWalReaderHandle, + >( + self: &Arc, + id: WalSenderId, + start_pos: Lsn, + max_delta_for_fanout: Option, + update: FUp, + create: FNew, + ) -> anyhow::Result<()> { + let state = &mut self.mutex.lock(); + + let mut selected_interpreted_reader = None; + for slot in state.slots.iter().flatten() { + if let WalSenderState::Interpreted(slot_state) = slot { + if let Some(ref interpreted_reader) = 
slot_state.interpreted_wal_reader { + let select = match (interpreted_reader.current_position(), max_delta_for_fanout) + { + (Some(pos), Some(max_delta)) => { + let delta = pos.0.abs_diff(start_pos.0); + delta <= max_delta + } + // Reader is not active + (None, _) => false, + // Gating fanout by max delta is disabled. + // Attach to any active reader. + (_, None) => true, + }; + + if select { + selected_interpreted_reader = Some(interpreted_reader.clone()); + break; + } + } + } + } + + let slot = state.get_slot_mut(id); + let slot_state = match slot { + WalSenderState::Interpreted(s) => s, + WalSenderState::Vanilla(_) => unreachable!(), + }; + + let selected_or_new = match selected_interpreted_reader { + Some(selected) => { + update(&selected)?; + selected + } + None => Arc::new(create()), + }; + + slot_state.interpreted_wal_reader = Some(selected_or_new); + + Ok(()) + } + /// Get state of all walsenders. - pub fn get_all(self: &Arc) -> Vec { - self.mutex.lock().slots.iter().flatten().cloned().collect() + pub fn get_all_public(self: &Arc) -> Vec { + self.mutex + .lock() + .slots + .iter() + .flatten() + .map(|state| match state { + WalSenderState::Vanilla(s) => { + safekeeper_api::models::WalSenderState::Vanilla(s.clone()) + } + WalSenderState::Interpreted(s) => { + safekeeper_api::models::WalSenderState::Interpreted(s.public_state.clone()) + } + }) + .collect() } /// Get LSN of the most lagging pageserver receiver. Return None if there are no @@ -103,7 +168,7 @@ impl WalSenders { .slots .iter() .flatten() - .filter_map(|s| match s.feedback { + .filter_map(|s| match s.get_feedback() { ReplicationFeedback::Pageserver(feedback) => Some(feedback.last_received_lsn), ReplicationFeedback::Standby(_) => None, }) @@ -111,9 +176,25 @@ impl WalSenders { } /// Returns total counter of pageserver feedbacks received and last feedback. 
- pub fn get_ps_feedback_stats(self: &Arc) -> (u64, PageserverFeedback) { + pub fn info_for_metrics(self: &Arc) -> WalSendersTimelineMetricValues { let shared = self.mutex.lock(); - (shared.ps_feedback_counter, shared.last_ps_feedback) + + let interpreted_wal_reader_tasks = shared + .slots + .iter() + .filter_map(|ss| match ss { + Some(WalSenderState::Interpreted(int)) => int.interpreted_wal_reader.as_ref(), + Some(WalSenderState::Vanilla(_)) => None, + None => None, + }) + .unique_by(|reader| Arc::as_ptr(reader)) + .count(); + + WalSendersTimelineMetricValues { + ps_feedback_counter: shared.ps_feedback_counter, + last_ps_feedback: shared.last_ps_feedback, + interpreted_wal_reader_tasks, + } } /// Get aggregated hot standby feedback (we send it to compute). @@ -124,7 +205,7 @@ impl WalSenders { /// Record new pageserver feedback, update aggregated values. fn record_ps_feedback(self: &Arc, id: WalSenderId, feedback: &PageserverFeedback) { let mut shared = self.mutex.lock(); - shared.get_slot_mut(id).feedback = ReplicationFeedback::Pageserver(*feedback); + *shared.get_slot_mut(id).get_mut_feedback() = ReplicationFeedback::Pageserver(*feedback); shared.last_ps_feedback = *feedback; shared.ps_feedback_counter += 1; drop(shared); @@ -143,10 +224,10 @@ impl WalSenders { "Record standby reply: ts={} apply_lsn={}", reply.reply_ts, reply.apply_lsn ); - match &mut slot.feedback { + match &mut slot.get_mut_feedback() { ReplicationFeedback::Standby(sf) => sf.reply = *reply, ReplicationFeedback::Pageserver(_) => { - slot.feedback = ReplicationFeedback::Standby(StandbyFeedback { + *slot.get_mut_feedback() = ReplicationFeedback::Standby(StandbyFeedback { reply: *reply, hs_feedback: HotStandbyFeedback::empty(), }) @@ -158,10 +239,10 @@ impl WalSenders { fn record_hs_feedback(self: &Arc, id: WalSenderId, feedback: &HotStandbyFeedback) { let mut shared = self.mutex.lock(); let slot = shared.get_slot_mut(id); - match &mut slot.feedback { + match &mut slot.get_mut_feedback() { 
ReplicationFeedback::Standby(sf) => sf.hs_feedback = *feedback, ReplicationFeedback::Pageserver(_) => { - slot.feedback = ReplicationFeedback::Standby(StandbyFeedback { + *slot.get_mut_feedback() = ReplicationFeedback::Standby(StandbyFeedback { reply: StandbyReply::empty(), hs_feedback: *feedback, }) @@ -175,7 +256,7 @@ impl WalSenders { pub fn get_ws_remote_consistent_lsn(self: &Arc, id: WalSenderId) -> Option { let shared = self.mutex.lock(); let slot = shared.get_slot(id); - match slot.feedback { + match slot.get_feedback() { ReplicationFeedback::Pageserver(feedback) => Some(feedback.remote_consistent_lsn), _ => None, } @@ -199,6 +280,47 @@ struct WalSendersShared { slots: Vec>, } +/// Safekeeper internal definitions of wal sender state +/// +/// As opposed to [`safekeeper_api::models::WalSenderState`] these struct may +/// include state that we don not wish to expose to the public api. +#[derive(Debug, Clone)] +pub(crate) enum WalSenderState { + Vanilla(VanillaWalSenderInternalState), + Interpreted(InterpretedWalSenderInternalState), +} + +type VanillaWalSenderInternalState = safekeeper_api::models::VanillaWalSenderState; + +#[derive(Debug, Clone)] +pub(crate) struct InterpretedWalSenderInternalState { + public_state: safekeeper_api::models::InterpretedWalSenderState, + interpreted_wal_reader: Option>, +} + +impl WalSenderState { + fn get_addr(&self) -> &SocketAddr { + match self { + WalSenderState::Vanilla(state) => &state.addr, + WalSenderState::Interpreted(state) => &state.public_state.addr, + } + } + + fn get_feedback(&self) -> &ReplicationFeedback { + match self { + WalSenderState::Vanilla(state) => &state.feedback, + WalSenderState::Interpreted(state) => &state.public_state.feedback, + } + } + + fn get_mut_feedback(&mut self) -> &mut ReplicationFeedback { + match self { + WalSenderState::Vanilla(state) => &mut state.feedback, + WalSenderState::Interpreted(state) => &mut state.public_state.feedback, + } + } +} + impl WalSendersShared { fn new() -> Self { 
WalSendersShared { @@ -225,7 +347,7 @@ impl WalSendersShared { let mut agg = HotStandbyFeedback::empty(); let mut reply_agg = StandbyReply::empty(); for ws_state in self.slots.iter().flatten() { - if let ReplicationFeedback::Standby(standby_feedback) = ws_state.feedback { + if let ReplicationFeedback::Standby(standby_feedback) = ws_state.get_feedback() { let hs_feedback = standby_feedback.hs_feedback; // doing Option math like op1.iter().chain(op2.iter()).min() // would be nicer, but we serialize/deserialize this struct @@ -317,7 +439,7 @@ impl SafekeeperPostgresHandler { /// Wrapper around handle_start_replication_guts handling result. Error is /// handled here while we're still in walsender ttid span; with API /// extension, this can probably be moved into postgres_backend. - pub async fn handle_start_replication( + pub async fn handle_start_replication( &mut self, pgb: &mut PostgresBackend, start_pos: Lsn, @@ -342,7 +464,7 @@ impl SafekeeperPostgresHandler { Ok(()) } - pub async fn handle_start_replication_guts( + pub async fn handle_start_replication_guts( &mut self, pgb: &mut PostgresBackend, start_pos: Lsn, @@ -352,12 +474,30 @@ impl SafekeeperPostgresHandler { let appname = self.appname.clone(); // Use a guard object to remove our entry from the timeline when we are done. - let ws_guard = Arc::new(tli.get_walsenders().register( - self.ttid, - *pgb.get_peer_addr(), - self.conn_id, - self.appname.clone(), - )); + let ws_guard = match self.protocol() { + PostgresClientProtocol::Vanilla => Arc::new(tli.get_walsenders().register( + WalSenderState::Vanilla(VanillaWalSenderInternalState { + ttid: self.ttid, + addr: *pgb.get_peer_addr(), + conn_id: self.conn_id, + appname: self.appname.clone(), + feedback: ReplicationFeedback::Pageserver(PageserverFeedback::empty()), + }), + )), + PostgresClientProtocol::Interpreted { .. 
} => Arc::new(tli.get_walsenders().register( + WalSenderState::Interpreted(InterpretedWalSenderInternalState { + public_state: safekeeper_api::models::InterpretedWalSenderState { + ttid: self.ttid, + shard: self.shard.unwrap(), + addr: *pgb.get_peer_addr(), + conn_id: self.conn_id, + appname: self.appname.clone(), + feedback: ReplicationFeedback::Pageserver(PageserverFeedback::empty()), + }, + interpreted_wal_reader: None, + }), + )), + }; // Walsender can operate in one of two modes which we select by // application_name: give only committed WAL (used by pageserver) or all @@ -403,7 +543,7 @@ impl SafekeeperPostgresHandler { pgb, // should succeed since we're already holding another guard tli: tli.wal_residence_guard().await?, - appname, + appname: appname.clone(), start_pos, end_pos, term, @@ -413,7 +553,7 @@ impl SafekeeperPostgresHandler { send_buf: vec![0u8; MAX_SEND_SIZE], }; - Either::Left(sender.run()) + FutureExt::boxed(sender.run()) } PostgresClientProtocol::Interpreted { format, @@ -421,27 +561,96 @@ impl SafekeeperPostgresHandler { } => { let pg_version = tli.tli.get_state().await.1.server.pg_version / 10000; let end_watch_view = end_watch.view(); - let wal_stream_builder = WalReaderStreamBuilder { - tli: tli.wal_residence_guard().await?, - start_pos, - end_pos, - term, - end_watch, - wal_sender_guard: ws_guard.clone(), - }; + let wal_residence_guard = tli.wal_residence_guard().await?; + let (tx, rx) = tokio::sync::mpsc::channel::(2); + let shard = self.shard.unwrap(); - let sender = InterpretedWalSender { - format, - compression, - pgb, - wal_stream_builder, - end_watch_view, - shard: self.shard.unwrap(), - pg_version, - appname, - }; + if self.conf.wal_reader_fanout && !shard.is_unsharded() { + let ws_id = ws_guard.id(); + ws_guard.walsenders().create_or_update_interpreted_reader( + ws_id, + start_pos, + self.conf.max_delta_for_fanout, + { + let tx = tx.clone(); + |reader| { + tracing::info!( + "Fanning out interpreted wal reader at {}", + start_pos + 
); + reader + .fanout(shard, tx, start_pos) + .with_context(|| "Failed to fan out reader") + } + }, + || { + tracing::info!("Spawning interpreted wal reader at {}", start_pos); - Either::Right(sender.run()) + let wal_stream = StreamingWalReader::new( + wal_residence_guard, + term, + start_pos, + end_pos, + end_watch, + MAX_SEND_SIZE, + ); + + InterpretedWalReader::spawn( + wal_stream, start_pos, tx, shard, pg_version, &appname, + ) + }, + )?; + + let sender = InterpretedWalSender { + format, + compression, + appname, + tli: tli.wal_residence_guard().await?, + start_lsn: start_pos, + pgb, + end_watch_view, + wal_sender_guard: ws_guard.clone(), + rx, + }; + + FutureExt::boxed(sender.run()) + } else { + let wal_reader = StreamingWalReader::new( + wal_residence_guard, + term, + start_pos, + end_pos, + end_watch, + MAX_SEND_SIZE, + ); + + let reader = + InterpretedWalReader::new(wal_reader, start_pos, tx, shard, pg_version); + + let sender = InterpretedWalSender { + format, + compression, + appname: appname.clone(), + tli: tli.wal_residence_guard().await?, + start_lsn: start_pos, + pgb, + end_watch_view, + wal_sender_guard: ws_guard.clone(), + rx, + }; + + FutureExt::boxed(async move { + // Sender returns an Err on all code paths. + // If the sender finishes first, we will drop the reader future. + // If the reader finishes first, the sender will finish too since + // the wal sender has dropped. + let res = tokio::try_join!(sender.run(), reader.run(start_pos, &appname)); + match res.map(|_| ()) { + Ok(_) => unreachable!("sender finishes with Err by convention"), + err_res => err_res, + } + }) + } } }; @@ -470,7 +679,8 @@ impl SafekeeperPostgresHandler { .clone(); info!( "finished streaming to {}, feedback={:?}", - ws_state.addr, ws_state.feedback, + ws_state.get_addr(), + ws_state.get_feedback(), ); // Join pg backend back. @@ -578,6 +788,18 @@ impl WalSender<'_, IO> { /// Err(CopyStreamHandlerEnd) is always returned; Result is used only for ? /// convenience. 
async fn run(mut self) -> Result<(), CopyStreamHandlerEnd> { + let metric = WAL_READERS + .get_metric_with_label_values(&[ + "future", + self.appname.as_deref().unwrap_or("safekeeper"), + ]) + .unwrap(); + + metric.inc(); + scopeguard::defer! { + metric.dec(); + } + loop { // Wait for the next portion if it is not there yet, or just // update our end of WAL available for sending value, we @@ -813,7 +1035,7 @@ impl ReplyReader { #[cfg(test)] mod tests { use safekeeper_api::models::FullTransactionId; - use utils::id::{TenantId, TimelineId}; + use utils::id::{TenantId, TenantTimelineId, TimelineId}; use super::*; @@ -830,13 +1052,13 @@ mod tests { // add to wss specified feedback setting other fields to dummy values fn push_feedback(wss: &mut WalSendersShared, feedback: ReplicationFeedback) { - let walsender_state = WalSenderState { + let walsender_state = WalSenderState::Vanilla(VanillaWalSenderInternalState { ttid: mock_ttid(), addr: mock_addr(), conn_id: 1, appname: None, feedback, - }; + }); wss.slots.push(Some(walsender_state)) } diff --git a/safekeeper/src/state.rs b/safekeeper/src/state.rs index c6ae6c1d2b..4d566b12a0 100644 --- a/safekeeper/src/state.rs +++ b/safekeeper/src/state.rs @@ -1,20 +1,25 @@ //! Defines per timeline data stored persistently (SafeKeeperPersistentState) //! and its wrapper with in memory layer (SafekeeperState). 
-use std::{cmp::max, ops::Deref}; +use std::{cmp::max, ops::Deref, time::SystemTime}; use anyhow::{bail, Result}; use postgres_ffi::WAL_SEGMENT_SIZE; -use safekeeper_api::{models::TimelineTermBumpResponse, ServerInfo, Term}; +use safekeeper_api::{ + membership::Configuration, + models::{TimelineMembershipSwitchResponse, TimelineTermBumpResponse}, + ServerInfo, Term, INITIAL_TERM, +}; use serde::{Deserialize, Serialize}; +use tracing::info; use utils::{ - id::{NodeId, TenantId, TenantTimelineId, TimelineId}, + id::{TenantId, TenantTimelineId, TimelineId}, lsn::Lsn, }; use crate::{ control_file, - safekeeper::{AcceptorState, PersistedPeerInfo, PgUuid, TermHistory, UNKNOWN_SERVER_VERSION}, + safekeeper::{AcceptorState, PgUuid, TermHistory, TermLsn, UNKNOWN_SERVER_VERSION}, timeline::TimelineError, wal_backup_partial::{self}, }; @@ -27,6 +32,8 @@ pub struct TimelinePersistentState { pub tenant_id: TenantId, #[serde(with = "hex")] pub timeline_id: TimelineId, + /// Membership configuration. + pub mconf: Configuration, /// persistent acceptor state pub acceptor_state: AcceptorState, /// information about server @@ -58,22 +65,15 @@ pub struct TimelinePersistentState { /// pushed to s3. We don't remove WAL beyond it. Persisted only for /// informational purposes, we receive it from pageserver (or broker). pub remote_consistent_lsn: Lsn, - /// Peers and their state as we remember it. Knowing peers themselves is - /// fundamental; but state is saved here only for informational purposes and - /// obviously can be stale. (Currently not saved at all, but let's provision - /// place to have less file version upgrades). - pub peers: PersistedPeers, /// Holds names of partial segments uploaded to remote storage. Used to /// clean up old objects without leaving garbage in remote storage. pub partial_backup: wal_backup_partial::State, /// Eviction state of the timeline. If it's Offloaded, we should download /// WAL files from remote storage to serve the timeline. 
pub eviction_state: EvictionState, + pub creation_ts: SystemTime, } -#[derive(Debug, Clone, Serialize, Deserialize, PartialEq)] -pub struct PersistedPeers(pub Vec<(NodeId, PersistedPeerInfo)>); - /// State of the local WAL files. Used to track current timeline state, /// that can be either WAL files are present on disk or last partial segment /// is offloaded to remote storage. @@ -87,12 +87,14 @@ pub enum EvictionState { } impl TimelinePersistentState { + /// commit_lsn is the same as start_lsn in the normal creaiton; see + /// `TimelineCreateRequest` comments.` pub fn new( ttid: &TenantTimelineId, + mconf: Configuration, server_info: ServerInfo, - peers: Vec, + start_lsn: Lsn, commit_lsn: Lsn, - local_start_lsn: Lsn, ) -> anyhow::Result { if server_info.wal_seg_size == 0 { bail!(TimelineError::UninitializedWalSegSize(*ttid)); @@ -102,49 +104,59 @@ impl TimelinePersistentState { bail!(TimelineError::UninitialinzedPgVersion(*ttid)); } - if commit_lsn < local_start_lsn { + if commit_lsn < start_lsn { bail!( - "commit_lsn {} is smaller than local_start_lsn {}", + "commit_lsn {} is smaller than start_lsn {}", commit_lsn, - local_start_lsn + start_lsn ); } + // If we are given with init LSN, initialize term history with it. It + // ensures that walproposer always must be able to find a common point + // in histories; if it can't something is corrupted. Not having LSN here + // is so far left for legacy case where timeline is created by compute + // and LSN during creation is not known yet. 
+ let term_history = if commit_lsn != Lsn::INVALID { + TermHistory(vec![TermLsn { + term: INITIAL_TERM, + lsn: start_lsn, + }]) + } else { + TermHistory::empty() + }; + Ok(TimelinePersistentState { tenant_id: ttid.tenant_id, timeline_id: ttid.timeline_id, + mconf, acceptor_state: AcceptorState { - term: 0, - term_history: TermHistory::empty(), + term: INITIAL_TERM, + term_history, }, server: server_info, proposer_uuid: [0; 16], - timeline_start_lsn: Lsn(0), - local_start_lsn, + timeline_start_lsn: start_lsn, + local_start_lsn: start_lsn, commit_lsn, - backup_lsn: local_start_lsn, - peer_horizon_lsn: local_start_lsn, + backup_lsn: start_lsn, + peer_horizon_lsn: start_lsn, remote_consistent_lsn: Lsn(0), - peers: PersistedPeers( - peers - .iter() - .map(|p| (*p, PersistedPeerInfo::new())) - .collect(), - ), partial_backup: wal_backup_partial::State::default(), eviction_state: EvictionState::Present, + creation_ts: SystemTime::now(), }) } pub fn empty() -> Self { TimelinePersistentState::new( &TenantTimelineId::empty(), + Configuration::empty(), ServerInfo { pg_version: 170000, /* Postgres server version (major * 10000) */ system_id: 0, /* Postgres system identifier */ wal_seg_size: WAL_SEGMENT_SIZE as u32, }, - vec![], Lsn::INVALID, Lsn::INVALID, ) @@ -249,6 +261,31 @@ where current_term: after, }) } + + /// Switch into membership configuration `to` if it is higher than the + /// current one. + pub async fn membership_switch( + &mut self, + to: Configuration, + ) -> Result { + let before = self.mconf.clone(); + // Is switch allowed? 
+ if to.generation <= self.mconf.generation { + info!( + "ignoring request to switch membership conf to lower {}, current conf {}", + to, self.mconf + ); + } else { + let mut state = self.start_change(); + state.mconf = to.clone(); + self.finish_change(&state).await?; + info!("switched membership conf to {} from {}", to, before); + } + Ok(TimelineMembershipSwitchResponse { + previous_conf: before, + current_conf: self.mconf.clone(), + }) + } } impl Deref for TimelineState diff --git a/safekeeper/src/test_utils.rs b/safekeeper/src/test_utils.rs index c40a8bae5a..4e851c5b3d 100644 --- a/safekeeper/src/test_utils.rs +++ b/safekeeper/src/test_utils.rs @@ -1,13 +1,19 @@ use std::sync::Arc; use crate::rate_limit::RateLimiter; -use crate::safekeeper::{ProposerAcceptorMessage, ProposerElected, SafeKeeper, TermHistory}; +use crate::receive_wal::WalAcceptor; +use crate::safekeeper::{ + AcceptorProposerMessage, AppendRequest, AppendRequestHeader, ProposerAcceptorMessage, + ProposerElected, SafeKeeper, TermHistory, +}; +use crate::send_wal::EndWatch; use crate::state::{TimelinePersistentState, TimelineState}; use crate::timeline::{get_timeline_dir, SharedState, StateSK, Timeline}; use crate::timelines_set::TimelinesSet; use crate::wal_backup::remote_timeline_path; -use crate::{control_file, wal_storage, SafeKeeperConf}; +use crate::{control_file, receive_wal, wal_storage, SafeKeeperConf}; use camino_tempfile::Utf8TempDir; +use postgres_ffi::v17::wal_generator::{LogicalMessageGenerator, WalGenerator}; use tokio::fs::create_dir_all; use utils::id::{NodeId, TenantTimelineId}; use utils::lsn::Lsn; @@ -107,4 +113,59 @@ impl Env { ); Ok(timeline) } + + // This will be dead code when building a non-benchmark target with the + // benchmarking feature enabled. 
+ #[allow(dead_code)] + pub(crate) async fn write_wal( + tli: Arc, + start_lsn: Lsn, + msg_size: usize, + msg_count: usize, + ) -> anyhow::Result { + let (msg_tx, msg_rx) = tokio::sync::mpsc::channel(receive_wal::MSG_QUEUE_SIZE); + let (reply_tx, mut reply_rx) = tokio::sync::mpsc::channel(receive_wal::REPLY_QUEUE_SIZE); + + let end_watch = EndWatch::Commit(tli.get_commit_lsn_watch_rx()); + + WalAcceptor::spawn(tli.wal_residence_guard().await?, msg_rx, reply_tx, Some(0)); + + let prefix = c"p"; + let prefixlen = prefix.to_bytes_with_nul().len(); + assert!(msg_size >= prefixlen); + let message = vec![0; msg_size - prefixlen]; + + let walgen = + &mut WalGenerator::new(LogicalMessageGenerator::new(prefix, &message), start_lsn); + for _ in 0..msg_count { + let (lsn, record) = walgen.next().unwrap(); + + let req = AppendRequest { + h: AppendRequestHeader { + term: 1, + term_start_lsn: start_lsn, + begin_lsn: lsn, + end_lsn: lsn + record.len() as u64, + commit_lsn: lsn, + truncate_lsn: Lsn(0), + proposer_uuid: [0; 16], + }, + wal_data: record, + }; + + let end_lsn = req.h.end_lsn; + + let msg = ProposerAcceptorMessage::AppendRequest(req); + msg_tx.send(msg).await?; + while let Some(reply) = reply_rx.recv().await { + if let AcceptorProposerMessage::AppendResponse(resp) = reply { + if resp.flush_lsn >= end_lsn { + break; + } + } + } + } + + Ok(end_watch) + } } diff --git a/safekeeper/src/timeline.rs b/safekeeper/src/timeline.rs index 36860a0da2..5eb0bd7146 100644 --- a/safekeeper/src/timeline.rs +++ b/safekeeper/src/timeline.rs @@ -4,7 +4,10 @@ use anyhow::{anyhow, bail, Result}; use camino::{Utf8Path, Utf8PathBuf}; use remote_storage::RemotePath; -use safekeeper_api::models::{PeerInfo, TimelineTermBumpResponse}; +use safekeeper_api::membership::Configuration; +use safekeeper_api::models::{ + PeerInfo, TimelineMembershipSwitchResponse, TimelineTermBumpResponse, +}; use safekeeper_api::Term; use tokio::fs::{self}; use tokio_util::sync::CancellationToken; @@ -32,7 +35,7 @@ 
use crate::control_file; use crate::rate_limit::RateLimiter; use crate::receive_wal::WalReceivers; use crate::safekeeper::{AcceptorProposerMessage, ProposerAcceptorMessage, SafeKeeper, TermLsn}; -use crate::send_wal::WalSenders; +use crate::send_wal::{WalSenders, WalSendersTimelineMetricValues}; use crate::state::{EvictionState, TimelineMemState, TimelinePersistentState, TimelineState}; use crate::timeline_guard::ResidenceGuard; use crate::timeline_manager::{AtomicStatus, ManagerCtl}; @@ -188,6 +191,13 @@ impl StateSK { self.state_mut().term_bump(to).await } + pub async fn membership_switch( + &mut self, + to: Configuration, + ) -> Result { + self.state_mut().membership_switch(to).await + } + /// Close open WAL files to release FDs. fn close_wal_store(&mut self) { if let StateSK::Loaded(sk) = self { @@ -702,16 +712,22 @@ impl Timeline { return None; } - let (ps_feedback_count, last_ps_feedback) = self.walsenders.get_ps_feedback_stats(); + let WalSendersTimelineMetricValues { + ps_feedback_counter, + last_ps_feedback, + interpreted_wal_reader_tasks, + } = self.walsenders.info_for_metrics(); + let state = self.read_shared_state().await; Some(FullTimelineInfo { ttid: self.ttid, - ps_feedback_count, + ps_feedback_count: ps_feedback_counter, last_ps_feedback, wal_backup_active: self.wal_backup_active.load(Ordering::Relaxed), timeline_is_active: self.broker_active.load(Ordering::Relaxed), num_computes: self.walreceivers.get_num() as u32, last_removed_segno: self.last_removed_segno.load(Ordering::Relaxed), + interpreted_wal_reader_tasks, epoch_start_lsn: state.sk.term_start_lsn(), mem_state: state.sk.state().inmem.clone(), persisted_state: TimelinePersistentState::clone(state.sk.state()), @@ -730,7 +746,7 @@ impl Timeline { debug_dump::Memory { is_cancelled: self.is_cancelled(), peers_info_len: state.peers_info.0.len(), - walsenders: self.walsenders.get_all(), + walsenders: self.walsenders.get_all_public(), wal_backup_active: 
self.wal_backup_active.load(Ordering::Relaxed), active: self.broker_active.load(Ordering::Relaxed), num_computes: self.walreceivers.get_num() as u32, @@ -768,6 +784,14 @@ impl Timeline { state.sk.term_bump(to).await } + pub async fn membership_switch( + self: &Arc, + to: Configuration, + ) -> Result { + let mut state = self.write_shared_state().await; + state.sk.membership_switch(to).await + } + /// Guts of [`Self::wal_residence_guard`] and [`Self::try_wal_residence_guard`] async fn do_wal_residence_guard( self: &Arc, diff --git a/safekeeper/src/timelines_global_map.rs b/safekeeper/src/timelines_global_map.rs index ad29c9f66c..a701534f65 100644 --- a/safekeeper/src/timelines_global_map.rs +++ b/safekeeper/src/timelines_global_map.rs @@ -12,6 +12,7 @@ use crate::{control_file, wal_storage, SafeKeeperConf}; use anyhow::{bail, Context, Result}; use camino::Utf8PathBuf; use camino_tempfile::Utf8TempDir; +use safekeeper_api::membership::Configuration; use safekeeper_api::ServerInfo; use serde::Serialize; use std::collections::HashMap; @@ -214,9 +215,10 @@ impl GlobalTimelines { pub(crate) async fn create( &self, ttid: TenantTimelineId, + mconf: Configuration, server_info: ServerInfo, + start_lsn: Lsn, commit_lsn: Lsn, - local_start_lsn: Lsn, ) -> Result> { let (conf, _, _) = { let state = self.state.lock().unwrap(); @@ -239,8 +241,7 @@ impl GlobalTimelines { // TODO: currently we create only cfile. It would be reasonable to // immediately initialize first WAL segment as well. 
- let state = - TimelinePersistentState::new(&ttid, server_info, vec![], commit_lsn, local_start_lsn)?; + let state = TimelinePersistentState::new(&ttid, mconf, server_info, start_lsn, commit_lsn)?; control_file::FileStorage::create_new(&tmp_dir_path, state, conf.no_sync).await?; let timeline = self.load_temp_timeline(ttid, &tmp_dir_path, true).await?; Ok(timeline) diff --git a/safekeeper/src/wal_reader_stream.rs b/safekeeper/src/wal_reader_stream.rs index aea628c208..adac6067da 100644 --- a/safekeeper/src/wal_reader_stream.rs +++ b/safekeeper/src/wal_reader_stream.rs @@ -1,34 +1,16 @@ -use std::sync::Arc; - -use async_stream::try_stream; -use bytes::Bytes; -use futures::Stream; -use postgres_backend::CopyStreamHandlerEnd; -use safekeeper_api::Term; -use std::time::Duration; -use tokio::time::timeout; -use utils::lsn::Lsn; - -use crate::{ - send_wal::{EndWatch, WalSenderGuard}, - timeline::WalResidentTimeline, +use std::{ + pin::Pin, + task::{Context, Poll}, }; -pub(crate) struct WalReaderStreamBuilder { - pub(crate) tli: WalResidentTimeline, - pub(crate) start_pos: Lsn, - pub(crate) end_pos: Lsn, - pub(crate) term: Option, - pub(crate) end_watch: EndWatch, - pub(crate) wal_sender_guard: Arc, -} +use bytes::Bytes; +use futures::{stream::BoxStream, Stream, StreamExt}; +use utils::lsn::Lsn; -impl WalReaderStreamBuilder { - pub(crate) fn start_pos(&self) -> Lsn { - self.start_pos - } -} +use crate::{send_wal::EndWatch, timeline::WalResidentTimeline, wal_storage::WalReader}; +use safekeeper_api::Term; +#[derive(PartialEq, Eq, Debug)] pub(crate) struct WalBytes { /// Raw PG WAL pub(crate) wal: Bytes, @@ -44,106 +26,270 @@ pub(crate) struct WalBytes { pub(crate) available_wal_end_lsn: Lsn, } -impl WalReaderStreamBuilder { - /// Builds a stream of Postgres WAL starting from [`Self::start_pos`]. - /// The stream terminates when the receiver (pageserver) is fully caught up - /// and there's no active computes. 
- pub(crate) async fn build( - self, - buffer_size: usize, - ) -> anyhow::Result>> { - // TODO(vlad): The code below duplicates functionality from [`crate::send_wal`]. - // We can make the raw WAL sender use this stream too and remove the duplication. - let Self { - tli, - mut start_pos, - mut end_pos, - term, - mut end_watch, - wal_sender_guard, - } = self; - let mut wal_reader = tli.get_walreader(start_pos).await?; - let mut buffer = vec![0; buffer_size]; +struct PositionedWalReader { + start: Lsn, + end: Lsn, + reader: Option, +} - const POLL_STATE_TIMEOUT: Duration = Duration::from_secs(1); +/// A streaming WAL reader wrapper which can be reset while running +pub(crate) struct StreamingWalReader { + stream: BoxStream<'static, WalOrReset>, + start_changed_tx: tokio::sync::watch::Sender, +} - Ok(try_stream! { - loop { - let have_something_to_send = end_pos > start_pos; +pub(crate) enum WalOrReset { + Wal(anyhow::Result), + Reset(Lsn), +} - if !have_something_to_send { - // wait for lsn - let res = timeout(POLL_STATE_TIMEOUT, end_watch.wait_for_lsn(start_pos, term)).await; - match res { - Ok(ok) => { - end_pos = ok?; - }, - Err(_) => { - if let EndWatch::Commit(_) = end_watch { - if let Some(remote_consistent_lsn) = wal_sender_guard - .walsenders() - .get_ws_remote_consistent_lsn(wal_sender_guard.id()) - { - if tli.should_walsender_stop(remote_consistent_lsn).await { - // Stop streaming if the receivers are caught up and - // there's no active compute. This causes the loop in - // [`crate::send_interpreted_wal::InterpretedWalSender::run`] - // to exit and terminate the WAL stream. 
- return; - } - } - } - - continue; - } - } - } - - - assert!( - end_pos > start_pos, - "nothing to send after waiting for WAL" - ); - - // try to send as much as available, capped by the buffer size - let mut chunk_end_pos = start_pos + buffer_size as u64; - // if we went behind available WAL, back off - if chunk_end_pos >= end_pos { - chunk_end_pos = end_pos; - } else { - // If sending not up to end pos, round down to page boundary to - // avoid breaking WAL record not at page boundary, as protocol - // demands. See walsender.c (XLogSendPhysical). - chunk_end_pos = chunk_end_pos - .checked_sub(chunk_end_pos.block_offset()) - .unwrap(); - } - let send_size = (chunk_end_pos.0 - start_pos.0) as usize; - let buffer = &mut buffer[..send_size]; - let send_size: usize; - { - // If uncommitted part is being pulled, check that the term is - // still the expected one. - let _term_guard = if let Some(t) = term { - Some(tli.acquire_term(t).await?) - } else { - None - }; - // Read WAL into buffer. send_size can be additionally capped to - // segment boundary here. - send_size = wal_reader.read(buffer).await? 
- }; - let wal = Bytes::copy_from_slice(&buffer[..send_size]); - - yield WalBytes { - wal, - wal_start_lsn: start_pos, - wal_end_lsn: start_pos + send_size as u64, - available_wal_end_lsn: end_pos - }; - - start_pos += send_size as u64; - } - }) +impl WalOrReset { + pub(crate) fn get_wal(self) -> Option> { + match self { + WalOrReset::Wal(wal) => Some(wal), + WalOrReset::Reset(_) => None, + } + } +} + +impl StreamingWalReader { + pub(crate) fn new( + tli: WalResidentTimeline, + term: Option, + start: Lsn, + end: Lsn, + end_watch: EndWatch, + buffer_size: usize, + ) -> Self { + let (start_changed_tx, start_changed_rx) = tokio::sync::watch::channel(start); + + let state = WalReaderStreamState { + tli, + wal_reader: PositionedWalReader { + start, + end, + reader: None, + }, + term, + end_watch, + buffer: vec![0; buffer_size], + buffer_size, + }; + + // When a change notification is received while polling the internal + // reader, stop polling the read future and service the change. + let stream = futures::stream::unfold( + (state, start_changed_rx), + |(mut state, mut rx)| async move { + let wal_or_reset = tokio::select! { + read_res = state.read() => { WalOrReset::Wal(read_res) }, + changed_res = rx.changed() => { + if changed_res.is_err() { + return None; + } + + let new_start_pos = rx.borrow_and_update(); + WalOrReset::Reset(*new_start_pos) + } + }; + + if let WalOrReset::Reset(lsn) = wal_or_reset { + state.wal_reader.start = lsn; + state.wal_reader.reader = None; + } + + Some((wal_or_reset, (state, rx))) + }, + ) + .boxed(); + + Self { + stream, + start_changed_tx, + } + } + + /// Reset the stream to a given position. + pub(crate) async fn reset(&mut self, start: Lsn) { + self.start_changed_tx.send(start).unwrap(); + while let Some(wal_or_reset) = self.stream.next().await { + match wal_or_reset { + WalOrReset::Reset(at) => { + // Stream confirmed the reset. + // There may only one ongoing reset at any given time, + // hence the assertion. 
+ assert_eq!(at, start); + break; + } + WalOrReset::Wal(_) => { + // Ignore wal generated before reset was handled + } + } + } + } +} + +impl Stream for StreamingWalReader { + type Item = WalOrReset; + + fn poll_next(mut self: Pin<&mut Self>, cx: &mut Context<'_>) -> Poll> { + Pin::new(&mut self.stream).poll_next(cx) + } +} + +struct WalReaderStreamState { + tli: WalResidentTimeline, + wal_reader: PositionedWalReader, + term: Option, + end_watch: EndWatch, + buffer: Vec, + buffer_size: usize, +} + +impl WalReaderStreamState { + async fn read(&mut self) -> anyhow::Result { + // Create reader if needed + if self.wal_reader.reader.is_none() { + self.wal_reader.reader = Some(self.tli.get_walreader(self.wal_reader.start).await?); + } + + let have_something_to_send = self.wal_reader.end > self.wal_reader.start; + if !have_something_to_send { + tracing::debug!( + "Waiting for wal: start={}, end={}", + self.wal_reader.end, + self.wal_reader.start + ); + self.wal_reader.end = self + .end_watch + .wait_for_lsn(self.wal_reader.start, self.term) + .await?; + tracing::debug!( + "Done waiting for wal: start={}, end={}", + self.wal_reader.end, + self.wal_reader.start + ); + } + + assert!( + self.wal_reader.end > self.wal_reader.start, + "nothing to send after waiting for WAL" + ); + + // Calculate chunk size + let mut chunk_end_pos = self.wal_reader.start + self.buffer_size as u64; + if chunk_end_pos >= self.wal_reader.end { + chunk_end_pos = self.wal_reader.end; + } else { + chunk_end_pos = chunk_end_pos + .checked_sub(chunk_end_pos.block_offset()) + .unwrap(); + } + + let send_size = (chunk_end_pos.0 - self.wal_reader.start.0) as usize; + let buffer = &mut self.buffer[..send_size]; + + // Read WAL + let send_size = { + let _term_guard = if let Some(t) = self.term { + Some(self.tli.acquire_term(t).await?) + } else { + None + }; + self.wal_reader + .reader + .as_mut() + .unwrap() + .read(buffer) + .await? 
+ }; + + let wal = Bytes::copy_from_slice(&buffer[..send_size]); + let result = WalBytes { + wal, + wal_start_lsn: self.wal_reader.start, + wal_end_lsn: self.wal_reader.start + send_size as u64, + available_wal_end_lsn: self.wal_reader.end, + }; + + self.wal_reader.start += send_size as u64; + + Ok(result) + } +} + +#[cfg(test)] +mod tests { + use std::str::FromStr; + + use futures::StreamExt; + use postgres_ffi::MAX_SEND_SIZE; + use utils::{ + id::{NodeId, TenantTimelineId}, + lsn::Lsn, + }; + + use crate::{test_utils::Env, wal_reader_stream::StreamingWalReader}; + + #[tokio::test] + async fn test_streaming_wal_reader_reset() { + let _ = env_logger::builder().is_test(true).try_init(); + + const SIZE: usize = 8 * 1024; + const MSG_COUNT: usize = 200; + + let start_lsn = Lsn::from_str("0/149FD18").unwrap(); + let env = Env::new(true).unwrap(); + let tli = env + .make_timeline(NodeId(1), TenantTimelineId::generate(), start_lsn) + .await + .unwrap(); + + let resident_tli = tli.wal_residence_guard().await.unwrap(); + let end_watch = Env::write_wal(tli, start_lsn, SIZE, MSG_COUNT) + .await + .unwrap(); + let end_pos = end_watch.get(); + + tracing::info!("Doing first round of reads ..."); + + let mut streaming_wal_reader = StreamingWalReader::new( + resident_tli, + None, + start_lsn, + end_pos, + end_watch, + MAX_SEND_SIZE, + ); + + let mut before_reset = Vec::new(); + while let Some(wor) = streaming_wal_reader.next().await { + let wal = wor.get_wal().unwrap().unwrap(); + let stop = wal.available_wal_end_lsn == wal.wal_end_lsn; + before_reset.push(wal); + + if stop { + break; + } + } + + tracing::info!("Resetting the WAL stream ..."); + + streaming_wal_reader.reset(start_lsn).await; + + tracing::info!("Doing second round of reads ..."); + + let mut after_reset = Vec::new(); + while let Some(wor) = streaming_wal_reader.next().await { + let wal = wor.get_wal().unwrap().unwrap(); + let stop = wal.available_wal_end_lsn == wal.wal_end_lsn; + after_reset.push(wal); + + if stop 
{ + break; + } + } + + assert_eq!(before_reset, after_reset); } } diff --git a/safekeeper/tests/walproposer_sim/safekeeper.rs b/safekeeper/tests/walproposer_sim/safekeeper.rs index efcdd89e7d..0023a4d22a 100644 --- a/safekeeper/tests/walproposer_sim/safekeeper.rs +++ b/safekeeper/tests/walproposer_sim/safekeeper.rs @@ -15,13 +15,15 @@ use desim::{ }; use http::Uri; use safekeeper::{ - safekeeper::{ProposerAcceptorMessage, SafeKeeper, UNKNOWN_SERVER_VERSION}, + safekeeper::{ + ProposerAcceptorMessage, SafeKeeper, SK_PROTOCOL_VERSION, UNKNOWN_SERVER_VERSION, + }, state::{TimelinePersistentState, TimelineState}, timeline::TimelineError, wal_storage::Storage, SafeKeeperConf, }; -use safekeeper_api::ServerInfo; +use safekeeper_api::{membership::Configuration, ServerInfo}; use tracing::{debug, info_span, warn}; use utils::{ id::{NodeId, TenantId, TenantTimelineId, TimelineId}, @@ -96,8 +98,13 @@ impl GlobalMap { let commit_lsn = Lsn::INVALID; let local_start_lsn = Lsn::INVALID; - let state = - TimelinePersistentState::new(&ttid, server_info, vec![], commit_lsn, local_start_lsn)?; + let state = TimelinePersistentState::new( + &ttid, + Configuration::empty(), + server_info, + commit_lsn, + local_start_lsn, + )?; let disk_timeline = self.disk.put_state(&ttid, state); let control_store = DiskStateStorage::new(disk_timeline.clone()); @@ -173,6 +180,8 @@ pub fn run_server(os: NodeOs, disk: Arc) -> Result<()> { control_file_save_interval: Duration::from_secs(1), partial_backup_concurrency: 1, eviction_min_resident: Duration::ZERO, + wal_reader_fanout: false, + max_delta_for_fanout: None, }; let mut global = GlobalMap::new(disk, conf.clone())?; @@ -278,7 +287,7 @@ impl ConnState { bail!("finished processing START_REPLICATION") } - let msg = ProposerAcceptorMessage::parse(copy_data)?; + let msg = ProposerAcceptorMessage::parse(copy_data, SK_PROTOCOL_VERSION)?; debug!("got msg: {:?}", msg); self.process(msg, global) } else { diff --git a/storage_controller/Cargo.toml 
b/storage_controller/Cargo.toml index 5f3319512d..caaa22d0a5 100644 --- a/storage_controller/Cargo.toml +++ b/storage_controller/Cargo.toml @@ -55,4 +55,4 @@ r2d2 = { version = "0.8.10" } utils = { path = "../libs/utils/" } metrics = { path = "../libs/metrics/" } control_plane = { path = "../control_plane" } -workspace_hack = { version = "0.1", path = "../workspace_hack" } +workspace_hack = { version = "0.1", path = "../workspace_hack" } \ No newline at end of file diff --git a/storage_controller/client/src/control_api.rs b/storage_controller/client/src/control_api.rs index a981b5020e..f8a2790769 100644 --- a/storage_controller/client/src/control_api.rs +++ b/storage_controller/client/src/control_api.rs @@ -1,7 +1,6 @@ use pageserver_client::mgmt_api::{self, ResponseErrorMessageExt}; use reqwest::{Method, Url}; use serde::{de::DeserializeOwned, Serialize}; -use std::str::FromStr; pub struct Client { base_url: Url, @@ -31,16 +30,11 @@ impl Client { RQ: Serialize + Sized, RS: DeserializeOwned + Sized, { - // The configured URL has the /upcall path prefix for pageservers to use: we will strip that out - // for general purpose API access. 
- let url = Url::from_str(&format!( - "http://{}:{}/{path}", - self.base_url.host_str().unwrap(), - self.base_url.port().unwrap() - )) - .unwrap(); - - let mut builder = self.client.request(method, url); + let request_path = self + .base_url + .join(&path) + .expect("Failed to build request path"); + let mut builder = self.client.request(method, request_path); if let Some(body) = body { builder = builder.json(&body) } diff --git a/storage_controller/migrations/2025-01-09-160454_safekeepers_remove_active/down.sql b/storage_controller/migrations/2025-01-09-160454_safekeepers_remove_active/down.sql new file mode 100644 index 0000000000..c2624f858b --- /dev/null +++ b/storage_controller/migrations/2025-01-09-160454_safekeepers_remove_active/down.sql @@ -0,0 +1,4 @@ +-- this sadly isn't a "true" revert of the migration, as the column is now at the end of the table. +-- But preserving order is not a trivial operation. +-- https://wiki.postgresql.org/wiki/Alter_column_position +ALTER TABLE safekeepers ADD active BOOLEAN NOT NULL DEFAULT false; diff --git a/storage_controller/migrations/2025-01-09-160454_safekeepers_remove_active/up.sql b/storage_controller/migrations/2025-01-09-160454_safekeepers_remove_active/up.sql new file mode 100644 index 0000000000..d76f044eda --- /dev/null +++ b/storage_controller/migrations/2025-01-09-160454_safekeepers_remove_active/up.sql @@ -0,0 +1 @@ +ALTER TABLE safekeepers DROP active; diff --git a/storage_controller/migrations/2025-01-15-181207_safekeepers_disabled_to_pause/down.sql b/storage_controller/migrations/2025-01-15-181207_safekeepers_disabled_to_pause/down.sql new file mode 100644 index 0000000000..3c7126e343 --- /dev/null +++ b/storage_controller/migrations/2025-01-15-181207_safekeepers_disabled_to_pause/down.sql @@ -0,0 +1,2 @@ +ALTER TABLE safekeepers ALTER COLUMN scheduling_policy SET DEFAULT 'disabled'; +UPDATE safekeepers SET scheduling_policy = 'disabled' WHERE scheduling_policy = 'pause'; diff --git 
a/storage_controller/migrations/2025-01-15-181207_safekeepers_disabled_to_pause/up.sql b/storage_controller/migrations/2025-01-15-181207_safekeepers_disabled_to_pause/up.sql new file mode 100644 index 0000000000..9ff75444f3 --- /dev/null +++ b/storage_controller/migrations/2025-01-15-181207_safekeepers_disabled_to_pause/up.sql @@ -0,0 +1,2 @@ +ALTER TABLE safekeepers ALTER COLUMN scheduling_policy SET DEFAULT 'pause'; +UPDATE safekeepers SET scheduling_policy = 'pause' WHERE scheduling_policy = 'disabled'; diff --git a/storage_controller/src/compute_hook.rs b/storage_controller/src/compute_hook.rs index 69db48f8d1..3884a6df46 100644 --- a/storage_controller/src/compute_hook.rs +++ b/storage_controller/src/compute_hook.rs @@ -124,7 +124,10 @@ impl ComputeHookTenant { if let Some(shard_idx) = shard_idx { sharded.shards.remove(shard_idx); } else { - tracing::warn!("Shard not found while handling detach") + // This is a valid but niche case, where the tenant was previously attached + // as a Secondary location and then detached, so has no previously notified + // state. + tracing::info!("Shard not found while handling detach") } } ComputeHookTenant::Unsharded(_) => { @@ -761,7 +764,10 @@ impl ComputeHook { let mut state_locked = self.state.lock().unwrap(); match state_locked.entry(tenant_shard_id.tenant_id) { Entry::Vacant(_) => { - tracing::warn!("Compute hook tenant not found for detach"); + // This is a valid but niche case, where the tenant was previously attached + // as a Secondary location and then detached, so has no previously notified + // state. 
+ tracing::info!("Compute hook tenant not found for detach"); } Entry::Occupied(mut e) => { let sharded = e.get().is_sharded(); diff --git a/storage_controller/src/drain_utils.rs b/storage_controller/src/drain_utils.rs index 47f4276ff2..8b7be88078 100644 --- a/storage_controller/src/drain_utils.rs +++ b/storage_controller/src/drain_utils.rs @@ -112,7 +112,7 @@ impl TenantShardDrain { } } - match scheduler.node_preferred(tenant_shard.intent.get_secondary()) { + match tenant_shard.preferred_secondary(scheduler) { Some(node) => Some(node), None => { tracing::warn!( diff --git a/storage_controller/src/http.rs b/storage_controller/src/http.rs index 5385e4ee0b..ac890b008f 100644 --- a/storage_controller/src/http.rs +++ b/storage_controller/src/http.rs @@ -15,7 +15,7 @@ use metrics::{BuildInfo, NeonMetrics}; use pageserver_api::controller_api::{ MetadataHealthListOutdatedRequest, MetadataHealthListOutdatedResponse, MetadataHealthListUnhealthyResponse, MetadataHealthUpdateRequest, MetadataHealthUpdateResponse, - ShardsPreferredAzsRequest, TenantCreateRequest, + SafekeeperSchedulingPolicyRequest, ShardsPreferredAzsRequest, TenantCreateRequest, }; use pageserver_api::models::{ TenantConfigPatchRequest, TenantConfigRequest, TenantLocationConfigRequest, @@ -653,6 +653,10 @@ async fn handle_tenant_list( ) -> Result, ApiError> { check_permissions(&req, Scope::Admin)?; + let limit: Option = parse_query_param(&req, "limit")?; + let start_after: Option = parse_query_param(&req, "start_after")?; + tracing::info!("start_after: {:?}", start_after); + match maybe_forward(req).await { ForwardOutcome::Forwarded(res) => { return res; @@ -660,7 +664,7 @@ async fn handle_tenant_list( ForwardOutcome::NotForwarded(_req) => {} }; - json_response(StatusCode::OK, service.tenant_list()) + json_response(StatusCode::OK, service.tenant_list(limit, start_after)) } async fn handle_node_register(req: Request) -> Result, ApiError> { @@ -690,7 +694,8 @@ async fn handle_node_list(req: Request) -> Result, 
ApiError }; let state = get_state(&req); - let nodes = state.service.node_list().await?; + let mut nodes = state.service.node_list().await?; + nodes.sort_by_key(|n| n.get_id()); let api_nodes = nodes.into_iter().map(|n| n.describe()).collect::>(); json_response(StatusCode::OK, api_nodes) @@ -1005,6 +1010,29 @@ async fn handle_tenant_shard_migrate( ) } +async fn handle_tenant_shard_migrate_secondary( + service: Arc, + req: Request, +) -> Result, ApiError> { + check_permissions(&req, Scope::Admin)?; + + let mut req = match maybe_forward(req).await { + ForwardOutcome::Forwarded(res) => { + return res; + } + ForwardOutcome::NotForwarded(req) => req, + }; + + let tenant_shard_id: TenantShardId = parse_request_param(&req, "tenant_shard_id")?; + let migrate_req = json_request::(&mut req).await?; + json_response( + StatusCode::OK, + service + .tenant_shard_migrate_secondary(tenant_shard_id, migrate_req) + .await?, + ) +} + async fn handle_tenant_shard_cancel_reconcile( service: Arc, req: Request, @@ -1277,6 +1305,35 @@ async fn handle_upsert_safekeeper(mut req: Request) -> Result, +) -> Result, ApiError> { + check_permissions(&req, Scope::Admin)?; + + let body = json_request::(&mut req).await?; + let id = parse_request_param::(&req, "id")?; + + let req = match maybe_forward(req).await { + ForwardOutcome::Forwarded(res) => { + return res; + } + ForwardOutcome::NotForwarded(req) => req, + }; + + let state = get_state(&req); + + state + .service + .set_safekeeper_scheduling_policy(id, body.scheduling_policy) + .await?; + + Ok(Response::builder() + .status(StatusCode::NO_CONTENT) + .body(Body::empty()) + .unwrap()) +} + /// Common wrapper for request handlers that call into Service and will operate on tenants: they must only /// be allowed to run if Service has finished its initial reconciliation. 
async fn tenant_service_handler( @@ -1845,7 +1902,18 @@ pub fn make_router( }) .post("/control/v1/safekeeper/:id", |r| { // id is in the body - named_request_span(r, handle_upsert_safekeeper, RequestName("v1_safekeeper")) + named_request_span( + r, + handle_upsert_safekeeper, + RequestName("v1_safekeeper_post"), + ) + }) + .post("/control/v1/safekeeper/:id/scheduling_policy", |r| { + named_request_span( + r, + handle_safekeeper_scheduling_policy, + RequestName("v1_safekeeper_status"), + ) }) // Tenant Shard operations .put("/control/v1/tenant/:tenant_shard_id/migrate", |r| { @@ -1855,6 +1923,16 @@ pub fn make_router( RequestName("control_v1_tenant_migrate"), ) }) + .put( + "/control/v1/tenant/:tenant_shard_id/migrate_secondary", + |r| { + tenant_service_handler( + r, + handle_tenant_shard_migrate_secondary, + RequestName("control_v1_tenant_migrate_secondary"), + ) + }, + ) .put( "/control/v1/tenant/:tenant_shard_id/cancel_reconcile", |r| { diff --git a/storage_controller/src/metrics.rs b/storage_controller/src/metrics.rs index 6d5885eba6..4164e3dc2b 100644 --- a/storage_controller/src/metrics.rs +++ b/storage_controller/src/metrics.rs @@ -53,6 +53,16 @@ pub(crate) struct StorageControllerMetricGroup { /// How many shards are not scheduled into their preferred AZ pub(crate) storage_controller_schedule_az_violation: measured::Gauge, + /// How many shard locations (secondary or attached) on each node + pub(crate) storage_controller_node_shards: measured::GaugeVec, + + /// How many _attached_ shard locations on each node + pub(crate) storage_controller_node_attached_shards: measured::GaugeVec, + + /// How many _home_ shard locations on each node (i.e. 
the node's AZ matches the shard's + /// preferred AZ) + pub(crate) storage_controller_node_home_shards: measured::GaugeVec, + /// How many shards would like to reconcile but were blocked by concurrency limits pub(crate) storage_controller_pending_reconciles: measured::Gauge, @@ -132,6 +142,15 @@ impl Default for StorageControllerMetrics { } } +#[derive(measured::LabelGroup, Clone)] +#[label(set = NodeLabelGroupSet)] +pub(crate) struct NodeLabelGroup<'a> { + #[label(dynamic_with = lasso::ThreadedRodeo, default)] + pub(crate) az: &'a str, + #[label(dynamic_with = lasso::ThreadedRodeo, default)] + pub(crate) node_id: &'a str, +} + #[derive(measured::LabelGroup)] #[label(set = ReconcileCompleteLabelGroupSet)] pub(crate) struct ReconcileCompleteLabelGroup { diff --git a/storage_controller/src/node.rs b/storage_controller/src/node.rs index 4cc9b0070d..f5c2d329e0 100644 --- a/storage_controller/src/node.rs +++ b/storage_controller/src/node.rs @@ -299,6 +299,7 @@ impl Node { id: self.id, availability: self.availability.clone().into(), scheduling: self.scheduling, + availability_zone_id: self.availability_zone_id.0.clone(), listen_http_addr: self.listen_http_addr.clone(), listen_http_port: self.listen_http_port, listen_pg_addr: self.listen_pg_addr.clone(), diff --git a/storage_controller/src/persistence.rs b/storage_controller/src/persistence.rs index cebf3e9594..37bfaf1139 100644 --- a/storage_controller/src/persistence.rs +++ b/storage_controller/src/persistence.rs @@ -708,10 +708,11 @@ impl Persistence { Ok(()) } + /// Note that passing None for a shard clears the preferred AZ (rather than leaving it unmodified) pub(crate) async fn set_tenant_shard_preferred_azs( &self, - preferred_azs: Vec<(TenantShardId, AvailabilityZone)>, - ) -> DatabaseResult> { + preferred_azs: Vec<(TenantShardId, Option)>, + ) -> DatabaseResult)>> { use crate::schema::tenant_shards::dsl::*; self.with_measured_conn(DatabaseOperation::SetPreferredAzs, move |conn| { @@ -722,7 +723,7 @@ impl 
Persistence { .filter(tenant_id.eq(tenant_shard_id.tenant_id.to_string())) .filter(shard_number.eq(tenant_shard_id.shard_number.0 as i32)) .filter(shard_count.eq(tenant_shard_id.shard_count.literal() as i32)) - .set(preferred_az_id.eq(preferred_az.0.clone())) + .set(preferred_az_id.eq(preferred_az.as_ref().map(|az| az.0.clone()))) .execute(conn)?; if updated == 1 { @@ -1103,6 +1104,37 @@ impl Persistence { }) .await } + + pub(crate) async fn set_safekeeper_scheduling_policy( + &self, + id_: i64, + scheduling_policy_: SkSchedulingPolicy, + ) -> Result<(), DatabaseError> { + use crate::schema::safekeepers::dsl::*; + + self.with_conn(move |conn| -> DatabaseResult<()> { + #[derive(Insertable, AsChangeset)] + #[diesel(table_name = crate::schema::safekeepers)] + struct UpdateSkSchedulingPolicy<'a> { + id: i64, + scheduling_policy: &'a str, + } + let scheduling_policy_ = String::from(scheduling_policy_); + + let rows_affected = diesel::update(safekeepers.filter(id.eq(id_))) + .set(scheduling_policy.eq(scheduling_policy_)) + .execute(conn)?; + + if rows_affected != 1 { + return Err(DatabaseError::Logical(format!( + "unexpected number of rows ({rows_affected})", + ))); + } + + Ok(()) + }) + .await + } } /// Parts of [`crate::tenant_shard::TenantShard`] that are stored durably @@ -1258,7 +1290,6 @@ pub(crate) struct SafekeeperPersistence { pub(crate) version: i64, pub(crate) host: String, pub(crate) port: i32, - pub(crate) active: bool, pub(crate) http_port: i32, pub(crate) availability_zone_id: String, pub(crate) scheduling_policy: String, @@ -1270,7 +1301,6 @@ impl SafekeeperPersistence { SkSchedulingPolicy::from_str(&self.scheduling_policy).map_err(|e| { DatabaseError::Logical(format!("can't construct SkSchedulingPolicy: {e:?}")) })?; - // omit the `active` flag on purpose: it is deprecated. 
Ok(SafekeeperDescribeResponse { id: NodeId(self.id as u64), region_id: self.region_id.clone(), @@ -1295,7 +1325,8 @@ pub(crate) struct SafekeeperUpsert { pub(crate) version: i64, pub(crate) host: String, pub(crate) port: i32, - pub(crate) active: bool, + /// The active flag will not be stored in the database and will be ignored. + pub(crate) active: Option, pub(crate) http_port: i32, pub(crate) availability_zone_id: String, } @@ -1311,7 +1342,6 @@ impl SafekeeperUpsert { version: self.version, host: &self.host, port: self.port, - active: self.active, http_port: self.http_port, availability_zone_id: &self.availability_zone_id, // None means a wish to not update this column. We expose abilities to update it via other means. @@ -1328,7 +1358,6 @@ struct InsertUpdateSafekeeper<'a> { version: i64, host: &'a str, port: i32, - active: bool, http_port: i32, availability_zone_id: &'a str, scheduling_policy: Option<&'a str>, diff --git a/storage_controller/src/reconciler.rs b/storage_controller/src/reconciler.rs index e0a854fff7..adced3b77d 100644 --- a/storage_controller/src/reconciler.rs +++ b/storage_controller/src/reconciler.rs @@ -696,6 +696,11 @@ impl Reconciler { /// First we apply special case handling (e.g. for live migrations), and then a /// general case reconciliation where we walk through the intent by pageserver /// and call out to the pageserver to apply the desired state. + /// + /// An Ok(()) result indicates that we successfully attached the tenant, but _not_ that + /// all locations for the tenant are in the expected state. When nodes that are to be detached + /// or configured as secondary are unavailable, we may return Ok(()) but leave the shard in a + /// state where it still requires later reconciliation. 
pub(crate) async fn reconcile(&mut self) -> Result<(), ReconcileError> { // Prepare: if we have uncertain `observed` state for our would-be attachement location, then refresh it self.maybe_refresh_observed().await?; @@ -784,10 +789,18 @@ impl Reconciler { tracing::info!(node_id=%node.get_id(), "Observed configuration already correct.") } _ => { - // In all cases other than a matching observed configuration, we will - // reconcile this location. - tracing::info!(node_id=%node.get_id(), "Observed configuration requires update."); - changes.push((node.clone(), wanted_conf)) + // Only try and configure secondary locations on nodes that are available. This + // allows the reconciler to "succeed" while some secondaries are offline (e.g. after + // a node failure, where the failed node will have a secondary intent) + if node.is_available() { + tracing::info!(node_id=%node.get_id(), "Observed configuration requires update."); + changes.push((node.clone(), wanted_conf)) + } else { + tracing::info!(node_id=%node.get_id(), "Skipping configuration as secondary, node is unavailable"); + self.observed + .locations + .insert(node.get_id(), ObservedStateLocation { conf: None }); + } } } } @@ -813,7 +826,21 @@ impl Reconciler { if self.cancel.is_cancelled() { return Err(ReconcileError::Cancel); } - self.location_config(&node, conf, None, false).await?; + // We only try to configure secondary locations if the node is available. This does + // not stop us succeeding with the reconcile, because our core goal is to make the + // shard _available_ (the attached location), and configuring secondary locations + // can be done lazily when the node becomes available (via background reconciliation). 
+ if node.is_available() { + self.location_config(&node, conf, None, false).await?; + } else { + // If the node is unavailable, we skip and consider the reconciliation successful: this + // is a common case where a pageserver is marked unavailable: we demote a location on + // that unavailable pageserver to secondary. + tracing::info!("Skipping configuring secondary location {node}, it is unavailable"); + self.observed + .locations + .insert(node.get_id(), ObservedStateLocation { conf: None }); + } } // The condition below identifies a detach. We must have no attached intent and diff --git a/storage_controller/src/scheduler.rs b/storage_controller/src/scheduler.rs index 51a4cf35be..f5cab9dd57 100644 --- a/storage_controller/src/scheduler.rs +++ b/storage_controller/src/scheduler.rs @@ -1,4 +1,4 @@ -use crate::{node::Node, tenant_shard::TenantShard}; +use crate::{metrics::NodeLabelGroup, node::Node, tenant_shard::TenantShard}; use itertools::Itertools; use pageserver_api::{controller_api::AvailabilityZone, models::PageserverUtilization}; use serde::Serialize; @@ -32,6 +32,9 @@ pub(crate) struct SchedulerNode { shard_count: usize, /// How many shards are currently attached on this node, via their [`crate::tenant_shard::IntentState`]. attached_shard_count: usize, + /// How many shards have a location on this node (via [`crate::tenant_shard::IntentState`]) _and_ this node + /// is in their preferred AZ (i.e. this is their 'home' location) + home_shard_count: usize, /// Availability zone id in which the node resides az: AvailabilityZone, @@ -47,6 +50,12 @@ pub(crate) trait NodeSchedulingScore: Debug + Ord + Copy + Sized { preferred_az: &Option, context: &ScheduleContext, ) -> Option; + + /// Return a score that drops any components based on node utilization: this is useful + /// for finding scores for scheduling optimisation, when we want to avoid rescheduling + /// shards due to e.g. disk usage, to avoid flapping. 
+ fn for_optimization(&self) -> Self; + fn is_overloaded(&self) -> bool; fn node_id(&self) -> NodeId; } @@ -136,17 +145,13 @@ impl PartialOrd for SecondaryAzMatch { /// Ordering is given by member declaration order (top to bottom). #[derive(Debug, PartialEq, Eq, PartialOrd, Ord, Clone, Copy)] pub(crate) struct NodeAttachmentSchedulingScore { - /// The number of shards belonging to the tenant currently being - /// scheduled that are attached to this node. - affinity_score: AffinityScore, /// Flag indicating whether this node matches the preferred AZ /// of the shard. For equal affinity scores, nodes in the matching AZ /// are considered first. az_match: AttachmentAzMatch, - /// Size of [`ScheduleContext::attached_nodes`] for the current node. - /// This normally tracks the number of attached shards belonging to the - /// tenant being scheduled that are already on this node. - attached_shards_in_context: usize, + /// The number of shards belonging to the tenant currently being + /// scheduled that are attached to this node. + affinity_score: AffinityScore, /// Utilisation score that combines shard count and disk utilisation utilization_score: u64, /// Total number of shards attached to this node. 
When nodes have identical utilisation, this @@ -177,13 +182,25 @@ impl NodeSchedulingScore for NodeAttachmentSchedulingScore { .copied() .unwrap_or(AffinityScore::FREE), az_match: AttachmentAzMatch(AzMatch::new(&node.az, preferred_az.as_ref())), - attached_shards_in_context: context.attached_nodes.get(node_id).copied().unwrap_or(0), utilization_score: utilization.cached_score(), total_attached_shard_count: node.attached_shard_count, node_id: *node_id, }) } + /// For use in scheduling optimisation, where we only want to consider the aspects + /// of the score that can only be resolved by moving things (such as inter-shard affinity + /// and AZ affinity), and ignore aspects that reflect the total utilization of a node (which + /// can fluctuate for other reasons) + fn for_optimization(&self) -> Self { + Self { + utilization_score: 0, + total_attached_shard_count: 0, + node_id: NodeId(0), + ..*self + } + } + fn is_overloaded(&self) -> bool { PageserverUtilization::is_overloaded(self.utilization_score) } @@ -208,9 +225,9 @@ pub(crate) struct NodeSecondarySchedulingScore { affinity_score: AffinityScore, /// Utilisation score that combines shard count and disk utilisation utilization_score: u64, - /// Total number of shards attached to this node. When nodes have identical utilisation, this - /// acts as an anti-affinity between attached shards. - total_attached_shard_count: usize, + /// Anti-affinity with other non-home locations: this gives the behavior that secondaries + /// will spread out across the nodes in an AZ. 
+ total_non_home_shard_count: usize, /// Convenience to make selection deterministic in tests and empty systems node_id: NodeId, } @@ -237,11 +254,20 @@ impl NodeSchedulingScore for NodeSecondarySchedulingScore { .copied() .unwrap_or(AffinityScore::FREE), utilization_score: utilization.cached_score(), - total_attached_shard_count: node.attached_shard_count, + total_non_home_shard_count: (node.shard_count - node.home_shard_count), node_id: *node_id, }) } + fn for_optimization(&self) -> Self { + Self { + utilization_score: 0, + total_non_home_shard_count: 0, + node_id: NodeId(0), + ..*self + } + } + fn is_overloaded(&self) -> bool { PageserverUtilization::is_overloaded(self.utilization_score) } @@ -293,6 +319,10 @@ impl AffinityScore { pub(crate) fn inc(&mut self) { self.0 += 1; } + + pub(crate) fn dec(&mut self) { + self.0 -= 1; + } } impl std::ops::Add for AffinityScore { @@ -324,9 +354,6 @@ pub(crate) struct ScheduleContext { /// Sparse map of nodes: omitting a node implicitly makes its affinity [`AffinityScore::FREE`] pub(crate) nodes: HashMap, - /// Specifically how many _attached_ locations are on each node - pub(crate) attached_nodes: HashMap, - pub(crate) mode: ScheduleMode, } @@ -334,7 +361,6 @@ impl ScheduleContext { pub(crate) fn new(mode: ScheduleMode) -> Self { Self { nodes: HashMap::new(), - attached_nodes: HashMap::new(), mode, } } @@ -348,25 +374,31 @@ impl ScheduleContext { } } - pub(crate) fn push_attached(&mut self, node_id: NodeId) { - let entry = self.attached_nodes.entry(node_id).or_default(); - *entry += 1; - } - - pub(crate) fn get_node_affinity(&self, node_id: NodeId) -> AffinityScore { - self.nodes - .get(&node_id) - .copied() - .unwrap_or(AffinityScore::FREE) - } - - pub(crate) fn get_node_attachments(&self, node_id: NodeId) -> usize { - self.attached_nodes.get(&node_id).copied().unwrap_or(0) + /// Remove `shard`'s contributions to this context. 
This is useful when considering scheduling + /// this shard afresh, where we don't want it to e.g. experience anti-affinity to its current location. + pub(crate) fn project_detach(&self, shard: &TenantShard) -> Self { + let mut new_context = self.clone(); + + if let Some(attached) = shard.intent.get_attached() { + if let Some(score) = new_context.nodes.get_mut(attached) { + score.dec(); + } + } + + for secondary in shard.intent.get_secondary() { + if let Some(score) = new_context.nodes.get_mut(secondary) { + score.dec(); + } + } + + new_context } + /// For test, track the sum of AffinityScore values, which is effectively how many + /// attached or secondary locations have been registered with this context. #[cfg(test)] - pub(crate) fn attach_count(&self) -> usize { - self.attached_nodes.values().sum() + pub(crate) fn location_count(&self) -> usize { + self.nodes.values().map(|i| i.0).sum() } } @@ -388,6 +420,7 @@ impl Scheduler { SchedulerNode { shard_count: 0, attached_shard_count: 0, + home_shard_count: 0, may_schedule: node.may_schedule(), az: node.get_availability_zone_id().clone(), }, @@ -415,6 +448,7 @@ impl Scheduler { SchedulerNode { shard_count: 0, attached_shard_count: 0, + home_shard_count: 0, may_schedule: node.may_schedule(), az: node.get_availability_zone_id().clone(), }, @@ -427,6 +461,9 @@ impl Scheduler { Some(node) => { node.shard_count += 1; node.attached_shard_count += 1; + if Some(&node.az) == shard.preferred_az() { + node.home_shard_count += 1; + } } None => anyhow::bail!( "Tenant {} references nonexistent node {}", @@ -438,7 +475,12 @@ impl Scheduler { for node_id in shard.intent.get_secondary() { match expect_nodes.get_mut(node_id) { - Some(node) => node.shard_count += 1, + Some(node) => { + node.shard_count += 1; + if Some(&node.az) == shard.preferred_az() { + node.home_shard_count += 1; + } + } None => anyhow::bail!( "Tenant {} references nonexistent node {}", shard.tenant_shard_id, @@ -482,13 +524,20 @@ impl Scheduler { /// /// It is an 
error to call this for a node that is not known to the scheduler (i.e. passed into /// [`Self::new`] or [`Self::node_upsert`]) - pub(crate) fn update_node_ref_counts(&mut self, node_id: NodeId, update: RefCountUpdate) { + pub(crate) fn update_node_ref_counts( + &mut self, + node_id: NodeId, + preferred_az: Option<&AvailabilityZone>, + update: RefCountUpdate, + ) { let Some(node) = self.nodes.get_mut(&node_id) else { debug_assert!(false); tracing::error!("Scheduler missing node {node_id}"); return; }; + let is_home_az = Some(&node.az) == preferred_az; + match update { RefCountUpdate::PromoteSecondary => { node.attached_shard_count += 1; @@ -496,19 +545,31 @@ impl Scheduler { RefCountUpdate::Attach => { node.shard_count += 1; node.attached_shard_count += 1; + if is_home_az { + node.home_shard_count += 1; + } } RefCountUpdate::Detach => { node.shard_count -= 1; node.attached_shard_count -= 1; + if is_home_az { + node.home_shard_count -= 1; + } } RefCountUpdate::DemoteAttached => { node.attached_shard_count -= 1; } RefCountUpdate::AddSecondary => { node.shard_count += 1; + if is_home_az { + node.home_shard_count += 1; + } } RefCountUpdate::RemoveSecondary => { node.shard_count -= 1; + if is_home_az { + node.home_shard_count -= 1; + } } } @@ -594,6 +655,7 @@ impl Scheduler { entry.insert(SchedulerNode { shard_count: 0, attached_shard_count: 0, + home_shard_count: 0, may_schedule: node.may_schedule(), az: node.get_availability_zone_id().clone(), }); @@ -607,33 +669,20 @@ impl Scheduler { } } - /// Where we have several nodes to choose from, for example when picking a secondary location - /// to promote to an attached location, this method may be used to pick the best choice based - /// on the scheduler's knowledge of utilization and availability. - /// - /// If the input is empty, or all the nodes are not elegible for scheduling, return None: the - /// caller can pick a node some other way. 
- pub(crate) fn node_preferred(&self, nodes: &[NodeId]) -> Option { - if nodes.is_empty() { - return None; - } - - // TODO: When the utilization score returned by the pageserver becomes meaningful, - // schedule based on that instead of the shard count. - let node = nodes - .iter() - .map(|node_id| { - let may_schedule = self - .nodes - .get(node_id) - .map(|n| !matches!(n.may_schedule, MaySchedule::No)) - .unwrap_or(false); - (*node_id, may_schedule) - }) - .max_by_key(|(_n, may_schedule)| *may_schedule); - - // If even the preferred node has may_schedule==false, return None - node.and_then(|(node_id, may_schedule)| if may_schedule { Some(node_id) } else { None }) + /// Calculate a single node's score, used in optimizer logic to compare specific + /// nodes' scores. + pub(crate) fn compute_node_score( + &mut self, + node_id: NodeId, + preferred_az: &Option, + context: &ScheduleContext, + ) -> Option + where + Score: NodeSchedulingScore, + { + self.nodes + .get_mut(&node_id) + .and_then(|node| Score::generate(&node_id, node, preferred_az, context)) } /// Compute a schedulling score for each node that the scheduler knows of @@ -727,7 +776,7 @@ impl Scheduler { tracing::info!( "scheduler selected node {node_id} (elegible nodes {:?}, hard exclude: {hard_exclude:?}, soft exclude: {context:?})", scores.iter().map(|i| i.node_id().0).collect::>() - ); + ); } // Note that we do not update shard count here to reflect the scheduling: that @@ -743,47 +792,74 @@ impl Scheduler { } /// For choosing which AZ to schedule a new shard into, use this. It will return the - /// AZ with the lowest median utilization. + /// AZ with the lowest number of shards currently scheduled in this AZ as their home + /// location. /// /// We use an AZ-wide measure rather than simply selecting the AZ of the least-loaded /// node, because while tenants start out single sharded, when they grow and undergo - /// shard-split, they will occupy space on many nodes within an AZ. 
+ /// shard-split, they will occupy space on many nodes within an AZ. It is important + /// that we pick the AZ in a way that balances this _future_ load. /// - /// We use median rather than total free space or mean utilization, because - /// we wish to avoid preferring AZs that have low-load nodes resulting from - /// recent replacements. - /// - /// The practical result is that we will pick an AZ based on its median node, and - /// then actually _schedule_ the new shard onto the lowest-loaded node in that AZ. + /// Once we've picked an AZ, subsequent scheduling within that AZ will be driven by + /// nodes' utilization scores. pub(crate) fn get_az_for_new_tenant(&self) -> Option { if self.nodes.is_empty() { return None; } - let mut scores_by_az = HashMap::new(); - for (node_id, node) in &self.nodes { - let az_scores = scores_by_az.entry(&node.az).or_insert_with(Vec::new); - let score = match &node.may_schedule { - MaySchedule::Yes(utilization) => utilization.score(), - MaySchedule::No => PageserverUtilization::full().score(), - }; - az_scores.push((node_id, node, score)); + #[derive(Default)] + struct AzScore { + home_shard_count: usize, + scheduleable: bool, } - // Sort by utilization. Also include the node ID to break ties. - for scores in scores_by_az.values_mut() { - scores.sort_by_key(|i| (i.2, i.0)); + let mut azs: HashMap<&AvailabilityZone, AzScore> = HashMap::new(); + for node in self.nodes.values() { + let az = azs.entry(&node.az).or_default(); + az.home_shard_count += node.home_shard_count; + az.scheduleable |= matches!(node.may_schedule, MaySchedule::Yes(_)); } - let mut median_by_az = scores_by_az + // If any AZs are schedulable, then filter out the non-schedulable ones (i.e. AZs where + // all nodes are overloaded or otherwise unschedulable). 
+ if azs.values().any(|i| i.scheduleable) { + azs.retain(|_, i| i.scheduleable); + } + + // Find the AZ with the lowest number of shards currently allocated + Some( + azs.into_iter() + .min_by_key(|i| (i.1.home_shard_count, i.0)) + .unwrap() + .0 + .clone(), + ) + } + + pub(crate) fn get_node_az(&self, node_id: &NodeId) -> Option { + self.nodes.get(node_id).map(|n| n.az.clone()) + } + + /// For use when choosing a preferred secondary location: filter out nodes that are not + /// available, and gather their AZs. + pub(crate) fn filter_usable_nodes( + &self, + nodes: &[NodeId], + ) -> Vec<(NodeId, Option)> { + nodes .iter() - .map(|(az, nodes)| (*az, nodes.get(nodes.len() / 2).unwrap().2)) - .collect::>(); - // Sort by utilization. Also include the AZ to break ties. - median_by_az.sort_by_key(|i| (i.1, i.0)); - - // Return the AZ with the lowest median utilization - Some(median_by_az.first().unwrap().0.clone()) + .filter_map(|node_id| { + let node = self + .nodes + .get(node_id) + .expect("Referenced nodes always exist"); + if matches!(node.may_schedule, MaySchedule::Yes(_)) { + Some((*node_id, Some(node.az.clone()))) + } else { + None + } + }) + .collect() } /// Unit test access to internal state @@ -796,6 +872,33 @@ impl Scheduler { pub(crate) fn get_node_attached_shard_count(&self, node_id: NodeId) -> usize { self.nodes.get(&node_id).unwrap().attached_shard_count } + + /// Some metrics that we only calculate periodically: this is simpler than + /// rigorously updating them on every change. 
+ pub(crate) fn update_metrics(&self) { + for (node_id, node) in &self.nodes { + let node_id_str = format!("{}", node_id); + let label_group = NodeLabelGroup { + az: &node.az.0, + node_id: &node_id_str, + }; + + crate::metrics::METRICS_REGISTRY + .metrics_group + .storage_controller_node_shards + .set(label_group.clone(), node.shard_count as i64); + + crate::metrics::METRICS_REGISTRY + .metrics_group + .storage_controller_node_attached_shards + .set(label_group.clone(), node.attached_shard_count as i64); + + crate::metrics::METRICS_REGISTRY + .metrics_group + .storage_controller_node_home_shards + .set(label_group.clone(), node.home_shard_count as i64); + } + } } #[cfg(test)] @@ -843,7 +946,14 @@ pub(crate) mod test_utils { #[cfg(test)] mod tests { - use pageserver_api::{controller_api::NodeAvailability, models::utilization::test_utilization}; + use pageserver_api::{ + controller_api::NodeAvailability, models::utilization::test_utilization, + shard::ShardIdentity, + }; + use utils::{ + id::TenantId, + shard::{ShardCount, ShardNumber, TenantShardId}, + }; use super::*; @@ -853,8 +963,8 @@ mod tests { let nodes = test_utils::make_test_nodes(2, &[]); let mut scheduler = Scheduler::new(nodes.values()); - let mut t1_intent = IntentState::new(); - let mut t2_intent = IntentState::new(); + let mut t1_intent = IntentState::new(None); + let mut t2_intent = IntentState::new(None); let context = ScheduleContext::default(); @@ -930,7 +1040,7 @@ mod tests { let scheduled = scheduler .schedule_shard::(&[], &None, context) .unwrap(); - let mut intent = IntentState::new(); + let mut intent = IntentState::new(None); intent.set_attached(scheduler, Some(scheduled)); scheduled_intents.push(intent); assert_eq!(scheduled, expect_node); @@ -1063,7 +1173,7 @@ mod tests { let scheduled = scheduler .schedule_shard::(&[], &preferred_az, context) .unwrap(); - let mut intent = IntentState::new(); + let mut intent = IntentState::new(preferred_az.clone()); intent.set_attached(scheduler, 
Some(scheduled)); scheduled_intents.push(intent); assert_eq!(scheduled, expect_node); @@ -1089,9 +1199,9 @@ mod tests { &mut context, ); - // Node 2 is not in "az-a", but it has the lowest affinity so we prefer that. + // Node 1 and 3 (az-a) have same affinity score, so prefer the lowest node id. assert_scheduler_chooses::( - NodeId(2), + NodeId(1), Some(az_a_tag.clone()), &mut scheduled_intents, &mut scheduler, @@ -1107,26 +1217,6 @@ mod tests { &mut context, ); - // Avoid nodes in "az-b" for the secondary location. - // Nodes 1 and 3 are identically loaded, so prefer the lowest node id. - assert_scheduler_chooses::( - NodeId(1), - Some(az_b_tag.clone()), - &mut scheduled_intents, - &mut scheduler, - &mut context, - ); - - // Avoid nodes in "az-b" for the secondary location. - // Node 3 has lower affinity score than 1, so prefer that. - assert_scheduler_chooses::( - NodeId(3), - Some(az_b_tag.clone()), - &mut scheduled_intents, - &mut scheduler, - &mut context, - ); - for mut intent in scheduled_intents { intent.clear(&mut scheduler); } @@ -1150,34 +1240,292 @@ mod tests { let mut scheduler = Scheduler::new(nodes.values()); - /// Force the utilization of a node in Scheduler's state to a particular - /// number of bytes used. - fn set_utilization(scheduler: &mut Scheduler, node_id: NodeId, shard_count: u32) { - let mut node = Node::new( - node_id, - "".to_string(), - 0, - "".to_string(), - 0, - scheduler.nodes.get(&node_id).unwrap().az.clone(), - ); - node.set_availability(NodeAvailability::Active(test_utilization::simple( - shard_count, - 0, - ))); - scheduler.node_upsert(&node); + /// Force the `home_shard_count` of a node directly: this is the metric used + /// by the scheduler when picking AZs. + fn set_shard_count(scheduler: &mut Scheduler, node_id: NodeId, shard_count: usize) { + let node = scheduler.nodes.get_mut(&node_id).unwrap(); + node.home_shard_count = shard_count; } // Initial empty state. Scores are tied, scheduler prefers lower AZ ID. 
assert_eq!(scheduler.get_az_for_new_tenant(), Some(az_a_tag.clone())); - // Put some utilization on one node in AZ A: this should change nothing, as the median hasn't changed - set_utilization(&mut scheduler, NodeId(1), 1000000); - assert_eq!(scheduler.get_az_for_new_tenant(), Some(az_a_tag.clone())); - - // Put some utilization on a second node in AZ A: now the median has changed, so the scheduler - // should prefer the other AZ. - set_utilization(&mut scheduler, NodeId(2), 1000000); + // Home shard count is higher in AZ A, so AZ B will be preferred + set_shard_count(&mut scheduler, NodeId(1), 10); assert_eq!(scheduler.get_az_for_new_tenant(), Some(az_b_tag.clone())); + + // Total home shard count is higher in AZ B, so we revert to preferring AZ A + set_shard_count(&mut scheduler, NodeId(4), 6); + set_shard_count(&mut scheduler, NodeId(5), 6); + assert_eq!(scheduler.get_az_for_new_tenant(), Some(az_a_tag.clone())); + } + + /// Test that when selecting AZs for many new tenants, we get the expected balance across nodes + #[test] + fn az_selection_many() { + let az_a_tag = AvailabilityZone("az-a".to_string()); + let az_b_tag = AvailabilityZone("az-b".to_string()); + let az_c_tag = AvailabilityZone("az-c".to_string()); + let nodes = test_utils::make_test_nodes( + 6, + &[ + az_a_tag.clone(), + az_b_tag.clone(), + az_c_tag.clone(), + az_a_tag.clone(), + az_b_tag.clone(), + az_c_tag.clone(), + ], + ); + + let mut scheduler = Scheduler::new(nodes.values()); + + // We should get 1/6th of these on each node, give or take a few... + let total_tenants = 300; + + // ...where the 'few' is the number of AZs, because the scheduling will sometimes overshoot + // on one AZ before correcting itself. This is because we select the 'home' AZ based on + // an AZ-wide metric, but we select the location for secondaries on a purely node-based + // metric (while excluding the home AZ). 
+ let grace = 3; + + let mut scheduled_shards = Vec::new(); + for _i in 0..total_tenants { + let preferred_az = scheduler.get_az_for_new_tenant().unwrap(); + + let mut node_home_counts = scheduler + .nodes + .iter() + .map(|(node_id, node)| (node_id, node.home_shard_count)) + .collect::>(); + node_home_counts.sort_by_key(|i| i.0); + eprintln!("Selected {}, vs nodes {:?}", preferred_az, node_home_counts); + + let tenant_shard_id = TenantShardId { + tenant_id: TenantId::generate(), + shard_number: ShardNumber(0), + shard_count: ShardCount(1), + }; + + let shard_identity = ShardIdentity::new( + tenant_shard_id.shard_number, + tenant_shard_id.shard_count, + pageserver_api::shard::ShardStripeSize(1), + ) + .unwrap(); + let mut shard = TenantShard::new( + tenant_shard_id, + shard_identity, + pageserver_api::controller_api::PlacementPolicy::Attached(1), + Some(preferred_az), + ); + + let mut context = ScheduleContext::default(); + shard.schedule(&mut scheduler, &mut context).unwrap(); + eprintln!("Scheduled shard at {:?}", shard.intent); + + scheduled_shards.push(shard); + } + + for (node_id, node) in &scheduler.nodes { + eprintln!( + "Node {}: {} {} {}", + node_id, node.shard_count, node.attached_shard_count, node.home_shard_count + ); + } + + for node in scheduler.nodes.values() { + assert!((node.home_shard_count as i64 - total_tenants as i64 / 6).abs() < grace); + } + + for mut shard in scheduled_shards { + shard.intent.clear(&mut scheduler); + } + } + + #[test] + /// Make sure that when we have an odd number of nodes and an even number of shards, we still + /// get scheduling stability. 
+ fn odd_nodes_stability() { + let az_a = AvailabilityZone("az-a".to_string()); + let az_b = AvailabilityZone("az-b".to_string()); + + let nodes = test_utils::make_test_nodes( + 10, + &[ + az_a.clone(), + az_a.clone(), + az_a.clone(), + az_a.clone(), + az_a.clone(), + az_b.clone(), + az_b.clone(), + az_b.clone(), + az_b.clone(), + az_b.clone(), + ], + ); + let mut scheduler = Scheduler::new(nodes.values()); + + // Need to keep these alive because they contribute to shard counts via RAII + let mut scheduled_shards = Vec::new(); + + let mut context = ScheduleContext::default(); + + fn schedule_shard( + tenant_shard_id: TenantShardId, + expect_attached: NodeId, + expect_secondary: NodeId, + scheduled_shards: &mut Vec, + scheduler: &mut Scheduler, + preferred_az: Option, + context: &mut ScheduleContext, + ) { + let shard_identity = ShardIdentity::new( + tenant_shard_id.shard_number, + tenant_shard_id.shard_count, + pageserver_api::shard::ShardStripeSize(1), + ) + .unwrap(); + let mut shard = TenantShard::new( + tenant_shard_id, + shard_identity, + pageserver_api::controller_api::PlacementPolicy::Attached(1), + preferred_az, + ); + + shard.schedule(scheduler, context).unwrap(); + + assert_eq!(shard.intent.get_attached().unwrap(), expect_attached); + assert_eq!( + shard.intent.get_secondary().first().unwrap(), + &expect_secondary + ); + + scheduled_shards.push(shard); + } + + let tenant_id = TenantId::generate(); + + schedule_shard( + TenantShardId { + tenant_id, + shard_number: ShardNumber(0), + shard_count: ShardCount(8), + }, + NodeId(1), + NodeId(6), + &mut scheduled_shards, + &mut scheduler, + Some(az_a.clone()), + &mut context, + ); + + schedule_shard( + TenantShardId { + tenant_id, + shard_number: ShardNumber(1), + shard_count: ShardCount(8), + }, + NodeId(2), + NodeId(7), + &mut scheduled_shards, + &mut scheduler, + Some(az_a.clone()), + &mut context, + ); + + schedule_shard( + TenantShardId { + tenant_id, + shard_number: ShardNumber(2), + shard_count: 
ShardCount(8), + }, + NodeId(3), + NodeId(8), + &mut scheduled_shards, + &mut scheduler, + Some(az_a.clone()), + &mut context, + ); + + schedule_shard( + TenantShardId { + tenant_id, + shard_number: ShardNumber(3), + shard_count: ShardCount(8), + }, + NodeId(4), + NodeId(9), + &mut scheduled_shards, + &mut scheduler, + Some(az_a.clone()), + &mut context, + ); + + schedule_shard( + TenantShardId { + tenant_id, + shard_number: ShardNumber(4), + shard_count: ShardCount(8), + }, + NodeId(5), + NodeId(10), + &mut scheduled_shards, + &mut scheduler, + Some(az_a.clone()), + &mut context, + ); + + schedule_shard( + TenantShardId { + tenant_id, + shard_number: ShardNumber(5), + shard_count: ShardCount(8), + }, + NodeId(1), + NodeId(6), + &mut scheduled_shards, + &mut scheduler, + Some(az_a.clone()), + &mut context, + ); + + schedule_shard( + TenantShardId { + tenant_id, + shard_number: ShardNumber(6), + shard_count: ShardCount(8), + }, + NodeId(2), + NodeId(7), + &mut scheduled_shards, + &mut scheduler, + Some(az_a.clone()), + &mut context, + ); + + schedule_shard( + TenantShardId { + tenant_id, + shard_number: ShardNumber(7), + shard_count: ShardCount(8), + }, + NodeId(3), + NodeId(8), + &mut scheduled_shards, + &mut scheduler, + Some(az_a.clone()), + &mut context, + ); + + // Assert that the optimizer suggests nochanges, i.e. our initial scheduling was stable. + for shard in &scheduled_shards { + assert_eq!(shard.optimize_attachment(&mut scheduler, &context), None); + } + + for mut shard in scheduled_shards { + shard.intent.clear(&mut scheduler); + } } } diff --git a/storage_controller/src/schema.rs b/storage_controller/src/schema.rs index 44c91619ab..14c30c296d 100644 --- a/storage_controller/src/schema.rs +++ b/storage_controller/src/schema.rs @@ -36,7 +36,6 @@ diesel::table! 
{ version -> Int8, host -> Text, port -> Int4, - active -> Bool, http_port -> Int4, availability_zone_id -> Text, scheduling_policy -> Varchar, diff --git a/storage_controller/src/service.rs b/storage_controller/src/service.rs index 265b2798d2..9ac9ee17ca 100644 --- a/storage_controller/src/service.rs +++ b/storage_controller/src/service.rs @@ -47,7 +47,7 @@ use pageserver_api::{ AvailabilityZone, MetadataHealthRecord, MetadataHealthUpdateRequest, NodeAvailability, NodeRegisterRequest, NodeSchedulingPolicy, NodeShard, NodeShardResponse, PlacementPolicy, SafekeeperDescribeResponse, ShardSchedulingPolicy, ShardsPreferredAzsRequest, - ShardsPreferredAzsResponse, TenantCreateRequest, TenantCreateResponse, + ShardsPreferredAzsResponse, SkSchedulingPolicy, TenantCreateRequest, TenantCreateResponse, TenantCreateResponseShard, TenantDescribeResponse, TenantDescribeResponseShard, TenantLocateResponse, TenantPolicyRequest, TenantShardMigrateRequest, TenantShardMigrateResponse, @@ -1404,7 +1404,11 @@ impl Service { // We will populate intent properly later in [`Self::startup_reconcile`], initially populate // it with what we can infer: the node for which a generation was most recently issued. - let mut intent = IntentState::new(); + let mut intent = IntentState::new( + tsp.preferred_az_id + .as_ref() + .map(|az| AvailabilityZone(az.clone())), + ); if let Some(generation_pageserver) = tsp.generation_pageserver.map(|n| NodeId(n as u64)) { if nodes.contains_key(&generation_pageserver) { @@ -2474,18 +2478,29 @@ impl Service { tenant_id: TenantId, _guard: &TracingExclusiveGuard, ) -> Result<(), ApiError> { - let present_in_memory = { + // Check if the tenant is present in memory, and select an AZ to use when loading + // if we will load it. 
+ let load_in_az = { let locked = self.inner.read().unwrap(); - locked + let existing = locked .tenants .range(TenantShardId::tenant_range(tenant_id)) - .next() - .is_some() - }; + .next(); - if present_in_memory { - return Ok(()); - } + // If the tenant is not present in memory, we expect to load it from database, + // so let's figure out what AZ to load it into while we have self.inner locked. + if existing.is_none() { + locked + .scheduler + .get_az_for_new_tenant() + .ok_or(ApiError::BadRequest(anyhow::anyhow!( + "No AZ with nodes found to load tenant" + )))? + } else { + // We already have this tenant in memory + return Ok(()); + } + }; let tenant_shards = self.persistence.load_tenant(tenant_id).await?; if tenant_shards.is_empty() { @@ -2494,8 +2509,20 @@ impl Service { )); } - // TODO: choose a fresh AZ to use for this tenant when un-detaching: there definitely isn't a running - // compute, so no benefit to making AZ sticky across detaches. + // Update the persistent shards with the AZ that we are about to apply to in-memory state + self.persistence + .set_tenant_shard_preferred_azs( + tenant_shards + .iter() + .map(|t| { + ( + t.get_tenant_shard_id().expect("Corrupt shard in database"), + Some(load_in_az.clone()), + ) + }) + .collect(), + ) + .await?; let mut locked = self.inner.write().unwrap(); tracing::info!( @@ -2505,7 +2532,7 @@ impl Service { ); locked.tenants.extend(tenant_shards.into_iter().map(|p| { - let intent = IntentState::new(); + let intent = IntentState::new(Some(load_in_az.clone())); let shard = TenantShard::from_persistent(p, intent).expect("Corrupt shard row in database"); @@ -4131,17 +4158,42 @@ impl Service { .ok_or_else(|| ApiError::NotFound(anyhow::anyhow!("Tenant {tenant_id} not found").into())) } - pub(crate) fn tenant_list(&self) -> Vec { + /// limit & offset are pagination parameters. Since we are walking an in-memory HashMap, `offset` does not + /// avoid traversing data, it just avoid returning it. 
This is suitable for our purposes, since our in memory + /// maps are small enough to traverse fast, our pagination is just to avoid serializing huge JSON responses + /// in our external API. + pub(crate) fn tenant_list( + &self, + limit: Option, + start_after: Option, + ) -> Vec { let locked = self.inner.read().unwrap(); + // Apply start_from parameter + let shard_range = match start_after { + None => locked.tenants.range(..), + Some(tenant_id) => locked.tenants.range( + TenantShardId { + tenant_id, + shard_number: ShardNumber(u8::MAX), + shard_count: ShardCount(u8::MAX), + }.., + ), + }; + let mut result = Vec::new(); - for (_tenant_id, tenant_shards) in - &locked.tenants.iter().group_by(|(id, _shard)| id.tenant_id) - { + for (_tenant_id, tenant_shards) in &shard_range.group_by(|(id, _shard)| id.tenant_id) { result.push( self.tenant_describe_impl(tenant_shards.map(|(_k, v)| v)) .expect("Groups are always non-empty"), ); + + // Enforce `limit` parameter + if let Some(limit) = limit { + if result.len() >= limit { + break; + } + } } result @@ -4236,6 +4288,22 @@ impl Service { } tracing::info!("Restoring parent shard {tenant_shard_id}"); + + // Drop any intents that refer to unavailable nodes, to enable this abort to proceed even + // if the original attachment location is offline. 
+ if let Some(node_id) = shard.intent.get_attached() { + if !nodes.get(node_id).unwrap().is_available() { + tracing::info!("Demoting attached intent for {tenant_shard_id} on unavailable node {node_id}"); + shard.intent.demote_attached(scheduler, *node_id); + } + } + for node_id in shard.intent.get_secondary().clone() { + if !nodes.get(&node_id).unwrap().is_available() { + tracing::info!("Dropping secondary intent for {tenant_shard_id} on unavailable node {node_id}"); + shard.intent.remove_secondary(scheduler, node_id); + } + } + shard.splitting = SplitState::Idle; if let Err(e) = shard.schedule(scheduler, &mut ScheduleContext::default()) { // If this shard can't be scheduled now (perhaps due to offline nodes or @@ -4389,15 +4457,13 @@ impl Service { let mut child_state = TenantShard::new(child, child_shard, policy.clone(), preferred_az.clone()); - child_state.intent = IntentState::single(scheduler, Some(pageserver)); + child_state.intent = + IntentState::single(scheduler, Some(pageserver), preferred_az.clone()); child_state.observed = ObservedState { locations: child_observed, }; child_state.generation = Some(generation); child_state.config = config.clone(); - if let Some(preferred_az) = &preferred_az { - child_state.set_preferred_az(preferred_az.clone()); - } // The child's TenantShard::splitting is intentionally left at the default value of Idle, // as at this point in the split process we have succeeded and this part is infallible: @@ -5014,6 +5080,8 @@ impl Service { // If our new attached node was a secondary, it no longer should be. 
shard.intent.remove_secondary(scheduler, migrate_req.node_id); + shard.intent.set_attached(scheduler, Some(migrate_req.node_id)); + // If we were already attached to something, demote that to a secondary if let Some(old_attached) = old_attached { if n > 0 { @@ -5025,8 +5093,6 @@ impl Service { shard.intent.push_secondary(scheduler, old_attached); } } - - shard.intent.set_attached(scheduler, Some(migrate_req.node_id)); } PlacementPolicy::Secondary => { shard.intent.clear(scheduler); @@ -5055,6 +5121,69 @@ impl Service { Ok(TenantShardMigrateResponse {}) } + pub(crate) async fn tenant_shard_migrate_secondary( + &self, + tenant_shard_id: TenantShardId, + migrate_req: TenantShardMigrateRequest, + ) -> Result { + let waiter = { + let mut locked = self.inner.write().unwrap(); + let (nodes, tenants, scheduler) = locked.parts_mut(); + + let Some(node) = nodes.get(&migrate_req.node_id) else { + return Err(ApiError::BadRequest(anyhow::anyhow!( + "Node {} not found", + migrate_req.node_id + ))); + }; + + if !node.is_available() { + // Warn but proceed: the caller may intend to manually adjust the placement of + // a shard even if the node is down, e.g. if intervening during an incident. 
+ tracing::warn!("Migrating to unavailable node {node}"); + } + + let Some(shard) = tenants.get_mut(&tenant_shard_id) else { + return Err(ApiError::NotFound( + anyhow::anyhow!("Tenant shard not found").into(), + )); + }; + + if shard.intent.get_secondary().len() == 1 + && shard.intent.get_secondary()[0] == migrate_req.node_id + { + tracing::info!( + "Migrating secondary to {node}: intent is unchanged {:?}", + shard.intent + ); + } else if shard.intent.get_attached() == &Some(migrate_req.node_id) { + tracing::info!("Migrating secondary to {node}: already attached where we were asked to create a secondary"); + } else { + let old_secondaries = shard.intent.get_secondary().clone(); + for secondary in old_secondaries { + shard.intent.remove_secondary(scheduler, secondary); + } + + shard.intent.push_secondary(scheduler, migrate_req.node_id); + shard.sequence = shard.sequence.next(); + tracing::info!( + "Migrating secondary to {node}: new intent {:?}", + shard.intent + ); + } + + self.maybe_reconcile_shard(shard, nodes) + }; + + if let Some(waiter) = waiter { + waiter.wait_timeout(RECONCILE_TIMEOUT).await?; + } else { + tracing::info!("Migration is a no-op"); + } + + Ok(TenantShardMigrateResponse {}) + } + /// 'cancel' in this context means cancel any ongoing reconcile pub(crate) async fn tenant_shard_cancel_reconcile( &self, @@ -5256,7 +5385,8 @@ impl Service { expect_nodes.sort_by_key(|n| n.node_id); nodes.sort_by_key(|n| n.node_id); - if nodes != expect_nodes { + // Errors relating to nodes are deferred so that we don't skip the shard checks below if we have a node error + let node_result = if nodes != expect_nodes { tracing::error!("Consistency check failed on nodes."); tracing::error!( "Nodes in memory: {}", @@ -5268,10 +5398,12 @@ impl Service { serde_json::to_string(&nodes) .map_err(|e| ApiError::InternalServerError(e.into()))? 
); - return Err(ApiError::InternalServerError(anyhow::anyhow!( + Err(ApiError::InternalServerError(anyhow::anyhow!( "Node consistency failure" - ))); - } + ))) + } else { + Ok(()) + }; let mut persistent_shards = self.persistence.load_active_tenant_shards().await?; persistent_shards @@ -5279,8 +5411,18 @@ impl Service { expect_shards.sort_by_key(|tsp| (tsp.tenant_id.clone(), tsp.shard_number, tsp.shard_count)); + // Because JSON contents of persistent tenants might disagree with the fields in current `TenantConfig` + // definition, we will do an encode/decode cycle to ensure any legacy fields are dropped and any new + // fields are added, before doing a comparison. + for tsp in &mut persistent_shards { + let config: TenantConfig = serde_json::from_str(&tsp.config) + .map_err(|e| ApiError::InternalServerError(e.into()))?; + tsp.config = serde_json::to_string(&config).expect("Encoding config is infallible"); + } + if persistent_shards != expect_shards { tracing::error!("Consistency check failed on shards."); + tracing::error!( "Shards in memory: {}", serde_json::to_string(&expect_shards) @@ -5291,12 +5433,57 @@ impl Service { serde_json::to_string(&persistent_shards) .map_err(|e| ApiError::InternalServerError(e.into()))? ); + + // The total dump log lines above are useful in testing but in the field grafana will + // usually just drop them because they're so large. So we also do some explicit logging + // of just the diffs. 
+ let persistent_shards = persistent_shards + .into_iter() + .map(|tsp| (tsp.get_tenant_shard_id().unwrap(), tsp)) + .collect::>(); + let expect_shards = expect_shards + .into_iter() + .map(|tsp| (tsp.get_tenant_shard_id().unwrap(), tsp)) + .collect::>(); + for (tenant_shard_id, persistent_tsp) in &persistent_shards { + match expect_shards.get(tenant_shard_id) { + None => { + tracing::error!( + "Shard {} found in database but not in memory", + tenant_shard_id + ); + } + Some(expect_tsp) => { + if expect_tsp != persistent_tsp { + tracing::error!( + "Shard {} is inconsistent. In memory: {}, database has: {}", + tenant_shard_id, + serde_json::to_string(expect_tsp).unwrap(), + serde_json::to_string(&persistent_tsp).unwrap() + ); + } + } + } + } + + // Having already logged any differences, log any shards that simply aren't present in the database + for (tenant_shard_id, memory_tsp) in &expect_shards { + if !persistent_shards.contains_key(tenant_shard_id) { + tracing::error!( + "Shard {} found in memory but not in database: {}", + tenant_shard_id, + serde_json::to_string(memory_tsp) + .map_err(|e| ApiError::InternalServerError(e.into()))? + ); + } + } + return Err(ApiError::InternalServerError(anyhow::anyhow!( "Shard consistency failure" ))); } - Ok(()) + node_result } /// For debug/support: a JSON dump of the [`Scheduler`]. Returns a response so that @@ -5600,7 +5787,7 @@ impl Service { register_req.listen_http_port, register_req.listen_pg_addr, register_req.listen_pg_port, - register_req.availability_zone_id, + register_req.availability_zone_id.clone(), ); // TODO: idempotency if the node already exists in the database @@ -5620,8 +5807,9 @@ impl Service { .set(locked.nodes.len() as i64); tracing::info!( - "Registered pageserver {}, now have {} pageservers", + "Registered pageserver {} ({}), now have {} pageservers", register_req.node_id, + register_req.availability_zone_id, locked.nodes.len() ); Ok(()) @@ -6236,7 +6424,7 @@ impl Service { /// available. 
A return value of 0 indicates that everything is fully reconciled already. fn reconcile_all(&self) -> usize { let mut locked = self.inner.write().unwrap(); - let (nodes, tenants, _scheduler) = locked.parts_mut(); + let (nodes, tenants, scheduler) = locked.parts_mut(); let pageservers = nodes.clone(); // This function is an efficient place to update lazy statistics, since we are walking @@ -6297,6 +6485,9 @@ impl Service { } } + // Some metrics are calculated from SchedulerNode state, update these periodically + scheduler.update_metrics(); + // Process any deferred tenant drops for (tenant_id, guard) in drop_detached_tenants { self.maybe_drop_tenant(tenant_id, &mut locked, &guard); @@ -6355,6 +6546,7 @@ impl Service { // Shard was dropped between planning and execution; continue; }; + tracing::info!(tenant_shard_id=%tenant_shard_id, "Applying optimization: {optimization:?}"); if shard.apply_optimization(scheduler, optimization) { optimizations_applied += 1; if self.maybe_reconcile_shard(shard, nodes).is_some() { @@ -6385,7 +6577,13 @@ impl Service { let mut work = Vec::new(); let mut locked = self.inner.write().unwrap(); - let (nodes, tenants, scheduler) = locked.parts_mut(); + let (_nodes, tenants, scheduler) = locked.parts_mut(); + + // We are going to plan a bunch of optimisations before applying any of them, so the + // utilisation stats on nodes will be effectively stale for the >1st optimisation we + // generate. To avoid this causing unstable migrations/flapping, it's important that the + // code in TenantShard for finding optimisations uses [`NodeAttachmentSchedulingScore::disregard_utilization`] + // to ignore the utilisation component of the score. 
for (_tenant_id, schedule_context, shards) in TenantShardContextIterator::new(tenants, ScheduleMode::Speculative) @@ -6416,13 +6614,28 @@ impl Service { continue; } - // TODO: optimization calculations are relatively expensive: create some fast-path for - // the common idle case (avoiding the search on tenants that we have recently checked) + // Fast path: we may quickly identify shards that don't have any possible optimisations + if !shard.maybe_optimizable(scheduler, &schedule_context) { + if cfg!(feature = "testing") { + // Check that maybe_optimizable doesn't disagree with the actual optimization functions. + // Only do this in testing builds because it is not a correctness-critical check, so we shouldn't + // panic in prod if we hit this, or spend cycles on it in prod. + assert!(shard + .optimize_attachment(scheduler, &schedule_context) + .is_none()); + assert!(shard + .optimize_secondary(scheduler, &schedule_context) + .is_none()); + } + continue; + } + if let Some(optimization) = - // If idle, maybe ptimize attachments: if a shard has a secondary location that is preferable to + // If idle, maybe optimize attachments: if a shard has a secondary location that is preferable to // its primary location based on soft constraints, cut it over. - shard.optimize_attachment(nodes, &schedule_context) + shard.optimize_attachment(scheduler, &schedule_context) { + tracing::info!(tenant_shard_id=%shard.tenant_shard_id, "Identified optimization for attachment: {optimization:?}"); work.push((shard.tenant_shard_id, optimization)); break; } else if let Some(optimization) = @@ -6432,6 +6645,7 @@ impl Service { // in the same tenant with secondary locations on the node where they originally split. 
shard.optimize_secondary(scheduler, &schedule_context) { + tracing::info!(tenant_shard_id=%shard.tenant_shard_id, "Identified optimization for secondary: {optimization:?}"); work.push((shard.tenant_shard_id, optimization)); break; } @@ -6480,8 +6694,10 @@ impl Service { } } } - ScheduleOptimizationAction::ReplaceSecondary(_) => { - // No extra checks needed to replace a secondary: this does not interrupt client access + ScheduleOptimizationAction::ReplaceSecondary(_) + | ScheduleOptimizationAction::CreateSecondary(_) + | ScheduleOptimizationAction::RemoveSecondary(_) => { + // No extra checks needed to manage secondaries: this does not interrupt client access validated_work.push((tenant_shard_id, optimization)) } }; @@ -6553,26 +6769,35 @@ impl Service { /// we have this helper to move things along faster. #[cfg(feature = "testing")] async fn kick_secondary_download(&self, tenant_shard_id: TenantShardId) { - let (attached_node, secondary_node) = { + let (attached_node, secondaries) = { let locked = self.inner.read().unwrap(); let Some(shard) = locked.tenants.get(&tenant_shard_id) else { + tracing::warn!( + "Skipping kick of secondary download for {tenant_shard_id}: not found" + ); return; }; - let (Some(attached), Some(secondary)) = ( - shard.intent.get_attached(), - shard.intent.get_secondary().first(), - ) else { + + let Some(attached) = shard.intent.get_attached() else { + tracing::warn!( + "Skipping kick of secondary download for {tenant_shard_id}: no attached" + ); return; }; - ( - locked.nodes.get(attached).unwrap().clone(), - locked.nodes.get(secondary).unwrap().clone(), - ) + + let secondaries = shard + .intent + .get_secondary() + .iter() + .map(|n| locked.nodes.get(n).unwrap().clone()) + .collect::>(); + + (locked.nodes.get(attached).unwrap().clone(), secondaries) }; // Make remote API calls to upload + download heatmaps: we ignore errors because this is just // a 'kick' to let scheduling optimisation run more promptly. 
- attached_node + match attached_node .with_client_retries( |client| async move { client.tenant_heatmap_upload(tenant_shard_id).await }, &self.config.jwt_token, @@ -6581,22 +6806,57 @@ impl Service { SHORT_RECONCILE_TIMEOUT, &self.cancel, ) - .await; + .await + { + Some(Err(e)) => { + tracing::info!( + "Failed to upload heatmap from {attached_node} for {tenant_shard_id}: {e}" + ); + } + None => { + tracing::info!( + "Cancelled while uploading heatmap from {attached_node} for {tenant_shard_id}" + ); + } + Some(Ok(_)) => { + tracing::info!( + "Successfully uploaded heatmap from {attached_node} for {tenant_shard_id}" + ); + } + } - secondary_node - .with_client_retries( - |client| async move { - client - .tenant_secondary_download(tenant_shard_id, Some(Duration::from_secs(1))) - .await - }, - &self.config.jwt_token, - 3, - 10, - SHORT_RECONCILE_TIMEOUT, - &self.cancel, - ) - .await; + for secondary_node in secondaries { + match secondary_node + .with_client_retries( + |client| async move { + client + .tenant_secondary_download( + tenant_shard_id, + Some(Duration::from_secs(1)), + ) + .await + }, + &self.config.jwt_token, + 3, + 10, + SHORT_RECONCILE_TIMEOUT, + &self.cancel, + ) + .await + { + Some(Err(e)) => { + tracing::info!( + "Failed to download heatmap from {secondary_node} for {tenant_shard_id}: {e}" + ); + } + None => { + tracing::info!("Cancelled while downloading heatmap from {secondary_node} for {tenant_shard_id}"); + } + Some(Ok(progress)) => { + tracing::info!("Successfully downloaded heatmap from {secondary_node} for {tenant_shard_id}: {progress:?}"); + } + } + } } /// Look for shards which are oversized and in need of splitting @@ -7019,49 +7279,95 @@ impl Service { Ok(()) } - /// Create a node fill plan (pick secondaries to promote) that meets the following requirements: - /// 1. The node should be filled until it reaches the expected cluster average of - /// attached shards. If there are not enough secondaries on the node, the plan stops early. 
- /// 2. Select tenant shards to promote such that the number of attached shards is balanced - /// throughout the cluster. We achieve this by picking tenant shards from each node, - /// starting from the ones with the largest number of attached shards, until the node - /// reaches the expected cluster average. - /// 3. Avoid promoting more shards of the same tenant than required. The upper bound - /// for the number of tenants from the same shard promoted to the node being filled is: - /// shard count for the tenant divided by the number of nodes in the cluster. + /// Create a node fill plan (pick secondaries to promote), based on: + /// 1. Shards which have a secondary on this node, and this node is in their home AZ, and are currently attached to a node + /// outside their home AZ, should be migrated back here. + /// 2. If after step 1 we have not migrated enough shards for this node to have its fair share of + /// attached shards, we will promote more shards from the nodes with the most attached shards, unless + /// those shards have a home AZ that doesn't match the node we're filling. fn fill_node_plan(&self, node_id: NodeId) -> Vec { let mut locked = self.inner.write().unwrap(); - let fill_requirement = locked.scheduler.compute_fill_requirement(node_id); + let (nodes, tenants, _scheduler) = locked.parts_mut(); - let mut tids_by_node = locked - .tenants - .iter_mut() - .filter_map(|(tid, tenant_shard)| { - if !matches!( - tenant_shard.get_scheduling_policy(), - ShardSchedulingPolicy::Active - ) { - // Only include tenants in fills if they have a normal (Active) scheduling policy. We - // even exclude Essential, because moving to fill a node is not essential to keeping this - // tenant available. 
- return None; - } + let node_az = nodes + .get(&node_id) + .expect("Node must exist") + .get_availability_zone_id() + .clone(); - if tenant_shard.intent.get_secondary().contains(&node_id) { + // The tenant shard IDs that we plan to promote from secondary to attached on this node + let mut plan = Vec::new(); + + // Collect shards which do not have a preferred AZ & are elegible for moving in stage 2 + let mut free_tids_by_node: HashMap> = HashMap::new(); + + // Don't respect AZ preferences if there is only one AZ. This comes up in tests, but it could + // conceivably come up in real life if deploying a single-AZ region intentionally. + let respect_azs = nodes + .values() + .map(|n| n.get_availability_zone_id()) + .unique() + .count() + > 1; + + // Step 1: collect all shards that we are required to migrate back to this node because their AZ preference + // requires it. + for (tsid, tenant_shard) in tenants { + if !tenant_shard.intent.get_secondary().contains(&node_id) { + // Shard doesn't have a secondary on this node, ignore it. + continue; + } + + // AZ check: when filling nodes after a restart, our intent is to move _back_ the + // shards which belong on this node, not to promote shards whose scheduling preference + // would be on their currently attached node. So will avoid promoting shards whose + // home AZ doesn't match the AZ of the node we're filling. + match tenant_shard.preferred_az() { + _ if !respect_azs => { if let Some(primary) = tenant_shard.intent.get_attached() { - return Some((*primary, *tid)); + free_tids_by_node.entry(*primary).or_default().push(*tsid); } } + None => { + // Shard doesn't have an AZ preference: it is elegible to be moved, but we + // will only do so if our target shard count requires it. 
+ if let Some(primary) = tenant_shard.intent.get_attached() { + free_tids_by_node.entry(*primary).or_default().push(*tsid); + } + } + Some(az) if az == &node_az => { + // This shard's home AZ is equal to the node we're filling: it should + // be moved back to this node as part of filling, unless its currently + // attached location is also in its home AZ. + if let Some(primary) = tenant_shard.intent.get_attached() { + if nodes + .get(primary) + .expect("referenced node must exist") + .get_availability_zone_id() + != tenant_shard + .preferred_az() + .expect("tenant must have an AZ preference") + { + plan.push(*tsid) + } + } else { + plan.push(*tsid) + } + } + Some(_) => { + // This shard's home AZ is somewhere other than the node we're filling, + // it may not be moved back to this node as part of filling. Ignore it + } + } + } - None - }) - .into_group_map(); + // Step 2: also promote any AZ-agnostic shards as required to achieve the target number of attachments + let fill_requirement = locked.scheduler.compute_fill_requirement(node_id); let expected_attached = locked.scheduler.expected_attached_shard_count(); let nodes_by_load = locked.scheduler.nodes_by_attached_shard_count(); let mut promoted_per_tenant: HashMap = HashMap::new(); - let mut plan = Vec::new(); for (node_id, attached) in nodes_by_load { let available = locked.nodes.get(&node_id).is_some_and(|n| n.is_available()); @@ -7070,7 +7376,7 @@ impl Service { } if plan.len() >= fill_requirement - || tids_by_node.is_empty() + || free_tids_by_node.is_empty() || attached <= expected_attached { break; @@ -7082,7 +7388,7 @@ impl Service { let mut remove_node = false; while take > 0 { - match tids_by_node.get_mut(&node_id) { + match free_tids_by_node.get_mut(&node_id) { Some(tids) => match tids.pop() { Some(tid) => { let max_promote_for_tenant = std::cmp::max( @@ -7108,7 +7414,7 @@ impl Service { } if remove_node { - tids_by_node.remove(&node_id); + free_tids_by_node.remove(&node_id); } } @@ -7375,6 +7681,16 @@ 
impl Service { self.persistence.safekeeper_upsert(record).await } + pub(crate) async fn set_safekeeper_scheduling_policy( + &self, + id: i64, + scheduling_policy: SkSchedulingPolicy, + ) -> Result<(), DatabaseError> { + self.persistence + .set_safekeeper_scheduling_policy(id, scheduling_policy) + .await + } + pub(crate) async fn update_shards_preferred_azs( &self, req: ShardsPreferredAzsRequest, diff --git a/storage_controller/src/service/context_iterator.rs b/storage_controller/src/service/context_iterator.rs index d38010a27e..dd6913e988 100644 --- a/storage_controller/src/service/context_iterator.rs +++ b/storage_controller/src/service/context_iterator.rs @@ -43,9 +43,6 @@ impl<'a> Iterator for TenantShardContextIterator<'a> { // Accumulate the schedule context for all the shards in a tenant schedule_context.avoid(&shard.intent.all_pageservers()); - if let Some(attached) = shard.intent.get_attached() { - schedule_context.push_attached(*attached); - } tenant_shards.push(shard); if tenant_shard_id.shard_number.0 == tenant_shard_id.shard_count.count() - 1 { @@ -115,7 +112,7 @@ mod tests { assert_eq!(tenant_id, t1_id); assert_eq!(shards[0].tenant_shard_id.shard_number, ShardNumber(0)); assert_eq!(shards.len(), 1); - assert_eq!(context.attach_count(), 1); + assert_eq!(context.location_count(), 2); let (tenant_id, context, shards) = iter.next().unwrap(); assert_eq!(tenant_id, t2_id); @@ -124,13 +121,13 @@ mod tests { assert_eq!(shards[2].tenant_shard_id.shard_number, ShardNumber(2)); assert_eq!(shards[3].tenant_shard_id.shard_number, ShardNumber(3)); assert_eq!(shards.len(), 4); - assert_eq!(context.attach_count(), 4); + assert_eq!(context.location_count(), 8); let (tenant_id, context, shards) = iter.next().unwrap(); assert_eq!(tenant_id, t3_id); assert_eq!(shards[0].tenant_shard_id.shard_number, ShardNumber(0)); assert_eq!(shards.len(), 1); - assert_eq!(context.attach_count(), 1); + assert_eq!(context.location_count(), 2); for shard in tenants.values_mut() { 
shard.intent.clear(&mut scheduler); diff --git a/storage_controller/src/tenant_shard.rs b/storage_controller/src/tenant_shard.rs index c17989a316..79ed628c25 100644 --- a/storage_controller/src/tenant_shard.rs +++ b/storage_controller/src/tenant_shard.rs @@ -11,16 +11,14 @@ use crate::{ persistence::TenantShardPersistence, reconciler::{ReconcileUnits, ReconcilerConfig}, scheduler::{ - AffinityScore, AttachedShardTag, MaySchedule, RefCountUpdate, ScheduleContext, - SecondaryShardTag, + AffinityScore, AttachedShardTag, NodeSchedulingScore, NodeSecondarySchedulingScore, + RefCountUpdate, ScheduleContext, SecondaryShardTag, ShardTag, }, service::ReconcileResultRequest, }; use futures::future::{self, Either}; use itertools::Itertools; -use pageserver_api::controller_api::{ - AvailabilityZone, NodeSchedulingPolicy, PlacementPolicy, ShardSchedulingPolicy, -}; +use pageserver_api::controller_api::{AvailabilityZone, PlacementPolicy, ShardSchedulingPolicy}; use pageserver_api::{ models::{LocationConfig, LocationConfigMode, TenantConfig}, shard::{ShardIdentity, TenantShardId}, @@ -33,6 +31,7 @@ use utils::{ generation::Generation, id::NodeId, seqwait::{SeqWait, SeqWaitError}, + shard::ShardCount, sync::gate::GateGuard, }; @@ -147,45 +146,67 @@ pub(crate) struct TenantShard { // Support/debug tool: if something is going wrong or flapping with scheduling, this may // be set to a non-active state to avoid making changes while the issue is fixed. scheduling_policy: ShardSchedulingPolicy, +} + +#[derive(Clone, Debug, Serialize)] +pub(crate) struct IntentState { + attached: Option, + secondary: Vec, // We should attempt to schedule this shard in the provided AZ to // decrease chances of cross-AZ compute. 
preferred_az_id: Option, } -#[derive(Default, Clone, Debug, Serialize)] -pub(crate) struct IntentState { - attached: Option, - secondary: Vec, -} - impl IntentState { - pub(crate) fn new() -> Self { + pub(crate) fn new(preferred_az_id: Option) -> Self { Self { attached: None, secondary: vec![], + preferred_az_id, } } - pub(crate) fn single(scheduler: &mut Scheduler, node_id: Option) -> Self { + pub(crate) fn single( + scheduler: &mut Scheduler, + node_id: Option, + preferred_az_id: Option, + ) -> Self { if let Some(node_id) = node_id { - scheduler.update_node_ref_counts(node_id, RefCountUpdate::Attach); + scheduler.update_node_ref_counts( + node_id, + preferred_az_id.as_ref(), + RefCountUpdate::Attach, + ); } Self { attached: node_id, secondary: vec![], + preferred_az_id, } } pub(crate) fn set_attached(&mut self, scheduler: &mut Scheduler, new_attached: Option) { if self.attached != new_attached { if let Some(old_attached) = self.attached.take() { - scheduler.update_node_ref_counts(old_attached, RefCountUpdate::Detach); + scheduler.update_node_ref_counts( + old_attached, + self.preferred_az_id.as_ref(), + RefCountUpdate::Detach, + ); } if let Some(new_attached) = &new_attached { - scheduler.update_node_ref_counts(*new_attached, RefCountUpdate::Attach); + scheduler.update_node_ref_counts( + *new_attached, + self.preferred_az_id.as_ref(), + RefCountUpdate::Attach, + ); } self.attached = new_attached; } + + if let Some(new_attached) = &new_attached { + assert!(!self.secondary.contains(new_attached)); + } } /// Like set_attached, but the node is from [`Self::secondary`]. 
This swaps the node from @@ -204,15 +225,28 @@ impl IntentState { let demoted = self.attached; self.attached = Some(promote_secondary); - scheduler.update_node_ref_counts(promote_secondary, RefCountUpdate::PromoteSecondary); + scheduler.update_node_ref_counts( + promote_secondary, + self.preferred_az_id.as_ref(), + RefCountUpdate::PromoteSecondary, + ); if let Some(demoted) = demoted { - scheduler.update_node_ref_counts(demoted, RefCountUpdate::DemoteAttached); + scheduler.update_node_ref_counts( + demoted, + self.preferred_az_id.as_ref(), + RefCountUpdate::DemoteAttached, + ); } } pub(crate) fn push_secondary(&mut self, scheduler: &mut Scheduler, new_secondary: NodeId) { - debug_assert!(!self.secondary.contains(&new_secondary)); - scheduler.update_node_ref_counts(new_secondary, RefCountUpdate::AddSecondary); + assert!(!self.secondary.contains(&new_secondary)); + assert!(self.attached != Some(new_secondary)); + scheduler.update_node_ref_counts( + new_secondary, + self.preferred_az_id.as_ref(), + RefCountUpdate::AddSecondary, + ); self.secondary.push(new_secondary); } @@ -220,27 +254,43 @@ impl IntentState { pub(crate) fn remove_secondary(&mut self, scheduler: &mut Scheduler, node_id: NodeId) { let index = self.secondary.iter().position(|n| *n == node_id); if let Some(index) = index { - scheduler.update_node_ref_counts(node_id, RefCountUpdate::RemoveSecondary); + scheduler.update_node_ref_counts( + node_id, + self.preferred_az_id.as_ref(), + RefCountUpdate::RemoveSecondary, + ); self.secondary.remove(index); } } pub(crate) fn clear_secondary(&mut self, scheduler: &mut Scheduler) { for secondary in self.secondary.drain(..) 
{ - scheduler.update_node_ref_counts(secondary, RefCountUpdate::RemoveSecondary); + scheduler.update_node_ref_counts( + secondary, + self.preferred_az_id.as_ref(), + RefCountUpdate::RemoveSecondary, + ); } } /// Remove the last secondary node from the list of secondaries pub(crate) fn pop_secondary(&mut self, scheduler: &mut Scheduler) { if let Some(node_id) = self.secondary.pop() { - scheduler.update_node_ref_counts(node_id, RefCountUpdate::RemoveSecondary); + scheduler.update_node_ref_counts( + node_id, + self.preferred_az_id.as_ref(), + RefCountUpdate::RemoveSecondary, + ); } } pub(crate) fn clear(&mut self, scheduler: &mut Scheduler) { if let Some(old_attached) = self.attached.take() { - scheduler.update_node_ref_counts(old_attached, RefCountUpdate::Detach); + scheduler.update_node_ref_counts( + old_attached, + self.preferred_az_id.as_ref(), + RefCountUpdate::Detach, + ); } self.clear_secondary(scheduler); @@ -275,7 +325,11 @@ impl IntentState { if self.attached == Some(node_id) { self.attached = None; self.secondary.push(node_id); - scheduler.update_node_ref_counts(node_id, RefCountUpdate::DemoteAttached); + scheduler.update_node_ref_counts( + node_id, + self.preferred_az_id.as_ref(), + RefCountUpdate::DemoteAttached, + ); true } else { false @@ -315,6 +369,7 @@ pub(crate) struct ObservedStateLocation { /// we know that we might have some state on this node. pub(crate) conf: Option, } + pub(crate) struct ReconcilerWaiter { // For observability purposes, remember the ID of the shard we're // waiting for. 
@@ -360,6 +415,10 @@ pub(crate) enum ScheduleOptimizationAction { ReplaceSecondary(ReplaceSecondary), // Migrate attachment to an existing secondary location MigrateAttachment(MigrateAttachment), + // Create a secondary location, with the intent of later migrating to it + CreateSecondary(NodeId), + // Remove a secondary location that we previously created to facilitate a migration + RemoveSecondary(NodeId), } #[derive(Eq, PartialEq, Debug, Clone)] @@ -486,7 +545,7 @@ impl TenantShard { Self { tenant_shard_id, policy, - intent: IntentState::default(), + intent: IntentState::new(preferred_az_id), generation: Some(Generation::new(0)), shard, observed: ObservedState::default(), @@ -500,7 +559,6 @@ impl TenantShard { last_error: Arc::default(), pending_compute_notification: false, scheduling_policy: ShardSchedulingPolicy::default(), - preferred_az_id, } } @@ -563,7 +621,7 @@ impl TenantShard { return Ok((false, node_id)); } - if let Some(promote_secondary) = scheduler.node_preferred(&self.intent.secondary) { + if let Some(promote_secondary) = self.preferred_secondary(scheduler) { // Promote a secondary tracing::debug!("Promoted secondary {} to attached", promote_secondary); self.intent.promote_attached(scheduler, promote_secondary); @@ -572,7 +630,7 @@ impl TenantShard { // Pick a fresh node: either we had no secondaries or none were schedulable let node_id = scheduler.schedule_shard::( &self.intent.secondary, - &self.preferred_az_id, + &self.intent.preferred_az_id, context, )?; tracing::debug!("Selected {} as attached", node_id); @@ -594,9 +652,6 @@ impl TenantShard { let r = self.do_schedule(scheduler, context); context.avoid(&self.intent.all_pageservers()); - if let Some(attached) = self.intent.get_attached() { - context.push_attached(*attached); - } r } @@ -631,24 +686,7 @@ impl TenantShard { use PlacementPolicy::*; match self.policy { Attached(secondary_count) => { - let retain_secondaries = if self.intent.attached.is_none() - && 
scheduler.node_preferred(&self.intent.secondary).is_some() - { - // If we have no attached, and one of the secondaries is elegible to be promoted, retain - // one more secondary than we usually would, as one of them will become attached futher down this function. - secondary_count + 1 - } else { - secondary_count - }; - - while self.intent.secondary.len() > retain_secondaries { - // We have no particular preference for one secondary location over another: just - // arbitrarily drop from the end - self.intent.pop_secondary(scheduler); - modified = true; - } - - // Should have exactly one attached, and N secondaries + // Should have exactly one attached, and at least N secondaries let (modified_attached, attached_node_id) = self.schedule_attached(scheduler, context)?; modified |= modified_attached; @@ -657,7 +695,7 @@ impl TenantShard { while self.intent.secondary.len() < secondary_count { let node_id = scheduler.schedule_shard::( &used_pageservers, - &self.preferred_az_id, + &self.intent.preferred_az_id, context, )?; self.intent.push_secondary(scheduler, node_id); @@ -674,7 +712,7 @@ impl TenantShard { // Populate secondary by scheduling a fresh node let node_id = scheduler.schedule_shard::( &[], - &self.preferred_az_id, + &self.intent.preferred_az_id, context, )?; self.intent.push_secondary(scheduler, node_id); @@ -718,7 +756,7 @@ impl TenantShard { ) -> Result<(), ScheduleError> { let promote_to = match promote_to { Some(node) => node, - None => match scheduler.node_preferred(self.intent.get_secondary()) { + None => match self.preferred_secondary(scheduler) { Some(node) => node, None => { return Err(ScheduleError::ImpossibleConstraint); @@ -745,90 +783,276 @@ impl TenantShard { Ok(()) } + /// Returns None if the current location's score is unavailable, i.e. 
cannot draw a conclusion + fn is_better_location( + &self, + scheduler: &mut Scheduler, + schedule_context: &ScheduleContext, + current: NodeId, + candidate: NodeId, + ) -> Option { + let Some(candidate_score) = scheduler.compute_node_score::( + candidate, + &self.intent.preferred_az_id, + schedule_context, + ) else { + // The candidate node is unavailable for scheduling or otherwise couldn't get a score + return None; + }; + + match scheduler.compute_node_score::( + current, + &self.intent.preferred_az_id, + schedule_context, + ) { + Some(current_score) => { + // Ignore utilization components when comparing scores: we don't want to migrate + // because of transient load variations, it risks making the system thrash, and + // migrating for utilization requires a separate high level view of the system to + // e.g. prioritize moving larger or smaller tenants, rather than arbitrarily + // moving things around in the order that we hit this function. + let candidate_score = candidate_score.for_optimization(); + let current_score = current_score.for_optimization(); + + if candidate_score < current_score { + tracing::info!("Found a lower scoring location! {candidate} is better than {current} ({candidate_score:?} is better than {current_score:?})"); + Some(true) + } else { + // The candidate node is no better than our current location, so don't migrate + tracing::debug!( + "Candidate node {candidate} is no better than our current location {current} (candidate {candidate_score:?} vs current {current_score:?})", + ); + Some(false) + } + } + None => { + // The current node is unavailable for scheduling, so we can't make any sensible + // decisions about optimisation. This should be a transient state -- if the node + // is offline then it will get evacuated, if is blocked by a scheduling mode + // then we will respect that mode by doing nothing. 
+ tracing::debug!("Current node {current} is unavailable for scheduling"); + None + } + } + } + + fn find_better_location( + &self, + scheduler: &mut Scheduler, + schedule_context: &ScheduleContext, + current: NodeId, + hard_exclude: &[NodeId], + ) -> Option { + // Look for a lower-scoring location to attach to + let Ok(candidate_node) = scheduler.schedule_shard::( + hard_exclude, + &self.intent.preferred_az_id, + schedule_context, + ) else { + // A scheduling error means we have no possible candidate replacements + tracing::debug!("No candidate node found"); + return None; + }; + + if candidate_node == current { + // We're already at the best possible location, so don't migrate + tracing::debug!("Candidate node {candidate_node} is already in use"); + return None; + } + + self.is_better_location::(scheduler, schedule_context, current, candidate_node) + .and_then(|better| if better { Some(candidate_node) } else { None }) + } + + /// This function is an optimization, used to avoid doing large numbers of scheduling operations + /// when looking for optimizations. This function uses knowledge of how scores work to do some + /// fast checks for whether it may be possible to improve a score. + /// + /// If we return true, it only means that optimization _might_ be possible, not that it necessarily is. If we + /// return false, it definitely means that calling [`Self::optimize_attachment`] or [`Self::optimize_secondary`] would do no + /// work.
+ pub(crate) fn maybe_optimizable( + &self, + scheduler: &mut Scheduler, + schedule_context: &ScheduleContext, + ) -> bool { + // Sharded tenant: check if any locations have a nonzero affinity score + if self.shard.count >= ShardCount(1) { + let schedule_context = schedule_context.project_detach(self); + for node in self.intent.all_pageservers() { + if let Some(af) = schedule_context.nodes.get(&node) { + if *af > AffinityScore(0) { + return true; + } + } + } + } + + // Attached tenant: check if the attachment is outside the preferred AZ + if let PlacementPolicy::Attached(_) = self.policy { + if let Some(attached) = self.intent.get_attached() { + if scheduler.get_node_az(attached) != self.intent.preferred_az_id { + return true; + } + } + } + + // Tenant with secondary locations: check if any are within the preferred AZ + for secondary in self.intent.get_secondary() { + if scheduler.get_node_az(secondary) == self.intent.preferred_az_id { + return true; + } + } + + // Does the tenant have excess secondaries? + if self.intent.get_secondary().len() > self.policy.want_secondaries() { + return true; + } + + // Fall through: no optimizations possible + false + } + /// Optimize attachments: if a shard has a secondary location that is preferable to /// its primary location based on soft constraints, switch that secondary location /// to be attached. #[instrument(skip_all, fields(tenant_id=%self.tenant_shard_id.tenant_id, shard_id=%self.tenant_shard_id.shard_slug()))] pub(crate) fn optimize_attachment( &self, - nodes: &HashMap, + scheduler: &mut Scheduler, schedule_context: &ScheduleContext, ) -> Option { let attached = (*self.intent.get_attached())?; - if self.intent.secondary.is_empty() { - // We can only do useful work if we have both attached and secondary locations: this - // function doesn't schedule new locations, only swaps between attached and secondaries. 
- return None; - } - let current_affinity_score = schedule_context.get_node_affinity(attached); - let current_attachment_count = schedule_context.get_node_attachments(attached); + let schedule_context = schedule_context.project_detach(self); - // Generate score for each node, dropping any un-schedulable nodes. - let all_pageservers = self.intent.all_pageservers(); - let mut scores = all_pageservers - .iter() - .flat_map(|node_id| { - let node = nodes.get(node_id); - if node.is_none() { - None - } else if matches!( - node.unwrap().get_scheduling(), - NodeSchedulingPolicy::Filling - ) { - // If the node is currently filling, don't count it as a candidate to avoid, - // racing with the background fill. - None - } else if matches!(node.unwrap().may_schedule(), MaySchedule::No) { - None - } else { - let affinity_score = schedule_context.get_node_affinity(*node_id); - let attachment_count = schedule_context.get_node_attachments(*node_id); - Some((*node_id, affinity_score, attachment_count)) - } - }) - .collect::>(); - - // Sort precedence: - // 1st - prefer nodes with the lowest total affinity score - // 2nd - prefer nodes with the lowest number of attachments in this context - // 3rd - if all else is equal, sort by node ID for determinism in tests. - scores.sort_by_key(|i| (i.1, i.2, i.0)); - - if let Some((preferred_node, preferred_affinity_score, preferred_attachment_count)) = - scores.first() - { - if attached != *preferred_node { - // The best alternative must be more than 1 better than us, otherwise we could end - // up flapping back next time we're called (e.g. there's no point migrating from - // a location with score 1 to a score zero, because on next location the situation - // would be the same, but in reverse). 
- if current_affinity_score > *preferred_affinity_score + AffinityScore(1) - || current_attachment_count > *preferred_attachment_count + 1 - { - tracing::info!( - "Identified optimization: migrate attachment {attached}->{preferred_node} (secondaries {:?})", - self.intent.get_secondary() - ); - return Some(ScheduleOptimization { - sequence: self.sequence, - action: ScheduleOptimizationAction::MigrateAttachment(MigrateAttachment { - old_attached_node_id: attached, - new_attached_node_id: *preferred_node, - }), - }); - } - } else { - tracing::debug!( - "Node {} is already preferred (score {:?})", - preferred_node, - preferred_affinity_score - ); + // If we already have a secondary that is a better (lower-scoring) location than our current one, + // then simply migrate to it. + for secondary in self.intent.get_secondary() { + if let Some(true) = self.is_better_location::( + scheduler, + &schedule_context, + attached, + *secondary, + ) { + return Some(ScheduleOptimization { + sequence: self.sequence, + action: ScheduleOptimizationAction::MigrateAttachment(MigrateAttachment { + old_attached_node_id: attached, + new_attached_node_id: *secondary, + }), + }); } } - // Fall-through: we didn't find an optimization - None + // Given that none of our current secondaries is a better location than our current + // attached location (checked above), we may trim any secondaries that are not needed + // for the placement policy. + if self.intent.get_secondary().len() > self.policy.want_secondaries() { + // This code path cleans up extra secondaries after migrating, and/or + // trims extra secondaries after a PlacementPolicy::Attached(N) was + // modified to decrease N.
+ + let secondary_scores = self + .intent + .get_secondary() + .iter() + .map(|node_id| { + ( + *node_id, + scheduler.compute_node_score::( + *node_id, + &self.intent.preferred_az_id, + &schedule_context, + ), + ) + }) + .collect::>(); + + if secondary_scores.iter().any(|score| score.1.is_none()) { + // Don't have full list of scores, so can't make a good decision about which to drop unless + // there is an obvious one in the wrong AZ + for secondary in self.intent.get_secondary() { + if scheduler.get_node_az(secondary) == self.intent.preferred_az_id { + return Some(ScheduleOptimization { + sequence: self.sequence, + action: ScheduleOptimizationAction::RemoveSecondary(*secondary), + }); + } + } + + // Fall through: we didn't identify one to remove. This ought to be rare. + tracing::warn!("Keeping extra secondaries: can't determine which of {:?} to remove (some nodes offline?)", + self.intent.get_secondary() + ); + } else { + let victim = secondary_scores + .iter() + .max_by_key(|score| score.1.unwrap()) + .unwrap() + .0; + return Some(ScheduleOptimization { + sequence: self.sequence, + action: ScheduleOptimizationAction::RemoveSecondary(victim), + }); + } + } + + let replacement = self.find_better_location::( + scheduler, + &schedule_context, + attached, + &[], // Don't exclude secondaries: our preferred attachment location may be a secondary + ); + + // We have found a candidate and confirmed that its score is preferable + // to our current location. See if we have a secondary location in the preferred location already: if not, + // then create one. + if let Some(replacement) = replacement { + // If we are currently in non-preferred AZ, then the scheduler might suggest a location that is better, but still + // not in our preferred AZ. 
Migration has a cost in resources and an impact on the workload, so we want to avoid doing + // multiple hops where we might go to some other AZ before eventually finding a suitable location in our preferred + // AZ: skip this optimization if it is not in our final, preferred AZ. + // + // This should be a transient state, there should always be capacity eventually in our preferred AZ (even if nodes + // there are too overloaded for scheduler to suggest them, more should be provisioned eventually). + if self.intent.preferred_az_id.is_some() + && scheduler.get_node_az(&replacement) != self.intent.preferred_az_id + { + tracing::debug!( + "Candidate node {replacement} is not in preferred AZ {:?}", + self.intent.preferred_az_id + ); + + // This should only happen if our current location is not in the preferred AZ, otherwise + // [`Self::find_better_location`] should have rejected any other location outside the preferred AZ, because + // AZ is the highest priority part of NodeAttachmentSchedulingScore. + debug_assert!(scheduler.get_node_az(&attached) != self.intent.preferred_az_id); + + return None; + } + + if !self.intent.get_secondary().contains(&replacement) { + Some(ScheduleOptimization { + sequence: self.sequence, + action: ScheduleOptimizationAction::CreateSecondary(replacement), + }) + } else { + // We already have a secondary in the preferred location, let's try migrating to it. Our caller + // will check the warmth of the destination before deciding whether to really execute this. + Some(ScheduleOptimization { + sequence: self.sequence, + action: ScheduleOptimizationAction::MigrateAttachment(MigrateAttachment { + old_attached_node_id: attached, + new_attached_node_id: replacement, + }), + }) + } + } else { + // We didn't find somewhere we'd rather be, and we don't have any excess secondaries + // to clean up: no action required.
+ None + } } #[instrument(skip_all, fields(tenant_id=%self.tenant_shard_id.tenant_id, shard_id=%self.tenant_shard_id.shard_slug()))] @@ -837,50 +1061,40 @@ impl TenantShard { scheduler: &mut Scheduler, schedule_context: &ScheduleContext, ) -> Option { - if self.intent.secondary.is_empty() { - // We can only do useful work if we have both attached and secondary locations: this - // function doesn't schedule new locations, only swaps between attached and secondaries. + if self.intent.get_secondary().len() > self.policy.want_secondaries() { + // We have extra secondaries, perhaps to facilitate a migration of the attached location: + // do nothing, it is up to [`Self::optimize_attachment`] to clean them up. When that's done, + // and we are called again, we will proceed. + tracing::debug!("Too many secondaries: skipping"); return None; } + let schedule_context = schedule_context.project_detach(self); + for secondary in self.intent.get_secondary() { - let Some(affinity_score) = schedule_context.nodes.get(secondary) else { - // We're already on a node unaffected any affinity constraints, - // so we won't change it. - continue; + // Make sure we don't try to migrate a secondary to our attached location: this case happens + // easily in environments without multiple AZs. + let exclude = match self.intent.attached { + Some(attached) => vec![attached], + None => vec![], }; - // Let the scheduler suggest a node, where it would put us if we were scheduling afresh - // This implicitly limits the choice to nodes that are available, and prefers nodes - // with lower utilization. 
- let Ok(candidate_node) = scheduler.schedule_shard::( - &self.intent.all_pageservers(), - &self.preferred_az_id, - schedule_context, - ) else { - // A scheduling error means we have no possible candidate replacements - continue; - }; - - let candidate_affinity_score = schedule_context - .nodes - .get(&candidate_node) - .unwrap_or(&AffinityScore::FREE); - - // The best alternative must be more than 1 better than us, otherwise we could end - // up flapping back next time we're called. - if *candidate_affinity_score + AffinityScore(1) < *affinity_score { - // If some other node is available and has a lower score than this node, then - // that other node is a good place to migrate to. - tracing::info!( - "Identified optimization: replace secondary {secondary}->{candidate_node} (current secondaries {:?})", - self.intent.get_secondary() - ); + let replacement = self.find_better_location::( + scheduler, + &schedule_context, + *secondary, + &exclude, + ); + assert!(replacement != Some(*secondary)); + if let Some(replacement) = replacement { + // We have found a candidate and confirmed that its score is preferable + // to our current location. See if we have a secondary location in the preferred location already: if not, + // then create one. 
return Some(ScheduleOptimization { sequence: self.sequence, action: ScheduleOptimizationAction::ReplaceSecondary(ReplaceSecondary { old_node_id: *secondary, - new_node_id: candidate_node, + new_node_id: replacement, }), }); } @@ -921,11 +1135,54 @@ impl TenantShard { self.intent.remove_secondary(scheduler, old_node_id); self.intent.push_secondary(scheduler, new_node_id); } + ScheduleOptimizationAction::CreateSecondary(new_node_id) => { + self.intent.push_secondary(scheduler, new_node_id); + } + ScheduleOptimizationAction::RemoveSecondary(old_secondary) => { + self.intent.remove_secondary(scheduler, old_secondary); + } } true } + /// When a shard has several secondary locations, we need to pick one in situations where + /// we promote one of them to an attached location: + /// - When draining a node for restart + /// - When responding to a node failure + /// + /// In this context, 'preferred' does not mean the node with the best scheduling score: instead + /// we want to pick the node which is best for use _temporarily_ while the previous attached location + /// is unavailable (e.g. because it's down or deploying). That means we prefer to use secondary + /// locations in a non-preferred AZ, as they're more likely to have a warm cache than a temporary + /// secondary in the preferred AZ (which are usually only created for migrations, and if they exist + /// they're probably not warmed up yet). The latter behavior is based oni + /// + /// If the input is empty, or all the nodes are not eligible for scheduling, return None: the + /// caller needs to pick a node some other way. + pub(crate) fn preferred_secondary(&self, scheduler: &Scheduler) -> Option { + let candidates = scheduler.filter_usable_nodes(&self.intent.secondary); + + // We will sort candidates to prefer nodes which are _not_ in our preferred AZ, i.e.
we prefer + // to migrate to a long-lived secondary location (which would have been scheduled in a non-preferred AZ), + // rather than a short-lived secondary location being used for optimization/migration (which would have + // been scheduled in our preferred AZ). + let mut candidates = candidates + .iter() + .map(|(node_id, node_az)| { + if node_az == &self.intent.preferred_az_id { + (1, *node_id) + } else { + (0, *node_id) + } + }) + .collect::>(); + + candidates.sort(); + + candidates.first().map(|i| i.1) + } + /// Query whether the tenant's observed state for attached node matches its intent state, and if so, /// yield the node ID. This is appropriate for emitting compute hook notifications: we are checking that /// the node in question is not only where we intend to attach, but that the tenant is indeed already attached there. @@ -1122,10 +1379,15 @@ impl TenantShard { let result = reconciler.reconcile().await; // If we know we had a pending compute notification from some previous action, send a notification irrespective - // of whether the above reconcile() did any work + // of whether the above reconcile() did any work. It has to be Ok() though, because otherwise we might be + // sending a notification of a location that isn't really attached. if result.is_ok() && must_notify { // If this fails we will send the need to retry in [`ReconcileResult::pending_compute_notification`] reconciler.compute_notify().await.ok(); + } else if must_notify { + // Carry this flag so that the reconciler's result will indicate that it still needs to retry + // the compute hook notification eventually. 
+ reconciler.compute_notify_failure = true; } // Update result counter @@ -1202,7 +1464,7 @@ impl TenantShard { detach, reconciler_config, config: self.config.clone(), - preferred_az: self.preferred_az_id.clone(), + preferred_az: self.intent.preferred_az_id.clone(), observed: self.observed.clone(), original_observed: self.observed.clone(), compute_hook: compute_hook.clone(), @@ -1423,7 +1685,6 @@ impl TenantShard { pending_compute_notification: false, delayed_reconcile: false, scheduling_policy: serde_json::from_str(&tsp.scheduling_policy).unwrap(), - preferred_az_id: tsp.preferred_az_id.map(AvailabilityZone), }) } @@ -1439,16 +1700,16 @@ impl TenantShard { config: serde_json::to_string(&self.config).unwrap(), splitting: SplitState::default(), scheduling_policy: serde_json::to_string(&self.scheduling_policy).unwrap(), - preferred_az_id: self.preferred_az_id.as_ref().map(|az| az.0.clone()), + preferred_az_id: self.intent.preferred_az_id.as_ref().map(|az| az.0.clone()), } } pub(crate) fn preferred_az(&self) -> Option<&AvailabilityZone> { - self.preferred_az_id.as_ref() + self.intent.preferred_az_id.as_ref() } - pub(crate) fn set_preferred_az(&mut self, preferred_az_id: AvailabilityZone) { - self.preferred_az_id = Some(preferred_az_id); + pub(crate) fn set_preferred_az(&mut self, preferred_az_id: Option) { + self.intent.preferred_az_id = preferred_az_id; } /// Returns all the nodes to which this tenant shard is attached according to the @@ -1751,65 +2012,90 @@ pub(crate) mod tests { } #[test] - fn optimize_attachment() -> anyhow::Result<()> { - let nodes = make_test_nodes(3, &[]); + /// Simple case: moving attachment to somewhere better where we already have a secondary + fn optimize_attachment_simple() -> anyhow::Result<()> { + let nodes = make_test_nodes( + 3, + &[ + AvailabilityZone("az-a".to_string()), + AvailabilityZone("az-b".to_string()), + AvailabilityZone("az-c".to_string()), + ], + ); let mut scheduler = Scheduler::new(nodes.values()); let mut shard_a = 
make_test_tenant_shard(PlacementPolicy::Attached(1)); + shard_a.intent.preferred_az_id = Some(AvailabilityZone("az-a".to_string())); let mut shard_b = make_test_tenant_shard(PlacementPolicy::Attached(1)); + shard_b.intent.preferred_az_id = Some(AvailabilityZone("az-a".to_string())); // Initially: both nodes attached on shard 1, and both have secondary locations // on different nodes. - shard_a.intent.set_attached(&mut scheduler, Some(NodeId(1))); - shard_a.intent.push_secondary(&mut scheduler, NodeId(2)); + shard_a.intent.set_attached(&mut scheduler, Some(NodeId(2))); + shard_a.intent.push_secondary(&mut scheduler, NodeId(1)); shard_b.intent.set_attached(&mut scheduler, Some(NodeId(1))); - shard_b.intent.push_secondary(&mut scheduler, NodeId(3)); + shard_b.intent.push_secondary(&mut scheduler, NodeId(2)); - let mut schedule_context = ScheduleContext::default(); - schedule_context.avoid(&shard_a.intent.all_pageservers()); - schedule_context.push_attached(shard_a.intent.get_attached().unwrap()); - schedule_context.avoid(&shard_b.intent.all_pageservers()); - schedule_context.push_attached(shard_b.intent.get_attached().unwrap()); + fn make_schedule_context(shard_a: &TenantShard, shard_b: &TenantShard) -> ScheduleContext { + let mut schedule_context = ScheduleContext::default(); + schedule_context.avoid(&shard_a.intent.all_pageservers()); + schedule_context.avoid(&shard_b.intent.all_pageservers()); + schedule_context + } - let optimization_a = shard_a.optimize_attachment(&nodes, &schedule_context); - - // Either shard should recognize that it has the option to switch to a secondary location where there - // would be no other shards from the same tenant, and request to do so. 
+ let schedule_context = make_schedule_context(&shard_a, &shard_b); + let optimization_a = shard_a.optimize_attachment(&mut scheduler, &schedule_context); assert_eq!( optimization_a, Some(ScheduleOptimization { sequence: shard_a.sequence, action: ScheduleOptimizationAction::MigrateAttachment(MigrateAttachment { - old_attached_node_id: NodeId(1), - new_attached_node_id: NodeId(2) + old_attached_node_id: NodeId(2), + new_attached_node_id: NodeId(1) }) }) ); - - // Note that these optimizing two shards in the same tenant with the same ScheduleContext is - // mutually exclusive (the optimization of one invalidates the stats) -- it is the responsibility - // of [`Service::optimize_all`] to avoid trying - // to do optimizations for multiple shards in the same tenant at the same time. Generating - // both optimizations is just done for test purposes - let optimization_b = shard_b.optimize_attachment(&nodes, &schedule_context); - assert_eq!( - optimization_b, - Some(ScheduleOptimization { - sequence: shard_b.sequence, - action: ScheduleOptimizationAction::MigrateAttachment(MigrateAttachment { - old_attached_node_id: NodeId(1), - new_attached_node_id: NodeId(3) - }) - }) - ); - - // Applying these optimizations should result in the end state proposed shard_a.apply_optimization(&mut scheduler, optimization_a.unwrap()); - assert_eq!(shard_a.intent.get_attached(), &Some(NodeId(2))); - assert_eq!(shard_a.intent.get_secondary(), &vec![NodeId(1)]); - shard_b.apply_optimization(&mut scheduler, optimization_b.unwrap()); - assert_eq!(shard_b.intent.get_attached(), &Some(NodeId(3))); - assert_eq!(shard_b.intent.get_secondary(), &vec![NodeId(1)]); + + // // Either shard should recognize that it has the option to switch to a secondary location where there + // // would be no other shards from the same tenant, and request to do so. 
+ // assert_eq!( + // optimization_a_prepare, + // Some(ScheduleOptimization { + // sequence: shard_a.sequence, + // action: ScheduleOptimizationAction::CreateSecondary(NodeId(2)) + // }) + // ); + // shard_a.apply_optimization(&mut scheduler, optimization_a_prepare.unwrap()); + + // let schedule_context = make_schedule_context(&shard_a, &shard_b); + // let optimization_a_migrate = shard_a.optimize_attachment(&mut scheduler, &schedule_context); + // assert_eq!( + // optimization_a_migrate, + // Some(ScheduleOptimization { + // sequence: shard_a.sequence, + // action: ScheduleOptimizationAction::MigrateAttachment(MigrateAttachment { + // old_attached_node_id: NodeId(1), + // new_attached_node_id: NodeId(2) + // }) + // }) + // ); + // shard_a.apply_optimization(&mut scheduler, optimization_a_migrate.unwrap()); + + // let schedule_context = make_schedule_context(&shard_a, &shard_b); + // let optimization_a_cleanup = shard_a.optimize_attachment(&mut scheduler, &schedule_context); + // assert_eq!( + // optimization_a_cleanup, + // Some(ScheduleOptimization { + // sequence: shard_a.sequence, + // action: ScheduleOptimizationAction::RemoveSecondary(NodeId(1)) + // }) + // ); + // shard_a.apply_optimization(&mut scheduler, optimization_a_cleanup.unwrap()); + + // // Shard B should not be moved anywhere, since the pressure on node 1 was relieved by moving shard A + // let schedule_context = make_schedule_context(&shard_a, &shard_b); + // assert_eq!(shard_b.optimize_attachment(&mut scheduler, &schedule_context), None); shard_a.intent.clear(&mut scheduler); shard_b.intent.clear(&mut scheduler); @@ -1817,6 +2103,190 @@ pub(crate) mod tests { Ok(()) } + #[test] + /// Complicated case: moving attachment to somewhere better where we do not have a secondary + /// already, creating one as needed. 
+ fn optimize_attachment_multistep() -> anyhow::Result<()> { + let nodes = make_test_nodes( + 3, + &[ + AvailabilityZone("az-a".to_string()), + AvailabilityZone("az-b".to_string()), + AvailabilityZone("az-c".to_string()), + ], + ); + let mut scheduler = Scheduler::new(nodes.values()); + + // Two shards of a tenant that wants to be in AZ A + let mut shard_a = make_test_tenant_shard(PlacementPolicy::Attached(1)); + shard_a.intent.preferred_az_id = Some(AvailabilityZone("az-a".to_string())); + let mut shard_b = make_test_tenant_shard(PlacementPolicy::Attached(1)); + shard_b.intent.preferred_az_id = Some(AvailabilityZone("az-a".to_string())); + + // Both shards are initially attached in non-home AZ _and_ have secondaries in non-home AZs + shard_a.intent.set_attached(&mut scheduler, Some(NodeId(2))); + shard_a.intent.push_secondary(&mut scheduler, NodeId(3)); + shard_b.intent.set_attached(&mut scheduler, Some(NodeId(3))); + shard_b.intent.push_secondary(&mut scheduler, NodeId(2)); + + fn make_schedule_context(shard_a: &TenantShard, shard_b: &TenantShard) -> ScheduleContext { + let mut schedule_context = ScheduleContext::default(); + schedule_context.avoid(&shard_a.intent.all_pageservers()); + schedule_context.avoid(&shard_b.intent.all_pageservers()); + schedule_context + } + + let schedule_context = make_schedule_context(&shard_a, &shard_b); + let optimization_a_prepare = shard_a.optimize_attachment(&mut scheduler, &schedule_context); + assert_eq!( + optimization_a_prepare, + Some(ScheduleOptimization { + sequence: shard_a.sequence, + action: ScheduleOptimizationAction::CreateSecondary(NodeId(1)) + }) + ); + shard_a.apply_optimization(&mut scheduler, optimization_a_prepare.unwrap()); + + let schedule_context = make_schedule_context(&shard_a, &shard_b); + let optimization_a_migrate = shard_a.optimize_attachment(&mut scheduler, &schedule_context); + assert_eq!( + optimization_a_migrate, + Some(ScheduleOptimization { + sequence: shard_a.sequence, + action: 
ScheduleOptimizationAction::MigrateAttachment(MigrateAttachment { + old_attached_node_id: NodeId(2), + new_attached_node_id: NodeId(1) + }) + }) + ); + shard_a.apply_optimization(&mut scheduler, optimization_a_migrate.unwrap()); + + let schedule_context = make_schedule_context(&shard_a, &shard_b); + let optimization_a_cleanup = shard_a.optimize_attachment(&mut scheduler, &schedule_context); + assert_eq!( + optimization_a_cleanup, + Some(ScheduleOptimization { + sequence: shard_a.sequence, + action: ScheduleOptimizationAction::RemoveSecondary(NodeId(3)) + }) + ); + shard_a.apply_optimization(&mut scheduler, optimization_a_cleanup.unwrap()); + + // // Shard B should not be moved anywhere, since the pressure on node 1 was relieved by moving shard A + // let schedule_context = make_schedule_context(&shard_a, &shard_b); + // assert_eq!(shard_b.optimize_attachment(&mut scheduler, &schedule_context), None); + + shard_a.intent.clear(&mut scheduler); + shard_b.intent.clear(&mut scheduler); + + Ok(()) + } + + #[test] + /// Check that multi-step migration works when moving to somewhere that is only better by + /// 1 AffinityScore -- this ensures that we don't have a bug like the intermediate secondary + /// counting toward the affinity score such that it prevents the rest of the migration from happening. 
+ fn optimize_attachment_marginal() -> anyhow::Result<()> { + let nodes = make_test_nodes(2, &[]); + let mut scheduler = Scheduler::new(nodes.values()); + + // Multi-sharded tenant, we will craft a situation where affinity + // scores differ only slightly + let mut shards = make_test_tenant(PlacementPolicy::Attached(0), ShardCount::new(4), None); + + // 1 attached on node 1 + shards[0] + .intent + .set_attached(&mut scheduler, Some(NodeId(1))); + // 3 attached on node 2 + shards[1] + .intent + .set_attached(&mut scheduler, Some(NodeId(2))); + shards[2] + .intent + .set_attached(&mut scheduler, Some(NodeId(2))); + shards[3] + .intent + .set_attached(&mut scheduler, Some(NodeId(2))); + + // The scheduler should figure out that we need to: + // - Create a secondary for shard 3 on node 1 + // - Migrate shard 3 to node 1 + // - Remove shard 3's location on node 2 + + fn make_schedule_context(shards: &Vec) -> ScheduleContext { + let mut schedule_context = ScheduleContext::default(); + for shard in shards { + schedule_context.avoid(&shard.intent.all_pageservers()); + } + schedule_context + } + + let schedule_context = make_schedule_context(&shards); + let optimization_a_prepare = + shards[1].optimize_attachment(&mut scheduler, &schedule_context); + assert_eq!( + optimization_a_prepare, + Some(ScheduleOptimization { + sequence: shards[1].sequence, + action: ScheduleOptimizationAction::CreateSecondary(NodeId(1)) + }) + ); + shards[1].apply_optimization(&mut scheduler, optimization_a_prepare.unwrap()); + + let schedule_context = make_schedule_context(&shards); + let optimization_a_migrate = + shards[1].optimize_attachment(&mut scheduler, &schedule_context); + assert_eq!( + optimization_a_migrate, + Some(ScheduleOptimization { + sequence: shards[1].sequence, + action: ScheduleOptimizationAction::MigrateAttachment(MigrateAttachment { + old_attached_node_id: NodeId(2), + new_attached_node_id: NodeId(1) + }) + }) + ); + shards[1].apply_optimization(&mut scheduler, 
optimization_a_migrate.unwrap()); + + let schedule_context = make_schedule_context(&shards); + let optimization_a_cleanup = + shards[1].optimize_attachment(&mut scheduler, &schedule_context); + assert_eq!( + optimization_a_cleanup, + Some(ScheduleOptimization { + sequence: shards[1].sequence, + action: ScheduleOptimizationAction::RemoveSecondary(NodeId(2)) + }) + ); + shards[1].apply_optimization(&mut scheduler, optimization_a_cleanup.unwrap()); + + // Everything should be stable now + let schedule_context = make_schedule_context(&shards); + assert_eq!( + shards[0].optimize_attachment(&mut scheduler, &schedule_context), + None + ); + assert_eq!( + shards[1].optimize_attachment(&mut scheduler, &schedule_context), + None + ); + assert_eq!( + shards[2].optimize_attachment(&mut scheduler, &schedule_context), + None + ); + assert_eq!( + shards[3].optimize_attachment(&mut scheduler, &schedule_context), + None + ); + + for mut shard in shards { + shard.intent.clear(&mut scheduler); + } + + Ok(()) + } + #[test] fn optimize_secondary() -> anyhow::Result<()> { let nodes = make_test_nodes(4, &[]); @@ -1834,9 +2304,7 @@ pub(crate) mod tests { let mut schedule_context = ScheduleContext::default(); schedule_context.avoid(&shard_a.intent.all_pageservers()); - schedule_context.push_attached(shard_a.intent.get_attached().unwrap()); schedule_context.avoid(&shard_b.intent.all_pageservers()); - schedule_context.push_attached(shard_b.intent.get_attached().unwrap()); let optimization_a = shard_a.optimize_secondary(&mut scheduler, &schedule_context); @@ -1867,7 +2335,6 @@ pub(crate) mod tests { // called repeatedly in the background. 
// Returns the applied optimizations fn optimize_til_idle( - nodes: &HashMap, scheduler: &mut Scheduler, shards: &mut [TenantShard], ) -> Vec { @@ -1879,14 +2346,18 @@ pub(crate) mod tests { for shard in shards.iter() { schedule_context.avoid(&shard.intent.all_pageservers()); - if let Some(attached) = shard.intent.get_attached() { - schedule_context.push_attached(*attached); - } } for shard in shards.iter_mut() { - let optimization = shard.optimize_attachment(nodes, &schedule_context); + let optimization = shard.optimize_attachment(scheduler, &schedule_context); + tracing::info!( + "optimize_attachment({})={:?}", + shard.tenant_shard_id, + optimization + ); if let Some(optimization) = optimization { + // Check that maybe_optimizable wouldn't have wrongly claimed this optimization didn't exist + assert!(shard.maybe_optimizable(scheduler, &schedule_context)); optimizations.push(optimization.clone()); shard.apply_optimization(scheduler, optimization); any_changed = true; @@ -1894,7 +2365,15 @@ pub(crate) mod tests { } let optimization = shard.optimize_secondary(scheduler, &schedule_context); + tracing::info!( + "optimize_secondary({})={:?}", + shard.tenant_shard_id, + optimization + ); if let Some(optimization) = optimization { + // Check that maybe_optimizable wouldn't have wrongly claimed this optimization didn't exist + assert!(shard.maybe_optimizable(scheduler, &schedule_context)); + optimizations.push(optimization.clone()); shard.apply_optimization(scheduler, optimization); any_changed = true; @@ -1918,14 +2397,34 @@ pub(crate) mod tests { /// that it converges. 
#[test] fn optimize_add_nodes() -> anyhow::Result<()> { - let nodes = make_test_nodes(4, &[]); + let nodes = make_test_nodes( + 9, + &[ + // Initial 6 nodes + AvailabilityZone("az-a".to_string()), + AvailabilityZone("az-a".to_string()), + AvailabilityZone("az-b".to_string()), + AvailabilityZone("az-b".to_string()), + AvailabilityZone("az-c".to_string()), + AvailabilityZone("az-c".to_string()), + // Three we will add later + AvailabilityZone("az-a".to_string()), + AvailabilityZone("az-b".to_string()), + AvailabilityZone("az-c".to_string()), + ], + ); - // Only show the scheduler a couple of nodes + // Only show the scheduler two nodes in each AZ to start with let mut scheduler = Scheduler::new([].iter()); - scheduler.node_upsert(nodes.get(&NodeId(1)).unwrap()); - scheduler.node_upsert(nodes.get(&NodeId(2)).unwrap()); + for i in 1..=6 { + scheduler.node_upsert(nodes.get(&NodeId(i)).unwrap()); + } - let mut shards = make_test_tenant(PlacementPolicy::Attached(1), ShardCount::new(4), None); + let mut shards = make_test_tenant( + PlacementPolicy::Attached(1), + ShardCount::new(4), + Some(AvailabilityZone("az-a".to_string())), + ); let mut schedule_context = ScheduleContext::default(); for shard in &mut shards { assert!(shard @@ -1933,30 +2432,50 @@ pub(crate) mod tests { .is_ok()); } - // We should see equal number of locations on the two nodes. - assert_eq!(scheduler.get_node_shard_count(NodeId(1)), 4); + // Initial: attached locations land in the tenant's home AZ. 
+ assert_eq!(scheduler.get_node_shard_count(NodeId(1)), 2); assert_eq!(scheduler.get_node_attached_shard_count(NodeId(1)), 2); - - assert_eq!(scheduler.get_node_shard_count(NodeId(2)), 4); + assert_eq!(scheduler.get_node_shard_count(NodeId(2)), 2); assert_eq!(scheduler.get_node_attached_shard_count(NodeId(2)), 2); - // Add another two nodes: we should see the shards spread out when their optimize - // methods are called - scheduler.node_upsert(nodes.get(&NodeId(3)).unwrap()); - scheduler.node_upsert(nodes.get(&NodeId(4)).unwrap()); - optimize_til_idle(&nodes, &mut scheduler, &mut shards); + // Initial: secondary locations in a remote AZ + assert_eq!(scheduler.get_node_shard_count(NodeId(3)), 1); + assert_eq!(scheduler.get_node_attached_shard_count(NodeId(3)), 0); + assert_eq!(scheduler.get_node_shard_count(NodeId(4)), 1); + assert_eq!(scheduler.get_node_attached_shard_count(NodeId(4)), 0); + assert_eq!(scheduler.get_node_shard_count(NodeId(5)), 1); + assert_eq!(scheduler.get_node_attached_shard_count(NodeId(5)), 0); + assert_eq!(scheduler.get_node_shard_count(NodeId(6)), 1); + assert_eq!(scheduler.get_node_attached_shard_count(NodeId(6)), 0); - assert_eq!(scheduler.get_node_shard_count(NodeId(1)), 2); + // Add another three nodes: we should see the shards spread out when their optimize + // methods are called + scheduler.node_upsert(nodes.get(&NodeId(7)).unwrap()); + scheduler.node_upsert(nodes.get(&NodeId(8)).unwrap()); + scheduler.node_upsert(nodes.get(&NodeId(9)).unwrap()); + optimize_til_idle(&mut scheduler, &mut shards); + + // We expect one attached location was moved to the new node in the tenant's home AZ + assert_eq!(scheduler.get_node_shard_count(NodeId(7)), 1); + assert_eq!(scheduler.get_node_attached_shard_count(NodeId(7)), 1); + // The original node has one less attached shard + assert_eq!(scheduler.get_node_shard_count(NodeId(1)), 1); assert_eq!(scheduler.get_node_attached_shard_count(NodeId(1)), 1); + // One of the original nodes still has two 
attachments, since there are an odd number of nodes assert_eq!(scheduler.get_node_shard_count(NodeId(2)), 2); - assert_eq!(scheduler.get_node_attached_shard_count(NodeId(2)), 1); + assert_eq!(scheduler.get_node_attached_shard_count(NodeId(2)), 2); - assert_eq!(scheduler.get_node_shard_count(NodeId(3)), 2); - assert_eq!(scheduler.get_node_attached_shard_count(NodeId(3)), 1); - - assert_eq!(scheduler.get_node_shard_count(NodeId(4)), 2); - assert_eq!(scheduler.get_node_attached_shard_count(NodeId(4)), 1); + // None of our secondaries moved, since we already had enough nodes for those to be + // scheduled perfectly + assert_eq!(scheduler.get_node_shard_count(NodeId(3)), 1); + assert_eq!(scheduler.get_node_attached_shard_count(NodeId(3)), 0); + assert_eq!(scheduler.get_node_shard_count(NodeId(4)), 1); + assert_eq!(scheduler.get_node_attached_shard_count(NodeId(4)), 0); + assert_eq!(scheduler.get_node_shard_count(NodeId(5)), 1); + assert_eq!(scheduler.get_node_attached_shard_count(NodeId(5)), 0); + assert_eq!(scheduler.get_node_shard_count(NodeId(6)), 1); + assert_eq!(scheduler.get_node_attached_shard_count(NodeId(6)), 0); for shard in shards.iter_mut() { shard.intent.clear(&mut scheduler); @@ -1996,10 +2515,10 @@ pub(crate) mod tests { shard.schedule(&mut scheduler, context).unwrap(); } - let applied_to_a = optimize_til_idle(&nodes, &mut scheduler, &mut a); + let applied_to_a = optimize_til_idle(&mut scheduler, &mut a); assert_eq!(applied_to_a, vec![]); - let applied_to_b = optimize_til_idle(&nodes, &mut scheduler, &mut b); + let applied_to_b = optimize_til_idle(&mut scheduler, &mut b); assert_eq!(applied_to_b, vec![]); for shard in a.iter_mut().chain(b.iter_mut()) { diff --git a/test_runner/fixtures/metrics.py b/test_runner/fixtures/metrics.py index c5295360c3..fa541bad17 100644 --- a/test_runner/fixtures/metrics.py +++ b/test_runner/fixtures/metrics.py @@ -131,7 +131,6 @@ PAGESERVER_GLOBAL_METRICS: tuple[str, ...] 
= ( "pageserver_getpage_reconstruct_seconds_sum", *[f"pageserver_basebackup_query_seconds_{x}" for x in ["bucket", "count", "sum"]], *histogram("pageserver_smgr_query_seconds_global"), - *histogram("pageserver_layers_visited_per_read_global"), *histogram("pageserver_getpage_get_reconstruct_data_seconds"), *histogram("pageserver_wait_lsn_seconds"), *histogram("pageserver_remote_operation_seconds"), diff --git a/test_runner/fixtures/neon_fixtures.py b/test_runner/fixtures/neon_fixtures.py index e22e452a52..a01cb47984 100644 --- a/test_runner/fixtures/neon_fixtures.py +++ b/test_runner/fixtures/neon_fixtures.py @@ -370,6 +370,7 @@ class NeonEnvBuilder: pageserver_config_override: str | Callable[[dict[str, Any]], None] | None = None, num_safekeepers: int = 1, num_pageservers: int = 1, + num_azs: int = 1, # Use non-standard SK ids to check for various parsing bugs safekeepers_id_start: int = 0, # fsync is disabled by default to make the tests go faster @@ -401,6 +402,7 @@ class NeonEnvBuilder: self.pageserver_config_override = pageserver_config_override self.num_safekeepers = num_safekeepers self.num_pageservers = num_pageservers + self.num_azs = num_azs self.safekeepers_id_start = safekeepers_id_start self.safekeepers_enable_fsync = safekeepers_enable_fsync self.auth_enabled = auth_enabled @@ -990,6 +992,7 @@ class NeonEnv: self.endpoints = EndpointFactory(self) self.safekeepers: list[Safekeeper] = [] self.pageservers: list[NeonPageserver] = [] + self.num_azs = config.num_azs self.broker = NeonBroker(self) self.pageserver_remote_storage = config.pageserver_remote_storage self.safekeepers_remote_storage = config.safekeepers_remote_storage @@ -1090,14 +1093,21 @@ class NeonEnv: http=self.port_distributor.get_port(), ) + # Availabilty zones may also be configured manually with `NeonEnvBuilder.pageserver_config_override` + if self.num_azs > 1: + # Round-robin assignment of AZ names like us-east-2a, us-east-2b, etc. 
+ az_prefix = DEFAULT_AZ_ID[:-1] + availability_zone = f"{az_prefix}{chr(ord('a') + (ps_id - 1) % self.num_azs)}" + else: + availability_zone = DEFAULT_AZ_ID + ps_cfg: dict[str, Any] = { "id": ps_id, "listen_pg_addr": f"localhost:{pageserver_port.pg}", "listen_http_addr": f"localhost:{pageserver_port.http}", "pg_auth_type": pg_auth_type, "http_auth_type": http_auth_type, - # Default which can be overriden with `NeonEnvBuilder.pageserver_config_override` - "availability_zone": DEFAULT_AZ_ID, + "availability_zone": availability_zone, # Disable pageserver disk syncs in tests: when running tests concurrently, this avoids # the pageserver taking a long time to start up due to syncfs flushing other tests' data "no_sync": True, @@ -1884,7 +1894,10 @@ class NeonStorageController(MetricsGetter, LogUtils): ) return response.json() - def tenant_list(self): + def tenant_shard_dump(self): + """ + Debug listing API: dumps the internal map of tenant shards + """ response = self.request( "GET", f"{self.api}/debug/v1/tenant", @@ -1892,6 +1905,18 @@ class NeonStorageController(MetricsGetter, LogUtils): ) return response.json() + def tenant_list(self, **kwargs): + """ + Control API tenant listing: a vector of the same content returned by tenant_describe + """ + response = self.request( + "GET", + f"{self.api}/control/v1/tenant", + headers=self.headers(TokenScope.ADMIN), + params=kwargs, + ) + return response.json() + def node_configure(self, node_id, body: dict[str, Any]): log.info(f"node_configure({node_id}, {body})") body["node_id"] = node_id @@ -2238,7 +2263,7 @@ class NeonStorageController(MetricsGetter, LogUtils): """ Get the intent and observed placements of all tenants known to the storage controller. 
""" - tenants = self.tenant_list() + tenants = self.tenant_shard_dump() tenant_placement: defaultdict[str, dict[str, Any]] = defaultdict( lambda: { @@ -2321,6 +2346,14 @@ class NeonStorageController(MetricsGetter, LogUtils): json=body, ) + def safekeeper_scheduling_policy(self, id: int, scheduling_policy: str): + self.request( + "POST", + f"{self.api}/control/v1/safekeeper/{id}/scheduling_policy", + headers=self.headers(TokenScope.ADMIN), + json={"id": id, "scheduling_policy": scheduling_policy}, + ) + def get_safekeeper(self, id: int) -> dict[str, Any] | None: try: response = self.request( @@ -4120,7 +4153,7 @@ class Endpoint(PgProtocol, LogUtils): # Checkpoints running endpoint and returns pg_wal size in MB. def get_pg_wal_size(self): - log.info(f'checkpointing at LSN {self.safe_psql("select pg_current_wal_lsn()")[0][0]}') + log.info(f"checkpointing at LSN {self.safe_psql('select pg_current_wal_lsn()')[0][0]}") self.safe_psql("checkpoint") assert self.pgdata_dir is not None # please mypy return get_dir_size(self.pgdata_dir / "pg_wal") / 1024 / 1024 @@ -4960,7 +4993,7 @@ def logical_replication_sync( if res: log.info(f"subscriber_lsn={res}") subscriber_lsn = Lsn(res) - log.info(f"Subscriber LSN={subscriber_lsn}, publisher LSN={ publisher_lsn}") + log.info(f"Subscriber LSN={subscriber_lsn}, publisher LSN={publisher_lsn}") if subscriber_lsn >= publisher_lsn: return subscriber_lsn time.sleep(0.5) diff --git a/test_runner/fixtures/pageserver/http.py b/test_runner/fixtures/pageserver/http.py index 378e568622..364aff325d 100644 --- a/test_runner/fixtures/pageserver/http.py +++ b/test_runner/fixtures/pageserver/http.py @@ -15,7 +15,6 @@ from requests.adapters import HTTPAdapter from urllib3.util.retry import Retry from fixtures.common_types import ( - Id, Lsn, TenantId, TenantShardId, @@ -25,7 +24,7 @@ from fixtures.common_types import ( from fixtures.log_helper import log from fixtures.metrics import Metrics, MetricsGetter, parse_metrics from fixtures.pg_version import 
PgVersion -from fixtures.utils import Fn +from fixtures.utils import EnhancedJSONEncoder, Fn class PageserverApiException(Exception): @@ -83,14 +82,6 @@ class TimelineCreateRequest: mode: TimelineCreateRequestMode def to_json(self) -> str: - class EnhancedJSONEncoder(json.JSONEncoder): - def default(self, o): - if dataclasses.is_dataclass(o) and not isinstance(o, type): - return dataclasses.asdict(o) - elif isinstance(o, Id): - return o.id.hex() - return super().default(o) - # mode is flattened this = dataclasses.asdict(self) mode = this.pop("mode") diff --git a/test_runner/fixtures/safekeeper/http.py b/test_runner/fixtures/safekeeper/http.py index 286f80ba69..493ce7334e 100644 --- a/test_runner/fixtures/safekeeper/http.py +++ b/test_runner/fixtures/safekeeper/http.py @@ -10,7 +10,7 @@ import requests from fixtures.common_types import Lsn, TenantId, TenantTimelineId, TimelineId from fixtures.log_helper import log from fixtures.metrics import Metrics, MetricsGetter, parse_metrics -from fixtures.utils import wait_until +from fixtures.utils import EnhancedJSONEncoder, wait_until if TYPE_CHECKING: from typing import Any @@ -25,6 +25,7 @@ class Walreceiver: @dataclass class SafekeeperTimelineStatus: + mconf: Configuration | None term: int last_log_term: int pg_version: int # Not exactly a PgVersion, safekeeper returns version as int, for example 150002 for 15.2 @@ -69,6 +70,56 @@ class TermBumpResponse: ) +@dataclass +class SafekeeperId: + id: int + host: str + pg_port: int + + +@dataclass +class Configuration: + generation: int + members: list[SafekeeperId] + new_members: list[SafekeeperId] | None + + @classmethod + def from_json(cls, d: dict[str, Any]) -> Configuration: + generation = d["generation"] + members = d["members"] + new_members = d.get("new_members") + return Configuration(generation, members, new_members) + + def to_json(self) -> str: + return json.dumps(self, cls=EnhancedJSONEncoder) + + +@dataclass +class TimelineCreateRequest: + tenant_id: TenantId + 
timeline_id: TimelineId + mconf: Configuration + # not exactly PgVersion, for example 150002 for 15.2 + pg_version: int + start_lsn: Lsn + commit_lsn: Lsn | None + + def to_json(self) -> str: + return json.dumps(self, cls=EnhancedJSONEncoder) + + +@dataclass +class TimelineMembershipSwitchResponse: + previous_conf: Configuration + current_conf: Configuration + + @classmethod + def from_json(cls, d: dict[str, Any]) -> TimelineMembershipSwitchResponse: + previous_conf = Configuration.from_json(d["previous_conf"]) + current_conf = Configuration.from_json(d["current_conf"]) + return TimelineMembershipSwitchResponse(previous_conf, current_conf) + + class SafekeeperHttpClient(requests.Session, MetricsGetter): HTTPError = requests.HTTPError @@ -131,20 +182,8 @@ class SafekeeperHttpClient(requests.Session, MetricsGetter): resj = res.json() return [TenantTimelineId.from_json(ttidj) for ttidj in resj] - def timeline_create( - self, - tenant_id: TenantId, - timeline_id: TimelineId, - pg_version: int, # Not exactly a PgVersion, safekeeper returns version as int, for example 150002 for 15.2 - commit_lsn: Lsn, - ): - body = { - "tenant_id": str(tenant_id), - "timeline_id": str(timeline_id), - "pg_version": pg_version, - "commit_lsn": str(commit_lsn), - } - res = self.post(f"http://localhost:{self.port}/v1/tenant/timeline", json=body) + def timeline_create(self, r: TimelineCreateRequest): + res = self.post(f"http://localhost:{self.port}/v1/tenant/timeline", data=r.to_json()) res.raise_for_status() def timeline_status( @@ -154,7 +193,10 @@ class SafekeeperHttpClient(requests.Session, MetricsGetter): res.raise_for_status() resj = res.json() walreceivers = [Walreceiver(wr["conn_id"], wr["status"]) for wr in resj["walreceivers"]] + # It is always normally not None, it is allowed only to make forward compat tests happy. 
+ mconf = Configuration.from_json(resj["mconf"]) if "mconf" in resj else None return SafekeeperTimelineStatus( + mconf=mconf, term=resj["acceptor_state"]["term"], last_log_term=resj["acceptor_state"]["epoch"], pg_version=resj["pg_info"]["pg_version"], @@ -180,6 +222,11 @@ class SafekeeperHttpClient(requests.Session, MetricsGetter): def get_commit_lsn(self, tenant_id: TenantId, timeline_id: TimelineId) -> Lsn: return self.timeline_status(tenant_id, timeline_id).commit_lsn + # Get timeline membership configuration. + def get_membership(self, tenant_id: TenantId, timeline_id: TimelineId) -> Configuration: + # make mypy happy + return self.timeline_status(tenant_id, timeline_id).mconf # type: ignore + # only_local doesn't remove segments in the remote storage. def timeline_delete( self, tenant_id: TenantId, timeline_id: TimelineId, only_local: bool = False @@ -226,6 +273,16 @@ class SafekeeperHttpClient(requests.Session, MetricsGetter): assert isinstance(res_json, dict) return res_json + def membership_switch( + self, tenant_id: TenantId, timeline_id: TimelineId, to: Configuration + ) -> TimelineMembershipSwitchResponse: + res = self.post( + f"http://localhost:{self.port}/v1/tenant/{tenant_id}/timeline/{timeline_id}/membership", + data=to.to_json(), + ) + res.raise_for_status() + return TimelineMembershipSwitchResponse.from_json(res.json()) + def copy_timeline(self, tenant_id: TenantId, timeline_id: TimelineId, body: dict[str, Any]): res = self.post( f"http://localhost:{self.port}/v1/tenant/{tenant_id}/timeline/{timeline_id}/copy", diff --git a/test_runner/fixtures/utils.py b/test_runner/fixtures/utils.py index c34ac298d1..e160c617cd 100644 --- a/test_runner/fixtures/utils.py +++ b/test_runner/fixtures/utils.py @@ -1,6 +1,7 @@ from __future__ import annotations import contextlib +import dataclasses import json import os import re @@ -21,6 +22,7 @@ import zstandard from psycopg2.extensions import cursor from typing_extensions import override +from fixtures.common_types 
import Id, Lsn from fixtures.log_helper import log from fixtures.pageserver.common_types import ( parse_delta_layer, @@ -605,6 +607,22 @@ class PropagatingThread(threading.Thread): return self.ret +class EnhancedJSONEncoder(json.JSONEncoder): + """ + Default json.JSONEncoder works only on primitive builtins. Extend it to any + dataclass plus our custom types. + """ + + def default(self, o): + if dataclasses.is_dataclass(o) and not isinstance(o, type): + return dataclasses.asdict(o) + elif isinstance(o, Id): + return o.id.hex() + elif isinstance(o, Lsn): + return str(o) # standard hex notation + return super().default(o) + + def human_bytes(amt: float) -> str: """ Render a bytes amount into nice IEC bytes string. diff --git a/test_runner/fixtures/workload.py b/test_runner/fixtures/workload.py index 1b8c9fef44..eea0ec2b95 100644 --- a/test_runner/fixtures/workload.py +++ b/test_runner/fixtures/workload.py @@ -53,6 +53,22 @@ class Workload: self._endpoint: Endpoint | None = None self._endpoint_opts = endpoint_opts or {} + def branch( + self, + timeline_id: TimelineId, + branch_name: str | None = None, + endpoint_opts: dict[str, Any] | None = None, + ) -> Workload: + """ + Checkpoint the current status of the workload in case of branching + """ + branch_workload = Workload( + self.env, self.tenant_id, timeline_id, branch_name, endpoint_opts + ) + branch_workload.expect_rows = self.expect_rows + branch_workload.churn_cursor = self.churn_cursor + return branch_workload + def reconfigure(self) -> None: """ Request the endpoint to reconfigure based on location reported by storage controller diff --git a/test_runner/regress/test_parallel_copy.py b/test_runner/performance/test_parallel_copy.py similarity index 100% rename from test_runner/regress/test_parallel_copy.py rename to test_runner/performance/test_parallel_copy.py diff --git a/test_runner/performance/test_sharding_autosplit.py b/test_runner/performance/test_sharding_autosplit.py index caa89955e3..76c3ad01a4 100644 
--- a/test_runner/performance/test_sharding_autosplit.py +++ b/test_runner/performance/test_sharding_autosplit.py @@ -2,6 +2,7 @@ from __future__ import annotations import concurrent.futures import re +import threading from pathlib import Path import pytest @@ -188,7 +189,20 @@ def test_sharding_autosplit(neon_env_builder: NeonEnvBuilder, pg_bin: PgBin): check_pgbench_output(out_path) - with concurrent.futures.ThreadPoolExecutor(max_workers=tenant_count) as pgbench_threads: + stop_pump = threading.Event() + + def pump_controller(): + # Run a background loop to force the storage controller to run its + # background work faster than it otherwise would: this helps + # us: + # A) to create a test that runs in a shorter time + # B) to create a test that is more intensive by doing the shard migrations + # after splits happen more rapidly. + while not stop_pump.is_set(): + env.storage_controller.reconcile_all() + stop_pump.wait(0.1) + + with concurrent.futures.ThreadPoolExecutor(max_workers=tenant_count + 1) as pgbench_threads: pgbench_futs = [] for tenant_state in tenants.values(): fut = pgbench_threads.submit(run_pgbench_init, tenant_state.endpoint) @@ -198,6 +212,8 @@ def test_sharding_autosplit(neon_env_builder: NeonEnvBuilder, pg_bin: PgBin): for fut in pgbench_futs: fut.result() + pump_fut = pgbench_threads.submit(pump_controller) + pgbench_futs = [] for tenant_state in tenants.values(): fut = pgbench_threads.submit(run_pgbench_main, tenant_state.endpoint) @@ -207,6 +223,9 @@ def test_sharding_autosplit(neon_env_builder: NeonEnvBuilder, pg_bin: PgBin): for fut in pgbench_futs: fut.result() + stop_pump.set() + pump_fut.result() + def assert_all_split(): for tenant_id in tenants.keys(): shards = tenant_get_shards(env, tenant_id) diff --git a/test_runner/performance/test_storage_controller_scale.py b/test_runner/performance/test_storage_controller_scale.py index 49f41483ec..d45db28c78 100644 --- a/test_runner/performance/test_storage_controller_scale.py +++ 
b/test_runner/performance/test_storage_controller_scale.py @@ -13,11 +13,13 @@ from fixtures.log_helper import log from fixtures.neon_fixtures import ( NeonEnv, NeonEnvBuilder, + NeonPageserver, PageserverAvailability, PageserverSchedulingPolicy, ) from fixtures.pageserver.http import PageserverApiException, PageserverHttpClient from fixtures.pg_version import PgVersion +from fixtures.utils import wait_until def get_consistent_node_shard_counts(env: NeonEnv, total_shards) -> defaultdict[str, int]: @@ -85,8 +87,12 @@ def test_storage_controller_many_tenants( ) AZS = ["alpha", "bravo", "charlie"] + + def az_selector(node_id): + return f"az-{AZS[(node_id - 1) % len(AZS)]}" + neon_env_builder.pageserver_config_override = lambda ps_cfg: ps_cfg.update( - {"availability_zone": f"az-{AZS[ps_cfg['id'] % len(AZS)]}"} + {"availability_zone": az_selector(ps_cfg["id"])} ) # A small sleep on each call into the notify hook, to simulate the latency of doing a database write @@ -168,6 +174,31 @@ def test_storage_controller_many_tenants( log.info(f"Resident memory: {rss} ({ rss / total_shards} per shard)") assert rss < expect_memory_per_shard * total_shards + def assert_all_tenants_scheduled_in_home_az(): + for tenant_id in tenant_ids: + desc = env.storage_controller.tenant_describe(tenant_id) + preferred_az = None + for shard in desc["shards"]: + # All shards in a tenant should have the same preferred AZ + if preferred_az is None: + preferred_az = shard["preferred_az_id"] + else: + assert preferred_az == shard["preferred_az_id"] + + # Attachment should be in the preferred AZ + assert shard["preferred_az_id"] == az_selector( + shard["node_attached"] + ), f"Shard {shard['tenant_shard_id']} not in {shard['preferred_az_id']}" + + # Secondary locations should not be in the preferred AZ + for node_secondary in shard["node_secondary"]: + assert ( + shard["preferred_az_id"] != az_selector(node_secondary) + ), f"Shard {shard['tenant_shard_id']} secondary should be in 
{shard['preferred_az_id']}" + + # There should only be one secondary location (i.e. no migrations in flight) + assert len(shard["node_secondary"]) == 1 + # Issue more concurrent operations than the storage controller's reconciler concurrency semaphore # permits, to ensure that we are exercising stressing that. api_concurrency = 135 @@ -242,6 +273,22 @@ def test_storage_controller_many_tenants( f"Created {len(tenants_with_timelines)} timelines in {time.time() - t1}, {len(tenants_with_timelines) / (time.time() - t1)}/s" ) + # Check initial scheduling + assert_all_tenants_scheduled_in_home_az() + az_attached_counts: defaultdict[str, int] = defaultdict(int) + az_secondary_counts: defaultdict[str, int] = defaultdict(int) + node_attached_counts: defaultdict[str, int] = defaultdict(int) + for tenant_id in tenants.keys(): + desc = env.storage_controller.tenant_describe(tenant_id) + for shard in desc["shards"]: + az_attached_counts[az_selector(shard["node_attached"])] += 1 + node_attached_counts[shard["node_attached"]] += 1 + for node_secondary in shard["node_secondary"]: + az_secondary_counts[az_selector(node_secondary)] += 1 + + log.info(f"Initial node attached counts: {node_attached_counts}") + log.info(f"Initial AZ shard counts: {az_attached_counts}, {az_secondary_counts}") + # Plan operations: ensure each tenant with a timeline gets at least # one of each operation type. Then add other tenants to make up the # numbers. @@ -450,11 +497,77 @@ def test_storage_controller_many_tenants( env.storage_controller.reconcile_until_idle(max_interval=0.1, timeout_secs=120) env.storage_controller.consistency_check() + # Since we did `reconcile_until_idle` during the above loop, the system should be left in + # an optimally scheduled state. Validate that this includes all the tenants being scheduled + # in their home AZ. 
+ assert_all_tenants_scheduled_in_home_az() + # Consistency check is safe here: restarting pageservers should not have caused any Reconcilers to spawn, # as they were not offline long enough to trigger any scheduling changes. env.storage_controller.consistency_check() check_memory() + # Simulate loss of an AZ + victim_az = "az-alpha" + killed_pageservers = [] + for ps in env.pageservers: + if az_selector(ps.id) == victim_az: + ps.stop(immediate=True) + killed_pageservers.append(ps) + log.info(f"Killed pageserver {ps.id}") + + assert killed_pageservers + + # Wait for the controller to notice the pageservers are dead + def assert_pageservers_availability( + pageservers: list[NeonPageserver], expected_availability: PageserverAvailability + ): + nodes = env.storage_controller.nodes() + checked_any = False + node_ids = [ps.id for ps in pageservers] + for node in nodes: + if node["id"] in node_ids: + checked_any = True + assert ( + node["availability"] == expected_availability + ), f"Node {node['id']} is not {expected_availability} yet: {node['availability']}" + + assert checked_any + + wait_until( + lambda: assert_pageservers_availability(killed_pageservers, PageserverAvailability.OFFLINE), + timeout=60, + ) + + # Let the controller finish all its rescheduling + env.storage_controller.reconcile_until_idle(max_interval=0.1, timeout_secs=120) + + # Check that all the tenants are rescheduled to the remaining pageservers + for tenant_id in tenant_ids: + desc = env.storage_controller.tenant_describe(tenant_id) + for shard in desc["shards"]: + # Attachment should be outside the AZ where we killed the pageservers + assert ( + az_selector(shard["node_attached"]) != victim_az + ), f"Shard {shard['tenant_shard_id']} still in {victim_az} (node {shard['node_attached']})" + + # Bring back the pageservers + for ps in killed_pageservers: + ps.start() + + wait_until( + lambda: assert_pageservers_availability(killed_pageservers, PageserverAvailability.ACTIVE), + timeout=60, + ) + + # A 
very long timeout is required: we will be migrating all the tenants on all the pageservers + # in the region that we just restored. Assume it'll take up to twice as long as it took to fill + # a single node + env.storage_controller.reconcile_until_idle( + max_interval=0.1, timeout_secs=DRAIN_FILL_TIMEOUT * 4 + ) + assert_all_tenants_scheduled_in_home_az() + # Stop the storage controller before tearing down fixtures, because it otherwise might log # errors trying to call our `ComputeReconfigure`. env.storage_controller.stop() diff --git a/test_runner/regress/test_compaction.py b/test_runner/regress/test_compaction.py index ae48a8fc27..d0a2349ccf 100644 --- a/test_runner/regress/test_compaction.py +++ b/test_runner/regress/test_compaction.py @@ -84,9 +84,6 @@ page_cache_size=10 log.info("Checking layer access metrics ...") layer_access_metric_names = [ - "pageserver_layers_visited_per_read_global_sum", - "pageserver_layers_visited_per_read_global_count", - "pageserver_layers_visited_per_read_global_bucket", "pageserver_layers_visited_per_vectored_read_global_sum", "pageserver_layers_visited_per_vectored_read_global_count", "pageserver_layers_visited_per_vectored_read_global_bucket", @@ -97,12 +94,6 @@ page_cache_size=10 layer_access_metrics = metrics.query_all(name) log.info(f"Got metrics: {layer_access_metrics}") - non_vectored_sum = metrics.query_one("pageserver_layers_visited_per_read_global_sum") - non_vectored_count = metrics.query_one("pageserver_layers_visited_per_read_global_count") - if non_vectored_count.value != 0: - non_vectored_average = non_vectored_sum.value / non_vectored_count.value - else: - non_vectored_average = 0 vectored_sum = metrics.query_one("pageserver_layers_visited_per_vectored_read_global_sum") vectored_count = metrics.query_one("pageserver_layers_visited_per_vectored_read_global_count") if vectored_count.value > 0: @@ -113,16 +104,19 @@ page_cache_size=10 assert vectored_sum.value == 0 vectored_average = 0 - 
log.info(f"{non_vectored_average=} {vectored_average=}") + log.info(f"{vectored_average=}") # The upper bound for average number of layer visits below (8) # was chosen empirically for this workload. - assert non_vectored_average < 8 assert vectored_average < 8 @skip_in_debug_build("only run with release build") -def test_pageserver_gc_compaction_smoke(neon_env_builder: NeonEnvBuilder): +@pytest.mark.parametrize( + "with_branches", + ["with_branches", "no_branches"], +) +def test_pageserver_gc_compaction_smoke(neon_env_builder: NeonEnvBuilder, with_branches: str): SMOKE_CONF = { # Run both gc and gc-compaction. "gc_period": "5s", @@ -153,12 +147,17 @@ def test_pageserver_gc_compaction_smoke(neon_env_builder: NeonEnvBuilder): log.info("Writing initial data ...") workload.write_rows(row_count, env.pageserver.id) + child_workloads: list[Workload] = [] + for i in range(1, churn_rounds + 1): if i % 10 == 0: log.info(f"Running churn round {i}/{churn_rounds} ...") - - if (i - 1) % 10 == 0: - # Run gc-compaction every 10 rounds to ensure the test doesn't take too long time. + if i % 10 == 5 and with_branches == "with_branches": + branch_name = f"child-{i}" + branch_timeline_id = env.create_branch(branch_name) + child_workloads.append(workload.branch(branch_timeline_id, branch_name)) + if (i - 1) % 10 == 0 or (i - 1) % 10 == 1: + # Run gc-compaction twice every 10 rounds to ensure the test doesn't take too long time. ps_http.timeline_compact( tenant_id, timeline_id, @@ -189,6 +188,9 @@ def test_pageserver_gc_compaction_smoke(neon_env_builder: NeonEnvBuilder): log.info("Validating at workload end ...") workload.validate(env.pageserver.id) + for child_workload in child_workloads: + log.info(f"Validating at branch {child_workload.branch_name}") + child_workload.validate(env.pageserver.id) # Run a legacy compaction+gc to ensure gc-compaction can coexist with legacy compaction. 
ps_http.timeline_checkpoint(tenant_id, timeline_id, wait_until_uploaded=True) diff --git a/test_runner/regress/test_compatibility.py b/test_runner/regress/test_compatibility.py index ba7305148f..a6eaaf6c4c 100644 --- a/test_runner/regress/test_compatibility.py +++ b/test_runner/regress/test_compatibility.py @@ -141,11 +141,18 @@ def test_create_snapshot( neon_env_builder.num_safekeepers = 3 neon_env_builder.enable_pageserver_remote_storage(RemoteStorageKind.LOCAL_FS) - env = neon_env_builder.init_start() + env = neon_env_builder.init_start( + initial_tenant_conf={ + # Miniature layers to enable generating non-trivial layer map without writing lots of data + "checkpoint_distance": f"{128 * 1024}", + "compaction_threshold": "1", + "compaction_target_size": f"{128 * 1024}", + } + ) endpoint = env.endpoints.create_start("main") - pg_bin.run_capture(["pgbench", "--initialize", "--scale=10", endpoint.connstr()]) - pg_bin.run_capture(["pgbench", "--time=60", "--progress=2", endpoint.connstr()]) + pg_bin.run_capture(["pgbench", "--initialize", "--scale=1", endpoint.connstr()]) + pg_bin.run_capture(["pgbench", "--time=30", "--progress=2", endpoint.connstr()]) pg_bin.run_capture( ["pg_dumpall", f"--dbname={endpoint.connstr()}", f"--file={test_output_dir / 'dump.sql'}"] ) @@ -157,7 +164,9 @@ def test_create_snapshot( pageserver_http = env.pageserver.http_client() flush_ep_to_pageserver(env, endpoint, tenant_id, timeline_id) - pageserver_http.timeline_checkpoint(tenant_id, timeline_id, wait_until_uploaded=True) + pageserver_http.timeline_checkpoint( + tenant_id, timeline_id, wait_until_uploaded=True, force_image_layer_creation=True + ) env.endpoints.stop_all() for sk in env.safekeepers: diff --git a/test_runner/regress/test_compute_metrics.py b/test_runner/regress/test_compute_metrics.py index 71963355b7..5dcc93acff 100644 --- a/test_runner/regress/test_compute_metrics.py +++ b/test_runner/regress/test_compute_metrics.py @@ -219,7 +219,7 @@ if SQL_EXPORTER is None: # # The 
"host" network mode allows sql_exporter to talk to the # endpoint which is running on the host. - super().__init__("docker.io/burningalchemist/sql_exporter:0.16.0", network_mode="host") + super().__init__("docker.io/burningalchemist/sql_exporter:0.17.0", network_mode="host") self.__logs_dir = logs_dir self.__port = port @@ -252,7 +252,7 @@ if SQL_EXPORTER is None: log.info("Waiting for sql_exporter to be ready") wait_for_logs( self, - rf'level=info msg="Listening on" address=\[::\]:{self.__port}', + rf'msg="Listening on" address=\[::\]:{self.__port}', timeout=5, ) @@ -344,10 +344,7 @@ else: time.sleep(0.5) continue - if ( - f'level=info msg="Listening on" address=[::]:{self._sql_exporter_port}' - in line - ): + if f'msg="Listening on" address=[::]:{self._sql_exporter_port}' in line: break @override diff --git a/test_runner/regress/test_lfc_resize.py b/test_runner/regress/test_lfc_resize.py index 377b0fb4d4..8762e6525b 100644 --- a/test_runner/regress/test_lfc_resize.py +++ b/test_runner/regress/test_lfc_resize.py @@ -30,7 +30,7 @@ def test_lfc_resize(neon_simple_env: NeonEnv, pg_bin: PgBin): ], ) n_resize = 10 - scale = 100 + scale = 20 def run_pgbench(connstr: str): log.info(f"Start a pgbench workload on pg {connstr}") @@ -46,17 +46,36 @@ def test_lfc_resize(neon_simple_env: NeonEnv, pg_bin: PgBin): conn = endpoint.connect() cur = conn.cursor() + def get_lfc_size() -> tuple[int, int]: + lfc_file_path = endpoint.lfc_path() + lfc_file_size = lfc_file_path.stat().st_size + res = subprocess.run( + ["ls", "-sk", lfc_file_path], check=True, text=True, capture_output=True + ) + lfc_file_blocks = re.findall("([0-9A-F]+)", res.stdout)[0] + log.info(f"Size of LFC file {lfc_file_size}, blocks {lfc_file_blocks}") + + return (lfc_file_size, lfc_file_blocks) + # For as long as pgbench is running, twiddle the LFC size once a second. # Note that we launch this immediately, already while the "pgbench -i" # initialization step is still running. 
That's quite a different workload # than the actual pgbench benchamark run, so this gives us coverage of both. while thread.is_alive(): - size = random.randint(1, 512) + # Vary the LFC size randomly within a range above what we will later + # decrease it to. This should ensure that the final size decrease + # is really doing something. + size = random.randint(192, 512) cur.execute(f"alter system set neon.file_cache_size_limit='{size}MB'") cur.execute("select pg_reload_conf()") time.sleep(1) + thread.join() + # Before shrinking the cache, check that it really is large now + (lfc_file_size, lfc_file_blocks) = get_lfc_size() + assert int(lfc_file_blocks) > 128 * 1024 + # At the end, set it at 100 MB, and perform a final check that the disk usage # of the file is in that ballbark. # @@ -66,13 +85,7 @@ def test_lfc_resize(neon_simple_env: NeonEnv, pg_bin: PgBin): cur.execute("select pg_reload_conf()") nretries = 10 while True: - lfc_file_path = endpoint.lfc_path() - lfc_file_size = lfc_file_path.stat().st_size - res = subprocess.run( - ["ls", "-sk", lfc_file_path], check=True, text=True, capture_output=True - ) - lfc_file_blocks = re.findall("([0-9A-F]+)", res.stdout)[0] - log.info(f"Size of LFC file {lfc_file_size}, blocks {lfc_file_blocks}") + (lfc_file_size, lfc_file_blocks) = get_lfc_size() assert lfc_file_size <= 512 * 1024 * 1024 if int(lfc_file_blocks) <= 128 * 1024 or nretries == 0: diff --git a/test_runner/regress/test_local_file_cache.py b/test_runner/regress/test_local_file_cache.py index 94c630ffcf..21c9e97a42 100644 --- a/test_runner/regress/test_local_file_cache.py +++ b/test_runner/regress/test_local_file_cache.py @@ -29,8 +29,8 @@ def test_local_file_cache_unlink(neon_env_builder: NeonEnvBuilder): cur = endpoint.connect().cursor() stop = threading.Event() - n_rows = 100000 - n_threads = 20 + n_rows = 10000 + n_threads = 5 n_updates_per_connection = 1000 cur.execute("CREATE TABLE lfctest (id int4 PRIMARY KEY, n int) WITH (fillfactor=10)") diff --git 
a/test_runner/regress/test_nbtree_pagesplit_cycleid.py b/test_runner/regress/test_nbtree_pagesplit_cycleid.py index 558557aeba..32ec6fcb92 100644 --- a/test_runner/regress/test_nbtree_pagesplit_cycleid.py +++ b/test_runner/regress/test_nbtree_pagesplit_cycleid.py @@ -4,9 +4,19 @@ import time from fixtures.neon_fixtures import NeonEnv BTREE_NUM_CYCLEID_PAGES = """ - WITH raw_pages AS ( - SELECT blkno, get_raw_page_at_lsn('t_uidx', 'main', blkno, NULL, NULL) page - FROM generate_series(1, pg_relation_size('t_uidx'::regclass) / 8192) blkno + WITH lsns AS ( + /* + * pg_switch_wal() ensures we have an LSN that + * 1. is after any previous modifications, but also, + * 2. (critically) is flushed, preventing any issues with waiting for + * unflushed WAL in PageServer. + */ + SELECT pg_switch_wal() as lsn + ), + raw_pages AS ( + SELECT blkno, get_raw_page_at_lsn('t_uidx', 'main', blkno, lsn, lsn) page + FROM generate_series(1, pg_relation_size('t_uidx'::regclass) / 8192) AS blkno, + lsns l(lsn) ), parsed_pages AS ( /* cycle ID is the last 2 bytes of the btree page */ @@ -36,7 +46,6 @@ def test_nbtree_pagesplit_cycleid(neon_simple_env: NeonEnv): ses1.execute("CREATE UNIQUE INDEX t_uidx ON t(id);") ses1.execute("INSERT INTO t (txt) SELECT i::text FROM generate_series(1, 2035) i;") - ses1.execute("SELECT neon_xlogflush();") ses1.execute(BTREE_NUM_CYCLEID_PAGES) pages = ses1.fetchall() assert ( @@ -57,7 +66,6 @@ def test_nbtree_pagesplit_cycleid(neon_simple_env: NeonEnv): ses1.execute("DELETE FROM t WHERE id <= 610;") # Flush wal, for checking purposes - ses1.execute("SELECT neon_xlogflush();") ses1.execute(BTREE_NUM_CYCLEID_PAGES) pages = ses1.fetchall() assert len(pages) == 0, f"No back splits with cycle ID expected, got batches of {pages} instead" @@ -108,8 +116,6 @@ def test_nbtree_pagesplit_cycleid(neon_simple_env: NeonEnv): # unpin the btree page, allowing s3's vacuum to complete ses2.execute("FETCH ALL FROM foo;") ses2.execute("ROLLBACK;") - # flush WAL to make sure PS 
is up-to-date - ses1.execute("SELECT neon_xlogflush();") # check that our expectations are correct ses1.execute(BTREE_NUM_CYCLEID_PAGES) pages = ses1.fetchall() diff --git a/test_runner/regress/test_page_service_batching_regressions.py b/test_runner/regress/test_page_service_batching_regressions.py new file mode 100644 index 0000000000..fa85e1210b --- /dev/null +++ b/test_runner/regress/test_page_service_batching_regressions.py @@ -0,0 +1,60 @@ +# NB: there are benchmarks that double-serve as tests inside the `performance` directory. + +import subprocess +from pathlib import Path + +import pytest +from fixtures.log_helper import log +from fixtures.neon_fixtures import NeonEnvBuilder + + +@pytest.mark.timeout(30) # test takes <20s if pageserver impl is correct +@pytest.mark.parametrize("kind", ["pageserver-stop", "tenant-detach"]) +def test_slow_flush(neon_env_builder: NeonEnvBuilder, neon_binpath: Path, kind: str): + def patch_pageserver_toml(config): + config["page_service_pipelining"] = { + "mode": "pipelined", + "max_batch_size": 32, + "execution": "concurrent-futures", + } + + neon_env_builder.pageserver_config_override = patch_pageserver_toml + env = neon_env_builder.init_start() + + log.info("make flush appear slow") + + log.info("sending requests until pageserver accepts no more") + # TODO: extract this into a helper, like subprocess_capture, + # so that we capture the stderr from the helper somewhere. 
+ child = subprocess.Popen( + [ + neon_binpath / "test_helper_slow_client_reads", + env.pageserver.connstr(), + str(env.initial_tenant), + str(env.initial_timeline), + ], + bufsize=0, # unbuffered + stdin=subprocess.PIPE, + stdout=subprocess.PIPE, + ) + assert child.stdout is not None + buf = child.stdout.read(1) + if len(buf) != 1: + raise Exception("unexpected EOF") + if buf != b"R": + raise Exception(f"unexpected data: {buf!r}") + log.info("helper reports pageserver accepts no more requests") + log.info( + "assuming pageserver connection handle is in a state where TCP has backpressured pageserver=>client response flush() into userspace" + ) + + if kind == "pageserver-stop": + log.info("try to shut down the pageserver cleanly") + env.pageserver.stop() + elif kind == "tenant-detach": + log.info("try to shut down the tenant") + env.pageserver.tenant_detach(env.initial_tenant) + else: + raise ValueError(f"unexpected kind: {kind}") + + log.info("shutdown did not time out, test passed") diff --git a/test_runner/regress/test_physical_replication.py b/test_runner/regress/test_physical_replication.py index 6cb11b825d..17819fd367 100644 --- a/test_runner/regress/test_physical_replication.py +++ b/test_runner/regress/test_physical_replication.py @@ -187,7 +187,7 @@ def test_physical_replication_config_mismatch_too_many_known_xids(neon_simple_en origin=primary, endpoint_id="secondary", config_lines=[ - "max_connections=2", + "max_connections=5", "autovacuum_max_workers=1", "max_worker_processes=5", "max_wal_senders=1", diff --git a/test_runner/regress/test_proxy_websockets.py b/test_runner/regress/test_proxy_websockets.py index ea01252ce4..f14317a39f 100644 --- a/test_runner/regress/test_proxy_websockets.py +++ b/test_runner/regress/test_proxy_websockets.py @@ -1,10 +1,15 @@ from __future__ import annotations +import asyncio import ssl +import asyncpg import pytest +import websocket_tunnel import websockets +from fixtures.log_helper import log from fixtures.neon_fixtures 
import NeonProxy +from fixtures.port_distributor import PortDistributor @pytest.mark.asyncio @@ -196,3 +201,53 @@ async def test_websockets_pipelined(static_proxy: NeonProxy): # close await websocket.send(b"X\x00\x00\x00\x04") await websocket.wait_closed() + + +@pytest.mark.asyncio +async def test_websockets_tunneled(static_proxy: NeonProxy, port_distributor: PortDistributor): + static_proxy.safe_psql("create user ws_auth with password 'ws' superuser") + + user = "ws_auth" + password = "ws" + + ssl_context = ssl.SSLContext(ssl.PROTOCOL_TLS_CLIENT) + ssl_context.load_verify_locations(str(static_proxy.test_output_dir / "proxy.crt")) + + # Launch a tunnel service so that we can speak the websockets protocol to + # the proxy + tunnel_port = port_distributor.get_port() + tunnel_server = await websocket_tunnel.start_server( + "127.0.0.1", + tunnel_port, + f"wss://{static_proxy.domain}:{static_proxy.external_http_port}/sql", + ssl_context, + ) + log.info(f"websockets tunnel listening for connections on port {tunnel_port}") + + async with tunnel_server: + + async def run_tunnel(): + try: + async with tunnel_server: + await tunnel_server.serve_forever() + except Exception as e: + log.error(f"Error in tunnel task: {e}") + + tunnel_task = asyncio.create_task(run_tunnel()) + + # Ok, the tunnel is now running. 
Check that we can connect to the proxy's + # websocket interface, through the tunnel + tunnel_connstring = f"postgres://{user}:{password}@127.0.0.1:{tunnel_port}/postgres" + + log.info(f"connecting to {tunnel_connstring}") + conn = await asyncpg.connect(tunnel_connstring) + res = await conn.fetchval("SELECT 123") + assert res == 123 + await conn.close() + log.info("Ran a query successfully through the tunnel") + + tunnel_server.close() + try: + await tunnel_task + except asyncio.CancelledError: + pass diff --git a/test_runner/regress/test_sharding.py b/test_runner/regress/test_sharding.py index 673904a1cd..86a6b7428b 100644 --- a/test_runner/regress/test_sharding.py +++ b/test_runner/regress/test_sharding.py @@ -520,14 +520,18 @@ def test_sharding_split_smoke( shard_count = 2 # Shard count we split into split_shard_count = 4 - # We will have 2 shards per pageserver once done (including secondaries) - neon_env_builder.num_pageservers = split_shard_count + # In preferred AZ & other AZ we will end up with one shard per pageserver + neon_env_builder.num_pageservers = split_shard_count * 2 # Two AZs def assign_az(ps_cfg): az = f"az-{(ps_cfg['id'] - 1) % 2}" ps_cfg["availability_zone"] = az + # We will run more pageservers than tests usually do, so give them tiny page caches + # in case we're on a test node under memory pressure. + ps_cfg["page_cache_size"] = 128 + neon_env_builder.pageserver_config_override = assign_az # 1MiB stripes: enable getting some meaningful data distribution without @@ -679,8 +683,8 @@ def test_sharding_split_smoke( # - shard_count reconciles for the original setup of the tenant # - shard_count reconciles for detaching the original secondary locations during split # - split_shard_count reconciles during shard splitting, for setting up secondaries. 
- # - split_shard_count/2 of the child shards will need to fail over to their secondaries (since we have 8 shards and 4 pageservers, only 4 will move) - expect_reconciles = shard_count * 2 + split_shard_count + split_shard_count / 2 + # - split_shard_count/2 reconciles to migrate shards to their temporary secondaries + expect_reconciles = shard_count * 2 + split_shard_count + 3 * (split_shard_count / 2) reconcile_ok = env.storage_controller.get_metric_value( "storage_controller_reconcile_complete_total", filter={"status": "ok"} @@ -745,10 +749,14 @@ def test_sharding_split_smoke( # dominated by shard count. log.info(f"total: {total}") assert total == { - 1: 2, - 2: 2, - 3: 2, - 4: 2, + 1: 1, + 2: 1, + 3: 1, + 4: 1, + 5: 1, + 6: 1, + 7: 1, + 8: 1, } # The controller is not required to lay out the attached locations in any particular way, but @@ -1387,13 +1395,7 @@ def test_sharding_split_failures( else: attached_count += 1 - if exclude_ps_id is not None: - # For a node failure case, we expect there to be a secondary location - # scheduled on the offline node, so expect one fewer secondary in total - assert secondary_count == initial_shard_count - 1 - else: - assert secondary_count == initial_shard_count - + assert secondary_count == initial_shard_count assert attached_count == initial_shard_count def assert_split_done(exclude_ps_id: int | None = None) -> None: diff --git a/test_runner/regress/test_storage_controller.py b/test_runner/regress/test_storage_controller.py index da6d5b8622..350fe31099 100644 --- a/test_runner/regress/test_storage_controller.py +++ b/test_runner/regress/test_storage_controller.py @@ -113,6 +113,19 @@ def test_storage_controller_smoke(neon_env_builder: NeonEnvBuilder, combination) for tid in tenant_ids: env.create_tenant(tid, shard_count=shards_per_tenant) + # Tenant listing API should work + listed_tenants = env.storage_controller.tenant_list() + log.info(f"listed_tenants: {listed_tenants}") + assert set(t["tenant_id"] for t in 
listed_tenants) == set(str(t) for t in tenant_ids) + paged = env.storage_controller.tenant_list(limit=2, start_after=listed_tenants[0]["tenant_id"]) + assert len(paged) == 2 + assert paged[0] == listed_tenants[1] + assert paged[1] == listed_tenants[2] + paged = env.storage_controller.tenant_list( + limit=1000, start_after="ffffffffffffffffffffffffffffffff" + ) + assert paged == [] + # Validate high level metrics assert ( env.storage_controller.get_metric_value("storage_controller_tenant_shards") @@ -822,6 +835,122 @@ def test_storage_controller_stuck_compute_hook( env.storage_controller.consistency_check() +@run_only_on_default_postgres("postgres behavior is not relevant") +def test_storage_controller_compute_hook_retry( + httpserver: HTTPServer, + neon_env_builder: NeonEnvBuilder, + httpserver_listen_address: ListenAddress, +): + """ + Test that when a reconciler can't do its compute hook notification, it will keep + trying until it succeeds. + + Reproducer for https://github.com/neondatabase/cloud/issues/22612 + """ + + neon_env_builder.num_pageservers = 2 + (host, port) = httpserver_listen_address + neon_env_builder.control_plane_compute_hook_api = f"http://{host}:{port}/notify" + + handle_params = {"status": 200} + + notifications = [] + + def handler(request: Request): + status = handle_params["status"] + log.info(f"Notify request[{status}]: {request}") + notifications.append(request.json) + return Response(status=status) + + httpserver.expect_request("/notify", method="PUT").respond_with_handler(handler) + + # Start running + env = neon_env_builder.init_configs() + env.start() + + tenant_id = TenantId.generate() + env.create_tenant(tenant_id, placement_policy='{"Attached": 1}') + + # Initial notification from tenant creation + assert len(notifications) == 1 + expect: dict[str, list[dict[str, int]] | str | None | int] = { + "tenant_id": str(tenant_id), + "stripe_size": None, + "shards": [{"node_id": int(env.pageservers[0].id), "shard_number": 0}], + 
"preferred_az": DEFAULT_AZ_ID, + } + assert notifications[0] == expect + + # Block notifications, and fail a node + handle_params["status"] = 423 + env.pageservers[0].stop() + env.storage_controller.allowed_errors.append(NOTIFY_BLOCKED_LOG) + env.storage_controller.allowed_errors.extend(NOTIFY_FAILURE_LOGS) + + # Avoid waiting for heartbeats + env.storage_controller.node_configure(env.pageservers[0].id, {"availability": "Offline"}) + + # Make reconciler run and fail: it should leave itself in a state where the shard will retry notification later, + # and we will check that that happens + notifications = [] + try: + assert env.storage_controller.reconcile_all() == 1 + except StorageControllerApiException as e: + assert "Control plane tenant busy" in str(e) + assert len(notifications) == 1 + assert ( + env.storage_controller.tenant_describe(tenant_id)["shards"][0][ + "is_pending_compute_notification" + ] + is True + ) + + # Try reconciling again, it should try notifying again + notifications = [] + try: + assert env.storage_controller.reconcile_all() == 1 + except StorageControllerApiException as e: + assert "Control plane tenant busy" in str(e) + assert len(notifications) == 1 + assert ( + env.storage_controller.tenant_describe(tenant_id)["shards"][0][ + "is_pending_compute_notification" + ] + is True + ) + + # The describe API should indicate that a notification is pending + assert ( + env.storage_controller.tenant_describe(tenant_id)["shards"][0][ + "is_pending_compute_notification" + ] + is True + ) + + # Unblock notifications: reconcile should work now + handle_params["status"] = 200 + notifications = [] + assert env.storage_controller.reconcile_all() == 1 + assert len(notifications) == 1 + assert ( + env.storage_controller.tenant_describe(tenant_id)["shards"][0][ + "is_pending_compute_notification" + ] + is False + ) + + # Reconciler should be idle now that it succeeded in its compute notification + notifications = [] + assert 
env.storage_controller.reconcile_all() == 0 + assert len(notifications) == 0 + assert ( + env.storage_controller.tenant_describe(tenant_id)["shards"][0][ + "is_pending_compute_notification" + ] + is False + ) + + @run_only_on_default_postgres("this test doesn't start an endpoint") def test_storage_controller_compute_hook_revert( httpserver: HTTPServer, @@ -936,7 +1065,7 @@ def test_storage_controller_debug_apis(neon_env_builder: NeonEnvBuilder): that just hits the endpoints to check that they don't bitrot. """ - neon_env_builder.num_pageservers = 2 + neon_env_builder.num_pageservers = 3 env = neon_env_builder.init_start() tenant_id = TenantId.generate() @@ -961,7 +1090,7 @@ def test_storage_controller_debug_apis(neon_env_builder: NeonEnvBuilder): "GET", f"{env.storage_controller_api}/debug/v1/scheduler" ) # Two nodes, in a dict of node_id->node - assert len(response.json()["nodes"]) == 2 + assert len(response.json()["nodes"]) == 3 assert sum(v["shard_count"] for v in response.json()["nodes"].values()) == 3 assert all(v["may_schedule"] for v in response.json()["nodes"].values()) @@ -972,13 +1101,25 @@ def test_storage_controller_debug_apis(neon_env_builder: NeonEnvBuilder): headers=env.storage_controller.headers(TokenScope.ADMIN), ) + # Secondary migration API: superficial check that it migrates + secondary_dest = env.pageservers[2].id + env.storage_controller.request( + "PUT", + f"{env.storage_controller_api}/control/v1/tenant/{tenant_id}-0002/migrate_secondary", + headers=env.storage_controller.headers(TokenScope.ADMIN), + json={"tenant_shard_id": f"{tenant_id}-0002", "node_id": secondary_dest}, + ) + assert env.storage_controller.tenant_describe(tenant_id)["shards"][0]["node_secondary"] == [ + secondary_dest + ] + # Node unclean drop API response = env.storage_controller.request( "POST", f"{env.storage_controller_api}/debug/v1/node/{env.pageservers[1].id}/drop", headers=env.storage_controller.headers(TokenScope.ADMIN), ) - assert 
len(env.storage_controller.node_list()) == 1 + assert len(env.storage_controller.node_list()) == 2 # Tenant unclean drop API response = env.storage_controller.request( @@ -1378,7 +1519,7 @@ class PageserverFailpoint(Failure): def build_node_to_tenants_map(env: NeonEnv) -> dict[int, list[TenantId]]: - tenants = env.storage_controller.tenant_list() + tenants = env.storage_controller.tenant_shard_dump() node_to_tenants: dict[int, list[TenantId]] = {} for t in tenants: @@ -1696,7 +1837,13 @@ def test_storcon_cli(neon_env_builder: NeonEnvBuilder): """ output_dir = neon_env_builder.test_output_dir shard_count = 4 - env = neon_env_builder.init_start(initial_tenant_shard_count=shard_count) + neon_env_builder.num_pageservers = 2 + env = neon_env_builder.init_configs() + env.start() + + tenant_id = TenantId.generate() + env.create_tenant(tenant_id, placement_policy='{"Attached":1}', shard_count=shard_count) + base_args = [env.neon_binpath / "storcon_cli", "--api", env.storage_controller_api] def storcon_cli(args): @@ -1725,7 +1872,7 @@ def test_storcon_cli(neon_env_builder: NeonEnvBuilder): # List nodes node_lines = storcon_cli(["nodes"]) # Table header, footer, and one line of data - assert len(node_lines) == 5 + assert len(node_lines) == 7 assert "localhost" in node_lines[3] # Pause scheduling onto a node @@ -1743,10 +1890,21 @@ def test_storcon_cli(neon_env_builder: NeonEnvBuilder): storcon_cli(["node-configure", "--node-id", "1", "--availability", "offline"]) assert "Offline" in storcon_cli(["nodes"])[3] + # Restore node, verify status changes in CLI output + env.pageservers[0].start() + + def is_online(): + assert "Offline" not in storcon_cli(["nodes"]) + + wait_until(is_online) + + # Let everything stabilize after node failure to avoid interfering with subsequent steps + env.storage_controller.reconcile_until_idle(timeout_secs=10) + # List tenants tenant_lines = storcon_cli(["tenants"]) assert len(tenant_lines) == 5 - assert str(env.initial_tenant) in tenant_lines[3] + 
assert str(tenant_id) in tenant_lines[3] # Setting scheduling policies intentionally result in warnings, they're for rare use. env.storage_controller.allowed_errors.extend( @@ -1754,23 +1912,58 @@ def test_storcon_cli(neon_env_builder: NeonEnvBuilder): ) # Describe a tenant - tenant_lines = storcon_cli(["tenant-describe", "--tenant-id", str(env.initial_tenant)]) + tenant_lines = storcon_cli(["tenant-describe", "--tenant-id", str(tenant_id)]) assert len(tenant_lines) >= 3 + shard_count * 2 - assert str(env.initial_tenant) in tenant_lines[0] + assert str(tenant_id) in tenant_lines[0] + + # Migrate an attached location + def other_ps_id(current_ps_id): + return ( + env.pageservers[0].id + if current_ps_id == env.pageservers[1].id + else env.pageservers[1].id + ) + + storcon_cli( + [ + "tenant-shard-migrate", + "--tenant-shard-id", + f"{tenant_id}-0004", + "--node", + str( + other_ps_id( + env.storage_controller.tenant_describe(tenant_id)["shards"][0]["node_attached"] + ) + ), + ] + ) + + # Migrate a secondary location + storcon_cli( + [ + "tenant-shard-migrate-secondary", + "--tenant-shard-id", + f"{tenant_id}-0004", + "--node", + str( + other_ps_id( + env.storage_controller.tenant_describe(tenant_id)["shards"][0][ + "node_secondary" + ][0] + ) + ), + ] + ) # Pause changes on a tenant - storcon_cli(["tenant-policy", "--tenant-id", str(env.initial_tenant), "--scheduling", "stop"]) + storcon_cli(["tenant-policy", "--tenant-id", str(tenant_id), "--scheduling", "stop"]) assert "Stop" in storcon_cli(["tenants"])[3] # Cancel ongoing reconcile on a tenant - storcon_cli( - ["tenant-shard-cancel-reconcile", "--tenant-shard-id", f"{env.initial_tenant}-0104"] - ) + storcon_cli(["tenant-shard-cancel-reconcile", "--tenant-shard-id", f"{tenant_id}-0104"]) # Change a tenant's placement - storcon_cli( - ["tenant-policy", "--tenant-id", str(env.initial_tenant), "--placement", "secondary"] - ) + storcon_cli(["tenant-policy", "--tenant-id", str(tenant_id), "--placement", "secondary"]) 
assert "Secondary" in storcon_cli(["tenants"])[3] # Modify a tenant's config @@ -1778,7 +1971,7 @@ def test_storcon_cli(neon_env_builder: NeonEnvBuilder): [ "patch-tenant-config", "--tenant-id", - str(env.initial_tenant), + str(tenant_id), "--config", json.dumps({"pitr_interval": "1m"}), ] @@ -2201,6 +2394,7 @@ def test_storage_controller_node_deletion( Test that deleting a node works & properly reschedules everything that was on the node. """ neon_env_builder.num_pageservers = 3 + neon_env_builder.num_azs = 3 env = neon_env_builder.init_configs() env.start() @@ -2214,6 +2408,9 @@ def test_storage_controller_node_deletion( tid, placement_policy='{"Attached":1}', shard_count=shard_count_per_tenant ) + # Sanity check: initial creations should not leave the system in an unstable scheduling state + assert env.storage_controller.reconcile_all() == 0 + victim = env.pageservers[-1] # The procedure a human would follow is: @@ -2451,7 +2648,7 @@ def test_storage_controller_step_down(neon_env_builder: NeonEnvBuilder): # Validate that the storcon attempts to forward the request, but stops. # when it realises it is still the current leader. with pytest.raises(StorageControllerApiException, match="Leader is stepped down instance"): - env.storage_controller.tenant_list() + env.storage_controller.tenant_shard_dump() # Validate that we can step down multiple times and the observed state # doesn't change. @@ -2601,7 +2798,7 @@ def test_storage_controller_leadership_transfer( # Check that the stepped down instance forwards requests # to the new leader while it's still running. 
storage_controller_proxy.route_to(f"http://127.0.0.1:{storage_controller_1_port}") - env.storage_controller.tenant_list() + env.storage_controller.tenant_shard_dump() env.storage_controller.node_configure(env.pageservers[0].id, {"scheduling": "Pause"}) status = env.storage_controller.node_status(env.pageservers[0].id) assert status["scheduling"] == "Pause" @@ -3015,6 +3212,17 @@ def test_safekeeper_deployment_time_update(neon_env_builder: NeonEnvBuilder): assert eq_safekeeper_records(body, inserted_now) + # some small tests for the scheduling policy querying and returning APIs + newest_info = target.get_safekeeper(inserted["id"]) + assert newest_info + assert newest_info["scheduling_policy"] == "Pause" + target.safekeeper_scheduling_policy(inserted["id"], "Decomissioned") + newest_info = target.get_safekeeper(inserted["id"]) + assert newest_info + assert newest_info["scheduling_policy"] == "Decomissioned" + # Ensure idempotency + target.safekeeper_scheduling_policy(inserted["id"], "Decomissioned") + def eq_safekeeper_records(a: dict[str, Any], b: dict[str, Any]) -> bool: compared = [dict(a), dict(b)] @@ -3033,11 +3241,12 @@ def eq_safekeeper_records(a: dict[str, Any], b: dict[str, Any]) -> bool: @run_only_on_default_postgres("this is like a 'unit test' against storcon db") def test_shard_preferred_azs(neon_env_builder: NeonEnvBuilder): def assign_az(ps_cfg): - az = f"az-{ps_cfg['id']}" + az = f"az-{ps_cfg['id'] % 2}" + log.info(f"Assigned AZ {az}") ps_cfg["availability_zone"] = az neon_env_builder.pageserver_config_override = assign_az - neon_env_builder.num_pageservers = 2 + neon_env_builder.num_pageservers = 4 env = neon_env_builder.init_configs() env.start() @@ -3052,8 +3261,14 @@ def test_shard_preferred_azs(neon_env_builder: NeonEnvBuilder): assert shards[0]["preferred_az_id"] == expected_az + # When all other schedule scoring parameters are equal, tenants should round-robin on AZs + assert
env.storage_controller.tenant_describe(tids[0])["shards"][0]["preferred_az_id"] == "az-0" + assert env.storage_controller.tenant_describe(tids[1])["shards"][0]["preferred_az_id"] == "az-1" + assert env.storage_controller.tenant_describe(tids[2])["shards"][0]["preferred_az_id"] == "az-0" + + # Try modifying preferred AZ updated = env.storage_controller.set_preferred_azs( - {TenantShardId(tid, 0, 0): "foo" for tid in tids} + {TenantShardId(tid, 0, 0): "az-0" for tid in tids} ) assert set(updated) == set([TenantShardId(tid, 0, 0) for tid in tids]) @@ -3061,29 +3276,24 @@ def test_shard_preferred_azs(neon_env_builder: NeonEnvBuilder): for tid in tids: shards = env.storage_controller.tenant_describe(tid)["shards"] assert len(shards) == 1 - assert shards[0]["preferred_az_id"] == "foo" + assert shards[0]["preferred_az_id"] == "az-0" - # Generate a layer to avoid shard split handling on ps from tripping - # up on debug assert. - timeline_id = TimelineId.generate() - env.create_timeline("bar", tids[0], timeline_id) - - workload = Workload(env, tids[0], timeline_id, branch_name="bar") - workload.init() - workload.write_rows(256) - workload.validate() + # Having modified preferred AZ, we should get moved there + env.storage_controller.reconcile_until_idle(max_interval=0.1) + for tid in tids: + shard = env.storage_controller.tenant_describe(tid)["shards"][0] + attached_to = shard["node_attached"] + attached_in_az = env.get_pageserver(attached_to).az_id + assert shard["preferred_az_id"] == attached_in_az == "az-0" env.storage_controller.tenant_shard_split(tids[0], shard_count=2) + env.storage_controller.reconcile_until_idle(max_interval=0.1) shards = env.storage_controller.tenant_describe(tids[0])["shards"] assert len(shards) == 2 for shard in shards: attached_to = shard["node_attached"] - expected_az = env.get_pageserver(attached_to).az_id - - # The scheduling optimization logic is not yet AZ-aware, so doesn't succeed - # in putting the tenant shards in the preferred AZ. 
- # To be fixed in https://github.com/neondatabase/neon/pull/9916 - # assert shard["preferred_az_id"] == expected_az + attached_in_az = env.get_pageserver(attached_to).az_id + assert shard["preferred_az_id"] == attached_in_az == "az-0" @run_only_on_default_postgres("Postgres version makes no difference here") diff --git a/test_runner/regress/test_tenant_delete.py b/test_runner/regress/test_tenant_delete.py index 48e55c1ab1..3720f653c5 100644 --- a/test_runner/regress/test_tenant_delete.py +++ b/test_runner/regress/test_tenant_delete.py @@ -1,6 +1,7 @@ from __future__ import annotations import json +from concurrent.futures import ThreadPoolExecutor from threading import Thread import pytest @@ -253,29 +254,8 @@ def test_tenant_delete_races_timeline_creation(neon_env_builder: NeonEnvBuilder) ps_http.configure_failpoints((BEFORE_INITDB_UPLOAD_FAILPOINT, "pause")) def timeline_create(): - try: - ps_http.timeline_create(env.pg_version, tenant_id, TimelineId.generate(), timeout=1) - raise RuntimeError("creation succeeded even though it shouldn't") - except ReadTimeout: - pass - - Thread(target=timeline_create).start() - - def hit_initdb_upload_failpoint(): - env.pageserver.assert_log_contains(f"at failpoint {BEFORE_INITDB_UPLOAD_FAILPOINT}") - - wait_until(hit_initdb_upload_failpoint) - - def creation_connection_timed_out(): - env.pageserver.assert_log_contains( - "POST.*/timeline.* request was dropped before completing" - ) - - # Wait so that we hit the timeout and the connection is dropped - # (But timeline creation still continues) - wait_until(creation_connection_timed_out) - - ps_http.configure_failpoints((DELETE_BEFORE_CLEANUP_FAILPOINT, "pause")) + ps_http.timeline_create(env.pg_version, tenant_id, TimelineId.generate(), timeout=1) + raise RuntimeError("creation succeeded even though it shouldn't") def tenant_delete(): def tenant_delete_inner(): @@ -283,21 +263,46 @@ def test_tenant_delete_races_timeline_creation(neon_env_builder: NeonEnvBuilder) 
wait_until(tenant_delete_inner) - Thread(target=tenant_delete).start() + # We will spawn background threads for timeline creation and tenant deletion. They will both + # get blocked on our failpoint. + with ThreadPoolExecutor(max_workers=1) as executor: + create_fut = executor.submit(timeline_create) - def deletion_arrived(): - env.pageserver.assert_log_contains( - f"cfg failpoint: {DELETE_BEFORE_CLEANUP_FAILPOINT} pause" - ) + def hit_initdb_upload_failpoint(): + env.pageserver.assert_log_contains(f"at failpoint {BEFORE_INITDB_UPLOAD_FAILPOINT}") - wait_until(deletion_arrived) + wait_until(hit_initdb_upload_failpoint) - ps_http.configure_failpoints((DELETE_BEFORE_CLEANUP_FAILPOINT, "off")) + def creation_connection_timed_out(): + env.pageserver.assert_log_contains( + "POST.*/timeline.* request was dropped before completing" + ) - # Disable the failpoint and wait for deletion to finish - ps_http.configure_failpoints((BEFORE_INITDB_UPLOAD_FAILPOINT, "off")) + # Wait so that we hit the timeout and the connection is dropped + # (But timeline creation still continues) + wait_until(creation_connection_timed_out) - ps_http.tenant_delete(tenant_id) + with pytest.raises(ReadTimeout): + # Our creation failed from the client's point of view. 
+ create_fut.result() + + ps_http.configure_failpoints((DELETE_BEFORE_CLEANUP_FAILPOINT, "pause")) + + delete_fut = executor.submit(tenant_delete) + + def deletion_arrived(): + env.pageserver.assert_log_contains( + f"cfg failpoint: {DELETE_BEFORE_CLEANUP_FAILPOINT} pause" + ) + + wait_until(deletion_arrived) + + ps_http.configure_failpoints((DELETE_BEFORE_CLEANUP_FAILPOINT, "off")) + + # Disable the failpoint and wait for deletion to finish + ps_http.configure_failpoints((BEFORE_INITDB_UPLOAD_FAILPOINT, "off")) + + delete_fut.result() # Physical deletion should have happened assert_prefix_empty( diff --git a/test_runner/regress/test_tenants.py b/test_runner/regress/test_tenants.py index d31901b384..b4c968b217 100644 --- a/test_runner/regress/test_tenants.py +++ b/test_runner/regress/test_tenants.py @@ -194,7 +194,7 @@ def test_metrics_normal_work(neon_env_builder: NeonEnvBuilder): io_metrics = query_all_safekeepers( "safekeeper_pg_io_bytes_total", { - "app_name": "pageserver", + "app_name": f"pageserver-{env.pageserver.id}", "client_az": "test_ps_az", "dir": io_direction, "same_az": "false", diff --git a/test_runner/regress/test_wal_acceptor.py b/test_runner/regress/test_wal_acceptor.py index 0a8900b351..2b6a267bdf 100644 --- a/test_runner/regress/test_wal_acceptor.py +++ b/test_runner/regress/test_wal_acceptor.py @@ -48,7 +48,12 @@ from fixtures.remote_storage import ( default_remote_storage, s3_storage, ) -from fixtures.safekeeper.http import SafekeeperHttpClient +from fixtures.safekeeper.http import ( + Configuration, + SafekeeperHttpClient, + SafekeeperId, + TimelineCreateRequest, +) from fixtures.safekeeper.utils import wait_walreceivers_absent from fixtures.utils import ( PropagatingThread, @@ -658,7 +663,13 @@ def test_s3_wal_replay(neon_env_builder: NeonEnvBuilder): for sk in env.safekeepers: sk.start() cli = sk.http_client() - cli.timeline_create(tenant_id, timeline_id, pg_version, last_lsn) + mconf = Configuration(generation=0, members=[], 
new_members=None) + # set start_lsn to the beginning of the first segment to allow reading + # WAL from there (could use initdb LSN as well). + r = TimelineCreateRequest( + tenant_id, timeline_id, mconf, pg_version, Lsn("0/1000000"), commit_lsn=last_lsn + ) + cli.timeline_create(r) f_partial_path = ( Path(sk.data_dir) / str(tenant_id) / str(timeline_id) / f_partial_saved.name ) @@ -2237,6 +2248,63 @@ def test_pull_timeline_while_evicted(neon_env_builder: NeonEnvBuilder): wait_until(unevicted_on_dest, interval=0.1, timeout=1.0) +# Basic test for http API membership related calls: create timeline and switch +# configuration. Normally these are called by storage controller, but this +# allows testing them separately. +@run_only_on_default_postgres("tests only safekeeper API") +def test_membership_api(neon_env_builder: NeonEnvBuilder): + neon_env_builder.num_safekeepers = 1 + env = neon_env_builder.init_start() + + tenant_id = env.initial_tenant + timeline_id = env.initial_timeline + + sk = env.safekeepers[0] + http_cli = sk.http_client() + + sk_id_1 = SafekeeperId(env.safekeepers[0].id, "localhost", sk.port.pg_tenant_only) + sk_id_2 = SafekeeperId(11, "localhost", 5434) # just a mock + + # Request to switch before timeline creation should fail. + init_conf = Configuration(generation=1, members=[sk_id_1], new_members=None) + with pytest.raises(requests.exceptions.HTTPError): + http_cli.membership_switch(tenant_id, timeline_id, init_conf) + + # Create timeline. + create_r = TimelineCreateRequest( + tenant_id, timeline_id, init_conf, 150002, Lsn("0/1000000"), commit_lsn=None + ) + log.info(f"sending {create_r.to_json()}") + http_cli.timeline_create(create_r) + + # Switch into some conf.
+ joint_conf = Configuration(generation=4, members=[sk_id_1], new_members=[sk_id_2]) + resp = http_cli.membership_switch(tenant_id, timeline_id, joint_conf) + log.info(f"joint switch resp: {resp}") + assert resp.previous_conf.generation == 1 + assert resp.current_conf.generation == 4 + + # Restart sk, conf should be preserved. + sk.stop().start() + after_restart = http_cli.get_membership(tenant_id, timeline_id) + log.info(f"conf after restart: {after_restart}") + assert after_restart.generation == 4 + + # Switch into disjoint conf. + non_joint = Configuration(generation=5, members=[sk_id_2], new_members=None) + resp = http_cli.membership_switch(tenant_id, timeline_id, non_joint) + log.info(f"non joint switch resp: {resp}") + assert resp.previous_conf.generation == 4 + assert resp.current_conf.generation == 5 + + # Switch request to lower conf should be ignored. + lower_conf = Configuration(generation=3, members=[], new_members=None) + resp = http_cli.membership_switch(tenant_id, timeline_id, lower_conf) + log.info(f"lower switch resp: {resp}") + assert resp.previous_conf.generation == 5 + assert resp.current_conf.generation == 5 + + # In this test we check for excessive START_REPLICATION and START_WAL_PUSH queries # when compute is active, but there are no writes to the timeline. In that case # pageserver should maintain a single connection to safekeeper and don't attempt diff --git a/test_runner/websocket_tunnel.py b/test_runner/websocket_tunnel.py new file mode 100755 index 0000000000..facdb19140 --- /dev/null +++ b/test_runner/websocket_tunnel.py @@ -0,0 +1,154 @@ +#!/usr/bin/env python3 +# +# This program helps to test the WebSocket tunneling in proxy. It listens for a TCP +# connection on a port, and when you connect to it, it opens a websocket connection, +# and forwards all the traffic to the websocket connection, wrapped in WebSocket binary +# frames. +# +# This is used in the test_proxy::test_websockets test, but it is handy for manual testing too. 
+# +# Usage for manual testing: +# +# ## Launch Postgres on port 3000: +# postgres -D data -p3000 +# +# ## Launch proxy with WSS enabled: +# openssl req -new -x509 -days 365 -nodes -text -out server.crt -keyout server.key -subj '/CN=*.neon.localtest.me' +# ./target/debug/proxy --wss 127.0.0.1:40433 --http 127.0.0.1:28080 --mgmt 127.0.0.1:9099 --proxy 127.0.0.1:4433 --tls-key server.key --tls-cert server.crt --auth-backend postgres +# +# ## Launch the tunnel: +# +# poetry run ./test_runner/websocket_tunnel.py --ws-port 40433 --ws-url "wss://ep-test.neon.localtest.me" +# +# ## Now you can connect with psql: +# psql "postgresql://heikki@localhost:40433/postgres" +# + +import argparse +import asyncio +import logging +import ssl +from ssl import Purpose + +import websockets +from fixtures.log_helper import log + + +# Enable verbose logging of all the traffic +def enable_verbose_logging(): + logger = logging.getLogger("websockets") + logger.setLevel(logging.DEBUG) + logger.addHandler(logging.StreamHandler()) + + +async def start_server(tcp_listen_host, tcp_listen_port, ws_url, ctx): + server = await asyncio.start_server( + lambda r, w: handle_client(r, w, ws_url, ctx), tcp_listen_host, tcp_listen_port + ) + return server + + +async def handle_tcp_to_websocket(tcp_reader, ws): + try: + while not tcp_reader.at_eof(): + data = await tcp_reader.read(1024) + + await ws.send(data) + except websockets.exceptions.ConnectionClosedError as e: + log.debug(f"connection closed: {e}") + except websockets.exceptions.ConnectionClosedOK: + log.debug("connection closed") + except Exception as e: + log.error(e) + + +async def handle_websocket_to_tcp(ws, tcp_writer): + try: + async for message in ws: + tcp_writer.write(message) + await tcp_writer.drain() + except websockets.exceptions.ConnectionClosedError as e: + log.debug(f"connection closed: {e}") + except websockets.exceptions.ConnectionClosedOK: + log.debug("connection closed") + except Exception as e: + log.error(e) + + +async def
handle_client(tcp_reader, tcp_writer, ws_url: str, ctx: ssl.SSLContext): + try: + log.info("Received TCP connection. Connecting to websockets proxy.") + + async with websockets.connect(ws_url, ssl=ctx) as ws: + try: + log.info("Connected to websockets proxy") + + async with asyncio.TaskGroup() as tg: + task1 = tg.create_task(handle_tcp_to_websocket(tcp_reader, ws)) + task2 = tg.create_task(handle_websocket_to_tcp(ws, tcp_writer)) + + done, pending = await asyncio.wait( + [task1, task2], return_when=asyncio.FIRST_COMPLETED + ) + tcp_writer.close() + await ws.close() + + except* Exception as ex: + log.error(ex.exceptions) + except Exception as e: + log.error(e) + + +async def main(): + parser = argparse.ArgumentParser() + parser.add_argument( + "--tcp-listen-addr", + default="localhost", + help="TCP addr to listen on", + ) + parser.add_argument( + "--tcp-listen-port", + default="40444", + help="TCP port to listen on", + ) + + parser.add_argument( + "--ws-url", + default="wss://localhost/", + help="websocket URL to connect to. 
This determines the Host header sent to the server", + ) + parser.add_argument( + "--ws-host", + default="127.0.0.1", + help="websockets host to connect to", + ) + parser.add_argument( + "--ws-port", + type=int, + default=443, + help="websockets port to connect to", + ) + parser.add_argument( + "--verbose", + action="store_true", + help="enable verbose logging", + ) + args = parser.parse_args() + + if args.verbose: + enable_verbose_logging() + + ctx = ssl.create_default_context(Purpose.SERVER_AUTH) + ctx.check_hostname = False + ctx.verify_mode = ssl.CERT_NONE + + server = await start_server(args.tcp_listen_addr, args.tcp_listen_port, args.ws_url, ctx) + print( + f"Listening for connections at {args.tcp_listen_addr}:{args.tcp_listen_port}, forwarding them to {args.ws_host}:{args.ws_port}" + ) + async with server: + await server.serve_forever() + + +if __name__ == "__main__": + asyncio.run(main()) diff --git a/vendor/postgres-v14 b/vendor/postgres-v14 index c2f65b3201..46082f2088 160000 --- a/vendor/postgres-v14 +++ b/vendor/postgres-v14 @@ -1 +1 @@ -Subproject commit c2f65b3201591e02ce45b66731392f98d3388e73 +Subproject commit 46082f20884f087a2d974b33ac65d63af26142bd diff --git a/vendor/postgres-v15 b/vendor/postgres-v15 index f262d631ad..dd0b28d6fb 160000 --- a/vendor/postgres-v15 +++ b/vendor/postgres-v15 @@ -1 +1 @@ -Subproject commit f262d631ad477a1819e84a183e5a7ef561830085 +Subproject commit dd0b28d6fbad39e227f3b77296fcca879af8b3a9 diff --git a/vendor/postgres-v16 b/vendor/postgres-v16 index 97f9fde349..d674efd776 160000 --- a/vendor/postgres-v16 +++ b/vendor/postgres-v16 @@ -1 +1 @@ -Subproject commit 97f9fde349c6de6d573f5ce96db07eca60ce6185 +Subproject commit d674efd776f59d78e8fa1535bd2f95c3e6984fca diff --git a/vendor/postgres-v17 b/vendor/postgres-v17 index 7e3f3974bc..a8dd6e779d 160000 --- a/vendor/postgres-v17 +++ b/vendor/postgres-v17 @@ -1 +1 @@ -Subproject commit 7e3f3974bc8895938308f94d0e96879ffae638cd +Subproject commit 
a8dd6e779dde907778006adb436b557ad652fb97 diff --git a/vendor/revisions.json b/vendor/revisions.json index bff2f70931..c899dbaa5a 100644 --- a/vendor/revisions.json +++ b/vendor/revisions.json @@ -1,18 +1,18 @@ { "v17": [ "17.2", - "7e3f3974bc8895938308f94d0e96879ffae638cd" + "a8dd6e779dde907778006adb436b557ad652fb97" ], "v16": [ "16.6", - "97f9fde349c6de6d573f5ce96db07eca60ce6185" + "d674efd776f59d78e8fa1535bd2f95c3e6984fca" ], "v15": [ "15.10", - "f262d631ad477a1819e84a183e5a7ef561830085" + "dd0b28d6fbad39e227f3b77296fcca879af8b3a9" ], "v14": [ "14.15", - "c2f65b3201591e02ce45b66731392f98d3388e73" + "46082f20884f087a2d974b33ac65d63af26142bd" ] }