diff --git a/.config/hakari.toml b/.config/hakari.toml index b5990d090e..3b6d9d8822 100644 --- a/.config/hakari.toml +++ b/.config/hakari.toml @@ -46,6 +46,9 @@ workspace-members = [ "utils", "wal_craft", "walproposer", + "postgres-protocol2", + "postgres-types2", + "tokio-postgres2", ] # Write out exact versions rather than a semver range. (Defaults to false.) diff --git a/.github/actions/allure-report-generate/action.yml b/.github/actions/allure-report-generate/action.yml index d1d09223db..d6219c31b4 100644 --- a/.github/actions/allure-report-generate/action.yml +++ b/.github/actions/allure-report-generate/action.yml @@ -43,7 +43,8 @@ runs: PR_NUMBER=$(jq --raw-output .pull_request.number "$GITHUB_EVENT_PATH" || true) if [ "${PR_NUMBER}" != "null" ]; then BRANCH_OR_PR=pr-${PR_NUMBER} - elif [ "${GITHUB_REF_NAME}" = "main" ] || [ "${GITHUB_REF_NAME}" = "release" ] || [ "${GITHUB_REF_NAME}" = "release-proxy" ]; then + elif [ "${GITHUB_REF_NAME}" = "main" ] || [ "${GITHUB_REF_NAME}" = "release" ] || \ + [ "${GITHUB_REF_NAME}" = "release-proxy" ] || [ "${GITHUB_REF_NAME}" = "release-compute" ]; then # Shortcut for special branches BRANCH_OR_PR=${GITHUB_REF_NAME} else diff --git a/.github/actions/allure-report-store/action.yml b/.github/actions/allure-report-store/action.yml index 9c376f420a..3c83656c89 100644 --- a/.github/actions/allure-report-store/action.yml +++ b/.github/actions/allure-report-store/action.yml @@ -23,7 +23,8 @@ runs: PR_NUMBER=$(jq --raw-output .pull_request.number "$GITHUB_EVENT_PATH" || true) if [ "${PR_NUMBER}" != "null" ]; then BRANCH_OR_PR=pr-${PR_NUMBER} - elif [ "${GITHUB_REF_NAME}" = "main" ] || [ "${GITHUB_REF_NAME}" = "release" ] || [ "${GITHUB_REF_NAME}" = "release-proxy" ]; then + elif [ "${GITHUB_REF_NAME}" = "main" ] || [ "${GITHUB_REF_NAME}" = "release" ] || \ + [ "${GITHUB_REF_NAME}" = "release-proxy" ] || [ "${GITHUB_REF_NAME}" = "release-compute" ]; then # Shortcut for special branches BRANCH_OR_PR=${GITHUB_REF_NAME} else diff 
--git a/.github/actions/run-python-test-set/action.yml b/.github/actions/run-python-test-set/action.yml index 275f161019..1159627302 100644 --- a/.github/actions/run-python-test-set/action.yml +++ b/.github/actions/run-python-test-set/action.yml @@ -36,8 +36,8 @@ inputs: description: 'Region name for real s3 tests' required: false default: '' - rerun_flaky: - description: 'Whether to rerun flaky tests' + rerun_failed: + description: 'Whether to rerun failed tests' required: false default: 'false' pg_version: @@ -108,7 +108,7 @@ runs: COMPATIBILITY_SNAPSHOT_DIR: /tmp/compatibility_snapshot_pg${{ inputs.pg_version }} ALLOW_BACKWARD_COMPATIBILITY_BREAKAGE: contains(github.event.pull_request.labels.*.name, 'backward compatibility breakage') ALLOW_FORWARD_COMPATIBILITY_BREAKAGE: contains(github.event.pull_request.labels.*.name, 'forward compatibility breakage') - RERUN_FLAKY: ${{ inputs.rerun_flaky }} + RERUN_FAILED: ${{ inputs.rerun_failed }} PG_VERSION: ${{ inputs.pg_version }} shell: bash -euxo pipefail {0} run: | @@ -154,15 +154,8 @@ runs: EXTRA_PARAMS="--out-dir $PERF_REPORT_DIR $EXTRA_PARAMS" fi - if [ "${RERUN_FLAKY}" == "true" ]; then - mkdir -p $TEST_OUTPUT - poetry run ./scripts/flaky_tests.py "${TEST_RESULT_CONNSTR}" \ - --days 7 \ - --output "$TEST_OUTPUT/flaky.json" \ - --pg-version "${DEFAULT_PG_VERSION}" \ - --build-type "${BUILD_TYPE}" - - EXTRA_PARAMS="--flaky-tests-json $TEST_OUTPUT/flaky.json $EXTRA_PARAMS" + if [ "${RERUN_FAILED}" == "true" ]; then + EXTRA_PARAMS="--reruns 2 $EXTRA_PARAMS" fi # We use pytest-split plugin to run benchmarks in parallel on different CI runners diff --git a/.github/workflows/_build-and-test-locally.yml b/.github/workflows/_build-and-test-locally.yml index bdf7c07c6a..42c32a23e3 100644 --- a/.github/workflows/_build-and-test-locally.yml +++ b/.github/workflows/_build-and-test-locally.yml @@ -293,7 +293,7 @@ jobs: run_with_real_s3: true real_s3_bucket: neon-github-ci-tests real_s3_region: eu-central-1 - rerun_flaky: true + 
rerun_failed: true pg_version: ${{ matrix.pg_version }} env: TEST_RESULT_CONNSTR: ${{ secrets.REGRESS_TEST_RESULT_CONNSTR_NEW }} diff --git a/.github/workflows/_create-release-pr.yml b/.github/workflows/_create-release-pr.yml index cc6994397f..3c130c8229 100644 --- a/.github/workflows/_create-release-pr.yml +++ b/.github/workflows/_create-release-pr.yml @@ -21,7 +21,7 @@ defaults: shell: bash -euo pipefail {0} jobs: - create-storage-release-branch: + create-release-branch: runs-on: ubuntu-22.04 permissions: diff --git a/.github/workflows/benchmarking.yml b/.github/workflows/benchmarking.yml index ea8fee80c2..7621d72f64 100644 --- a/.github/workflows/benchmarking.yml +++ b/.github/workflows/benchmarking.yml @@ -249,7 +249,7 @@ jobs: # Post both success and failure to the Slack channel - name: Post to a Slack channel - if: ${{ github.event.schedule }} + if: ${{ github.event.schedule && !cancelled() }} uses: slackapi/slack-github-action@v1 with: channel-id: "C06T9AMNDQQ" # on-call-compute-staging-stream diff --git a/.github/workflows/build_and_test.yml b/.github/workflows/build_and_test.yml index 9830c2a0c9..cb966f292e 100644 --- a/.github/workflows/build_and_test.yml +++ b/.github/workflows/build_and_test.yml @@ -6,6 +6,7 @@ on: - main - release - release-proxy + - release-compute pull_request: defaults: @@ -70,8 +71,10 @@ jobs: echo "tag=release-$(git rev-list --count HEAD)" >> $GITHUB_OUTPUT elif [[ "$GITHUB_REF_NAME" == "release-proxy" ]]; then echo "tag=release-proxy-$(git rev-list --count HEAD)" >> $GITHUB_OUTPUT + elif [[ "$GITHUB_REF_NAME" == "release-compute" ]]; then + echo "tag=release-compute-$(git rev-list --count HEAD)" >> $GITHUB_OUTPUT else - echo "GITHUB_REF_NAME (value '$GITHUB_REF_NAME') is not set to either 'main' or 'release'" + echo "GITHUB_REF_NAME (value '$GITHUB_REF_NAME') is not set to either 'main' or 'release', 'release-proxy', 'release-compute'" echo "tag=$GITHUB_RUN_ID" >> $GITHUB_OUTPUT fi shell: bash @@ -513,7 +516,7 @@ jobs: }) 
trigger-e2e-tests: - if: ${{ !github.event.pull_request.draft || contains( github.event.pull_request.labels.*.name, 'run-e2e-tests-in-draft') || github.ref_name == 'main' || github.ref_name == 'release' || github.ref_name == 'release-proxy' }} + if: ${{ !github.event.pull_request.draft || contains( github.event.pull_request.labels.*.name, 'run-e2e-tests-in-draft') || github.ref_name == 'main' || github.ref_name == 'release' || github.ref_name == 'release-proxy' || github.ref_name == 'release-compute' }} needs: [ check-permissions, promote-images, tag ] uses: ./.github/workflows/trigger-e2e-tests.yml secrets: inherit @@ -669,7 +672,7 @@ jobs: neondatabase/compute-node-${{ matrix.version.pg }}:${{ needs.tag.outputs.build-tag }}-${{ matrix.version.debian }}-${{ matrix.arch }} - name: Build neon extensions test image - if: matrix.version.pg == 'v16' + if: matrix.version.pg >= 'v16' uses: docker/build-push-action@v6 with: context: . @@ -684,8 +687,7 @@ jobs: pull: true file: compute/compute-node.Dockerfile target: neon-pg-ext-test - cache-from: type=registry,ref=cache.neon.build/neon-test-extensions-${{ matrix.version.pg }}:cache-${{ matrix.version.debian }}-${{ matrix.arch }} - cache-to: ${{ github.ref_name == 'main' && format('type=registry,ref=cache.neon.build/neon-test-extensions-{0}:cache-{1}-{2},mode=max', matrix.version.pg, matrix.version.debian, matrix.arch) || '' }} + cache-from: type=registry,ref=cache.neon.build/compute-node-${{ matrix.version.pg }}:cache-${{ matrix.version.debian }}-${{ matrix.arch }} tags: | neondatabase/neon-test-extensions-${{ matrix.version.pg }}:${{needs.tag.outputs.build-tag}}-${{ matrix.version.debian }}-${{ matrix.arch }} @@ -708,7 +710,7 @@ jobs: push: true pull: true file: compute/compute-node.Dockerfile - cache-from: type=registry,ref=cache.neon.build/neon-test-extensions-${{ matrix.version.pg }}:cache-${{ matrix.version.debian }}-${{ matrix.arch }} + cache-from: type=registry,ref=cache.neon.build/compute-node-${{ 
matrix.version.pg }}:cache-${{ matrix.version.debian }}-${{ matrix.arch }} cache-to: ${{ github.ref_name == 'main' && format('type=registry,ref=cache.neon.build/compute-tools-{0}:cache-{1}-{2},mode=max', matrix.version.pg, matrix.version.debian, matrix.arch) || '' }} tags: | neondatabase/compute-tools:${{ needs.tag.outputs.build-tag }}-${{ matrix.version.debian }}-${{ matrix.arch }} @@ -744,7 +746,7 @@ jobs: neondatabase/compute-node-${{ matrix.version.pg }}:${{ needs.tag.outputs.build-tag }}-${{ matrix.version.debian }}-arm64 - name: Create multi-arch neon-test-extensions image - if: matrix.version.pg == 'v16' + if: matrix.version.pg >= 'v16' run: | docker buildx imagetools create -t neondatabase/neon-test-extensions-${{ matrix.version.pg }}:${{ needs.tag.outputs.build-tag }} \ -t neondatabase/neon-test-extensions-${{ matrix.version.pg }}:${{ needs.tag.outputs.build-tag }}-${{ matrix.version.debian }} \ @@ -833,6 +835,7 @@ jobs: fail-fast: false matrix: arch: [ x64, arm64 ] + pg_version: [v16, v17] runs-on: ${{ fromJson(format('["self-hosted", "{0}"]', matrix.arch == 'arm64' && 'small-arm64' || 'small')) }} @@ -871,7 +874,10 @@ jobs: - name: Verify docker-compose example and test extensions timeout-minutes: 20 - run: env TAG=${{needs.tag.outputs.build-tag}} ./docker-compose/docker_compose_test.sh + env: + TAG: ${{needs.tag.outputs.build-tag}} + TEST_VERSION_ONLY: ${{ matrix.pg_version }} + run: ./docker-compose/docker_compose_test.sh - name: Print logs and clean up if: always() @@ -931,7 +937,7 @@ jobs: neondatabase/neon-test-extensions-v16:${{ needs.tag.outputs.build-tag }} - name: Configure AWS-prod credentials - if: github.ref_name == 'release'|| github.ref_name == 'release-proxy' + if: github.ref_name == 'release'|| github.ref_name == 'release-proxy' || github.ref_name == 'release-compute' uses: aws-actions/configure-aws-credentials@v4 with: aws-region: eu-central-1 @@ -940,12 +946,12 @@ jobs: - name: Login to prod ECR uses: docker/login-action@v3 - if: 
github.ref_name == 'release'|| github.ref_name == 'release-proxy' + if: github.ref_name == 'release'|| github.ref_name == 'release-proxy' || github.ref_name == 'release-compute' with: registry: 093970136003.dkr.ecr.eu-central-1.amazonaws.com - name: Copy all images to prod ECR - if: github.ref_name == 'release'|| github.ref_name == 'release-proxy' + if: github.ref_name == 'release' || github.ref_name == 'release-proxy' || github.ref_name == 'release-compute' run: | for image in neon compute-tools {vm-,}compute-node-{v14,v15,v16,v17}; do docker buildx imagetools create -t 093970136003.dkr.ecr.eu-central-1.amazonaws.com/${image}:${{ needs.tag.outputs.build-tag }} \ @@ -965,7 +971,7 @@ jobs: tenant_id: ${{ vars.AZURE_TENANT_ID }} push-to-acr-prod: - if: github.ref_name == 'release'|| github.ref_name == 'release-proxy' + if: github.ref_name == 'release' || github.ref_name == 'release-proxy' || github.ref_name == 'release-compute' needs: [ tag, promote-images ] uses: ./.github/workflows/_push-to-acr.yml with: @@ -1053,7 +1059,7 @@ jobs: deploy: needs: [ check-permissions, promote-images, tag, build-and-test-locally, trigger-custom-extensions-build-and-wait, push-to-acr-dev, push-to-acr-prod ] # `!failure() && !cancelled()` is required because the workflow depends on the job that can be skipped: `push-to-acr-dev` and `push-to-acr-prod` - if: (github.ref_name == 'main' || github.ref_name == 'release' || github.ref_name == 'release-proxy') && !failure() && !cancelled() + if: (github.ref_name == 'main' || github.ref_name == 'release' || github.ref_name == 'release-proxy' || github.ref_name == 'release-compute') && !failure() && !cancelled() runs-on: [ self-hosted, small ] container: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/ansible:latest @@ -1102,13 +1108,15 @@ jobs: -f deployProxyAuthBroker=true \ -f branch=main \ -f dockerTag=${{needs.tag.outputs.build-tag}} + elif [[ "$GITHUB_REF_NAME" == "release-compute" ]]; then + gh workflow --repo neondatabase/infra run 
deploy-compute-dev.yml --ref main -f dockerTag=${{needs.tag.outputs.build-tag}} else - echo "GITHUB_REF_NAME (value '$GITHUB_REF_NAME') is not set to either 'main' or 'release'" + echo "GITHUB_REF_NAME (value '$GITHUB_REF_NAME') is not set to either 'main', 'release', 'release-proxy' or 'release-compute'" exit 1 fi - name: Create git tag - if: github.ref_name == 'release' || github.ref_name == 'release-proxy' + if: github.ref_name == 'release' || github.ref_name == 'release-proxy' || github.ref_name == 'release-compute' uses: actions/github-script@v7 with: # Retry script for 5XX server errors: https://github.com/actions/github-script#retries diff --git a/.github/workflows/ingest_benchmark.yml b/.github/workflows/ingest_benchmark.yml index 1033dc6489..a5810e91a4 100644 --- a/.github/workflows/ingest_benchmark.yml +++ b/.github/workflows/ingest_benchmark.yml @@ -26,6 +26,7 @@ concurrency: jobs: ingest: strategy: + fail-fast: false # allow other variants to continue even if one fails matrix: target_project: [new_empty_project, large_existing_project] permissions: diff --git a/.github/workflows/release.yml b/.github/workflows/release.yml index 11f010b6d4..f0273b977f 100644 --- a/.github/workflows/release.yml +++ b/.github/workflows/release.yml @@ -15,6 +15,10 @@ on: type: boolean description: 'Create Proxy release PR' required: false + create-compute-release-branch: + type: boolean + description: 'Create Compute release PR' + required: false # No permission for GITHUB_TOKEN by default; the **minimal required** set of permissions should be granted in each job. 
permissions: {} @@ -25,20 +29,20 @@ defaults: jobs: create-storage-release-branch: - if: ${{ github.event.schedule == '0 6 * * MON' || format('{0}', inputs.create-storage-release-branch) == 'true' }} + if: ${{ github.event.schedule == '0 6 * * MON' || inputs.create-storage-release-branch }} permissions: contents: write uses: ./.github/workflows/_create-release-pr.yml with: - component-name: 'Storage & Compute' + component-name: 'Storage' release-branch: 'release' secrets: ci-access-token: ${{ secrets.CI_ACCESS_TOKEN }} create-proxy-release-branch: - if: ${{ github.event.schedule == '0 6 * * THU' || format('{0}', inputs.create-proxy-release-branch) == 'true' }} + if: ${{ github.event.schedule == '0 6 * * THU' || inputs.create-proxy-release-branch }} permissions: contents: write @@ -49,3 +53,16 @@ jobs: release-branch: 'release-proxy' secrets: ci-access-token: ${{ secrets.CI_ACCESS_TOKEN }} + + create-compute-release-branch: + if: inputs.create-compute-release-branch + + permissions: + contents: write + + uses: ./.github/workflows/_create-release-pr.yml + with: + component-name: 'Compute' + release-branch: 'release-compute' + secrets: + ci-access-token: ${{ secrets.CI_ACCESS_TOKEN }} diff --git a/.github/workflows/trigger-e2e-tests.yml b/.github/workflows/trigger-e2e-tests.yml index 1e7264c55a..70c2e8549f 100644 --- a/.github/workflows/trigger-e2e-tests.yml +++ b/.github/workflows/trigger-e2e-tests.yml @@ -51,6 +51,8 @@ jobs: echo "tag=release-$(git rev-list --count HEAD)" | tee -a $GITHUB_OUTPUT elif [[ "$GITHUB_REF_NAME" == "release-proxy" ]]; then echo "tag=release-proxy-$(git rev-list --count HEAD)" >> $GITHUB_OUTPUT + elif [[ "$GITHUB_REF_NAME" == "release-compute" ]]; then + echo "tag=release-compute-$(git rev-list --count HEAD)" >> $GITHUB_OUTPUT else echo "GITHUB_REF_NAME (value '$GITHUB_REF_NAME') is not set to either 'main' or 'release'" BUILD_AND_TEST_RUN_ID=$(gh run list -b $CURRENT_BRANCH -c $CURRENT_SHA -w 'Build and Test' -L 1 --json databaseId --jq 
'.[].databaseId') diff --git a/CODEOWNERS b/CODEOWNERS index 21b0e7c51f..f41462c98b 100644 --- a/CODEOWNERS +++ b/CODEOWNERS @@ -2,6 +2,7 @@ /compute_tools/ @neondatabase/control-plane @neondatabase/compute /libs/pageserver_api/ @neondatabase/storage /libs/postgres_ffi/ @neondatabase/compute @neondatabase/storage +/libs/proxy/ @neondatabase/proxy /libs/remote_storage/ @neondatabase/storage /libs/safekeeper_api/ @neondatabase/storage /libs/vm_monitor/ @neondatabase/autoscaling diff --git a/Cargo.lock b/Cargo.lock index 43a46fb1eb..62f06d45bd 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -1,6 +1,6 @@ # This file is automatically @generated by Cargo. # It is not intended for manual editing. -version = 3 +version = 4 [[package]] name = "RustyXML" @@ -84,16 +84,16 @@ checksum = "4b46cbb362ab8752921c97e041f5e366ee6297bd428a31275b9fcf1e380f7299" [[package]] name = "anstream" -version = "0.3.2" +version = "0.6.15" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "0ca84f3628370c59db74ee214b3263d58f9aadd9b4fe7e711fd87dc452b7f163" +checksum = "64e15c1ab1f89faffbf04a634d5e1962e9074f2741eef6d97f3c4e322426d526" dependencies = [ "anstyle", "anstyle-parse", "anstyle-query", "anstyle-wincon", "colorchoice", - "is-terminal", + "is_terminal_polyfill", "utf8parse", ] @@ -123,12 +123,12 @@ dependencies = [ [[package]] name = "anstyle-wincon" -version = "1.0.1" +version = "3.0.4" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "180abfa45703aebe0093f79badacc01b8fd4ea2e35118747e5811127f926e188" +checksum = "5bf74e1b6e971609db8ca7a9ce79fd5768ab6ae46441c572e46cf596f59e57f8" dependencies = [ "anstyle", - "windows-sys 0.48.0", + "windows-sys 0.52.0", ] [[package]] @@ -185,7 +185,7 @@ checksum = "965c2d33e53cb6b267e148a4cb0760bc01f4904c1cd4bb4002a085bb016d1490" dependencies = [ "proc-macro2", "quote", - "syn 2.0.52", + "syn 2.0.90", "synstructure", ] @@ -197,7 +197,7 @@ checksum = 
"7b18050c2cd6fe86c3a76584ef5e0baf286d038cda203eb6223df2cc413565f7" dependencies = [ "proc-macro2", "quote", - "syn 2.0.52", + "syn 2.0.90", ] [[package]] @@ -256,7 +256,7 @@ checksum = "16e62a023e7c117e27523144c5d2459f4397fcc3cab0085af8e2224f643a0193" dependencies = [ "proc-macro2", "quote", - "syn 2.0.52", + "syn 2.0.90", ] [[package]] @@ -267,7 +267,7 @@ checksum = "b9ccdd8f2a161be9bd5c023df56f1b2a0bd1d83872ae53b71a84a12c9bf6e842" dependencies = [ "proc-macro2", "quote", - "syn 2.0.52", + "syn 2.0.90", ] [[package]] @@ -301,7 +301,7 @@ dependencies = [ "aws-smithy-types", "aws-types", "bytes", - "fastrand 2.0.0", + "fastrand 2.2.0", "hex", "http 0.2.9", "hyper 0.14.30", @@ -341,7 +341,7 @@ dependencies = [ "aws-smithy-types", "aws-types", "bytes", - "fastrand 2.0.0", + "fastrand 2.2.0", "http 0.2.9", "http-body 0.4.5", "once_cell", @@ -417,7 +417,7 @@ dependencies = [ "aws-smithy-xml", "aws-types", "bytes", - "fastrand 2.0.0", + "fastrand 2.2.0", "hex", "hmac", "http 0.2.9", @@ -621,7 +621,7 @@ dependencies = [ "aws-smithy-runtime-api", "aws-smithy-types", "bytes", - "fastrand 2.0.0", + "fastrand 2.2.0", "h2 0.3.26", "http 0.2.9", "http-body 0.4.5", @@ -969,7 +969,7 @@ dependencies = [ "regex", "rustc-hash", "shlex", - "syn 2.0.52", + "syn 2.0.90", ] [[package]] @@ -1031,9 +1031,9 @@ checksum = "14c189c53d098945499cdfa7ecc63567cf3886b3332b312a5b4585d8d3a6a610" [[package]] name = "bytes" -version = "1.5.0" +version = "1.9.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "a2bd12c1caf447e69cd4528f47f94d203fd2582878ecb9e9465484c4148a8223" +checksum = "325918d6fe32f23b19878fe4b34794ae41fc19ddbe53b10571a4874d44ffd39b" dependencies = [ "serde", ] @@ -1167,45 +1167,43 @@ dependencies = [ [[package]] name = "clap" -version = "4.3.0" +version = "4.5.22" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "93aae7a4192245f70fe75dd9157fc7b4a5bf53e88d30bd4396f7d8f9284d5acc" +checksum = 
"69371e34337c4c984bbe322360c2547210bf632eb2814bbe78a6e87a2935bd2b" dependencies = [ "clap_builder", "clap_derive", - "once_cell", ] [[package]] name = "clap_builder" -version = "4.3.0" +version = "4.5.22" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "4f423e341edefb78c9caba2d9c7f7687d0e72e89df3ce3394554754393ac3990" +checksum = "6e24c1b4099818523236a8ca881d2b45db98dadfb4625cf6608c12069fcbbde1" dependencies = [ "anstream", "anstyle", - "bitflags 1.3.2", "clap_lex", - "strsim", + "strsim 0.11.1", ] [[package]] name = "clap_derive" -version = "4.3.0" +version = "4.5.18" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "191d9573962933b4027f932c600cd252ce27a8ad5979418fe78e43c07996f27b" +checksum = "4ac6a0c7b1a9e9a5186361f67dfa1b88213572f427fb9ab038efb2bd8c582dab" dependencies = [ - "heck 0.4.1", + "heck", "proc-macro2", "quote", - "syn 2.0.52", + "syn 2.0.90", ] [[package]] name = "clap_lex" -version = "0.5.0" +version = "0.7.3" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "2da6da31387c7e4ef160ffab6d5e7f00c42626fe39aea70a7b0f1773f7dd6c1b" +checksum = "afb84c814227b90d6895e01398aee0d8033c00e7466aca416fb6a8e0eb19d8a7" [[package]] name = "colorchoice" @@ -1614,8 +1612,8 @@ dependencies = [ "ident_case", "proc-macro2", "quote", - "strsim", - "syn 2.0.52", + "strsim 0.10.0", + "syn 2.0.90", ] [[package]] @@ -1626,7 +1624,7 @@ checksum = "29a358ff9f12ec09c3e61fef9b5a9902623a695a46a917b07f269bff1445611a" dependencies = [ "darling_core", "quote", - "syn 2.0.52", + "syn 2.0.90", ] [[package]] @@ -1717,6 +1715,12 @@ dependencies = [ "utils", ] +[[package]] +name = "diatomic-waker" +version = "0.2.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ab03c107fafeb3ee9f5925686dbb7a73bc76e3932abb0d2b365cb64b169cf04c" + [[package]] name = "diesel" version = "2.2.3" @@ -1743,7 +1747,7 @@ dependencies = [ "dsl_auto_type", "proc-macro2", "quote", - "syn 2.0.52", + "syn 
2.0.90", ] [[package]] @@ -1763,7 +1767,7 @@ version = "0.2.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "209c735641a413bc68c4923a9d6ad4bcb3ca306b794edaa7eb0b3228a99ffb25" dependencies = [ - "syn 2.0.52", + "syn 2.0.90", ] [[package]] @@ -1786,7 +1790,7 @@ checksum = "487585f4d0c6655fe74905e2504d8ad6908e4db67f744eb140876906c2f3175d" dependencies = [ "proc-macro2", "quote", - "syn 2.0.52", + "syn 2.0.90", ] [[package]] @@ -1806,10 +1810,10 @@ checksum = "0892a17df262a24294c382f0d5997571006e7a4348b4327557c4ff1cd4a8bccc" dependencies = [ "darling", "either", - "heck 0.5.0", + "heck", "proc-macro2", "quote", - "syn 2.0.52", + "syn 2.0.90", ] [[package]] @@ -1941,7 +1945,7 @@ dependencies = [ "darling", "proc-macro2", "quote", - "syn 2.0.52", + "syn 2.0.90", ] [[package]] @@ -1974,7 +1978,7 @@ checksum = "3bf679796c0322556351f287a51b49e48f7c4986e727b5dd78c972d30e2e16cc" dependencies = [ "proc-macro2", "quote", - "syn 2.0.52", + "syn 2.0.90", ] [[package]] @@ -2048,9 +2052,9 @@ dependencies = [ [[package]] name = "fastrand" -version = "2.0.0" +version = "2.2.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "6999dc1837253364c2ebb0704ba97994bd874e8f195d665c50b7548f6ea92764" +checksum = "486f806e73c5707928240ddc295403b1b93c96a02038563881c4a2fd84b81ac4" [[package]] name = "ff" @@ -2228,7 +2232,7 @@ checksum = "162ee34ebcb7c64a8abebc059ce0fee27c2262618d7b60ed8faf72fef13c3650" dependencies = [ "proc-macro2", "quote", - "syn 2.0.52", + "syn 2.0.90", ] [[package]] @@ -2331,7 +2335,7 @@ checksum = "53010ccb100b96a67bc32c0175f0ed1426b31b655d562898e57325f81c023ac0" dependencies = [ "proc-macro2", "quote", - "syn 2.0.52", + "syn 2.0.90", ] [[package]] @@ -2459,12 +2463,6 @@ dependencies = [ "num-traits", ] -[[package]] -name = "heck" -version = "0.4.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "95505c38b4572b2d910cecb0281560f54b440a19336cbbcb27bf6ce6adc6f5a8" - [[package]] name = 
"heck" version = "0.5.0" @@ -2882,6 +2880,12 @@ dependencies = [ "windows-sys 0.52.0", ] +[[package]] +name = "is_terminal_polyfill" +version = "1.70.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7943c866cc5cd64cbc25b2e01621d07fa8eb2a1a23160ee81ce38704e97b8ecf" + [[package]] name = "itertools" version = "0.10.5" @@ -2906,6 +2910,23 @@ version = "1.0.11" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "49f1f14873335454500d59611f1cf4a4b0f786f9ac11f4312a78e4cf2566695b" +[[package]] +name = "jemalloc_pprof" +version = "0.6.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1a883828bd6a4b957cd9f618886ff19e5f3ebd34e06ba0e855849e049fef32fb" +dependencies = [ + "anyhow", + "libc", + "mappings", + "once_cell", + "pprof_util", + "tempfile", + "tikv-jemalloc-ctl", + "tokio", + "tracing", +] + [[package]] name = "jobserver" version = "0.1.32" @@ -3016,9 +3037,9 @@ dependencies = [ [[package]] name = "libc" -version = "0.2.150" +version = "0.2.167" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "89d92a4743f9a61002fae18374ed11e7973f530cb3a3255fb354818118b2203c" +checksum = "09d6582e104315a817dff97f75133544b2e094ee22447d2acf4a74e189ba06fc" [[package]] name = "libloading" @@ -3038,9 +3059,9 @@ checksum = "4ec2a862134d2a7d32d7983ddcdd1c4923530833c9f2ea1a44fc5fa473989058" [[package]] name = "linux-raw-sys" -version = "0.4.13" +version = "0.4.14" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "01cda141df6706de531b6c46c3a33ecca755538219bd484262fa09410c13539c" +checksum = "78b3ae25bc7c8c38cec158d1f2757ee79e9b3740fbc7ccf0e59e4b08d793fa89" [[package]] name = "linux-raw-sys" @@ -3073,6 +3094,19 @@ dependencies = [ "hashbrown 0.14.5", ] +[[package]] +name = "mappings" +version = "0.6.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ce9229c438fbf1c333926e2053c4c091feabbd40a1b590ec62710fea2384af9e" 
+dependencies = [ + "anyhow", + "libc", + "once_cell", + "pprof_util", + "tracing", +] + [[package]] name = "matchers" version = "0.1.0" @@ -3133,10 +3167,10 @@ version = "0.0.22" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "b9e6777fc80a575f9503d908c8b498782a6c3ee88a06cb416dc3941401e43b94" dependencies = [ - "heck 0.5.0", + "heck", "proc-macro2", "quote", - "syn 2.0.52", + "syn 2.0.90", ] [[package]] @@ -3340,6 +3374,7 @@ version = "0.4.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "b05180d69e3da0e530ba2a1dae5110317e49e3b7f3d41be227dc5f92e49ee7af" dependencies = [ + "num-bigint", "num-complex", "num-integer", "num-iter", @@ -3428,6 +3463,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "0638a1c9d0a3c0914158145bc76cff373a75a627e6ecbfb71cbe6f453a5a19b0" dependencies = [ "autocfg", + "num-bigint", "num-integer", "num-traits", ] @@ -3491,9 +3527,9 @@ dependencies = [ [[package]] name = "once_cell" -version = "1.18.0" +version = "1.20.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "dd8b5dd2ae5ed71462c540258bedcb51965123ad7e7ccf4b9a8cafaa4a63576d" +checksum = "1261fe7e33c73b354eab43b1273a57c8f967d0391e80353e51f764ac02cf6775" [[package]] name = "oorandom" @@ -3509,9 +3545,9 @@ checksum = "ff011a302c396a5197692431fc1948019154afc178baf7d8e37367442a4601cf" [[package]] name = "opentelemetry" -version = "0.24.0" +version = "0.26.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "4c365a63eec4f55b7efeceb724f1336f26a9cf3427b70e59e2cd2a5b947fba96" +checksum = "570074cc999d1a58184080966e5bd3bf3a9a4af650c3b05047c2621e7405cd17" dependencies = [ "futures-core", "futures-sink", @@ -3523,9 +3559,9 @@ dependencies = [ [[package]] name = "opentelemetry-http" -version = "0.13.0" +version = "0.26.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = 
"ad31e9de44ee3538fb9d64fe3376c1362f406162434609e79aea2a41a0af78ab" +checksum = "6351496aeaa49d7c267fb480678d85d1cd30c5edb20b497c48c56f62a8c14b99" dependencies = [ "async-trait", "bytes", @@ -3536,9 +3572,9 @@ dependencies = [ [[package]] name = "opentelemetry-otlp" -version = "0.17.0" +version = "0.26.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "6b925a602ffb916fb7421276b86756027b37ee708f9dce2dbdcc51739f07e727" +checksum = "29e1f9c8b032d4f635c730c0efcf731d5e2530ea13fa8bef7939ddc8420696bd" dependencies = [ "async-trait", "futures-core", @@ -3554,9 +3590,9 @@ dependencies = [ [[package]] name = "opentelemetry-proto" -version = "0.7.0" +version = "0.26.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "30ee9f20bff9c984511a02f082dc8ede839e4a9bf15cc2487c8d6fea5ad850d9" +checksum = "c9d3968ce3aefdcca5c27e3c4ea4391b37547726a70893aab52d3de95d5f8b34" dependencies = [ "opentelemetry", "opentelemetry_sdk", @@ -3566,15 +3602,15 @@ dependencies = [ [[package]] name = "opentelemetry-semantic-conventions" -version = "0.16.0" +version = "0.26.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "1cefe0543875379e47eb5f1e68ff83f45cc41366a92dfd0d073d513bf68e9a05" +checksum = "db945c1eaea8ac6a9677185357480d215bb6999faa9f691d0c4d4d641eab7a09" [[package]] name = "opentelemetry_sdk" -version = "0.24.1" +version = "0.26.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "692eac490ec80f24a17828d49b40b60f5aeaccdfe6a503f939713afd22bc28df" +checksum = "d2c627d9f4c9cdc1f21a29ee4bfbd6028fcb8bcf2a857b43f3abdf72c9c862f3" dependencies = [ "async-trait", "futures-channel", @@ -3948,7 +3984,7 @@ dependencies = [ "parquet", "proc-macro2", "quote", - "syn 2.0.52", + "syn 2.0.90", ] [[package]] @@ -4050,7 +4086,7 @@ checksum = "39407670928234ebc5e6e580247dd567ad73a3578460c5990f9503df207e8f07" dependencies = [ "proc-macro2", "quote", - "syn 2.0.52", + "syn 2.0.90", ] [[package]] @@ 
-4133,7 +4169,7 @@ dependencies = [ [[package]] name = "postgres" version = "0.19.4" -source = "git+https://github.com/neondatabase/rust-postgres.git?branch=neon#00940fcdb57a8e99e805297b75839e7c4c7b1796" +source = "git+https://github.com/neondatabase/rust-postgres.git?branch=neon#511f998c00148ab7c847bd7e6cfd3a906d0e7473" dependencies = [ "bytes", "fallible-iterator", @@ -4146,7 +4182,7 @@ dependencies = [ [[package]] name = "postgres-protocol" version = "0.6.4" -source = "git+https://github.com/neondatabase/rust-postgres.git?branch=neon#00940fcdb57a8e99e805297b75839e7c4c7b1796" +source = "git+https://github.com/neondatabase/rust-postgres.git?branch=neon#511f998c00148ab7c847bd7e6cfd3a906d0e7473" dependencies = [ "base64 0.20.0", "byteorder", @@ -4159,19 +4195,41 @@ dependencies = [ "rand 0.8.5", "sha2", "stringprep", +] + +[[package]] +name = "postgres-protocol2" +version = "0.1.0" +dependencies = [ + "base64 0.20.0", + "byteorder", + "bytes", + "fallible-iterator", + "hmac", + "memchr", + "rand 0.8.5", + "sha2", + "stringprep", "tokio", ] [[package]] name = "postgres-types" version = "0.2.4" -source = "git+https://github.com/neondatabase/rust-postgres.git?branch=neon#00940fcdb57a8e99e805297b75839e7c4c7b1796" +source = "git+https://github.com/neondatabase/rust-postgres.git?branch=neon#511f998c00148ab7c847bd7e6cfd3a906d0e7473" dependencies = [ "bytes", "fallible-iterator", "postgres-protocol", - "serde", - "serde_json", +] + +[[package]] +name = "postgres-types2" +version = "0.1.0" +dependencies = [ + "bytes", + "fallible-iterator", + "postgres-protocol2", ] [[package]] @@ -4268,6 +4326,19 @@ dependencies = [ "thiserror", ] +[[package]] +name = "pprof_util" +version = "0.6.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "65c568b3f8c1c37886ae07459b1946249e725c315306b03be5632f84c239f781" +dependencies = [ + "anyhow", + "flate2", + "num", + "paste", + "prost", +] + [[package]] name = "ppv-lite86" version = "0.2.17" @@ -4304,7 +4375,7 @@ 
source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "8d3928fb5db768cb86f891ff014f0144589297e3c6a1aba6ed7cecfdace270c7" dependencies = [ "proc-macro2", - "syn 2.0.52", + "syn 2.0.90", ] [[package]] @@ -4318,9 +4389,9 @@ dependencies = [ [[package]] name = "proc-macro2" -version = "1.0.78" +version = "1.0.92" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e2422ad645d89c99f8f3e6b88a9fdeca7fabeac836b1002371c4367c8f984aae" +checksum = "37d3544b3f2748c54e147655edb5025752e2303145b5aefb3c3ea2c78b973bb0" dependencies = [ "unicode-ident", ] @@ -4384,7 +4455,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "0c1318b19085f08681016926435853bbf7858f9c082d0999b80550ff5d9abe15" dependencies = [ "bytes", - "heck 0.5.0", + "heck", "itertools 0.12.1", "log", "multimap", @@ -4394,7 +4465,7 @@ dependencies = [ "prost", "prost-types", "regex", - "syn 2.0.52", + "syn 2.0.90", "tempfile", ] @@ -4408,7 +4479,7 @@ dependencies = [ "itertools 0.12.1", "proc-macro2", "quote", - "syn 2.0.52", + "syn 2.0.90", ] [[package]] @@ -4471,6 +4542,7 @@ dependencies = [ "ecdsa 0.16.9", "env_logger", "fallible-iterator", + "flate2", "framed-websockets", "futures", "hashbrown 0.14.5", @@ -4501,7 +4573,7 @@ dependencies = [ "parquet_derive", "pbkdf2", "pin-project-lite", - "postgres-protocol", + "postgres-protocol2", "postgres_backend", "pq_proto", "prometheus", @@ -4537,7 +4609,7 @@ dependencies = [ "tikv-jemallocator", "tokio", "tokio-postgres", - "tokio-postgres-rustls", + "tokio-postgres2", "tokio-rustls 0.26.0", "tokio-tungstenite", "tokio-util", @@ -4962,9 +5034,9 @@ dependencies = [ [[package]] name = "reqwest-middleware" -version = "0.3.0" +version = "0.4.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "0209efb52486ad88136190094ee214759ef7507068b27992256ed6610eb71a01" +checksum = "d1ccd3b55e711f91a9885a2fa6fbbb2e39db1776420b062efc058c6410f7e5e3" dependencies = [ "anyhow", 
"async-trait", @@ -4977,13 +5049,12 @@ dependencies = [ [[package]] name = "reqwest-retry" -version = "0.5.0" +version = "0.7.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "40f342894422862af74c50e1e9601cf0931accc9c6981e5eb413c46603b616b5" +checksum = "29c73e4195a6bfbcb174b790d9b3407ab90646976c55de58a6515da25d851178" dependencies = [ "anyhow", "async-trait", - "chrono", "futures", "getrandom 0.2.11", "http 1.1.0", @@ -4992,6 +5063,7 @@ dependencies = [ "reqwest 0.12.4", "reqwest-middleware", "retry-policies", + "thiserror", "tokio", "tracing", "wasm-timer", @@ -4999,9 +5071,9 @@ dependencies = [ [[package]] name = "reqwest-tracing" -version = "0.5.3" +version = "0.5.4" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "bfdd9bfa64c72233d8dd99ab7883efcdefe9e16d46488ecb9228b71a2e2ceb45" +checksum = "ff82cf5730a1311fb9413b0bc2b8e743e0157cd73f010ab4ec374a923873b6a2" dependencies = [ "anyhow", "async-trait", @@ -5017,12 +5089,10 @@ dependencies = [ [[package]] name = "retry-policies" -version = "0.3.0" +version = "0.4.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "493b4243e32d6eedd29f9a398896e35c6943a123b55eec97dcaee98310d25810" +checksum = "5875471e6cab2871bc150ecb8c727db5113c9338cc3354dc5ee3425b6aa40a1c" dependencies = [ - "anyhow", - "chrono", "rand 0.8.5", ] @@ -5146,7 +5216,7 @@ dependencies = [ "regex", "relative-path", "rustc_version", - "syn 2.0.52", + "syn 2.0.90", "unicode-ident", ] @@ -5192,14 +5262,14 @@ dependencies = [ [[package]] name = "rustix" -version = "0.38.28" +version = "0.38.41" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "72e572a5e8ca657d7366229cdde4bd14c4eb5499a9573d4d366fe1b599daa316" +checksum = "d7f649912bc1495e167a6edee79151c84b1bad49748cb4f1f1167f459f6224f6" dependencies = [ "bitflags 2.4.1", "errno", "libc", - "linux-raw-sys 0.4.13", + "linux-raw-sys 0.4.14", "windows-sys 0.52.0", ] @@ -5386,6 +5456,7 @@ 
dependencies = [ "strum", "strum_macros", "thiserror", + "tikv-jemallocator", "tokio", "tokio-io-timeout", "tokio-postgres", @@ -5653,7 +5724,7 @@ checksum = "500cbc0ebeb6f46627f50f3f5811ccf6bf00643be300b4c3eabc0ef55dc5b5ba" dependencies = [ "proc-macro2", "quote", - "syn 2.0.52", + "syn 2.0.90", ] [[package]] @@ -5735,7 +5806,7 @@ dependencies = [ "darling", "proc-macro2", "quote", - "syn 2.0.52", + "syn 2.0.90", ] [[package]] @@ -6092,6 +6163,12 @@ version = "0.10.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "73473c0e59e6d5812c5dfe2a064a6444949f089e20eec9a2e5506596494e4623" +[[package]] +name = "strsim" +version = "0.11.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7da8b5736845d9f2fcb837ea5d9e2628564b3b043a70948a3f0b778838c5fb4f" + [[package]] name = "strum" version = "0.26.3" @@ -6104,11 +6181,11 @@ version = "0.26.4" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "4c6bee85a5a24955dc440386795aa378cd9cf82acd5f764469152d2270e581be" dependencies = [ - "heck 0.5.0", + "heck", "proc-macro2", "quote", "rustversion", - "syn 2.0.52", + "syn 2.0.90", ] [[package]] @@ -6159,9 +6236,9 @@ dependencies = [ [[package]] name = "syn" -version = "2.0.52" +version = "2.0.90" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b699d15b36d1f02c3e7c69f8ffef53de37aefae075d8488d4ba1a7788d574a07" +checksum = "919d3b74a5dd0ccd15aeb8f93e7006bd9e14c295087c9896a110f490752bcf31" dependencies = [ "proc-macro2", "quote", @@ -6191,7 +6268,7 @@ checksum = "c8af7666ab7b6390ab78131fb5b0fce11d6b7a6951602017c35fa82800708971" dependencies = [ "proc-macro2", "quote", - "syn 2.0.52", + "syn 2.0.90", ] [[package]] @@ -6222,13 +6299,13 @@ dependencies = [ [[package]] name = "tempfile" -version = "3.9.0" +version = "3.14.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "01ce4141aa927a6d1bd34a041795abd0db1cccba5d5f24b009f694bdf3a1f3fa" +checksum = 
"28cce251fcbc87fac86a866eeb0d6c2d536fc16d06f184bb61aeae11aa4cee0c" dependencies = [ "cfg-if", - "fastrand 2.0.0", - "redox_syscall 0.4.1", + "fastrand 2.2.0", + "once_cell", "rustix", "windows-sys 0.52.0", ] @@ -6269,27 +6346,27 @@ checksum = "78ea17a2dc368aeca6f554343ced1b1e31f76d63683fa8016e5844bd7a5144a1" dependencies = [ "proc-macro2", "quote", - "syn 2.0.52", + "syn 2.0.90", ] [[package]] name = "thiserror" -version = "1.0.57" +version = "1.0.69" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "1e45bcbe8ed29775f228095caf2cd67af7a4ccf756ebff23a306bf3e8b47b24b" +checksum = "b6aaf5339b578ea85b50e080feb250a3e8ae8cfcdff9a461c9ec2904bc923f52" dependencies = [ "thiserror-impl", ] [[package]] name = "thiserror-impl" -version = "1.0.57" +version = "1.0.69" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "a953cb265bef375dae3de6663da4d3804eee9682ea80d8e2542529b73c531c81" +checksum = "4fee6c4efc90059e10f81e6d42c60a18f76588c3d74cb83a0b242a2b6c7504c1" dependencies = [ "proc-macro2", "quote", - "syn 2.0.52", + "syn 2.0.90", ] [[package]] @@ -6421,6 +6498,7 @@ dependencies = [ "libc", "mio", "num_cpus", + "parking_lot 0.12.1", "pin-project-lite", "signal-hook-registry", "socket2", @@ -6462,13 +6540,13 @@ checksum = "5f5ae998a069d4b5aba8ee9dad856af7d520c3699e6159b185c2acd48155d39a" dependencies = [ "proc-macro2", "quote", - "syn 2.0.52", + "syn 2.0.90", ] [[package]] name = "tokio-postgres" version = "0.7.7" -source = "git+https://github.com/neondatabase/rust-postgres.git?branch=neon#00940fcdb57a8e99e805297b75839e7c4c7b1796" +source = "git+https://github.com/neondatabase/rust-postgres.git?branch=neon#511f998c00148ab7c847bd7e6cfd3a906d0e7473" dependencies = [ "async-trait", "byteorder", @@ -6502,6 +6580,26 @@ dependencies = [ "x509-certificate", ] +[[package]] +name = "tokio-postgres2" +version = "0.1.0" +dependencies = [ + "async-trait", + "byteorder", + "bytes", + "fallible-iterator", + "futures-util", + "log", + 
"parking_lot 0.12.1", + "percent-encoding", + "phf", + "pin-project-lite", + "postgres-protocol2", + "postgres-types2", + "tokio", + "tokio-util", +] + [[package]] name = "tokio-rustls" version = "0.24.0" @@ -6667,7 +6765,7 @@ dependencies = [ "prost-build", "prost-types", "quote", - "syn 2.0.52", + "syn 2.0.90", ] [[package]] @@ -6704,9 +6802,9 @@ checksum = "b6bc1c9ce2b5135ac7f93c72918fc37feb872bdc6a5533a8b85eb4b86bfdae52" [[package]] name = "tracing" -version = "0.1.40" +version = "0.1.41" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "c3523ab5a71916ccf420eebdf5521fcef02141234bbc0b8a49f2fdc4544364ef" +checksum = "784e0ac535deb450455cbfa28a6f0df145ea1bb7ae51b821cf5e7927fdcfbdd0" dependencies = [ "log", "pin-project-lite", @@ -6727,20 +6825,20 @@ dependencies = [ [[package]] name = "tracing-attributes" -version = "0.1.27" +version = "0.1.28" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "34704c8d6ebcbc939824180af020566b01a7c01f80641264eba0999f6c2b6be7" +checksum = "395ae124c09f9e6918a2310af6038fba074bcf474ac352496d5910dd59a2226d" dependencies = [ "proc-macro2", "quote", - "syn 2.0.52", + "syn 2.0.90", ] [[package]] name = "tracing-core" -version = "0.1.32" +version = "0.1.33" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "c06d3da6113f116aaee68e4d601191614c9053067f9ab7f6edbcb161237daa54" +checksum = "e672c95779cf947c5311f83787af4fa8fffd12fb27e4993211a84bdfd9610f9c" dependencies = [ "once_cell", "valuable", @@ -6769,9 +6867,9 @@ dependencies = [ [[package]] name = "tracing-opentelemetry" -version = "0.25.0" +version = "0.27.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "a9784ed4da7d921bc8df6963f8c80a0e4ce34ba6ba76668acadd3edbd985ff3b" +checksum = "dc58af5d3f6c5811462cabb3289aec0093f7338e367e5a33d28c0433b3c7360b" dependencies = [ "js-sys", "once_cell", @@ -6787,9 +6885,9 @@ dependencies = [ [[package]] name = "tracing-serde" -version = 
"0.1.3" +version = "0.2.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "bc6b213177105856957181934e4920de57730fc69bf42c37ee5bb664d406d9e1" +checksum = "704b1aeb7be0d0a84fc9828cae51dab5970fee5088f83d1dd7ee6f6246fc6ff1" dependencies = [ "serde", "tracing-core", @@ -6797,9 +6895,9 @@ dependencies = [ [[package]] name = "tracing-subscriber" -version = "0.3.18" +version = "0.3.19" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "ad0f048c97dbd9faa9b7df56362b8ebcaa52adb06b498c050d2f4e32f90a7a8b" +checksum = "e8189decb5ac0fa7bc8b96b7cb9b2701d60d48805aca84a238004d665fcc4008" dependencies = [ "matchers", "once_cell", @@ -7000,6 +7098,7 @@ dependencies = [ "chrono", "const_format", "criterion", + "diatomic-waker", "fail", "futures", "git-version", @@ -7007,6 +7106,7 @@ dependencies = [ "hex-literal", "humantime", "hyper 0.14.30", + "jemalloc_pprof", "jsonwebtoken", "metrics", "nix 0.27.1", @@ -7018,6 +7118,7 @@ dependencies = [ "rand 0.8.5", "regex", "routerify", + "scopeguard", "sentry", "serde", "serde_assert", @@ -7204,7 +7305,7 @@ dependencies = [ "once_cell", "proc-macro2", "quote", - "syn 2.0.52", + "syn 2.0.90", "wasm-bindgen-shared", ] @@ -7238,7 +7339,7 @@ checksum = "e94f17b526d0a461a191c78ea52bbce64071ed5c04c9ffe424dcb38f74171bb7" dependencies = [ "proc-macro2", "quote", - "syn 2.0.52", + "syn 2.0.90", "wasm-bindgen-backend", "wasm-bindgen-shared", ] @@ -7592,12 +7693,15 @@ dependencies = [ "memchr", "nix 0.26.4", "nom", + "num", "num-bigint", + "num-complex", "num-integer", + "num-iter", + "num-rational", "num-traits", "once_cell", "parquet", - "postgres-types", "prettyplease", "proc-macro2", "prost", @@ -7616,13 +7720,13 @@ dependencies = [ "smallvec", "spki 0.7.3", "subtle", - "syn 2.0.52", + "syn 2.0.90", "sync_wrapper 0.1.2", + "tikv-jemalloc-ctl", "tikv-jemalloc-sys", "time", "time-macros", "tokio", - "tokio-postgres", "tokio-rustls 0.26.0", "tokio-stream", "tokio-util", @@ -7717,7 +7821,7 @@ 
checksum = "b3c129550b3e6de3fd0ba67ba5c81818f9805e58b8d7fee80a3a59d2c9fc601a" dependencies = [ "proc-macro2", "quote", - "syn 2.0.52", + "syn 2.0.90", ] [[package]] @@ -7738,7 +7842,7 @@ checksum = "ce36e65b0d2999d2aafac989fb249189a141aee1f53c612c1f37d72631959f69" dependencies = [ "proc-macro2", "quote", - "syn 2.0.52", + "syn 2.0.90", ] [[package]] diff --git a/Cargo.toml b/Cargo.toml index e3dc5b97f8..a35823e0c2 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -35,6 +35,9 @@ members = [ "libs/walproposer", "libs/wal_decoder", "libs/postgres_initdb", + "libs/proxy/postgres-protocol2", + "libs/proxy/postgres-types2", + "libs/proxy/tokio-postgres2", ] [workspace.package] @@ -71,7 +74,7 @@ bindgen = "0.70" bit_field = "0.10.2" bstr = "1.0" byteorder = "1.4" -bytes = "1.0" +bytes = "1.9" camino = "1.1.6" cfg-if = "1.0.0" chrono = { version = "0.4", default-features = false, features = ["clock"] } @@ -80,6 +83,7 @@ comfy-table = "7.1" const_format = "0.2" crc32c = "0.6" dashmap = { version = "5.5.0", features = ["raw-api"] } +diatomic-waker = { version = "0.2.3" } either = "1.8" enum-map = "2.4.2" enumset = "1.0.12" @@ -111,6 +115,7 @@ indoc = "2" ipnet = "2.10.0" itertools = "0.10" itoa = "1.0.11" +jemalloc_pprof = "0.6" jsonwebtoken = "9" lasso = "0.7" libc = "0.2" @@ -123,10 +128,10 @@ notify = "6.0.0" num_cpus = "1.15" num-traits = "0.2.15" once_cell = "1.13" -opentelemetry = "0.24" -opentelemetry_sdk = "0.24" -opentelemetry-otlp = { version = "0.17", default-features=false, features = ["http-proto", "trace", "http", "reqwest-client"] } -opentelemetry-semantic-conventions = "0.16" +opentelemetry = "0.26" +opentelemetry_sdk = "0.26" +opentelemetry-otlp = { version = "0.26", default-features=false, features = ["http-proto", "trace", "http", "reqwest-client"] } +opentelemetry-semantic-conventions = "0.26" parking_lot = "0.12" parquet = { version = "53", default-features = false, features = ["zstd"] } parquet_derive = "53" @@ -140,9 +145,9 @@ rand = "0.8" redis = { version 
= "0.25.2", features = ["tokio-rustls-comp", "keep-alive"] } regex = "1.10.2" reqwest = { version = "0.12", default-features = false, features = ["rustls-tls"] } -reqwest-tracing = { version = "0.5", features = ["opentelemetry_0_24"] } -reqwest-middleware = "0.3.0" -reqwest-retry = "0.5" +reqwest-tracing = { version = "0.5", features = ["opentelemetry_0_26"] } +reqwest-middleware = "0.4" +reqwest-retry = "0.7" routerify = "3" rpds = "0.13" rustc-hash = "1.1.0" @@ -171,7 +176,7 @@ sync_wrapper = "0.1.2" tar = "0.4" test-context = "0.3" thiserror = "1.0" -tikv-jemallocator = { version = "0.6", features = ["stats"] } +tikv-jemallocator = { version = "0.6", features = ["profiling", "stats", "unprefixed_malloc_on_supported_platforms"] } tikv-jemalloc-ctl = { version = "0.6", features = ["stats"] } tokio = { version = "1.17", features = ["macros"] } tokio-epoll-uring = { git = "https://github.com/neondatabase/tokio-epoll-uring.git" , branch = "main" } @@ -187,7 +192,7 @@ tonic = {version = "0.12.3", features = ["tls", "tls-roots"]} tower-service = "0.3.2" tracing = "0.1" tracing-error = "0.2" -tracing-opentelemetry = "0.25" +tracing-opentelemetry = "0.27" tracing-subscriber = { version = "0.3", default-features = false, features = ["smallvec", "fmt", "tracing-log", "std", "env-filter", "json"] } try-lock = "0.2.5" twox-hash = { version = "1.6.3", default-features = false } diff --git a/Makefile b/Makefile index dc67b87239..9cffc74508 100644 --- a/Makefile +++ b/Makefile @@ -147,6 +147,8 @@ postgres-%: postgres-configure-% \ $(MAKE) -C $(POSTGRES_INSTALL_DIR)/build/$*/contrib/pg_prewarm install +@echo "Compiling pg_buffercache $*" $(MAKE) -C $(POSTGRES_INSTALL_DIR)/build/$*/contrib/pg_buffercache install + +@echo "Compiling pg_visibility $*" + $(MAKE) -C $(POSTGRES_INSTALL_DIR)/build/$*/contrib/pg_visibility install +@echo "Compiling pageinspect $*" $(MAKE) -C $(POSTGRES_INSTALL_DIR)/build/$*/contrib/pageinspect install +@echo "Compiling amcheck $*" diff --git 
a/build-tools.Dockerfile b/build-tools.Dockerfile index 4f491afec5..2671702697 100644 --- a/build-tools.Dockerfile +++ b/build-tools.Dockerfile @@ -57,9 +57,9 @@ RUN mkdir -p /pgcopydb/bin && \ mkdir -p /pgcopydb/lib && \ chmod -R 755 /pgcopydb && \ chown -R nonroot:nonroot /pgcopydb - -COPY --from=pgcopydb_builder /usr/lib/postgresql/16/bin/pgcopydb /pgcopydb/bin/pgcopydb -COPY --from=pgcopydb_builder /pgcopydb/lib/libpq.so.5 /pgcopydb/lib/libpq.so.5 + +COPY --from=pgcopydb_builder /usr/lib/postgresql/16/bin/pgcopydb /pgcopydb/bin/pgcopydb +COPY --from=pgcopydb_builder /pgcopydb/lib/libpq.so.5 /pgcopydb/lib/libpq.so.5 # System deps # @@ -258,14 +258,14 @@ WORKDIR /home/nonroot # Rust # Please keep the version of llvm (installed above) in sync with rust llvm (`rustc --version --verbose | grep LLVM`) -ENV RUSTC_VERSION=1.82.0 +ENV RUSTC_VERSION=1.83.0 ENV RUSTUP_HOME="/home/nonroot/.rustup" ENV PATH="/home/nonroot/.cargo/bin:${PATH}" ARG RUSTFILT_VERSION=0.2.1 -ARG CARGO_HAKARI_VERSION=0.9.30 -ARG CARGO_DENY_VERSION=0.16.1 -ARG CARGO_HACK_VERSION=0.6.31 -ARG CARGO_NEXTEST_VERSION=0.9.72 +ARG CARGO_HAKARI_VERSION=0.9.33 +ARG CARGO_DENY_VERSION=0.16.2 +ARG CARGO_HACK_VERSION=0.6.33 +ARG CARGO_NEXTEST_VERSION=0.9.85 RUN curl -sSO https://static.rust-lang.org/rustup/dist/$(uname -m)-unknown-linux-gnu/rustup-init && whoami && \ chmod +x rustup-init && \ ./rustup-init -y --default-toolchain ${RUSTC_VERSION} && \ @@ -289,7 +289,7 @@ RUN whoami \ && cargo --version --verbose \ && rustup --version --verbose \ && rustc --version --verbose \ - && clang --version + && clang --version RUN if [ "${DEBIAN_VERSION}" = "bookworm" ]; then \ LD_LIBRARY_PATH=/pgcopydb/lib /pgcopydb/bin/pgcopydb --version; \ diff --git a/compute/compute-node.Dockerfile b/compute/compute-node.Dockerfile index 2fcd9985bc..bf6311bf2b 100644 --- a/compute/compute-node.Dockerfile +++ b/compute/compute-node.Dockerfile @@ -14,6 +14,9 @@ ARG DEBIAN_FLAVOR=${DEBIAN_VERSION}-slim FROM debian:$DEBIAN_FLAVOR AS 
build-deps ARG DEBIAN_VERSION +# Use strict mode for bash to catch errors early +SHELL ["/bin/bash", "-euo", "pipefail", "-c"] + RUN case $DEBIAN_VERSION in \ # Version-specific installs for Bullseye (PG14-PG16): # The h3_pg extension needs a cmake 3.20+, but Debian bullseye has 3.18. @@ -106,6 +109,7 @@ RUN cd postgres && \ # ######################################################################################### FROM build-deps AS postgis-build +ARG DEBIAN_VERSION ARG PG_VERSION COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/ RUN apt update && \ @@ -122,12 +126,12 @@ RUN apt update && \ # and also we must check backward compatibility with older versions of PostGIS. # # Use new version only for v17 -RUN case "${PG_VERSION}" in \ - "v17") \ +RUN case "${DEBIAN_VERSION}" in \ + "bookworm") \ export SFCGAL_VERSION=1.4.1 \ export SFCGAL_CHECKSUM=1800c8a26241588f11cddcf433049e9b9aea902e923414d2ecef33a3295626c3 \ ;; \ - "v14" | "v15" | "v16") \ + "bullseye") \ export SFCGAL_VERSION=1.3.10 \ export SFCGAL_CHECKSUM=4e39b3b2adada6254a7bdba6d297bb28e1a9835a9f879b74f37e2dab70203232 \ ;; \ @@ -228,6 +232,8 @@ FROM build-deps AS plv8-build ARG PG_VERSION COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/ +COPY compute/patches/plv8-3.1.10.patch /plv8-3.1.10.patch + RUN apt update && \ apt install --no-install-recommends -y ninja-build python3-dev libncurses5 binutils clang @@ -239,8 +245,6 @@ RUN apt update && \ # # Use new version only for v17 # because since v3.2, plv8 doesn't include plcoffee and plls extensions -ENV PLV8_TAG=v3.2.3 - RUN case "${PG_VERSION}" in \ "v17") \ export PLV8_TAG=v3.2.3 \ @@ -255,8 +259,9 @@ RUN case "${PG_VERSION}" in \ git clone --recurse-submodules --depth 1 --branch ${PLV8_TAG} https://github.com/plv8/plv8.git plv8-src && \ tar -czf plv8.tar.gz --exclude .git plv8-src && \ cd plv8-src && \ + if [[ "${PG_VERSION}" < "v17" ]]; then patch -p1 < /plv8-3.1.10.patch; fi && \ # generate and copy upgrade scripts - mkdir -p upgrade && 
./generate_upgrade.sh 3.1.10 && \ + mkdir -p upgrade && ./generate_upgrade.sh ${PLV8_TAG#v} && \ cp upgrade/* /usr/local/pgsql/share/extension/ && \ export PATH="/usr/local/pgsql/bin:$PATH" && \ make DOCKER=1 -j $(getconf _NPROCESSORS_ONLN) install && \ @@ -353,10 +358,10 @@ COPY compute/patches/pgvector.patch /pgvector.patch # because we build the images on different machines than where we run them. # Pass OPTFLAGS="" to remove it. # -# vector 0.7.4 supports v17 -# last release v0.7.4 - Aug 5, 2024 -RUN wget https://github.com/pgvector/pgvector/archive/refs/tags/v0.7.4.tar.gz -O pgvector.tar.gz && \ - echo "0341edf89b1924ae0d552f617e14fb7f8867c0194ed775bcc44fa40288642583 pgvector.tar.gz" | sha256sum --check && \ +# vector >0.7.4 supports v17 +# last release v0.8.0 - Oct 30, 2024 +RUN wget https://github.com/pgvector/pgvector/archive/refs/tags/v0.8.0.tar.gz -O pgvector.tar.gz && \ + echo "867a2c328d4928a5a9d6f052cd3bc78c7d60228a9b914ad32aa3db88e9de27b0 pgvector.tar.gz" | sha256sum --check && \ mkdir pgvector-src && cd pgvector-src && tar xzf ../pgvector.tar.gz --strip-components=1 -C . && \ patch -p1 < /pgvector.patch && \ make -j $(getconf _NPROCESSORS_ONLN) OPTFLAGS="" PG_CONFIG=/usr/local/pgsql/bin/pg_config && \ @@ -1362,15 +1367,12 @@ RUN make PG_VERSION="${PG_VERSION}" -C compute FROM neon-pg-ext-build AS neon-pg-ext-test ARG PG_VERSION -RUN case "${PG_VERSION}" in "v17") \ - echo "v17 extensions are not supported yet. 
Quit" && exit 0;; \ - esac && \ - mkdir /ext-src +RUN mkdir /ext-src #COPY --from=postgis-build /postgis.tar.gz /ext-src/ #COPY --from=postgis-build /sfcgal/* /usr COPY --from=plv8-build /plv8.tar.gz /ext-src/ -COPY --from=h3-pg-build /h3-pg.tar.gz /ext-src/ +#COPY --from=h3-pg-build /h3-pg.tar.gz /ext-src/ COPY --from=unit-pg-build /postgresql-unit.tar.gz /ext-src/ COPY --from=vector-pg-build /pgvector.tar.gz /ext-src/ COPY --from=vector-pg-build /pgvector.patch /ext-src/ @@ -1390,7 +1392,7 @@ COPY --from=hll-pg-build /hll.tar.gz /ext-src COPY --from=plpgsql-check-pg-build /plpgsql_check.tar.gz /ext-src #COPY --from=timescaledb-pg-build /timescaledb.tar.gz /ext-src COPY --from=pg-hint-plan-pg-build /pg_hint_plan.tar.gz /ext-src -COPY compute/patches/pg_hint_plan.patch /ext-src +COPY compute/patches/pg_hint_plan_${PG_VERSION}.patch /ext-src COPY --from=pg-cron-pg-build /pg_cron.tar.gz /ext-src COPY compute/patches/pg_cron.patch /ext-src #COPY --from=pg-pgx-ulid-build /home/nonroot/pgx_ulid.tar.gz /ext-src @@ -1400,38 +1402,23 @@ COPY --from=pg-roaringbitmap-pg-build /pg_roaringbitmap.tar.gz /ext-src COPY --from=pg-semver-pg-build /pg_semver.tar.gz /ext-src #COPY --from=pg-embedding-pg-build /home/nonroot/pg_embedding-src/ /ext-src #COPY --from=wal2json-pg-build /wal2json_2_5.tar.gz /ext-src -COPY --from=pg-anon-pg-build /pg_anon.tar.gz /ext-src +#pg_anon is not supported yet for pg v17 so, don't fail if nothing found +COPY --from=pg-anon-pg-build /pg_anon.tar.g? /ext-src COPY compute/patches/pg_anon.patch /ext-src COPY --from=pg-ivm-build /pg_ivm.tar.gz /ext-src COPY --from=pg-partman-build /pg_partman.tar.gz /ext-src -RUN case "${PG_VERSION}" in "v17") \ - echo "v17 extensions are not supported yet. 
Quit" && exit 0;; \ - esac && \ - cd /ext-src/ && for f in *.tar.gz; \ +RUN cd /ext-src/ && for f in *.tar.gz; \ do echo $f; dname=$(echo $f | sed 's/\.tar.*//')-src; \ rm -rf $dname; mkdir $dname; tar xzf $f --strip-components=1 -C $dname \ || exit 1; rm -f $f; done -RUN case "${PG_VERSION}" in "v17") \ - echo "v17 extensions are not supported yet. Quit" && exit 0;; \ - esac && \ - cd /ext-src/rum-src && patch -p1 <../rum.patch -RUN case "${PG_VERSION}" in "v17") \ - echo "v17 extensions are not supported yet. Quit" && exit 0;; \ - esac && \ - cd /ext-src/pgvector-src && patch -p1 <../pgvector.patch -RUN case "${PG_VERSION}" in "v17") \ - echo "v17 extensions are not supported yet. Quit" && exit 0;; \ - esac && \ - cd /ext-src/pg_hint_plan-src && patch -p1 < /ext-src/pg_hint_plan.patch +RUN cd /ext-src/rum-src && patch -p1 <../rum.patch +RUN cd /ext-src/pgvector-src && patch -p1 <../pgvector.patch +RUN cd /ext-src/pg_hint_plan-src && patch -p1 < /ext-src/pg_hint_plan_${PG_VERSION}.patch COPY --chmod=755 docker-compose/run-tests.sh /run-tests.sh RUN case "${PG_VERSION}" in "v17") \ - echo "v17 extensions are not supported yet. Quit" && exit 0;; \ - esac && \ - patch -p1 +Date: Sat Nov 30 18:29:32 2024 +0000 + + Fix v8 9.7.37 compilation on Debian 12 + +diff --git a/patches/code/84cf3230a9680aac3b73c410c2b758760b6d3066.patch b/patches/code/84cf3230a9680aac3b73c410c2b758760b6d3066.patch +new file mode 100644 +index 0000000..f0a5dc7 +--- /dev/null ++++ b/patches/code/84cf3230a9680aac3b73c410c2b758760b6d3066.patch +@@ -0,0 +1,30 @@ ++From 84cf3230a9680aac3b73c410c2b758760b6d3066 Mon Sep 17 00:00:00 2001 ++From: Michael Lippautz ++Date: Thu, 27 Jan 2022 14:14:11 +0100 ++Subject: [PATCH] cppgc: Fix include ++ ++Add to cover for std::exchange. 
++ ++Bug: v8:12585 ++Change-Id: Ida65144e93e466be8914527d0e646f348c136bcb ++Reviewed-on: https://chromium-review.googlesource.com/c/v8/v8/+/3420309 ++Auto-Submit: Michael Lippautz ++Reviewed-by: Omer Katz ++Commit-Queue: Michael Lippautz ++Cr-Commit-Position: refs/heads/main@{#78820} ++--- ++ src/heap/cppgc/prefinalizer-handler.h | 1 + ++ 1 file changed, 1 insertion(+) ++ ++diff --git a/src/heap/cppgc/prefinalizer-handler.h b/src/heap/cppgc/prefinalizer-handler.h ++index bc17c99b1838..c82c91ff5a45 100644 ++--- a/src/heap/cppgc/prefinalizer-handler.h +++++ b/src/heap/cppgc/prefinalizer-handler.h ++@@ -5,6 +5,7 @@ ++ #ifndef V8_HEAP_CPPGC_PREFINALIZER_HANDLER_H_ ++ #define V8_HEAP_CPPGC_PREFINALIZER_HANDLER_H_ ++ +++#include ++ #include ++ ++ #include "include/cppgc/prefinalizer.h" diff --git a/compute_tools/src/bin/compute_ctl.rs b/compute_tools/src/bin/compute_ctl.rs index 6b670de2ea..e73ccd908e 100644 --- a/compute_tools/src/bin/compute_ctl.rs +++ b/compute_tools/src/bin/compute_ctl.rs @@ -37,6 +37,7 @@ use std::collections::HashMap; use std::fs::File; use std::path::Path; use std::process::exit; +use std::str::FromStr; use std::sync::atomic::Ordering; use std::sync::{mpsc, Arc, Condvar, Mutex, RwLock}; use std::{thread, time::Duration}; @@ -322,11 +323,19 @@ fn wait_spec( } else { spec_set = false; } + let connstr = Url::parse(connstr).context("cannot parse connstr as a URL")?; + let conn_conf = postgres::config::Config::from_str(connstr.as_str()) + .context("cannot build postgres config from connstr")?; + let tokio_conn_conf = tokio_postgres::config::Config::from_str(connstr.as_str()) + .context("cannot build tokio postgres config from connstr")?; let compute_node = ComputeNode { - connstr: Url::parse(connstr).context("cannot parse connstr as a URL")?, + connstr, + conn_conf, + tokio_conn_conf, pgdata: pgdata.to_string(), pgbin: pgbin.to_string(), pgversion: get_pg_version_string(pgbin), + http_port, live_config_allowed, state: Mutex::new(new_state), 
state_changed: Condvar::new(), @@ -381,7 +390,6 @@ fn wait_spec( Ok(WaitSpecResult { compute, - http_port, resize_swap_on_bind, set_disk_quota_for_fs: set_disk_quota_for_fs.cloned(), }) @@ -389,8 +397,6 @@ fn wait_spec( struct WaitSpecResult { compute: Arc, - // passed through from ProcessCliResult - http_port: u16, resize_swap_on_bind: bool, set_disk_quota_for_fs: Option, } @@ -400,7 +406,6 @@ fn start_postgres( #[allow(unused_variables)] matches: &clap::ArgMatches, WaitSpecResult { compute, - http_port, resize_swap_on_bind, set_disk_quota_for_fs, }: WaitSpecResult, @@ -473,12 +478,10 @@ fn start_postgres( } } - let extension_server_port: u16 = http_port; - // Start Postgres let mut pg = None; if !prestartup_failed { - pg = match compute.start_compute(extension_server_port) { + pg = match compute.start_compute() { Ok(pg) => Some(pg), Err(err) => { error!("could not start the compute node: {:#}", err); diff --git a/compute_tools/src/bin/fast_import.rs b/compute_tools/src/bin/fast_import.rs index 6716cc6234..b6db3eb11a 100644 --- a/compute_tools/src/bin/fast_import.rs +++ b/compute_tools/src/bin/fast_import.rs @@ -21,7 +21,7 @@ //! - Build the image with the following command: //! //! ```bash -//! docker buildx build --build-arg DEBIAN_FLAVOR=bullseye-slim --build-arg GIT_VERSION=local --build-arg PG_VERSION=v14 --build-arg BUILD_TAG="$(date --iso-8601=s -u)" -t localhost:3030/localregistry/compute-node-v14:latest -f compute/Dockerfile.com +//! docker buildx build --platform linux/amd64 --build-arg DEBIAN_VERSION=bullseye --build-arg GIT_VERSION=local --build-arg PG_VERSION=v14 --build-arg BUILD_TAG="$(date --iso-8601=s -u)" -t localhost:3030/localregistry/compute-node-v14:latest -f compute/compute-node.Dockerfile . //! docker push localhost:3030/localregistry/compute-node-v14:latest //! 
``` @@ -132,7 +132,8 @@ pub(crate) async fn main() -> anyhow::Result<()> { // // Initialize pgdata // - let pg_version = match get_pg_version(pg_bin_dir.as_str()) { + let pgbin = pg_bin_dir.join("postgres"); + let pg_version = match get_pg_version(pgbin.as_ref()) { PostgresMajorVersion::V14 => 14, PostgresMajorVersion::V15 => 15, PostgresMajorVersion::V16 => 16, @@ -155,7 +156,7 @@ pub(crate) async fn main() -> anyhow::Result<()> { // // Launch postgres process // - let mut postgres_proc = tokio::process::Command::new(pg_bin_dir.join("postgres")) + let mut postgres_proc = tokio::process::Command::new(pgbin) .arg("-D") .arg(&pgdata_dir) .args(["-c", "wal_level=minimal"]) diff --git a/compute_tools/src/catalog.rs b/compute_tools/src/catalog.rs index 2f6f82dd39..72198a9479 100644 --- a/compute_tools/src/catalog.rs +++ b/compute_tools/src/catalog.rs @@ -1,4 +1,3 @@ -use compute_api::responses::CatalogObjects; use futures::Stream; use postgres::NoTls; use std::{path::Path, process::Stdio, result::Result, sync::Arc}; @@ -7,19 +6,17 @@ use tokio::{ process::Command, spawn, }; -use tokio_postgres::connect; use tokio_stream::{self as stream, StreamExt}; use tokio_util::codec::{BytesCodec, FramedRead}; use tracing::warn; use crate::compute::ComputeNode; -use crate::pg_helpers::{get_existing_dbs_async, get_existing_roles_async}; +use crate::pg_helpers::{get_existing_dbs_async, get_existing_roles_async, postgres_conf_for_db}; +use compute_api::responses::CatalogObjects; pub async fn get_dbs_and_roles(compute: &Arc) -> anyhow::Result { - let connstr = compute.connstr.clone(); - - let (client, connection): (tokio_postgres::Client, _) = - connect(connstr.as_str(), NoTls).await?; + let conf = compute.get_tokio_conn_conf(Some("compute_ctl:get_dbs_and_roles")); + let (client, connection): (tokio_postgres::Client, _) = conf.connect(NoTls).await?; spawn(async move { if let Err(e) = connection.await { @@ -43,6 +40,8 @@ pub enum SchemaDumpError { DatabaseDoesNotExist, #[error("Failed to 
execute pg_dump.")] IO(#[from] std::io::Error), + #[error("Unexpected error.")] + Unexpected, } // It uses the pg_dump utility to dump the schema of the specified database. @@ -60,11 +59,38 @@ pub async fn get_database_schema( let pgbin = &compute.pgbin; let basepath = Path::new(pgbin).parent().unwrap(); let pgdump = basepath.join("pg_dump"); - let mut connstr = compute.connstr.clone(); - connstr.set_path(dbname); + + // Replace the DB in the connection string and disable it to parts. + // This is the only option to handle DBs with special characters. + let conf = + postgres_conf_for_db(&compute.connstr, dbname).map_err(|_| SchemaDumpError::Unexpected)?; + let host = conf + .get_hosts() + .first() + .ok_or(SchemaDumpError::Unexpected)?; + let host = match host { + tokio_postgres::config::Host::Tcp(ip) => ip.to_string(), + #[cfg(unix)] + tokio_postgres::config::Host::Unix(path) => path.to_string_lossy().to_string(), + }; + let port = conf + .get_ports() + .first() + .ok_or(SchemaDumpError::Unexpected)?; + let user = conf.get_user().ok_or(SchemaDumpError::Unexpected)?; + let dbname = conf.get_dbname().ok_or(SchemaDumpError::Unexpected)?; + let mut cmd = Command::new(pgdump) + // XXX: this seems to be the only option to deal with DBs with `=` in the name + // See + .env("PGDATABASE", dbname) + .arg("--host") + .arg(host) + .arg("--port") + .arg(port.to_string()) + .arg("--username") + .arg(user) .arg("--schema-only") - .arg(connstr.as_str()) .stdout(Stdio::piped()) .stderr(Stdio::piped()) .kill_on_drop(true) diff --git a/compute_tools/src/checker.rs b/compute_tools/src/checker.rs index cec2b1bed8..62d61a8bc9 100644 --- a/compute_tools/src/checker.rs +++ b/compute_tools/src/checker.rs @@ -9,7 +9,8 @@ use crate::compute::ComputeNode; #[instrument(skip_all)] pub async fn check_writability(compute: &ComputeNode) -> Result<()> { // Connect to the database. 
- let (client, connection) = tokio_postgres::connect(compute.connstr.as_str(), NoTls).await?; + let conf = compute.get_tokio_conn_conf(Some("compute_ctl:availability_checker")); + let (client, connection) = conf.connect(NoTls).await?; if client.is_closed() { return Err(anyhow!("connection to postgres closed")); } diff --git a/compute_tools/src/compute.rs b/compute_tools/src/compute.rs index 4f67425ba8..0d1e6d680f 100644 --- a/compute_tools/src/compute.rs +++ b/compute_tools/src/compute.rs @@ -20,8 +20,9 @@ use futures::future::join_all; use futures::stream::FuturesUnordered; use futures::StreamExt; use nix::unistd::Pid; +use postgres; use postgres::error::SqlState; -use postgres::{Client, NoTls}; +use postgres::NoTls; use tracing::{debug, error, info, instrument, warn}; use utils::id::{TenantId, TimelineId}; use utils::lsn::Lsn; @@ -34,9 +35,8 @@ use utils::measured_stream::MeasuredReader; use nix::sys::signal::{kill, Signal}; use remote_storage::{DownloadError, RemotePath}; use tokio::spawn; -use url::Url; -use crate::installed_extensions::get_installed_extensions_sync; +use crate::installed_extensions::get_installed_extensions; use crate::local_proxy; use crate::pg_helpers::*; use crate::spec::*; @@ -59,6 +59,10 @@ pub static PG_PID: AtomicU32 = AtomicU32::new(0); pub struct ComputeNode { // Url type maintains proper escaping pub connstr: url::Url, + // We connect to Postgres from many different places, so build configs once + // and reuse them where needed. + pub conn_conf: postgres::config::Config, + pub tokio_conn_conf: tokio_postgres::config::Config, pub pgdata: String, pub pgbin: String, pub pgversion: String, @@ -75,6 +79,8 @@ pub struct ComputeNode { /// - we push spec and it does configuration /// - but then it is restarted without any spec again pub live_config_allowed: bool, + /// The port that the compute's HTTP server listens on + pub http_port: u16, /// Volatile part of the `ComputeNode`, which should be used under `Mutex`. 
/// To allow HTTP API server to serving status requests, while configuration /// is in progress, lock should be held only for short periods of time to do @@ -607,11 +613,7 @@ impl ComputeNode { /// Do all the preparations like PGDATA directory creation, configuration, /// safekeepers sync, basebackup, etc. #[instrument(skip_all)] - pub fn prepare_pgdata( - &self, - compute_state: &ComputeState, - extension_server_port: u16, - ) -> Result<()> { + pub fn prepare_pgdata(&self, compute_state: &ComputeState) -> Result<()> { let pspec = compute_state.pspec.as_ref().expect("spec must be set"); let spec = &pspec.spec; let pgdata_path = Path::new(&self.pgdata); @@ -621,7 +623,7 @@ impl ComputeNode { config::write_postgres_conf( &pgdata_path.join("postgresql.conf"), &pspec.spec, - Some(extension_server_port), + self.http_port, )?; // Syncing safekeepers is only safe with primary nodes: if a primary @@ -801,10 +803,10 @@ impl ComputeNode { /// version. In the future, it may upgrade all 3rd-party extensions. 
#[instrument(skip_all)] pub fn post_apply_config(&self) -> Result<()> { - let connstr = self.connstr.clone(); + let conf = self.get_conn_conf(Some("compute_ctl:post_apply_config")); thread::spawn(move || { let func = || { - let mut client = Client::connect(connstr.as_str(), NoTls)?; + let mut client = conf.connect(NoTls)?; handle_neon_extension_upgrade(&mut client) .context("handle_neon_extension_upgrade")?; Ok::<_, anyhow::Error>(()) @@ -816,30 +818,48 @@ impl ComputeNode { Ok(()) } - async fn get_maintenance_client(url: &Url) -> Result { - let mut connstr = url.clone(); + pub fn get_conn_conf(&self, application_name: Option<&str>) -> postgres::Config { + let mut conf = self.conn_conf.clone(); + if let Some(application_name) = application_name { + conf.application_name(application_name); + } + conf + } - connstr - .query_pairs_mut() - .append_pair("application_name", "apply_config"); + pub fn get_tokio_conn_conf(&self, application_name: Option<&str>) -> tokio_postgres::Config { + let mut conf = self.tokio_conn_conf.clone(); + if let Some(application_name) = application_name { + conf.application_name(application_name); + } + conf + } - let (client, conn) = match tokio_postgres::connect(connstr.as_str(), NoTls).await { + async fn get_maintenance_client( + conf: &tokio_postgres::Config, + ) -> Result { + let mut conf = conf.clone(); + conf.application_name("compute_ctl:apply_config"); + + let (client, conn) = match conf.connect(NoTls).await { + // If connection fails, it may be the old node with `zenith_admin` superuser. + // + // In this case we need to connect with old `zenith_admin` name + // and create new user. We cannot simply rename connected user, + // but we can create a new one and grant it all privileges. 
Err(e) => match e.code() { Some(&SqlState::INVALID_PASSWORD) | Some(&SqlState::INVALID_AUTHORIZATION_SPECIFICATION) => { - // connect with zenith_admin if cloud_admin could not authenticate + // Connect with zenith_admin if cloud_admin could not authenticate info!( "cannot connect to postgres: {}, retrying with `zenith_admin` username", e ); - let mut zenith_admin_connstr = connstr.clone(); - - zenith_admin_connstr - .set_username("zenith_admin") - .map_err(|_| anyhow::anyhow!("invalid connstr"))?; + let mut zenith_admin_conf = postgres::config::Config::from(conf.clone()); + zenith_admin_conf.application_name("compute_ctl:apply_config"); + zenith_admin_conf.user("zenith_admin"); let mut client = - Client::connect(zenith_admin_connstr.as_str(), NoTls) + zenith_admin_conf.connect(NoTls) .context("broken cloud_admin credential: tried connecting with cloud_admin but could not authenticate, and zenith_admin does not work either")?; // Disable forwarding so that users don't get a cloud_admin role @@ -853,8 +873,8 @@ impl ComputeNode { drop(client); - // reconnect with connstring with expected name - tokio_postgres::connect(connstr.as_str(), NoTls).await? + // Reconnect with connstring with expected name + conf.connect(NoTls).await? } _ => return Err(e.into()), }, @@ -885,7 +905,7 @@ impl ComputeNode { pub fn apply_spec_sql( &self, spec: Arc, - url: Arc, + conf: Arc, concurrency: usize, ) -> Result<()> { let rt = tokio::runtime::Builder::new_multi_thread() @@ -897,7 +917,7 @@ impl ComputeNode { rt.block_on(async { // Proceed with post-startup configuration. Note, that order of operations is important. 
- let client = Self::get_maintenance_client(&url).await?; + let client = Self::get_maintenance_client(&conf).await?; let spec = spec.clone(); let databases = get_existing_dbs_async(&client).await?; @@ -931,7 +951,7 @@ impl ComputeNode { RenameAndDeleteDatabases, CreateAndAlterDatabases, ] { - debug!("Applying phase {:?}", &phase); + info!("Applying phase {:?}", &phase); apply_operations( spec.clone(), ctx.clone(), @@ -942,6 +962,7 @@ impl ComputeNode { .await?; } + info!("Applying RunInEachDatabase phase"); let concurrency_token = Arc::new(tokio::sync::Semaphore::new(concurrency)); let db_processes = spec @@ -955,7 +976,7 @@ impl ComputeNode { let spec = spec.clone(); let ctx = ctx.clone(); let jwks_roles = jwks_roles.clone(); - let mut url = url.as_ref().clone(); + let mut conf = conf.as_ref().clone(); let concurrency_token = concurrency_token.clone(); let db = db.clone(); @@ -964,14 +985,14 @@ impl ComputeNode { match &db { DB::SystemDB => {} DB::UserDB(db) => { - url.set_path(db.name.as_str()); + conf.dbname(db.name.as_str()); } } - let url = Arc::new(url); + let conf = Arc::new(conf); let fut = Self::apply_spec_sql_db( spec.clone(), - url, + conf, ctx.clone(), jwks_roles.clone(), concurrency_token.clone(), @@ -1017,7 +1038,7 @@ impl ComputeNode { /// semaphore. The caller has to make sure the semaphore isn't exhausted. async fn apply_spec_sql_db( spec: Arc, - url: Arc, + conf: Arc, ctx: Arc>, jwks_roles: Arc>, concurrency_token: Arc, @@ -1046,7 +1067,7 @@ impl ComputeNode { // that database. || async { if client_conn.is_none() { - let db_client = Self::get_maintenance_client(&url).await?; + let db_client = Self::get_maintenance_client(&conf).await?; client_conn.replace(db_client); } let client = client_conn.as_ref().unwrap(); @@ -1061,34 +1082,16 @@ impl ComputeNode { Ok::<(), anyhow::Error>(()) } - /// Do initial configuration of the already started Postgres. 
- #[instrument(skip_all)] - pub fn apply_config(&self, compute_state: &ComputeState) -> Result<()> { - // If connection fails, - // it may be the old node with `zenith_admin` superuser. - // - // In this case we need to connect with old `zenith_admin` name - // and create new user. We cannot simply rename connected user, - // but we can create a new one and grant it all privileges. - let mut url = self.connstr.clone(); - url.query_pairs_mut() - .append_pair("application_name", "apply_config"); - - let url = Arc::new(url); - let spec = Arc::new( - compute_state - .pspec - .as_ref() - .expect("spec must be set") - .spec - .clone(), - ); - - // Choose how many concurrent connections to use for applying the spec changes. - // If the cluster is not currently Running we don't have to deal with user connections, + /// Choose how many concurrent connections to use for applying the spec changes. + pub fn max_service_connections( + &self, + compute_state: &ComputeState, + spec: &ComputeSpec, + ) -> usize { + // If the cluster is in Init state we don't have to deal with user connections, // and can thus use all `max_connections` connection slots. However, that's generally not // very efficient, so we generally still limit it to a smaller number. - let max_concurrent_connections = if compute_state.status != ComputeStatus::Running { + if compute_state.status == ComputeStatus::Init { // If the settings contain 'max_connections', use that as template if let Some(config) = spec.cluster.settings.find("max_connections") { config.parse::().ok() @@ -1144,10 +1147,28 @@ impl ComputeNode { .map(|val| if val > 1 { val - 1 } else { 1 }) .last() .unwrap_or(3) - }; + } + } + + /// Do initial configuration of the already started Postgres. 
+ #[instrument(skip_all)] + pub fn apply_config(&self, compute_state: &ComputeState) -> Result<()> { + let conf = self.get_tokio_conn_conf(Some("compute_ctl:apply_config")); + + let conf = Arc::new(conf); + let spec = Arc::new( + compute_state + .pspec + .as_ref() + .expect("spec must be set") + .spec + .clone(), + ); + + let max_concurrent_connections = self.max_service_connections(compute_state, &spec); // Merge-apply spec & changes to PostgreSQL state. - self.apply_spec_sql(spec.clone(), url.clone(), max_concurrent_connections)?; + self.apply_spec_sql(spec.clone(), conf.clone(), max_concurrent_connections)?; if let Some(ref local_proxy) = &spec.clone().local_proxy_config { info!("configuring local_proxy"); @@ -1156,12 +1177,11 @@ impl ComputeNode { // Run migrations separately to not hold up cold starts thread::spawn(move || { - let mut connstr = url.as_ref().clone(); - connstr - .query_pairs_mut() - .append_pair("application_name", "migrations"); + let conf = conf.as_ref().clone(); + let mut conf = postgres::config::Config::from(conf); + conf.application_name("compute_ctl:migrations"); - let mut client = Client::connect(connstr.as_str(), NoTls)?; + let mut client = conf.connect(NoTls)?; handle_migrations(&mut client).context("apply_config handle_migrations") }); @@ -1221,22 +1241,29 @@ impl ComputeNode { // Write new config let pgdata_path = Path::new(&self.pgdata); let postgresql_conf_path = pgdata_path.join("postgresql.conf"); - config::write_postgres_conf(&postgresql_conf_path, &spec, None)?; - // temporarily reset max_cluster_size in config + config::write_postgres_conf(&postgresql_conf_path, &spec, self.http_port)?; + + // TODO(ololobus): We need a concurrency during reconfiguration as well, + // but DB is already running and used by user. We can easily get out of + // `max_connections` limit, and the current code won't handle that. 
+ // let compute_state = self.state.lock().unwrap().clone(); + // let max_concurrent_connections = self.max_service_connections(&compute_state, &spec); + let max_concurrent_connections = 1; + + // Temporarily reset max_cluster_size in config // to avoid the possibility of hitting the limit, while we are reconfiguring: - // creating new extensions, roles, etc... + // creating new extensions, roles, etc. config::with_compute_ctl_tmp_override(pgdata_path, "neon.max_cluster_size=-1", || { self.pg_reload_conf()?; if spec.mode == ComputeMode::Primary { - let mut url = self.connstr.clone(); - url.query_pairs_mut() - .append_pair("application_name", "apply_config"); - let url = Arc::new(url); + let mut conf = tokio_postgres::Config::from_str(self.connstr.as_str()).unwrap(); + conf.application_name("apply_config"); + let conf = Arc::new(conf); let spec = Arc::new(spec.clone()); - self.apply_spec_sql(spec, url, 1)?; + self.apply_spec_sql(spec, conf, max_concurrent_connections)?; } Ok(()) @@ -1255,10 +1282,7 @@ impl ComputeNode { } #[instrument(skip_all)] - pub fn start_compute( - &self, - extension_server_port: u16, - ) -> Result<(std::process::Child, std::thread::JoinHandle<()>)> { + pub fn start_compute(&self) -> Result<(std::process::Child, std::thread::JoinHandle<()>)> { let compute_state = self.state.lock().unwrap().clone(); let pspec = compute_state.pspec.as_ref().expect("spec must be set"); info!( @@ -1333,7 +1357,7 @@ impl ComputeNode { info!("{:?}", remote_ext_metrics); } - self.prepare_pgdata(&compute_state, extension_server_port)?; + self.prepare_pgdata(&compute_state)?; let start_time = Utc::now(); let pg_process = self.start_postgres(pspec.storage_auth_token.clone())?; @@ -1360,9 +1384,19 @@ impl ComputeNode { } self.post_apply_config()?; - let connstr = self.connstr.clone(); + let conf = self.get_conn_conf(None); thread::spawn(move || { - get_installed_extensions_sync(connstr).context("get_installed_extensions") + let res = get_installed_extensions(conf); + 
match res { + Ok(extensions) => { + info!( + "[NEON_EXT_STAT] {}", + serde_json::to_string(&extensions) + .expect("failed to serialize extensions list") + ); + } + Err(err) => error!("could not get installed extensions: {err:?}"), + } }); } @@ -1491,7 +1525,8 @@ impl ComputeNode { /// Select `pg_stat_statements` data and return it as a stringified JSON pub async fn collect_insights(&self) -> String { let mut result_rows: Vec = Vec::new(); - let connect_result = tokio_postgres::connect(self.connstr.as_str(), NoTls).await; + let conf = self.get_tokio_conn_conf(Some("compute_ctl:collect_insights")); + let connect_result = conf.connect(NoTls).await; let (client, connection) = connect_result.unwrap(); tokio::spawn(async move { if let Err(e) = connection.await { @@ -1617,10 +1652,9 @@ LIMIT 100", privileges: &[Privilege], role_name: &PgIdent, ) -> Result<()> { - use tokio_postgres::config::Config; use tokio_postgres::NoTls; - let mut conf = Config::from_str(self.connstr.as_str()).unwrap(); + let mut conf = self.get_tokio_conn_conf(Some("compute_ctl:set_role_grants")); conf.dbname(db_name); let (db_client, conn) = conf @@ -1657,10 +1691,9 @@ LIMIT 100", db_name: &PgIdent, ext_version: ExtVersion, ) -> Result { - use tokio_postgres::config::Config; use tokio_postgres::NoTls; - let mut conf = Config::from_str(self.connstr.as_str()).unwrap(); + let mut conf = self.get_tokio_conn_conf(Some("compute_ctl:install_extension")); conf.dbname(db_name); let (db_client, conn) = conf diff --git a/compute_tools/src/config.rs b/compute_tools/src/config.rs index d65fe73194..b257c8a68f 100644 --- a/compute_tools/src/config.rs +++ b/compute_tools/src/config.rs @@ -37,7 +37,7 @@ pub fn line_in_file(path: &Path, line: &str) -> Result { pub fn write_postgres_conf( path: &Path, spec: &ComputeSpec, - extension_server_port: Option, + extension_server_port: u16, ) -> Result<()> { // File::create() destroys the file content if it exists. 
let mut file = File::create(path)?; @@ -127,9 +127,7 @@ pub fn write_postgres_conf( writeln!(file, "# Managed by compute_ctl: end")?; } - if let Some(port) = extension_server_port { - writeln!(file, "neon.extension_server_port={}", port)?; - } + writeln!(file, "neon.extension_server_port={}", extension_server_port)?; // This is essential to keep this line at the end of the file, // because it is intended to override any settings above. diff --git a/compute_tools/src/http/api.rs b/compute_tools/src/http/api.rs index 8a047634df..7fa6426d8f 100644 --- a/compute_tools/src/http/api.rs +++ b/compute_tools/src/http/api.rs @@ -295,8 +295,12 @@ async fn routes(req: Request, compute: &Arc) -> Response render_json(Body::from(serde_json::to_string(&res).unwrap())), Err(e) => render_json_error( diff --git a/compute_tools/src/installed_extensions.rs b/compute_tools/src/installed_extensions.rs index 79d8b2ca04..5f62f08858 100644 --- a/compute_tools/src/installed_extensions.rs +++ b/compute_tools/src/installed_extensions.rs @@ -2,12 +2,9 @@ use compute_api::responses::{InstalledExtension, InstalledExtensions}; use metrics::proto::MetricFamily; use std::collections::HashMap; use std::collections::HashSet; -use tracing::info; -use url::Url; use anyhow::Result; use postgres::{Client, NoTls}; -use tokio::task; use metrics::core::Collector; use metrics::{register_uint_gauge_vec, UIntGaugeVec}; @@ -42,75 +39,53 @@ fn list_dbs(client: &mut Client) -> Result> { /// /// Same extension can be installed in multiple databases with different versions, /// we only keep the highest and lowest version across all databases. 
-pub async fn get_installed_extensions(connstr: Url) -> Result { - let mut connstr = connstr.clone(); +pub fn get_installed_extensions(mut conf: postgres::config::Config) -> Result { + conf.application_name("compute_ctl:get_installed_extensions"); + let mut client = conf.connect(NoTls)?; - task::spawn_blocking(move || { - let mut client = Client::connect(connstr.as_str(), NoTls)?; - let databases: Vec = list_dbs(&mut client)?; + let databases: Vec = list_dbs(&mut client)?; - let mut extensions_map: HashMap = HashMap::new(); - for db in databases.iter() { - connstr.set_path(db); - let mut db_client = Client::connect(connstr.as_str(), NoTls)?; - let extensions: Vec<(String, String)> = db_client - .query( - "SELECT extname, extversion FROM pg_catalog.pg_extension;", - &[], - )? - .iter() - .map(|row| (row.get("extname"), row.get("extversion"))) - .collect(); + let mut extensions_map: HashMap = HashMap::new(); + for db in databases.iter() { + conf.dbname(db); + let mut db_client = conf.connect(NoTls)?; + let extensions: Vec<(String, String)> = db_client + .query( + "SELECT extname, extversion FROM pg_catalog.pg_extension;", + &[], + )? 
+ .iter() + .map(|row| (row.get("extname"), row.get("extversion"))) + .collect(); - for (extname, v) in extensions.iter() { - let version = v.to_string(); + for (extname, v) in extensions.iter() { + let version = v.to_string(); - // increment the number of databases where the version of extension is installed - INSTALLED_EXTENSIONS - .with_label_values(&[extname, &version]) - .inc(); + // increment the number of databases where the version of extension is installed + INSTALLED_EXTENSIONS + .with_label_values(&[extname, &version]) + .inc(); - extensions_map - .entry(extname.to_string()) - .and_modify(|e| { - e.versions.insert(version.clone()); - // count the number of databases where the extension is installed - e.n_databases += 1; - }) - .or_insert(InstalledExtension { - extname: extname.to_string(), - versions: HashSet::from([version.clone()]), - n_databases: 1, - }); - } + extensions_map + .entry(extname.to_string()) + .and_modify(|e| { + e.versions.insert(version.clone()); + // count the number of databases where the extension is installed + e.n_databases += 1; + }) + .or_insert(InstalledExtension { + extname: extname.to_string(), + versions: HashSet::from([version.clone()]), + n_databases: 1, + }); } + } - let res = InstalledExtensions { - extensions: extensions_map.values().cloned().collect(), - }; + let res = InstalledExtensions { + extensions: extensions_map.into_values().collect(), + }; - Ok(res) - }) - .await? 
-} - -// Gather info about installed extensions -pub fn get_installed_extensions_sync(connstr: Url) -> Result<()> { - let rt = tokio::runtime::Builder::new_current_thread() - .enable_all() - .build() - .expect("failed to create runtime"); - let result = rt - .block_on(crate::installed_extensions::get_installed_extensions( - connstr, - )) - .expect("failed to get installed extensions"); - - info!( - "[NEON_EXT_STAT] {}", - serde_json::to_string(&result).expect("failed to serialize extensions list") - ); - Ok(()) + Ok(res) } static INSTALLED_EXTENSIONS: Lazy = Lazy::new(|| { diff --git a/compute_tools/src/monitor.rs b/compute_tools/src/monitor.rs index d7127aac32..184f380a8d 100644 --- a/compute_tools/src/monitor.rs +++ b/compute_tools/src/monitor.rs @@ -17,11 +17,8 @@ const MONITOR_CHECK_INTERVAL: Duration = Duration::from_millis(500); // should be handled gracefully. fn watch_compute_activity(compute: &ComputeNode) { // Suppose that `connstr` doesn't change - let mut connstr = compute.connstr.clone(); - connstr - .query_pairs_mut() - .append_pair("application_name", "compute_activity_monitor"); - let connstr = connstr.as_str(); + let connstr = compute.connstr.clone(); + let conf = compute.get_conn_conf(Some("compute_ctl:activity_monitor")); // During startup and configuration we connect to every Postgres database, // but we don't want to count this as some user activity. So wait until @@ -29,7 +26,7 @@ fn watch_compute_activity(compute: &ComputeNode) { wait_for_postgres_start(compute); // Define `client` outside of the loop to reuse existing connection if it's active. - let mut client = Client::connect(connstr, NoTls); + let mut client = conf.connect(NoTls); let mut sleep = false; let mut prev_active_time: Option = None; @@ -57,7 +54,7 @@ fn watch_compute_activity(compute: &ComputeNode) { info!("connection to Postgres is closed, trying to reconnect"); // Connection is closed, reconnect and try again. 
- client = Client::connect(connstr, NoTls); + client = conf.connect(NoTls); continue; } @@ -196,7 +193,7 @@ fn watch_compute_activity(compute: &ComputeNode) { debug!("could not connect to Postgres: {}, retrying", e); // Establish a new connection and try again. - client = Client::connect(connstr, NoTls); + client = conf.connect(NoTls); } } } diff --git a/compute_tools/src/pg_helpers.rs b/compute_tools/src/pg_helpers.rs index 4a1e5ee0e8..e03b410699 100644 --- a/compute_tools/src/pg_helpers.rs +++ b/compute_tools/src/pg_helpers.rs @@ -6,6 +6,7 @@ use std::io::{BufRead, BufReader}; use std::os::unix::fs::PermissionsExt; use std::path::Path; use std::process::Child; +use std::str::FromStr; use std::thread::JoinHandle; use std::time::{Duration, Instant}; @@ -13,8 +14,10 @@ use anyhow::{bail, Result}; use futures::StreamExt; use ini::Ini; use notify::{RecursiveMode, Watcher}; +use postgres::config::Config; use tokio::io::AsyncBufReadExt; use tokio::time::timeout; +use tokio_postgres; use tokio_postgres::NoTls; use tracing::{debug, error, info, instrument}; @@ -542,3 +545,11 @@ async fn handle_postgres_logs_async(stderr: tokio::process::ChildStderr) -> Resu Ok(()) } + +/// `Postgres::config::Config` handles database names with whitespaces +/// and special characters properly. +pub fn postgres_conf_for_db(connstr: &url::Url, dbname: &str) -> Result { + let mut conf = Config::from_str(connstr.as_str())?; + conf.dbname(dbname); + Ok(conf) +} diff --git a/control_plane/src/endpoint.rs b/control_plane/src/endpoint.rs index 71514daa7c..1ca6dc43c4 100644 --- a/control_plane/src/endpoint.rs +++ b/control_plane/src/endpoint.rs @@ -310,6 +310,10 @@ impl Endpoint { conf.append("wal_log_hints", "off"); conf.append("max_replication_slots", "10"); conf.append("hot_standby", "on"); + // Set to 1MB to both exercise getPage requests/LFC, and still have enough room for + // Postgres to operate. 
Everything smaller might be not enough for Postgres under load, + // and can cause errors like 'no unpinned buffers available', see + // conf.append("shared_buffers", "1MB"); conf.append("fsync", "off"); conf.append("max_connections", "100"); diff --git a/control_plane/src/safekeeper.rs b/control_plane/src/safekeeper.rs index 7a019bce88..f0c3722925 100644 --- a/control_plane/src/safekeeper.rs +++ b/control_plane/src/safekeeper.rs @@ -5,6 +5,7 @@ //! ```text //! .neon/safekeepers/ //! ``` +use std::error::Error as _; use std::future::Future; use std::io::Write; use std::path::PathBuf; @@ -26,7 +27,7 @@ use crate::{ #[derive(Error, Debug)] pub enum SafekeeperHttpError { - #[error("Reqwest error: {0}")] + #[error("request error: {0}{}", .0.source().map(|e| format!(": {e}")).unwrap_or_default())] Transport(#[from] reqwest::Error), #[error("Error: {0}")] diff --git a/control_plane/storcon_cli/src/main.rs b/control_plane/storcon_cli/src/main.rs index b7f38c6286..e879424532 100644 --- a/control_plane/storcon_cli/src/main.rs +++ b/control_plane/storcon_cli/src/main.rs @@ -560,14 +560,26 @@ async fn main() -> anyhow::Result<()> { .await?; } Command::TenantDescribe { tenant_id } => { - let describe_response = storcon_client + let TenantDescribeResponse { + tenant_id, + shards, + stripe_size, + policy, + config, + } = storcon_client .dispatch::<(), TenantDescribeResponse>( Method::GET, format!("control/v1/tenant/{tenant_id}"), None, ) .await?; - let shards = describe_response.shards; + println!("Tenant {tenant_id}"); + let mut table = comfy_table::Table::new(); + table.add_row(["Policy", &format!("{:?}", policy)]); + table.add_row(["Stripe size", &format!("{:?}", stripe_size)]); + table.add_row(["Config", &serde_json::to_string_pretty(&config).unwrap()]); + println!("{table}"); + println!("Shards:"); let mut table = comfy_table::Table::new(); table.set_header(["Shard", "Attached", "Secondary", "Last error", "status"]); for shard in shards { diff --git 
a/docker-compose/compute_wrapper/Dockerfile b/docker-compose/compute_wrapper/Dockerfile index 8378f37b48..05a2cf124c 100644 --- a/docker-compose/compute_wrapper/Dockerfile +++ b/docker-compose/compute_wrapper/Dockerfile @@ -4,14 +4,16 @@ ARG TAG=latest FROM $REPOSITORY/${COMPUTE_IMAGE}:$TAG +ARG COMPUTE_IMAGE + USER root RUN apt-get update && \ apt-get install -y curl \ jq \ python3-pip \ - netcat + netcat-openbsd #Faker is required for the pg_anon test -RUN pip3 install Faker +RUN case $COMPUTE_IMAGE in compute-node-v17) OPT="--break-system-packages";; *) OPT= ;; esac && pip3 install $OPT Faker #This is required for the pg_hintplan test RUN mkdir -p /ext-src/pg_hint_plan-src && chown postgres /ext-src/pg_hint_plan-src diff --git a/docker-compose/docker_compose_test.sh b/docker-compose/docker_compose_test.sh index 10805a9952..c97dfaa901 100755 --- a/docker-compose/docker_compose_test.sh +++ b/docker-compose/docker_compose_test.sh @@ -30,10 +30,17 @@ cleanup() { docker compose --profile test-extensions -f $COMPOSE_FILE down } -for pg_version in 14 15 16; do +for pg_version in ${TEST_VERSION_ONLY-14 15 16 17}; do + pg_version=${pg_version/v/} echo "clean up containers if exists" cleanup - PG_TEST_VERSION=$(($pg_version < 16 ? 16 : $pg_version)) + PG_TEST_VERSION=$((pg_version < 16 ? 16 : pg_version)) + # The support of pg_anon not yet added to PG17, so we have to remove the corresponding option + if [ $pg_version -eq 17 ]; then + SPEC_PATH="compute_wrapper/var/db/postgres/specs" + mv $SPEC_PATH/spec.json $SPEC_PATH/spec.bak + jq 'del(.cluster.settings[] | select (.name == "session_preload_libraries"))' $SPEC_PATH/spec.bak > $SPEC_PATH/spec.json + fi PG_VERSION=$pg_version PG_TEST_VERSION=$PG_TEST_VERSION docker compose --profile test-extensions -f $COMPOSE_FILE up --build -d echo "wait until the compute is ready. timeout after 60s. 
" @@ -54,8 +61,7 @@ for pg_version in 14 15 16; do fi done - if [ $pg_version -ge 16 ] - then + if [ $pg_version -ge 16 ]; then echo Enabling trust connection docker exec $COMPUTE_CONTAINER_NAME bash -c "sed -i '\$d' /var/db/postgres/compute/pg_hba.conf && echo -e 'host\t all\t all\t all\t trust' >> /var/db/postgres/compute/pg_hba.conf && psql $PSQL_OPTION -c 'select pg_reload_conf()' " echo Adding postgres role @@ -68,10 +74,13 @@ for pg_version in 14 15 16; do # The test assumes that it is running on the same host with the postgres engine. # In our case it's not true, that's why we are copying files to the compute node TMPDIR=$(mktemp -d) - docker cp $TEST_CONTAINER_NAME:/ext-src/pg_anon-src/data $TMPDIR/data - echo -e '1\t too \t many \t tabs' > $TMPDIR/data/bad.csv - docker cp $TMPDIR/data $COMPUTE_CONTAINER_NAME:/tmp/tmp_anon_alternate_data + # Add support for pg_anon for pg_v16 + if [ $pg_version -ne 17 ]; then + docker cp $TEST_CONTAINER_NAME:/ext-src/pg_anon-src/data $TMPDIR/data + echo -e '1\t too \t many \t tabs' > $TMPDIR/data/bad.csv + docker cp $TMPDIR/data $COMPUTE_CONTAINER_NAME:/tmp/tmp_anon_alternate_data rm -rf $TMPDIR + fi TMPDIR=$(mktemp -d) # The following block does the same for the pg_hintplan test docker cp $TEST_CONTAINER_NAME:/ext-src/pg_hint_plan-src/data $TMPDIR/data @@ -97,4 +106,8 @@ for pg_version in 14 15 16; do fi fi cleanup + # The support of pg_anon not yet added to PG17, so we have to remove the corresponding option + if [ $pg_version -eq 17 ]; then + mv $SPEC_PATH/spec.bak $SPEC_PATH/spec.json + fi done diff --git a/libs/consumption_metrics/src/lib.rs b/libs/consumption_metrics/src/lib.rs index fbe2e6830f..448134f31a 100644 --- a/libs/consumption_metrics/src/lib.rs +++ b/libs/consumption_metrics/src/lib.rs @@ -103,11 +103,12 @@ impl<'a> IdempotencyKey<'a> { } } +/// Split into chunks of 1000 metrics to avoid exceeding the max request size. 
pub const CHUNK_SIZE: usize = 1000; // Just a wrapper around a slice of events // to serialize it as `{"events" : [ ] } -#[derive(serde::Serialize, Deserialize)] -pub struct EventChunk<'a, T: Clone> { +#[derive(Debug, serde::Serialize, serde::Deserialize, PartialEq)] +pub struct EventChunk<'a, T: Clone + PartialEq> { pub events: std::borrow::Cow<'a, [T]>, } diff --git a/libs/pageserver_api/src/config.rs b/libs/pageserver_api/src/config.rs index 721d97404b..09cfbc55fd 100644 --- a/libs/pageserver_api/src/config.rs +++ b/libs/pageserver_api/src/config.rs @@ -118,9 +118,8 @@ pub struct ConfigToml { pub virtual_file_io_mode: Option, #[serde(skip_serializing_if = "Option::is_none")] pub no_sync: Option, - #[serde(with = "humantime_serde")] - pub server_side_batch_timeout: Option, pub wal_receiver_protocol: PostgresClientProtocol, + pub page_service_pipelining: PageServicePipeliningConfig, } #[derive(Debug, Clone, PartialEq, Eq, serde::Serialize, serde::Deserialize)] @@ -137,6 +136,28 @@ pub struct DiskUsageEvictionTaskConfig { pub eviction_order: EvictionOrder, } +#[derive(Debug, Clone, PartialEq, Eq, serde::Serialize, serde::Deserialize)] +#[serde(tag = "mode", rename_all = "kebab-case")] +#[serde(deny_unknown_fields)] +pub enum PageServicePipeliningConfig { + Serial, + Pipelined(PageServicePipeliningConfigPipelined), +} +#[derive(Debug, Clone, PartialEq, Eq, serde::Serialize, serde::Deserialize)] +#[serde(deny_unknown_fields)] +pub struct PageServicePipeliningConfigPipelined { + /// Causes runtime errors if larger than max get_vectored batch size. 
+ pub max_batch_size: NonZeroUsize, + pub execution: PageServiceProtocolPipelinedExecutionStrategy, +} + +#[derive(Debug, Clone, Copy, PartialEq, Eq, serde::Serialize, serde::Deserialize)] +#[serde(rename_all = "kebab-case")] +pub enum PageServiceProtocolPipelinedExecutionStrategy { + ConcurrentFutures, + Tasks, +} + pub mod statvfs { pub mod mock { #[derive(Debug, Clone, PartialEq, Eq, serde::Serialize, serde::Deserialize)] @@ -332,8 +353,6 @@ pub mod defaults { pub const DEFAULT_IO_BUFFER_ALIGNMENT: usize = 512; - pub const DEFAULT_SERVER_SIDE_BATCH_TIMEOUT: Option<&str> = None; - pub const DEFAULT_WAL_RECEIVER_PROTOCOL: utils::postgres_client::PostgresClientProtocol = utils::postgres_client::PostgresClientProtocol::Vanilla; } @@ -420,11 +439,17 @@ impl Default for ConfigToml { ephemeral_bytes_per_memory_kb: (DEFAULT_EPHEMERAL_BYTES_PER_MEMORY_KB), l0_flush: None, virtual_file_io_mode: None, - server_side_batch_timeout: DEFAULT_SERVER_SIDE_BATCH_TIMEOUT - .map(|duration| humantime::parse_duration(duration).unwrap()), tenant_config: TenantConfigToml::default(), no_sync: None, wal_receiver_protocol: DEFAULT_WAL_RECEIVER_PROTOCOL, + page_service_pipelining: if !cfg!(test) { + PageServicePipeliningConfig::Serial + } else { + PageServicePipeliningConfig::Pipelined(PageServicePipeliningConfigPipelined { + max_batch_size: NonZeroUsize::new(32).unwrap(), + execution: PageServiceProtocolPipelinedExecutionStrategy::ConcurrentFutures, + }) + }, } } } diff --git a/libs/pageserver_api/src/controller_api.rs b/libs/pageserver_api/src/controller_api.rs index 0ea30ce54f..9a5ebc95bd 100644 --- a/libs/pageserver_api/src/controller_api.rs +++ b/libs/pageserver_api/src/controller_api.rs @@ -48,7 +48,7 @@ pub struct TenantCreateResponse { pub shards: Vec, } -#[derive(Serialize, Deserialize)] +#[derive(Serialize, Deserialize, Debug, Clone)] pub struct NodeRegisterRequest { pub node_id: NodeId, @@ -75,7 +75,7 @@ pub struct TenantPolicyRequest { pub scheduling: Option, } -#[derive(Clone, 
Serialize, Deserialize, PartialEq, Eq, Hash)] +#[derive(Clone, Serialize, Deserialize, PartialEq, Eq, Hash, Debug)] pub struct AvailabilityZone(pub String); impl Display for AvailabilityZone { diff --git a/libs/pageserver_api/src/key.rs b/libs/pageserver_api/src/key.rs index 523d143381..37dff6fe46 100644 --- a/libs/pageserver_api/src/key.rs +++ b/libs/pageserver_api/src/key.rs @@ -770,6 +770,11 @@ impl Key { && self.field6 == 1 } + #[inline(always)] + pub fn is_aux_file_key(&self) -> bool { + self.field1 == AUX_KEY_PREFIX + } + /// Guaranteed to return `Ok()` if [`Self::is_rel_block_key`] returns `true` for `key`. #[inline(always)] pub fn to_rel_block(self) -> anyhow::Result<(RelTag, BlockNumber)> { diff --git a/libs/pageserver_api/src/models.rs b/libs/pageserver_api/src/models.rs index 42c5d10c05..5488f7b2c2 100644 --- a/libs/pageserver_api/src/models.rs +++ b/libs/pageserver_api/src/models.rs @@ -501,7 +501,9 @@ pub struct EvictionPolicyLayerAccessThreshold { #[derive(Debug, Serialize, Deserialize, Clone, PartialEq, Eq)] pub struct ThrottleConfig { - pub task_kinds: Vec, // TaskKind + /// See [`ThrottleConfigTaskKinds`] for why we do the serde `rename`. + #[serde(rename = "task_kinds")] + pub enabled: ThrottleConfigTaskKinds, pub initial: u32, #[serde(with = "humantime_serde")] pub refill_interval: Duration, @@ -509,10 +511,38 @@ pub struct ThrottleConfig { pub max: u32, } +/// Before +/// the throttle was a per `Timeline::get`/`Timeline::get_vectored` call. +/// The `task_kinds` field controlled which Pageserver "Task Kind"s +/// were subject to the throttle. +/// +/// After that PR, the throttle is applied at pagestream request level +/// and the `task_kinds` field does not apply since the only task kind +/// that is subject to the throttle is that of the page service. +/// +/// However, we don't want to make a breaking config change right now +/// because it means we have to migrate all the tenant configs. +/// This will be done in a future PR.
+/// +/// In the meantime, we use emptiness / non-emptiness of the `task_kinds` +/// field to determine if the throttle is enabled or not. +#[derive(Serialize, Deserialize, Debug, Clone, PartialEq, Eq)] +#[serde(transparent)] +pub struct ThrottleConfigTaskKinds(Vec); + +impl ThrottleConfigTaskKinds { + pub fn disabled() -> Self { + Self(vec![]) + } + pub fn is_enabled(&self) -> bool { + !self.0.is_empty() + } +} + impl ThrottleConfig { pub fn disabled() -> Self { Self { - task_kinds: vec![], // effectively disables the throttle + enabled: ThrottleConfigTaskKinds::disabled(), // other values don't matter with emtpy `task_kinds`. initial: 0, refill_interval: Duration::from_millis(1), @@ -526,6 +556,30 @@ impl ThrottleConfig { } } +#[cfg(test)] +mod throttle_config_tests { + use super::*; + + #[test] + fn test_disabled_is_disabled() { + let config = ThrottleConfig::disabled(); + assert!(!config.enabled.is_enabled()); + } + #[test] + fn test_enabled_backwards_compat() { + let input = serde_json::json!({ + "task_kinds": ["PageRequestHandler"], + "initial": 40000, + "refill_interval": "50ms", + "refill_amount": 1000, + "max": 40000, + "fair": true + }); + let config: ThrottleConfig = serde_json::from_value(input).unwrap(); + assert!(config.enabled.is_enabled()); + } +} + /// A flattened analog of a `pagesever::tenant::LocationMode`, which /// lists out all possible states (and the virtual "Detached" state) /// in a flat form rather than using rust-style enums. diff --git a/libs/pageserver_api/src/shard.rs b/libs/pageserver_api/src/shard.rs index e83cf4c855..a5c94a82c1 100644 --- a/libs/pageserver_api/src/shard.rs +++ b/libs/pageserver_api/src/shard.rs @@ -170,19 +170,37 @@ impl ShardIdentity { } } + /// Return true if the key should be stored on all shards, not just one. 
+ fn is_key_global(&self, key: &Key) -> bool { + if key.is_slru_block_key() || key.is_slru_segment_size_key() || key.is_aux_file_key() { + // Special keys that are only stored on shard 0 + false + } else if key.is_rel_block_key() { + // Ordinary relation blocks are distributed across shards + false + } else if key.is_rel_size_key() { + // All shards maintain rel size keys (although only shard 0 is responsible for + // keeping it strictly accurate, other shards just reflect the highest block they've ingested) + true + } else { + // For everything else, we assume it must be kept everywhere, because ingest code + // might assume this -- this covers functionality where the ingest code has + // not (yet) been made fully shard aware. + true + } + } + /// Return true if the key should be discarded if found in this shard's /// data store, e.g. during compaction after a split. /// /// Shards _may_ drop keys which return false here, but are not obliged to. pub fn is_key_disposable(&self, key: &Key) -> bool { - if key_is_shard0(key) { - // Q: Why can't we dispose of shard0 content if we're not shard 0? - // A1: because the WAL ingestion logic currently ingests some shard 0 - // content on all shards, even though it's only read on shard 0. If we - // dropped it, then subsequent WAL ingest to these keys would encounter - // an error. - // A2: because key_is_shard0 also covers relation size keys, which are written - // on all shards even though they're only maintained accurately on shard 0. 
+ if self.count < ShardCount(2) { + // Fast path: unsharded tenant doesn't dispose of anything + return false; + } + + if self.is_key_global(key) { false } else { !self.is_key_local(key) diff --git a/libs/pq_proto/src/lib.rs b/libs/pq_proto/src/lib.rs index 4b0331999d..94714359a3 100644 --- a/libs/pq_proto/src/lib.rs +++ b/libs/pq_proto/src/lib.rs @@ -100,7 +100,7 @@ impl StartupMessageParamsBuilder { #[derive(Debug, Clone, Default)] pub struct StartupMessageParams { - params: Bytes, + pub params: Bytes, } impl StartupMessageParams { @@ -565,6 +565,8 @@ pub enum BeMessage<'a> { /// Batch of interpreted, shard filtered WAL records, /// ready for the pageserver to ingest InterpretedWalRecords(InterpretedWalRecordsBody<'a>), + + Raw(u8, &'a [u8]), } /// Common shorthands. @@ -754,6 +756,10 @@ impl BeMessage<'_> { /// one more buffer. pub fn write(buf: &mut BytesMut, message: &BeMessage) -> Result<(), ProtocolError> { match message { + BeMessage::Raw(code, data) => { + buf.put_u8(*code); + write_body(buf, |b| b.put_slice(data)) + } BeMessage::AuthenticationOk => { buf.put_u8(b'R'); write_body(buf, |buf| { diff --git a/libs/proxy/README.md b/libs/proxy/README.md new file mode 100644 index 0000000000..2ae6210e46 --- /dev/null +++ b/libs/proxy/README.md @@ -0,0 +1,6 @@ +This directory contains libraries that are specific for proxy. + +Currently, it contains a significant fork/refactoring of rust-postgres that no longer reflects the API +of the original library. Since it was so significant, it made sense to upgrade it to its own set of libraries. + +Proxy needs unique access to the protocol, which explains why such heavy modifications were necessary. 
diff --git a/libs/proxy/postgres-protocol2/Cargo.toml b/libs/proxy/postgres-protocol2/Cargo.toml new file mode 100644 index 0000000000..f71c1599c7 --- /dev/null +++ b/libs/proxy/postgres-protocol2/Cargo.toml @@ -0,0 +1,20 @@ +[package] +name = "postgres-protocol2" +version = "0.1.0" +edition = "2018" +license = "MIT/Apache-2.0" + +[dependencies] +base64 = "0.20" +byteorder.workspace = true +bytes.workspace = true +fallible-iterator.workspace = true +hmac.workspace = true +memchr = "2.0" +rand.workspace = true +sha2.workspace = true +stringprep = "0.1" +tokio = { workspace = true, features = ["rt"] } + +[dev-dependencies] +tokio = { workspace = true, features = ["full"] } diff --git a/libs/proxy/postgres-protocol2/src/authentication/mod.rs b/libs/proxy/postgres-protocol2/src/authentication/mod.rs new file mode 100644 index 0000000000..0bdc177143 --- /dev/null +++ b/libs/proxy/postgres-protocol2/src/authentication/mod.rs @@ -0,0 +1,2 @@ +//! Authentication protocol support. +pub mod sasl; diff --git a/libs/proxy/postgres-protocol2/src/authentication/sasl.rs b/libs/proxy/postgres-protocol2/src/authentication/sasl.rs new file mode 100644 index 0000000000..f2200a40ce --- /dev/null +++ b/libs/proxy/postgres-protocol2/src/authentication/sasl.rs @@ -0,0 +1,516 @@ +//! SASL-based authentication support. + +use hmac::{Hmac, Mac}; +use rand::{self, Rng}; +use sha2::digest::FixedOutput; +use sha2::{Digest, Sha256}; +use std::fmt::Write; +use std::io; +use std::iter; +use std::mem; +use std::str; +use tokio::task::yield_now; + +const NONCE_LENGTH: usize = 24; + +/// The identifier of the SCRAM-SHA-256 SASL authentication mechanism. +pub const SCRAM_SHA_256: &str = "SCRAM-SHA-256"; +/// The identifier of the SCRAM-SHA-256-PLUS SASL authentication mechanism. 
+pub const SCRAM_SHA_256_PLUS: &str = "SCRAM-SHA-256-PLUS"; + +// since postgres passwords are not required to exclude saslprep-prohibited +// characters or even be valid UTF8, we run saslprep if possible and otherwise +// return the raw password. +fn normalize(pass: &[u8]) -> Vec { + let pass = match str::from_utf8(pass) { + Ok(pass) => pass, + Err(_) => return pass.to_vec(), + }; + + match stringprep::saslprep(pass) { + Ok(pass) => pass.into_owned().into_bytes(), + Err(_) => pass.as_bytes().to_vec(), + } +} + +pub(crate) async fn hi(str: &[u8], salt: &[u8], iterations: u32) -> [u8; 32] { + let mut hmac = + Hmac::::new_from_slice(str).expect("HMAC is able to accept all key sizes"); + hmac.update(salt); + hmac.update(&[0, 0, 0, 1]); + let mut prev = hmac.finalize().into_bytes(); + + let mut hi = prev; + + for i in 1..iterations { + let mut hmac = Hmac::::new_from_slice(str).expect("already checked above"); + hmac.update(&prev); + prev = hmac.finalize().into_bytes(); + + for (hi, prev) in hi.iter_mut().zip(prev) { + *hi ^= prev; + } + // yield every ~250us + // hopefully reduces tail latencies + if i % 1024 == 0 { + yield_now().await + } + } + + hi.into() +} + +enum ChannelBindingInner { + Unrequested, + Unsupported, + TlsServerEndPoint(Vec), +} + +/// The channel binding configuration for a SCRAM authentication exchange. +pub struct ChannelBinding(ChannelBindingInner); + +impl ChannelBinding { + /// The server did not request channel binding. + pub fn unrequested() -> ChannelBinding { + ChannelBinding(ChannelBindingInner::Unrequested) + } + + /// The server requested channel binding but the client is unable to provide it. + pub fn unsupported() -> ChannelBinding { + ChannelBinding(ChannelBindingInner::Unsupported) + } + + /// The server requested channel binding and the client will use the `tls-server-end-point` + /// method. 
+ pub fn tls_server_end_point(signature: Vec) -> ChannelBinding { + ChannelBinding(ChannelBindingInner::TlsServerEndPoint(signature)) + } + + fn gs2_header(&self) -> &'static str { + match self.0 { + ChannelBindingInner::Unrequested => "y,,", + ChannelBindingInner::Unsupported => "n,,", + ChannelBindingInner::TlsServerEndPoint(_) => "p=tls-server-end-point,,", + } + } + + fn cbind_data(&self) -> &[u8] { + match self.0 { + ChannelBindingInner::Unrequested | ChannelBindingInner::Unsupported => &[], + ChannelBindingInner::TlsServerEndPoint(ref buf) => buf, + } + } +} + +/// A pair of keys for the SCRAM-SHA-256 mechanism. +/// See for details. +#[derive(Debug, Clone, Copy, PartialEq, Eq)] +pub struct ScramKeys { + /// Used by server to authenticate client. + pub client_key: [u8; N], + /// Used by client to verify server's signature. + pub server_key: [u8; N], +} + +/// Password or keys which were derived from it. +enum Credentials { + /// A regular password as a vector of bytes. + Password(Vec), + /// A precomputed pair of keys. + Keys(ScramKeys), +} + +enum State { + Update { + nonce: String, + password: Credentials<32>, + channel_binding: ChannelBinding, + }, + Finish { + server_key: [u8; 32], + auth_message: String, + }, + Done, +} + +/// A type which handles the client side of the SCRAM-SHA-256/SCRAM-SHA-256-PLUS authentication +/// process. +/// +/// During the authentication process, if the backend sends an `AuthenticationSASL` message which +/// includes `SCRAM-SHA-256` as an authentication mechanism, this type can be used. +/// +/// After a `ScramSha256` is constructed, the buffer returned by the `message()` method should be +/// sent to the backend in a `SASLInitialResponse` message along with the mechanism name. +/// +/// The server will reply with an `AuthenticationSASLContinue` message. 
Its contents should be +/// passed to the `update()` method, after which the buffer returned by the `message()` method +/// should be sent to the backend in a `SASLResponse` message. +/// +/// The server will reply with an `AuthenticationSASLFinal` message. Its contents should be passed +/// to the `finish()` method, after which the authentication process is complete. +pub struct ScramSha256 { + message: String, + state: State, +} + +fn nonce() -> String { + // rand 0.5's ThreadRng is cryptographically secure + let mut rng = rand::thread_rng(); + (0..NONCE_LENGTH) + .map(|_| { + let mut v = rng.gen_range(0x21u8..0x7e); + if v == 0x2c { + v = 0x7e + } + v as char + }) + .collect() +} + +impl ScramSha256 { + /// Constructs a new instance which will use the provided password for authentication. + pub fn new(password: &[u8], channel_binding: ChannelBinding) -> ScramSha256 { + let password = Credentials::Password(normalize(password)); + ScramSha256::new_inner(password, channel_binding, nonce()) + } + + /// Constructs a new instance which will use the provided key pair for authentication. + pub fn new_with_keys(keys: ScramKeys<32>, channel_binding: ChannelBinding) -> ScramSha256 { + let password = Credentials::Keys(keys); + ScramSha256::new_inner(password, channel_binding, nonce()) + } + + fn new_inner( + password: Credentials<32>, + channel_binding: ChannelBinding, + nonce: String, + ) -> ScramSha256 { + ScramSha256 { + message: format!("{}n=,r={}", channel_binding.gs2_header(), nonce), + state: State::Update { + nonce, + password, + channel_binding, + }, + } + } + + /// Returns the message which should be sent to the backend in an `SASLResponse` message. + pub fn message(&self) -> &[u8] { + if let State::Done = self.state { + panic!("invalid SCRAM state"); + } + self.message.as_bytes() + } + + /// Updates the state machine with the response from the backend. + /// + /// This should be called when an `AuthenticationSASLContinue` message is received. 
+ pub async fn update(&mut self, message: &[u8]) -> io::Result<()> { + let (client_nonce, password, channel_binding) = + match mem::replace(&mut self.state, State::Done) { + State::Update { + nonce, + password, + channel_binding, + } => (nonce, password, channel_binding), + _ => return Err(io::Error::new(io::ErrorKind::Other, "invalid SCRAM state")), + }; + + let message = + str::from_utf8(message).map_err(|e| io::Error::new(io::ErrorKind::InvalidInput, e))?; + + let parsed = Parser::new(message).server_first_message()?; + + if !parsed.nonce.starts_with(&client_nonce) { + return Err(io::Error::new(io::ErrorKind::InvalidInput, "invalid nonce")); + } + + let (client_key, server_key) = match password { + Credentials::Password(password) => { + let salt = match base64::decode(parsed.salt) { + Ok(salt) => salt, + Err(e) => return Err(io::Error::new(io::ErrorKind::InvalidInput, e)), + }; + + let salted_password = hi(&password, &salt, parsed.iteration_count).await; + + let make_key = |name| { + let mut hmac = Hmac::::new_from_slice(&salted_password) + .expect("HMAC is able to accept all key sizes"); + hmac.update(name); + + let mut key = [0u8; 32]; + key.copy_from_slice(hmac.finalize().into_bytes().as_slice()); + key + }; + + (make_key(b"Client Key"), make_key(b"Server Key")) + } + Credentials::Keys(keys) => (keys.client_key, keys.server_key), + }; + + let mut hash = Sha256::default(); + hash.update(client_key); + let stored_key = hash.finalize_fixed(); + + let mut cbind_input = vec![]; + cbind_input.extend(channel_binding.gs2_header().as_bytes()); + cbind_input.extend(channel_binding.cbind_data()); + let cbind_input = base64::encode(&cbind_input); + + self.message.clear(); + write!(&mut self.message, "c={},r={}", cbind_input, parsed.nonce).unwrap(); + + let auth_message = format!("n=,r={},{},{}", client_nonce, message, self.message); + + let mut hmac = Hmac::::new_from_slice(&stored_key) + .expect("HMAC is able to accept all key sizes"); + 
hmac.update(auth_message.as_bytes()); + let client_signature = hmac.finalize().into_bytes(); + + let mut client_proof = client_key; + for (proof, signature) in client_proof.iter_mut().zip(client_signature) { + *proof ^= signature; + } + + write!(&mut self.message, ",p={}", base64::encode(client_proof)).unwrap(); + + self.state = State::Finish { + server_key, + auth_message, + }; + Ok(()) + } + + /// Finalizes the authentication process. + /// + /// This should be called when the backend sends an `AuthenticationSASLFinal` message. + /// Authentication has only succeeded if this method returns `Ok(())`. + pub fn finish(&mut self, message: &[u8]) -> io::Result<()> { + let (server_key, auth_message) = match mem::replace(&mut self.state, State::Done) { + State::Finish { + server_key, + auth_message, + } => (server_key, auth_message), + _ => return Err(io::Error::new(io::ErrorKind::Other, "invalid SCRAM state")), + }; + + let message = + str::from_utf8(message).map_err(|e| io::Error::new(io::ErrorKind::InvalidInput, e))?; + + let parsed = Parser::new(message).server_final_message()?; + + let verifier = match parsed { + ServerFinalMessage::Error(e) => { + return Err(io::Error::new( + io::ErrorKind::Other, + format!("SCRAM error: {}", e), + )); + } + ServerFinalMessage::Verifier(verifier) => verifier, + }; + + let verifier = match base64::decode(verifier) { + Ok(verifier) => verifier, + Err(e) => return Err(io::Error::new(io::ErrorKind::InvalidInput, e)), + }; + + let mut hmac = Hmac::::new_from_slice(&server_key) + .expect("HMAC is able to accept all key sizes"); + hmac.update(auth_message.as_bytes()); + hmac.verify_slice(&verifier) + .map_err(|_| io::Error::new(io::ErrorKind::InvalidInput, "SCRAM verification error")) + } +} + +struct Parser<'a> { + s: &'a str, + it: iter::Peekable>, +} + +impl<'a> Parser<'a> { + fn new(s: &'a str) -> Parser<'a> { + Parser { + s, + it: s.char_indices().peekable(), + } + } + + fn eat(&mut self, target: char) -> io::Result<()> { + match 
self.it.next() { + Some((_, c)) if c == target => Ok(()), + Some((i, c)) => { + let m = format!( + "unexpected character at byte {}: expected `{}` but got `{}", + i, target, c + ); + Err(io::Error::new(io::ErrorKind::InvalidInput, m)) + } + None => Err(io::Error::new( + io::ErrorKind::UnexpectedEof, + "unexpected EOF", + )), + } + } + + fn take_while(&mut self, f: F) -> io::Result<&'a str> + where + F: Fn(char) -> bool, + { + let start = match self.it.peek() { + Some(&(i, _)) => i, + None => return Ok(""), + }; + + loop { + match self.it.peek() { + Some(&(_, c)) if f(c) => { + self.it.next(); + } + Some(&(i, _)) => return Ok(&self.s[start..i]), + None => return Ok(&self.s[start..]), + } + } + } + + fn printable(&mut self) -> io::Result<&'a str> { + self.take_while(|c| matches!(c, '\x21'..='\x2b' | '\x2d'..='\x7e')) + } + + fn nonce(&mut self) -> io::Result<&'a str> { + self.eat('r')?; + self.eat('=')?; + self.printable() + } + + fn base64(&mut self) -> io::Result<&'a str> { + self.take_while(|c| matches!(c, 'a'..='z' | 'A'..='Z' | '0'..='9' | '/' | '+' | '=')) + } + + fn salt(&mut self) -> io::Result<&'a str> { + self.eat('s')?; + self.eat('=')?; + self.base64() + } + + fn posit_number(&mut self) -> io::Result { + let n = self.take_while(|c| c.is_ascii_digit())?; + n.parse() + .map_err(|e| io::Error::new(io::ErrorKind::InvalidInput, e)) + } + + fn iteration_count(&mut self) -> io::Result { + self.eat('i')?; + self.eat('=')?; + self.posit_number() + } + + fn eof(&mut self) -> io::Result<()> { + match self.it.peek() { + Some(&(i, _)) => Err(io::Error::new( + io::ErrorKind::InvalidInput, + format!("unexpected trailing data at byte {}", i), + )), + None => Ok(()), + } + } + + fn server_first_message(&mut self) -> io::Result> { + let nonce = self.nonce()?; + self.eat(',')?; + let salt = self.salt()?; + self.eat(',')?; + let iteration_count = self.iteration_count()?; + self.eof()?; + + Ok(ServerFirstMessage { + nonce, + salt, + iteration_count, + }) + } + + fn value(&mut 
self) -> io::Result<&'a str> { + self.take_while(|c| matches!(c, '\0' | '=' | ',')) + } + + fn server_error(&mut self) -> io::Result> { + match self.it.peek() { + Some(&(_, 'e')) => {} + _ => return Ok(None), + } + + self.eat('e')?; + self.eat('=')?; + self.value().map(Some) + } + + fn verifier(&mut self) -> io::Result<&'a str> { + self.eat('v')?; + self.eat('=')?; + self.base64() + } + + fn server_final_message(&mut self) -> io::Result> { + let message = match self.server_error()? { + Some(error) => ServerFinalMessage::Error(error), + None => ServerFinalMessage::Verifier(self.verifier()?), + }; + self.eof()?; + Ok(message) + } +} + +struct ServerFirstMessage<'a> { + nonce: &'a str, + salt: &'a str, + iteration_count: u32, +} + +enum ServerFinalMessage<'a> { + Error(&'a str), + Verifier(&'a str), +} + +#[cfg(test)] +mod test { + use super::*; + + #[test] + fn parse_server_first_message() { + let message = "r=fyko+d2lbbFgONRv9qkxdawL3rfcNHYJY1ZVvWVs7j,s=QSXCR+Q6sek8bf92,i=4096"; + let message = Parser::new(message).server_first_message().unwrap(); + assert_eq!(message.nonce, "fyko+d2lbbFgONRv9qkxdawL3rfcNHYJY1ZVvWVs7j"); + assert_eq!(message.salt, "QSXCR+Q6sek8bf92"); + assert_eq!(message.iteration_count, 4096); + } + + // recorded auth exchange from psql + #[tokio::test] + async fn exchange() { + let password = "foobar"; + let nonce = "9IZ2O01zb9IgiIZ1WJ/zgpJB"; + + let client_first = "n,,n=,r=9IZ2O01zb9IgiIZ1WJ/zgpJB"; + let server_first = + "r=9IZ2O01zb9IgiIZ1WJ/zgpJBjx/oIRLs02gGSHcw1KEty3eY,s=fs3IXBy7U7+IvVjZ,i\ + =4096"; + let client_final = + "c=biws,r=9IZ2O01zb9IgiIZ1WJ/zgpJBjx/oIRLs02gGSHcw1KEty3eY,p=AmNKosjJzS3\ + 1NTlQYNs5BTeQjdHdk7lOflDo5re2an8="; + let server_final = "v=U+ppxD5XUKtradnv8e2MkeupiA8FU87Sg8CXzXHDAzw="; + + let mut scram = ScramSha256::new_inner( + Credentials::Password(normalize(password.as_bytes())), + ChannelBinding::unsupported(), + nonce.to_string(), + ); + assert_eq!(str::from_utf8(scram.message()).unwrap(), client_first); + + 
scram.update(server_first.as_bytes()).await.unwrap(); + assert_eq!(str::from_utf8(scram.message()).unwrap(), client_final); + + scram.finish(server_final.as_bytes()).unwrap(); + } +} diff --git a/libs/proxy/postgres-protocol2/src/escape/mod.rs b/libs/proxy/postgres-protocol2/src/escape/mod.rs new file mode 100644 index 0000000000..0ba7efdcac --- /dev/null +++ b/libs/proxy/postgres-protocol2/src/escape/mod.rs @@ -0,0 +1,93 @@ +//! Provides functions for escaping literals and identifiers for use +//! in SQL queries. +//! +//! Prefer parameterized queries where possible. Do not escape +//! parameters in a parameterized query. + +#[cfg(test)] +mod test; + +/// Escape a literal and surround result with single quotes. Not +/// recommended in most cases. +/// +/// If input contains backslashes, result will be of the form ` +/// E'...'` so it is safe to use regardless of the setting of +/// standard_conforming_strings. +pub fn escape_literal(input: &str) -> String { + escape_internal(input, false) +} + +/// Escape an identifier and surround result with double quotes. +pub fn escape_identifier(input: &str) -> String { + escape_internal(input, true) +} + +// Translation of PostgreSQL libpq's PQescapeInternal(). Does not +// require a connection because input string is known to be valid +// UTF-8. +// +// Escape arbitrary strings. If as_ident is true, we escape the +// result as an identifier; if false, as a literal. The result is +// returned in a newly allocated buffer. If we fail due to an +// encoding violation or out of memory condition, we return NULL, +// storing an error message into conn. +fn escape_internal(input: &str, as_ident: bool) -> String { + let mut num_backslashes = 0; + let mut num_quotes = 0; + let quote_char = if as_ident { '"' } else { '\'' }; + + // Scan the string for characters that must be escaped. 
+ for ch in input.chars() { + if ch == quote_char { + num_quotes += 1; + } else if ch == '\\' { + num_backslashes += 1; + } + } + + // Allocate output String. + let mut result_size = input.len() + num_quotes + 3; // two quotes, plus a NUL + if !as_ident && num_backslashes > 0 { + result_size += num_backslashes + 2; + } + + let mut output = String::with_capacity(result_size); + + // If we are escaping a literal that contains backslashes, we use + // the escape string syntax so that the result is correct under + // either value of standard_conforming_strings. We also emit a + // leading space in this case, to guard against the possibility + // that the result might be interpolated immediately following an + // identifier. + if !as_ident && num_backslashes > 0 { + output.push(' '); + output.push('E'); + } + + // Opening quote. + output.push(quote_char); + + // Use fast path if possible. + // + // We've already verified that the input string is well-formed in + // the current encoding. If it contains no quotes and, in the + // case of literal-escaping, no backslashes, then we can just copy + // it directly to the output buffer, adding the necessary quotes. + // + // If not, we must rescan the input and process each character + // individually. 
+ if num_quotes == 0 && (num_backslashes == 0 || as_ident) { + output.push_str(input); + } else { + for ch in input.chars() { + if ch == quote_char || (!as_ident && ch == '\\') { + output.push(ch); + } + output.push(ch); + } + } + + output.push(quote_char); + + output +} diff --git a/libs/proxy/postgres-protocol2/src/escape/test.rs b/libs/proxy/postgres-protocol2/src/escape/test.rs new file mode 100644 index 0000000000..4816a103b7 --- /dev/null +++ b/libs/proxy/postgres-protocol2/src/escape/test.rs @@ -0,0 +1,17 @@ +use crate::escape::{escape_identifier, escape_literal}; + +#[test] +fn test_escape_idenifier() { + assert_eq!(escape_identifier("foo"), String::from("\"foo\"")); + assert_eq!(escape_identifier("f\\oo"), String::from("\"f\\oo\"")); + assert_eq!(escape_identifier("f'oo"), String::from("\"f'oo\"")); + assert_eq!(escape_identifier("f\"oo"), String::from("\"f\"\"oo\"")); +} + +#[test] +fn test_escape_literal() { + assert_eq!(escape_literal("foo"), String::from("'foo'")); + assert_eq!(escape_literal("f\\oo"), String::from(" E'f\\\\oo'")); + assert_eq!(escape_literal("f'oo"), String::from("'f''oo'")); + assert_eq!(escape_literal("f\"oo"), String::from("'f\"oo'")); +} diff --git a/libs/proxy/postgres-protocol2/src/lib.rs b/libs/proxy/postgres-protocol2/src/lib.rs new file mode 100644 index 0000000000..947f2f835d --- /dev/null +++ b/libs/proxy/postgres-protocol2/src/lib.rs @@ -0,0 +1,78 @@ +//! Low level Postgres protocol APIs. +//! +//! This crate implements the low level components of Postgres's communication +//! protocol, including message and value serialization and deserialization. +//! It is designed to be used as a building block by higher level APIs such as +//! `rust-postgres`, and should not typically be used directly. +//! +//! # Note +//! +//! This library assumes that the `client_encoding` backend parameter has been +//! set to `UTF8`. It will most likely not behave properly if that is not the case. 
+#![doc(html_root_url = "https://docs.rs/postgres-protocol/0.6")] +#![warn(missing_docs, rust_2018_idioms, clippy::all)] + +use byteorder::{BigEndian, ByteOrder}; +use bytes::{BufMut, BytesMut}; +use std::io; + +pub mod authentication; +pub mod escape; +pub mod message; +pub mod password; +pub mod types; + +/// A Postgres OID. +pub type Oid = u32; + +/// A Postgres Log Sequence Number (LSN). +pub type Lsn = u64; + +/// An enum indicating if a value is `NULL` or not. +pub enum IsNull { + /// The value is `NULL`. + Yes, + /// The value is not `NULL`. + No, +} + +fn write_nullable(serializer: F, buf: &mut BytesMut) -> Result<(), E> +where + F: FnOnce(&mut BytesMut) -> Result, + E: From, +{ + let base = buf.len(); + buf.put_i32(0); + let size = match serializer(buf)? { + IsNull::No => i32::from_usize(buf.len() - base - 4)?, + IsNull::Yes => -1, + }; + BigEndian::write_i32(&mut buf[base..], size); + + Ok(()) +} + +trait FromUsize: Sized { + fn from_usize(x: usize) -> Result; +} + +macro_rules! 
from_usize { + ($t:ty) => { + impl FromUsize for $t { + #[inline] + fn from_usize(x: usize) -> io::Result<$t> { + if x > <$t>::MAX as usize { + Err(io::Error::new( + io::ErrorKind::InvalidInput, + "value too large to transmit", + )) + } else { + Ok(x as $t) + } + } + } + }; +} + +from_usize!(i16); +from_usize!(i32); diff --git a/libs/proxy/postgres-protocol2/src/message/backend.rs b/libs/proxy/postgres-protocol2/src/message/backend.rs new file mode 100644 index 0000000000..097964f9c1 --- /dev/null +++ b/libs/proxy/postgres-protocol2/src/message/backend.rs @@ -0,0 +1,766 @@ +#![allow(missing_docs)] + +use byteorder::{BigEndian, ByteOrder, ReadBytesExt}; +use bytes::{Bytes, BytesMut}; +use fallible_iterator::FallibleIterator; +use memchr::memchr; +use std::cmp; +use std::io::{self, Read}; +use std::ops::Range; +use std::str; + +use crate::Oid; + +// top-level message tags +const PARSE_COMPLETE_TAG: u8 = b'1'; +const BIND_COMPLETE_TAG: u8 = b'2'; +const CLOSE_COMPLETE_TAG: u8 = b'3'; +pub const NOTIFICATION_RESPONSE_TAG: u8 = b'A'; +const COPY_DONE_TAG: u8 = b'c'; +const COMMAND_COMPLETE_TAG: u8 = b'C'; +const COPY_DATA_TAG: u8 = b'd'; +const DATA_ROW_TAG: u8 = b'D'; +const ERROR_RESPONSE_TAG: u8 = b'E'; +const COPY_IN_RESPONSE_TAG: u8 = b'G'; +const COPY_OUT_RESPONSE_TAG: u8 = b'H'; +const COPY_BOTH_RESPONSE_TAG: u8 = b'W'; +const EMPTY_QUERY_RESPONSE_TAG: u8 = b'I'; +const BACKEND_KEY_DATA_TAG: u8 = b'K'; +pub const NO_DATA_TAG: u8 = b'n'; +pub const NOTICE_RESPONSE_TAG: u8 = b'N'; +const AUTHENTICATION_TAG: u8 = b'R'; +const PORTAL_SUSPENDED_TAG: u8 = b's'; +pub const PARAMETER_STATUS_TAG: u8 = b'S'; +const PARAMETER_DESCRIPTION_TAG: u8 = b't'; +const ROW_DESCRIPTION_TAG: u8 = b'T'; +pub const READY_FOR_QUERY_TAG: u8 = b'Z'; + +#[derive(Debug, Copy, Clone)] +pub struct Header { + tag: u8, + len: i32, +} + +#[allow(clippy::len_without_is_empty)] +impl Header { + #[inline] + pub fn parse(buf: &[u8]) -> io::Result> { + if buf.len() < 5 { + return Ok(None); + } + + let 
tag = buf[0]; + let len = BigEndian::read_i32(&buf[1..]); + + if len < 4 { + return Err(io::Error::new( + io::ErrorKind::InvalidData, + "invalid message length: header length < 4", + )); + } + + Ok(Some(Header { tag, len })) + } + + #[inline] + pub fn tag(self) -> u8 { + self.tag + } + + #[inline] + pub fn len(self) -> i32 { + self.len + } +} + +/// An enum representing Postgres backend messages. +#[non_exhaustive] +pub enum Message { + AuthenticationCleartextPassword, + AuthenticationGss, + AuthenticationKerberosV5, + AuthenticationMd5Password, + AuthenticationOk, + AuthenticationScmCredential, + AuthenticationSspi, + AuthenticationGssContinue, + AuthenticationSasl(AuthenticationSaslBody), + AuthenticationSaslContinue(AuthenticationSaslContinueBody), + AuthenticationSaslFinal(AuthenticationSaslFinalBody), + BackendKeyData(BackendKeyDataBody), + BindComplete, + CloseComplete, + CommandComplete(CommandCompleteBody), + CopyData, + CopyDone, + CopyInResponse, + CopyOutResponse, + CopyBothResponse, + DataRow(DataRowBody), + EmptyQueryResponse, + ErrorResponse(ErrorResponseBody), + NoData, + NoticeResponse(NoticeResponseBody), + NotificationResponse(NotificationResponseBody), + ParameterDescription(ParameterDescriptionBody), + ParameterStatus(ParameterStatusBody), + ParseComplete, + PortalSuspended, + ReadyForQuery(ReadyForQueryBody), + RowDescription(RowDescriptionBody), +} + +impl Message { + #[inline] + pub fn parse(buf: &mut BytesMut) -> io::Result> { + if buf.len() < 5 { + let to_read = 5 - buf.len(); + buf.reserve(to_read); + return Ok(None); + } + + let tag = buf[0]; + let len = (&buf[1..5]).read_u32::().unwrap(); + + if len < 4 { + return Err(io::Error::new( + io::ErrorKind::InvalidInput, + "invalid message length: parsing u32", + )); + } + + let total_len = len as usize + 1; + if buf.len() < total_len { + let to_read = total_len - buf.len(); + buf.reserve(to_read); + return Ok(None); + } + + let mut buf = Buffer { + bytes: buf.split_to(total_len).freeze(), + 
idx: 5, + }; + + let message = match tag { + PARSE_COMPLETE_TAG => Message::ParseComplete, + BIND_COMPLETE_TAG => Message::BindComplete, + CLOSE_COMPLETE_TAG => Message::CloseComplete, + NOTIFICATION_RESPONSE_TAG => { + let process_id = buf.read_i32::()?; + let channel = buf.read_cstr()?; + let message = buf.read_cstr()?; + Message::NotificationResponse(NotificationResponseBody { + process_id, + channel, + message, + }) + } + COPY_DONE_TAG => Message::CopyDone, + COMMAND_COMPLETE_TAG => { + let tag = buf.read_cstr()?; + Message::CommandComplete(CommandCompleteBody { tag }) + } + COPY_DATA_TAG => Message::CopyData, + DATA_ROW_TAG => { + let len = buf.read_u16::()?; + let storage = buf.read_all(); + Message::DataRow(DataRowBody { storage, len }) + } + ERROR_RESPONSE_TAG => { + let storage = buf.read_all(); + Message::ErrorResponse(ErrorResponseBody { storage }) + } + COPY_IN_RESPONSE_TAG => Message::CopyInResponse, + COPY_OUT_RESPONSE_TAG => Message::CopyOutResponse, + COPY_BOTH_RESPONSE_TAG => Message::CopyBothResponse, + EMPTY_QUERY_RESPONSE_TAG => Message::EmptyQueryResponse, + BACKEND_KEY_DATA_TAG => { + let process_id = buf.read_i32::()?; + let secret_key = buf.read_i32::()?; + Message::BackendKeyData(BackendKeyDataBody { + process_id, + secret_key, + }) + } + NO_DATA_TAG => Message::NoData, + NOTICE_RESPONSE_TAG => { + let storage = buf.read_all(); + Message::NoticeResponse(NoticeResponseBody { storage }) + } + AUTHENTICATION_TAG => match buf.read_i32::()?
{ + 0 => Message::AuthenticationOk, + 2 => Message::AuthenticationKerberosV5, + 3 => Message::AuthenticationCleartextPassword, + 5 => Message::AuthenticationMd5Password, + 6 => Message::AuthenticationScmCredential, + 7 => Message::AuthenticationGss, + 8 => Message::AuthenticationGssContinue, + 9 => Message::AuthenticationSspi, + 10 => { + let storage = buf.read_all(); + Message::AuthenticationSasl(AuthenticationSaslBody(storage)) + } + 11 => { + let storage = buf.read_all(); + Message::AuthenticationSaslContinue(AuthenticationSaslContinueBody(storage)) + } + 12 => { + let storage = buf.read_all(); + Message::AuthenticationSaslFinal(AuthenticationSaslFinalBody(storage)) + } + tag => { + return Err(io::Error::new( + io::ErrorKind::InvalidInput, + format!("unknown authentication tag `{}`", tag), + )); + } + }, + PORTAL_SUSPENDED_TAG => Message::PortalSuspended, + PARAMETER_STATUS_TAG => { + let name = buf.read_cstr()?; + let value = buf.read_cstr()?; + Message::ParameterStatus(ParameterStatusBody { name, value }) + } + PARAMETER_DESCRIPTION_TAG => { + let len = buf.read_u16::()?; + let storage = buf.read_all(); + Message::ParameterDescription(ParameterDescriptionBody { storage, len }) + } + ROW_DESCRIPTION_TAG => { + let len = buf.read_u16::()?; + let storage = buf.read_all(); + Message::RowDescription(RowDescriptionBody { storage, len }) + } + READY_FOR_QUERY_TAG => { + let status = buf.read_u8()?; + Message::ReadyForQuery(ReadyForQueryBody { status }) + } + tag => { + return Err(io::Error::new( + io::ErrorKind::InvalidInput, + format!("unknown message tag `{}`", tag), + )); + } + }; + + if !buf.is_empty() { + return Err(io::Error::new( + io::ErrorKind::InvalidInput, + "invalid message length: expected buffer to be empty", + )); + } + + Ok(Some(message)) + } +} + +/* Read cursor over a message body: `idx` is the offset of the next unread byte in `bytes`. */ +struct Buffer { + bytes: Bytes, + idx: usize, +} + +impl Buffer { + /* Returns the unread tail of the buffer without moving the cursor. */ + #[inline] + fn slice(&self) -> &[u8] { + &self.bytes[self.idx..]
+ } + + #[inline] + fn is_empty(&self) -> bool { + self.slice().is_empty() + } + + /* Returns the bytes before the next NUL and moves the cursor past that NUL; fails with UnexpectedEof when no NUL remains. */ + #[inline] + fn read_cstr(&mut self) -> io::Result { + match memchr(0, self.slice()) { + Some(pos) => { + let start = self.idx; + let end = start + pos; + let cstr = self.bytes.slice(start..end); + self.idx = end + 1; + Ok(cstr) + } + None => Err(io::Error::new( + io::ErrorKind::UnexpectedEof, + "unexpected EOF", + )), + } + } + + /* Returns every unread byte and leaves the cursor at the end of the buffer. */ + #[inline] + fn read_all(&mut self) -> Bytes { + let buf = self.bytes.slice(self.idx..); + self.idx = self.bytes.len(); + buf + } +} + +impl Read for Buffer { + /* Copies as many unread bytes as fit into `buf` and advances the cursor by that amount; never fails. */ + #[inline] + fn read(&mut self, buf: &mut [u8]) -> io::Result { + let len = { + let slice = self.slice(); + let len = cmp::min(slice.len(), buf.len()); + buf[..len].copy_from_slice(&slice[..len]); + len + }; + self.idx += len; + Ok(len) + } +} + +pub struct AuthenticationMd5PasswordBody { + salt: [u8; 4], +} + +impl AuthenticationMd5PasswordBody { + #[inline] + pub fn salt(&self) -> [u8; 4] { + self.salt + } +} + +pub struct AuthenticationSaslBody(Bytes); + +impl AuthenticationSaslBody { + #[inline] + pub fn mechanisms(&self) -> SaslMechanisms<'_> { + SaslMechanisms(&self.0) + } +} + +pub struct SaslMechanisms<'a>(&'a [u8]); + +impl<'a> FallibleIterator for SaslMechanisms<'a> { + type Item = &'a str; + type Error = io::Error; + + /* Yields NUL-terminated mechanism names; an empty name ends the list and must be the final byte of the buffer. */ + #[inline] + fn next(&mut self) -> io::Result> { + let value_end = find_null(self.0, 0)?; + if value_end == 0 { + if self.0.len() != 1 { + return Err(io::Error::new( + io::ErrorKind::InvalidData, + "invalid message length: expected to be at end of iterator for sasl", + )); + } + Ok(None) + } else { + let value = get_str(&self.0[..value_end])?; + self.0 = &self.0[value_end + 1..]; + Ok(Some(value)) + } + } +} + +pub struct AuthenticationSaslContinueBody(Bytes); + +impl AuthenticationSaslContinueBody { + #[inline] + pub fn data(&self) -> &[u8] { + &self.0 + } +} + +pub struct AuthenticationSaslFinalBody(Bytes); + +impl AuthenticationSaslFinalBody { + #[inline] + pub fn
data(&self) -> &[u8] { + &self.0 + } +} + +pub struct BackendKeyDataBody { + process_id: i32, + secret_key: i32, +} + +impl BackendKeyDataBody { + #[inline] + pub fn process_id(&self) -> i32 { + self.process_id + } + + #[inline] + pub fn secret_key(&self) -> i32 { + self.secret_key + } +} + +pub struct CommandCompleteBody { + tag: Bytes, +} + +impl CommandCompleteBody { + #[inline] + pub fn tag(&self) -> io::Result<&str> { + get_str(&self.tag) + } +} + +#[derive(Debug)] +pub struct DataRowBody { + storage: Bytes, + len: u16, +} + +impl DataRowBody { + #[inline] + pub fn ranges(&self) -> DataRowRanges<'_> { + DataRowRanges { + buf: &self.storage, + len: self.storage.len(), + remaining: self.len, + } + } + + #[inline] + pub fn buffer(&self) -> &[u8] { + &self.storage + } +} + +pub struct DataRowRanges<'a> { + buf: &'a [u8], + len: usize, + remaining: u16, +} + +impl FallibleIterator for DataRowRanges<'_> { + type Item = Option>; + type Error = io::Error; + + /* Yields one entry per remaining column: `None` for a negative length, otherwise the byte range of the value measured from the start of the row storage (`len` holds the storage size captured in `ranges`). Leftover bytes after the last column are an error. */ + #[inline] + fn next(&mut self) -> io::Result>>> { + if self.remaining == 0 { + if self.buf.is_empty() { + return Ok(None); + } else { + return Err(io::Error::new( + io::ErrorKind::InvalidInput, + "invalid message length: datarowrange is not empty", + )); + } + } + + self.remaining -= 1; + let len = self.buf.read_i32::()?; + if len < 0 { + Ok(Some(None)) + } else { + let len = len as usize; + if self.buf.len() < len { + return Err(io::Error::new( + io::ErrorKind::UnexpectedEof, + "unexpected EOF", + )); + } + let base = self.len - self.buf.len(); + self.buf = &self.buf[len..]; + Ok(Some(Some(base..base + len))) + } + } + + #[inline] + fn size_hint(&self) -> (usize, Option) { + let len = self.remaining as usize; + (len, Some(len)) + } +} + +pub struct ErrorResponseBody { + storage: Bytes, +} + +impl ErrorResponseBody { + #[inline] + pub fn fields(&self) -> ErrorFields<'_> { + ErrorFields { buf: &self.storage } + } +} + +pub struct ErrorFields<'a> { + buf: &'a [u8], +} + +impl<'a> FallibleIterator for
ErrorFields<'a> { + type Item = ErrorField<'a>; + type Error = io::Error; + + #[inline] + fn next(&mut self) -> io::Result>> { + let type_ = self.buf.read_u8()?; + /* A zero type byte ends the field list; any bytes after it are rejected. */ + if type_ == 0 { + if self.buf.is_empty() { + return Ok(None); + } else { + return Err(io::Error::new( + io::ErrorKind::InvalidInput, + "invalid message length: error fields is not drained", + )); + } + } + + let value_end = find_null(self.buf, 0)?; + let value = get_str(&self.buf[..value_end])?; + self.buf = &self.buf[value_end + 1..]; + + Ok(Some(ErrorField { type_, value })) + } +} + +pub struct ErrorField<'a> { + type_: u8, + value: &'a str, +} + +impl ErrorField<'_> { + #[inline] + pub fn type_(&self) -> u8 { + self.type_ + } + + #[inline] + pub fn value(&self) -> &str { + self.value + } +} + +pub struct NoticeResponseBody { + storage: Bytes, +} + +impl NoticeResponseBody { + #[inline] + pub fn fields(&self) -> ErrorFields<'_> { + ErrorFields { buf: &self.storage } + } + + pub fn as_bytes(&self) -> &[u8] { + &self.storage + } +} + +pub struct NotificationResponseBody { + process_id: i32, + channel: Bytes, + message: Bytes, +} + +impl NotificationResponseBody { + #[inline] + pub fn process_id(&self) -> i32 { + self.process_id + } + + #[inline] + pub fn channel(&self) -> io::Result<&str> { + get_str(&self.channel) + } + + #[inline] + pub fn message(&self) -> io::Result<&str> { + get_str(&self.message) + } +} + +pub struct ParameterDescriptionBody { + storage: Bytes, + len: u16, +} + +impl ParameterDescriptionBody { + #[inline] + pub fn parameters(&self) -> Parameters<'_> { + Parameters { + buf: &self.storage, + remaining: self.len, + } + } +} + +pub struct Parameters<'a> { + buf: &'a [u8], + remaining: u16, +} + +impl FallibleIterator for Parameters<'_> { + type Item = Oid; + type Error = io::Error; + + #[inline] + fn next(&mut self) -> io::Result> { + if self.remaining == 0 { + if self.buf.is_empty() { + return Ok(None); + } else { + return Err(io::Error::new( + io::ErrorKind::InvalidInput, + "invalid
message length: parameters is not drained", + )); + } + } + + self.remaining -= 1; + self.buf.read_u32::().map(Some) + } + + #[inline] + fn size_hint(&self) -> (usize, Option) { + let len = self.remaining as usize; + (len, Some(len)) + } +} + +pub struct ParameterStatusBody { + name: Bytes, + value: Bytes, +} + +impl ParameterStatusBody { + #[inline] + pub fn name(&self) -> io::Result<&str> { + get_str(&self.name) + } + + #[inline] + pub fn value(&self) -> io::Result<&str> { + get_str(&self.value) + } +} + +pub struct ReadyForQueryBody { + status: u8, +} + +impl ReadyForQueryBody { + #[inline] + pub fn status(&self) -> u8 { + self.status + } +} + +pub struct RowDescriptionBody { + storage: Bytes, + len: u16, +} + +impl RowDescriptionBody { + #[inline] + pub fn fields(&self) -> Fields<'_> { + Fields { + buf: &self.storage, + remaining: self.len, + } + } +} + +pub struct Fields<'a> { + buf: &'a [u8], + remaining: u16, +} + +impl<'a> FallibleIterator for Fields<'a> { + type Item = Field<'a>; + type Error = io::Error; + + #[inline] + fn next(&mut self) -> io::Result>> { + if self.remaining == 0 { + if self.buf.is_empty() { + return Ok(None); + } else { + return Err(io::Error::new( + io::ErrorKind::InvalidInput, + "invalid message length: field is not drained", + )); + } + } + + self.remaining -= 1; + /* Each field is a NUL-terminated name followed by six fixed-width integers, consumed in the order they are read below. */ + let name_end = find_null(self.buf, 0)?; + let name = get_str(&self.buf[..name_end])?; + self.buf = &self.buf[name_end + 1..]; + let table_oid = self.buf.read_u32::()?; + let column_id = self.buf.read_i16::()?; + let type_oid = self.buf.read_u32::()?; + let type_size = self.buf.read_i16::()?; + let type_modifier = self.buf.read_i32::()?; + let format = self.buf.read_i16::()?; + + Ok(Some(Field { + name, + table_oid, + column_id, + type_oid, + type_size, + type_modifier, + format, + })) + } +} + +pub struct Field<'a> { + name: &'a str, + table_oid: Oid, + column_id: i16, + type_oid: Oid, + type_size: i16, + type_modifier: i32, + format: i16, +} + +impl<'a> Field<'a> { +
#[inline] + pub fn name(&self) -> &'a str { + self.name + } + + #[inline] + pub fn table_oid(&self) -> Oid { + self.table_oid + } + + #[inline] + pub fn column_id(&self) -> i16 { + self.column_id + } + + #[inline] + pub fn type_oid(&self) -> Oid { + self.type_oid + } + + #[inline] + pub fn type_size(&self) -> i16 { + self.type_size + } + + #[inline] + pub fn type_modifier(&self) -> i32 { + self.type_modifier + } + + #[inline] + pub fn format(&self) -> i16 { + self.format + } +} + +/* Returns the index, relative to the start of `buf`, of the first NUL at or after `start`; UnexpectedEof when there is none. */ +#[inline] +fn find_null(buf: &[u8], start: usize) -> io::Result { + match memchr(0, &buf[start..]) { + Some(pos) => Ok(pos + start), + None => Err(io::Error::new( + io::ErrorKind::UnexpectedEof, + "unexpected EOF", + )), + } +} + +/* Interprets `buf` as UTF-8, mapping a decoding failure to an InvalidInput I/O error. */ +#[inline] +fn get_str(buf: &[u8]) -> io::Result<&str> { + str::from_utf8(buf).map_err(|e| io::Error::new(io::ErrorKind::InvalidInput, e)) +} diff --git a/libs/proxy/postgres-protocol2/src/message/frontend.rs b/libs/proxy/postgres-protocol2/src/message/frontend.rs new file mode 100644 index 0000000000..bc6168f337 --- /dev/null +++ b/libs/proxy/postgres-protocol2/src/message/frontend.rs @@ -0,0 +1,309 @@ +//! Frontend message serialization.
+#![allow(missing_docs)] + +use byteorder::{BigEndian, ByteOrder}; +use bytes::{Buf, BufMut, BytesMut}; +use std::convert::TryFrom; +use std::error::Error; +use std::io; +use std::marker; + +use crate::{write_nullable, FromUsize, IsNull, Oid}; + +/* Reserves a 4-byte length prefix, lets `f` append the message body, then backfills the prefix with the number of bytes written since `base` (the prefix itself included). */ +#[inline] +fn write_body(buf: &mut BytesMut, f: F) -> Result<(), E> +where + F: FnOnce(&mut BytesMut) -> Result<(), E>, + E: From, +{ + let base = buf.len(); + buf.extend_from_slice(&[0; 4]); + + f(buf)?; + + let size = i32::from_usize(buf.len() - base)?; + BigEndian::write_i32(&mut buf[base..], size); + Ok(()) +} + +pub enum BindError { + Conversion(Box), + Serialization(io::Error), +} + +impl From> for BindError { + #[inline] + fn from(e: Box) -> BindError { + BindError::Conversion(e) + } +} + +impl From for BindError { + #[inline] + fn from(e: io::Error) -> BindError { + BindError::Serialization(e) + } +} + +/* Writes a `B` message: portal name, statement name, then three counted lists (parameter formats, nullable parameter values produced by `serializer`, result formats). */ +#[inline] +pub fn bind( + portal: &str, + statement: &str, + formats: I, + values: J, + mut serializer: F, + result_formats: K, + buf: &mut BytesMut, +) -> Result<(), BindError> +where + I: IntoIterator, + J: IntoIterator, + F: FnMut(T, &mut BytesMut) -> Result>, + K: IntoIterator, +{ + buf.put_u8(b'B'); + + write_body(buf, |buf| { + write_cstr(portal.as_bytes(), buf)?; + write_cstr(statement.as_bytes(), buf)?; + write_counted( + formats, + |f, buf| { + buf.put_i16(f); + Ok::<_, io::Error>(()) + }, + buf, + )?; + write_counted( + values, + |v, buf| write_nullable(|buf| serializer(v, buf), buf), + buf, + )?; + write_counted( + result_formats, + |f, buf| { + buf.put_i16(f); + Ok::<_, io::Error>(()) + }, + buf, + )?; + + Ok(()) + }) +} + +/* Reserves a 2-byte count, serializes every item after it, then backfills the count with the number of items written. */ +#[inline] +fn write_counted(items: I, mut serializer: F, buf: &mut BytesMut) -> Result<(), E> +where + I: IntoIterator, + F: FnMut(T, &mut BytesMut) -> Result<(), E>, + E: From, +{ + let base = buf.len(); + buf.extend_from_slice(&[0; 2]); + let mut count = 0; + for item in items { + serializer(item, buf)?; + count += 1; + } + let count = i16::from_usize(count)?; +
BigEndian::write_i16(&mut buf[base..], count); + + Ok(()) +} + +/* Writes an untagged cancel packet; 80877102 is the CancelRequest code defined by the Postgres protocol. */ +#[inline] +pub fn cancel_request(process_id: i32, secret_key: i32, buf: &mut BytesMut) { + write_body(buf, |buf| { + buf.put_i32(80_877_102); + buf.put_i32(process_id); + buf.put_i32(secret_key); + Ok::<_, io::Error>(()) + }) + .unwrap(); +} + +#[inline] +pub fn close(variant: u8, name: &str, buf: &mut BytesMut) -> io::Result<()> { + buf.put_u8(b'C'); + write_body(buf, |buf| { + buf.put_u8(variant); + write_cstr(name.as_bytes(), buf) + }) +} + +pub struct CopyData { + buf: T, + len: i32, +} + +impl CopyData +where + T: Buf, +{ + /* Precomputes the wire length (payload plus the 4-byte length field) and fails if it does not fit in an i32. */ + pub fn new(buf: T) -> io::Result> { + let len = buf + .remaining() + .checked_add(4) + .and_then(|l| i32::try_from(l).ok()) + .ok_or_else(|| { + io::Error::new(io::ErrorKind::InvalidInput, "message length overflow") + })?; + + Ok(CopyData { buf, len }) + } + + pub fn write(self, out: &mut BytesMut) { + out.put_u8(b'd'); + out.put_i32(self.len); + out.put(self.buf); + } +} + +#[inline] +pub fn copy_done(buf: &mut BytesMut) { + buf.put_u8(b'c'); + write_body(buf, |_| Ok::<(), io::Error>(())).unwrap(); +} + +#[inline] +pub fn copy_fail(message: &str, buf: &mut BytesMut) -> io::Result<()> { + buf.put_u8(b'f'); + write_body(buf, |buf| write_cstr(message.as_bytes(), buf)) +} + +#[inline] +pub fn describe(variant: u8, name: &str, buf: &mut BytesMut) -> io::Result<()> { + buf.put_u8(b'D'); + write_body(buf, |buf| { + buf.put_u8(variant); + write_cstr(name.as_bytes(), buf) + }) +} + +#[inline] +pub fn execute(portal: &str, max_rows: i32, buf: &mut BytesMut) -> io::Result<()> { + buf.put_u8(b'E'); + write_body(buf, |buf| { + write_cstr(portal.as_bytes(), buf)?; + buf.put_i32(max_rows); + Ok(()) + }) +} + +#[inline] +pub fn parse(name: &str, query: &str, param_types: I, buf: &mut BytesMut) -> io::Result<()> +where + I: IntoIterator, +{ + buf.put_u8(b'P'); + write_body(buf, |buf| { + write_cstr(name.as_bytes(), buf)?; + write_cstr(query.as_bytes(), buf)?; + write_counted( + param_types, +
|t, buf| { + buf.put_u32(t); + Ok::<_, io::Error>(()) + }, + buf, + )?; + Ok(()) + }) +} + +#[inline] +pub fn password_message(password: &[u8], buf: &mut BytesMut) -> io::Result<()> { + buf.put_u8(b'p'); + write_body(buf, |buf| write_cstr(password, buf)) +} + +#[inline] +pub fn query(query: &str, buf: &mut BytesMut) -> io::Result<()> { + buf.put_u8(b'Q'); + write_body(buf, |buf| write_cstr(query.as_bytes(), buf)) +} + +#[inline] +pub fn sasl_initial_response(mechanism: &str, data: &[u8], buf: &mut BytesMut) -> io::Result<()> { + buf.put_u8(b'p'); + write_body(buf, |buf| { + write_cstr(mechanism.as_bytes(), buf)?; + let len = i32::from_usize(data.len())?; + buf.put_i32(len); + buf.put_slice(data); + Ok(()) + }) +} + +#[inline] +pub fn sasl_response(data: &[u8], buf: &mut BytesMut) -> io::Result<()> { + buf.put_u8(b'p'); + write_body(buf, |buf| { + buf.put_slice(data); + Ok(()) + }) +} + +/* Writes an untagged SSL negotiation packet; 80877103 is the SSLRequest code defined by the Postgres protocol. */ +#[inline] +pub fn ssl_request(buf: &mut BytesMut) { + write_body(buf, |buf| { + buf.put_i32(80_877_103); + Ok::<_, io::Error>(()) + }) + .unwrap(); +} + +/* Writes an untagged startup packet: length prefix, protocol version, the already NUL-delimited name/value pairs held in `parameters.params`, then one terminating NUL. */ +#[inline] +pub fn startup_message(parameters: &StartupMessageParams, buf: &mut BytesMut) -> io::Result<()> { + write_body(buf, |buf| { + // postgres protocol version 3.0(196608) in big-endian + buf.put_i32(0x00_03_00_00); + buf.put_slice(&parameters.params); + buf.put_u8(0); + Ok(()) + }) +} + +#[derive(Debug, Clone, Default, PartialEq, Eq)] +pub struct StartupMessageParams { + pub params: BytesMut, +} + +impl StartupMessageParams { + /// Set parameter's value by its name. Appends the pair without replacing an earlier entry of the same name, and panics if either string contains a NUL byte.
+ pub fn insert(&mut self, name: &str, value: &str) { + if name.contains('\0') || value.contains('\0') { + panic!("startup parameter name or value contained a null") + } + self.params.put_slice(name.as_bytes()); + self.params.put_u8(0); + self.params.put_slice(value.as_bytes()); + self.params.put_u8(0); + } +} + +#[inline] +pub fn sync(buf: &mut BytesMut) { + buf.put_u8(b'S'); + write_body(buf, |_| Ok::<(), io::Error>(())).unwrap(); +} + +#[inline] +pub fn terminate(buf: &mut BytesMut) { + buf.put_u8(b'X'); + write_body(buf, |_| Ok::<(), io::Error>(())).unwrap(); +} + +/* Appends `s` followed by a NUL terminator; rejects input that already contains a NUL with InvalidInput. */ +#[inline] +fn write_cstr(s: &[u8], buf: &mut BytesMut) -> Result<(), io::Error> { + if s.contains(&0) { + return Err(io::Error::new( + io::ErrorKind::InvalidInput, + "string contains embedded null", + )); + } + buf.put_slice(s); + buf.put_u8(0); + Ok(()) +} diff --git a/libs/proxy/postgres-protocol2/src/message/mod.rs b/libs/proxy/postgres-protocol2/src/message/mod.rs new file mode 100644 index 0000000000..9e5d997548 --- /dev/null +++ b/libs/proxy/postgres-protocol2/src/message/mod.rs @@ -0,0 +1,8 @@ +//! Postgres message protocol support. +//! +//! See [Postgres's documentation][docs] for more information on message flow. +//! +//! [docs]: https://www.postgresql.org/docs/9.5/static/protocol-flow.html + +pub mod backend; +pub mod frontend; diff --git a/libs/proxy/postgres-protocol2/src/password/mod.rs b/libs/proxy/postgres-protocol2/src/password/mod.rs new file mode 100644 index 0000000000..38eb31dfcf --- /dev/null +++ b/libs/proxy/postgres-protocol2/src/password/mod.rs @@ -0,0 +1,89 @@ +//! Functions to encrypt a password in the client. +//! +//! This is intended to be used by client applications that wish to +//! send commands like `ALTER USER joe PASSWORD 'pwd'`. The password +//! need not be sent in cleartext if it is encrypted on the client +//! side. This is good because it ensures the cleartext password won't +//! end up in logs, pg_stat displays, etc.
+ +use crate::authentication::sasl; +use hmac::{Hmac, Mac}; +use rand::RngCore; +use sha2::digest::FixedOutput; +use sha2::{Digest, Sha256}; + +#[cfg(test)] +mod test; + +const SCRAM_DEFAULT_ITERATIONS: u32 = 4096; +const SCRAM_DEFAULT_SALT_LEN: usize = 16; + +/// Hash password using SCRAM-SHA-256 with a randomly-generated +/// salt. +/// +/// The client may assume the returned string doesn't contain any +/// special characters that would require escaping in an SQL command. +pub async fn scram_sha_256(password: &[u8]) -> String { + let mut salt: [u8; SCRAM_DEFAULT_SALT_LEN] = [0; SCRAM_DEFAULT_SALT_LEN]; + let mut rng = rand::thread_rng(); + rng.fill_bytes(&mut salt); + scram_sha_256_salt(password, salt).await +} + +// Internal implementation of scram_sha_256 with a caller-provided +// salt. This is useful for testing. +pub(crate) async fn scram_sha_256_salt( + password: &[u8], + salt: [u8; SCRAM_DEFAULT_SALT_LEN], +) -> String { + // Prepare the password, per [RFC + // 4013](https://tools.ietf.org/html/rfc4013), if possible. + // + // Postgres treats passwords as byte strings (without embedded NUL + // bytes), but SASL expects passwords to be valid UTF-8. + // + // Follow the behavior of libpq's PQencryptPasswordConn(), and + // also the backend. If the password is not valid UTF-8, or if it + // contains prohibited characters (such as non-ASCII whitespace), + // just skip the SASLprep step and use the original byte + // sequence.
+ let prepared: Vec = match std::str::from_utf8(password) { + Ok(password_str) => { + match stringprep::saslprep(password_str) { + Ok(p) => p.into_owned().into_bytes(), + // contains invalid characters; skip saslprep + Err(_) => Vec::from(password), + } + } + // not valid UTF-8; skip saslprep + Err(_) => Vec::from(password), + }; + + // salt password with the default iteration count + let salted_password = sasl::hi(&prepared, &salt, SCRAM_DEFAULT_ITERATIONS).await; + + // client key: HMAC keyed with the salted password over the literal "Client Key" + let mut hmac = Hmac::::new_from_slice(&salted_password) + .expect("HMAC is able to accept all key sizes"); + hmac.update(b"Client Key"); + let client_key = hmac.finalize().into_bytes(); + + // stored key: SHA-256 digest of the client key + let mut hash = Sha256::default(); + hash.update(client_key.as_slice()); + let stored_key = hash.finalize_fixed(); + + // server key: HMAC keyed with the salted password over the literal "Server Key" + let mut hmac = Hmac::::new_from_slice(&salted_password) + .expect("HMAC is able to accept all key sizes"); + hmac.update(b"Server Key"); + let server_key = hmac.finalize().into_bytes(); + + /* Result layout: SCRAM-SHA-256$<iterations>:<salt>$<stored key>:<server key>, with the three binary parts base64-encoded. */ + format!( + "SCRAM-SHA-256${}:{}${}:{}", + SCRAM_DEFAULT_ITERATIONS, + base64::encode(salt), + base64::encode(stored_key), + base64::encode(server_key) + ) +} diff --git a/libs/proxy/postgres-protocol2/src/password/test.rs b/libs/proxy/postgres-protocol2/src/password/test.rs new file mode 100644 index 0000000000..0692c07adb --- /dev/null +++ b/libs/proxy/postgres-protocol2/src/password/test.rs @@ -0,0 +1,11 @@ +use crate::password; + +#[tokio::test] +async fn test_encrypt_scram_sha_256() { + // Specify the salt to make the test deterministic. Any bytes will do.
+ let salt: [u8; 16] = [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16]; + assert_eq!( + password::scram_sha_256_salt(b"secret", salt).await, + "SCRAM-SHA-256$4096:AQIDBAUGBwgJCgsMDQ4PEA==$8rrDg00OqaiWXJ7p+sCgHEIaBSHY89ZJl3mfIsf32oY=:05L1f+yZbiN8O0AnO40Og85NNRhvzTS57naKRWCcsIA=" + ); +} diff --git a/libs/proxy/postgres-protocol2/src/types/mod.rs b/libs/proxy/postgres-protocol2/src/types/mod.rs new file mode 100644 index 0000000000..78131c05bf --- /dev/null +++ b/libs/proxy/postgres-protocol2/src/types/mod.rs @@ -0,0 +1,294 @@ +//! Conversions to and from Postgres's binary format for various types. +use byteorder::{BigEndian, ReadBytesExt}; +use bytes::{BufMut, BytesMut}; +use fallible_iterator::FallibleIterator; +use std::boxed::Box as StdBox; +use std::error::Error; +use std::str; + +use crate::Oid; + +#[cfg(test)] +mod test; + +/// Serializes a `TEXT`, `VARCHAR`, `CHAR(n)`, `NAME`, or `CITEXT` value. +#[inline] +pub fn text_to_sql(v: &str, buf: &mut BytesMut) { + buf.put_slice(v.as_bytes()); +} + +/// Deserializes a `TEXT`, `VARCHAR`, `CHAR(n)`, `NAME`, or `CITEXT` value. +#[inline] +pub fn text_from_sql(buf: &[u8]) -> Result<&str, StdBox> { + Ok(str::from_utf8(buf)?) +} + +/// Deserializes a `"char"` value, rejecting buffers that are not exactly one byte long. +#[inline] +pub fn char_from_sql(mut buf: &[u8]) -> Result> { + let v = buf.read_i8()?; + if !buf.is_empty() { + return Err("invalid buffer size".into()); + } + Ok(v) +} + +/// Serializes an `OID` value. +#[inline] +pub fn oid_to_sql(v: Oid, buf: &mut BytesMut) { + buf.put_u32(v); +} + +/// Deserializes an `OID` value, rejecting buffers with bytes left over after the 32-bit value. +#[inline] +pub fn oid_from_sql(mut buf: &[u8]) -> Result> { + let v = buf.read_u32::()?; + if !buf.is_empty() { + return Err("invalid buffer size".into()); + } + Ok(v) +} + +/// A fallible iterator over `HSTORE` entries.
+pub struct HstoreEntries<'a> { + remaining: i32, + buf: &'a [u8], +} + +impl<'a> FallibleIterator for HstoreEntries<'a> { + type Item = (&'a str, Option<&'a str>); + type Error = StdBox; + + #[inline] + #[allow(clippy::type_complexity)] + fn next( + &mut self, + ) -> Result)>, StdBox> { + if self.remaining == 0 { + if !self.buf.is_empty() { + return Err("invalid buffer size".into()); + } + return Ok(None); + } + + self.remaining -= 1; + + let key_len = self.buf.read_i32::()?; + if key_len < 0 { + return Err("invalid key length".into()); + } + let (key, buf) = self.buf.split_at(key_len as usize); + let key = str::from_utf8(key)?; + self.buf = buf; + + let value_len = self.buf.read_i32::()?; + let value = if value_len < 0 { + None + } else { + let (value, buf) = self.buf.split_at(value_len as usize); + let value = str::from_utf8(value)?; + self.buf = buf; + Some(value) + }; + + Ok(Some((key, value))) + } + + #[inline] + fn size_hint(&self) -> (usize, Option) { + let len = self.remaining as usize; + (len, Some(len)) + } +} + +/// Deserializes an array value. +#[inline] +pub fn array_from_sql(mut buf: &[u8]) -> Result, StdBox> { + let dimensions = buf.read_i32::()?; + if dimensions < 0 { + return Err("invalid dimension count".into()); + } + + let mut r = buf; + let mut elements = 1i32; + for _ in 0..dimensions { + let len = r.read_i32::()?; + if len < 0 { + return Err("invalid dimension size".into()); + } + let _lower_bound = r.read_i32::()?; + elements = match elements.checked_mul(len) { + Some(elements) => elements, + None => return Err("too many array elements".into()), + }; + } + + if dimensions == 0 { + elements = 0; + } + + Ok(Array { + dimensions, + elements, + buf, + }) +} + +/// A Postgres array. +pub struct Array<'a> { + dimensions: i32, + elements: i32, + buf: &'a [u8], +} + +impl<'a> Array<'a> { + /// Returns an iterator over the dimensions of the array. 
+ #[inline] + pub fn dimensions(&self) -> ArrayDimensions<'a> { + ArrayDimensions(&self.buf[..self.dimensions as usize * 8]) + } + + /// Returns an iterator over the values of the array. + #[inline] + pub fn values(&self) -> ArrayValues<'a> { + ArrayValues { + remaining: self.elements, + buf: &self.buf[self.dimensions as usize * 8..], + } + } +} + +/// An iterator over the dimensions of an array. +pub struct ArrayDimensions<'a>(&'a [u8]); + +impl FallibleIterator for ArrayDimensions<'_> { + type Item = ArrayDimension; + type Error = StdBox; + + #[inline] + fn next(&mut self) -> Result, StdBox> { + if self.0.is_empty() { + return Ok(None); + } + + let len = self.0.read_i32::()?; + let lower_bound = self.0.read_i32::()?; + + Ok(Some(ArrayDimension { len, lower_bound })) + } + + #[inline] + fn size_hint(&self) -> (usize, Option) { + let len = self.0.len() / 8; + (len, Some(len)) + } +} + +/// Information about a dimension of an array. +#[derive(Debug, Copy, Clone, PartialEq, Eq)] +pub struct ArrayDimension { + /// The length of this dimension. + pub len: i32, + + /// The base value used to index into this dimension. + pub lower_bound: i32, +} + +/// An iterator over the values of an array, in row-major order. 
+pub struct ArrayValues<'a> { + remaining: i32, + buf: &'a [u8], +} + +impl<'a> FallibleIterator for ArrayValues<'a> { + type Item = Option<&'a [u8]>; + type Error = StdBox; + + #[inline] + fn next(&mut self) -> Result>, StdBox> { + if self.remaining == 0 { + if !self.buf.is_empty() { + return Err("invalid message length: arrayvalue not drained".into()); + } + return Ok(None); + } + self.remaining -= 1; + + let len = self.buf.read_i32::()?; + let val = if len < 0 { + None + } else { + if self.buf.len() < len as usize { + return Err("invalid value length".into()); + } + + let (val, buf) = self.buf.split_at(len as usize); + self.buf = buf; + Some(val) + }; + + Ok(Some(val)) + } + + fn size_hint(&self) -> (usize, Option) { + let len = self.remaining as usize; + (len, Some(len)) + } +} + +/// Serializes a Postgres ltree string +#[inline] +pub fn ltree_to_sql(v: &str, buf: &mut BytesMut) { + // A version number is prepended to an ltree string per spec + buf.put_u8(1); + // Append the rest of the query + buf.put_slice(v.as_bytes()); +} + +/// Deserialize a Postgres ltree string +#[inline] +pub fn ltree_from_sql(buf: &[u8]) -> Result<&str, StdBox> { + match buf { + // Remove the version number from the front of the ltree per spec + [1u8, rest @ ..] => Ok(str::from_utf8(rest)?), + _ => Err("ltree version 1 only supported".into()), + } +} + +/// Serializes a Postgres lquery string +#[inline] +pub fn lquery_to_sql(v: &str, buf: &mut BytesMut) { + // A version number is prepended to an lquery string per spec + buf.put_u8(1); + // Append the rest of the query + buf.put_slice(v.as_bytes()); +} + +/// Deserialize a Postgres lquery string +#[inline] +pub fn lquery_from_sql(buf: &[u8]) -> Result<&str, StdBox> { + match buf { + // Remove the version number from the front of the lquery per spec + [1u8, rest @ ..] 
=> Ok(str::from_utf8(rest)?), + _ => Err("lquery version 1 only supported".into()), + } +} + +/// Serializes a Postgres ltxtquery string +#[inline] +pub fn ltxtquery_to_sql(v: &str, buf: &mut BytesMut) { + // A version number is prepended to an ltxtquery string per spec + buf.put_u8(1); + // Append the rest of the query + buf.put_slice(v.as_bytes()); +} + +/// Deserialize a Postgres ltxtquery string +#[inline] +pub fn ltxtquery_from_sql(buf: &[u8]) -> Result<&str, StdBox> { + match buf { + // Remove the version number from the front of the ltxtquery per spec + [1u8, rest @ ..] => Ok(str::from_utf8(rest)?), + _ => Err("ltxtquery version 1 only supported".into()), + } +} diff --git a/libs/proxy/postgres-protocol2/src/types/test.rs b/libs/proxy/postgres-protocol2/src/types/test.rs new file mode 100644 index 0000000000..96cc055bc3 --- /dev/null +++ b/libs/proxy/postgres-protocol2/src/types/test.rs @@ -0,0 +1,87 @@ +use bytes::{Buf, BytesMut}; + +use super::*; + +#[test] +fn ltree_sql() { + let mut query = vec![1u8]; + query.extend_from_slice("A.B.C".as_bytes()); + + let mut buf = BytesMut::new(); + + ltree_to_sql("A.B.C", &mut buf); + + assert_eq!(query.as_slice(), buf.chunk()); +} + +#[test] +fn ltree_str() { + let mut query = vec![1u8]; + query.extend_from_slice("A.B.C".as_bytes()); + + assert!(ltree_from_sql(query.as_slice()).is_ok()) +} + +#[test] +fn ltree_wrong_version() { + let mut query = vec![2u8]; + query.extend_from_slice("A.B.C".as_bytes()); + + assert!(ltree_from_sql(query.as_slice()).is_err()) +} + +#[test] +fn lquery_sql() { + let mut query = vec![1u8]; + query.extend_from_slice("A.B.C".as_bytes()); + + let mut buf = BytesMut::new(); + + lquery_to_sql("A.B.C", &mut buf); + + assert_eq!(query.as_slice(), buf.chunk()); +} + +#[test] +fn lquery_str() { + let mut query = vec![1u8]; + query.extend_from_slice("A.B.C".as_bytes()); + + assert!(lquery_from_sql(query.as_slice()).is_ok()) +} + +#[test] +fn lquery_wrong_version() { + let mut query = vec![2u8]; + 
query.extend_from_slice("A.B.C".as_bytes()); + + assert!(lquery_from_sql(query.as_slice()).is_err()) +} + +#[test] +fn ltxtquery_sql() { + let mut query = vec![1u8]; + query.extend_from_slice("a & b*".as_bytes()); + + let mut buf = BytesMut::new(); + + ltree_to_sql("a & b*", &mut buf); + + assert_eq!(query.as_slice(), buf.chunk()); +} + +#[test] +fn ltxtquery_str() { + let mut query = vec![1u8]; + query.extend_from_slice("a & b*".as_bytes()); + + assert!(ltree_from_sql(query.as_slice()).is_ok()) +} + +#[test] +fn ltxtquery_wrong_version() { + let mut query = vec![2u8]; + query.extend_from_slice("a & b*".as_bytes()); + + assert!(ltree_from_sql(query.as_slice()).is_err()) +} diff --git a/libs/proxy/postgres-types2/Cargo.toml b/libs/proxy/postgres-types2/Cargo.toml new file mode 100644 index 0000000000..58cfb5571f --- /dev/null +++ b/libs/proxy/postgres-types2/Cargo.toml @@ -0,0 +1,10 @@ +[package] +name = "postgres-types2" +version = "0.1.0" +edition = "2018" +license = "MIT/Apache-2.0" + +[dependencies] +bytes.workspace = true +fallible-iterator.workspace = true +postgres-protocol2 = { path = "../postgres-protocol2" } diff --git a/libs/proxy/postgres-types2/src/lib.rs b/libs/proxy/postgres-types2/src/lib.rs new file mode 100644 index 0000000000..18ba032151 --- /dev/null +++ b/libs/proxy/postgres-types2/src/lib.rs @@ -0,0 +1,477 @@ +//! Conversions to and from Postgres types. +//! +//! This crate is used by the `tokio-postgres` and `postgres` crates. You normally don't need to depend directly on it +//! unless you want to define your own `ToSql` or `FromSql` definitions. 
+#![doc(html_root_url = "https://docs.rs/postgres-types/0.2")] +#![warn(clippy::all, rust_2018_idioms, missing_docs)] + +use fallible_iterator::FallibleIterator; +use postgres_protocol2::types; +use std::any::type_name; +use std::error::Error; +use std::fmt; +use std::sync::Arc; + +use crate::type_gen::{Inner, Other}; + +#[doc(inline)] +pub use postgres_protocol2::Oid; + +use bytes::BytesMut; + +/// Generates a simple implementation of `ToSql::accepts` which accepts the +/// types passed to it. +macro_rules! accepts { + ($($expected:ident),+) => ( + fn accepts(ty: &$crate::Type) -> bool { + matches!(*ty, $($crate::Type::$expected)|+) + } + ) +} + +/// Generates an implementation of `ToSql::to_sql_checked`. +/// +/// All `ToSql` implementations should use this macro. +macro_rules! to_sql_checked { + () => { + fn to_sql_checked( + &self, + ty: &$crate::Type, + out: &mut $crate::private::BytesMut, + ) -> ::std::result::Result< + $crate::IsNull, + Box, + > { + $crate::__to_sql_checked(self, ty, out) + } + }; +} + +// WARNING: this function is not considered part of this crate's public API. +// It is subject to change at any time. It returns a `WrongType` error when `T::accepts(ty)` is false and otherwise forwards to `to_sql`. +#[doc(hidden)] +pub fn __to_sql_checked( + v: &T, + ty: &Type, + out: &mut BytesMut, +) -> Result> +where + T: ToSql, +{ + if !T::accepts(ty) { + return Err(Box::new(WrongType::new::(ty.clone()))); + } + v.to_sql(ty, out) +} + +// mod pg_lsn; +#[doc(hidden)] +pub mod private; +// mod special; +mod type_gen; + +/// A Postgres type. +#[derive(PartialEq, Eq, Clone, Hash)] +pub struct Type(Inner); + +impl fmt::Debug for Type { + fn fmt(&self, fmt: &mut fmt::Formatter<'_>) -> fmt::Result { + fmt::Debug::fmt(&self.0, fmt) + } +} + +/* Prints the type name, prefixed with `schema.` unless the schema is `public` or `pg_catalog`. */ +impl fmt::Display for Type { + fn fmt(&self, fmt: &mut fmt::Formatter<'_>) -> fmt::Result { + match self.schema() { + "public" | "pg_catalog" => {} + schema => write!(fmt, "{}.", schema)?, + } + fmt.write_str(self.name()) + } +} + +impl Type { + /// Creates a new `Type`.
+ pub fn new(name: String, oid: Oid, kind: Kind, schema: String) -> Type { + Type(Inner::Other(Arc::new(Other { + name, + oid, + kind, + schema, + }))) + } + + /// Returns the `Type` corresponding to the provided `Oid` if it + /// corresponds to a built-in type. + pub fn from_oid(oid: Oid) -> Option { + Inner::from_oid(oid).map(Type) + } + + /// Returns the OID of the `Type`. + pub fn oid(&self) -> Oid { + self.0.oid() + } + + /// Returns the kind of this type. + pub fn kind(&self) -> &Kind { + self.0.kind() + } + + /// Returns the schema of this type; every variant other than `Inner::Other` reports `pg_catalog`. + pub fn schema(&self) -> &str { + match self.0 { + Inner::Other(ref u) => &u.schema, + _ => "pg_catalog", + } + } + + /// Returns the name of this type. + pub fn name(&self) -> &str { + self.0.name() + } +} + +/// Represents the kind of a Postgres type. +#[derive(Debug, Clone, PartialEq, Eq, Hash)] +#[non_exhaustive] +pub enum Kind { + /// A simple type like `VARCHAR` or `INTEGER`. + Simple, + /// An enumerated type along with its variants. + Enum(Vec), + /// A pseudo-type. + Pseudo, + /// An array type along with the type of its elements. + Array(Type), + /// A range type along with the type of its elements. + Range(Type), + /// A multirange type along with the type of its elements. + Multirange(Type), + /// A domain type along with its underlying type. + Domain(Type), + /// A composite type along with information about its fields. + Composite(Vec), +} + +/// Information about a field of a composite type. +#[derive(Debug, Clone, PartialEq, Eq, Hash)] +pub struct Field { + name: String, + type_: Type, +} + +impl Field { + /// Creates a new `Field`. + pub fn new(name: String, type_: Type) -> Field { + Field { name, type_ } + } + + /// Returns the name of the field. + pub fn name(&self) -> &str { + &self.name + } + + /// Returns the type of the field.
+ pub fn type_(&self) -> &Type { + &self.type_ + } +} + +/// An error indicating that a `NULL` Postgres value was passed to a `FromSql` +/// implementation that does not support `NULL` values. +#[derive(Debug, Clone, Copy)] +pub struct WasNull; + +impl fmt::Display for WasNull { + fn fmt(&self, fmt: &mut fmt::Formatter<'_>) -> fmt::Result { + fmt.write_str("a Postgres value was `NULL`") + } +} + +impl Error for WasNull {} + +/// An error indicating that a conversion was attempted between incompatible +/// Rust and Postgres types. +#[derive(Debug)] +pub struct WrongType { + postgres: Type, + rust: &'static str, +} + +impl fmt::Display for WrongType { + fn fmt(&self, fmt: &mut fmt::Formatter<'_>) -> fmt::Result { + write!( + fmt, + "cannot convert between the Rust type `{}` and the Postgres type `{}`", + self.rust, self.postgres, + ) + } +} + +impl Error for WrongType {} + +impl WrongType { + /// Creates a new `WrongType` error. + pub fn new(ty: Type) -> WrongType { + WrongType { + postgres: ty, + rust: type_name::(), + } + } +} + +/// An error indicating that an `as_text` conversion was attempted on a binary +/// result. +#[derive(Debug)] +pub struct WrongFormat {} + +impl Error for WrongFormat {} + +impl fmt::Display for WrongFormat { + fn fmt(&self, fmt: &mut fmt::Formatter<'_>) -> fmt::Result { + write!( + fmt, + "cannot read column as text while it is in binary format" + ) + } +} + +/// A trait for types that can be created from a Postgres value. +pub trait FromSql<'a>: Sized { + /// Creates a new value of this type from a buffer of data of the specified + /// Postgres `Type` in its binary format. + /// + /// The caller of this method is responsible for ensuring that this type + /// is compatible with the Postgres `Type`. + fn from_sql(ty: &Type, raw: &'a [u8]) -> Result>; + + /// Creates a new value of this type from a `NULL` SQL value. + /// + /// The caller of this method is responsible for ensuring that this type + /// is compatible with the Postgres `Type`.
+ /// + /// The default implementation returns `Err(Box::new(WasNull))`. + #[allow(unused_variables)] + fn from_sql_null(ty: &Type) -> Result> { + Err(Box::new(WasNull)) + } + + /// A convenience function that delegates to `from_sql` and `from_sql_null` depending on the + /// value of `raw`. + fn from_sql_nullable( + ty: &Type, + raw: Option<&'a [u8]>, + ) -> Result> { + match raw { + Some(raw) => Self::from_sql(ty, raw), + None => Self::from_sql_null(ty), + } + } + + /// Determines if a value of this type can be created from the specified + /// Postgres `Type`. + fn accepts(ty: &Type) -> bool; +} + +/// A trait for types which can be created from a Postgres value without borrowing any data. +/// +/// This is primarily useful for trait bounds on functions. +pub trait FromSqlOwned: for<'a> FromSql<'a> {} + +impl FromSqlOwned for T where T: for<'a> FromSql<'a> {} + +/* The `Option` impl turns a SQL NULL into `None` and wraps every other value produced by the inner `from_sql` in `Some`; `accepts` defers to the inner type. */ +impl<'a, T: FromSql<'a>> FromSql<'a> for Option { + fn from_sql(ty: &Type, raw: &'a [u8]) -> Result, Box> { + ::from_sql(ty, raw).map(Some) + } + + fn from_sql_null(_: &Type) -> Result, Box> { + Ok(None) + } + + fn accepts(ty: &Type) -> bool { + ::accepts(ty) + } +} + +impl<'a, T: FromSql<'a>> FromSql<'a> for Vec { + fn from_sql(ty: &Type, raw: &'a [u8]) -> Result, Box> { + let member_type = match *ty.kind() { + Kind::Array(ref member) => member, + _ => panic!("expected array type"), + }; + + let array = types::array_from_sql(raw)?; + if array.dimensions().count()?
> 1 { + return Err("array contains too many dimensions".into()); + } + + array + .values() + .map(|v| T::from_sql_nullable(member_type, v)) + .collect() + } + + fn accepts(ty: &Type) -> bool { + match *ty.kind() { + Kind::Array(ref inner) => T::accepts(inner), + _ => false, + } + } +} + +impl<'a> FromSql<'a> for String { + fn from_sql(ty: &Type, raw: &'a [u8]) -> Result> { + <&str as FromSql>::from_sql(ty, raw).map(ToString::to_string) + } + + fn accepts(ty: &Type) -> bool { + <&str as FromSql>::accepts(ty) + } +} + +impl<'a> FromSql<'a> for &'a str { + fn from_sql(ty: &Type, raw: &'a [u8]) -> Result<&'a str, Box> { + match *ty { + ref ty if ty.name() == "ltree" => types::ltree_from_sql(raw), + ref ty if ty.name() == "lquery" => types::lquery_from_sql(raw), + ref ty if ty.name() == "ltxtquery" => types::ltxtquery_from_sql(raw), + _ => types::text_from_sql(raw), + } + } + + fn accepts(ty: &Type) -> bool { + match *ty { + Type::VARCHAR | Type::TEXT | Type::BPCHAR | Type::NAME | Type::UNKNOWN => true, + ref ty + if (ty.name() == "citext" + || ty.name() == "ltree" + || ty.name() == "lquery" + || ty.name() == "ltxtquery") => + { + true + } + _ => false, + } + } +} + +macro_rules! simple_from { + ($t:ty, $f:ident, $($expected:ident),+) => { + impl<'a> FromSql<'a> for $t { + fn from_sql(_: &Type, raw: &'a [u8]) -> Result<$t, Box> { + types::$f(raw) + } + + accepts!($($expected),+); + } + } +} + +simple_from!(i8, char_from_sql, CHAR); +simple_from!(u32, oid_from_sql, OID); + +/// An enum representing the nullability of a Postgres value. +pub enum IsNull { + /// The value is NULL. + Yes, + /// The value is not NULL. + No, +} + +/// A trait for types that can be converted into Postgres values. +pub trait ToSql: fmt::Debug { + /// Converts the value of `self` into the binary format of the specified + /// Postgres `Type`, appending it to `out`. + /// + /// The caller of this method is responsible for ensuring that this type + /// is compatible with the Postgres `Type`. 
+ /// + /// The return value indicates if this value should be represented as + /// `NULL`. If this is the case, implementations **must not** write + /// anything to `out`. + fn to_sql(&self, ty: &Type, out: &mut BytesMut) -> Result> + where + Self: Sized; + + /// Determines if a value of this type can be converted to the specified + /// Postgres `Type`. + fn accepts(ty: &Type) -> bool + where + Self: Sized; + + /// An adaptor method used internally by Rust-Postgres. + /// + /// *All* implementations of this method should be generated by the + /// `to_sql_checked!()` macro. + fn to_sql_checked( + &self, + ty: &Type, + out: &mut BytesMut, + ) -> Result>; + + /// Specify the encode format + fn encode_format(&self, _ty: &Type) -> Format { + Format::Binary + } +} + +/// Supported Postgres message format types +/// +/// Using Text format in a message assumes a Postgres `SERVER_ENCODING` of `UTF8` +#[derive(Clone, Copy, Debug, PartialEq)] +pub enum Format { + /// Text format (UTF-8) + Text, + /// Compact, typed binary format + Binary, +} + +impl ToSql for &str { + fn to_sql(&self, ty: &Type, w: &mut BytesMut) -> Result> { + match *ty { + ref ty if ty.name() == "ltree" => types::ltree_to_sql(self, w), + ref ty if ty.name() == "lquery" => types::lquery_to_sql(self, w), + ref ty if ty.name() == "ltxtquery" => types::ltxtquery_to_sql(self, w), + _ => types::text_to_sql(self, w), + } + Ok(IsNull::No) + } + + fn accepts(ty: &Type) -> bool { + match *ty { + Type::VARCHAR | Type::TEXT | Type::BPCHAR | Type::NAME | Type::UNKNOWN => true, + ref ty + if (ty.name() == "citext" + || ty.name() == "ltree" + || ty.name() == "lquery" + || ty.name() == "ltxtquery") => + { + true + } + _ => false, + } + } + + to_sql_checked!(); +} + +macro_rules! 
simple_to { + ($t:ty, $f:ident, $($expected:ident),+) => { + impl ToSql for $t { + fn to_sql(&self, + _: &Type, + w: &mut BytesMut) + -> Result> { + types::$f(*self, w); + Ok(IsNull::No) + } + + accepts!($($expected),+); + + to_sql_checked!(); + } + } +} + +simple_to!(u32, oid_to_sql, OID); diff --git a/libs/proxy/postgres-types2/src/private.rs b/libs/proxy/postgres-types2/src/private.rs new file mode 100644 index 0000000000..774f9a301c --- /dev/null +++ b/libs/proxy/postgres-types2/src/private.rs @@ -0,0 +1,34 @@ +use crate::{FromSql, Type}; +pub use bytes::BytesMut; +use std::error::Error; + +pub fn read_be_i32(buf: &mut &[u8]) -> Result> { + if buf.len() < 4 { + return Err("invalid buffer size".into()); + } + let mut bytes = [0; 4]; + bytes.copy_from_slice(&buf[..4]); + *buf = &buf[4..]; + Ok(i32::from_be_bytes(bytes)) +} + +pub fn read_value<'a, T>( + type_: &Type, + buf: &mut &'a [u8], +) -> Result> +where + T: FromSql<'a>, +{ + let len = read_be_i32(buf)?; + let value = if len < 0 { + None + } else { + if len as usize > buf.len() { + return Err("invalid buffer size".into()); + } + let (head, tail) = buf.split_at(len as usize); + *buf = tail; + Some(head) + }; + T::from_sql_nullable(type_, value) +} diff --git a/libs/proxy/postgres-types2/src/type_gen.rs b/libs/proxy/postgres-types2/src/type_gen.rs new file mode 100644 index 0000000000..a1bc3f85c0 --- /dev/null +++ b/libs/proxy/postgres-types2/src/type_gen.rs @@ -0,0 +1,1524 @@ +// Autogenerated file - DO NOT EDIT +use std::sync::Arc; + +use crate::{Kind, Oid, Type}; + +#[derive(PartialEq, Eq, Debug, Hash)] +pub struct Other { + pub name: String, + pub oid: Oid, + pub kind: Kind, + pub schema: String, +} + +#[derive(PartialEq, Eq, Clone, Debug, Hash)] +pub enum Inner { + Bool, + Bytea, + Char, + Name, + Int8, + Int2, + Int2Vector, + Int4, + Regproc, + Text, + Oid, + Tid, + Xid, + Cid, + OidVector, + PgDdlCommand, + Json, + Xml, + XmlArray, + PgNodeTree, + JsonArray, + TableAmHandler, + Xid8Array, + 
IndexAmHandler, + Point, + Lseg, + Path, + Box, + Polygon, + Line, + LineArray, + Cidr, + CidrArray, + Float4, + Float8, + Unknown, + Circle, + CircleArray, + Macaddr8, + Macaddr8Array, + Money, + MoneyArray, + Macaddr, + Inet, + BoolArray, + ByteaArray, + CharArray, + NameArray, + Int2Array, + Int2VectorArray, + Int4Array, + RegprocArray, + TextArray, + TidArray, + XidArray, + CidArray, + OidVectorArray, + BpcharArray, + VarcharArray, + Int8Array, + PointArray, + LsegArray, + PathArray, + BoxArray, + Float4Array, + Float8Array, + PolygonArray, + OidArray, + Aclitem, + AclitemArray, + MacaddrArray, + InetArray, + Bpchar, + Varchar, + Date, + Time, + Timestamp, + TimestampArray, + DateArray, + TimeArray, + Timestamptz, + TimestamptzArray, + Interval, + IntervalArray, + NumericArray, + CstringArray, + Timetz, + TimetzArray, + Bit, + BitArray, + Varbit, + VarbitArray, + Numeric, + Refcursor, + RefcursorArray, + Regprocedure, + Regoper, + Regoperator, + Regclass, + Regtype, + RegprocedureArray, + RegoperArray, + RegoperatorArray, + RegclassArray, + RegtypeArray, + Record, + Cstring, + Any, + Anyarray, + Void, + Trigger, + LanguageHandler, + Internal, + Anyelement, + RecordArray, + Anynonarray, + TxidSnapshotArray, + Uuid, + UuidArray, + TxidSnapshot, + FdwHandler, + PgLsn, + PgLsnArray, + TsmHandler, + PgNdistinct, + PgDependencies, + Anyenum, + TsVector, + Tsquery, + GtsVector, + TsVectorArray, + GtsVectorArray, + TsqueryArray, + Regconfig, + RegconfigArray, + Regdictionary, + RegdictionaryArray, + Jsonb, + JsonbArray, + AnyRange, + EventTrigger, + Int4Range, + Int4RangeArray, + NumRange, + NumRangeArray, + TsRange, + TsRangeArray, + TstzRange, + TstzRangeArray, + DateRange, + DateRangeArray, + Int8Range, + Int8RangeArray, + Jsonpath, + JsonpathArray, + Regnamespace, + RegnamespaceArray, + Regrole, + RegroleArray, + Regcollation, + RegcollationArray, + Int4multiRange, + NummultiRange, + TsmultiRange, + TstzmultiRange, + DatemultiRange, + Int8multiRange, + 
AnymultiRange, + AnycompatiblemultiRange, + PgBrinBloomSummary, + PgBrinMinmaxMultiSummary, + PgMcvList, + PgSnapshot, + PgSnapshotArray, + Xid8, + Anycompatible, + Anycompatiblearray, + Anycompatiblenonarray, + AnycompatibleRange, + Int4multiRangeArray, + NummultiRangeArray, + TsmultiRangeArray, + TstzmultiRangeArray, + DatemultiRangeArray, + Int8multiRangeArray, + Other(Arc), +} + +impl Inner { + pub fn from_oid(oid: Oid) -> Option { + match oid { + 16 => Some(Inner::Bool), + 17 => Some(Inner::Bytea), + 18 => Some(Inner::Char), + 19 => Some(Inner::Name), + 20 => Some(Inner::Int8), + 21 => Some(Inner::Int2), + 22 => Some(Inner::Int2Vector), + 23 => Some(Inner::Int4), + 24 => Some(Inner::Regproc), + 25 => Some(Inner::Text), + 26 => Some(Inner::Oid), + 27 => Some(Inner::Tid), + 28 => Some(Inner::Xid), + 29 => Some(Inner::Cid), + 30 => Some(Inner::OidVector), + 32 => Some(Inner::PgDdlCommand), + 114 => Some(Inner::Json), + 142 => Some(Inner::Xml), + 143 => Some(Inner::XmlArray), + 194 => Some(Inner::PgNodeTree), + 199 => Some(Inner::JsonArray), + 269 => Some(Inner::TableAmHandler), + 271 => Some(Inner::Xid8Array), + 325 => Some(Inner::IndexAmHandler), + 600 => Some(Inner::Point), + 601 => Some(Inner::Lseg), + 602 => Some(Inner::Path), + 603 => Some(Inner::Box), + 604 => Some(Inner::Polygon), + 628 => Some(Inner::Line), + 629 => Some(Inner::LineArray), + 650 => Some(Inner::Cidr), + 651 => Some(Inner::CidrArray), + 700 => Some(Inner::Float4), + 701 => Some(Inner::Float8), + 705 => Some(Inner::Unknown), + 718 => Some(Inner::Circle), + 719 => Some(Inner::CircleArray), + 774 => Some(Inner::Macaddr8), + 775 => Some(Inner::Macaddr8Array), + 790 => Some(Inner::Money), + 791 => Some(Inner::MoneyArray), + 829 => Some(Inner::Macaddr), + 869 => Some(Inner::Inet), + 1000 => Some(Inner::BoolArray), + 1001 => Some(Inner::ByteaArray), + 1002 => Some(Inner::CharArray), + 1003 => Some(Inner::NameArray), + 1005 => Some(Inner::Int2Array), + 1006 => Some(Inner::Int2VectorArray), + 1007 
=> Some(Inner::Int4Array), + 1008 => Some(Inner::RegprocArray), + 1009 => Some(Inner::TextArray), + 1010 => Some(Inner::TidArray), + 1011 => Some(Inner::XidArray), + 1012 => Some(Inner::CidArray), + 1013 => Some(Inner::OidVectorArray), + 1014 => Some(Inner::BpcharArray), + 1015 => Some(Inner::VarcharArray), + 1016 => Some(Inner::Int8Array), + 1017 => Some(Inner::PointArray), + 1018 => Some(Inner::LsegArray), + 1019 => Some(Inner::PathArray), + 1020 => Some(Inner::BoxArray), + 1021 => Some(Inner::Float4Array), + 1022 => Some(Inner::Float8Array), + 1027 => Some(Inner::PolygonArray), + 1028 => Some(Inner::OidArray), + 1033 => Some(Inner::Aclitem), + 1034 => Some(Inner::AclitemArray), + 1040 => Some(Inner::MacaddrArray), + 1041 => Some(Inner::InetArray), + 1042 => Some(Inner::Bpchar), + 1043 => Some(Inner::Varchar), + 1082 => Some(Inner::Date), + 1083 => Some(Inner::Time), + 1114 => Some(Inner::Timestamp), + 1115 => Some(Inner::TimestampArray), + 1182 => Some(Inner::DateArray), + 1183 => Some(Inner::TimeArray), + 1184 => Some(Inner::Timestamptz), + 1185 => Some(Inner::TimestamptzArray), + 1186 => Some(Inner::Interval), + 1187 => Some(Inner::IntervalArray), + 1231 => Some(Inner::NumericArray), + 1263 => Some(Inner::CstringArray), + 1266 => Some(Inner::Timetz), + 1270 => Some(Inner::TimetzArray), + 1560 => Some(Inner::Bit), + 1561 => Some(Inner::BitArray), + 1562 => Some(Inner::Varbit), + 1563 => Some(Inner::VarbitArray), + 1700 => Some(Inner::Numeric), + 1790 => Some(Inner::Refcursor), + 2201 => Some(Inner::RefcursorArray), + 2202 => Some(Inner::Regprocedure), + 2203 => Some(Inner::Regoper), + 2204 => Some(Inner::Regoperator), + 2205 => Some(Inner::Regclass), + 2206 => Some(Inner::Regtype), + 2207 => Some(Inner::RegprocedureArray), + 2208 => Some(Inner::RegoperArray), + 2209 => Some(Inner::RegoperatorArray), + 2210 => Some(Inner::RegclassArray), + 2211 => Some(Inner::RegtypeArray), + 2249 => Some(Inner::Record), + 2275 => Some(Inner::Cstring), + 2276 => 
Some(Inner::Any), + 2277 => Some(Inner::Anyarray), + 2278 => Some(Inner::Void), + 2279 => Some(Inner::Trigger), + 2280 => Some(Inner::LanguageHandler), + 2281 => Some(Inner::Internal), + 2283 => Some(Inner::Anyelement), + 2287 => Some(Inner::RecordArray), + 2776 => Some(Inner::Anynonarray), + 2949 => Some(Inner::TxidSnapshotArray), + 2950 => Some(Inner::Uuid), + 2951 => Some(Inner::UuidArray), + 2970 => Some(Inner::TxidSnapshot), + 3115 => Some(Inner::FdwHandler), + 3220 => Some(Inner::PgLsn), + 3221 => Some(Inner::PgLsnArray), + 3310 => Some(Inner::TsmHandler), + 3361 => Some(Inner::PgNdistinct), + 3402 => Some(Inner::PgDependencies), + 3500 => Some(Inner::Anyenum), + 3614 => Some(Inner::TsVector), + 3615 => Some(Inner::Tsquery), + 3642 => Some(Inner::GtsVector), + 3643 => Some(Inner::TsVectorArray), + 3644 => Some(Inner::GtsVectorArray), + 3645 => Some(Inner::TsqueryArray), + 3734 => Some(Inner::Regconfig), + 3735 => Some(Inner::RegconfigArray), + 3769 => Some(Inner::Regdictionary), + 3770 => Some(Inner::RegdictionaryArray), + 3802 => Some(Inner::Jsonb), + 3807 => Some(Inner::JsonbArray), + 3831 => Some(Inner::AnyRange), + 3838 => Some(Inner::EventTrigger), + 3904 => Some(Inner::Int4Range), + 3905 => Some(Inner::Int4RangeArray), + 3906 => Some(Inner::NumRange), + 3907 => Some(Inner::NumRangeArray), + 3908 => Some(Inner::TsRange), + 3909 => Some(Inner::TsRangeArray), + 3910 => Some(Inner::TstzRange), + 3911 => Some(Inner::TstzRangeArray), + 3912 => Some(Inner::DateRange), + 3913 => Some(Inner::DateRangeArray), + 3926 => Some(Inner::Int8Range), + 3927 => Some(Inner::Int8RangeArray), + 4072 => Some(Inner::Jsonpath), + 4073 => Some(Inner::JsonpathArray), + 4089 => Some(Inner::Regnamespace), + 4090 => Some(Inner::RegnamespaceArray), + 4096 => Some(Inner::Regrole), + 4097 => Some(Inner::RegroleArray), + 4191 => Some(Inner::Regcollation), + 4192 => Some(Inner::RegcollationArray), + 4451 => Some(Inner::Int4multiRange), + 4532 => Some(Inner::NummultiRange), + 4533 => 
Some(Inner::TsmultiRange), + 4534 => Some(Inner::TstzmultiRange), + 4535 => Some(Inner::DatemultiRange), + 4536 => Some(Inner::Int8multiRange), + 4537 => Some(Inner::AnymultiRange), + 4538 => Some(Inner::AnycompatiblemultiRange), + 4600 => Some(Inner::PgBrinBloomSummary), + 4601 => Some(Inner::PgBrinMinmaxMultiSummary), + 5017 => Some(Inner::PgMcvList), + 5038 => Some(Inner::PgSnapshot), + 5039 => Some(Inner::PgSnapshotArray), + 5069 => Some(Inner::Xid8), + 5077 => Some(Inner::Anycompatible), + 5078 => Some(Inner::Anycompatiblearray), + 5079 => Some(Inner::Anycompatiblenonarray), + 5080 => Some(Inner::AnycompatibleRange), + 6150 => Some(Inner::Int4multiRangeArray), + 6151 => Some(Inner::NummultiRangeArray), + 6152 => Some(Inner::TsmultiRangeArray), + 6153 => Some(Inner::TstzmultiRangeArray), + 6155 => Some(Inner::DatemultiRangeArray), + 6157 => Some(Inner::Int8multiRangeArray), + _ => None, + } + } + + pub fn oid(&self) -> Oid { + match *self { + Inner::Bool => 16, + Inner::Bytea => 17, + Inner::Char => 18, + Inner::Name => 19, + Inner::Int8 => 20, + Inner::Int2 => 21, + Inner::Int2Vector => 22, + Inner::Int4 => 23, + Inner::Regproc => 24, + Inner::Text => 25, + Inner::Oid => 26, + Inner::Tid => 27, + Inner::Xid => 28, + Inner::Cid => 29, + Inner::OidVector => 30, + Inner::PgDdlCommand => 32, + Inner::Json => 114, + Inner::Xml => 142, + Inner::XmlArray => 143, + Inner::PgNodeTree => 194, + Inner::JsonArray => 199, + Inner::TableAmHandler => 269, + Inner::Xid8Array => 271, + Inner::IndexAmHandler => 325, + Inner::Point => 600, + Inner::Lseg => 601, + Inner::Path => 602, + Inner::Box => 603, + Inner::Polygon => 604, + Inner::Line => 628, + Inner::LineArray => 629, + Inner::Cidr => 650, + Inner::CidrArray => 651, + Inner::Float4 => 700, + Inner::Float8 => 701, + Inner::Unknown => 705, + Inner::Circle => 718, + Inner::CircleArray => 719, + Inner::Macaddr8 => 774, + Inner::Macaddr8Array => 775, + Inner::Money => 790, + Inner::MoneyArray => 791, + Inner::Macaddr => 829, 
+ Inner::Inet => 869, + Inner::BoolArray => 1000, + Inner::ByteaArray => 1001, + Inner::CharArray => 1002, + Inner::NameArray => 1003, + Inner::Int2Array => 1005, + Inner::Int2VectorArray => 1006, + Inner::Int4Array => 1007, + Inner::RegprocArray => 1008, + Inner::TextArray => 1009, + Inner::TidArray => 1010, + Inner::XidArray => 1011, + Inner::CidArray => 1012, + Inner::OidVectorArray => 1013, + Inner::BpcharArray => 1014, + Inner::VarcharArray => 1015, + Inner::Int8Array => 1016, + Inner::PointArray => 1017, + Inner::LsegArray => 1018, + Inner::PathArray => 1019, + Inner::BoxArray => 1020, + Inner::Float4Array => 1021, + Inner::Float8Array => 1022, + Inner::PolygonArray => 1027, + Inner::OidArray => 1028, + Inner::Aclitem => 1033, + Inner::AclitemArray => 1034, + Inner::MacaddrArray => 1040, + Inner::InetArray => 1041, + Inner::Bpchar => 1042, + Inner::Varchar => 1043, + Inner::Date => 1082, + Inner::Time => 1083, + Inner::Timestamp => 1114, + Inner::TimestampArray => 1115, + Inner::DateArray => 1182, + Inner::TimeArray => 1183, + Inner::Timestamptz => 1184, + Inner::TimestamptzArray => 1185, + Inner::Interval => 1186, + Inner::IntervalArray => 1187, + Inner::NumericArray => 1231, + Inner::CstringArray => 1263, + Inner::Timetz => 1266, + Inner::TimetzArray => 1270, + Inner::Bit => 1560, + Inner::BitArray => 1561, + Inner::Varbit => 1562, + Inner::VarbitArray => 1563, + Inner::Numeric => 1700, + Inner::Refcursor => 1790, + Inner::RefcursorArray => 2201, + Inner::Regprocedure => 2202, + Inner::Regoper => 2203, + Inner::Regoperator => 2204, + Inner::Regclass => 2205, + Inner::Regtype => 2206, + Inner::RegprocedureArray => 2207, + Inner::RegoperArray => 2208, + Inner::RegoperatorArray => 2209, + Inner::RegclassArray => 2210, + Inner::RegtypeArray => 2211, + Inner::Record => 2249, + Inner::Cstring => 2275, + Inner::Any => 2276, + Inner::Anyarray => 2277, + Inner::Void => 2278, + Inner::Trigger => 2279, + Inner::LanguageHandler => 2280, + Inner::Internal => 2281, + 
Inner::Anyelement => 2283, + Inner::RecordArray => 2287, + Inner::Anynonarray => 2776, + Inner::TxidSnapshotArray => 2949, + Inner::Uuid => 2950, + Inner::UuidArray => 2951, + Inner::TxidSnapshot => 2970, + Inner::FdwHandler => 3115, + Inner::PgLsn => 3220, + Inner::PgLsnArray => 3221, + Inner::TsmHandler => 3310, + Inner::PgNdistinct => 3361, + Inner::PgDependencies => 3402, + Inner::Anyenum => 3500, + Inner::TsVector => 3614, + Inner::Tsquery => 3615, + Inner::GtsVector => 3642, + Inner::TsVectorArray => 3643, + Inner::GtsVectorArray => 3644, + Inner::TsqueryArray => 3645, + Inner::Regconfig => 3734, + Inner::RegconfigArray => 3735, + Inner::Regdictionary => 3769, + Inner::RegdictionaryArray => 3770, + Inner::Jsonb => 3802, + Inner::JsonbArray => 3807, + Inner::AnyRange => 3831, + Inner::EventTrigger => 3838, + Inner::Int4Range => 3904, + Inner::Int4RangeArray => 3905, + Inner::NumRange => 3906, + Inner::NumRangeArray => 3907, + Inner::TsRange => 3908, + Inner::TsRangeArray => 3909, + Inner::TstzRange => 3910, + Inner::TstzRangeArray => 3911, + Inner::DateRange => 3912, + Inner::DateRangeArray => 3913, + Inner::Int8Range => 3926, + Inner::Int8RangeArray => 3927, + Inner::Jsonpath => 4072, + Inner::JsonpathArray => 4073, + Inner::Regnamespace => 4089, + Inner::RegnamespaceArray => 4090, + Inner::Regrole => 4096, + Inner::RegroleArray => 4097, + Inner::Regcollation => 4191, + Inner::RegcollationArray => 4192, + Inner::Int4multiRange => 4451, + Inner::NummultiRange => 4532, + Inner::TsmultiRange => 4533, + Inner::TstzmultiRange => 4534, + Inner::DatemultiRange => 4535, + Inner::Int8multiRange => 4536, + Inner::AnymultiRange => 4537, + Inner::AnycompatiblemultiRange => 4538, + Inner::PgBrinBloomSummary => 4600, + Inner::PgBrinMinmaxMultiSummary => 4601, + Inner::PgMcvList => 5017, + Inner::PgSnapshot => 5038, + Inner::PgSnapshotArray => 5039, + Inner::Xid8 => 5069, + Inner::Anycompatible => 5077, + Inner::Anycompatiblearray => 5078, + Inner::Anycompatiblenonarray => 
5079, + Inner::AnycompatibleRange => 5080, + Inner::Int4multiRangeArray => 6150, + Inner::NummultiRangeArray => 6151, + Inner::TsmultiRangeArray => 6152, + Inner::TstzmultiRangeArray => 6153, + Inner::DatemultiRangeArray => 6155, + Inner::Int8multiRangeArray => 6157, + Inner::Other(ref u) => u.oid, + } + } + + pub fn kind(&self) -> &Kind { + match *self { + Inner::Bool => &Kind::Simple, + Inner::Bytea => &Kind::Simple, + Inner::Char => &Kind::Simple, + Inner::Name => &Kind::Simple, + Inner::Int8 => &Kind::Simple, + Inner::Int2 => &Kind::Simple, + Inner::Int2Vector => &Kind::Array(Type(Inner::Int2)), + Inner::Int4 => &Kind::Simple, + Inner::Regproc => &Kind::Simple, + Inner::Text => &Kind::Simple, + Inner::Oid => &Kind::Simple, + Inner::Tid => &Kind::Simple, + Inner::Xid => &Kind::Simple, + Inner::Cid => &Kind::Simple, + Inner::OidVector => &Kind::Array(Type(Inner::Oid)), + Inner::PgDdlCommand => &Kind::Pseudo, + Inner::Json => &Kind::Simple, + Inner::Xml => &Kind::Simple, + Inner::XmlArray => &Kind::Array(Type(Inner::Xml)), + Inner::PgNodeTree => &Kind::Simple, + Inner::JsonArray => &Kind::Array(Type(Inner::Json)), + Inner::TableAmHandler => &Kind::Pseudo, + Inner::Xid8Array => &Kind::Array(Type(Inner::Xid8)), + Inner::IndexAmHandler => &Kind::Pseudo, + Inner::Point => &Kind::Simple, + Inner::Lseg => &Kind::Simple, + Inner::Path => &Kind::Simple, + Inner::Box => &Kind::Simple, + Inner::Polygon => &Kind::Simple, + Inner::Line => &Kind::Simple, + Inner::LineArray => &Kind::Array(Type(Inner::Line)), + Inner::Cidr => &Kind::Simple, + Inner::CidrArray => &Kind::Array(Type(Inner::Cidr)), + Inner::Float4 => &Kind::Simple, + Inner::Float8 => &Kind::Simple, + Inner::Unknown => &Kind::Simple, + Inner::Circle => &Kind::Simple, + Inner::CircleArray => &Kind::Array(Type(Inner::Circle)), + Inner::Macaddr8 => &Kind::Simple, + Inner::Macaddr8Array => &Kind::Array(Type(Inner::Macaddr8)), + Inner::Money => &Kind::Simple, + Inner::MoneyArray => &Kind::Array(Type(Inner::Money)), + 
Inner::Macaddr => &Kind::Simple, + Inner::Inet => &Kind::Simple, + Inner::BoolArray => &Kind::Array(Type(Inner::Bool)), + Inner::ByteaArray => &Kind::Array(Type(Inner::Bytea)), + Inner::CharArray => &Kind::Array(Type(Inner::Char)), + Inner::NameArray => &Kind::Array(Type(Inner::Name)), + Inner::Int2Array => &Kind::Array(Type(Inner::Int2)), + Inner::Int2VectorArray => &Kind::Array(Type(Inner::Int2Vector)), + Inner::Int4Array => &Kind::Array(Type(Inner::Int4)), + Inner::RegprocArray => &Kind::Array(Type(Inner::Regproc)), + Inner::TextArray => &Kind::Array(Type(Inner::Text)), + Inner::TidArray => &Kind::Array(Type(Inner::Tid)), + Inner::XidArray => &Kind::Array(Type(Inner::Xid)), + Inner::CidArray => &Kind::Array(Type(Inner::Cid)), + Inner::OidVectorArray => &Kind::Array(Type(Inner::OidVector)), + Inner::BpcharArray => &Kind::Array(Type(Inner::Bpchar)), + Inner::VarcharArray => &Kind::Array(Type(Inner::Varchar)), + Inner::Int8Array => &Kind::Array(Type(Inner::Int8)), + Inner::PointArray => &Kind::Array(Type(Inner::Point)), + Inner::LsegArray => &Kind::Array(Type(Inner::Lseg)), + Inner::PathArray => &Kind::Array(Type(Inner::Path)), + Inner::BoxArray => &Kind::Array(Type(Inner::Box)), + Inner::Float4Array => &Kind::Array(Type(Inner::Float4)), + Inner::Float8Array => &Kind::Array(Type(Inner::Float8)), + Inner::PolygonArray => &Kind::Array(Type(Inner::Polygon)), + Inner::OidArray => &Kind::Array(Type(Inner::Oid)), + Inner::Aclitem => &Kind::Simple, + Inner::AclitemArray => &Kind::Array(Type(Inner::Aclitem)), + Inner::MacaddrArray => &Kind::Array(Type(Inner::Macaddr)), + Inner::InetArray => &Kind::Array(Type(Inner::Inet)), + Inner::Bpchar => &Kind::Simple, + Inner::Varchar => &Kind::Simple, + Inner::Date => &Kind::Simple, + Inner::Time => &Kind::Simple, + Inner::Timestamp => &Kind::Simple, + Inner::TimestampArray => &Kind::Array(Type(Inner::Timestamp)), + Inner::DateArray => &Kind::Array(Type(Inner::Date)), + Inner::TimeArray => &Kind::Array(Type(Inner::Time)), + 
Inner::Timestamptz => &Kind::Simple, + Inner::TimestamptzArray => &Kind::Array(Type(Inner::Timestamptz)), + Inner::Interval => &Kind::Simple, + Inner::IntervalArray => &Kind::Array(Type(Inner::Interval)), + Inner::NumericArray => &Kind::Array(Type(Inner::Numeric)), + Inner::CstringArray => &Kind::Array(Type(Inner::Cstring)), + Inner::Timetz => &Kind::Simple, + Inner::TimetzArray => &Kind::Array(Type(Inner::Timetz)), + Inner::Bit => &Kind::Simple, + Inner::BitArray => &Kind::Array(Type(Inner::Bit)), + Inner::Varbit => &Kind::Simple, + Inner::VarbitArray => &Kind::Array(Type(Inner::Varbit)), + Inner::Numeric => &Kind::Simple, + Inner::Refcursor => &Kind::Simple, + Inner::RefcursorArray => &Kind::Array(Type(Inner::Refcursor)), + Inner::Regprocedure => &Kind::Simple, + Inner::Regoper => &Kind::Simple, + Inner::Regoperator => &Kind::Simple, + Inner::Regclass => &Kind::Simple, + Inner::Regtype => &Kind::Simple, + Inner::RegprocedureArray => &Kind::Array(Type(Inner::Regprocedure)), + Inner::RegoperArray => &Kind::Array(Type(Inner::Regoper)), + Inner::RegoperatorArray => &Kind::Array(Type(Inner::Regoperator)), + Inner::RegclassArray => &Kind::Array(Type(Inner::Regclass)), + Inner::RegtypeArray => &Kind::Array(Type(Inner::Regtype)), + Inner::Record => &Kind::Pseudo, + Inner::Cstring => &Kind::Pseudo, + Inner::Any => &Kind::Pseudo, + Inner::Anyarray => &Kind::Pseudo, + Inner::Void => &Kind::Pseudo, + Inner::Trigger => &Kind::Pseudo, + Inner::LanguageHandler => &Kind::Pseudo, + Inner::Internal => &Kind::Pseudo, + Inner::Anyelement => &Kind::Pseudo, + Inner::RecordArray => &Kind::Pseudo, + Inner::Anynonarray => &Kind::Pseudo, + Inner::TxidSnapshotArray => &Kind::Array(Type(Inner::TxidSnapshot)), + Inner::Uuid => &Kind::Simple, + Inner::UuidArray => &Kind::Array(Type(Inner::Uuid)), + Inner::TxidSnapshot => &Kind::Simple, + Inner::FdwHandler => &Kind::Pseudo, + Inner::PgLsn => &Kind::Simple, + Inner::PgLsnArray => &Kind::Array(Type(Inner::PgLsn)), + Inner::TsmHandler => 
&Kind::Pseudo, + Inner::PgNdistinct => &Kind::Simple, + Inner::PgDependencies => &Kind::Simple, + Inner::Anyenum => &Kind::Pseudo, + Inner::TsVector => &Kind::Simple, + Inner::Tsquery => &Kind::Simple, + Inner::GtsVector => &Kind::Simple, + Inner::TsVectorArray => &Kind::Array(Type(Inner::TsVector)), + Inner::GtsVectorArray => &Kind::Array(Type(Inner::GtsVector)), + Inner::TsqueryArray => &Kind::Array(Type(Inner::Tsquery)), + Inner::Regconfig => &Kind::Simple, + Inner::RegconfigArray => &Kind::Array(Type(Inner::Regconfig)), + Inner::Regdictionary => &Kind::Simple, + Inner::RegdictionaryArray => &Kind::Array(Type(Inner::Regdictionary)), + Inner::Jsonb => &Kind::Simple, + Inner::JsonbArray => &Kind::Array(Type(Inner::Jsonb)), + Inner::AnyRange => &Kind::Pseudo, + Inner::EventTrigger => &Kind::Pseudo, + Inner::Int4Range => &Kind::Range(Type(Inner::Int4)), + Inner::Int4RangeArray => &Kind::Array(Type(Inner::Int4Range)), + Inner::NumRange => &Kind::Range(Type(Inner::Numeric)), + Inner::NumRangeArray => &Kind::Array(Type(Inner::NumRange)), + Inner::TsRange => &Kind::Range(Type(Inner::Timestamp)), + Inner::TsRangeArray => &Kind::Array(Type(Inner::TsRange)), + Inner::TstzRange => &Kind::Range(Type(Inner::Timestamptz)), + Inner::TstzRangeArray => &Kind::Array(Type(Inner::TstzRange)), + Inner::DateRange => &Kind::Range(Type(Inner::Date)), + Inner::DateRangeArray => &Kind::Array(Type(Inner::DateRange)), + Inner::Int8Range => &Kind::Range(Type(Inner::Int8)), + Inner::Int8RangeArray => &Kind::Array(Type(Inner::Int8Range)), + Inner::Jsonpath => &Kind::Simple, + Inner::JsonpathArray => &Kind::Array(Type(Inner::Jsonpath)), + Inner::Regnamespace => &Kind::Simple, + Inner::RegnamespaceArray => &Kind::Array(Type(Inner::Regnamespace)), + Inner::Regrole => &Kind::Simple, + Inner::RegroleArray => &Kind::Array(Type(Inner::Regrole)), + Inner::Regcollation => &Kind::Simple, + Inner::RegcollationArray => &Kind::Array(Type(Inner::Regcollation)), + Inner::Int4multiRange => 
&Kind::Multirange(Type(Inner::Int4)), + Inner::NummultiRange => &Kind::Multirange(Type(Inner::Numeric)), + Inner::TsmultiRange => &Kind::Multirange(Type(Inner::Timestamp)), + Inner::TstzmultiRange => &Kind::Multirange(Type(Inner::Timestamptz)), + Inner::DatemultiRange => &Kind::Multirange(Type(Inner::Date)), + Inner::Int8multiRange => &Kind::Multirange(Type(Inner::Int8)), + Inner::AnymultiRange => &Kind::Pseudo, + Inner::AnycompatiblemultiRange => &Kind::Pseudo, + Inner::PgBrinBloomSummary => &Kind::Simple, + Inner::PgBrinMinmaxMultiSummary => &Kind::Simple, + Inner::PgMcvList => &Kind::Simple, + Inner::PgSnapshot => &Kind::Simple, + Inner::PgSnapshotArray => &Kind::Array(Type(Inner::PgSnapshot)), + Inner::Xid8 => &Kind::Simple, + Inner::Anycompatible => &Kind::Pseudo, + Inner::Anycompatiblearray => &Kind::Pseudo, + Inner::Anycompatiblenonarray => &Kind::Pseudo, + Inner::AnycompatibleRange => &Kind::Pseudo, + Inner::Int4multiRangeArray => &Kind::Array(Type(Inner::Int4multiRange)), + Inner::NummultiRangeArray => &Kind::Array(Type(Inner::NummultiRange)), + Inner::TsmultiRangeArray => &Kind::Array(Type(Inner::TsmultiRange)), + Inner::TstzmultiRangeArray => &Kind::Array(Type(Inner::TstzmultiRange)), + Inner::DatemultiRangeArray => &Kind::Array(Type(Inner::DatemultiRange)), + Inner::Int8multiRangeArray => &Kind::Array(Type(Inner::Int8multiRange)), + Inner::Other(ref u) => &u.kind, + } + } + + pub fn name(&self) -> &str { + match *self { + Inner::Bool => "bool", + Inner::Bytea => "bytea", + Inner::Char => "char", + Inner::Name => "name", + Inner::Int8 => "int8", + Inner::Int2 => "int2", + Inner::Int2Vector => "int2vector", + Inner::Int4 => "int4", + Inner::Regproc => "regproc", + Inner::Text => "text", + Inner::Oid => "oid", + Inner::Tid => "tid", + Inner::Xid => "xid", + Inner::Cid => "cid", + Inner::OidVector => "oidvector", + Inner::PgDdlCommand => "pg_ddl_command", + Inner::Json => "json", + Inner::Xml => "xml", + Inner::XmlArray => "_xml", + Inner::PgNodeTree => 
"pg_node_tree", + Inner::JsonArray => "_json", + Inner::TableAmHandler => "table_am_handler", + Inner::Xid8Array => "_xid8", + Inner::IndexAmHandler => "index_am_handler", + Inner::Point => "point", + Inner::Lseg => "lseg", + Inner::Path => "path", + Inner::Box => "box", + Inner::Polygon => "polygon", + Inner::Line => "line", + Inner::LineArray => "_line", + Inner::Cidr => "cidr", + Inner::CidrArray => "_cidr", + Inner::Float4 => "float4", + Inner::Float8 => "float8", + Inner::Unknown => "unknown", + Inner::Circle => "circle", + Inner::CircleArray => "_circle", + Inner::Macaddr8 => "macaddr8", + Inner::Macaddr8Array => "_macaddr8", + Inner::Money => "money", + Inner::MoneyArray => "_money", + Inner::Macaddr => "macaddr", + Inner::Inet => "inet", + Inner::BoolArray => "_bool", + Inner::ByteaArray => "_bytea", + Inner::CharArray => "_char", + Inner::NameArray => "_name", + Inner::Int2Array => "_int2", + Inner::Int2VectorArray => "_int2vector", + Inner::Int4Array => "_int4", + Inner::RegprocArray => "_regproc", + Inner::TextArray => "_text", + Inner::TidArray => "_tid", + Inner::XidArray => "_xid", + Inner::CidArray => "_cid", + Inner::OidVectorArray => "_oidvector", + Inner::BpcharArray => "_bpchar", + Inner::VarcharArray => "_varchar", + Inner::Int8Array => "_int8", + Inner::PointArray => "_point", + Inner::LsegArray => "_lseg", + Inner::PathArray => "_path", + Inner::BoxArray => "_box", + Inner::Float4Array => "_float4", + Inner::Float8Array => "_float8", + Inner::PolygonArray => "_polygon", + Inner::OidArray => "_oid", + Inner::Aclitem => "aclitem", + Inner::AclitemArray => "_aclitem", + Inner::MacaddrArray => "_macaddr", + Inner::InetArray => "_inet", + Inner::Bpchar => "bpchar", + Inner::Varchar => "varchar", + Inner::Date => "date", + Inner::Time => "time", + Inner::Timestamp => "timestamp", + Inner::TimestampArray => "_timestamp", + Inner::DateArray => "_date", + Inner::TimeArray => "_time", + Inner::Timestamptz => "timestamptz", + Inner::TimestamptzArray => 
"_timestamptz", + Inner::Interval => "interval", + Inner::IntervalArray => "_interval", + Inner::NumericArray => "_numeric", + Inner::CstringArray => "_cstring", + Inner::Timetz => "timetz", + Inner::TimetzArray => "_timetz", + Inner::Bit => "bit", + Inner::BitArray => "_bit", + Inner::Varbit => "varbit", + Inner::VarbitArray => "_varbit", + Inner::Numeric => "numeric", + Inner::Refcursor => "refcursor", + Inner::RefcursorArray => "_refcursor", + Inner::Regprocedure => "regprocedure", + Inner::Regoper => "regoper", + Inner::Regoperator => "regoperator", + Inner::Regclass => "regclass", + Inner::Regtype => "regtype", + Inner::RegprocedureArray => "_regprocedure", + Inner::RegoperArray => "_regoper", + Inner::RegoperatorArray => "_regoperator", + Inner::RegclassArray => "_regclass", + Inner::RegtypeArray => "_regtype", + Inner::Record => "record", + Inner::Cstring => "cstring", + Inner::Any => "any", + Inner::Anyarray => "anyarray", + Inner::Void => "void", + Inner::Trigger => "trigger", + Inner::LanguageHandler => "language_handler", + Inner::Internal => "internal", + Inner::Anyelement => "anyelement", + Inner::RecordArray => "_record", + Inner::Anynonarray => "anynonarray", + Inner::TxidSnapshotArray => "_txid_snapshot", + Inner::Uuid => "uuid", + Inner::UuidArray => "_uuid", + Inner::TxidSnapshot => "txid_snapshot", + Inner::FdwHandler => "fdw_handler", + Inner::PgLsn => "pg_lsn", + Inner::PgLsnArray => "_pg_lsn", + Inner::TsmHandler => "tsm_handler", + Inner::PgNdistinct => "pg_ndistinct", + Inner::PgDependencies => "pg_dependencies", + Inner::Anyenum => "anyenum", + Inner::TsVector => "tsvector", + Inner::Tsquery => "tsquery", + Inner::GtsVector => "gtsvector", + Inner::TsVectorArray => "_tsvector", + Inner::GtsVectorArray => "_gtsvector", + Inner::TsqueryArray => "_tsquery", + Inner::Regconfig => "regconfig", + Inner::RegconfigArray => "_regconfig", + Inner::Regdictionary => "regdictionary", + Inner::RegdictionaryArray => "_regdictionary", + Inner::Jsonb => 
"jsonb", + Inner::JsonbArray => "_jsonb", + Inner::AnyRange => "anyrange", + Inner::EventTrigger => "event_trigger", + Inner::Int4Range => "int4range", + Inner::Int4RangeArray => "_int4range", + Inner::NumRange => "numrange", + Inner::NumRangeArray => "_numrange", + Inner::TsRange => "tsrange", + Inner::TsRangeArray => "_tsrange", + Inner::TstzRange => "tstzrange", + Inner::TstzRangeArray => "_tstzrange", + Inner::DateRange => "daterange", + Inner::DateRangeArray => "_daterange", + Inner::Int8Range => "int8range", + Inner::Int8RangeArray => "_int8range", + Inner::Jsonpath => "jsonpath", + Inner::JsonpathArray => "_jsonpath", + Inner::Regnamespace => "regnamespace", + Inner::RegnamespaceArray => "_regnamespace", + Inner::Regrole => "regrole", + Inner::RegroleArray => "_regrole", + Inner::Regcollation => "regcollation", + Inner::RegcollationArray => "_regcollation", + Inner::Int4multiRange => "int4multirange", + Inner::NummultiRange => "nummultirange", + Inner::TsmultiRange => "tsmultirange", + Inner::TstzmultiRange => "tstzmultirange", + Inner::DatemultiRange => "datemultirange", + Inner::Int8multiRange => "int8multirange", + Inner::AnymultiRange => "anymultirange", + Inner::AnycompatiblemultiRange => "anycompatiblemultirange", + Inner::PgBrinBloomSummary => "pg_brin_bloom_summary", + Inner::PgBrinMinmaxMultiSummary => "pg_brin_minmax_multi_summary", + Inner::PgMcvList => "pg_mcv_list", + Inner::PgSnapshot => "pg_snapshot", + Inner::PgSnapshotArray => "_pg_snapshot", + Inner::Xid8 => "xid8", + Inner::Anycompatible => "anycompatible", + Inner::Anycompatiblearray => "anycompatiblearray", + Inner::Anycompatiblenonarray => "anycompatiblenonarray", + Inner::AnycompatibleRange => "anycompatiblerange", + Inner::Int4multiRangeArray => "_int4multirange", + Inner::NummultiRangeArray => "_nummultirange", + Inner::TsmultiRangeArray => "_tsmultirange", + Inner::TstzmultiRangeArray => "_tstzmultirange", + Inner::DatemultiRangeArray => "_datemultirange", + 
Inner::Int8multiRangeArray => "_int8multirange", + Inner::Other(ref u) => &u.name, + } + } +} +impl Type { + /// BOOL - boolean, 'true'/'false' + pub const BOOL: Type = Type(Inner::Bool); + + /// BYTEA - variable-length string, binary values escaped + pub const BYTEA: Type = Type(Inner::Bytea); + + /// CHAR - single character + pub const CHAR: Type = Type(Inner::Char); + + /// NAME - 63-byte type for storing system identifiers + pub const NAME: Type = Type(Inner::Name); + + /// INT8 - ~18 digit integer, 8-byte storage + pub const INT8: Type = Type(Inner::Int8); + + /// INT2 - -32 thousand to 32 thousand, 2-byte storage + pub const INT2: Type = Type(Inner::Int2); + + /// INT2VECTOR - array of int2, used in system tables + pub const INT2_VECTOR: Type = Type(Inner::Int2Vector); + + /// INT4 - -2 billion to 2 billion integer, 4-byte storage + pub const INT4: Type = Type(Inner::Int4); + + /// REGPROC - registered procedure + pub const REGPROC: Type = Type(Inner::Regproc); + + /// TEXT - variable-length string, no limit specified + pub const TEXT: Type = Type(Inner::Text); + + /// OID - object identifier(oid), maximum 4 billion + pub const OID: Type = Type(Inner::Oid); + + /// TID - (block, offset), physical location of tuple + pub const TID: Type = Type(Inner::Tid); + + /// XID - transaction id + pub const XID: Type = Type(Inner::Xid); + + /// CID - command identifier type, sequence in transaction id + pub const CID: Type = Type(Inner::Cid); + + /// OIDVECTOR - array of oids, used in system tables + pub const OID_VECTOR: Type = Type(Inner::OidVector); + + /// PG_DDL_COMMAND - internal type for passing CollectedCommand + pub const PG_DDL_COMMAND: Type = Type(Inner::PgDdlCommand); + + /// JSON - JSON stored as text + pub const JSON: Type = Type(Inner::Json); + + /// XML - XML content + pub const XML: Type = Type(Inner::Xml); + + /// XML[] + pub const XML_ARRAY: Type = Type(Inner::XmlArray); + + /// PG_NODE_TREE - string representing an internal node tree + pub const 
PG_NODE_TREE: Type = Type(Inner::PgNodeTree); + + /// JSON[] + pub const JSON_ARRAY: Type = Type(Inner::JsonArray); + + /// TABLE_AM_HANDLER + pub const TABLE_AM_HANDLER: Type = Type(Inner::TableAmHandler); + + /// XID8[] + pub const XID8_ARRAY: Type = Type(Inner::Xid8Array); + + /// INDEX_AM_HANDLER - pseudo-type for the result of an index AM handler function + pub const INDEX_AM_HANDLER: Type = Type(Inner::IndexAmHandler); + + /// POINT - geometric point '(x, y)' + pub const POINT: Type = Type(Inner::Point); + + /// LSEG - geometric line segment '(pt1,pt2)' + pub const LSEG: Type = Type(Inner::Lseg); + + /// PATH - geometric path '(pt1,...)' + pub const PATH: Type = Type(Inner::Path); + + /// BOX - geometric box '(lower left,upper right)' + pub const BOX: Type = Type(Inner::Box); + + /// POLYGON - geometric polygon '(pt1,...)' + pub const POLYGON: Type = Type(Inner::Polygon); + + /// LINE - geometric line + pub const LINE: Type = Type(Inner::Line); + + /// LINE[] + pub const LINE_ARRAY: Type = Type(Inner::LineArray); + + /// CIDR - network IP address/netmask, network address + pub const CIDR: Type = Type(Inner::Cidr); + + /// CIDR[] + pub const CIDR_ARRAY: Type = Type(Inner::CidrArray); + + /// FLOAT4 - single-precision floating point number, 4-byte storage + pub const FLOAT4: Type = Type(Inner::Float4); + + /// FLOAT8 - double-precision floating point number, 8-byte storage + pub const FLOAT8: Type = Type(Inner::Float8); + + /// UNKNOWN - pseudo-type representing an undetermined type + pub const UNKNOWN: Type = Type(Inner::Unknown); + + /// CIRCLE - geometric circle '(center,radius)' + pub const CIRCLE: Type = Type(Inner::Circle); + + /// CIRCLE[] + pub const CIRCLE_ARRAY: Type = Type(Inner::CircleArray); + + /// MACADDR8 - XX:XX:XX:XX:XX:XX:XX:XX, MAC address + pub const MACADDR8: Type = Type(Inner::Macaddr8); + + /// MACADDR8[] + pub const MACADDR8_ARRAY: Type = Type(Inner::Macaddr8Array); + + /// MONEY - monetary amounts, $d,ddd.cc + pub const MONEY: Type = 
Type(Inner::Money); + + /// MONEY[] + pub const MONEY_ARRAY: Type = Type(Inner::MoneyArray); + + /// MACADDR - XX:XX:XX:XX:XX:XX, MAC address + pub const MACADDR: Type = Type(Inner::Macaddr); + + /// INET - IP address/netmask, host address, netmask optional + pub const INET: Type = Type(Inner::Inet); + + /// BOOL[] + pub const BOOL_ARRAY: Type = Type(Inner::BoolArray); + + /// BYTEA[] + pub const BYTEA_ARRAY: Type = Type(Inner::ByteaArray); + + /// CHAR[] + pub const CHAR_ARRAY: Type = Type(Inner::CharArray); + + /// NAME[] + pub const NAME_ARRAY: Type = Type(Inner::NameArray); + + /// INT2[] + pub const INT2_ARRAY: Type = Type(Inner::Int2Array); + + /// INT2VECTOR[] + pub const INT2_VECTOR_ARRAY: Type = Type(Inner::Int2VectorArray); + + /// INT4[] + pub const INT4_ARRAY: Type = Type(Inner::Int4Array); + + /// REGPROC[] + pub const REGPROC_ARRAY: Type = Type(Inner::RegprocArray); + + /// TEXT[] + pub const TEXT_ARRAY: Type = Type(Inner::TextArray); + + /// TID[] + pub const TID_ARRAY: Type = Type(Inner::TidArray); + + /// XID[] + pub const XID_ARRAY: Type = Type(Inner::XidArray); + + /// CID[] + pub const CID_ARRAY: Type = Type(Inner::CidArray); + + /// OIDVECTOR[] + pub const OID_VECTOR_ARRAY: Type = Type(Inner::OidVectorArray); + + /// BPCHAR[] + pub const BPCHAR_ARRAY: Type = Type(Inner::BpcharArray); + + /// VARCHAR[] + pub const VARCHAR_ARRAY: Type = Type(Inner::VarcharArray); + + /// INT8[] + pub const INT8_ARRAY: Type = Type(Inner::Int8Array); + + /// POINT[] + pub const POINT_ARRAY: Type = Type(Inner::PointArray); + + /// LSEG[] + pub const LSEG_ARRAY: Type = Type(Inner::LsegArray); + + /// PATH[] + pub const PATH_ARRAY: Type = Type(Inner::PathArray); + + /// BOX[] + pub const BOX_ARRAY: Type = Type(Inner::BoxArray); + + /// FLOAT4[] + pub const FLOAT4_ARRAY: Type = Type(Inner::Float4Array); + + /// FLOAT8[] + pub const FLOAT8_ARRAY: Type = Type(Inner::Float8Array); + + /// POLYGON[] + pub const POLYGON_ARRAY: Type = Type(Inner::PolygonArray); + + /// OID[] 
+ pub const OID_ARRAY: Type = Type(Inner::OidArray); + + /// ACLITEM - access control list + pub const ACLITEM: Type = Type(Inner::Aclitem); + + /// ACLITEM[] + pub const ACLITEM_ARRAY: Type = Type(Inner::AclitemArray); + + /// MACADDR[] + pub const MACADDR_ARRAY: Type = Type(Inner::MacaddrArray); + + /// INET[] + pub const INET_ARRAY: Type = Type(Inner::InetArray); + + /// BPCHAR - char(length), blank-padded string, fixed storage length + pub const BPCHAR: Type = Type(Inner::Bpchar); + + /// VARCHAR - varchar(length), non-blank-padded string, variable storage length + pub const VARCHAR: Type = Type(Inner::Varchar); + + /// DATE - date + pub const DATE: Type = Type(Inner::Date); + + /// TIME - time of day + pub const TIME: Type = Type(Inner::Time); + + /// TIMESTAMP - date and time + pub const TIMESTAMP: Type = Type(Inner::Timestamp); + + /// TIMESTAMP[] + pub const TIMESTAMP_ARRAY: Type = Type(Inner::TimestampArray); + + /// DATE[] + pub const DATE_ARRAY: Type = Type(Inner::DateArray); + + /// TIME[] + pub const TIME_ARRAY: Type = Type(Inner::TimeArray); + + /// TIMESTAMPTZ - date and time with time zone + pub const TIMESTAMPTZ: Type = Type(Inner::Timestamptz); + + /// TIMESTAMPTZ[] + pub const TIMESTAMPTZ_ARRAY: Type = Type(Inner::TimestamptzArray); + + /// INTERVAL - @ <number> <units>, time interval + pub const INTERVAL: Type = Type(Inner::Interval); + + /// INTERVAL[] + pub const INTERVAL_ARRAY: Type = Type(Inner::IntervalArray); + + /// NUMERIC[] + pub const NUMERIC_ARRAY: Type = Type(Inner::NumericArray); + + /// CSTRING[] + pub const CSTRING_ARRAY: Type = Type(Inner::CstringArray); + + /// TIMETZ - time of day with time zone + pub const TIMETZ: Type = Type(Inner::Timetz); + + /// TIMETZ[] + pub const TIMETZ_ARRAY: Type = Type(Inner::TimetzArray); + + /// BIT - fixed-length bit string + pub const BIT: Type = Type(Inner::Bit); + + /// BIT[] + pub const BIT_ARRAY: Type = Type(Inner::BitArray); + + /// VARBIT - variable-length bit string + pub const VARBIT: 
Type = Type(Inner::Varbit); + + /// VARBIT[] + pub const VARBIT_ARRAY: Type = Type(Inner::VarbitArray); + + /// NUMERIC - numeric(precision, decimal), arbitrary precision number + pub const NUMERIC: Type = Type(Inner::Numeric); + + /// REFCURSOR - reference to cursor (portal name) + pub const REFCURSOR: Type = Type(Inner::Refcursor); + + /// REFCURSOR[] + pub const REFCURSOR_ARRAY: Type = Type(Inner::RefcursorArray); + + /// REGPROCEDURE - registered procedure (with args) + pub const REGPROCEDURE: Type = Type(Inner::Regprocedure); + + /// REGOPER - registered operator + pub const REGOPER: Type = Type(Inner::Regoper); + + /// REGOPERATOR - registered operator (with args) + pub const REGOPERATOR: Type = Type(Inner::Regoperator); + + /// REGCLASS - registered class + pub const REGCLASS: Type = Type(Inner::Regclass); + + /// REGTYPE - registered type + pub const REGTYPE: Type = Type(Inner::Regtype); + + /// REGPROCEDURE[] + pub const REGPROCEDURE_ARRAY: Type = Type(Inner::RegprocedureArray); + + /// REGOPER[] + pub const REGOPER_ARRAY: Type = Type(Inner::RegoperArray); + + /// REGOPERATOR[] + pub const REGOPERATOR_ARRAY: Type = Type(Inner::RegoperatorArray); + + /// REGCLASS[] + pub const REGCLASS_ARRAY: Type = Type(Inner::RegclassArray); + + /// REGTYPE[] + pub const REGTYPE_ARRAY: Type = Type(Inner::RegtypeArray); + + /// RECORD - pseudo-type representing any composite type + pub const RECORD: Type = Type(Inner::Record); + + /// CSTRING - C-style string + pub const CSTRING: Type = Type(Inner::Cstring); + + /// ANY - pseudo-type representing any type + pub const ANY: Type = Type(Inner::Any); + + /// ANYARRAY - pseudo-type representing a polymorphic array type + pub const ANYARRAY: Type = Type(Inner::Anyarray); + + /// VOID - pseudo-type for the result of a function with no real result + pub const VOID: Type = Type(Inner::Void); + + /// TRIGGER - pseudo-type for the result of a trigger function + pub const TRIGGER: Type = Type(Inner::Trigger); + + /// LANGUAGE_HANDLER 
- pseudo-type for the result of a language handler function + pub const LANGUAGE_HANDLER: Type = Type(Inner::LanguageHandler); + + /// INTERNAL - pseudo-type representing an internal data structure + pub const INTERNAL: Type = Type(Inner::Internal); + + /// ANYELEMENT - pseudo-type representing a polymorphic base type + pub const ANYELEMENT: Type = Type(Inner::Anyelement); + + /// RECORD[] + pub const RECORD_ARRAY: Type = Type(Inner::RecordArray); + + /// ANYNONARRAY - pseudo-type representing a polymorphic base type that is not an array + pub const ANYNONARRAY: Type = Type(Inner::Anynonarray); + + /// TXID_SNAPSHOT[] + pub const TXID_SNAPSHOT_ARRAY: Type = Type(Inner::TxidSnapshotArray); + + /// UUID - UUID datatype + pub const UUID: Type = Type(Inner::Uuid); + + /// UUID[] + pub const UUID_ARRAY: Type = Type(Inner::UuidArray); + + /// TXID_SNAPSHOT - txid snapshot + pub const TXID_SNAPSHOT: Type = Type(Inner::TxidSnapshot); + + /// FDW_HANDLER - pseudo-type for the result of an FDW handler function + pub const FDW_HANDLER: Type = Type(Inner::FdwHandler); + + /// PG_LSN - PostgreSQL LSN datatype + pub const PG_LSN: Type = Type(Inner::PgLsn); + + /// PG_LSN[] + pub const PG_LSN_ARRAY: Type = Type(Inner::PgLsnArray); + + /// TSM_HANDLER - pseudo-type for the result of a tablesample method function + pub const TSM_HANDLER: Type = Type(Inner::TsmHandler); + + /// PG_NDISTINCT - multivariate ndistinct coefficients + pub const PG_NDISTINCT: Type = Type(Inner::PgNdistinct); + + /// PG_DEPENDENCIES - multivariate dependencies + pub const PG_DEPENDENCIES: Type = Type(Inner::PgDependencies); + + /// ANYENUM - pseudo-type representing a polymorphic base type that is an enum + pub const ANYENUM: Type = Type(Inner::Anyenum); + + /// TSVECTOR - text representation for text search + pub const TS_VECTOR: Type = Type(Inner::TsVector); + + /// TSQUERY - query representation for text search + pub const TSQUERY: Type = Type(Inner::Tsquery); + + /// GTSVECTOR - GiST index internal 
text representation for text search + pub const GTS_VECTOR: Type = Type(Inner::GtsVector); + + /// TSVECTOR[] + pub const TS_VECTOR_ARRAY: Type = Type(Inner::TsVectorArray); + + /// GTSVECTOR[] + pub const GTS_VECTOR_ARRAY: Type = Type(Inner::GtsVectorArray); + + /// TSQUERY[] + pub const TSQUERY_ARRAY: Type = Type(Inner::TsqueryArray); + + /// REGCONFIG - registered text search configuration + pub const REGCONFIG: Type = Type(Inner::Regconfig); + + /// REGCONFIG[] + pub const REGCONFIG_ARRAY: Type = Type(Inner::RegconfigArray); + + /// REGDICTIONARY - registered text search dictionary + pub const REGDICTIONARY: Type = Type(Inner::Regdictionary); + + /// REGDICTIONARY[] + pub const REGDICTIONARY_ARRAY: Type = Type(Inner::RegdictionaryArray); + + /// JSONB - Binary JSON + pub const JSONB: Type = Type(Inner::Jsonb); + + /// JSONB[] + pub const JSONB_ARRAY: Type = Type(Inner::JsonbArray); + + /// ANYRANGE - pseudo-type representing a range over a polymorphic base type + pub const ANY_RANGE: Type = Type(Inner::AnyRange); + + /// EVENT_TRIGGER - pseudo-type for the result of an event trigger function + pub const EVENT_TRIGGER: Type = Type(Inner::EventTrigger); + + /// INT4RANGE - range of integers + pub const INT4_RANGE: Type = Type(Inner::Int4Range); + + /// INT4RANGE[] + pub const INT4_RANGE_ARRAY: Type = Type(Inner::Int4RangeArray); + + /// NUMRANGE - range of numerics + pub const NUM_RANGE: Type = Type(Inner::NumRange); + + /// NUMRANGE[] + pub const NUM_RANGE_ARRAY: Type = Type(Inner::NumRangeArray); + + /// TSRANGE - range of timestamps without time zone + pub const TS_RANGE: Type = Type(Inner::TsRange); + + /// TSRANGE[] + pub const TS_RANGE_ARRAY: Type = Type(Inner::TsRangeArray); + + /// TSTZRANGE - range of timestamps with time zone + pub const TSTZ_RANGE: Type = Type(Inner::TstzRange); + + /// TSTZRANGE[] + pub const TSTZ_RANGE_ARRAY: Type = Type(Inner::TstzRangeArray); + + /// DATERANGE - range of dates + pub const DATE_RANGE: Type = Type(Inner::DateRange); 
+ + /// DATERANGE[] + pub const DATE_RANGE_ARRAY: Type = Type(Inner::DateRangeArray); + + /// INT8RANGE - range of bigints + pub const INT8_RANGE: Type = Type(Inner::Int8Range); + + /// INT8RANGE[] + pub const INT8_RANGE_ARRAY: Type = Type(Inner::Int8RangeArray); + + /// JSONPATH - JSON path + pub const JSONPATH: Type = Type(Inner::Jsonpath); + + /// JSONPATH[] + pub const JSONPATH_ARRAY: Type = Type(Inner::JsonpathArray); + + /// REGNAMESPACE - registered namespace + pub const REGNAMESPACE: Type = Type(Inner::Regnamespace); + + /// REGNAMESPACE[] + pub const REGNAMESPACE_ARRAY: Type = Type(Inner::RegnamespaceArray); + + /// REGROLE - registered role + pub const REGROLE: Type = Type(Inner::Regrole); + + /// REGROLE[] + pub const REGROLE_ARRAY: Type = Type(Inner::RegroleArray); + + /// REGCOLLATION - registered collation + pub const REGCOLLATION: Type = Type(Inner::Regcollation); + + /// REGCOLLATION[] + pub const REGCOLLATION_ARRAY: Type = Type(Inner::RegcollationArray); + + /// INT4MULTIRANGE - multirange of integers + pub const INT4MULTI_RANGE: Type = Type(Inner::Int4multiRange); + + /// NUMMULTIRANGE - multirange of numerics + pub const NUMMULTI_RANGE: Type = Type(Inner::NummultiRange); + + /// TSMULTIRANGE - multirange of timestamps without time zone + pub const TSMULTI_RANGE: Type = Type(Inner::TsmultiRange); + + /// TSTZMULTIRANGE - multirange of timestamps with time zone + pub const TSTZMULTI_RANGE: Type = Type(Inner::TstzmultiRange); + + /// DATEMULTIRANGE - multirange of dates + pub const DATEMULTI_RANGE: Type = Type(Inner::DatemultiRange); + + /// INT8MULTIRANGE - multirange of bigints + pub const INT8MULTI_RANGE: Type = Type(Inner::Int8multiRange); + + /// ANYMULTIRANGE - pseudo-type representing a polymorphic base type that is a multirange + pub const ANYMULTI_RANGE: Type = Type(Inner::AnymultiRange); + + /// ANYCOMPATIBLEMULTIRANGE - pseudo-type representing a multirange over a polymorphic common type + pub const ANYCOMPATIBLEMULTI_RANGE: Type = 
Type(Inner::AnycompatiblemultiRange); + + /// PG_BRIN_BLOOM_SUMMARY - BRIN bloom summary + pub const PG_BRIN_BLOOM_SUMMARY: Type = Type(Inner::PgBrinBloomSummary); + + /// PG_BRIN_MINMAX_MULTI_SUMMARY - BRIN minmax-multi summary + pub const PG_BRIN_MINMAX_MULTI_SUMMARY: Type = Type(Inner::PgBrinMinmaxMultiSummary); + + /// PG_MCV_LIST - multivariate MCV list + pub const PG_MCV_LIST: Type = Type(Inner::PgMcvList); + + /// PG_SNAPSHOT - snapshot + pub const PG_SNAPSHOT: Type = Type(Inner::PgSnapshot); + + /// PG_SNAPSHOT[] + pub const PG_SNAPSHOT_ARRAY: Type = Type(Inner::PgSnapshotArray); + + /// XID8 - full transaction id + pub const XID8: Type = Type(Inner::Xid8); + + /// ANYCOMPATIBLE - pseudo-type representing a polymorphic common type + pub const ANYCOMPATIBLE: Type = Type(Inner::Anycompatible); + + /// ANYCOMPATIBLEARRAY - pseudo-type representing an array of polymorphic common type elements + pub const ANYCOMPATIBLEARRAY: Type = Type(Inner::Anycompatiblearray); + + /// ANYCOMPATIBLENONARRAY - pseudo-type representing a polymorphic common type that is not an array + pub const ANYCOMPATIBLENONARRAY: Type = Type(Inner::Anycompatiblenonarray); + + /// ANYCOMPATIBLERANGE - pseudo-type representing a range over a polymorphic common type + pub const ANYCOMPATIBLE_RANGE: Type = Type(Inner::AnycompatibleRange); + + /// INT4MULTIRANGE[] + pub const INT4MULTI_RANGE_ARRAY: Type = Type(Inner::Int4multiRangeArray); + + /// NUMMULTIRANGE[] + pub const NUMMULTI_RANGE_ARRAY: Type = Type(Inner::NummultiRangeArray); + + /// TSMULTIRANGE[] + pub const TSMULTI_RANGE_ARRAY: Type = Type(Inner::TsmultiRangeArray); + + /// TSTZMULTIRANGE[] + pub const TSTZMULTI_RANGE_ARRAY: Type = Type(Inner::TstzmultiRangeArray); + + /// DATEMULTIRANGE[] + pub const DATEMULTI_RANGE_ARRAY: Type = Type(Inner::DatemultiRangeArray); + + /// INT8MULTIRANGE[] + pub const INT8MULTI_RANGE_ARRAY: Type = Type(Inner::Int8multiRangeArray); +} diff --git a/libs/proxy/tokio-postgres2/Cargo.toml 
b/libs/proxy/tokio-postgres2/Cargo.toml new file mode 100644 index 0000000000..7130c1b726 --- /dev/null +++ b/libs/proxy/tokio-postgres2/Cargo.toml @@ -0,0 +1,21 @@ +[package] +name = "tokio-postgres2" +version = "0.1.0" +edition = "2018" +license = "MIT/Apache-2.0" + +[dependencies] +async-trait.workspace = true +bytes.workspace = true +byteorder.workspace = true +fallible-iterator.workspace = true +futures-util = { workspace = true, features = ["sink"] } +log = "0.4" +parking_lot.workspace = true +percent-encoding = "2.0" +pin-project-lite.workspace = true +phf = "0.11" +postgres-protocol2 = { path = "../postgres-protocol2" } +postgres-types2 = { path = "../postgres-types2" } +tokio = { workspace = true, features = ["io-util", "time", "net"] } +tokio-util = { workspace = true, features = ["codec"] } diff --git a/libs/proxy/tokio-postgres2/src/cancel_query.rs b/libs/proxy/tokio-postgres2/src/cancel_query.rs new file mode 100644 index 0000000000..cddbf16336 --- /dev/null +++ b/libs/proxy/tokio-postgres2/src/cancel_query.rs @@ -0,0 +1,40 @@ +use tokio::net::TcpStream; + +use crate::client::SocketConfig; +use crate::config::{Host, SslMode}; +use crate::tls::MakeTlsConnect; +use crate::{cancel_query_raw, connect_socket, Error}; +use std::io; + +pub(crate) async fn cancel_query( + config: Option, + ssl_mode: SslMode, + mut tls: T, + process_id: i32, + secret_key: i32, +) -> Result<(), Error> +where + T: MakeTlsConnect, +{ + let config = match config { + Some(config) => config, + None => { + return Err(Error::connect(io::Error::new( + io::ErrorKind::InvalidInput, + "unknown host", + ))) + } + }; + + let hostname = match &config.host { + Host::Tcp(host) => &**host, + }; + let tls = tls + .make_tls_connect(hostname) + .map_err(|e| Error::tls(e.into()))?; + + let socket = + connect_socket::connect_socket(&config.host, config.port, config.connect_timeout).await?; + + cancel_query_raw::cancel_query_raw(socket, ssl_mode, tls, process_id, secret_key).await +} diff --git 
a/libs/proxy/tokio-postgres2/src/cancel_query_raw.rs b/libs/proxy/tokio-postgres2/src/cancel_query_raw.rs new file mode 100644 index 0000000000..8c08296435 --- /dev/null +++ b/libs/proxy/tokio-postgres2/src/cancel_query_raw.rs @@ -0,0 +1,29 @@ +use crate::config::SslMode; +use crate::tls::TlsConnect; +use crate::{connect_tls, Error}; +use bytes::BytesMut; +use postgres_protocol2::message::frontend; +use tokio::io::{AsyncRead, AsyncWrite, AsyncWriteExt}; + +pub async fn cancel_query_raw( + stream: S, + mode: SslMode, + tls: T, + process_id: i32, + secret_key: i32, +) -> Result<(), Error> +where + S: AsyncRead + AsyncWrite + Unpin, + T: TlsConnect, +{ + let mut stream = connect_tls::connect_tls(stream, mode, tls).await?; + + let mut buf = BytesMut::new(); + frontend::cancel_request(process_id, secret_key, &mut buf); + + stream.write_all(&buf).await.map_err(Error::io)?; + stream.flush().await.map_err(Error::io)?; + stream.shutdown().await.map_err(Error::io)?; + + Ok(()) +} diff --git a/libs/proxy/tokio-postgres2/src/cancel_token.rs b/libs/proxy/tokio-postgres2/src/cancel_token.rs new file mode 100644 index 0000000000..a10e8bf5c3 --- /dev/null +++ b/libs/proxy/tokio-postgres2/src/cancel_token.rs @@ -0,0 +1,62 @@ +use crate::config::SslMode; +use crate::tls::TlsConnect; + +use crate::{cancel_query, client::SocketConfig, tls::MakeTlsConnect}; +use crate::{cancel_query_raw, Error}; +use tokio::io::{AsyncRead, AsyncWrite}; +use tokio::net::TcpStream; + +/// The capability to request cancellation of in-progress queries on a +/// connection. +#[derive(Clone)] +pub struct CancelToken { + pub socket_config: Option, + pub ssl_mode: SslMode, + pub process_id: i32, + pub secret_key: i32, +} + +impl CancelToken { + /// Attempts to cancel the in-progress query on the connection associated + /// with this `CancelToken`. + /// + /// The server provides no information about whether a cancellation attempt was successful or not. 
An error will + /// only be returned if the client was unable to connect to the database. + /// + /// Cancellation is inherently racy. There is no guarantee that the + /// cancellation request will reach the server before the query terminates + /// normally, or that the connection associated with this token is still + /// active. + /// + /// A token without a socket configuration (`socket_config` is `None`) fails with a connect error. + pub async fn cancel_query(&self, tls: T) -> Result<(), Error> + where + T: MakeTlsConnect, + { + cancel_query::cancel_query( + self.socket_config.clone(), + self.ssl_mode, + tls, + self.process_id, + self.secret_key, + ) + .await + } + + /// Like `cancel_query`, but uses a stream which is already connected to the server rather than opening a new + /// connection itself. + pub async fn cancel_query_raw(&self, stream: S, tls: T) -> Result<(), Error> + where + S: AsyncRead + AsyncWrite + Unpin, + T: TlsConnect, + { + cancel_query_raw::cancel_query_raw( + stream, + self.ssl_mode, + tls, + self.process_id, + self.secret_key, + ) + .await + } +} diff --git a/libs/proxy/tokio-postgres2/src/client.rs b/libs/proxy/tokio-postgres2/src/client.rs new file mode 100644 index 0000000000..a7cd53afc3 --- /dev/null +++ b/libs/proxy/tokio-postgres2/src/client.rs @@ -0,0 +1,436 @@ +use crate::codec::{BackendMessages, FrontendMessage}; + +use crate::config::Host; +use crate::config::SslMode; +use crate::connection::{Request, RequestMessages}; + +use crate::query::RowStream; +use crate::simple_query::SimpleQueryStream; + +use crate::types::{Oid, ToSql, Type}; + +use crate::{ + prepare, query, simple_query, slice_iter, CancelToken, Error, ReadyForQueryStatus, Row, + SimpleQueryMessage, Statement, ToStatement, Transaction, TransactionBuilder, +}; +use bytes::BytesMut; +use fallible_iterator::FallibleIterator; +use futures_util::{future, ready, TryStreamExt}; +use parking_lot::Mutex; +use postgres_protocol2::message::{backend::Message, frontend}; +use std::collections::HashMap; +use std::fmt;
+use std::sync::Arc; +use std::task::{Context, Poll}; +use tokio::sync::mpsc; + +use std::time::Duration; + +pub struct Responses { + receiver: mpsc::Receiver, + cur: BackendMessages, +} + +impl Responses { + pub fn poll_next(&mut self, cx: &mut Context<'_>) -> Poll> { + loop { + match self.cur.next().map_err(Error::parse)? { + Some(Message::ErrorResponse(body)) => return Poll::Ready(Err(Error::db(body))), + Some(message) => return Poll::Ready(Ok(message)), + None => {} + } + + match ready!(self.receiver.poll_recv(cx)) { + Some(messages) => self.cur = messages, + None => return Poll::Ready(Err(Error::closed())), + } + } + } + + pub async fn next(&mut self) -> Result { + future::poll_fn(|cx| self.poll_next(cx)).await + } +} + +/// A cache of type info and prepared statements for fetching type info +/// (corresponding to the queries in the [prepare] module). +#[derive(Default)] +struct CachedTypeInfo { + /// A statement for basic information for a type from its + /// OID. Corresponds to [TYPEINFO_QUERY](prepare::TYPEINFO_QUERY) (or its + /// fallback). + typeinfo: Option, + /// A statement for getting information for a composite type from its OID. + /// Corresponds to [TYPEINFO_COMPOSITE_QUERY](prepare::TYPEINFO_COMPOSITE_QUERY). + typeinfo_composite: Option, + /// A statement for getting information for an enum type from its OID. + /// Corresponds to the enum type-info query in the [prepare] module (or + /// its fallback). + typeinfo_enum: Option, + + /// Cache of types already looked up. + types: HashMap, +} + +pub struct InnerClient { + sender: mpsc::UnboundedSender, + cached_typeinfo: Mutex, + + /// A buffer to use when writing out postgres commands.
+ buffer: Mutex, +} + +impl InnerClient { + pub fn send(&self, messages: RequestMessages) -> Result { + let (sender, receiver) = mpsc::channel(1); + let request = Request { messages, sender }; + self.sender.send(request).map_err(|_| Error::closed())?; + + Ok(Responses { + receiver, + cur: BackendMessages::empty(), + }) + } + + pub fn typeinfo(&self) -> Option { + self.cached_typeinfo.lock().typeinfo.clone() + } + + pub fn set_typeinfo(&self, statement: &Statement) { + self.cached_typeinfo.lock().typeinfo = Some(statement.clone()); + } + + pub fn typeinfo_composite(&self) -> Option { + self.cached_typeinfo.lock().typeinfo_composite.clone() + } + + pub fn set_typeinfo_composite(&self, statement: &Statement) { + self.cached_typeinfo.lock().typeinfo_composite = Some(statement.clone()); + } + + pub fn typeinfo_enum(&self) -> Option { + self.cached_typeinfo.lock().typeinfo_enum.clone() + } + + pub fn set_typeinfo_enum(&self, statement: &Statement) { + self.cached_typeinfo.lock().typeinfo_enum = Some(statement.clone()); + } + + pub fn type_(&self, oid: Oid) -> Option { + self.cached_typeinfo.lock().types.get(&oid).cloned() + } + + pub fn set_type(&self, oid: Oid, type_: &Type) { + self.cached_typeinfo.lock().types.insert(oid, type_.clone()); + } + + /// Call the given function with a buffer to be used when writing out + /// postgres commands. + pub fn with_buf(&self, f: F) -> R + where + F: FnOnce(&mut BytesMut) -> R, + { + let mut buffer = self.buffer.lock(); + let r = f(&mut buffer); + buffer.clear(); + r + } +} + +#[derive(Clone)] +pub struct SocketConfig { + pub host: Host, + pub port: u16, + pub connect_timeout: Option, + // pub keepalive: Option, +} + +/// An asynchronous PostgreSQL client. +/// +/// The client is one half of what is returned when a connection is established. Users interact with the database +/// through this client object. 
+pub struct Client { + inner: Arc, + + socket_config: SocketConfig, + ssl_mode: SslMode, + process_id: i32, + secret_key: i32, +} + +impl Client { + pub(crate) fn new( + sender: mpsc::UnboundedSender, + socket_config: SocketConfig, + ssl_mode: SslMode, + process_id: i32, + secret_key: i32, + ) -> Client { + Client { + inner: Arc::new(InnerClient { + sender, + cached_typeinfo: Default::default(), + buffer: Default::default(), + }), + + socket_config, + ssl_mode, + process_id, + secret_key, + } + } + + /// Returns process_id. + pub fn get_process_id(&self) -> i32 { + self.process_id + } + + pub(crate) fn inner(&self) -> &Arc { + &self.inner + } + + /// Creates a new prepared statement. + /// + /// Prepared statements can be executed repeatedly, and may contain query parameters (indicated by `$1`, `$2`, etc), + /// which are set when executed. Prepared statements can only be used with the connection that created them. + pub async fn prepare(&self, query: &str) -> Result { + self.prepare_typed(query, &[]).await + } + + /// Like `prepare`, but allows the types of query parameters to be explicitly specified. + /// + /// The list of types may be smaller than the number of parameters - the types of the remaining parameters will be + /// inferred. For example, `client.prepare_typed(query, &[])` is equivalent to `client.prepare(query)`. + pub async fn prepare_typed( + &self, + query: &str, + parameter_types: &[Type], + ) -> Result { + prepare::prepare(&self.inner, query, parameter_types).await + } + + /// Executes a statement, returning a vector of the resulting rows. + /// + /// A statement may contain parameters, specified by `$n`, where `n` is the index of the parameter of the list + /// provided, 1-indexed. + /// + /// The `statement` argument can either be a `Statement`, or a raw query string. If the same statement will be + /// repeatedly executed (perhaps with different query parameters), consider preparing the statement up front + /// with the `prepare` method. 
+ /// + /// # Panics + /// + /// Panics if the number of parameters provided does not match the number expected. + pub async fn query( + &self, + statement: &T, + params: &[&(dyn ToSql + Sync)], + ) -> Result, Error> + where + T: ?Sized + ToStatement, + { + self.query_raw(statement, slice_iter(params)) + .await? + .try_collect() + .await + } + + /// The maximally flexible version of [`query`]. + /// + /// A statement may contain parameters, specified by `$n`, where `n` is the index of the parameter of the list + /// provided, 1-indexed. + /// + /// The `statement` argument can either be a `Statement`, or a raw query string. If the same statement will be + /// repeatedly executed (perhaps with different query parameters), consider preparing the statement up front + /// with the `prepare` method. + /// + /// # Panics + /// + /// Panics if the number of parameters provided does not match the number expected. + /// + /// [`query`]: #method.query + pub async fn query_raw<'a, T, I>(&self, statement: &T, params: I) -> Result + where + T: ?Sized + ToStatement, + I: IntoIterator, + I::IntoIter: ExactSizeIterator, + { + let statement = statement.__convert().into_statement(self).await?; + query::query(&self.inner, statement, params).await + } + + /// Pass text directly to the Postgres backend to allow it to sort out typing itself and + /// to save a roundtrip + pub async fn query_raw_txt(&self, statement: &str, params: I) -> Result + where + S: AsRef, + I: IntoIterator>, + I::IntoIter: ExactSizeIterator, + { + query::query_txt(&self.inner, statement, params).await + } + + /// Executes a statement, returning the number of rows modified. + /// + /// A statement may contain parameters, specified by `$n`, where `n` is the index of the parameter of the list + /// provided, 1-indexed. + /// + /// The `statement` argument can either be a `Statement`, or a raw query string. 
If the same statement will be + /// repeatedly executed (perhaps with different query parameters), consider preparing the statement up front + /// with the `prepare` method. + /// + /// If the statement does not modify any rows (e.g. `SELECT`), 0 is returned. + /// + /// # Panics + /// + /// Panics if the number of parameters provided does not match the number expected. + pub async fn execute( + &self, + statement: &T, + params: &[&(dyn ToSql + Sync)], + ) -> Result + where + T: ?Sized + ToStatement, + { + self.execute_raw(statement, slice_iter(params)).await + } + + /// The maximally flexible version of [`execute`]. + /// + /// A statement may contain parameters, specified by `$n`, where `n` is the index of the parameter of the list + /// provided, 1-indexed. + /// + /// The `statement` argument can either be a `Statement`, or a raw query string. If the same statement will be + /// repeatedly executed (perhaps with different query parameters), consider preparing the statement up front + /// with the `prepare` method. + /// + /// # Panics + /// + /// Panics if the number of parameters provided does not match the number expected. + /// + /// [`execute`]: #method.execute + pub async fn execute_raw<'a, T, I>(&self, statement: &T, params: I) -> Result + where + T: ?Sized + ToStatement, + I: IntoIterator, + I::IntoIter: ExactSizeIterator, + { + let statement = statement.__convert().into_statement(self).await?; + query::execute(self.inner(), statement, params).await + } + + /// Executes a sequence of SQL statements using the simple query protocol, returning the resulting rows. + /// + /// Statements should be separated by semicolons. If an error occurs, execution of the sequence will stop at that + /// point. The simple query protocol returns the values in rows as strings rather than in their binary encodings, + /// so the associated row type doesn't work with the `FromSql` trait. 
Rather than simply returning a list of the + /// rows, this method returns a list of an enum which indicates either the completion of one of the commands, + /// or a row of data. This preserves the framing between the separate statements in the request. + /// + /// # Warning + /// + /// Prepared statements should be used for any query which contains user-specified data, as they provide the + /// functionality to safely embed that data in the request. Do not form statements via string concatenation and pass + /// them to this method! + pub async fn simple_query(&self, query: &str) -> Result, Error> { + self.simple_query_raw(query).await?.try_collect().await + } + + pub(crate) async fn simple_query_raw(&self, query: &str) -> Result { + simple_query::simple_query(self.inner(), query).await + } + + /// Executes a sequence of SQL statements using the simple query protocol. + /// + /// Statements should be separated by semicolons. If an error occurs, execution of the sequence will stop at that + /// point. This is intended for use when, for example, initializing a database schema. + /// + /// # Warning + /// + /// Prepared statements should be used for any query which contains user-specified data, as they provide the + /// functionality to safely embed that data in the request. Do not form statements via string concatenation and pass + /// them to this method! + pub async fn batch_execute(&self, query: &str) -> Result { + simple_query::batch_execute(self.inner(), query).await + } + + /// Begins a new database transaction. + /// + /// The transaction will roll back by default - use the `commit` method to commit it. 
+ pub async fn transaction(&mut self) -> Result, Error> { + struct RollbackIfNotDone<'me> { + client: &'me Client, + done: bool, + } + + impl Drop for RollbackIfNotDone<'_> { + fn drop(&mut self) { + if self.done { + return; + } + + let buf = self.client.inner().with_buf(|buf| { + frontend::query("ROLLBACK", buf).unwrap(); + buf.split().freeze() + }); + let _ = self + .client + .inner() + .send(RequestMessages::Single(FrontendMessage::Raw(buf))); + } + } + + // This is done, as `Future` created by this method can be dropped after + // `RequestMessages` is synchronously send to the `Connection` by + // `batch_execute()`, but before `Responses` is asynchronously polled to + // completion. In that case `Transaction` won't be created and thus + // won't be rolled back. + { + let mut cleaner = RollbackIfNotDone { + client: self, + done: false, + }; + self.batch_execute("BEGIN").await?; + cleaner.done = true; + } + + Ok(Transaction::new(self)) + } + + /// Returns a builder for a transaction with custom settings. + /// + /// Unlike the `transaction` method, the builder can be used to control the transaction's isolation level and other + /// attributes. + pub fn build_transaction(&mut self) -> TransactionBuilder<'_> { + TransactionBuilder::new(self) + } + + /// Constructs a cancellation token that can later be used to request cancellation of a query running on the + /// connection associated with this client. + pub fn cancel_token(&self) -> CancelToken { + CancelToken { + socket_config: Some(self.socket_config.clone()), + ssl_mode: self.ssl_mode, + process_id: self.process_id, + secret_key: self.secret_key, + } + } + + /// Query for type information + pub async fn get_type(&self, oid: Oid) -> Result { + crate::prepare::get_type(&self.inner, oid).await + } + + /// Determines if the connection to the server has already closed. + /// + /// In that case, all future queries will fail. 
+ pub fn is_closed(&self) -> bool { + self.inner.sender.is_closed() + } +} + +impl fmt::Debug for Client { + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + f.debug_struct("Client").finish() + } +} diff --git a/libs/proxy/tokio-postgres2/src/codec.rs b/libs/proxy/tokio-postgres2/src/codec.rs new file mode 100644 index 0000000000..0ec46198ce --- /dev/null +++ b/libs/proxy/tokio-postgres2/src/codec.rs @@ -0,0 +1,98 @@ +use bytes::{Buf, Bytes, BytesMut}; +use fallible_iterator::FallibleIterator; +use postgres_protocol2::message::backend; +use postgres_protocol2::message::frontend::CopyData; +use std::io; +use tokio_util::codec::{Decoder, Encoder}; + +pub enum FrontendMessage { + Raw(Bytes), + CopyData(CopyData>), +} + +pub enum BackendMessage { + Normal { + messages: BackendMessages, + request_complete: bool, + }, + Async(backend::Message), +} + +pub struct BackendMessages(BytesMut); + +impl BackendMessages { + pub fn empty() -> BackendMessages { + BackendMessages(BytesMut::new()) + } +} + +impl FallibleIterator for BackendMessages { + type Item = backend::Message; + type Error = io::Error; + + fn next(&mut self) -> io::Result> { + backend::Message::parse(&mut self.0) + } +} + +pub struct PostgresCodec; + +impl Encoder for PostgresCodec { + type Error = io::Error; + + fn encode(&mut self, item: FrontendMessage, dst: &mut BytesMut) -> io::Result<()> { + match item { + FrontendMessage::Raw(buf) => dst.extend_from_slice(&buf), + FrontendMessage::CopyData(data) => data.write(dst), + } + + Ok(()) + } +} + +impl Decoder for PostgresCodec { + type Item = BackendMessage; + type Error = io::Error; + + fn decode(&mut self, src: &mut BytesMut) -> Result, io::Error> { + let mut idx = 0; + let mut request_complete = false; + + while let Some(header) = backend::Header::parse(&src[idx..])? 
{ + let len = header.len() as usize + 1; + if src[idx..].len() < len { + break; + } + + match header.tag() { + backend::NOTICE_RESPONSE_TAG + | backend::NOTIFICATION_RESPONSE_TAG + | backend::PARAMETER_STATUS_TAG => { + if idx == 0 { + let message = backend::Message::parse(src)?.unwrap(); + return Ok(Some(BackendMessage::Async(message))); + } else { + break; + } + } + _ => {} + } + + idx += len; + + if header.tag() == backend::READY_FOR_QUERY_TAG { + request_complete = true; + break; + } + } + + if idx == 0 { + Ok(None) + } else { + Ok(Some(BackendMessage::Normal { + messages: BackendMessages(src.split_to(idx)), + request_complete, + })) + } + } +} diff --git a/libs/proxy/tokio-postgres2/src/config.rs b/libs/proxy/tokio-postgres2/src/config.rs new file mode 100644 index 0000000000..11a361a81b --- /dev/null +++ b/libs/proxy/tokio-postgres2/src/config.rs @@ -0,0 +1,264 @@ +//! Connection configuration. + +use crate::connect::connect; +use crate::connect_raw::connect_raw; +use crate::connect_raw::RawConnection; +use crate::tls::MakeTlsConnect; +use crate::tls::TlsConnect; +use crate::{Client, Connection, Error}; +use postgres_protocol2::message::frontend::StartupMessageParams; +use std::fmt; +use std::str; +use std::time::Duration; +use tokio::io::{AsyncRead, AsyncWrite}; + +pub use postgres_protocol2::authentication::sasl::ScramKeys; +use tokio::net::TcpStream; + +/// TLS configuration. +#[derive(Debug, Copy, Clone, PartialEq, Eq)] +#[non_exhaustive] +pub enum SslMode { + /// Do not use TLS. + Disable, + /// Attempt to connect with TLS but allow sessions without. + Prefer, + /// Require the use of TLS. + Require, +} + +/// Channel binding configuration. +#[derive(Debug, Copy, Clone, PartialEq, Eq)] +#[non_exhaustive] +pub enum ChannelBinding { + /// Do not use channel binding. + Disable, + /// Attempt to use channel binding but allow sessions without. + Prefer, + /// Require the use of channel binding. + Require, +} + +/// Replication mode configuration. 
+#[derive(Debug, Copy, Clone, PartialEq, Eq)] +#[non_exhaustive] +pub enum ReplicationMode { + /// Physical replication. + Physical, + /// Logical replication. + Logical, +} + +/// A host specification. +#[derive(Debug, Clone, PartialEq, Eq)] +pub enum Host { + /// A TCP hostname. + Tcp(String), +} + +/// Precomputed keys which may override password during auth. +#[derive(Debug, Clone, Copy, PartialEq, Eq)] +pub enum AuthKeys { + /// A `ClientKey` & `ServerKey` pair for `SCRAM-SHA-256`. + ScramSha256(ScramKeys<32>), +} + +/// Connection configuration. +#[derive(Clone, PartialEq, Eq)] +pub struct Config { + pub(crate) host: Host, + pub(crate) port: u16, + + pub(crate) password: Option>, + pub(crate) auth_keys: Option>, + pub(crate) ssl_mode: SslMode, + pub(crate) connect_timeout: Option, + pub(crate) channel_binding: ChannelBinding, + pub(crate) server_params: StartupMessageParams, + + database: bool, + username: bool, +} + +impl Config { + /// Creates a new configuration. + pub fn new(host: String, port: u16) -> Config { + Config { + host: Host::Tcp(host), + port, + password: None, + auth_keys: None, + ssl_mode: SslMode::Prefer, + connect_timeout: None, + channel_binding: ChannelBinding::Prefer, + server_params: StartupMessageParams::default(), + + database: false, + username: false, + } + } + + /// Sets the user to authenticate with. + /// + /// Required. + pub fn user(&mut self, user: &str) -> &mut Config { + self.set_param("user", user) + } + + /// Returns `true` if a user has been configured, either with the `user` + /// method or with `set_param("user", ..)`. + pub fn user_is_set(&self) -> bool { + self.username + } + + /// Sets the password to authenticate with. + pub fn password(&mut self, password: T) -> &mut Config + where + T: AsRef<[u8]>, + { + self.password = Some(password.as_ref().to_vec()); + self + } + + /// Gets the password to authenticate with, if one has been configured with + /// the `password` method. 
+ pub fn get_password(&self) -> Option<&[u8]> { + self.password.as_deref() + } + + /// Sets precomputed protocol-specific keys to authenticate with. + /// When set, this option will override `password`. + /// See [`AuthKeys`] for more information. + pub fn auth_keys(&mut self, keys: AuthKeys) -> &mut Config { + self.auth_keys = Some(Box::new(keys)); + self + } + + /// Gets the precomputed protocol-specific keys to authenticate with, + /// if any have been configured with the `auth_keys` method. + pub fn get_auth_keys(&self) -> Option { + self.auth_keys.as_deref().copied() + } + + /// Sets the name of the database to connect to. + /// + /// Defaults to the user. + pub fn dbname(&mut self, dbname: &str) -> &mut Config { + self.set_param("database", dbname) + } + + /// Returns `true` if a database name has been configured, either with the + /// `dbname` method or with `set_param("database", ..)`. + pub fn db_is_set(&self) -> bool { + self.database + } + + pub fn set_param(&mut self, name: &str, value: &str) -> &mut Config { + if name == "database" { + self.database = true; + } else if name == "user" { + self.username = true; + } + + self.server_params.insert(name, value); + self + } + + /// Sets the SSL configuration. + /// + /// Defaults to `prefer`. + pub fn ssl_mode(&mut self, ssl_mode: SslMode) -> &mut Config { + self.ssl_mode = ssl_mode; + self + } + + /// Gets the SSL configuration. + pub fn get_ssl_mode(&self) -> SslMode { + self.ssl_mode + } + + /// Gets the host this configuration connects to, as passed to `new`. + pub fn get_host(&self) -> &Host { + &self.host + } + + /// Gets the port this configuration connects to, as passed to `new`. + pub fn get_port(&self) -> u16 { + self.port + } + + /// Sets the timeout applied to socket-level connection attempts. + /// + /// Note that hostnames can resolve to multiple IP addresses, and this timeout will apply to each address of each + /// host separately. Defaults to no limit. 
+ pub fn connect_timeout(&mut self, connect_timeout: Duration) -> &mut Config { + self.connect_timeout = Some(connect_timeout); + self + } + + /// Gets the connection timeout, if one has been set with the + /// `connect_timeout` method. + pub fn get_connect_timeout(&self) -> Option<&Duration> { + self.connect_timeout.as_ref() + } + + /// Sets the channel binding behavior. + /// + /// Defaults to `prefer`. + pub fn channel_binding(&mut self, channel_binding: ChannelBinding) -> &mut Config { + self.channel_binding = channel_binding; + self + } + + /// Gets the channel binding behavior. + pub fn get_channel_binding(&self) -> ChannelBinding { + self.channel_binding + } + + /// Opens a connection to a PostgreSQL database. + /// + /// Requires the `runtime` Cargo feature (enabled by default). + pub async fn connect( + &self, + tls: T, + ) -> Result<(Client, Connection), Error> + where + T: MakeTlsConnect, + { + connect(tls, self).await + } + + pub async fn connect_raw( + &self, + stream: S, + tls: T, + ) -> Result, Error> + where + S: AsyncRead + AsyncWrite + Unpin, + T: TlsConnect, + { + connect_raw(stream, tls, self).await + } +} + +// Omit password from debug output +impl fmt::Debug for Config { + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + struct Redaction {} + impl fmt::Debug for Redaction { + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + write!(f, "_") + } + } + + f.debug_struct("Config") + .field("password", &self.password.as_ref().map(|_| Redaction {})) + .field("ssl_mode", &self.ssl_mode) + .field("host", &self.host) + .field("port", &self.port) + .field("connect_timeout", &self.connect_timeout) + .field("channel_binding", &self.channel_binding) + .field("server_params", &self.server_params) + .finish() + } +} diff --git a/libs/proxy/tokio-postgres2/src/connect.rs b/libs/proxy/tokio-postgres2/src/connect.rs new file mode 100644 index 0000000000..e0cb69748d --- /dev/null +++ b/libs/proxy/tokio-postgres2/src/connect.rs @@ -0,0 +1,75 
@@ +use crate::client::SocketConfig; +use crate::codec::BackendMessage; +use crate::config::Host; +use crate::connect_raw::connect_raw; +use crate::connect_socket::connect_socket; +use crate::tls::{MakeTlsConnect, TlsConnect}; +use crate::{Client, Config, Connection, Error, RawConnection}; +use postgres_protocol2::message::backend::Message; +use tokio::net::TcpStream; +use tokio::sync::mpsc; + +pub async fn connect( + mut tls: T, + config: &Config, +) -> Result<(Client, Connection), Error> +where + T: MakeTlsConnect, +{ + let hostname = match &config.host { + Host::Tcp(host) => host.as_str(), + }; + + let tls = tls + .make_tls_connect(hostname) + .map_err(|e| Error::tls(e.into()))?; + + match connect_once(&config.host, config.port, tls, config).await { + Ok((client, connection)) => Ok((client, connection)), + Err(e) => Err(e), + } +} + +async fn connect_once( + host: &Host, + port: u16, + tls: T, + config: &Config, +) -> Result<(Client, Connection), Error> +where + T: TlsConnect, +{ + let socket = connect_socket(host, port, config.connect_timeout).await?; + let RawConnection { + stream, + parameters, + delayed_notice, + process_id, + secret_key, + } = connect_raw(socket, tls, config).await?; + + let socket_config = SocketConfig { + host: host.clone(), + port, + connect_timeout: config.connect_timeout, + }; + + let (sender, receiver) = mpsc::unbounded_channel(); + let client = Client::new( + sender, + socket_config, + config.ssl_mode, + process_id, + secret_key, + ); + + // delayed notices are always sent as "Async" messages. 
+ let delayed = delayed_notice + .into_iter() + .map(|m| BackendMessage::Async(Message::NoticeResponse(m))) + .collect(); + + let connection = Connection::new(stream, delayed, parameters, receiver); + + Ok((client, connection)) +} diff --git a/libs/proxy/tokio-postgres2/src/connect_raw.rs b/libs/proxy/tokio-postgres2/src/connect_raw.rs new file mode 100644 index 0000000000..66db85e07d --- /dev/null +++ b/libs/proxy/tokio-postgres2/src/connect_raw.rs @@ -0,0 +1,326 @@ +use crate::codec::{BackendMessage, BackendMessages, FrontendMessage, PostgresCodec}; +use crate::config::{self, AuthKeys, Config}; +use crate::connect_tls::connect_tls; +use crate::maybe_tls_stream::MaybeTlsStream; +use crate::tls::{TlsConnect, TlsStream}; +use crate::Error; +use bytes::BytesMut; +use fallible_iterator::FallibleIterator; +use futures_util::{ready, Sink, SinkExt, Stream, TryStreamExt}; +use postgres_protocol2::authentication::sasl; +use postgres_protocol2::authentication::sasl::ScramSha256; +use postgres_protocol2::message::backend::{AuthenticationSaslBody, Message, NoticeResponseBody}; +use postgres_protocol2::message::frontend; +use std::collections::HashMap; +use std::io; +use std::pin::Pin; +use std::task::{Context, Poll}; +use tokio::io::{AsyncRead, AsyncWrite}; +use tokio_util::codec::Framed; + +pub struct StartupStream { + inner: Framed, PostgresCodec>, + buf: BackendMessages, + delayed_notice: Vec, +} + +impl Sink for StartupStream +where + S: AsyncRead + AsyncWrite + Unpin, + T: AsyncRead + AsyncWrite + Unpin, +{ + type Error = io::Error; + + fn poll_ready(mut self: Pin<&mut Self>, cx: &mut Context<'_>) -> Poll> { + Pin::new(&mut self.inner).poll_ready(cx) + } + + fn start_send(mut self: Pin<&mut Self>, item: FrontendMessage) -> io::Result<()> { + Pin::new(&mut self.inner).start_send(item) + } + + fn poll_flush(mut self: Pin<&mut Self>, cx: &mut Context<'_>) -> Poll> { + Pin::new(&mut self.inner).poll_flush(cx) + } + + fn poll_close(mut self: Pin<&mut Self>, cx: &mut 
Context<'_>) -> Poll> { + Pin::new(&mut self.inner).poll_close(cx) + } +} + +impl Stream for StartupStream +where + S: AsyncRead + AsyncWrite + Unpin, + T: AsyncRead + AsyncWrite + Unpin, +{ + type Item = io::Result; + + fn poll_next( + mut self: Pin<&mut Self>, + cx: &mut Context<'_>, + ) -> Poll>> { + loop { + match self.buf.next() { + Ok(Some(message)) => return Poll::Ready(Some(Ok(message))), + Ok(None) => {} + Err(e) => return Poll::Ready(Some(Err(e))), + } + + match ready!(Pin::new(&mut self.inner).poll_next(cx)) { + Some(Ok(BackendMessage::Normal { messages, .. })) => self.buf = messages, + Some(Ok(BackendMessage::Async(message))) => return Poll::Ready(Some(Ok(message))), + Some(Err(e)) => return Poll::Ready(Some(Err(e))), + None => return Poll::Ready(None), + } + } + } +} + +pub struct RawConnection { + pub stream: Framed, PostgresCodec>, + pub parameters: HashMap, + pub delayed_notice: Vec, + pub process_id: i32, + pub secret_key: i32, +} + +pub async fn connect_raw( + stream: S, + tls: T, + config: &Config, +) -> Result, Error> +where + S: AsyncRead + AsyncWrite + Unpin, + T: TlsConnect, +{ + let stream = connect_tls(stream, config.ssl_mode, tls).await?; + + let mut stream = StartupStream { + inner: Framed::new(stream, PostgresCodec), + buf: BackendMessages::empty(), + delayed_notice: Vec::new(), + }; + + startup(&mut stream, config).await?; + authenticate(&mut stream, config).await?; + let (process_id, secret_key, parameters) = read_info(&mut stream).await?; + + Ok(RawConnection { + stream: stream.inner, + parameters, + delayed_notice: stream.delayed_notice, + process_id, + secret_key, + }) +} + +async fn startup(stream: &mut StartupStream, config: &Config) -> Result<(), Error> +where + S: AsyncRead + AsyncWrite + Unpin, + T: AsyncRead + AsyncWrite + Unpin, +{ + let mut buf = BytesMut::new(); + frontend::startup_message(&config.server_params, &mut buf).map_err(Error::encode)?; + + stream + .send(FrontendMessage::Raw(buf.freeze())) + .await + 
.map_err(Error::io) +} + +async fn authenticate(stream: &mut StartupStream, config: &Config) -> Result<(), Error> +where + S: AsyncRead + AsyncWrite + Unpin, + T: TlsStream + Unpin, +{ + match stream.try_next().await.map_err(Error::io)? { + Some(Message::AuthenticationOk) => { + can_skip_channel_binding(config)?; + return Ok(()); + } + Some(Message::AuthenticationCleartextPassword) => { + can_skip_channel_binding(config)?; + + let pass = config + .password + .as_ref() + .ok_or_else(|| Error::config("password missing".into()))?; + + authenticate_password(stream, pass).await?; + } + Some(Message::AuthenticationSasl(body)) => { + authenticate_sasl(stream, body, config).await?; + } + Some(Message::AuthenticationMd5Password) + | Some(Message::AuthenticationKerberosV5) + | Some(Message::AuthenticationScmCredential) + | Some(Message::AuthenticationGss) + | Some(Message::AuthenticationSspi) => { + return Err(Error::authentication( + "unsupported authentication method".into(), + )) + } + Some(Message::ErrorResponse(body)) => return Err(Error::db(body)), + Some(_) => return Err(Error::unexpected_message()), + None => return Err(Error::closed()), + } + + match stream.try_next().await.map_err(Error::io)? 
{ + Some(Message::AuthenticationOk) => Ok(()), + Some(Message::ErrorResponse(body)) => Err(Error::db(body)), + Some(_) => Err(Error::unexpected_message()), + None => Err(Error::closed()), + } +} + +fn can_skip_channel_binding(config: &Config) -> Result<(), Error> { + match config.channel_binding { + config::ChannelBinding::Disable | config::ChannelBinding::Prefer => Ok(()), + config::ChannelBinding::Require => Err(Error::authentication( + "server did not use channel binding".into(), + )), + } +} + +async fn authenticate_password( + stream: &mut StartupStream, + password: &[u8], +) -> Result<(), Error> +where + S: AsyncRead + AsyncWrite + Unpin, + T: AsyncRead + AsyncWrite + Unpin, +{ + let mut buf = BytesMut::new(); + frontend::password_message(password, &mut buf).map_err(Error::encode)?; + + stream + .send(FrontendMessage::Raw(buf.freeze())) + .await + .map_err(Error::io) +} + +async fn authenticate_sasl( + stream: &mut StartupStream, + body: AuthenticationSaslBody, + config: &Config, +) -> Result<(), Error> +where + S: AsyncRead + AsyncWrite + Unpin, + T: TlsStream + Unpin, +{ + let mut has_scram = false; + let mut has_scram_plus = false; + let mut mechanisms = body.mechanisms(); + while let Some(mechanism) = mechanisms.next().map_err(Error::parse)? 
{ + match mechanism { + sasl::SCRAM_SHA_256 => has_scram = true, + sasl::SCRAM_SHA_256_PLUS => has_scram_plus = true, + _ => {} + } + } + + let channel_binding = stream + .inner + .get_ref() + .channel_binding() + .tls_server_end_point + .filter(|_| config.channel_binding != config::ChannelBinding::Disable) + .map(sasl::ChannelBinding::tls_server_end_point); + + let (channel_binding, mechanism) = if has_scram_plus { + match channel_binding { + Some(channel_binding) => (channel_binding, sasl::SCRAM_SHA_256_PLUS), + None => (sasl::ChannelBinding::unsupported(), sasl::SCRAM_SHA_256), + } + } else if has_scram { + match channel_binding { + Some(_) => (sasl::ChannelBinding::unrequested(), sasl::SCRAM_SHA_256), + None => (sasl::ChannelBinding::unsupported(), sasl::SCRAM_SHA_256), + } + } else { + return Err(Error::authentication("unsupported SASL mechanism".into())); + }; + + if mechanism != sasl::SCRAM_SHA_256_PLUS { + can_skip_channel_binding(config)?; + } + + let mut scram = if let Some(AuthKeys::ScramSha256(keys)) = config.get_auth_keys() { + ScramSha256::new_with_keys(keys, channel_binding) + } else if let Some(password) = config.get_password() { + ScramSha256::new(password, channel_binding) + } else { + return Err(Error::config("password or auth keys missing".into())); + }; + + let mut buf = BytesMut::new(); + frontend::sasl_initial_response(mechanism, scram.message(), &mut buf).map_err(Error::encode)?; + stream + .send(FrontendMessage::Raw(buf.freeze())) + .await + .map_err(Error::io)?; + + let body = match stream.try_next().await.map_err(Error::io)? 
{ + Some(Message::AuthenticationSaslContinue(body)) => body, + Some(Message::ErrorResponse(body)) => return Err(Error::db(body)), + Some(_) => return Err(Error::unexpected_message()), + None => return Err(Error::closed()), + }; + + scram + .update(body.data()) + .await + .map_err(|e| Error::authentication(e.into()))?; + + let mut buf = BytesMut::new(); + frontend::sasl_response(scram.message(), &mut buf).map_err(Error::encode)?; + stream + .send(FrontendMessage::Raw(buf.freeze())) + .await + .map_err(Error::io)?; + + let body = match stream.try_next().await.map_err(Error::io)? { + Some(Message::AuthenticationSaslFinal(body)) => body, + Some(Message::ErrorResponse(body)) => return Err(Error::db(body)), + Some(_) => return Err(Error::unexpected_message()), + None => return Err(Error::closed()), + }; + + scram + .finish(body.data()) + .map_err(|e| Error::authentication(e.into()))?; + + Ok(()) +} + +async fn read_info( + stream: &mut StartupStream, +) -> Result<(i32, i32, HashMap), Error> +where + S: AsyncRead + AsyncWrite + Unpin, + T: AsyncRead + AsyncWrite + Unpin, +{ + let mut process_id = 0; + let mut secret_key = 0; + let mut parameters = HashMap::new(); + + loop { + match stream.try_next().await.map_err(Error::io)? 
{ + Some(Message::BackendKeyData(body)) => { + process_id = body.process_id(); + secret_key = body.secret_key(); + } + Some(Message::ParameterStatus(body)) => { + parameters.insert( + body.name().map_err(Error::parse)?.to_string(), + body.value().map_err(Error::parse)?.to_string(), + ); + } + Some(Message::NoticeResponse(body)) => stream.delayed_notice.push(body), + Some(Message::ReadyForQuery(_)) => return Ok((process_id, secret_key, parameters)), + Some(Message::ErrorResponse(body)) => return Err(Error::db(body)), + Some(_) => return Err(Error::unexpected_message()), + None => return Err(Error::closed()), + } + } +} diff --git a/libs/proxy/tokio-postgres2/src/connect_socket.rs b/libs/proxy/tokio-postgres2/src/connect_socket.rs new file mode 100644 index 0000000000..336a13317f --- /dev/null +++ b/libs/proxy/tokio-postgres2/src/connect_socket.rs @@ -0,0 +1,65 @@ +use crate::config::Host; +use crate::Error; +use std::future::Future; +use std::io; +use std::time::Duration; +use tokio::net::{self, TcpStream}; +use tokio::time; + +pub(crate) async fn connect_socket( + host: &Host, + port: u16, + connect_timeout: Option, +) -> Result { + match host { + Host::Tcp(host) => { + let addrs = net::lookup_host((&**host, port)) + .await + .map_err(Error::connect)?; + + let mut last_err = None; + + for addr in addrs { + let stream = + match connect_with_timeout(TcpStream::connect(addr), connect_timeout).await { + Ok(stream) => stream, + Err(e) => { + last_err = Some(e); + continue; + } + }; + + stream.set_nodelay(true).map_err(Error::connect)?; + + return Ok(stream); + } + + Err(last_err.unwrap_or_else(|| { + Error::connect(io::Error::new( + io::ErrorKind::InvalidInput, + "could not resolve any addresses", + )) + })) + } + } +} + +async fn connect_with_timeout(connect: F, timeout: Option) -> Result +where + F: Future>, +{ + match timeout { + Some(timeout) => match time::timeout(timeout, connect).await { + Ok(Ok(socket)) => Ok(socket), + Ok(Err(e)) => Err(Error::connect(e)), + 
Err(_) => Err(Error::connect(io::Error::new( + io::ErrorKind::TimedOut, + "connection timed out", + ))), + }, + None => match connect.await { + Ok(socket) => Ok(socket), + Err(e) => Err(Error::connect(e)), + }, + } +} diff --git a/libs/proxy/tokio-postgres2/src/connect_tls.rs b/libs/proxy/tokio-postgres2/src/connect_tls.rs new file mode 100644 index 0000000000..64b0b68abc --- /dev/null +++ b/libs/proxy/tokio-postgres2/src/connect_tls.rs @@ -0,0 +1,48 @@ +use crate::config::SslMode; +use crate::maybe_tls_stream::MaybeTlsStream; +use crate::tls::private::ForcePrivateApi; +use crate::tls::TlsConnect; +use crate::Error; +use bytes::BytesMut; +use postgres_protocol2::message::frontend; +use tokio::io::{AsyncRead, AsyncReadExt, AsyncWrite, AsyncWriteExt}; + +pub async fn connect_tls( + mut stream: S, + mode: SslMode, + tls: T, +) -> Result, Error> +where + S: AsyncRead + AsyncWrite + Unpin, + T: TlsConnect, +{ + match mode { + SslMode::Disable => return Ok(MaybeTlsStream::Raw(stream)), + SslMode::Prefer if !tls.can_connect(ForcePrivateApi) => { + return Ok(MaybeTlsStream::Raw(stream)) + } + SslMode::Prefer | SslMode::Require => {} + } + + let mut buf = BytesMut::new(); + frontend::ssl_request(&mut buf); + stream.write_all(&buf).await.map_err(Error::io)?; + + let mut buf = [0]; + stream.read_exact(&mut buf).await.map_err(Error::io)?; + + if buf[0] != b'S' { + if SslMode::Require == mode { + return Err(Error::tls("server does not support TLS".into())); + } else { + return Ok(MaybeTlsStream::Raw(stream)); + } + } + + let stream = tls + .connect(stream) + .await + .map_err(|e| Error::tls(e.into()))?; + + Ok(MaybeTlsStream::Tls(stream)) +} diff --git a/libs/proxy/tokio-postgres2/src/connection.rs b/libs/proxy/tokio-postgres2/src/connection.rs new file mode 100644 index 0000000000..0aa5c77e22 --- /dev/null +++ b/libs/proxy/tokio-postgres2/src/connection.rs @@ -0,0 +1,323 @@ +use crate::codec::{BackendMessage, BackendMessages, FrontendMessage, PostgresCodec}; +use 
crate::error::DbError; +use crate::maybe_tls_stream::MaybeTlsStream; +use crate::{AsyncMessage, Error, Notification}; +use bytes::BytesMut; +use fallible_iterator::FallibleIterator; +use futures_util::{ready, Sink, Stream}; +use log::{info, trace}; +use postgres_protocol2::message::backend::Message; +use postgres_protocol2::message::frontend; +use std::collections::{HashMap, VecDeque}; +use std::future::Future; +use std::pin::Pin; +use std::task::{Context, Poll}; +use tokio::io::{AsyncRead, AsyncWrite}; +use tokio::sync::mpsc; +use tokio_util::codec::Framed; +use tokio_util::sync::PollSender; + +pub enum RequestMessages { + Single(FrontendMessage), +} + +pub struct Request { + pub messages: RequestMessages, + pub sender: mpsc::Sender, +} + +pub struct Response { + sender: PollSender, +} + +#[derive(PartialEq, Debug)] +enum State { + Active, + Terminating, + Closing, +} + +/// A connection to a PostgreSQL database. +/// +/// This is one half of what is returned when a new connection is established. It performs the actual IO with the +/// server, and should generally be spawned off onto an executor to run in the background. +/// +/// `Connection` implements `Future`, and only resolves when the connection is closed, either because a fatal error has +/// occurred, or because its associated `Client` has dropped and all outstanding work has completed. +#[must_use = "futures do nothing unless polled"] +pub struct Connection { + /// HACK: we need this in the Neon Proxy. + pub stream: Framed, PostgresCodec>, + /// HACK: we need this in the Neon Proxy to forward params. 
+ pub parameters: HashMap, + receiver: mpsc::UnboundedReceiver, + pending_request: Option, + pending_responses: VecDeque, + responses: VecDeque, + state: State, +} + +impl Connection +where + S: AsyncRead + AsyncWrite + Unpin, + T: AsyncRead + AsyncWrite + Unpin, +{ + pub(crate) fn new( + stream: Framed, PostgresCodec>, + pending_responses: VecDeque, + parameters: HashMap, + receiver: mpsc::UnboundedReceiver, + ) -> Connection { + Connection { + stream, + parameters, + receiver, + pending_request: None, + pending_responses, + responses: VecDeque::new(), + state: State::Active, + } + } + + fn poll_response( + &mut self, + cx: &mut Context<'_>, + ) -> Poll>> { + if let Some(message) = self.pending_responses.pop_front() { + trace!("retrying pending response"); + return Poll::Ready(Some(Ok(message))); + } + + Pin::new(&mut self.stream) + .poll_next(cx) + .map(|o| o.map(|r| r.map_err(Error::io))) + } + + fn poll_read(&mut self, cx: &mut Context<'_>) -> Result, Error> { + if self.state != State::Active { + trace!("poll_read: done"); + return Ok(None); + } + + loop { + let message = match self.poll_response(cx)? 
{ + Poll::Ready(Some(message)) => message, + Poll::Ready(None) => return Err(Error::closed()), + Poll::Pending => { + trace!("poll_read: waiting on response"); + return Ok(None); + } + }; + + let (mut messages, request_complete) = match message { + BackendMessage::Async(Message::NoticeResponse(body)) => { + let error = DbError::parse(&mut body.fields()).map_err(Error::parse)?; + return Ok(Some(AsyncMessage::Notice(error))); + } + BackendMessage::Async(Message::NotificationResponse(body)) => { + let notification = Notification { + process_id: body.process_id(), + channel: body.channel().map_err(Error::parse)?.to_string(), + payload: body.message().map_err(Error::parse)?.to_string(), + }; + return Ok(Some(AsyncMessage::Notification(notification))); + } + BackendMessage::Async(Message::ParameterStatus(body)) => { + self.parameters.insert( + body.name().map_err(Error::parse)?.to_string(), + body.value().map_err(Error::parse)?.to_string(), + ); + continue; + } + BackendMessage::Async(_) => unreachable!(), + BackendMessage::Normal { + messages, + request_complete, + } => (messages, request_complete), + }; + + let mut response = match self.responses.pop_front() { + Some(response) => response, + None => match messages.next().map_err(Error::parse)? 
{ + Some(Message::ErrorResponse(error)) => return Err(Error::db(error)), + _ => return Err(Error::unexpected_message()), + }, + }; + + match response.sender.poll_reserve(cx) { + Poll::Ready(Ok(())) => { + let _ = response.sender.send_item(messages); + if !request_complete { + self.responses.push_front(response); + } + } + Poll::Ready(Err(_)) => { + // we need to keep paging through the rest of the messages even if the receiver's hung up + if !request_complete { + self.responses.push_front(response); + } + } + Poll::Pending => { + self.responses.push_front(response); + self.pending_responses.push_back(BackendMessage::Normal { + messages, + request_complete, + }); + trace!("poll_read: waiting on sender"); + return Ok(None); + } + } + } + } + + fn poll_request(&mut self, cx: &mut Context<'_>) -> Poll> { + if let Some(messages) = self.pending_request.take() { + trace!("retrying pending request"); + return Poll::Ready(Some(messages)); + } + + if self.receiver.is_closed() { + return Poll::Ready(None); + } + + match self.receiver.poll_recv(cx) { + Poll::Ready(Some(request)) => { + trace!("polled new request"); + self.responses.push_back(Response { + sender: PollSender::new(request.sender), + }); + Poll::Ready(Some(request.messages)) + } + Poll::Ready(None) => Poll::Ready(None), + Poll::Pending => Poll::Pending, + } + } + + fn poll_write(&mut self, cx: &mut Context<'_>) -> Result { + loop { + if self.state == State::Closing { + trace!("poll_write: done"); + return Ok(false); + } + + if Pin::new(&mut self.stream) + .poll_ready(cx) + .map_err(Error::io)? 
+ .is_pending() + { + trace!("poll_write: waiting on socket"); + return Ok(false); + } + + let request = match self.poll_request(cx) { + Poll::Ready(Some(request)) => request, + Poll::Ready(None) if self.responses.is_empty() && self.state == State::Active => { + trace!("poll_write: at eof, terminating"); + self.state = State::Terminating; + let mut request = BytesMut::new(); + frontend::terminate(&mut request); + RequestMessages::Single(FrontendMessage::Raw(request.freeze())) + } + Poll::Ready(None) => { + trace!( + "poll_write: at eof, pending responses {}", + self.responses.len() + ); + return Ok(true); + } + Poll::Pending => { + trace!("poll_write: waiting on request"); + return Ok(true); + } + }; + + match request { + RequestMessages::Single(request) => { + Pin::new(&mut self.stream) + .start_send(request) + .map_err(Error::io)?; + if self.state == State::Terminating { + trace!("poll_write: sent eof, closing"); + self.state = State::Closing; + } + } + } + } + } + + fn poll_flush(&mut self, cx: &mut Context<'_>) -> Result<(), Error> { + match Pin::new(&mut self.stream) + .poll_flush(cx) + .map_err(Error::io)? + { + Poll::Ready(()) => trace!("poll_flush: flushed"), + Poll::Pending => trace!("poll_flush: waiting on socket"), + } + Ok(()) + } + + fn poll_shutdown(&mut self, cx: &mut Context<'_>) -> Poll> { + if self.state != State::Closing { + return Poll::Pending; + } + + match Pin::new(&mut self.stream) + .poll_close(cx) + .map_err(Error::io)? + { + Poll::Ready(()) => { + trace!("poll_shutdown: complete"); + Poll::Ready(Ok(())) + } + Poll::Pending => { + trace!("poll_shutdown: waiting on socket"); + Poll::Pending + } + } + } + + /// Returns the value of a runtime parameter for this connection. + pub fn parameter(&self, name: &str) -> Option<&str> { + self.parameters.get(name).map(|s| &**s) + } + + /// Polls for asynchronous messages from the server. + /// + /// The server can send notices as well as notifications asynchronously to the client. 
Applications that wish to + /// examine those messages should use this method to drive the connection rather than its `Future` implementation. + pub fn poll_message( + &mut self, + cx: &mut Context<'_>, + ) -> Poll>> { + let message = self.poll_read(cx)?; + let want_flush = self.poll_write(cx)?; + if want_flush { + self.poll_flush(cx)?; + } + match message { + Some(message) => Poll::Ready(Some(Ok(message))), + None => match self.poll_shutdown(cx) { + Poll::Ready(Ok(())) => Poll::Ready(None), + Poll::Ready(Err(e)) => Poll::Ready(Some(Err(e))), + Poll::Pending => Poll::Pending, + }, + } + } +} + +impl Future for Connection +where + S: AsyncRead + AsyncWrite + Unpin, + T: AsyncRead + AsyncWrite + Unpin, +{ + type Output = Result<(), Error>; + + fn poll(mut self: Pin<&mut Self>, cx: &mut Context<'_>) -> Poll> { + while let Some(message) = ready!(self.poll_message(cx)?) { + if let AsyncMessage::Notice(notice) = message { + info!("{}: {}", notice.severity(), notice.message()); + } + } + Poll::Ready(Ok(())) + } +} diff --git a/libs/proxy/tokio-postgres2/src/error/mod.rs b/libs/proxy/tokio-postgres2/src/error/mod.rs new file mode 100644 index 0000000000..922c348525 --- /dev/null +++ b/libs/proxy/tokio-postgres2/src/error/mod.rs @@ -0,0 +1,495 @@ +//! Errors. + +use fallible_iterator::FallibleIterator; +use postgres_protocol2::message::backend::{ErrorFields, ErrorResponseBody}; +use std::error::{self, Error as _Error}; +use std::fmt; +use std::io; + +pub use self::sqlstate::*; + +#[allow(clippy::unreadable_literal)] +mod sqlstate; + +/// The severity of a Postgres error or notice. 
+#[derive(Debug, Copy, Clone, PartialEq, Eq)] +pub enum Severity { + /// PANIC + Panic, + /// FATAL + Fatal, + /// ERROR + Error, + /// WARNING + Warning, + /// NOTICE + Notice, + /// DEBUG + Debug, + /// INFO + Info, + /// LOG + Log, +} + +impl fmt::Display for Severity { + fn fmt(&self, fmt: &mut fmt::Formatter<'_>) -> fmt::Result { + let s = match *self { + Severity::Panic => "PANIC", + Severity::Fatal => "FATAL", + Severity::Error => "ERROR", + Severity::Warning => "WARNING", + Severity::Notice => "NOTICE", + Severity::Debug => "DEBUG", + Severity::Info => "INFO", + Severity::Log => "LOG", + }; + fmt.write_str(s) + } +} + +impl Severity { + fn from_str(s: &str) -> Option { + match s { + "PANIC" => Some(Severity::Panic), + "FATAL" => Some(Severity::Fatal), + "ERROR" => Some(Severity::Error), + "WARNING" => Some(Severity::Warning), + "NOTICE" => Some(Severity::Notice), + "DEBUG" => Some(Severity::Debug), + "INFO" => Some(Severity::Info), + "LOG" => Some(Severity::Log), + _ => None, + } + } +} + +/// A Postgres error or notice. 
+#[derive(Debug, Clone, PartialEq, Eq)] +pub struct DbError { + severity: String, + parsed_severity: Option, + code: SqlState, + message: String, + detail: Option, + hint: Option, + position: Option, + where_: Option, + schema: Option, + table: Option, + column: Option, + datatype: Option, + constraint: Option, + file: Option, + line: Option, + routine: Option, +} + +impl DbError { + pub(crate) fn parse(fields: &mut ErrorFields<'_>) -> io::Result { + let mut severity = None; + let mut parsed_severity = None; + let mut code = None; + let mut message = None; + let mut detail = None; + let mut hint = None; + let mut normal_position = None; + let mut internal_position = None; + let mut internal_query = None; + let mut where_ = None; + let mut schema = None; + let mut table = None; + let mut column = None; + let mut datatype = None; + let mut constraint = None; + let mut file = None; + let mut line = None; + let mut routine = None; + + while let Some(field) = fields.next()? { + match field.type_() { + b'S' => severity = Some(field.value().to_owned()), + b'C' => code = Some(SqlState::from_code(field.value())), + b'M' => message = Some(field.value().to_owned()), + b'D' => detail = Some(field.value().to_owned()), + b'H' => hint = Some(field.value().to_owned()), + b'P' => { + normal_position = Some(field.value().parse::().map_err(|_| { + io::Error::new( + io::ErrorKind::InvalidInput, + "`P` field did not contain an integer", + ) + })?); + } + b'p' => { + internal_position = Some(field.value().parse::().map_err(|_| { + io::Error::new( + io::ErrorKind::InvalidInput, + "`p` field did not contain an integer", + ) + })?); + } + b'q' => internal_query = Some(field.value().to_owned()), + b'W' => where_ = Some(field.value().to_owned()), + b's' => schema = Some(field.value().to_owned()), + b't' => table = Some(field.value().to_owned()), + b'c' => column = Some(field.value().to_owned()), + b'd' => datatype = Some(field.value().to_owned()), + b'n' => constraint = 
Some(field.value().to_owned()), + b'F' => file = Some(field.value().to_owned()), + b'L' => { + line = Some(field.value().parse::().map_err(|_| { + io::Error::new( + io::ErrorKind::InvalidInput, + "`L` field did not contain an integer", + ) + })?); + } + b'R' => routine = Some(field.value().to_owned()), + b'V' => { + parsed_severity = Some(Severity::from_str(field.value()).ok_or_else(|| { + io::Error::new( + io::ErrorKind::InvalidInput, + "`V` field contained an invalid value", + ) + })?); + } + _ => {} + } + } + + Ok(DbError { + severity: severity + .ok_or_else(|| io::Error::new(io::ErrorKind::InvalidInput, "`S` field missing"))?, + parsed_severity, + code: code + .ok_or_else(|| io::Error::new(io::ErrorKind::InvalidInput, "`C` field missing"))?, + message: message + .ok_or_else(|| io::Error::new(io::ErrorKind::InvalidInput, "`M` field missing"))?, + detail, + hint, + position: match normal_position { + Some(position) => Some(ErrorPosition::Original(position)), + None => match internal_position { + Some(position) => Some(ErrorPosition::Internal { + position, + query: internal_query.ok_or_else(|| { + io::Error::new( + io::ErrorKind::InvalidInput, + "`q` field missing but `p` field present", + ) + })?, + }), + None => None, + }, + }, + where_, + schema, + table, + column, + datatype, + constraint, + file, + line, + routine, + }) + } + + /// The field contents are ERROR, FATAL, or PANIC (in an error message), + /// or WARNING, NOTICE, DEBUG, INFO, or LOG (in a notice message), or a + /// localized translation of one of these. + pub fn severity(&self) -> &str { + &self.severity + } + + /// A parsed, nonlocalized version of `severity`. (PostgreSQL 9.6+) + pub fn parsed_severity(&self) -> Option { + self.parsed_severity + } + + /// The SQLSTATE code for the error. + pub fn code(&self) -> &SqlState { + &self.code + } + + /// The primary human-readable error message. + /// + /// This should be accurate but terse (typically one line). 
+ pub fn message(&self) -> &str { + &self.message + } + + /// An optional secondary error message carrying more detail about the + /// problem. + /// + /// Might run to multiple lines. + pub fn detail(&self) -> Option<&str> { + self.detail.as_deref() + } + + /// An optional suggestion what to do about the problem. + /// + /// This is intended to differ from `detail` in that it offers advice + /// (potentially inappropriate) rather than hard facts. Might run to + /// multiple lines. + pub fn hint(&self) -> Option<&str> { + self.hint.as_deref() + } + + /// An optional error cursor position into either the original query string + /// or an internally generated query. + pub fn position(&self) -> Option<&ErrorPosition> { + self.position.as_ref() + } + + /// An indication of the context in which the error occurred. + /// + /// Presently this includes a call stack traceback of active procedural + /// language functions and internally-generated queries. The trace is one + /// entry per line, most recent first. + pub fn where_(&self) -> Option<&str> { + self.where_.as_deref() + } + + /// If the error was associated with a specific database object, the name + /// of the schema containing that object, if any. (PostgreSQL 9.3+) + pub fn schema(&self) -> Option<&str> { + self.schema.as_deref() + } + + /// If the error was associated with a specific table, the name of the + /// table. (Refer to the schema name field for the name of the table's + /// schema.) (PostgreSQL 9.3+) + pub fn table(&self) -> Option<&str> { + self.table.as_deref() + } + + /// If the error was associated with a specific table column, the name of + /// the column. + /// + /// (Refer to the schema and table name fields to identify the table.) + /// (PostgreSQL 9.3+) + pub fn column(&self) -> Option<&str> { + self.column.as_deref() + } + + /// If the error was associated with a specific data type, the name of the + /// data type. (Refer to the schema name field for the name of the data + /// type's schema.) 
(PostgreSQL 9.3+) + pub fn datatype(&self) -> Option<&str> { + self.datatype.as_deref() + } + + /// If the error was associated with a specific constraint, the name of the + /// constraint. + /// + /// Refer to fields listed above for the associated table or domain. + /// (For this purpose, indexes are treated as constraints, even if they + /// weren't created with constraint syntax.) (PostgreSQL 9.3+) + pub fn constraint(&self) -> Option<&str> { + self.constraint.as_deref() + } + + /// The file name of the source-code location where the error was reported. + pub fn file(&self) -> Option<&str> { + self.file.as_deref() + } + + /// The line number of the source-code location where the error was + /// reported. + pub fn line(&self) -> Option { + self.line + } + + /// The name of the source-code routine reporting the error. + pub fn routine(&self) -> Option<&str> { + self.routine.as_deref() + } +} + +impl fmt::Display for DbError { + fn fmt(&self, fmt: &mut fmt::Formatter<'_>) -> fmt::Result { + write!(fmt, "{}: {}", self.severity, self.message)?; + if let Some(detail) = &self.detail { + write!(fmt, "\nDETAIL: {}", detail)?; + } + if let Some(hint) = &self.hint { + write!(fmt, "\nHINT: {}", hint)?; + } + Ok(()) + } +} + +impl error::Error for DbError {} + +/// Represents the position of an error in a query. +#[derive(Clone, PartialEq, Eq, Debug)] +pub enum ErrorPosition { + /// A position in the original query. + Original(u32), + /// A position in an internally generated query. + Internal { + /// The byte position. + position: u32, + /// A query generated by the Postgres server. + query: String, + }, +} + +#[derive(Debug, PartialEq)] +enum Kind { + Io, + UnexpectedMessage, + Tls, + ToSql(usize), + FromSql(usize), + Column(String), + Closed, + Db, + Parse, + Encode, + Authentication, + Config, + Connect, + Timeout, +} + +struct ErrorInner { + kind: Kind, + cause: Option>, +} + +/// An error communicating with the Postgres server. 
+pub struct Error(Box); + +impl fmt::Debug for Error { + fn fmt(&self, fmt: &mut fmt::Formatter<'_>) -> fmt::Result { + fmt.debug_struct("Error") + .field("kind", &self.0.kind) + .field("cause", &self.0.cause) + .finish() + } +} + +impl fmt::Display for Error { + fn fmt(&self, fmt: &mut fmt::Formatter<'_>) -> fmt::Result { + match &self.0.kind { + Kind::Io => fmt.write_str("error communicating with the server")?, + Kind::UnexpectedMessage => fmt.write_str("unexpected message from server")?, + Kind::Tls => fmt.write_str("error performing TLS handshake")?, + Kind::ToSql(idx) => write!(fmt, "error serializing parameter {}", idx)?, + Kind::FromSql(idx) => write!(fmt, "error deserializing column {}", idx)?, + Kind::Column(column) => write!(fmt, "invalid column `{}`", column)?, + Kind::Closed => fmt.write_str("connection closed")?, + Kind::Db => fmt.write_str("db error")?, + Kind::Parse => fmt.write_str("error parsing response from server")?, + Kind::Encode => fmt.write_str("error encoding message to server")?, + Kind::Authentication => fmt.write_str("authentication error")?, + Kind::Config => fmt.write_str("invalid configuration")?, + Kind::Connect => fmt.write_str("error connecting to server")?, + Kind::Timeout => fmt.write_str("timeout waiting for server")?, + }; + if let Some(ref cause) = self.0.cause { + write!(fmt, ": {}", cause)?; + } + Ok(()) + } +} + +impl error::Error for Error { + fn source(&self) -> Option<&(dyn error::Error + 'static)> { + self.0.cause.as_ref().map(|e| &**e as _) + } +} + +impl Error { + /// Consumes the error, returning its cause. + pub fn into_source(self) -> Option> { + self.0.cause + } + + /// Returns the source of this error if it was a `DbError`. + /// + /// This is a simple convenience method. + pub fn as_db_error(&self) -> Option<&DbError> { + self.source().and_then(|e| e.downcast_ref::()) + } + + /// Determines if the error was associated with closed connection. 
+ pub fn is_closed(&self) -> bool { + self.0.kind == Kind::Closed + } + + /// Returns the SQLSTATE error code associated with the error. + /// + /// This is a convenience method that downcasts the cause to a `DbError` and returns its code. + pub fn code(&self) -> Option<&SqlState> { + self.as_db_error().map(DbError::code) + } + + fn new(kind: Kind, cause: Option>) -> Error { + Error(Box::new(ErrorInner { kind, cause })) + } + + pub(crate) fn closed() -> Error { + Error::new(Kind::Closed, None) + } + + pub(crate) fn unexpected_message() -> Error { + Error::new(Kind::UnexpectedMessage, None) + } + + #[allow(clippy::needless_pass_by_value)] + pub(crate) fn db(error: ErrorResponseBody) -> Error { + match DbError::parse(&mut error.fields()) { + Ok(e) => Error::new(Kind::Db, Some(Box::new(e))), + Err(e) => Error::new(Kind::Parse, Some(Box::new(e))), + } + } + + pub(crate) fn parse(e: io::Error) -> Error { + Error::new(Kind::Parse, Some(Box::new(e))) + } + + pub(crate) fn encode(e: io::Error) -> Error { + Error::new(Kind::Encode, Some(Box::new(e))) + } + + #[allow(clippy::wrong_self_convention)] + pub(crate) fn to_sql(e: Box, idx: usize) -> Error { + Error::new(Kind::ToSql(idx), Some(e)) + } + + pub(crate) fn from_sql(e: Box, idx: usize) -> Error { + Error::new(Kind::FromSql(idx), Some(e)) + } + + pub(crate) fn column(column: String) -> Error { + Error::new(Kind::Column(column), None) + } + + pub(crate) fn tls(e: Box) -> Error { + Error::new(Kind::Tls, Some(e)) + } + + pub(crate) fn io(e: io::Error) -> Error { + Error::new(Kind::Io, Some(Box::new(e))) + } + + pub(crate) fn authentication(e: Box) -> Error { + Error::new(Kind::Authentication, Some(e)) + } + + pub(crate) fn config(e: Box) -> Error { + Error::new(Kind::Config, Some(e)) + } + + pub(crate) fn connect(e: io::Error) -> Error { + Error::new(Kind::Connect, Some(Box::new(e))) + } + + #[doc(hidden)] + pub fn __private_api_timeout() -> Error { + Error::new(Kind::Timeout, None) + } +} diff --git 
a/libs/proxy/tokio-postgres2/src/error/sqlstate.rs b/libs/proxy/tokio-postgres2/src/error/sqlstate.rs new file mode 100644 index 0000000000..13a1d75f95 --- /dev/null +++ b/libs/proxy/tokio-postgres2/src/error/sqlstate.rs @@ -0,0 +1,1670 @@ +// Autogenerated file - DO NOT EDIT + +/// A SQLSTATE error code +#[derive(PartialEq, Eq, Clone, Debug)] +pub struct SqlState(Inner); + +impl SqlState { + /// Creates a `SqlState` from its error code. + pub fn from_code(s: &str) -> SqlState { + match SQLSTATE_MAP.get(s) { + Some(state) => state.clone(), + None => SqlState(Inner::Other(s.into())), + } + } + + /// Returns the error code corresponding to the `SqlState`. + pub fn code(&self) -> &str { + match &self.0 { + Inner::E00000 => "00000", + Inner::E01000 => "01000", + Inner::E0100C => "0100C", + Inner::E01008 => "01008", + Inner::E01003 => "01003", + Inner::E01007 => "01007", + Inner::E01006 => "01006", + Inner::E01004 => "01004", + Inner::E01P01 => "01P01", + Inner::E02000 => "02000", + Inner::E02001 => "02001", + Inner::E03000 => "03000", + Inner::E08000 => "08000", + Inner::E08003 => "08003", + Inner::E08006 => "08006", + Inner::E08001 => "08001", + Inner::E08004 => "08004", + Inner::E08007 => "08007", + Inner::E08P01 => "08P01", + Inner::E09000 => "09000", + Inner::E0A000 => "0A000", + Inner::E0B000 => "0B000", + Inner::E0F000 => "0F000", + Inner::E0F001 => "0F001", + Inner::E0L000 => "0L000", + Inner::E0LP01 => "0LP01", + Inner::E0P000 => "0P000", + Inner::E0Z000 => "0Z000", + Inner::E0Z002 => "0Z002", + Inner::E20000 => "20000", + Inner::E21000 => "21000", + Inner::E22000 => "22000", + Inner::E2202E => "2202E", + Inner::E22021 => "22021", + Inner::E22008 => "22008", + Inner::E22012 => "22012", + Inner::E22005 => "22005", + Inner::E2200B => "2200B", + Inner::E22022 => "22022", + Inner::E22015 => "22015", + Inner::E2201E => "2201E", + Inner::E22014 => "22014", + Inner::E22016 => "22016", + Inner::E2201F => "2201F", + Inner::E2201G => "2201G", + Inner::E22018 => "22018", 
+ Inner::E22007 => "22007", + Inner::E22019 => "22019", + Inner::E2200D => "2200D", + Inner::E22025 => "22025", + Inner::E22P06 => "22P06", + Inner::E22010 => "22010", + Inner::E22023 => "22023", + Inner::E22013 => "22013", + Inner::E2201B => "2201B", + Inner::E2201W => "2201W", + Inner::E2201X => "2201X", + Inner::E2202H => "2202H", + Inner::E2202G => "2202G", + Inner::E22009 => "22009", + Inner::E2200C => "2200C", + Inner::E2200G => "2200G", + Inner::E22004 => "22004", + Inner::E22002 => "22002", + Inner::E22003 => "22003", + Inner::E2200H => "2200H", + Inner::E22026 => "22026", + Inner::E22001 => "22001", + Inner::E22011 => "22011", + Inner::E22027 => "22027", + Inner::E22024 => "22024", + Inner::E2200F => "2200F", + Inner::E22P01 => "22P01", + Inner::E22P02 => "22P02", + Inner::E22P03 => "22P03", + Inner::E22P04 => "22P04", + Inner::E22P05 => "22P05", + Inner::E2200L => "2200L", + Inner::E2200M => "2200M", + Inner::E2200N => "2200N", + Inner::E2200S => "2200S", + Inner::E2200T => "2200T", + Inner::E22030 => "22030", + Inner::E22031 => "22031", + Inner::E22032 => "22032", + Inner::E22033 => "22033", + Inner::E22034 => "22034", + Inner::E22035 => "22035", + Inner::E22036 => "22036", + Inner::E22037 => "22037", + Inner::E22038 => "22038", + Inner::E22039 => "22039", + Inner::E2203A => "2203A", + Inner::E2203B => "2203B", + Inner::E2203C => "2203C", + Inner::E2203D => "2203D", + Inner::E2203E => "2203E", + Inner::E2203F => "2203F", + Inner::E2203G => "2203G", + Inner::E23000 => "23000", + Inner::E23001 => "23001", + Inner::E23502 => "23502", + Inner::E23503 => "23503", + Inner::E23505 => "23505", + Inner::E23514 => "23514", + Inner::E23P01 => "23P01", + Inner::E24000 => "24000", + Inner::E25000 => "25000", + Inner::E25001 => "25001", + Inner::E25002 => "25002", + Inner::E25008 => "25008", + Inner::E25003 => "25003", + Inner::E25004 => "25004", + Inner::E25005 => "25005", + Inner::E25006 => "25006", + Inner::E25007 => "25007", + Inner::E25P01 => "25P01", + 
Inner::E25P02 => "25P02", + Inner::E25P03 => "25P03", + Inner::E26000 => "26000", + Inner::E27000 => "27000", + Inner::E28000 => "28000", + Inner::E28P01 => "28P01", + Inner::E2B000 => "2B000", + Inner::E2BP01 => "2BP01", + Inner::E2D000 => "2D000", + Inner::E2F000 => "2F000", + Inner::E2F005 => "2F005", + Inner::E2F002 => "2F002", + Inner::E2F003 => "2F003", + Inner::E2F004 => "2F004", + Inner::E34000 => "34000", + Inner::E38000 => "38000", + Inner::E38001 => "38001", + Inner::E38002 => "38002", + Inner::E38003 => "38003", + Inner::E38004 => "38004", + Inner::E39000 => "39000", + Inner::E39001 => "39001", + Inner::E39004 => "39004", + Inner::E39P01 => "39P01", + Inner::E39P02 => "39P02", + Inner::E39P03 => "39P03", + Inner::E3B000 => "3B000", + Inner::E3B001 => "3B001", + Inner::E3D000 => "3D000", + Inner::E3F000 => "3F000", + Inner::E40000 => "40000", + Inner::E40002 => "40002", + Inner::E40001 => "40001", + Inner::E40003 => "40003", + Inner::E40P01 => "40P01", + Inner::E42000 => "42000", + Inner::E42601 => "42601", + Inner::E42501 => "42501", + Inner::E42846 => "42846", + Inner::E42803 => "42803", + Inner::E42P20 => "42P20", + Inner::E42P19 => "42P19", + Inner::E42830 => "42830", + Inner::E42602 => "42602", + Inner::E42622 => "42622", + Inner::E42939 => "42939", + Inner::E42804 => "42804", + Inner::E42P18 => "42P18", + Inner::E42P21 => "42P21", + Inner::E42P22 => "42P22", + Inner::E42809 => "42809", + Inner::E428C9 => "428C9", + Inner::E42703 => "42703", + Inner::E42883 => "42883", + Inner::E42P01 => "42P01", + Inner::E42P02 => "42P02", + Inner::E42704 => "42704", + Inner::E42701 => "42701", + Inner::E42P03 => "42P03", + Inner::E42P04 => "42P04", + Inner::E42723 => "42723", + Inner::E42P05 => "42P05", + Inner::E42P06 => "42P06", + Inner::E42P07 => "42P07", + Inner::E42712 => "42712", + Inner::E42710 => "42710", + Inner::E42702 => "42702", + Inner::E42725 => "42725", + Inner::E42P08 => "42P08", + Inner::E42P09 => "42P09", + Inner::E42P10 => "42P10", + 
Inner::E42611 => "42611", + Inner::E42P11 => "42P11", + Inner::E42P12 => "42P12", + Inner::E42P13 => "42P13", + Inner::E42P14 => "42P14", + Inner::E42P15 => "42P15", + Inner::E42P16 => "42P16", + Inner::E42P17 => "42P17", + Inner::E44000 => "44000", + Inner::E53000 => "53000", + Inner::E53100 => "53100", + Inner::E53200 => "53200", + Inner::E53300 => "53300", + Inner::E53400 => "53400", + Inner::E54000 => "54000", + Inner::E54001 => "54001", + Inner::E54011 => "54011", + Inner::E54023 => "54023", + Inner::E55000 => "55000", + Inner::E55006 => "55006", + Inner::E55P02 => "55P02", + Inner::E55P03 => "55P03", + Inner::E55P04 => "55P04", + Inner::E57000 => "57000", + Inner::E57014 => "57014", + Inner::E57P01 => "57P01", + Inner::E57P02 => "57P02", + Inner::E57P03 => "57P03", + Inner::E57P04 => "57P04", + Inner::E57P05 => "57P05", + Inner::E58000 => "58000", + Inner::E58030 => "58030", + Inner::E58P01 => "58P01", + Inner::E58P02 => "58P02", + Inner::E72000 => "72000", + Inner::EF0000 => "F0000", + Inner::EF0001 => "F0001", + Inner::EHV000 => "HV000", + Inner::EHV005 => "HV005", + Inner::EHV002 => "HV002", + Inner::EHV010 => "HV010", + Inner::EHV021 => "HV021", + Inner::EHV024 => "HV024", + Inner::EHV007 => "HV007", + Inner::EHV008 => "HV008", + Inner::EHV004 => "HV004", + Inner::EHV006 => "HV006", + Inner::EHV091 => "HV091", + Inner::EHV00B => "HV00B", + Inner::EHV00C => "HV00C", + Inner::EHV00D => "HV00D", + Inner::EHV090 => "HV090", + Inner::EHV00A => "HV00A", + Inner::EHV009 => "HV009", + Inner::EHV014 => "HV014", + Inner::EHV001 => "HV001", + Inner::EHV00P => "HV00P", + Inner::EHV00J => "HV00J", + Inner::EHV00K => "HV00K", + Inner::EHV00Q => "HV00Q", + Inner::EHV00R => "HV00R", + Inner::EHV00L => "HV00L", + Inner::EHV00M => "HV00M", + Inner::EHV00N => "HV00N", + Inner::EP0000 => "P0000", + Inner::EP0001 => "P0001", + Inner::EP0002 => "P0002", + Inner::EP0003 => "P0003", + Inner::EP0004 => "P0004", + Inner::EXX000 => "XX000", + Inner::EXX001 => "XX001", + 
Inner::EXX002 => "XX002", + Inner::Other(code) => code, + } + } + + /// 00000 + pub const SUCCESSFUL_COMPLETION: SqlState = SqlState(Inner::E00000); + + /// 01000 + pub const WARNING: SqlState = SqlState(Inner::E01000); + + /// 0100C + pub const WARNING_DYNAMIC_RESULT_SETS_RETURNED: SqlState = SqlState(Inner::E0100C); + + /// 01008 + pub const WARNING_IMPLICIT_ZERO_BIT_PADDING: SqlState = SqlState(Inner::E01008); + + /// 01003 + pub const WARNING_NULL_VALUE_ELIMINATED_IN_SET_FUNCTION: SqlState = SqlState(Inner::E01003); + + /// 01007 + pub const WARNING_PRIVILEGE_NOT_GRANTED: SqlState = SqlState(Inner::E01007); + + /// 01006 + pub const WARNING_PRIVILEGE_NOT_REVOKED: SqlState = SqlState(Inner::E01006); + + /// 01004 + pub const WARNING_STRING_DATA_RIGHT_TRUNCATION: SqlState = SqlState(Inner::E01004); + + /// 01P01 + pub const WARNING_DEPRECATED_FEATURE: SqlState = SqlState(Inner::E01P01); + + /// 02000 + pub const NO_DATA: SqlState = SqlState(Inner::E02000); + + /// 02001 + pub const NO_ADDITIONAL_DYNAMIC_RESULT_SETS_RETURNED: SqlState = SqlState(Inner::E02001); + + /// 03000 + pub const SQL_STATEMENT_NOT_YET_COMPLETE: SqlState = SqlState(Inner::E03000); + + /// 08000 + pub const CONNECTION_EXCEPTION: SqlState = SqlState(Inner::E08000); + + /// 08003 + pub const CONNECTION_DOES_NOT_EXIST: SqlState = SqlState(Inner::E08003); + + /// 08006 + pub const CONNECTION_FAILURE: SqlState = SqlState(Inner::E08006); + + /// 08001 + pub const SQLCLIENT_UNABLE_TO_ESTABLISH_SQLCONNECTION: SqlState = SqlState(Inner::E08001); + + /// 08004 + pub const SQLSERVER_REJECTED_ESTABLISHMENT_OF_SQLCONNECTION: SqlState = SqlState(Inner::E08004); + + /// 08007 + pub const TRANSACTION_RESOLUTION_UNKNOWN: SqlState = SqlState(Inner::E08007); + + /// 08P01 + pub const PROTOCOL_VIOLATION: SqlState = SqlState(Inner::E08P01); + + /// 09000 + pub const TRIGGERED_ACTION_EXCEPTION: SqlState = SqlState(Inner::E09000); + + /// 0A000 + pub const FEATURE_NOT_SUPPORTED: SqlState = SqlState(Inner::E0A000); 
+ + /// 0B000 + pub const INVALID_TRANSACTION_INITIATION: SqlState = SqlState(Inner::E0B000); + + /// 0F000 + pub const LOCATOR_EXCEPTION: SqlState = SqlState(Inner::E0F000); + + /// 0F001 + pub const L_E_INVALID_SPECIFICATION: SqlState = SqlState(Inner::E0F001); + + /// 0L000 + pub const INVALID_GRANTOR: SqlState = SqlState(Inner::E0L000); + + /// 0LP01 + pub const INVALID_GRANT_OPERATION: SqlState = SqlState(Inner::E0LP01); + + /// 0P000 + pub const INVALID_ROLE_SPECIFICATION: SqlState = SqlState(Inner::E0P000); + + /// 0Z000 + pub const DIAGNOSTICS_EXCEPTION: SqlState = SqlState(Inner::E0Z000); + + /// 0Z002 + pub const STACKED_DIAGNOSTICS_ACCESSED_WITHOUT_ACTIVE_HANDLER: SqlState = + SqlState(Inner::E0Z002); + + /// 20000 + pub const CASE_NOT_FOUND: SqlState = SqlState(Inner::E20000); + + /// 21000 + pub const CARDINALITY_VIOLATION: SqlState = SqlState(Inner::E21000); + + /// 22000 + pub const DATA_EXCEPTION: SqlState = SqlState(Inner::E22000); + + /// 2202E + pub const ARRAY_ELEMENT_ERROR: SqlState = SqlState(Inner::E2202E); + + /// 2202E + pub const ARRAY_SUBSCRIPT_ERROR: SqlState = SqlState(Inner::E2202E); + + /// 22021 + pub const CHARACTER_NOT_IN_REPERTOIRE: SqlState = SqlState(Inner::E22021); + + /// 22008 + pub const DATETIME_FIELD_OVERFLOW: SqlState = SqlState(Inner::E22008); + + /// 22008 + pub const DATETIME_VALUE_OUT_OF_RANGE: SqlState = SqlState(Inner::E22008); + + /// 22012 + pub const DIVISION_BY_ZERO: SqlState = SqlState(Inner::E22012); + + /// 22005 + pub const ERROR_IN_ASSIGNMENT: SqlState = SqlState(Inner::E22005); + + /// 2200B + pub const ESCAPE_CHARACTER_CONFLICT: SqlState = SqlState(Inner::E2200B); + + /// 22022 + pub const INDICATOR_OVERFLOW: SqlState = SqlState(Inner::E22022); + + /// 22015 + pub const INTERVAL_FIELD_OVERFLOW: SqlState = SqlState(Inner::E22015); + + /// 2201E + pub const INVALID_ARGUMENT_FOR_LOG: SqlState = SqlState(Inner::E2201E); + + /// 22014 + pub const INVALID_ARGUMENT_FOR_NTILE: SqlState = SqlState(Inner::E22014); 
+ + /// 22016 + pub const INVALID_ARGUMENT_FOR_NTH_VALUE: SqlState = SqlState(Inner::E22016); + + /// 2201F + pub const INVALID_ARGUMENT_FOR_POWER_FUNCTION: SqlState = SqlState(Inner::E2201F); + + /// 2201G + pub const INVALID_ARGUMENT_FOR_WIDTH_BUCKET_FUNCTION: SqlState = SqlState(Inner::E2201G); + + /// 22018 + pub const INVALID_CHARACTER_VALUE_FOR_CAST: SqlState = SqlState(Inner::E22018); + + /// 22007 + pub const INVALID_DATETIME_FORMAT: SqlState = SqlState(Inner::E22007); + + /// 22019 + pub const INVALID_ESCAPE_CHARACTER: SqlState = SqlState(Inner::E22019); + + /// 2200D + pub const INVALID_ESCAPE_OCTET: SqlState = SqlState(Inner::E2200D); + + /// 22025 + pub const INVALID_ESCAPE_SEQUENCE: SqlState = SqlState(Inner::E22025); + + /// 22P06 + pub const NONSTANDARD_USE_OF_ESCAPE_CHARACTER: SqlState = SqlState(Inner::E22P06); + + /// 22010 + pub const INVALID_INDICATOR_PARAMETER_VALUE: SqlState = SqlState(Inner::E22010); + + /// 22023 + pub const INVALID_PARAMETER_VALUE: SqlState = SqlState(Inner::E22023); + + /// 22013 + pub const INVALID_PRECEDING_OR_FOLLOWING_SIZE: SqlState = SqlState(Inner::E22013); + + /// 2201B + pub const INVALID_REGULAR_EXPRESSION: SqlState = SqlState(Inner::E2201B); + + /// 2201W + pub const INVALID_ROW_COUNT_IN_LIMIT_CLAUSE: SqlState = SqlState(Inner::E2201W); + + /// 2201X + pub const INVALID_ROW_COUNT_IN_RESULT_OFFSET_CLAUSE: SqlState = SqlState(Inner::E2201X); + + /// 2202H + pub const INVALID_TABLESAMPLE_ARGUMENT: SqlState = SqlState(Inner::E2202H); + + /// 2202G + pub const INVALID_TABLESAMPLE_REPEAT: SqlState = SqlState(Inner::E2202G); + + /// 22009 + pub const INVALID_TIME_ZONE_DISPLACEMENT_VALUE: SqlState = SqlState(Inner::E22009); + + /// 2200C + pub const INVALID_USE_OF_ESCAPE_CHARACTER: SqlState = SqlState(Inner::E2200C); + + /// 2200G + pub const MOST_SPECIFIC_TYPE_MISMATCH: SqlState = SqlState(Inner::E2200G); + + /// 22004 + pub const NULL_VALUE_NOT_ALLOWED: SqlState = SqlState(Inner::E22004); + + /// 22002 + pub const 
NULL_VALUE_NO_INDICATOR_PARAMETER: SqlState = SqlState(Inner::E22002); + + /// 22003 + pub const NUMERIC_VALUE_OUT_OF_RANGE: SqlState = SqlState(Inner::E22003); + + /// 2200H + pub const SEQUENCE_GENERATOR_LIMIT_EXCEEDED: SqlState = SqlState(Inner::E2200H); + + /// 22026 + pub const STRING_DATA_LENGTH_MISMATCH: SqlState = SqlState(Inner::E22026); + + /// 22001 + pub const STRING_DATA_RIGHT_TRUNCATION: SqlState = SqlState(Inner::E22001); + + /// 22011 + pub const SUBSTRING_ERROR: SqlState = SqlState(Inner::E22011); + + /// 22027 + pub const TRIM_ERROR: SqlState = SqlState(Inner::E22027); + + /// 22024 + pub const UNTERMINATED_C_STRING: SqlState = SqlState(Inner::E22024); + + /// 2200F + pub const ZERO_LENGTH_CHARACTER_STRING: SqlState = SqlState(Inner::E2200F); + + /// 22P01 + pub const FLOATING_POINT_EXCEPTION: SqlState = SqlState(Inner::E22P01); + + /// 22P02 + pub const INVALID_TEXT_REPRESENTATION: SqlState = SqlState(Inner::E22P02); + + /// 22P03 + pub const INVALID_BINARY_REPRESENTATION: SqlState = SqlState(Inner::E22P03); + + /// 22P04 + pub const BAD_COPY_FILE_FORMAT: SqlState = SqlState(Inner::E22P04); + + /// 22P05 + pub const UNTRANSLATABLE_CHARACTER: SqlState = SqlState(Inner::E22P05); + + /// 2200L + pub const NOT_AN_XML_DOCUMENT: SqlState = SqlState(Inner::E2200L); + + /// 2200M + pub const INVALID_XML_DOCUMENT: SqlState = SqlState(Inner::E2200M); + + /// 2200N + pub const INVALID_XML_CONTENT: SqlState = SqlState(Inner::E2200N); + + /// 2200S + pub const INVALID_XML_COMMENT: SqlState = SqlState(Inner::E2200S); + + /// 2200T + pub const INVALID_XML_PROCESSING_INSTRUCTION: SqlState = SqlState(Inner::E2200T); + + /// 22030 + pub const DUPLICATE_JSON_OBJECT_KEY_VALUE: SqlState = SqlState(Inner::E22030); + + /// 22031 + pub const INVALID_ARGUMENT_FOR_SQL_JSON_DATETIME_FUNCTION: SqlState = SqlState(Inner::E22031); + + /// 22032 + pub const INVALID_JSON_TEXT: SqlState = SqlState(Inner::E22032); + + /// 22033 + pub const INVALID_SQL_JSON_SUBSCRIPT: SqlState = 
SqlState(Inner::E22033); + + /// 22034 + pub const MORE_THAN_ONE_SQL_JSON_ITEM: SqlState = SqlState(Inner::E22034); + + /// 22035 + pub const NO_SQL_JSON_ITEM: SqlState = SqlState(Inner::E22035); + + /// 22036 + pub const NON_NUMERIC_SQL_JSON_ITEM: SqlState = SqlState(Inner::E22036); + + /// 22037 + pub const NON_UNIQUE_KEYS_IN_A_JSON_OBJECT: SqlState = SqlState(Inner::E22037); + + /// 22038 + pub const SINGLETON_SQL_JSON_ITEM_REQUIRED: SqlState = SqlState(Inner::E22038); + + /// 22039 + pub const SQL_JSON_ARRAY_NOT_FOUND: SqlState = SqlState(Inner::E22039); + + /// 2203A + pub const SQL_JSON_MEMBER_NOT_FOUND: SqlState = SqlState(Inner::E2203A); + + /// 2203B + pub const SQL_JSON_NUMBER_NOT_FOUND: SqlState = SqlState(Inner::E2203B); + + /// 2203C + pub const SQL_JSON_OBJECT_NOT_FOUND: SqlState = SqlState(Inner::E2203C); + + /// 2203D + pub const TOO_MANY_JSON_ARRAY_ELEMENTS: SqlState = SqlState(Inner::E2203D); + + /// 2203E + pub const TOO_MANY_JSON_OBJECT_MEMBERS: SqlState = SqlState(Inner::E2203E); + + /// 2203F + pub const SQL_JSON_SCALAR_REQUIRED: SqlState = SqlState(Inner::E2203F); + + /// 2203G + pub const SQL_JSON_ITEM_CANNOT_BE_CAST_TO_TARGET_TYPE: SqlState = SqlState(Inner::E2203G); + + /// 23000 + pub const INTEGRITY_CONSTRAINT_VIOLATION: SqlState = SqlState(Inner::E23000); + + /// 23001 + pub const RESTRICT_VIOLATION: SqlState = SqlState(Inner::E23001); + + /// 23502 + pub const NOT_NULL_VIOLATION: SqlState = SqlState(Inner::E23502); + + /// 23503 + pub const FOREIGN_KEY_VIOLATION: SqlState = SqlState(Inner::E23503); + + /// 23505 + pub const UNIQUE_VIOLATION: SqlState = SqlState(Inner::E23505); + + /// 23514 + pub const CHECK_VIOLATION: SqlState = SqlState(Inner::E23514); + + /// 23P01 + pub const EXCLUSION_VIOLATION: SqlState = SqlState(Inner::E23P01); + + /// 24000 + pub const INVALID_CURSOR_STATE: SqlState = SqlState(Inner::E24000); + + /// 25000 + pub const INVALID_TRANSACTION_STATE: SqlState = SqlState(Inner::E25000); + + /// 25001 + pub const 
ACTIVE_SQL_TRANSACTION: SqlState = SqlState(Inner::E25001); + + /// 25002 + pub const BRANCH_TRANSACTION_ALREADY_ACTIVE: SqlState = SqlState(Inner::E25002); + + /// 25008 + pub const HELD_CURSOR_REQUIRES_SAME_ISOLATION_LEVEL: SqlState = SqlState(Inner::E25008); + + /// 25003 + pub const INAPPROPRIATE_ACCESS_MODE_FOR_BRANCH_TRANSACTION: SqlState = SqlState(Inner::E25003); + + /// 25004 + pub const INAPPROPRIATE_ISOLATION_LEVEL_FOR_BRANCH_TRANSACTION: SqlState = + SqlState(Inner::E25004); + + /// 25005 + pub const NO_ACTIVE_SQL_TRANSACTION_FOR_BRANCH_TRANSACTION: SqlState = SqlState(Inner::E25005); + + /// 25006 + pub const READ_ONLY_SQL_TRANSACTION: SqlState = SqlState(Inner::E25006); + + /// 25007 + pub const SCHEMA_AND_DATA_STATEMENT_MIXING_NOT_SUPPORTED: SqlState = SqlState(Inner::E25007); + + /// 25P01 + pub const NO_ACTIVE_SQL_TRANSACTION: SqlState = SqlState(Inner::E25P01); + + /// 25P02 + pub const IN_FAILED_SQL_TRANSACTION: SqlState = SqlState(Inner::E25P02); + + /// 25P03 + pub const IDLE_IN_TRANSACTION_SESSION_TIMEOUT: SqlState = SqlState(Inner::E25P03); + + /// 26000 + pub const INVALID_SQL_STATEMENT_NAME: SqlState = SqlState(Inner::E26000); + + /// 26000 + pub const UNDEFINED_PSTATEMENT: SqlState = SqlState(Inner::E26000); + + /// 27000 + pub const TRIGGERED_DATA_CHANGE_VIOLATION: SqlState = SqlState(Inner::E27000); + + /// 28000 + pub const INVALID_AUTHORIZATION_SPECIFICATION: SqlState = SqlState(Inner::E28000); + + /// 28P01 + pub const INVALID_PASSWORD: SqlState = SqlState(Inner::E28P01); + + /// 2B000 + pub const DEPENDENT_PRIVILEGE_DESCRIPTORS_STILL_EXIST: SqlState = SqlState(Inner::E2B000); + + /// 2BP01 + pub const DEPENDENT_OBJECTS_STILL_EXIST: SqlState = SqlState(Inner::E2BP01); + + /// 2D000 + pub const INVALID_TRANSACTION_TERMINATION: SqlState = SqlState(Inner::E2D000); + + /// 2F000 + pub const SQL_ROUTINE_EXCEPTION: SqlState = SqlState(Inner::E2F000); + + /// 2F005 + pub const S_R_E_FUNCTION_EXECUTED_NO_RETURN_STATEMENT: SqlState = 
SqlState(Inner::E2F005); + + /// 2F002 + pub const S_R_E_MODIFYING_SQL_DATA_NOT_PERMITTED: SqlState = SqlState(Inner::E2F002); + + /// 2F003 + pub const S_R_E_PROHIBITED_SQL_STATEMENT_ATTEMPTED: SqlState = SqlState(Inner::E2F003); + + /// 2F004 + pub const S_R_E_READING_SQL_DATA_NOT_PERMITTED: SqlState = SqlState(Inner::E2F004); + + /// 34000 + pub const INVALID_CURSOR_NAME: SqlState = SqlState(Inner::E34000); + + /// 34000 + pub const UNDEFINED_CURSOR: SqlState = SqlState(Inner::E34000); + + /// 38000 + pub const EXTERNAL_ROUTINE_EXCEPTION: SqlState = SqlState(Inner::E38000); + + /// 38001 + pub const E_R_E_CONTAINING_SQL_NOT_PERMITTED: SqlState = SqlState(Inner::E38001); + + /// 38002 + pub const E_R_E_MODIFYING_SQL_DATA_NOT_PERMITTED: SqlState = SqlState(Inner::E38002); + + /// 38003 + pub const E_R_E_PROHIBITED_SQL_STATEMENT_ATTEMPTED: SqlState = SqlState(Inner::E38003); + + /// 38004 + pub const E_R_E_READING_SQL_DATA_NOT_PERMITTED: SqlState = SqlState(Inner::E38004); + + /// 39000 + pub const EXTERNAL_ROUTINE_INVOCATION_EXCEPTION: SqlState = SqlState(Inner::E39000); + + /// 39001 + pub const E_R_I_E_INVALID_SQLSTATE_RETURNED: SqlState = SqlState(Inner::E39001); + + /// 39004 + pub const E_R_I_E_NULL_VALUE_NOT_ALLOWED: SqlState = SqlState(Inner::E39004); + + /// 39P01 + pub const E_R_I_E_TRIGGER_PROTOCOL_VIOLATED: SqlState = SqlState(Inner::E39P01); + + /// 39P02 + pub const E_R_I_E_SRF_PROTOCOL_VIOLATED: SqlState = SqlState(Inner::E39P02); + + /// 39P03 + pub const E_R_I_E_EVENT_TRIGGER_PROTOCOL_VIOLATED: SqlState = SqlState(Inner::E39P03); + + /// 3B000 + pub const SAVEPOINT_EXCEPTION: SqlState = SqlState(Inner::E3B000); + + /// 3B001 + pub const S_E_INVALID_SPECIFICATION: SqlState = SqlState(Inner::E3B001); + + /// 3D000 + pub const INVALID_CATALOG_NAME: SqlState = SqlState(Inner::E3D000); + + /// 3D000 + pub const UNDEFINED_DATABASE: SqlState = SqlState(Inner::E3D000); + + /// 3F000 + pub const INVALID_SCHEMA_NAME: SqlState = SqlState(Inner::E3F000); + + 
/// 3F000 + pub const UNDEFINED_SCHEMA: SqlState = SqlState(Inner::E3F000); + + /// 40000 + pub const TRANSACTION_ROLLBACK: SqlState = SqlState(Inner::E40000); + + /// 40002 + pub const T_R_INTEGRITY_CONSTRAINT_VIOLATION: SqlState = SqlState(Inner::E40002); + + /// 40001 + pub const T_R_SERIALIZATION_FAILURE: SqlState = SqlState(Inner::E40001); + + /// 40003 + pub const T_R_STATEMENT_COMPLETION_UNKNOWN: SqlState = SqlState(Inner::E40003); + + /// 40P01 + pub const T_R_DEADLOCK_DETECTED: SqlState = SqlState(Inner::E40P01); + + /// 42000 + pub const SYNTAX_ERROR_OR_ACCESS_RULE_VIOLATION: SqlState = SqlState(Inner::E42000); + + /// 42601 + pub const SYNTAX_ERROR: SqlState = SqlState(Inner::E42601); + + /// 42501 + pub const INSUFFICIENT_PRIVILEGE: SqlState = SqlState(Inner::E42501); + + /// 42846 + pub const CANNOT_COERCE: SqlState = SqlState(Inner::E42846); + + /// 42803 + pub const GROUPING_ERROR: SqlState = SqlState(Inner::E42803); + + /// 42P20 + pub const WINDOWING_ERROR: SqlState = SqlState(Inner::E42P20); + + /// 42P19 + pub const INVALID_RECURSION: SqlState = SqlState(Inner::E42P19); + + /// 42830 + pub const INVALID_FOREIGN_KEY: SqlState = SqlState(Inner::E42830); + + /// 42602 + pub const INVALID_NAME: SqlState = SqlState(Inner::E42602); + + /// 42622 + pub const NAME_TOO_LONG: SqlState = SqlState(Inner::E42622); + + /// 42939 + pub const RESERVED_NAME: SqlState = SqlState(Inner::E42939); + + /// 42804 + pub const DATATYPE_MISMATCH: SqlState = SqlState(Inner::E42804); + + /// 42P18 + pub const INDETERMINATE_DATATYPE: SqlState = SqlState(Inner::E42P18); + + /// 42P21 + pub const COLLATION_MISMATCH: SqlState = SqlState(Inner::E42P21); + + /// 42P22 + pub const INDETERMINATE_COLLATION: SqlState = SqlState(Inner::E42P22); + + /// 42809 + pub const WRONG_OBJECT_TYPE: SqlState = SqlState(Inner::E42809); + + /// 428C9 + pub const GENERATED_ALWAYS: SqlState = SqlState(Inner::E428C9); + + /// 42703 + pub const UNDEFINED_COLUMN: SqlState = SqlState(Inner::E42703); + + 
/// 42883 + pub const UNDEFINED_FUNCTION: SqlState = SqlState(Inner::E42883); + + /// 42P01 + pub const UNDEFINED_TABLE: SqlState = SqlState(Inner::E42P01); + + /// 42P02 + pub const UNDEFINED_PARAMETER: SqlState = SqlState(Inner::E42P02); + + /// 42704 + pub const UNDEFINED_OBJECT: SqlState = SqlState(Inner::E42704); + + /// 42701 + pub const DUPLICATE_COLUMN: SqlState = SqlState(Inner::E42701); + + /// 42P03 + pub const DUPLICATE_CURSOR: SqlState = SqlState(Inner::E42P03); + + /// 42P04 + pub const DUPLICATE_DATABASE: SqlState = SqlState(Inner::E42P04); + + /// 42723 + pub const DUPLICATE_FUNCTION: SqlState = SqlState(Inner::E42723); + + /// 42P05 + pub const DUPLICATE_PSTATEMENT: SqlState = SqlState(Inner::E42P05); + + /// 42P06 + pub const DUPLICATE_SCHEMA: SqlState = SqlState(Inner::E42P06); + + /// 42P07 + pub const DUPLICATE_TABLE: SqlState = SqlState(Inner::E42P07); + + /// 42712 + pub const DUPLICATE_ALIAS: SqlState = SqlState(Inner::E42712); + + /// 42710 + pub const DUPLICATE_OBJECT: SqlState = SqlState(Inner::E42710); + + /// 42702 + pub const AMBIGUOUS_COLUMN: SqlState = SqlState(Inner::E42702); + + /// 42725 + pub const AMBIGUOUS_FUNCTION: SqlState = SqlState(Inner::E42725); + + /// 42P08 + pub const AMBIGUOUS_PARAMETER: SqlState = SqlState(Inner::E42P08); + + /// 42P09 + pub const AMBIGUOUS_ALIAS: SqlState = SqlState(Inner::E42P09); + + /// 42P10 + pub const INVALID_COLUMN_REFERENCE: SqlState = SqlState(Inner::E42P10); + + /// 42611 + pub const INVALID_COLUMN_DEFINITION: SqlState = SqlState(Inner::E42611); + + /// 42P11 + pub const INVALID_CURSOR_DEFINITION: SqlState = SqlState(Inner::E42P11); + + /// 42P12 + pub const INVALID_DATABASE_DEFINITION: SqlState = SqlState(Inner::E42P12); + + /// 42P13 + pub const INVALID_FUNCTION_DEFINITION: SqlState = SqlState(Inner::E42P13); + + /// 42P14 + pub const INVALID_PSTATEMENT_DEFINITION: SqlState = SqlState(Inner::E42P14); + + /// 42P15 + pub const INVALID_SCHEMA_DEFINITION: SqlState = SqlState(Inner::E42P15); 
+ + /// 42P16 + pub const INVALID_TABLE_DEFINITION: SqlState = SqlState(Inner::E42P16); + + /// 42P17 + pub const INVALID_OBJECT_DEFINITION: SqlState = SqlState(Inner::E42P17); + + /// 44000 + pub const WITH_CHECK_OPTION_VIOLATION: SqlState = SqlState(Inner::E44000); + + /// 53000 + pub const INSUFFICIENT_RESOURCES: SqlState = SqlState(Inner::E53000); + + /// 53100 + pub const DISK_FULL: SqlState = SqlState(Inner::E53100); + + /// 53200 + pub const OUT_OF_MEMORY: SqlState = SqlState(Inner::E53200); + + /// 53300 + pub const TOO_MANY_CONNECTIONS: SqlState = SqlState(Inner::E53300); + + /// 53400 + pub const CONFIGURATION_LIMIT_EXCEEDED: SqlState = SqlState(Inner::E53400); + + /// 54000 + pub const PROGRAM_LIMIT_EXCEEDED: SqlState = SqlState(Inner::E54000); + + /// 54001 + pub const STATEMENT_TOO_COMPLEX: SqlState = SqlState(Inner::E54001); + + /// 54011 + pub const TOO_MANY_COLUMNS: SqlState = SqlState(Inner::E54011); + + /// 54023 + pub const TOO_MANY_ARGUMENTS: SqlState = SqlState(Inner::E54023); + + /// 55000 + pub const OBJECT_NOT_IN_PREREQUISITE_STATE: SqlState = SqlState(Inner::E55000); + + /// 55006 + pub const OBJECT_IN_USE: SqlState = SqlState(Inner::E55006); + + /// 55P02 + pub const CANT_CHANGE_RUNTIME_PARAM: SqlState = SqlState(Inner::E55P02); + + /// 55P03 + pub const LOCK_NOT_AVAILABLE: SqlState = SqlState(Inner::E55P03); + + /// 55P04 + pub const UNSAFE_NEW_ENUM_VALUE_USAGE: SqlState = SqlState(Inner::E55P04); + + /// 57000 + pub const OPERATOR_INTERVENTION: SqlState = SqlState(Inner::E57000); + + /// 57014 + pub const QUERY_CANCELED: SqlState = SqlState(Inner::E57014); + + /// 57P01 + pub const ADMIN_SHUTDOWN: SqlState = SqlState(Inner::E57P01); + + /// 57P02 + pub const CRASH_SHUTDOWN: SqlState = SqlState(Inner::E57P02); + + /// 57P03 + pub const CANNOT_CONNECT_NOW: SqlState = SqlState(Inner::E57P03); + + /// 57P04 + pub const DATABASE_DROPPED: SqlState = SqlState(Inner::E57P04); + + /// 57P05 + pub const IDLE_SESSION_TIMEOUT: SqlState = 
SqlState(Inner::E57P05); + + /// 58000 + pub const SYSTEM_ERROR: SqlState = SqlState(Inner::E58000); + + /// 58030 + pub const IO_ERROR: SqlState = SqlState(Inner::E58030); + + /// 58P01 + pub const UNDEFINED_FILE: SqlState = SqlState(Inner::E58P01); + + /// 58P02 + pub const DUPLICATE_FILE: SqlState = SqlState(Inner::E58P02); + + /// 72000 + pub const SNAPSHOT_TOO_OLD: SqlState = SqlState(Inner::E72000); + + /// F0000 + pub const CONFIG_FILE_ERROR: SqlState = SqlState(Inner::EF0000); + + /// F0001 + pub const LOCK_FILE_EXISTS: SqlState = SqlState(Inner::EF0001); + + /// HV000 + pub const FDW_ERROR: SqlState = SqlState(Inner::EHV000); + + /// HV005 + pub const FDW_COLUMN_NAME_NOT_FOUND: SqlState = SqlState(Inner::EHV005); + + /// HV002 + pub const FDW_DYNAMIC_PARAMETER_VALUE_NEEDED: SqlState = SqlState(Inner::EHV002); + + /// HV010 + pub const FDW_FUNCTION_SEQUENCE_ERROR: SqlState = SqlState(Inner::EHV010); + + /// HV021 + pub const FDW_INCONSISTENT_DESCRIPTOR_INFORMATION: SqlState = SqlState(Inner::EHV021); + + /// HV024 + pub const FDW_INVALID_ATTRIBUTE_VALUE: SqlState = SqlState(Inner::EHV024); + + /// HV007 + pub const FDW_INVALID_COLUMN_NAME: SqlState = SqlState(Inner::EHV007); + + /// HV008 + pub const FDW_INVALID_COLUMN_NUMBER: SqlState = SqlState(Inner::EHV008); + + /// HV004 + pub const FDW_INVALID_DATA_TYPE: SqlState = SqlState(Inner::EHV004); + + /// HV006 + pub const FDW_INVALID_DATA_TYPE_DESCRIPTORS: SqlState = SqlState(Inner::EHV006); + + /// HV091 + pub const FDW_INVALID_DESCRIPTOR_FIELD_IDENTIFIER: SqlState = SqlState(Inner::EHV091); + + /// HV00B + pub const FDW_INVALID_HANDLE: SqlState = SqlState(Inner::EHV00B); + + /// HV00C + pub const FDW_INVALID_OPTION_INDEX: SqlState = SqlState(Inner::EHV00C); + + /// HV00D + pub const FDW_INVALID_OPTION_NAME: SqlState = SqlState(Inner::EHV00D); + + /// HV090 + pub const FDW_INVALID_STRING_LENGTH_OR_BUFFER_LENGTH: SqlState = SqlState(Inner::EHV090); + + /// HV00A + pub const FDW_INVALID_STRING_FORMAT: 
SqlState = SqlState(Inner::EHV00A); + + /// HV009 + pub const FDW_INVALID_USE_OF_NULL_POINTER: SqlState = SqlState(Inner::EHV009); + + /// HV014 + pub const FDW_TOO_MANY_HANDLES: SqlState = SqlState(Inner::EHV014); + + /// HV001 + pub const FDW_OUT_OF_MEMORY: SqlState = SqlState(Inner::EHV001); + + /// HV00P + pub const FDW_NO_SCHEMAS: SqlState = SqlState(Inner::EHV00P); + + /// HV00J + pub const FDW_OPTION_NAME_NOT_FOUND: SqlState = SqlState(Inner::EHV00J); + + /// HV00K + pub const FDW_REPLY_HANDLE: SqlState = SqlState(Inner::EHV00K); + + /// HV00Q + pub const FDW_SCHEMA_NOT_FOUND: SqlState = SqlState(Inner::EHV00Q); + + /// HV00R + pub const FDW_TABLE_NOT_FOUND: SqlState = SqlState(Inner::EHV00R); + + /// HV00L + pub const FDW_UNABLE_TO_CREATE_EXECUTION: SqlState = SqlState(Inner::EHV00L); + + /// HV00M + pub const FDW_UNABLE_TO_CREATE_REPLY: SqlState = SqlState(Inner::EHV00M); + + /// HV00N + pub const FDW_UNABLE_TO_ESTABLISH_CONNECTION: SqlState = SqlState(Inner::EHV00N); + + /// P0000 + pub const PLPGSQL_ERROR: SqlState = SqlState(Inner::EP0000); + + /// P0001 + pub const RAISE_EXCEPTION: SqlState = SqlState(Inner::EP0001); + + /// P0002 + pub const NO_DATA_FOUND: SqlState = SqlState(Inner::EP0002); + + /// P0003 + pub const TOO_MANY_ROWS: SqlState = SqlState(Inner::EP0003); + + /// P0004 + pub const ASSERT_FAILURE: SqlState = SqlState(Inner::EP0004); + + /// XX000 + pub const INTERNAL_ERROR: SqlState = SqlState(Inner::EXX000); + + /// XX001 + pub const DATA_CORRUPTED: SqlState = SqlState(Inner::EXX001); + + /// XX002 + pub const INDEX_CORRUPTED: SqlState = SqlState(Inner::EXX002); +} + +#[derive(PartialEq, Eq, Clone, Debug)] +#[allow(clippy::upper_case_acronyms)] +enum Inner { + E00000, + E01000, + E0100C, + E01008, + E01003, + E01007, + E01006, + E01004, + E01P01, + E02000, + E02001, + E03000, + E08000, + E08003, + E08006, + E08001, + E08004, + E08007, + E08P01, + E09000, + E0A000, + E0B000, + E0F000, + E0F001, + E0L000, + E0LP01, + E0P000, + E0Z000, + 
E0Z002, + E20000, + E21000, + E22000, + E2202E, + E22021, + E22008, + E22012, + E22005, + E2200B, + E22022, + E22015, + E2201E, + E22014, + E22016, + E2201F, + E2201G, + E22018, + E22007, + E22019, + E2200D, + E22025, + E22P06, + E22010, + E22023, + E22013, + E2201B, + E2201W, + E2201X, + E2202H, + E2202G, + E22009, + E2200C, + E2200G, + E22004, + E22002, + E22003, + E2200H, + E22026, + E22001, + E22011, + E22027, + E22024, + E2200F, + E22P01, + E22P02, + E22P03, + E22P04, + E22P05, + E2200L, + E2200M, + E2200N, + E2200S, + E2200T, + E22030, + E22031, + E22032, + E22033, + E22034, + E22035, + E22036, + E22037, + E22038, + E22039, + E2203A, + E2203B, + E2203C, + E2203D, + E2203E, + E2203F, + E2203G, + E23000, + E23001, + E23502, + E23503, + E23505, + E23514, + E23P01, + E24000, + E25000, + E25001, + E25002, + E25008, + E25003, + E25004, + E25005, + E25006, + E25007, + E25P01, + E25P02, + E25P03, + E26000, + E27000, + E28000, + E28P01, + E2B000, + E2BP01, + E2D000, + E2F000, + E2F005, + E2F002, + E2F003, + E2F004, + E34000, + E38000, + E38001, + E38002, + E38003, + E38004, + E39000, + E39001, + E39004, + E39P01, + E39P02, + E39P03, + E3B000, + E3B001, + E3D000, + E3F000, + E40000, + E40002, + E40001, + E40003, + E40P01, + E42000, + E42601, + E42501, + E42846, + E42803, + E42P20, + E42P19, + E42830, + E42602, + E42622, + E42939, + E42804, + E42P18, + E42P21, + E42P22, + E42809, + E428C9, + E42703, + E42883, + E42P01, + E42P02, + E42704, + E42701, + E42P03, + E42P04, + E42723, + E42P05, + E42P06, + E42P07, + E42712, + E42710, + E42702, + E42725, + E42P08, + E42P09, + E42P10, + E42611, + E42P11, + E42P12, + E42P13, + E42P14, + E42P15, + E42P16, + E42P17, + E44000, + E53000, + E53100, + E53200, + E53300, + E53400, + E54000, + E54001, + E54011, + E54023, + E55000, + E55006, + E55P02, + E55P03, + E55P04, + E57000, + E57014, + E57P01, + E57P02, + E57P03, + E57P04, + E57P05, + E58000, + E58030, + E58P01, + E58P02, + E72000, + EF0000, + EF0001, + EHV000, + EHV005, + EHV002, + 
EHV010, + EHV021, + EHV024, + EHV007, + EHV008, + EHV004, + EHV006, + EHV091, + EHV00B, + EHV00C, + EHV00D, + EHV090, + EHV00A, + EHV009, + EHV014, + EHV001, + EHV00P, + EHV00J, + EHV00K, + EHV00Q, + EHV00R, + EHV00L, + EHV00M, + EHV00N, + EP0000, + EP0001, + EP0002, + EP0003, + EP0004, + EXX000, + EXX001, + EXX002, + Other(Box), +} + +#[rustfmt::skip] +static SQLSTATE_MAP: phf::Map<&'static str, SqlState> = +::phf::Map { + key: 12913932095322966823, + disps: &[ + (0, 24), + (0, 12), + (0, 74), + (0, 109), + (0, 11), + (0, 9), + (0, 0), + (4, 38), + (3, 155), + (0, 6), + (1, 242), + (0, 66), + (0, 53), + (5, 180), + (3, 221), + (7, 230), + (0, 125), + (1, 46), + (0, 11), + (1, 2), + (0, 5), + (0, 13), + (0, 171), + (0, 15), + (0, 4), + (0, 22), + (1, 85), + (0, 75), + (2, 0), + (1, 25), + (7, 47), + (0, 45), + (0, 35), + (0, 7), + (7, 124), + (0, 0), + (14, 104), + (1, 183), + (61, 50), + (3, 76), + (0, 12), + (0, 7), + (4, 189), + (0, 1), + (64, 102), + (0, 0), + (16, 192), + (24, 19), + (0, 5), + (0, 87), + (0, 89), + (0, 14), + ], + entries: &[ + ("2F000", SqlState::SQL_ROUTINE_EXCEPTION), + ("01008", SqlState::WARNING_IMPLICIT_ZERO_BIT_PADDING), + ("42501", SqlState::INSUFFICIENT_PRIVILEGE), + ("22000", SqlState::DATA_EXCEPTION), + ("0100C", SqlState::WARNING_DYNAMIC_RESULT_SETS_RETURNED), + ("2200N", SqlState::INVALID_XML_CONTENT), + ("40001", SqlState::T_R_SERIALIZATION_FAILURE), + ("28P01", SqlState::INVALID_PASSWORD), + ("38000", SqlState::EXTERNAL_ROUTINE_EXCEPTION), + ("25006", SqlState::READ_ONLY_SQL_TRANSACTION), + ("2203D", SqlState::TOO_MANY_JSON_ARRAY_ELEMENTS), + ("42P09", SqlState::AMBIGUOUS_ALIAS), + ("F0000", SqlState::CONFIG_FILE_ERROR), + ("42P18", SqlState::INDETERMINATE_DATATYPE), + ("40002", SqlState::T_R_INTEGRITY_CONSTRAINT_VIOLATION), + ("22009", SqlState::INVALID_TIME_ZONE_DISPLACEMENT_VALUE), + ("42P08", SqlState::AMBIGUOUS_PARAMETER), + ("08000", SqlState::CONNECTION_EXCEPTION), + ("25P01", SqlState::NO_ACTIVE_SQL_TRANSACTION), + 
("22024", SqlState::UNTERMINATED_C_STRING), + ("55000", SqlState::OBJECT_NOT_IN_PREREQUISITE_STATE), + ("25001", SqlState::ACTIVE_SQL_TRANSACTION), + ("03000", SqlState::SQL_STATEMENT_NOT_YET_COMPLETE), + ("42710", SqlState::DUPLICATE_OBJECT), + ("2D000", SqlState::INVALID_TRANSACTION_TERMINATION), + ("2200G", SqlState::MOST_SPECIFIC_TYPE_MISMATCH), + ("22022", SqlState::INDICATOR_OVERFLOW), + ("55006", SqlState::OBJECT_IN_USE), + ("53200", SqlState::OUT_OF_MEMORY), + ("22012", SqlState::DIVISION_BY_ZERO), + ("P0002", SqlState::NO_DATA_FOUND), + ("XX001", SqlState::DATA_CORRUPTED), + ("22P05", SqlState::UNTRANSLATABLE_CHARACTER), + ("40003", SqlState::T_R_STATEMENT_COMPLETION_UNKNOWN), + ("22021", SqlState::CHARACTER_NOT_IN_REPERTOIRE), + ("25000", SqlState::INVALID_TRANSACTION_STATE), + ("42P15", SqlState::INVALID_SCHEMA_DEFINITION), + ("0B000", SqlState::INVALID_TRANSACTION_INITIATION), + ("22004", SqlState::NULL_VALUE_NOT_ALLOWED), + ("42804", SqlState::DATATYPE_MISMATCH), + ("42803", SqlState::GROUPING_ERROR), + ("02001", SqlState::NO_ADDITIONAL_DYNAMIC_RESULT_SETS_RETURNED), + ("25002", SqlState::BRANCH_TRANSACTION_ALREADY_ACTIVE), + ("28000", SqlState::INVALID_AUTHORIZATION_SPECIFICATION), + ("HV009", SqlState::FDW_INVALID_USE_OF_NULL_POINTER), + ("22P01", SqlState::FLOATING_POINT_EXCEPTION), + ("2B000", SqlState::DEPENDENT_PRIVILEGE_DESCRIPTORS_STILL_EXIST), + ("42723", SqlState::DUPLICATE_FUNCTION), + ("21000", SqlState::CARDINALITY_VIOLATION), + ("0Z002", SqlState::STACKED_DIAGNOSTICS_ACCESSED_WITHOUT_ACTIVE_HANDLER), + ("23505", SqlState::UNIQUE_VIOLATION), + ("HV00J", SqlState::FDW_OPTION_NAME_NOT_FOUND), + ("23P01", SqlState::EXCLUSION_VIOLATION), + ("39P03", SqlState::E_R_I_E_EVENT_TRIGGER_PROTOCOL_VIOLATED), + ("42P10", SqlState::INVALID_COLUMN_REFERENCE), + ("2202H", SqlState::INVALID_TABLESAMPLE_ARGUMENT), + ("55P04", SqlState::UNSAFE_NEW_ENUM_VALUE_USAGE), + ("P0000", SqlState::PLPGSQL_ERROR), + ("2F005", 
SqlState::S_R_E_FUNCTION_EXECUTED_NO_RETURN_STATEMENT), + ("HV00M", SqlState::FDW_UNABLE_TO_CREATE_REPLY), + ("0A000", SqlState::FEATURE_NOT_SUPPORTED), + ("24000", SqlState::INVALID_CURSOR_STATE), + ("25008", SqlState::HELD_CURSOR_REQUIRES_SAME_ISOLATION_LEVEL), + ("01003", SqlState::WARNING_NULL_VALUE_ELIMINATED_IN_SET_FUNCTION), + ("42712", SqlState::DUPLICATE_ALIAS), + ("HV014", SqlState::FDW_TOO_MANY_HANDLES), + ("58030", SqlState::IO_ERROR), + ("2201W", SqlState::INVALID_ROW_COUNT_IN_LIMIT_CLAUSE), + ("22033", SqlState::INVALID_SQL_JSON_SUBSCRIPT), + ("2BP01", SqlState::DEPENDENT_OBJECTS_STILL_EXIST), + ("HV005", SqlState::FDW_COLUMN_NAME_NOT_FOUND), + ("25004", SqlState::INAPPROPRIATE_ISOLATION_LEVEL_FOR_BRANCH_TRANSACTION), + ("54000", SqlState::PROGRAM_LIMIT_EXCEEDED), + ("20000", SqlState::CASE_NOT_FOUND), + ("2203G", SqlState::SQL_JSON_ITEM_CANNOT_BE_CAST_TO_TARGET_TYPE), + ("22038", SqlState::SINGLETON_SQL_JSON_ITEM_REQUIRED), + ("22007", SqlState::INVALID_DATETIME_FORMAT), + ("08004", SqlState::SQLSERVER_REJECTED_ESTABLISHMENT_OF_SQLCONNECTION), + ("2200H", SqlState::SEQUENCE_GENERATOR_LIMIT_EXCEEDED), + ("HV00D", SqlState::FDW_INVALID_OPTION_NAME), + ("P0004", SqlState::ASSERT_FAILURE), + ("22018", SqlState::INVALID_CHARACTER_VALUE_FOR_CAST), + ("0L000", SqlState::INVALID_GRANTOR), + ("22P04", SqlState::BAD_COPY_FILE_FORMAT), + ("22031", SqlState::INVALID_ARGUMENT_FOR_SQL_JSON_DATETIME_FUNCTION), + ("01P01", SqlState::WARNING_DEPRECATED_FEATURE), + ("0LP01", SqlState::INVALID_GRANT_OPERATION), + ("58P02", SqlState::DUPLICATE_FILE), + ("26000", SqlState::INVALID_SQL_STATEMENT_NAME), + ("54001", SqlState::STATEMENT_TOO_COMPLEX), + ("22010", SqlState::INVALID_INDICATOR_PARAMETER_VALUE), + ("HV00C", SqlState::FDW_INVALID_OPTION_INDEX), + ("22008", SqlState::DATETIME_FIELD_OVERFLOW), + ("42P06", SqlState::DUPLICATE_SCHEMA), + ("25007", SqlState::SCHEMA_AND_DATA_STATEMENT_MIXING_NOT_SUPPORTED), + ("42P20", SqlState::WINDOWING_ERROR), + ("HV091", 
SqlState::FDW_INVALID_DESCRIPTOR_FIELD_IDENTIFIER), + ("HV021", SqlState::FDW_INCONSISTENT_DESCRIPTOR_INFORMATION), + ("42702", SqlState::AMBIGUOUS_COLUMN), + ("02000", SqlState::NO_DATA), + ("54011", SqlState::TOO_MANY_COLUMNS), + ("HV004", SqlState::FDW_INVALID_DATA_TYPE), + ("01006", SqlState::WARNING_PRIVILEGE_NOT_REVOKED), + ("42701", SqlState::DUPLICATE_COLUMN), + ("08P01", SqlState::PROTOCOL_VIOLATION), + ("42622", SqlState::NAME_TOO_LONG), + ("P0003", SqlState::TOO_MANY_ROWS), + ("22003", SqlState::NUMERIC_VALUE_OUT_OF_RANGE), + ("42P03", SqlState::DUPLICATE_CURSOR), + ("23001", SqlState::RESTRICT_VIOLATION), + ("57000", SqlState::OPERATOR_INTERVENTION), + ("22027", SqlState::TRIM_ERROR), + ("42P12", SqlState::INVALID_DATABASE_DEFINITION), + ("3B000", SqlState::SAVEPOINT_EXCEPTION), + ("2201B", SqlState::INVALID_REGULAR_EXPRESSION), + ("22030", SqlState::DUPLICATE_JSON_OBJECT_KEY_VALUE), + ("2F004", SqlState::S_R_E_READING_SQL_DATA_NOT_PERMITTED), + ("428C9", SqlState::GENERATED_ALWAYS), + ("2200S", SqlState::INVALID_XML_COMMENT), + ("22039", SqlState::SQL_JSON_ARRAY_NOT_FOUND), + ("42809", SqlState::WRONG_OBJECT_TYPE), + ("2201X", SqlState::INVALID_ROW_COUNT_IN_RESULT_OFFSET_CLAUSE), + ("39001", SqlState::E_R_I_E_INVALID_SQLSTATE_RETURNED), + ("25P02", SqlState::IN_FAILED_SQL_TRANSACTION), + ("0P000", SqlState::INVALID_ROLE_SPECIFICATION), + ("HV00N", SqlState::FDW_UNABLE_TO_ESTABLISH_CONNECTION), + ("53100", SqlState::DISK_FULL), + ("42601", SqlState::SYNTAX_ERROR), + ("23000", SqlState::INTEGRITY_CONSTRAINT_VIOLATION), + ("HV006", SqlState::FDW_INVALID_DATA_TYPE_DESCRIPTORS), + ("HV00B", SqlState::FDW_INVALID_HANDLE), + ("HV00Q", SqlState::FDW_SCHEMA_NOT_FOUND), + ("01000", SqlState::WARNING), + ("42883", SqlState::UNDEFINED_FUNCTION), + ("57P01", SqlState::ADMIN_SHUTDOWN), + ("22037", SqlState::NON_UNIQUE_KEYS_IN_A_JSON_OBJECT), + ("00000", SqlState::SUCCESSFUL_COMPLETION), + ("55P03", SqlState::LOCK_NOT_AVAILABLE), + ("42P01", 
SqlState::UNDEFINED_TABLE), + ("42830", SqlState::INVALID_FOREIGN_KEY), + ("22005", SqlState::ERROR_IN_ASSIGNMENT), + ("22025", SqlState::INVALID_ESCAPE_SEQUENCE), + ("XX002", SqlState::INDEX_CORRUPTED), + ("42P16", SqlState::INVALID_TABLE_DEFINITION), + ("55P02", SqlState::CANT_CHANGE_RUNTIME_PARAM), + ("22019", SqlState::INVALID_ESCAPE_CHARACTER), + ("P0001", SqlState::RAISE_EXCEPTION), + ("72000", SqlState::SNAPSHOT_TOO_OLD), + ("42P11", SqlState::INVALID_CURSOR_DEFINITION), + ("40P01", SqlState::T_R_DEADLOCK_DETECTED), + ("57P02", SqlState::CRASH_SHUTDOWN), + ("HV00A", SqlState::FDW_INVALID_STRING_FORMAT), + ("2F002", SqlState::S_R_E_MODIFYING_SQL_DATA_NOT_PERMITTED), + ("23503", SqlState::FOREIGN_KEY_VIOLATION), + ("40000", SqlState::TRANSACTION_ROLLBACK), + ("22032", SqlState::INVALID_JSON_TEXT), + ("2202E", SqlState::ARRAY_ELEMENT_ERROR), + ("42P19", SqlState::INVALID_RECURSION), + ("42611", SqlState::INVALID_COLUMN_DEFINITION), + ("42P13", SqlState::INVALID_FUNCTION_DEFINITION), + ("25003", SqlState::INAPPROPRIATE_ACCESS_MODE_FOR_BRANCH_TRANSACTION), + ("39P02", SqlState::E_R_I_E_SRF_PROTOCOL_VIOLATED), + ("XX000", SqlState::INTERNAL_ERROR), + ("08006", SqlState::CONNECTION_FAILURE), + ("57P04", SqlState::DATABASE_DROPPED), + ("42P07", SqlState::DUPLICATE_TABLE), + ("22P03", SqlState::INVALID_BINARY_REPRESENTATION), + ("22035", SqlState::NO_SQL_JSON_ITEM), + ("42P14", SqlState::INVALID_PSTATEMENT_DEFINITION), + ("01007", SqlState::WARNING_PRIVILEGE_NOT_GRANTED), + ("38004", SqlState::E_R_E_READING_SQL_DATA_NOT_PERMITTED), + ("42P21", SqlState::COLLATION_MISMATCH), + ("0Z000", SqlState::DIAGNOSTICS_EXCEPTION), + ("HV001", SqlState::FDW_OUT_OF_MEMORY), + ("0F000", SqlState::LOCATOR_EXCEPTION), + ("22013", SqlState::INVALID_PRECEDING_OR_FOLLOWING_SIZE), + ("2201E", SqlState::INVALID_ARGUMENT_FOR_LOG), + ("22011", SqlState::SUBSTRING_ERROR), + ("42602", SqlState::INVALID_NAME), + ("01004", SqlState::WARNING_STRING_DATA_RIGHT_TRUNCATION), + ("42P02", 
SqlState::UNDEFINED_PARAMETER), + ("2203C", SqlState::SQL_JSON_OBJECT_NOT_FOUND), + ("HV002", SqlState::FDW_DYNAMIC_PARAMETER_VALUE_NEEDED), + ("0F001", SqlState::L_E_INVALID_SPECIFICATION), + ("58P01", SqlState::UNDEFINED_FILE), + ("38001", SqlState::E_R_E_CONTAINING_SQL_NOT_PERMITTED), + ("42703", SqlState::UNDEFINED_COLUMN), + ("57P05", SqlState::IDLE_SESSION_TIMEOUT), + ("57P03", SqlState::CANNOT_CONNECT_NOW), + ("HV007", SqlState::FDW_INVALID_COLUMN_NAME), + ("22014", SqlState::INVALID_ARGUMENT_FOR_NTILE), + ("22P06", SqlState::NONSTANDARD_USE_OF_ESCAPE_CHARACTER), + ("2203F", SqlState::SQL_JSON_SCALAR_REQUIRED), + ("2200F", SqlState::ZERO_LENGTH_CHARACTER_STRING), + ("09000", SqlState::TRIGGERED_ACTION_EXCEPTION), + ("2201F", SqlState::INVALID_ARGUMENT_FOR_POWER_FUNCTION), + ("08003", SqlState::CONNECTION_DOES_NOT_EXIST), + ("38002", SqlState::E_R_E_MODIFYING_SQL_DATA_NOT_PERMITTED), + ("F0001", SqlState::LOCK_FILE_EXISTS), + ("42P22", SqlState::INDETERMINATE_COLLATION), + ("2200C", SqlState::INVALID_USE_OF_ESCAPE_CHARACTER), + ("2203E", SqlState::TOO_MANY_JSON_OBJECT_MEMBERS), + ("23514", SqlState::CHECK_VIOLATION), + ("22P02", SqlState::INVALID_TEXT_REPRESENTATION), + ("54023", SqlState::TOO_MANY_ARGUMENTS), + ("2200T", SqlState::INVALID_XML_PROCESSING_INSTRUCTION), + ("22016", SqlState::INVALID_ARGUMENT_FOR_NTH_VALUE), + ("25P03", SqlState::IDLE_IN_TRANSACTION_SESSION_TIMEOUT), + ("3B001", SqlState::S_E_INVALID_SPECIFICATION), + ("08001", SqlState::SQLCLIENT_UNABLE_TO_ESTABLISH_SQLCONNECTION), + ("22036", SqlState::NON_NUMERIC_SQL_JSON_ITEM), + ("3F000", SqlState::INVALID_SCHEMA_NAME), + ("39P01", SqlState::E_R_I_E_TRIGGER_PROTOCOL_VIOLATED), + ("22026", SqlState::STRING_DATA_LENGTH_MISMATCH), + ("42P17", SqlState::INVALID_OBJECT_DEFINITION), + ("22034", SqlState::MORE_THAN_ONE_SQL_JSON_ITEM), + ("HV000", SqlState::FDW_ERROR), + ("2200B", SqlState::ESCAPE_CHARACTER_CONFLICT), + ("HV008", SqlState::FDW_INVALID_COLUMN_NUMBER), + ("34000", 
SqlState::INVALID_CURSOR_NAME), + ("2201G", SqlState::INVALID_ARGUMENT_FOR_WIDTH_BUCKET_FUNCTION), + ("44000", SqlState::WITH_CHECK_OPTION_VIOLATION), + ("HV010", SqlState::FDW_FUNCTION_SEQUENCE_ERROR), + ("39004", SqlState::E_R_I_E_NULL_VALUE_NOT_ALLOWED), + ("22001", SqlState::STRING_DATA_RIGHT_TRUNCATION), + ("3D000", SqlState::INVALID_CATALOG_NAME), + ("25005", SqlState::NO_ACTIVE_SQL_TRANSACTION_FOR_BRANCH_TRANSACTION), + ("2200L", SqlState::NOT_AN_XML_DOCUMENT), + ("27000", SqlState::TRIGGERED_DATA_CHANGE_VIOLATION), + ("HV090", SqlState::FDW_INVALID_STRING_LENGTH_OR_BUFFER_LENGTH), + ("42939", SqlState::RESERVED_NAME), + ("58000", SqlState::SYSTEM_ERROR), + ("2200M", SqlState::INVALID_XML_DOCUMENT), + ("HV00L", SqlState::FDW_UNABLE_TO_CREATE_EXECUTION), + ("57014", SqlState::QUERY_CANCELED), + ("23502", SqlState::NOT_NULL_VIOLATION), + ("22002", SqlState::NULL_VALUE_NO_INDICATOR_PARAMETER), + ("HV00R", SqlState::FDW_TABLE_NOT_FOUND), + ("HV00P", SqlState::FDW_NO_SCHEMAS), + ("38003", SqlState::E_R_E_PROHIBITED_SQL_STATEMENT_ATTEMPTED), + ("39000", SqlState::EXTERNAL_ROUTINE_INVOCATION_EXCEPTION), + ("22015", SqlState::INTERVAL_FIELD_OVERFLOW), + ("HV00K", SqlState::FDW_REPLY_HANDLE), + ("HV024", SqlState::FDW_INVALID_ATTRIBUTE_VALUE), + ("2200D", SqlState::INVALID_ESCAPE_OCTET), + ("08007", SqlState::TRANSACTION_RESOLUTION_UNKNOWN), + ("2F003", SqlState::S_R_E_PROHIBITED_SQL_STATEMENT_ATTEMPTED), + ("42725", SqlState::AMBIGUOUS_FUNCTION), + ("2203A", SqlState::SQL_JSON_MEMBER_NOT_FOUND), + ("42846", SqlState::CANNOT_COERCE), + ("42P04", SqlState::DUPLICATE_DATABASE), + ("42000", SqlState::SYNTAX_ERROR_OR_ACCESS_RULE_VIOLATION), + ("2203B", SqlState::SQL_JSON_NUMBER_NOT_FOUND), + ("42P05", SqlState::DUPLICATE_PSTATEMENT), + ("53300", SqlState::TOO_MANY_CONNECTIONS), + ("53400", SqlState::CONFIGURATION_LIMIT_EXCEEDED), + ("42704", SqlState::UNDEFINED_OBJECT), + ("2202G", SqlState::INVALID_TABLESAMPLE_REPEAT), + ("22023", SqlState::INVALID_PARAMETER_VALUE), + 
("53000", SqlState::INSUFFICIENT_RESOURCES), + ], +}; diff --git a/libs/proxy/tokio-postgres2/src/generic_client.rs b/libs/proxy/tokio-postgres2/src/generic_client.rs new file mode 100644 index 0000000000..768213f8ed --- /dev/null +++ b/libs/proxy/tokio-postgres2/src/generic_client.rs @@ -0,0 +1,64 @@ +use crate::query::RowStream; +use crate::types::Type; +use crate::{Client, Error, Transaction}; +use async_trait::async_trait; +use postgres_protocol2::Oid; + +mod private { + pub trait Sealed {} +} + +/// A trait allowing abstraction over connections and transactions. +/// +/// This trait is "sealed", and cannot be implemented outside of this crate. +#[async_trait] +pub trait GenericClient: private::Sealed { + /// Like `Client::query_raw_txt`. + async fn query_raw_txt(&self, statement: &str, params: I) -> Result + where + S: AsRef + Sync + Send, + I: IntoIterator> + Sync + Send, + I::IntoIter: ExactSizeIterator + Sync + Send; + + /// Query for type information + async fn get_type(&self, oid: Oid) -> Result; +} + +impl private::Sealed for Client {} + +#[async_trait] +impl GenericClient for Client { + async fn query_raw_txt(&self, statement: &str, params: I) -> Result + where + S: AsRef + Sync + Send, + I: IntoIterator> + Sync + Send, + I::IntoIter: ExactSizeIterator + Sync + Send, + { + self.query_raw_txt(statement, params).await + } + + /// Query for type information + async fn get_type(&self, oid: Oid) -> Result { + self.get_type(oid).await + } +} + +impl private::Sealed for Transaction<'_> {} + +#[async_trait] +#[allow(clippy::needless_lifetimes)] +impl GenericClient for Transaction<'_> { + async fn query_raw_txt(&self, statement: &str, params: I) -> Result + where + S: AsRef + Sync + Send, + I: IntoIterator> + Sync + Send, + I::IntoIter: ExactSizeIterator + Sync + Send, + { + self.query_raw_txt(statement, params).await + } + + /// Query for type information + async fn get_type(&self, oid: Oid) -> Result { + self.client().get_type(oid).await + } +} diff --git 
a/libs/proxy/tokio-postgres2/src/lib.rs b/libs/proxy/tokio-postgres2/src/lib.rs new file mode 100644 index 0000000000..901ed0c96c --- /dev/null +++ b/libs/proxy/tokio-postgres2/src/lib.rs @@ -0,0 +1,129 @@ +//! An asynchronous, pipelined, PostgreSQL client. +#![warn(rust_2018_idioms, clippy::all)] + +pub use crate::cancel_token::CancelToken; +pub use crate::client::{Client, SocketConfig}; +pub use crate::config::Config; +pub use crate::connect_raw::RawConnection; +pub use crate::connection::Connection; +use crate::error::DbError; +pub use crate::error::Error; +pub use crate::generic_client::GenericClient; +pub use crate::query::RowStream; +pub use crate::row::{Row, SimpleQueryRow}; +pub use crate::simple_query::SimpleQueryStream; +pub use crate::statement::{Column, Statement}; +pub use crate::tls::NoTls; +pub use crate::to_statement::ToStatement; +pub use crate::transaction::Transaction; +pub use crate::transaction_builder::{IsolationLevel, TransactionBuilder}; +use crate::types::ToSql; +use postgres_protocol2::message::backend::ReadyForQueryBody; + +/// After executing a query, the connection will be in one of these states +#[derive(Clone, Copy, Debug, PartialEq)] +#[repr(u8)] +pub enum ReadyForQueryStatus { + /// Connection state is unknown + Unknown, + /// Connection is idle (no transactions) + Idle = b'I', + /// Connection is in a transaction block + Transaction = b'T', + /// Connection is in a failed transaction block + FailedTransaction = b'E', +} + +impl From for ReadyForQueryStatus { + fn from(value: ReadyForQueryBody) -> Self { + match value.status() { + b'I' => Self::Idle, + b'T' => Self::Transaction, + b'E' => Self::FailedTransaction, + _ => Self::Unknown, + } + } +} + +mod cancel_query; +mod cancel_query_raw; +mod cancel_token; +mod client; +mod codec; +pub mod config; +mod connect; +mod connect_raw; +mod connect_socket; +mod connect_tls; +mod connection; +pub mod error; +mod generic_client; +pub mod maybe_tls_stream; +mod prepare; +mod query; +pub mod 
row; +mod simple_query; +mod statement; +pub mod tls; +mod to_statement; +mod transaction; +mod transaction_builder; +pub mod types; + +/// An asynchronous notification. +#[derive(Clone, Debug)] +pub struct Notification { + process_id: i32, + channel: String, + payload: String, +} + +impl Notification { + /// The process ID of the notifying backend process. + pub fn process_id(&self) -> i32 { + self.process_id + } + + /// The name of the channel that the notify has been raised on. + pub fn channel(&self) -> &str { + &self.channel + } + + /// The "payload" string passed from the notifying process. + pub fn payload(&self) -> &str { + &self.payload + } +} + +/// An asynchronous message from the server. +#[allow(clippy::large_enum_variant)] +#[derive(Debug, Clone)] +#[non_exhaustive] +pub enum AsyncMessage { + /// A notice. + /// + /// Notices use the same format as errors, but aren't "errors" per-se. + Notice(DbError), + /// A notification. + /// + /// Connections can subscribe to notifications with the `LISTEN` command. + Notification(Notification), +} + +/// Message returned by the `SimpleQuery` stream. +#[derive(Debug)] +#[non_exhaustive] +pub enum SimpleQueryMessage { + /// A row of data. + Row(SimpleQueryRow), + /// A statement in the query has completed. + /// + /// The number of rows modified or selected is returned. + CommandComplete(u64), +} + +fn slice_iter<'a>( + s: &'a [&'a (dyn ToSql + Sync)], +) -> impl ExactSizeIterator + 'a { + s.iter().map(|s| *s as _) +} diff --git a/libs/proxy/tokio-postgres2/src/maybe_tls_stream.rs b/libs/proxy/tokio-postgres2/src/maybe_tls_stream.rs new file mode 100644 index 0000000000..9a7e248997 --- /dev/null +++ b/libs/proxy/tokio-postgres2/src/maybe_tls_stream.rs @@ -0,0 +1,77 @@ +//! MaybeTlsStream. +//! +//! Represents a stream that may or may not be encrypted with TLS. 
+use crate::tls::{ChannelBinding, TlsStream}; +use std::io; +use std::pin::Pin; +use std::task::{Context, Poll}; +use tokio::io::{AsyncRead, AsyncWrite, ReadBuf}; + +/// A stream that may or may not be encrypted with TLS. +pub enum MaybeTlsStream { + /// An unencrypted stream. + Raw(S), + /// An encrypted stream. + Tls(T), +} + +impl AsyncRead for MaybeTlsStream +where + S: AsyncRead + Unpin, + T: AsyncRead + Unpin, +{ + fn poll_read( + mut self: Pin<&mut Self>, + cx: &mut Context<'_>, + buf: &mut ReadBuf<'_>, + ) -> Poll> { + match &mut *self { + MaybeTlsStream::Raw(s) => Pin::new(s).poll_read(cx, buf), + MaybeTlsStream::Tls(s) => Pin::new(s).poll_read(cx, buf), + } + } +} + +impl AsyncWrite for MaybeTlsStream +where + S: AsyncWrite + Unpin, + T: AsyncWrite + Unpin, +{ + fn poll_write( + mut self: Pin<&mut Self>, + cx: &mut Context<'_>, + buf: &[u8], + ) -> Poll> { + match &mut *self { + MaybeTlsStream::Raw(s) => Pin::new(s).poll_write(cx, buf), + MaybeTlsStream::Tls(s) => Pin::new(s).poll_write(cx, buf), + } + } + + fn poll_flush(mut self: Pin<&mut Self>, cx: &mut Context<'_>) -> Poll> { + match &mut *self { + MaybeTlsStream::Raw(s) => Pin::new(s).poll_flush(cx), + MaybeTlsStream::Tls(s) => Pin::new(s).poll_flush(cx), + } + } + + fn poll_shutdown(mut self: Pin<&mut Self>, cx: &mut Context<'_>) -> Poll> { + match &mut *self { + MaybeTlsStream::Raw(s) => Pin::new(s).poll_shutdown(cx), + MaybeTlsStream::Tls(s) => Pin::new(s).poll_shutdown(cx), + } + } +} + +impl TlsStream for MaybeTlsStream +where + S: AsyncRead + AsyncWrite + Unpin, + T: TlsStream + Unpin, +{ + fn channel_binding(&self) -> ChannelBinding { + match self { + MaybeTlsStream::Raw(_) => ChannelBinding::none(), + MaybeTlsStream::Tls(s) => s.channel_binding(), + } + } +} diff --git a/libs/proxy/tokio-postgres2/src/prepare.rs b/libs/proxy/tokio-postgres2/src/prepare.rs new file mode 100644 index 0000000000..da0c755c5b --- /dev/null +++ b/libs/proxy/tokio-postgres2/src/prepare.rs @@ -0,0 +1,262 @@ +use 
crate::client::InnerClient; +use crate::codec::FrontendMessage; +use crate::connection::RequestMessages; +use crate::error::SqlState; +use crate::types::{Field, Kind, Oid, Type}; +use crate::{query, slice_iter}; +use crate::{Column, Error, Statement}; +use bytes::Bytes; +use fallible_iterator::FallibleIterator; +use futures_util::{pin_mut, TryStreamExt}; +use log::debug; +use postgres_protocol2::message::backend::Message; +use postgres_protocol2::message::frontend; +use std::future::Future; +use std::pin::Pin; +use std::sync::atomic::{AtomicUsize, Ordering}; +use std::sync::Arc; + +pub(crate) const TYPEINFO_QUERY: &str = "\ +SELECT t.typname, t.typtype, t.typelem, r.rngsubtype, t.typbasetype, n.nspname, t.typrelid +FROM pg_catalog.pg_type t +LEFT OUTER JOIN pg_catalog.pg_range r ON r.rngtypid = t.oid +INNER JOIN pg_catalog.pg_namespace n ON t.typnamespace = n.oid +WHERE t.oid = $1 +"; + +// Range types weren't added until Postgres 9.2, so pg_range may not exist +const TYPEINFO_FALLBACK_QUERY: &str = "\ +SELECT t.typname, t.typtype, t.typelem, NULL::OID, t.typbasetype, n.nspname, t.typrelid +FROM pg_catalog.pg_type t +INNER JOIN pg_catalog.pg_namespace n ON t.typnamespace = n.oid +WHERE t.oid = $1 +"; + +const TYPEINFO_ENUM_QUERY: &str = "\ +SELECT enumlabel +FROM pg_catalog.pg_enum +WHERE enumtypid = $1 +ORDER BY enumsortorder +"; + +// Postgres 9.0 didn't have enumsortorder +const TYPEINFO_ENUM_FALLBACK_QUERY: &str = "\ +SELECT enumlabel +FROM pg_catalog.pg_enum +WHERE enumtypid = $1 +ORDER BY oid +"; + +pub(crate) const TYPEINFO_COMPOSITE_QUERY: &str = "\ +SELECT attname, atttypid +FROM pg_catalog.pg_attribute +WHERE attrelid = $1 +AND NOT attisdropped +AND attnum > 0 +ORDER BY attnum +"; + +static NEXT_ID: AtomicUsize = AtomicUsize::new(0); + +pub async fn prepare( + client: &Arc, + query: &str, + types: &[Type], +) -> Result { + let name = format!("s{}", NEXT_ID.fetch_add(1, Ordering::SeqCst)); + let buf = encode(client, &name, query, types)?; + let mut 
responses = client.send(RequestMessages::Single(FrontendMessage::Raw(buf)))?; + + match responses.next().await? { + Message::ParseComplete => {} + _ => return Err(Error::unexpected_message()), + } + + let parameter_description = match responses.next().await? { + Message::ParameterDescription(body) => body, + _ => return Err(Error::unexpected_message()), + }; + + let row_description = match responses.next().await? { + Message::RowDescription(body) => Some(body), + Message::NoData => None, + _ => return Err(Error::unexpected_message()), + }; + + let mut parameters = vec![]; + let mut it = parameter_description.parameters(); + while let Some(oid) = it.next().map_err(Error::parse)? { + let type_ = get_type(client, oid).await?; + parameters.push(type_); + } + + let mut columns = vec![]; + if let Some(row_description) = row_description { + let mut it = row_description.fields(); + while let Some(field) = it.next().map_err(Error::parse)? { + let type_ = get_type(client, field.type_oid()).await?; + let column = Column::new(field.name().to_string(), type_, field); + columns.push(column); + } + } + + Ok(Statement::new(client, name, parameters, columns)) +} + +fn prepare_rec<'a>( + client: &'a Arc, + query: &'a str, + types: &'a [Type], +) -> Pin> + 'a + Send>> { + Box::pin(prepare(client, query, types)) +} + +fn encode(client: &InnerClient, name: &str, query: &str, types: &[Type]) -> Result { + if types.is_empty() { + debug!("preparing query {}: {}", name, query); + } else { + debug!("preparing query {} with types {:?}: {}", name, types, query); + } + + client.with_buf(|buf| { + frontend::parse(name, query, types.iter().map(Type::oid), buf).map_err(Error::encode)?; + frontend::describe(b'S', name, buf).map_err(Error::encode)?; + frontend::sync(buf); + Ok(buf.split().freeze()) + }) +} + +pub async fn get_type(client: &Arc, oid: Oid) -> Result { + if let Some(type_) = Type::from_oid(oid) { + return Ok(type_); + } + + if let Some(type_) = client.type_(oid) { + return Ok(type_); 
+ } + + let stmt = typeinfo_statement(client).await?; + + let rows = query::query(client, stmt, slice_iter(&[&oid])).await?; + pin_mut!(rows); + + let row = match rows.try_next().await? { + Some(row) => row, + None => return Err(Error::unexpected_message()), + }; + + let name: String = row.try_get(0)?; + let type_: i8 = row.try_get(1)?; + let elem_oid: Oid = row.try_get(2)?; + let rngsubtype: Option = row.try_get(3)?; + let basetype: Oid = row.try_get(4)?; + let schema: String = row.try_get(5)?; + let relid: Oid = row.try_get(6)?; + + let kind = if type_ == b'e' as i8 { + let variants = get_enum_variants(client, oid).await?; + Kind::Enum(variants) + } else if type_ == b'p' as i8 { + Kind::Pseudo + } else if basetype != 0 { + let type_ = get_type_rec(client, basetype).await?; + Kind::Domain(type_) + } else if elem_oid != 0 { + let type_ = get_type_rec(client, elem_oid).await?; + Kind::Array(type_) + } else if relid != 0 { + let fields = get_composite_fields(client, relid).await?; + Kind::Composite(fields) + } else if let Some(rngsubtype) = rngsubtype { + let type_ = get_type_rec(client, rngsubtype).await?; + Kind::Range(type_) + } else { + Kind::Simple + }; + + let type_ = Type::new(name, oid, kind, schema); + client.set_type(oid, &type_); + + Ok(type_) +} + +fn get_type_rec<'a>( + client: &'a Arc, + oid: Oid, +) -> Pin> + Send + 'a>> { + Box::pin(get_type(client, oid)) +} + +async fn typeinfo_statement(client: &Arc) -> Result { + if let Some(stmt) = client.typeinfo() { + return Ok(stmt); + } + + let stmt = match prepare_rec(client, TYPEINFO_QUERY, &[]).await { + Ok(stmt) => stmt, + Err(ref e) if e.code() == Some(&SqlState::UNDEFINED_TABLE) => { + prepare_rec(client, TYPEINFO_FALLBACK_QUERY, &[]).await? 
+ } + Err(e) => return Err(e), + }; + + client.set_typeinfo(&stmt); + Ok(stmt) +} + +async fn get_enum_variants(client: &Arc, oid: Oid) -> Result, Error> { + let stmt = typeinfo_enum_statement(client).await?; + + query::query(client, stmt, slice_iter(&[&oid])) + .await? + .and_then(|row| async move { row.try_get(0) }) + .try_collect() + .await +} + +async fn typeinfo_enum_statement(client: &Arc) -> Result { + if let Some(stmt) = client.typeinfo_enum() { + return Ok(stmt); + } + + let stmt = match prepare_rec(client, TYPEINFO_ENUM_QUERY, &[]).await { + Ok(stmt) => stmt, + Err(ref e) if e.code() == Some(&SqlState::UNDEFINED_COLUMN) => { + prepare_rec(client, TYPEINFO_ENUM_FALLBACK_QUERY, &[]).await? + } + Err(e) => return Err(e), + }; + + client.set_typeinfo_enum(&stmt); + Ok(stmt) +} + +async fn get_composite_fields(client: &Arc, oid: Oid) -> Result, Error> { + let stmt = typeinfo_composite_statement(client).await?; + + let rows = query::query(client, stmt, slice_iter(&[&oid])) + .await? 
+ .try_collect::>() + .await?; + + let mut fields = vec![]; + for row in rows { + let name = row.try_get(0)?; + let oid = row.try_get(1)?; + let type_ = get_type_rec(client, oid).await?; + fields.push(Field::new(name, type_)); + } + + Ok(fields) +} + +async fn typeinfo_composite_statement(client: &Arc) -> Result { + if let Some(stmt) = client.typeinfo_composite() { + return Ok(stmt); + } + + let stmt = prepare_rec(client, TYPEINFO_COMPOSITE_QUERY, &[]).await?; + + client.set_typeinfo_composite(&stmt); + Ok(stmt) +} diff --git a/libs/proxy/tokio-postgres2/src/query.rs b/libs/proxy/tokio-postgres2/src/query.rs new file mode 100644 index 0000000000..534195a707 --- /dev/null +++ b/libs/proxy/tokio-postgres2/src/query.rs @@ -0,0 +1,340 @@ +use crate::client::{InnerClient, Responses}; +use crate::codec::FrontendMessage; +use crate::connection::RequestMessages; +use crate::types::IsNull; +use crate::{Column, Error, ReadyForQueryStatus, Row, Statement}; +use bytes::{BufMut, Bytes, BytesMut}; +use fallible_iterator::FallibleIterator; +use futures_util::{ready, Stream}; +use log::{debug, log_enabled, Level}; +use pin_project_lite::pin_project; +use postgres_protocol2::message::backend::Message; +use postgres_protocol2::message::frontend; +use postgres_types2::{Format, ToSql, Type}; +use std::fmt; +use std::marker::PhantomPinned; +use std::pin::Pin; +use std::sync::Arc; +use std::task::{Context, Poll}; + +struct BorrowToSqlParamsDebug<'a>(&'a [&'a (dyn ToSql + Sync)]); + +impl fmt::Debug for BorrowToSqlParamsDebug<'_> { + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + f.debug_list().entries(self.0.iter()).finish() + } +} + +pub async fn query<'a, I>( + client: &InnerClient, + statement: Statement, + params: I, +) -> Result +where + I: IntoIterator, + I::IntoIter: ExactSizeIterator, +{ + let buf = if log_enabled!(Level::Debug) { + let params = params.into_iter().collect::>(); + debug!( + "executing statement {} with parameters: {:?}", + statement.name(), + 
BorrowToSqlParamsDebug(params.as_slice()), + ); + encode(client, &statement, params)? + } else { + encode(client, &statement, params)? + }; + let responses = start(client, buf).await?; + Ok(RowStream { + statement, + responses, + command_tag: None, + status: ReadyForQueryStatus::Unknown, + output_format: Format::Binary, + _p: PhantomPinned, + }) +} + +pub async fn query_txt( + client: &Arc, + query: &str, + params: I, +) -> Result +where + S: AsRef, + I: IntoIterator>, + I::IntoIter: ExactSizeIterator, +{ + let params = params.into_iter(); + + let buf = client.with_buf(|buf| { + frontend::parse( + "", // unnamed prepared statement + query, // query to parse + std::iter::empty(), // give no type info + buf, + ) + .map_err(Error::encode)?; + frontend::describe(b'S', "", buf).map_err(Error::encode)?; + // Bind, pass params as text, retrieve as binary + match frontend::bind( + "", // empty string selects the unnamed portal + "", // unnamed prepared statement + std::iter::empty(), // all parameters use the default format (text) + params, + |param, buf| match param { + Some(param) => { + buf.put_slice(param.as_ref().as_bytes()); + Ok(postgres_protocol2::IsNull::No) + } + None => Ok(postgres_protocol2::IsNull::Yes), + }, + Some(0), // all text + buf, + ) { + Ok(()) => Ok(()), + Err(frontend::BindError::Conversion(e)) => Err(Error::to_sql(e, 0)), + Err(frontend::BindError::Serialization(e)) => Err(Error::encode(e)), + }?; + + // Execute + frontend::execute("", 0, buf).map_err(Error::encode)?; + // Sync + frontend::sync(buf); + + Ok(buf.split().freeze()) + })?; + + // now read the responses + let mut responses = client.send(RequestMessages::Single(FrontendMessage::Raw(buf)))?; + + match responses.next().await? { + Message::ParseComplete => {} + _ => return Err(Error::unexpected_message()), + } + + let parameter_description = match responses.next().await? 
{ + Message::ParameterDescription(body) => body, + _ => return Err(Error::unexpected_message()), + }; + + let row_description = match responses.next().await? { + Message::RowDescription(body) => Some(body), + Message::NoData => None, + _ => return Err(Error::unexpected_message()), + }; + + match responses.next().await? { + Message::BindComplete => {} + _ => return Err(Error::unexpected_message()), + } + + let mut parameters = vec![]; + let mut it = parameter_description.parameters(); + while let Some(oid) = it.next().map_err(Error::parse)? { + let type_ = Type::from_oid(oid).unwrap_or(Type::UNKNOWN); + parameters.push(type_); + } + + let mut columns = vec![]; + if let Some(row_description) = row_description { + let mut it = row_description.fields(); + while let Some(field) = it.next().map_err(Error::parse)? { + let type_ = Type::from_oid(field.type_oid()).unwrap_or(Type::UNKNOWN); + let column = Column::new(field.name().to_string(), type_, field); + columns.push(column); + } + } + + Ok(RowStream { + statement: Statement::new_anonymous(parameters, columns), + responses, + command_tag: None, + status: ReadyForQueryStatus::Unknown, + output_format: Format::Text, + _p: PhantomPinned, + }) +} + +pub async fn execute<'a, I>( + client: &InnerClient, + statement: Statement, + params: I, +) -> Result +where + I: IntoIterator, + I::IntoIter: ExactSizeIterator, +{ + let buf = if log_enabled!(Level::Debug) { + let params = params.into_iter().collect::>(); + debug!( + "executing statement {} with parameters: {:?}", + statement.name(), + BorrowToSqlParamsDebug(params.as_slice()), + ); + encode(client, &statement, params)? + } else { + encode(client, &statement, params)? + }; + let mut responses = start(client, buf).await?; + + let mut rows = 0; + loop { + match responses.next().await? { + Message::DataRow(_) => {} + Message::CommandComplete(body) => { + rows = body + .tag() + .map_err(Error::parse)? 
+ .rsplit(' ') + .next() + .unwrap() + .parse() + .unwrap_or(0); + } + Message::EmptyQueryResponse => rows = 0, + Message::ReadyForQuery(_) => return Ok(rows), + _ => return Err(Error::unexpected_message()), + } + } +} + +async fn start(client: &InnerClient, buf: Bytes) -> Result { + let mut responses = client.send(RequestMessages::Single(FrontendMessage::Raw(buf)))?; + + match responses.next().await? { + Message::BindComplete => {} + _ => return Err(Error::unexpected_message()), + } + + Ok(responses) +} + +pub fn encode<'a, I>(client: &InnerClient, statement: &Statement, params: I) -> Result +where + I: IntoIterator, + I::IntoIter: ExactSizeIterator, +{ + client.with_buf(|buf| { + encode_bind(statement, params, "", buf)?; + frontend::execute("", 0, buf).map_err(Error::encode)?; + frontend::sync(buf); + Ok(buf.split().freeze()) + }) +} + +pub fn encode_bind<'a, I>( + statement: &Statement, + params: I, + portal: &str, + buf: &mut BytesMut, +) -> Result<(), Error> +where + I: IntoIterator, + I::IntoIter: ExactSizeIterator, +{ + let param_types = statement.params(); + let params = params.into_iter(); + + assert!( + param_types.len() == params.len(), + "expected {} parameters but got {}", + param_types.len(), + params.len() + ); + + let (param_formats, params): (Vec<_>, Vec<_>) = params + .zip(param_types.iter()) + .map(|(p, ty)| (p.encode_format(ty) as i16, p)) + .unzip(); + + let params = params.into_iter(); + + let mut error_idx = 0; + let r = frontend::bind( + portal, + statement.name(), + param_formats, + params.zip(param_types).enumerate(), + |(idx, (param, ty)), buf| match param.to_sql_checked(ty, buf) { + Ok(IsNull::No) => Ok(postgres_protocol2::IsNull::No), + Ok(IsNull::Yes) => Ok(postgres_protocol2::IsNull::Yes), + Err(e) => { + error_idx = idx; + Err(e) + } + }, + Some(1), + buf, + ); + match r { + Ok(()) => Ok(()), + Err(frontend::BindError::Conversion(e)) => Err(Error::to_sql(e, error_idx)), + Err(frontend::BindError::Serialization(e)) => 
Err(Error::encode(e)), + } +} + +pin_project! { + /// A stream of table rows. + pub struct RowStream { + statement: Statement, + responses: Responses, + command_tag: Option, + output_format: Format, + status: ReadyForQueryStatus, + #[pin] + _p: PhantomPinned, + } +} + +impl Stream for RowStream { + type Item = Result; + + fn poll_next(self: Pin<&mut Self>, cx: &mut Context<'_>) -> Poll> { + let this = self.project(); + loop { + match ready!(this.responses.poll_next(cx)?) { + Message::DataRow(body) => { + return Poll::Ready(Some(Ok(Row::new( + this.statement.clone(), + body, + *this.output_format, + )?))) + } + Message::EmptyQueryResponse | Message::PortalSuspended => {} + Message::CommandComplete(body) => { + if let Ok(tag) = body.tag() { + *this.command_tag = Some(tag.to_string()); + } + } + Message::ReadyForQuery(status) => { + *this.status = status.into(); + return Poll::Ready(None); + } + _ => return Poll::Ready(Some(Err(Error::unexpected_message()))), + } + } + } +} + +impl RowStream { + /// Returns information about the columns of data in the row. + pub fn columns(&self) -> &[Column] { + self.statement.columns() + } + + /// Returns the command tag of this query. + /// + /// This is only available after the stream has been exhausted. + pub fn command_tag(&self) -> Option { + self.command_tag.clone() + } + + /// Returns if the connection is ready for querying, with the status of the connection. + /// + /// This might be available only after the stream has been exhausted. + pub fn ready_status(&self) -> ReadyForQueryStatus { + self.status + } +} diff --git a/libs/proxy/tokio-postgres2/src/row.rs b/libs/proxy/tokio-postgres2/src/row.rs new file mode 100644 index 0000000000..10e130707d --- /dev/null +++ b/libs/proxy/tokio-postgres2/src/row.rs @@ -0,0 +1,300 @@ +//! Rows. 
+ +use crate::row::sealed::{AsName, Sealed}; +use crate::simple_query::SimpleColumn; +use crate::statement::Column; +use crate::types::{FromSql, Type, WrongType}; +use crate::{Error, Statement}; +use fallible_iterator::FallibleIterator; +use postgres_protocol2::message::backend::DataRowBody; +use postgres_types2::{Format, WrongFormat}; +use std::fmt; +use std::ops::Range; +use std::str; +use std::sync::Arc; + +mod sealed { + pub trait Sealed {} + + pub trait AsName { + fn as_name(&self) -> &str; + } +} + +impl AsName for Column { + fn as_name(&self) -> &str { + self.name() + } +} + +impl AsName for String { + fn as_name(&self) -> &str { + self + } +} + +/// A trait implemented by types that can index into columns of a row. +/// +/// This cannot be implemented outside of this crate. +pub trait RowIndex: Sealed { + #[doc(hidden)] + fn __idx(&self, columns: &[T]) -> Option + where + T: AsName; +} + +impl Sealed for usize {} + +impl RowIndex for usize { + #[inline] + fn __idx(&self, columns: &[T]) -> Option + where + T: AsName, + { + if *self >= columns.len() { + None + } else { + Some(*self) + } + } +} + +impl Sealed for str {} + +impl RowIndex for str { + #[inline] + fn __idx(&self, columns: &[T]) -> Option + where + T: AsName, + { + if let Some(idx) = columns.iter().position(|d| d.as_name() == self) { + return Some(idx); + }; + + // FIXME ASCII-only case insensitivity isn't really the right thing to + // do. Postgres itself uses a dubious wrapper around tolower and JDBC + // uses the US locale. + columns + .iter() + .position(|d| d.as_name().eq_ignore_ascii_case(self)) + } +} + +impl Sealed for &T where T: ?Sized + Sealed {} + +impl RowIndex for &T +where + T: ?Sized + RowIndex, +{ + #[inline] + fn __idx(&self, columns: &[U]) -> Option + where + U: AsName, + { + T::__idx(*self, columns) + } +} + +/// A row of data returned from the database by a query. 
+pub struct Row { + statement: Statement, + output_format: Format, + body: DataRowBody, + ranges: Vec>>, +} + +impl fmt::Debug for Row { + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + f.debug_struct("Row") + .field("columns", &self.columns()) + .finish() + } +} + +impl Row { + pub(crate) fn new( + statement: Statement, + body: DataRowBody, + output_format: Format, + ) -> Result { + let ranges = body.ranges().collect().map_err(Error::parse)?; + Ok(Row { + statement, + body, + ranges, + output_format, + }) + } + + /// Returns information about the columns of data in the row. + pub fn columns(&self) -> &[Column] { + self.statement.columns() + } + + /// Determines if the row contains no values. + pub fn is_empty(&self) -> bool { + self.len() == 0 + } + + /// Returns the number of values in the row. + pub fn len(&self) -> usize { + self.columns().len() + } + + /// Deserializes a value from the row. + /// + /// The value can be specified either by its numeric index in the row, or by its column name. + /// + /// # Panics + /// + /// Panics if the index is out of bounds or if the value cannot be converted to the specified type. + pub fn get<'a, I, T>(&'a self, idx: I) -> T + where + I: RowIndex + fmt::Display, + T: FromSql<'a>, + { + match self.get_inner(&idx) { + Ok(ok) => ok, + Err(err) => panic!("error retrieving column {}: {}", idx, err), + } + } + + /// Like `Row::get`, but returns a `Result` rather than panicking. 
+ pub fn try_get<'a, I, T>(&'a self, idx: I) -> Result + where + I: RowIndex + fmt::Display, + T: FromSql<'a>, + { + self.get_inner(&idx) + } + + fn get_inner<'a, I, T>(&'a self, idx: &I) -> Result + where + I: RowIndex + fmt::Display, + T: FromSql<'a>, + { + let idx = match idx.__idx(self.columns()) { + Some(idx) => idx, + None => return Err(Error::column(idx.to_string())), + }; + + let ty = self.columns()[idx].type_(); + if !T::accepts(ty) { + return Err(Error::from_sql( + Box::new(WrongType::new::(ty.clone())), + idx, + )); + } + + FromSql::from_sql_nullable(ty, self.col_buffer(idx)).map_err(|e| Error::from_sql(e, idx)) + } + + /// Get the raw bytes for the column at the given index. + fn col_buffer(&self, idx: usize) -> Option<&[u8]> { + let range = self.ranges.get(idx)?.to_owned()?; + Some(&self.body.buffer()[range]) + } + + /// Interpret the column at the given index as text + /// + /// Useful when using query_raw_txt() which sets text transfer mode + pub fn as_text(&self, idx: usize) -> Result, Error> { + if self.output_format == Format::Text { + match self.col_buffer(idx) { + Some(raw) => { + FromSql::from_sql(&Type::TEXT, raw).map_err(|e| Error::from_sql(e, idx)) + } + None => Ok(None), + } + } else { + Err(Error::from_sql(Box::new(WrongFormat {}), idx)) + } + } + + /// Row byte size + pub fn body_len(&self) -> usize { + self.body.buffer().len() + } +} + +impl AsName for SimpleColumn { + fn as_name(&self) -> &str { + self.name() + } +} + +/// A row of data returned from the database by a simple query. +#[derive(Debug)] +pub struct SimpleQueryRow { + columns: Arc<[SimpleColumn]>, + body: DataRowBody, + ranges: Vec>>, +} + +impl SimpleQueryRow { + #[allow(clippy::new_ret_no_self)] + pub(crate) fn new( + columns: Arc<[SimpleColumn]>, + body: DataRowBody, + ) -> Result { + let ranges = body.ranges().collect().map_err(Error::parse)?; + Ok(SimpleQueryRow { + columns, + body, + ranges, + }) + } + + /// Returns information about the columns of data in the row. 
+ pub fn columns(&self) -> &[SimpleColumn] { + &self.columns + } + + /// Determines if the row contains no values. + pub fn is_empty(&self) -> bool { + self.len() == 0 + } + + /// Returns the number of values in the row. + pub fn len(&self) -> usize { + self.columns.len() + } + + /// Returns a value from the row. + /// + /// The value can be specified either by its numeric index in the row, or by its column name. + /// + /// # Panics + /// + /// Panics if the index is out of bounds or if the value cannot be converted to the specified type. + pub fn get(&self, idx: I) -> Option<&str> + where + I: RowIndex + fmt::Display, + { + match self.get_inner(&idx) { + Ok(ok) => ok, + Err(err) => panic!("error retrieving column {}: {}", idx, err), + } + } + + /// Like `SimpleQueryRow::get`, but returns a `Result` rather than panicking. + pub fn try_get(&self, idx: I) -> Result, Error> + where + I: RowIndex + fmt::Display, + { + self.get_inner(&idx) + } + + fn get_inner(&self, idx: &I) -> Result, Error> + where + I: RowIndex + fmt::Display, + { + let idx = match idx.__idx(&self.columns) { + Some(idx) => idx, + None => return Err(Error::column(idx.to_string())), + }; + + let buf = self.ranges[idx].clone().map(|r| &self.body.buffer()[r]); + FromSql::from_sql_nullable(&Type::TEXT, buf).map_err(|e| Error::from_sql(e, idx)) + } +} diff --git a/libs/proxy/tokio-postgres2/src/simple_query.rs b/libs/proxy/tokio-postgres2/src/simple_query.rs new file mode 100644 index 0000000000..fb2550377b --- /dev/null +++ b/libs/proxy/tokio-postgres2/src/simple_query.rs @@ -0,0 +1,142 @@ +use crate::client::{InnerClient, Responses}; +use crate::codec::FrontendMessage; +use crate::connection::RequestMessages; +use crate::{Error, ReadyForQueryStatus, SimpleQueryMessage, SimpleQueryRow}; +use bytes::Bytes; +use fallible_iterator::FallibleIterator; +use futures_util::{ready, Stream}; +use log::debug; +use pin_project_lite::pin_project; +use postgres_protocol2::message::backend::Message; +use 
postgres_protocol2::message::frontend; +use std::marker::PhantomPinned; +use std::pin::Pin; +use std::sync::Arc; +use std::task::{Context, Poll}; + +/// Information about a column of a single query row. +#[derive(Debug)] +pub struct SimpleColumn { + name: String, +} + +impl SimpleColumn { + pub(crate) fn new(name: String) -> SimpleColumn { + SimpleColumn { name } + } + + /// Returns the name of the column. + pub fn name(&self) -> &str { + &self.name + } +} + +pub async fn simple_query(client: &InnerClient, query: &str) -> Result { + debug!("executing simple query: {}", query); + + let buf = encode(client, query)?; + let responses = client.send(RequestMessages::Single(FrontendMessage::Raw(buf)))?; + + Ok(SimpleQueryStream { + responses, + columns: None, + status: ReadyForQueryStatus::Unknown, + _p: PhantomPinned, + }) +} + +pub async fn batch_execute( + client: &InnerClient, + query: &str, +) -> Result { + debug!("executing statement batch: {}", query); + + let buf = encode(client, query)?; + let mut responses = client.send(RequestMessages::Single(FrontendMessage::Raw(buf)))?; + + loop { + match responses.next().await? { + Message::ReadyForQuery(status) => return Ok(status.into()), + Message::CommandComplete(_) + | Message::EmptyQueryResponse + | Message::RowDescription(_) + | Message::DataRow(_) => {} + _ => return Err(Error::unexpected_message()), + } + } +} + +pub(crate) fn encode(client: &InnerClient, query: &str) -> Result { + client.with_buf(|buf| { + frontend::query(query, buf).map_err(Error::encode)?; + Ok(buf.split().freeze()) + }) +} + +pin_project! { + /// A stream of simple query results. + pub struct SimpleQueryStream { + responses: Responses, + columns: Option>, + status: ReadyForQueryStatus, + #[pin] + _p: PhantomPinned, + } +} + +impl SimpleQueryStream { + /// Returns if the connection is ready for querying, with the status of the connection. + /// + /// This might be available only after the stream has been exhausted. 
+ pub fn ready_status(&self) -> ReadyForQueryStatus { + self.status + } +} + +impl Stream for SimpleQueryStream { + type Item = Result; + + fn poll_next(self: Pin<&mut Self>, cx: &mut Context<'_>) -> Poll> { + let this = self.project(); + loop { + match ready!(this.responses.poll_next(cx)?) { + Message::CommandComplete(body) => { + let rows = body + .tag() + .map_err(Error::parse)? + .rsplit(' ') + .next() + .unwrap() + .parse() + .unwrap_or(0); + return Poll::Ready(Some(Ok(SimpleQueryMessage::CommandComplete(rows)))); + } + Message::EmptyQueryResponse => { + return Poll::Ready(Some(Ok(SimpleQueryMessage::CommandComplete(0)))); + } + Message::RowDescription(body) => { + let columns = body + .fields() + .map(|f| Ok(SimpleColumn::new(f.name().to_string()))) + .collect::>() + .map_err(Error::parse)? + .into(); + + *this.columns = Some(columns); + } + Message::DataRow(body) => { + let row = match &this.columns { + Some(columns) => SimpleQueryRow::new(columns.clone(), body)?, + None => return Poll::Ready(Some(Err(Error::unexpected_message()))), + }; + return Poll::Ready(Some(Ok(SimpleQueryMessage::Row(row)))); + } + Message::ReadyForQuery(s) => { + *this.status = s.into(); + return Poll::Ready(None); + } + _ => return Poll::Ready(Some(Err(Error::unexpected_message()))), + } + } + } +} diff --git a/libs/proxy/tokio-postgres2/src/statement.rs b/libs/proxy/tokio-postgres2/src/statement.rs new file mode 100644 index 0000000000..22e160fc05 --- /dev/null +++ b/libs/proxy/tokio-postgres2/src/statement.rs @@ -0,0 +1,157 @@ +use crate::client::InnerClient; +use crate::codec::FrontendMessage; +use crate::connection::RequestMessages; +use crate::types::Type; +use postgres_protocol2::{ + message::{backend::Field, frontend}, + Oid, +}; +use std::{ + fmt, + sync::{Arc, Weak}, +}; + +struct StatementInner { + client: Weak, + name: String, + params: Vec, + columns: Vec, +} + +impl Drop for StatementInner { + fn drop(&mut self) { + if let Some(client) = self.client.upgrade() { + let buf 
= client.with_buf(|buf| { + frontend::close(b'S', &self.name, buf).unwrap(); + frontend::sync(buf); + buf.split().freeze() + }); + let _ = client.send(RequestMessages::Single(FrontendMessage::Raw(buf))); + } + } +} + +/// A prepared statement. +/// +/// Prepared statements can only be used with the connection that created them. +#[derive(Clone)] +pub struct Statement(Arc); + +impl Statement { + pub(crate) fn new( + inner: &Arc, + name: String, + params: Vec, + columns: Vec, + ) -> Statement { + Statement(Arc::new(StatementInner { + client: Arc::downgrade(inner), + name, + params, + columns, + })) + } + + pub(crate) fn new_anonymous(params: Vec, columns: Vec) -> Statement { + Statement(Arc::new(StatementInner { + client: Weak::new(), + name: String::new(), + params, + columns, + })) + } + + pub(crate) fn name(&self) -> &str { + &self.0.name + } + + /// Returns the expected types of the statement's parameters. + pub fn params(&self) -> &[Type] { + &self.0.params + } + + /// Returns information about the columns returned when the statement is queried. + pub fn columns(&self) -> &[Column] { + &self.0.columns + } +} + +/// Information about a column of a query. +pub struct Column { + name: String, + type_: Type, + + // raw fields from RowDescription + table_oid: Oid, + column_id: i16, + format: i16, + + // that better be stored in self.type_, but that is more radical refactoring + type_oid: Oid, + type_size: i16, + type_modifier: i32, +} + +impl Column { + pub(crate) fn new(name: String, type_: Type, raw_field: Field<'_>) -> Column { + Column { + name, + type_, + table_oid: raw_field.table_oid(), + column_id: raw_field.column_id(), + format: raw_field.format(), + type_oid: raw_field.type_oid(), + type_size: raw_field.type_size(), + type_modifier: raw_field.type_modifier(), + } + } + + /// Returns the name of the column. + pub fn name(&self) -> &str { + &self.name + } + + /// Returns the type of the column. 
+ pub fn type_(&self) -> &Type { + &self.type_ + } + + /// Returns the table OID of the column. + pub fn table_oid(&self) -> Oid { + self.table_oid + } + + /// Returns the column ID of the column. + pub fn column_id(&self) -> i16 { + self.column_id + } + + /// Returns the format of the column. + pub fn format(&self) -> i16 { + self.format + } + + /// Returns the type OID of the column. + pub fn type_oid(&self) -> Oid { + self.type_oid + } + + /// Returns the type size of the column. + pub fn type_size(&self) -> i16 { + self.type_size + } + + /// Returns the type modifier of the column. + pub fn type_modifier(&self) -> i32 { + self.type_modifier + } +} + +impl fmt::Debug for Column { + fn fmt(&self, fmt: &mut fmt::Formatter<'_>) -> fmt::Result { + fmt.debug_struct("Column") + .field("name", &self.name) + .field("type", &self.type_) + .finish() + } +} diff --git a/libs/proxy/tokio-postgres2/src/tls.rs b/libs/proxy/tokio-postgres2/src/tls.rs new file mode 100644 index 0000000000..dc8140719f --- /dev/null +++ b/libs/proxy/tokio-postgres2/src/tls.rs @@ -0,0 +1,162 @@ +//! TLS support. + +use std::error::Error; +use std::future::Future; +use std::pin::Pin; +use std::task::{Context, Poll}; +use std::{fmt, io}; +use tokio::io::{AsyncRead, AsyncWrite, ReadBuf}; + +pub(crate) mod private { + pub struct ForcePrivateApi; +} + +/// Channel binding information returned from a TLS handshake. +pub struct ChannelBinding { + pub(crate) tls_server_end_point: Option>, +} + +impl ChannelBinding { + /// Creates a `ChannelBinding` containing no information. + pub fn none() -> ChannelBinding { + ChannelBinding { + tls_server_end_point: None, + } + } + + /// Creates a `ChannelBinding` containing `tls-server-end-point` channel binding information. + pub fn tls_server_end_point(tls_server_end_point: Vec) -> ChannelBinding { + ChannelBinding { + tls_server_end_point: Some(tls_server_end_point), + } + } +} + +/// A constructor of `TlsConnect`ors. 
+/// +/// Requires the `runtime` Cargo feature (enabled by default). +pub trait MakeTlsConnect { + /// The stream type created by the `TlsConnect` implementation. + type Stream: TlsStream + Unpin; + /// The `TlsConnect` implementation created by this type. + type TlsConnect: TlsConnect; + /// The error type returned by the `TlsConnect` implementation. + type Error: Into>; + + /// Creates a new `TlsConnect`or. + /// + /// The domain name is provided for certificate verification and SNI. + fn make_tls_connect(&mut self, domain: &str) -> Result; +} + +/// An asynchronous function wrapping a stream in a TLS session. +pub trait TlsConnect { + /// The stream returned by the future. + type Stream: TlsStream + Unpin; + /// The error returned by the future. + type Error: Into>; + /// The future returned by the connector. + type Future: Future>; + + /// Returns a future performing a TLS handshake over the stream. + fn connect(self, stream: S) -> Self::Future; + + #[doc(hidden)] + fn can_connect(&self, _: private::ForcePrivateApi) -> bool { + true + } +} + +/// A TLS-wrapped connection to a PostgreSQL database. +pub trait TlsStream: AsyncRead + AsyncWrite { + /// Returns channel binding information for the session. + fn channel_binding(&self) -> ChannelBinding; +} + +/// A `MakeTlsConnect` and `TlsConnect` implementation which simply returns an error. +/// +/// This can be used when `sslmode` is `none` or `prefer`. +#[derive(Debug, Copy, Clone)] +pub struct NoTls; + +impl MakeTlsConnect for NoTls { + type Stream = NoTlsStream; + type TlsConnect = NoTls; + type Error = NoTlsError; + + fn make_tls_connect(&mut self, _: &str) -> Result { + Ok(NoTls) + } +} + +impl TlsConnect for NoTls { + type Stream = NoTlsStream; + type Error = NoTlsError; + type Future = NoTlsFuture; + + fn connect(self, _: S) -> NoTlsFuture { + NoTlsFuture(()) + } + + fn can_connect(&self, _: private::ForcePrivateApi) -> bool { + false + } +} + +/// The future returned by `NoTls`. 
+pub struct NoTlsFuture(()); + +impl Future for NoTlsFuture { + type Output = Result; + + fn poll(self: Pin<&mut Self>, _: &mut Context<'_>) -> Poll { + Poll::Ready(Err(NoTlsError(()))) + } +} + +/// The TLS "stream" type produced by the `NoTls` connector. +/// +/// Since `NoTls` doesn't support TLS, this type is uninhabited. +pub enum NoTlsStream {} + +impl AsyncRead for NoTlsStream { + fn poll_read( + self: Pin<&mut Self>, + _: &mut Context<'_>, + _: &mut ReadBuf<'_>, + ) -> Poll> { + match *self {} + } +} + +impl AsyncWrite for NoTlsStream { + fn poll_write(self: Pin<&mut Self>, _: &mut Context<'_>, _: &[u8]) -> Poll> { + match *self {} + } + + fn poll_flush(self: Pin<&mut Self>, _: &mut Context<'_>) -> Poll> { + match *self {} + } + + fn poll_shutdown(self: Pin<&mut Self>, _: &mut Context<'_>) -> Poll> { + match *self {} + } +} + +impl TlsStream for NoTlsStream { + fn channel_binding(&self) -> ChannelBinding { + match *self {} + } +} + +/// The error returned by `NoTls`. +#[derive(Debug)] +pub struct NoTlsError(()); + +impl fmt::Display for NoTlsError { + fn fmt(&self, fmt: &mut fmt::Formatter<'_>) -> fmt::Result { + fmt.write_str("no TLS implementation configured") + } +} + +impl Error for NoTlsError {} diff --git a/libs/proxy/tokio-postgres2/src/to_statement.rs b/libs/proxy/tokio-postgres2/src/to_statement.rs new file mode 100644 index 0000000000..427f77dd79 --- /dev/null +++ b/libs/proxy/tokio-postgres2/src/to_statement.rs @@ -0,0 +1,57 @@ +use crate::to_statement::private::{Sealed, ToStatementType}; +use crate::Statement; + +mod private { + use crate::{Client, Error, Statement}; + + pub trait Sealed {} + + pub enum ToStatementType<'a> { + Statement(&'a Statement), + Query(&'a str), + } + + impl<'a> ToStatementType<'a> { + pub async fn into_statement(self, client: &Client) -> Result { + match self { + ToStatementType::Statement(s) => Ok(s.clone()), + ToStatementType::Query(s) => client.prepare(s).await, + } + } + } +} + +/// A trait abstracting over prepared 
and unprepared statements. +/// +/// Many methods are generic over this bound, so that they support both a raw query string as well as a statement which +/// was prepared previously. +/// +/// This trait is "sealed" and cannot be implemented by anything outside this crate. +pub trait ToStatement: Sealed { + #[doc(hidden)] + fn __convert(&self) -> ToStatementType<'_>; +} + +impl ToStatement for Statement { + fn __convert(&self) -> ToStatementType<'_> { + ToStatementType::Statement(self) + } +} + +impl Sealed for Statement {} + +impl ToStatement for str { + fn __convert(&self) -> ToStatementType<'_> { + ToStatementType::Query(self) + } +} + +impl Sealed for str {} + +impl ToStatement for String { + fn __convert(&self) -> ToStatementType<'_> { + ToStatementType::Query(self) + } +} + +impl Sealed for String {} diff --git a/libs/proxy/tokio-postgres2/src/transaction.rs b/libs/proxy/tokio-postgres2/src/transaction.rs new file mode 100644 index 0000000000..03a57e4947 --- /dev/null +++ b/libs/proxy/tokio-postgres2/src/transaction.rs @@ -0,0 +1,74 @@ +use crate::codec::FrontendMessage; +use crate::connection::RequestMessages; +use crate::query::RowStream; +use crate::{CancelToken, Client, Error, ReadyForQueryStatus}; +use postgres_protocol2::message::frontend; + +/// A representation of a PostgreSQL database transaction. +/// +/// Transactions will implicitly roll back when dropped. Use the `commit` method to commit the changes made in the +/// transaction. Transactions can be nested, with inner transactions implemented via safepoints. 
+pub struct Transaction<'a> { + client: &'a mut Client, + done: bool, +} + +impl Drop for Transaction<'_> { + fn drop(&mut self) { + if self.done { + return; + } + + let buf = self.client.inner().with_buf(|buf| { + frontend::query("ROLLBACK", buf).unwrap(); + buf.split().freeze() + }); + let _ = self + .client + .inner() + .send(RequestMessages::Single(FrontendMessage::Raw(buf))); + } +} + +impl<'a> Transaction<'a> { + pub(crate) fn new(client: &'a mut Client) -> Transaction<'a> { + Transaction { + client, + done: false, + } + } + + /// Consumes the transaction, committing all changes made within it. + pub async fn commit(mut self) -> Result { + self.done = true; + self.client.batch_execute("COMMIT").await + } + + /// Rolls the transaction back, discarding all changes made within it. + /// + /// This is equivalent to `Transaction`'s `Drop` implementation, but provides any error encountered to the caller. + pub async fn rollback(mut self) -> Result { + self.done = true; + self.client.batch_execute("ROLLBACK").await + } + + /// Like `Client::query_raw_txt`. + pub async fn query_raw_txt(&self, statement: &str, params: I) -> Result + where + S: AsRef, + I: IntoIterator>, + I::IntoIter: ExactSizeIterator, + { + self.client.query_raw_txt(statement, params).await + } + + /// Like `Client::cancel_token`. + pub fn cancel_token(&self) -> CancelToken { + self.client.cancel_token() + } + + /// Returns a reference to the underlying `Client`. + pub fn client(&self) -> &Client { + self.client + } +} diff --git a/libs/proxy/tokio-postgres2/src/transaction_builder.rs b/libs/proxy/tokio-postgres2/src/transaction_builder.rs new file mode 100644 index 0000000000..9718ac588c --- /dev/null +++ b/libs/proxy/tokio-postgres2/src/transaction_builder.rs @@ -0,0 +1,113 @@ +use crate::{Client, Error, Transaction}; + +/// The isolation level of a database transaction. +#[derive(Debug, Copy, Clone)] +#[non_exhaustive] +pub enum IsolationLevel { + /// Equivalent to `ReadCommitted`. 
+ ReadUncommitted, + + /// An individual statement in the transaction will see rows committed before it began. + ReadCommitted, + + /// All statements in the transaction will see the same view of rows committed before the first query in the + /// transaction. + RepeatableRead, + + /// The reads and writes in this transaction must be able to be committed as an atomic "unit" with respect to reads + /// and writes of all other concurrent serializable transactions without interleaving. + Serializable, +} + +/// A builder for database transactions. +pub struct TransactionBuilder<'a> { + client: &'a mut Client, + isolation_level: Option, + read_only: Option, + deferrable: Option, +} + +impl<'a> TransactionBuilder<'a> { + pub(crate) fn new(client: &'a mut Client) -> TransactionBuilder<'a> { + TransactionBuilder { + client, + isolation_level: None, + read_only: None, + deferrable: None, + } + } + + /// Sets the isolation level of the transaction. + pub fn isolation_level(mut self, isolation_level: IsolationLevel) -> Self { + self.isolation_level = Some(isolation_level); + self + } + + /// Sets the access mode of the transaction. + pub fn read_only(mut self, read_only: bool) -> Self { + self.read_only = Some(read_only); + self + } + + /// Sets the deferrability of the transaction. + /// + /// If the transaction is also serializable and read only, creation of the transaction may block, but when it + /// completes the transaction is able to run with less overhead and a guarantee that it will not be aborted due to + /// serialization failure. + pub fn deferrable(mut self, deferrable: bool) -> Self { + self.deferrable = Some(deferrable); + self + } + + /// Begins the transaction. + /// + /// The transaction will roll back by default - use the `commit` method to commit it. 
+ pub async fn start(self) -> Result, Error> { + let mut query = "START TRANSACTION".to_string(); + let mut first = true; + + if let Some(level) = self.isolation_level { + first = false; + + query.push_str(" ISOLATION LEVEL "); + let level = match level { + IsolationLevel::ReadUncommitted => "READ UNCOMMITTED", + IsolationLevel::ReadCommitted => "READ COMMITTED", + IsolationLevel::RepeatableRead => "REPEATABLE READ", + IsolationLevel::Serializable => "SERIALIZABLE", + }; + query.push_str(level); + } + + if let Some(read_only) = self.read_only { + if !first { + query.push(','); + } + first = false; + + let s = if read_only { + " READ ONLY" + } else { + " READ WRITE" + }; + query.push_str(s); + } + + if let Some(deferrable) = self.deferrable { + if !first { + query.push(','); + } + + let s = if deferrable { + " DEFERRABLE" + } else { + " NOT DEFERRABLE" + }; + query.push_str(s); + } + + self.client.batch_execute(&query).await?; + + Ok(Transaction::new(self.client)) + } +} diff --git a/libs/proxy/tokio-postgres2/src/types.rs b/libs/proxy/tokio-postgres2/src/types.rs new file mode 100644 index 0000000000..e571d7ee00 --- /dev/null +++ b/libs/proxy/tokio-postgres2/src/types.rs @@ -0,0 +1,6 @@ +//! Types. +//! +//! This module is a reexport of the `postgres_types` crate. 
+ +#[doc(inline)] +pub use postgres_types2::*; diff --git a/libs/remote_storage/src/azure_blob.rs b/libs/remote_storage/src/azure_blob.rs index ae0a94295c..8d1962fa29 100644 --- a/libs/remote_storage/src/azure_blob.rs +++ b/libs/remote_storage/src/azure_blob.rs @@ -35,6 +35,7 @@ use utils::backoff; use utils::backoff::exponential_backoff_duration_seconds; use crate::metrics::{start_measuring_requests, AttemptOutcome, RequestKind}; +use crate::DownloadKind; use crate::{ config::AzureConfig, error::Cancelled, ConcurrencyLimiter, Download, DownloadError, DownloadOpts, Listing, ListingMode, ListingObject, RemotePath, RemoteStorage, StorageMetadata, @@ -49,10 +50,17 @@ pub struct AzureBlobStorage { concurrency_limiter: ConcurrencyLimiter, // Per-request timeout. Accessible for tests. pub timeout: Duration, + + // Alternative timeout used for metadata objects which are expected to be small + pub small_timeout: Duration, } impl AzureBlobStorage { - pub fn new(azure_config: &AzureConfig, timeout: Duration) -> Result { + pub fn new( + azure_config: &AzureConfig, + timeout: Duration, + small_timeout: Duration, + ) -> Result { debug!( "Creating azure remote storage for azure container {}", azure_config.container_name @@ -94,6 +102,7 @@ impl AzureBlobStorage { max_keys_per_list_response, concurrency_limiter: ConcurrencyLimiter::new(azure_config.concurrency_limit.get()), timeout, + small_timeout, }) } @@ -133,6 +142,7 @@ impl AzureBlobStorage { async fn download_for_builder( &self, builder: GetBlobBuilder, + timeout: Duration, cancel: &CancellationToken, ) -> Result { let kind = RequestKind::Get; @@ -156,7 +166,7 @@ impl AzureBlobStorage { .map_err(to_download_error); // apply per request timeout - let response = tokio_stream::StreamExt::timeout(response, self.timeout); + let response = tokio_stream::StreamExt::timeout(response, timeout); // flatten let response = response.map(|res| match res { @@ -220,6 +230,11 @@ impl AzureBlobStorage { let started_at = 
ScopeGuard::into_inner(started_at); let outcome = match &download { Ok(_) => AttemptOutcome::Ok, + // At this level in the stack 404 and 304 responses do not indicate an error. + // There's expected cases when a blob may not exist or hasn't been modified since + // the last get (e.g. probing for timeline indices and heatmap downloads). + // Callers should handle errors if they are unexpected. + Err(DownloadError::NotFound | DownloadError::Unmodified) => AttemptOutcome::Ok, Err(_) => AttemptOutcome::Err, }; crate::metrics::BUCKET_METRICS @@ -410,7 +425,7 @@ impl RemoteStorage for AzureBlobStorage { let blob_client = self.client.blob_client(self.relative_path_to_name(key)); let properties_future = blob_client.get_properties().into_future(); - let properties_future = tokio::time::timeout(self.timeout, properties_future); + let properties_future = tokio::time::timeout(self.small_timeout, properties_future); let res = tokio::select! { res = properties_future => res, @@ -516,7 +531,12 @@ impl RemoteStorage for AzureBlobStorage { }); } - self.download_for_builder(builder, cancel).await + let timeout = match opts.kind { + DownloadKind::Small => self.small_timeout, + DownloadKind::Large => self.timeout, + }; + + self.download_for_builder(builder, timeout, cancel).await } async fn delete(&self, path: &RemotePath, cancel: &CancellationToken) -> anyhow::Result<()> { diff --git a/libs/remote_storage/src/config.rs b/libs/remote_storage/src/config.rs index e99ae4f747..f6ef31077c 100644 --- a/libs/remote_storage/src/config.rs +++ b/libs/remote_storage/src/config.rs @@ -24,6 +24,13 @@ pub struct RemoteStorageConfig { skip_serializing_if = "is_default_timeout" )] pub timeout: Duration, + /// Alternative timeout used for metadata objects which are expected to be small + #[serde( + with = "humantime_serde", + default = "default_small_timeout", + skip_serializing_if = "is_default_small_timeout" + )] + pub small_timeout: Duration, } impl RemoteStorageKind { @@ -40,10 +47,18 @@ fn 
default_timeout() -> Duration { RemoteStorageConfig::DEFAULT_TIMEOUT } +fn default_small_timeout() -> Duration { + RemoteStorageConfig::DEFAULT_SMALL_TIMEOUT +} + fn is_default_timeout(d: &Duration) -> bool { *d == RemoteStorageConfig::DEFAULT_TIMEOUT } +fn is_default_small_timeout(d: &Duration) -> bool { + *d == RemoteStorageConfig::DEFAULT_SMALL_TIMEOUT +} + /// A kind of a remote storage to connect to, with its connection configuration. #[derive(Debug, Clone, PartialEq, Eq, Deserialize, Serialize)] #[serde(untagged)] @@ -184,6 +199,7 @@ fn serialize_storage_class( impl RemoteStorageConfig { pub const DEFAULT_TIMEOUT: Duration = std::time::Duration::from_secs(120); + pub const DEFAULT_SMALL_TIMEOUT: Duration = std::time::Duration::from_secs(30); pub fn from_toml(toml: &toml_edit::Item) -> anyhow::Result { Ok(utils::toml_edit_ext::deserialize_item(toml)?) @@ -219,7 +235,8 @@ timeout = '5s'"; storage: RemoteStorageKind::LocalFs { local_path: Utf8PathBuf::from(".") }, - timeout: Duration::from_secs(5) + timeout: Duration::from_secs(5), + small_timeout: RemoteStorageConfig::DEFAULT_SMALL_TIMEOUT } ); } @@ -247,7 +264,8 @@ timeout = '5s'"; max_keys_per_list_response: DEFAULT_MAX_KEYS_PER_LIST_RESPONSE, upload_storage_class: Some(StorageClass::IntelligentTiering), }), - timeout: Duration::from_secs(7) + timeout: Duration::from_secs(7), + small_timeout: RemoteStorageConfig::DEFAULT_SMALL_TIMEOUT } ); } @@ -299,7 +317,8 @@ timeout = '5s'"; concurrency_limit: default_remote_storage_azure_concurrency_limit(), max_keys_per_list_response: DEFAULT_MAX_KEYS_PER_LIST_RESPONSE, }), - timeout: Duration::from_secs(7) + timeout: Duration::from_secs(7), + small_timeout: RemoteStorageConfig::DEFAULT_SMALL_TIMEOUT } ); } diff --git a/libs/remote_storage/src/lib.rs b/libs/remote_storage/src/lib.rs index 719608dd5f..0ece29d99e 100644 --- a/libs/remote_storage/src/lib.rs +++ b/libs/remote_storage/src/lib.rs @@ -178,6 +178,15 @@ pub struct DownloadOpts { /// The end of the byte range to 
download, or unbounded. Must be after the /// start bound. pub byte_end: Bound, + /// Indicate whether we're downloading something small or large: this indirectly controls + /// timeouts: for something like an index/manifest/heatmap, we should time out faster than + /// for layer files + pub kind: DownloadKind, +} + +pub enum DownloadKind { + Large, + Small, } impl Default for DownloadOpts { @@ -186,6 +195,7 @@ impl Default for DownloadOpts { etag: Default::default(), byte_start: Bound::Unbounded, byte_end: Bound::Unbounded, + kind: DownloadKind::Large, } } } @@ -584,6 +594,10 @@ impl GenericRemoteStorage> { impl GenericRemoteStorage { pub async fn from_config(storage_config: &RemoteStorageConfig) -> anyhow::Result { let timeout = storage_config.timeout; + + // If somkeone overrides timeout to be small without adjusting small_timeout, then adjust it automatically + let small_timeout = std::cmp::min(storage_config.small_timeout, timeout); + Ok(match &storage_config.storage { RemoteStorageKind::LocalFs { local_path: path } => { info!("Using fs root '{path}' as a remote storage"); @@ -606,7 +620,11 @@ impl GenericRemoteStorage { .unwrap_or(""); info!("Using azure container '{}' in account '{storage_account}' in region '{}' as a remote storage, prefix in container: '{:?}'", azure_config.container_name, azure_config.container_region, azure_config.prefix_in_container); - Self::AzureBlob(Arc::new(AzureBlobStorage::new(azure_config, timeout)?)) + Self::AzureBlob(Arc::new(AzureBlobStorage::new( + azure_config, + timeout, + small_timeout, + )?)) } }) } diff --git a/libs/remote_storage/tests/test_real_azure.rs b/libs/remote_storage/tests/test_real_azure.rs index 3a20649490..92d579fec8 100644 --- a/libs/remote_storage/tests/test_real_azure.rs +++ b/libs/remote_storage/tests/test_real_azure.rs @@ -219,7 +219,8 @@ async fn create_azure_client( concurrency_limit: NonZeroUsize::new(100).unwrap(), max_keys_per_list_response, }), - timeout: Duration::from_secs(120), + timeout: 
RemoteStorageConfig::DEFAULT_TIMEOUT, + small_timeout: RemoteStorageConfig::DEFAULT_SMALL_TIMEOUT, }; Ok(Arc::new( GenericRemoteStorage::from_config(&remote_storage_config) diff --git a/libs/remote_storage/tests/test_real_s3.rs b/libs/remote_storage/tests/test_real_s3.rs index 3e99a65fac..e60ec18c93 100644 --- a/libs/remote_storage/tests/test_real_s3.rs +++ b/libs/remote_storage/tests/test_real_s3.rs @@ -396,6 +396,7 @@ async fn create_s3_client( upload_storage_class: None, }), timeout: RemoteStorageConfig::DEFAULT_TIMEOUT, + small_timeout: RemoteStorageConfig::DEFAULT_SMALL_TIMEOUT, }; Ok(Arc::new( GenericRemoteStorage::from_config(&remote_storage_config) diff --git a/libs/utils/Cargo.toml b/libs/utils/Cargo.toml index f440b81d8f..66500fb141 100644 --- a/libs/utils/Cargo.toml +++ b/libs/utils/Cargo.toml @@ -19,12 +19,14 @@ bincode.workspace = true bytes.workspace = true camino.workspace = true chrono.workspace = true +diatomic-waker.workspace = true git-version.workspace = true hex = { workspace = true, features = ["serde"] } humantime.workspace = true hyper0 = { workspace = true, features = ["full"] } fail.workspace = true futures = { workspace = true} +jemalloc_pprof.workspace = true jsonwebtoken.workspace = true nix.workspace = true once_cell.workspace = true @@ -45,6 +47,7 @@ tracing.workspace = true tracing-error.workspace = true tracing-subscriber = { workspace = true, features = ["json", "registry"] } rand.workspace = true +scopeguard.workspace = true strum.workspace = true strum_macros.workspace = true url.workspace = true diff --git a/libs/utils/src/http/endpoint.rs b/libs/utils/src/http/endpoint.rs index 6a85f0ddeb..d975b63677 100644 --- a/libs/utils/src/http/endpoint.rs +++ b/libs/utils/src/http/endpoint.rs @@ -10,6 +10,7 @@ use metrics::{register_int_counter, Encoder, IntCounter, TextEncoder}; use once_cell::sync::Lazy; use routerify::ext::RequestExt; use routerify::{Middleware, RequestInfo, Router, RouterBuilder}; +use tokio_util::io::ReaderStream; 
use tracing::{debug, info, info_span, warn, Instrument}; use std::future::Future; @@ -407,6 +408,69 @@ pub async fn profile_cpu_handler(req: Request) -> Result, A } } +/// Generates heap profiles. +/// +/// This only works with jemalloc on Linux. +pub async fn profile_heap_handler(req: Request) -> Result, ApiError> { + enum Format { + Jemalloc, + Pprof, + } + + // Parameters. + let format = match get_query_param(&req, "format")?.as_deref() { + None => Format::Pprof, + Some("jemalloc") => Format::Jemalloc, + Some("pprof") => Format::Pprof, + Some(format) => return Err(ApiError::BadRequest(anyhow!("invalid format {format}"))), + }; + + // Obtain profiler handle. + let mut prof_ctl = jemalloc_pprof::PROF_CTL + .as_ref() + .ok_or(ApiError::InternalServerError(anyhow!( + "heap profiling not enabled" + )))? + .lock() + .await; + if !prof_ctl.activated() { + return Err(ApiError::InternalServerError(anyhow!( + "heap profiling not enabled" + ))); + } + + // Take and return the profile. + match format { + Format::Jemalloc => { + // NB: file is an open handle to a tempfile that's already deleted. + let file = tokio::task::spawn_blocking(move || prof_ctl.dump()) + .await + .map_err(|join_err| ApiError::InternalServerError(join_err.into()))? + .map_err(ApiError::InternalServerError)?; + let stream = ReaderStream::new(tokio::fs::File::from_std(file)); + Response::builder() + .status(200) + .header(CONTENT_TYPE, "application/octet-stream") + .header(CONTENT_DISPOSITION, "attachment; filename=\"heap.dump\"") + .body(Body::wrap_stream(stream)) + .map_err(|err| ApiError::InternalServerError(err.into())) + } + + Format::Pprof => { + let data = tokio::task::spawn_blocking(move || prof_ctl.dump_pprof()) + .await + .map_err(|join_err| ApiError::InternalServerError(join_err.into()))? 
+ .map_err(ApiError::InternalServerError)?; + Response::builder() + .status(200) + .header(CONTENT_TYPE, "application/octet-stream") + .header(CONTENT_DISPOSITION, "attachment; filename=\"heap.pb\"") + .body(Body::from(data)) + .map_err(|err| ApiError::InternalServerError(err.into())) + } + } +} + pub fn add_request_id_middleware( ) -> Middleware { Middleware::pre(move |req| async move { diff --git a/libs/utils/src/sync.rs b/libs/utils/src/sync.rs index 2ee8f35449..280637de8f 100644 --- a/libs/utils/src/sync.rs +++ b/libs/utils/src/sync.rs @@ -1,3 +1,6 @@ pub mod heavier_once_cell; +pub mod duplex; pub mod gate; + +pub mod spsc_fold; diff --git a/libs/utils/src/sync/duplex.rs b/libs/utils/src/sync/duplex.rs new file mode 100644 index 0000000000..fac79297a0 --- /dev/null +++ b/libs/utils/src/sync/duplex.rs @@ -0,0 +1 @@ +pub mod mpsc; diff --git a/libs/utils/src/sync/duplex/mpsc.rs b/libs/utils/src/sync/duplex/mpsc.rs new file mode 100644 index 0000000000..56b4e6d2b3 --- /dev/null +++ b/libs/utils/src/sync/duplex/mpsc.rs @@ -0,0 +1,36 @@ +use tokio::sync::mpsc; + +/// A bi-directional channel. +pub struct Duplex { + pub tx: mpsc::Sender, + pub rx: mpsc::Receiver, +} + +/// Creates a bi-directional channel. +/// +/// The channel will buffer up to the provided number of messages. Once the buffer is full, +/// attempts to send new messages will wait until a message is received from the channel. +/// The provided buffer capacity must be at least 1. +pub fn channel(buffer: usize) -> (Duplex, Duplex) { + let (tx_a, rx_a) = mpsc::channel::(buffer); + let (tx_b, rx_b) = mpsc::channel::(buffer); + + (Duplex { tx: tx_a, rx: rx_b }, Duplex { tx: tx_b, rx: rx_a }) +} + +impl Duplex { + /// Sends a value, waiting until there is capacity. + /// + /// A successful send occurs when it is determined that the other end of the channel has not hung up already. 
+ pub async fn send(&self, x: S) -> Result<(), mpsc::error::SendError> { + self.tx.send(x).await + } + + /// Receives the next value for this receiver. + /// + /// This method returns `None` if the channel has been closed and there are + /// no remaining messages in the channel's buffer. + pub async fn recv(&mut self) -> Option { + self.rx.recv().await + } +} diff --git a/libs/utils/src/sync/spsc_fold.rs b/libs/utils/src/sync/spsc_fold.rs new file mode 100644 index 0000000000..b44f766ef0 --- /dev/null +++ b/libs/utils/src/sync/spsc_fold.rs @@ -0,0 +1,452 @@ +use core::{future::poll_fn, task::Poll}; +use std::sync::{Arc, Mutex}; + +use diatomic_waker::DiatomicWaker; + +pub struct Sender { + state: Arc>, +} + +pub struct Receiver { + state: Arc>, +} + +struct Inner { + wake_receiver: DiatomicWaker, + wake_sender: DiatomicWaker, + value: Mutex>, +} + +enum State { + NoData, + HasData(T), + TryFoldFailed, // transient state + SenderWaitsForReceiverToConsume(T), + SenderGone(Option), + ReceiverGone, + AllGone, + SenderDropping, // transient state + ReceiverDropping, // transient state +} + +pub fn channel() -> (Sender, Receiver) { + let inner = Inner { + wake_receiver: DiatomicWaker::new(), + wake_sender: DiatomicWaker::new(), + value: Mutex::new(State::NoData), + }; + + let state = Arc::new(inner); + ( + Sender { + state: state.clone(), + }, + Receiver { state }, + ) +} + +#[derive(Debug, thiserror::Error)] +pub enum SendError { + #[error("receiver is gone")] + ReceiverGone, +} + +impl Sender { + /// # Panics + /// + /// If `try_fold` panics, any subsequent call to `send` panic. 
+ pub async fn send(&mut self, value: T, try_fold: F) -> Result<(), SendError> + where + F: Fn(&mut T, T) -> Result<(), T>, + { + let mut value = Some(value); + poll_fn(|cx| { + let mut guard = self.state.value.lock().unwrap(); + match &mut *guard { + State::NoData => { + *guard = State::HasData(value.take().unwrap()); + self.state.wake_receiver.notify(); + Poll::Ready(Ok(())) + } + State::HasData(_) => { + let State::HasData(acc_mut) = &mut *guard else { + unreachable!("this match arm guarantees that the guard is HasData"); + }; + match try_fold(acc_mut, value.take().unwrap()) { + Ok(()) => { + // no need to wake receiver, if it was waiting it already + // got a wake-up when we transitioned from NoData to HasData + Poll::Ready(Ok(())) + } + Err(unfoldable_value) => { + value = Some(unfoldable_value); + let State::HasData(acc) = + std::mem::replace(&mut *guard, State::TryFoldFailed) + else { + unreachable!("this match arm guarantees that the guard is HasData"); + }; + *guard = State::SenderWaitsForReceiverToConsume(acc); + // SAFETY: send is single threaded due to `&mut self` requirement, + // therefore register is not concurrent. + unsafe { + self.state.wake_sender.register(cx.waker()); + } + Poll::Pending + } + } + } + State::SenderWaitsForReceiverToConsume(_data) => { + // Really, we shouldn't be polled until receiver has consumed and wakes us. + Poll::Pending + } + State::ReceiverGone => Poll::Ready(Err(SendError::ReceiverGone)), + State::SenderGone(_) + | State::AllGone + | State::SenderDropping + | State::ReceiverDropping + | State::TryFoldFailed => { + unreachable!(); + } + } + }) + .await + } +} + +impl Drop for Sender { + fn drop(&mut self) { + scopeguard::defer! 
{ + self.state.wake_receiver.notify() + }; + let Ok(mut guard) = self.state.value.lock() else { + return; + }; + *guard = match std::mem::replace(&mut *guard, State::SenderDropping) { + State::NoData => State::SenderGone(None), + State::HasData(data) | State::SenderWaitsForReceiverToConsume(data) => { + State::SenderGone(Some(data)) + } + State::ReceiverGone => State::AllGone, + State::TryFoldFailed + | State::SenderGone(_) + | State::AllGone + | State::SenderDropping + | State::ReceiverDropping => { + unreachable!("unreachable state {:?}", guard.discriminant_str()) + } + } + } +} + +#[derive(Debug, thiserror::Error)] +pub enum RecvError { + #[error("sender is gone")] + SenderGone, +} + +impl Receiver { + pub async fn recv(&mut self) -> Result { + poll_fn(|cx| { + let mut guard = self.state.value.lock().unwrap(); + match &mut *guard { + State::NoData => { + // SAFETY: recv is single threaded due to `&mut self` requirement, + // therefore register is not concurrent. + unsafe { + self.state.wake_receiver.register(cx.waker()); + } + Poll::Pending + } + guard @ State::HasData(_) + | guard @ State::SenderWaitsForReceiverToConsume(_) + | guard @ State::SenderGone(Some(_)) => { + let data = guard + .take_data() + .expect("in these states, data is guaranteed to be present"); + self.state.wake_sender.notify(); + Poll::Ready(Ok(data)) + } + State::SenderGone(None) => Poll::Ready(Err(RecvError::SenderGone)), + State::ReceiverGone + | State::AllGone + | State::SenderDropping + | State::ReceiverDropping + | State::TryFoldFailed => { + unreachable!("unreachable state {:?}", guard.discriminant_str()); + } + } + }) + .await + } +} + +impl Drop for Receiver { + fn drop(&mut self) { + scopeguard::defer! 
{ + self.state.wake_sender.notify() + }; + let Ok(mut guard) = self.state.value.lock() else { + return; + }; + *guard = match std::mem::replace(&mut *guard, State::ReceiverDropping) { + State::NoData => State::ReceiverGone, + State::HasData(_) | State::SenderWaitsForReceiverToConsume(_) => State::ReceiverGone, + State::SenderGone(_) => State::AllGone, + State::TryFoldFailed + | State::ReceiverGone + | State::AllGone + | State::SenderDropping + | State::ReceiverDropping => { + unreachable!("unreachable state {:?}", guard.discriminant_str()) + } + } + } +} + +impl State { + fn take_data(&mut self) -> Option { + match self { + State::HasData(_) => { + let State::HasData(data) = std::mem::replace(self, State::NoData) else { + unreachable!("this match arm guarantees that the state is HasData"); + }; + Some(data) + } + State::SenderWaitsForReceiverToConsume(_) => { + let State::SenderWaitsForReceiverToConsume(data) = + std::mem::replace(self, State::NoData) + else { + unreachable!( + "this match arm guarantees that the state is SenderWaitsForReceiverToConsume" + ); + }; + Some(data) + } + State::SenderGone(data) => Some(data.take().unwrap()), + State::NoData + | State::TryFoldFailed + | State::ReceiverGone + | State::AllGone + | State::SenderDropping + | State::ReceiverDropping => None, + } + } + fn discriminant_str(&self) -> &'static str { + match self { + State::NoData => "NoData", + State::HasData(_) => "HasData", + State::TryFoldFailed => "TryFoldFailed", + State::SenderWaitsForReceiverToConsume(_) => "SenderWaitsForReceiverToConsume", + State::SenderGone(_) => "SenderGone", + State::ReceiverGone => "ReceiverGone", + State::AllGone => "AllGone", + State::SenderDropping => "SenderDropping", + State::ReceiverDropping => "ReceiverDropping", + } + } +} + +#[cfg(test)] +mod tests { + + use super::*; + + const FOREVER: std::time::Duration = std::time::Duration::from_secs(u64::MAX); + + #[tokio::test] + async fn test_send_recv() { + let (mut sender, mut receiver) = 
channel(); + + sender + .send(42, |acc, val| { + *acc += val; + Ok(()) + }) + .await + .unwrap(); + + let received = receiver.recv().await.unwrap(); + assert_eq!(received, 42); + } + + #[tokio::test] + async fn test_send_recv_with_fold() { + let (mut sender, mut receiver) = channel(); + + sender + .send(1, |acc, val| { + *acc += val; + Ok(()) + }) + .await + .unwrap(); + sender + .send(2, |acc, val| { + *acc += val; + Ok(()) + }) + .await + .unwrap(); + + let received = receiver.recv().await.unwrap(); + assert_eq!(received, 3); + } + + #[tokio::test(start_paused = true)] + async fn test_sender_waits_for_receiver_if_try_fold_fails() { + let (mut sender, mut receiver) = channel(); + + sender.send(23, |_, _| panic!("first send")).await.unwrap(); + + let send_fut = sender.send(42, |_, val| Err(val)); + let mut send_fut = std::pin::pin!(send_fut); + + tokio::select! { + _ = tokio::time::sleep(FOREVER) => {}, + _ = &mut send_fut => { + panic!("send should not complete"); + }, + } + + let val = receiver.recv().await.unwrap(); + assert_eq!(val, 23); + + tokio::select! 
{ + _ = tokio::time::sleep(FOREVER) => { + panic!("receiver should have consumed the value"); + }, + _ = &mut send_fut => { }, + } + + let val = receiver.recv().await.unwrap(); + assert_eq!(val, 42); + } + + #[tokio::test(start_paused = true)] + async fn test_sender_errors_if_waits_for_receiver_and_receiver_drops() { + let (mut sender, receiver) = channel(); + + sender.send(23, |_, _| unreachable!()).await.unwrap(); + + let send_fut = sender.send(42, |_, val| Err(val)); + let send_fut = std::pin::pin!(send_fut); + + drop(receiver); + + let result = send_fut.await; + assert!(matches!(result, Err(SendError::ReceiverGone))); + } + + #[tokio::test(start_paused = true)] + async fn test_receiver_errors_if_waits_for_sender_and_sender_drops() { + let (sender, mut receiver) = channel::<()>(); + + let recv_fut = receiver.recv(); + let recv_fut = std::pin::pin!(recv_fut); + + drop(sender); + + let result = recv_fut.await; + assert!(matches!(result, Err(RecvError::SenderGone))); + } + + #[tokio::test(start_paused = true)] + async fn test_receiver_errors_if_waits_for_sender_and_sender_drops_with_data() { + let (mut sender, mut receiver) = channel(); + + sender.send(42, |_, _| unreachable!()).await.unwrap(); + + { + let recv_fut = receiver.recv(); + let recv_fut = std::pin::pin!(recv_fut); + + drop(sender); + + let val = recv_fut.await.unwrap(); + assert_eq!(val, 42); + } + + let result = receiver.recv().await; + assert!(matches!(result, Err(RecvError::SenderGone))); + } + + #[tokio::test(start_paused = true)] + async fn test_receiver_waits_for_sender_if_no_data() { + let (mut sender, mut receiver) = channel(); + + let recv_fut = receiver.recv(); + let mut recv_fut = std::pin::pin!(recv_fut); + + tokio::select! 
{ + _ = tokio::time::sleep(FOREVER) => {}, + _ = &mut recv_fut => { + panic!("recv should not complete"); + }, + } + + sender.send(42, |_, _| Ok(())).await.unwrap(); + + let val = recv_fut.await.unwrap(); + assert_eq!(val, 42); + } + + #[tokio::test] + async fn test_receiver_gone_while_nodata() { + let (mut sender, receiver) = channel(); + drop(receiver); + + let result = sender.send(42, |_, _| Ok(())).await; + assert!(matches!(result, Err(SendError::ReceiverGone))); + } + + #[tokio::test] + async fn test_sender_gone_while_nodata() { + let (sender, mut receiver) = super::channel::(); + drop(sender); + + let result = receiver.recv().await; + assert!(matches!(result, Err(RecvError::SenderGone))); + } + + #[tokio::test(start_paused = true)] + async fn test_receiver_drops_after_sender_went_to_sleep() { + let (mut sender, receiver) = channel(); + let state = receiver.state.clone(); + + sender.send(23, |_, _| unreachable!()).await.unwrap(); + + let send_task = tokio::spawn(async move { sender.send(42, |_, v| Err(v)).await }); + + tokio::time::sleep(FOREVER).await; + + assert!(matches!( + &*state.value.lock().unwrap(), + &State::SenderWaitsForReceiverToConsume(_) + )); + + drop(receiver); + + let err = send_task + .await + .unwrap() + .expect_err("should unblock immediately"); + assert!(matches!(err, SendError::ReceiverGone)); + } + + #[tokio::test(start_paused = true)] + async fn test_sender_drops_after_receiver_went_to_sleep() { + let (sender, mut receiver) = channel::(); + let state = sender.state.clone(); + + let recv_task = tokio::spawn(async move { receiver.recv().await }); + + tokio::time::sleep(FOREVER).await; + + assert!(matches!(&*state.value.lock().unwrap(), &State::NoData)); + + drop(sender); + + let err = recv_task.await.unwrap().expect_err("should error"); + assert!(matches!(err, RecvError::SenderGone)); + } +} diff --git a/libs/wal_decoder/src/decoder.rs b/libs/wal_decoder/src/decoder.rs index 36c4b19266..aa50c62911 100644 --- 
a/libs/wal_decoder/src/decoder.rs +++ b/libs/wal_decoder/src/decoder.rs @@ -112,30 +112,38 @@ impl MetadataRecord { }; // Next, filter the metadata record by shard. - - // Route VM page updates to the shards that own them. VM pages are stored in the VM fork - // of the main relation. These are sharded and managed just like regular relation pages. - // See: https://github.com/neondatabase/neon/issues/9855 - if let Some( - MetadataRecord::Heapam(HeapamRecord::ClearVmBits(ref mut clear_vm_bits)) - | MetadataRecord::Neonrmgr(NeonrmgrRecord::ClearVmBits(ref mut clear_vm_bits)), - ) = metadata_record - { - let is_local_vm_page = |heap_blk| { - let vm_blk = pg_constants::HEAPBLK_TO_MAPBLOCK(heap_blk); - shard.is_key_local(&rel_block_to_key(clear_vm_bits.vm_rel, vm_blk)) - }; - // Send the old and new VM page updates to their respective shards. - clear_vm_bits.old_heap_blkno = clear_vm_bits - .old_heap_blkno - .filter(|&blkno| is_local_vm_page(blkno)); - clear_vm_bits.new_heap_blkno = clear_vm_bits - .new_heap_blkno - .filter(|&blkno| is_local_vm_page(blkno)); - // If neither VM page belongs to this shard, discard the record. - if clear_vm_bits.old_heap_blkno.is_none() && clear_vm_bits.new_heap_blkno.is_none() { - metadata_record = None + match metadata_record { + Some( + MetadataRecord::Heapam(HeapamRecord::ClearVmBits(ref mut clear_vm_bits)) + | MetadataRecord::Neonrmgr(NeonrmgrRecord::ClearVmBits(ref mut clear_vm_bits)), + ) => { + // Route VM page updates to the shards that own them. VM pages are stored in the VM fork + // of the main relation. These are sharded and managed just like regular relation pages. + // See: https://github.com/neondatabase/neon/issues/9855 + let is_local_vm_page = |heap_blk| { + let vm_blk = pg_constants::HEAPBLK_TO_MAPBLOCK(heap_blk); + shard.is_key_local(&rel_block_to_key(clear_vm_bits.vm_rel, vm_blk)) + }; + // Send the old and new VM page updates to their respective shards. 
+ clear_vm_bits.old_heap_blkno = clear_vm_bits + .old_heap_blkno + .filter(|&blkno| is_local_vm_page(blkno)); + clear_vm_bits.new_heap_blkno = clear_vm_bits + .new_heap_blkno + .filter(|&blkno| is_local_vm_page(blkno)); + // If neither VM page belongs to this shard, discard the record. + if clear_vm_bits.old_heap_blkno.is_none() && clear_vm_bits.new_heap_blkno.is_none() + { + metadata_record = None + } } + Some(MetadataRecord::LogicalMessage(LogicalMessageRecord::Put(_))) => { + // Filter LogicalMessage records (AUX files) to only be stored on shard zero + if !shard.is_shard_zero() { + metadata_record = None; + } + } + _ => {} } Ok(metadata_record) diff --git a/pageserver/benches/bench_ingest.rs b/pageserver/benches/bench_ingest.rs index caacd365b3..b67a9cc479 100644 --- a/pageserver/benches/bench_ingest.rs +++ b/pageserver/benches/bench_ingest.rs @@ -62,10 +62,8 @@ async fn ingest( let ctx = RequestContext::new(TaskKind::DebugTool, DownloadBehavior::Error); let gate = utils::sync::gate::Gate::default(); - let entered = gate.enter().unwrap(); - let layer = - InMemoryLayer::create(conf, timeline_id, tenant_shard_id, lsn, entered, &ctx).await?; + let layer = InMemoryLayer::create(conf, timeline_id, tenant_shard_id, lsn, &gate, &ctx).await?; let data = Value::Image(Bytes::from(vec![0u8; put_size])); let data_ser_size = data.serialized_size().unwrap() as usize; diff --git a/pageserver/client/src/mgmt_api.rs b/pageserver/client/src/mgmt_api.rs index 4d76c66905..c3a1ef8140 100644 --- a/pageserver/client/src/mgmt_api.rs +++ b/pageserver/client/src/mgmt_api.rs @@ -1,4 +1,4 @@ -use std::collections::HashMap; +use std::{collections::HashMap, error::Error as _}; use bytes::Bytes; use detach_ancestor::AncestorDetached; @@ -25,10 +25,10 @@ pub struct Client { #[derive(thiserror::Error, Debug)] pub enum Error { - #[error("send request: {0}")] + #[error("send request: {0}{}", .0.source().map(|e| format!(": {e}")).unwrap_or_default())] SendRequest(reqwest::Error), - 
#[error("receive body: {0}")] + #[error("receive body: {0}{}", .0.source().map(|e| format!(": {e}")).unwrap_or_default())] ReceiveBody(reqwest::Error), #[error("receive error body: {0}")] diff --git a/pageserver/src/bin/pageserver.rs b/pageserver/src/bin/pageserver.rs index a8c2c2e992..567a69da3b 100644 --- a/pageserver/src/bin/pageserver.rs +++ b/pageserver/src/bin/pageserver.rs @@ -53,6 +53,11 @@ project_build_tag!(BUILD_TAG); #[global_allocator] static GLOBAL: tikv_jemallocator::Jemalloc = tikv_jemallocator::Jemalloc; +/// Configure jemalloc to sample allocations for profiles every 1 MB (1 << 20). +#[allow(non_upper_case_globals)] +#[export_name = "malloc_conf"] +pub static malloc_conf: &[u8] = b"prof:true,prof_active:true,lg_prof_sample:20\0"; + const PID_FILE_NAME: &str = "pageserver.pid"; const FEATURES: &[&str] = &[ @@ -127,6 +132,7 @@ fn main() -> anyhow::Result<()> { info!(?conf.virtual_file_io_engine, "starting with virtual_file IO engine"); info!(?conf.virtual_file_io_mode, "starting with virtual_file IO mode"); info!(?conf.wal_receiver_protocol, "starting with WAL receiver protocol"); + info!(?conf.page_service_pipelining, "starting with page service pipelining config"); // The tenants directory contains all the pageserver local disk state. // Create if not exists and make sure all the contents are durable before proceeding. @@ -302,7 +308,7 @@ fn start_pageserver( pageserver::metrics::tokio_epoll_uring::Collector::new(), )) .unwrap(); - pageserver::preinitialize_metrics(); + pageserver::preinitialize_metrics(conf); // If any failpoints were set from FAILPOINTS environment variable, // print them to the log for debugging purposes @@ -630,45 +636,59 @@ fn start_pageserver( tokio::net::TcpListener::from_std(pageserver_listener).context("create tokio listener")? }); - let mut shutdown_pageserver = Some(shutdown_pageserver.drop_guard()); - // All started up! Now just sit and wait for shutdown signal. 
+ BACKGROUND_RUNTIME.block_on(async move { + let signal_token = CancellationToken::new(); + let signal_cancel = signal_token.child_token(); - { - BACKGROUND_RUNTIME.block_on(async move { + // Spawn signal handlers. Runs in a loop since we want to be responsive to multiple signals + // even after triggering shutdown (e.g. a SIGQUIT after a slow SIGTERM shutdown). See: + // https://github.com/neondatabase/neon/issues/9740. + tokio::spawn(async move { let mut sigint = tokio::signal::unix::signal(SignalKind::interrupt()).unwrap(); let mut sigterm = tokio::signal::unix::signal(SignalKind::terminate()).unwrap(); let mut sigquit = tokio::signal::unix::signal(SignalKind::quit()).unwrap(); - let signal = tokio::select! { - _ = sigquit.recv() => { - info!("Got signal SIGQUIT. Terminating in immediate shutdown mode",); - std::process::exit(111); + + loop { + let signal = tokio::select! { + _ = sigquit.recv() => { + info!("Got signal SIGQUIT. Terminating in immediate shutdown mode."); + std::process::exit(111); + } + _ = sigint.recv() => "SIGINT", + _ = sigterm.recv() => "SIGTERM", + }; + + if !signal_token.is_cancelled() { + info!("Got signal {signal}. Terminating gracefully in fast shutdown mode."); + signal_token.cancel(); + } else { + info!("Got signal {signal}. Already shutting down."); } - _ = sigint.recv() => { "SIGINT" }, - _ = sigterm.recv() => { "SIGTERM" }, - }; + } + }); - info!("Got signal {signal}. Terminating gracefully in fast shutdown mode",); + // Wait for cancellation signal and shut down the pageserver. + // + // This cancels the `shutdown_pageserver` cancellation tree. Right now that tree doesn't + // reach very far, and `task_mgr` is used instead. The plan is to change that over time. + signal_cancel.cancelled().await; - // This cancels the `shutdown_pageserver` cancellation tree. - // Right now that tree doesn't reach very far, and `task_mgr` is used instead. - // The plan is to change that over time. 
- shutdown_pageserver.take(); - pageserver::shutdown_pageserver( - http_endpoint_listener, - page_service, - consumption_metrics_tasks, - disk_usage_eviction_task, - &tenant_manager, - background_purges, - deletion_queue.clone(), - secondary_controller_tasks, - 0, - ) - .await; - unreachable!() - }) - } + shutdown_pageserver.cancel(); + pageserver::shutdown_pageserver( + http_endpoint_listener, + page_service, + consumption_metrics_tasks, + disk_usage_eviction_task, + &tenant_manager, + background_purges, + deletion_queue.clone(), + secondary_controller_tasks, + 0, + ) + .await; + unreachable!(); + }) } async fn create_remote_storage_client( diff --git a/pageserver/src/config.rs b/pageserver/src/config.rs index 2cf237e72b..1651db8500 100644 --- a/pageserver/src/config.rs +++ b/pageserver/src/config.rs @@ -188,11 +188,9 @@ pub struct PageServerConf { /// Optionally disable disk syncs (unsafe!) pub no_sync: bool, - /// Maximum amount of time for which a get page request request - /// might be held up for request merging. 
- pub server_side_batch_timeout: Option, - pub wal_receiver_protocol: PostgresClientProtocol, + + pub page_service_pipelining: pageserver_api::config::PageServicePipeliningConfig, } /// Token for authentication to safekeepers @@ -350,10 +348,10 @@ impl PageServerConf { concurrent_tenant_warmup, concurrent_tenant_size_logical_size_queries, virtual_file_io_engine, - server_side_batch_timeout, tenant_config, no_sync, wal_receiver_protocol, + page_service_pipelining, } = config_toml; let mut conf = PageServerConf { @@ -393,11 +391,11 @@ impl PageServerConf { image_compression, timeline_offloading, ephemeral_bytes_per_memory_kb, - server_side_batch_timeout, import_pgdata_upcall_api, import_pgdata_upcall_api_token: import_pgdata_upcall_api_token.map(SecretString::from), import_pgdata_aws_endpoint_url, wal_receiver_protocol, + page_service_pipelining, // ------------------------------------------------------------ // fields that require additional validation or custom handling diff --git a/pageserver/src/consumption_metrics/upload.rs b/pageserver/src/consumption_metrics/upload.rs index 1cb4e917c0..448bf47525 100644 --- a/pageserver/src/consumption_metrics/upload.rs +++ b/pageserver/src/consumption_metrics/upload.rs @@ -1,3 +1,4 @@ +use std::error::Error as _; use std::time::SystemTime; use chrono::{DateTime, Utc}; @@ -350,7 +351,11 @@ impl std::fmt::Display for UploadError { match self { Rejected(code) => write!(f, "server rejected the metrics with {code}"), - Reqwest(e) => write!(f, "request failed: {e}"), + Reqwest(e) => write!( + f, + "request failed: {e}{}", + e.source().map(|e| format!(": {e}")).unwrap_or_default() + ), Cancelled => write!(f, "cancelled"), } } diff --git a/pageserver/src/context.rs b/pageserver/src/context.rs index 7afcf52cf2..8f2177fe5b 100644 --- a/pageserver/src/context.rs +++ b/pageserver/src/context.rs @@ -91,8 +91,6 @@ use crate::task_mgr::TaskKind; -pub(crate) mod optional_counter; - // The main structure of this module, see module-level 
comment. #[derive(Debug)] pub struct RequestContext { @@ -100,7 +98,6 @@ pub struct RequestContext { download_behavior: DownloadBehavior, access_stats_behavior: AccessStatsBehavior, page_content_kind: PageContentKind, - pub micros_spent_throttled: optional_counter::MicroSecondsCounterU32, } /// The kind of access to the page cache. @@ -158,7 +155,6 @@ impl RequestContextBuilder { download_behavior: DownloadBehavior::Download, access_stats_behavior: AccessStatsBehavior::Update, page_content_kind: PageContentKind::Unknown, - micros_spent_throttled: Default::default(), }, } } @@ -172,7 +168,6 @@ impl RequestContextBuilder { download_behavior: original.download_behavior, access_stats_behavior: original.access_stats_behavior, page_content_kind: original.page_content_kind, - micros_spent_throttled: Default::default(), }, } } diff --git a/pageserver/src/context/optional_counter.rs b/pageserver/src/context/optional_counter.rs deleted file mode 100644 index 100c649f18..0000000000 --- a/pageserver/src/context/optional_counter.rs +++ /dev/null @@ -1,101 +0,0 @@ -use std::{ - sync::atomic::{AtomicU32, Ordering}, - time::Duration, -}; - -#[derive(Debug)] -pub struct CounterU32 { - inner: AtomicU32, -} -impl Default for CounterU32 { - fn default() -> Self { - Self { - inner: AtomicU32::new(u32::MAX), - } - } -} -impl CounterU32 { - pub fn open(&self) -> Result<(), &'static str> { - match self - .inner - .compare_exchange(u32::MAX, 0, Ordering::Relaxed, Ordering::Relaxed) - { - Ok(_) => Ok(()), - Err(_) => Err("open() called on clsoed state"), - } - } - pub fn close(&self) -> Result { - match self.inner.swap(u32::MAX, Ordering::Relaxed) { - u32::MAX => Err("close() called on closed state"), - x => Ok(x), - } - } - - pub fn add(&self, count: u32) -> Result<(), &'static str> { - if count == 0 { - return Ok(()); - } - let mut had_err = None; - self.inner - .fetch_update(Ordering::Relaxed, Ordering::Relaxed, |cur| match cur { - u32::MAX => { - had_err = Some("add() called on closed 
state"); - None - } - x => { - let (new, overflowed) = x.overflowing_add(count); - if new == u32::MAX || overflowed { - had_err = Some("add() overflowed the counter"); - None - } else { - Some(new) - } - } - }) - .map_err(|_| had_err.expect("we set it whenever the function returns None")) - .map(|_| ()) - } -} - -#[derive(Default, Debug)] -pub struct MicroSecondsCounterU32 { - inner: CounterU32, -} - -impl MicroSecondsCounterU32 { - pub fn open(&self) -> Result<(), &'static str> { - self.inner.open() - } - pub fn add(&self, duration: Duration) -> Result<(), &'static str> { - match duration.as_micros().try_into() { - Ok(x) => self.inner.add(x), - Err(_) => Err("add(): duration conversion error"), - } - } - pub fn close_and_checked_sub_from(&self, from: Duration) -> Result { - let val = self.inner.close()?; - let val = Duration::from_micros(val as u64); - let subbed = match from.checked_sub(val) { - Some(v) => v, - None => return Err("Duration::checked_sub"), - }; - Ok(subbed) - } -} - -#[cfg(test)] -mod tests { - - use super::*; - - #[test] - fn test_basic() { - let counter = MicroSecondsCounterU32::default(); - counter.open().unwrap(); - counter.add(Duration::from_micros(23)).unwrap(); - let res = counter - .close_and_checked_sub_from(Duration::from_micros(42)) - .unwrap(); - assert_eq!(res, Duration::from_micros(42 - 23)); - } -} diff --git a/pageserver/src/controller_upcall_client.rs b/pageserver/src/controller_upcall_client.rs index 73fc6dc3ab..d41bfd9021 100644 --- a/pageserver/src/controller_upcall_client.rs +++ b/pageserver/src/controller_upcall_client.rs @@ -115,6 +115,10 @@ impl ControllerUpcallClient { Ok(res) } + + pub(crate) fn base_url(&self) -> &Url { + &self.base_url + } } impl ControlPlaneGenerationsApi for ControllerUpcallClient { @@ -191,13 +195,15 @@ impl ControlPlaneGenerationsApi for ControllerUpcallClient { let request = ReAttachRequest { node_id: self.node_id, - register, + register: register.clone(), }; let response: ReAttachResponse = 
self.retry_http_forever(&re_attach_path, request).await?; tracing::info!( - "Received re-attach response with {} tenants", - response.tenants.len() + "Received re-attach response with {} tenants (node {}, register: {:?})", + response.tenants.len(), + self.node_id, + register, ); failpoint_support::sleep_millis_async!("control-plane-client-re-attach"); diff --git a/pageserver/src/deletion_queue.rs b/pageserver/src/deletion_queue.rs index e74c8ecf5a..1d508f5fe9 100644 --- a/pageserver/src/deletion_queue.rs +++ b/pageserver/src/deletion_queue.rs @@ -838,6 +838,7 @@ mod test { local_path: remote_fs_dir.clone(), }, timeout: RemoteStorageConfig::DEFAULT_TIMEOUT, + small_timeout: RemoteStorageConfig::DEFAULT_SMALL_TIMEOUT, }; let storage = GenericRemoteStorage::from_config(&storage_config) .await diff --git a/pageserver/src/http/routes.rs b/pageserver/src/http/routes.rs index ceb1c3b012..e04f1460a8 100644 --- a/pageserver/src/http/routes.rs +++ b/pageserver/src/http/routes.rs @@ -56,9 +56,9 @@ use tokio_util::sync::CancellationToken; use tracing::*; use utils::auth::JwtAuth; use utils::failpoint_support::failpoints_handler; -use utils::http::endpoint::profile_cpu_handler; -use utils::http::endpoint::prometheus_metrics_handler; -use utils::http::endpoint::request_span; +use utils::http::endpoint::{ + profile_cpu_handler, profile_heap_handler, prometheus_metrics_handler, request_span, +}; use utils::http::request::must_parse_query_param; use utils::http::request::{get_request_param, must_get_query_param, parse_query_param}; @@ -155,6 +155,7 @@ impl State { "/swagger.yml", "/metrics", "/profile/cpu", + "/profile/heap", ]; Ok(Self { conf, @@ -278,7 +279,10 @@ impl From for ApiError { impl From for ApiError { fn from(tse: GetTenantError) -> ApiError { match tse { - GetTenantError::NotFound(tid) => ApiError::NotFound(anyhow!("tenant {}", tid).into()), + GetTenantError::NotFound(tid) => ApiError::NotFound(anyhow!("tenant {tid}").into()), + GetTenantError::ShardNotFound(tid) => { 
+ ApiError::NotFound(anyhow!("tenant {tid}").into()) + } GetTenantError::NotActive(_) => { // Why is this not `ApiError::NotFound`? // Because we must be careful to never return 404 for a tenant if it does @@ -386,6 +390,16 @@ impl From for ApiError { } } +impl From for ApiError { + fn from(ste: crate::tenant::secondary::SecondaryTenantError) -> ApiError { + use crate::tenant::secondary::SecondaryTenantError; + match ste { + SecondaryTenantError::GetTenant(gte) => gte.into(), + SecondaryTenantError::ShuttingDown => ApiError::ShuttingDown, + } + } +} + // Helper function to construct a TimelineInfo struct for a timeline async fn build_timeline_info( timeline: &Arc, @@ -1046,9 +1060,11 @@ async fn timeline_delete_handler( match e { // GetTenantError has a built-in conversion to ApiError, but in this context we don't // want to treat missing tenants as 404, to avoid ambiguity with successful deletions. - GetTenantError::NotFound(_) => ApiError::PreconditionFailed( - "Requested tenant is missing".to_string().into_boxed_str(), - ), + GetTenantError::NotFound(_) | GetTenantError::ShardNotFound(_) => { + ApiError::PreconditionFailed( + "Requested tenant is missing".to_string().into_boxed_str(), + ) + } e => e.into(), } })?; @@ -2461,8 +2477,7 @@ async fn secondary_upload_handler( state .secondary_controller .upload_tenant(tenant_shard_id) - .await - .map_err(ApiError::InternalServerError)?; + .await?; json_response(StatusCode::OK, ()) } @@ -2577,7 +2592,7 @@ async fn secondary_download_handler( // Edge case: downloads aren't usually fallible: things like a missing heatmap are considered // okay. We could get an error here in the unlikely edge case that the tenant // was detached between our check above and executing the download job. - Ok(Err(e)) => return Err(ApiError::InternalServerError(e)), + Ok(Err(e)) => return Err(e.into()), // A timeout is not an error: we have started the download, we're just not done // yet. The caller will get a response body indicating status. 
Err(_) => StatusCode::ACCEPTED, @@ -3203,6 +3218,7 @@ pub fn make_router( .data(state) .get("/metrics", |r| request_span(r, prometheus_metrics_handler)) .get("/profile/cpu", |r| request_span(r, profile_cpu_handler)) + .get("/profile/heap", |r| request_span(r, profile_heap_handler)) .get("/v1/status", |r| api_handler(r, status_handler)) .put("/v1/failpoints", |r| { testing_api_handler("manage failpoints", r, failpoints_handler) diff --git a/pageserver/src/import_datadir.rs b/pageserver/src/import_datadir.rs index 06c4553e1c..c061714010 100644 --- a/pageserver/src/import_datadir.rs +++ b/pageserver/src/import_datadir.rs @@ -575,18 +575,24 @@ async fn import_file( } else if file_path.starts_with("pg_xact") { let slru = SlruKind::Clog; - import_slru(modification, slru, file_path, reader, len, ctx).await?; - debug!("imported clog slru"); + if modification.tline.tenant_shard_id.is_shard_zero() { + import_slru(modification, slru, file_path, reader, len, ctx).await?; + debug!("imported clog slru"); + } } else if file_path.starts_with("pg_multixact/offsets") { let slru = SlruKind::MultiXactOffsets; - import_slru(modification, slru, file_path, reader, len, ctx).await?; - debug!("imported multixact offsets slru"); + if modification.tline.tenant_shard_id.is_shard_zero() { + import_slru(modification, slru, file_path, reader, len, ctx).await?; + debug!("imported multixact offsets slru"); + } } else if file_path.starts_with("pg_multixact/members") { let slru = SlruKind::MultiXactMembers; - import_slru(modification, slru, file_path, reader, len, ctx).await?; - debug!("imported multixact members slru"); + if modification.tline.tenant_shard_id.is_shard_zero() { + import_slru(modification, slru, file_path, reader, len, ctx).await?; + debug!("imported multixact members slru"); + } } else if file_path.starts_with("pg_twophase") { let bytes = read_all_bytes(reader).await?; diff --git a/pageserver/src/lib.rs b/pageserver/src/lib.rs index ef6711397a..ff6af3566c 100644 --- 
a/pageserver/src/lib.rs +++ b/pageserver/src/lib.rs @@ -356,6 +356,25 @@ async fn timed( } } +/// Like [`timed`], but the warning timeout only starts after `cancel` has been cancelled. +async fn timed_after_cancellation( + fut: Fut, + name: &str, + warn_at: std::time::Duration, + cancel: &CancellationToken, +) -> ::Output { + let mut fut = std::pin::pin!(fut); + + tokio::select! { + _ = cancel.cancelled() => { + timed(fut, name, warn_at).await + } + ret = &mut fut => { + ret + } + } +} + #[cfg(test)] mod timed_tests { use super::timed; diff --git a/pageserver/src/metrics.rs b/pageserver/src/metrics.rs index 86be97587f..998c15ccaf 100644 --- a/pageserver/src/metrics.rs +++ b/pageserver/src/metrics.rs @@ -7,6 +7,10 @@ use metrics::{ IntCounterPairVec, IntCounterVec, IntGauge, IntGaugeVec, UIntGauge, UIntGaugeVec, }; use once_cell::sync::Lazy; +use pageserver_api::config::{ + PageServicePipeliningConfig, PageServicePipeliningConfigPipelined, + PageServiceProtocolPipelinedExecutionStrategy, +}; use pageserver_api::shard::TenantShardId; use postgres_backend::{is_expected_io_error, QueryError}; use pq_proto::framed::ConnectionError; @@ -213,31 +217,16 @@ impl<'a> ScanLatencyOngoingRecording<'a> { ScanLatencyOngoingRecording { parent, start } } - pub(crate) fn observe(self, throttled: Option) { + pub(crate) fn observe(self) { let elapsed = self.start.elapsed(); - let ex_throttled = if let Some(throttled) = throttled { - elapsed.checked_sub(throttled) - } else { - Some(elapsed) - }; - if let Some(ex_throttled) = ex_throttled { - self.parent.observe(ex_throttled.as_secs_f64()); - } else { - use utils::rate_limit::RateLimit; - static LOGGED: Lazy> = - Lazy::new(|| Mutex::new(RateLimit::new(Duration::from_secs(10)))); - let mut rate_limit = LOGGED.lock().unwrap(); - rate_limit.call(|| { - warn!("error deducting time spent throttled; this message is logged at a global rate limit"); - }); - } + self.parent.observe(elapsed.as_secs_f64()); } } pub(crate) static 
GET_VECTORED_LATENCY: Lazy = Lazy::new(|| { let inner = register_histogram_vec!( "pageserver_get_vectored_seconds", - "Time spent in get_vectored, excluding time spent in timeline_get_throttle.", + "Time spent in get_vectored.", &["task_kind"], CRITICAL_OP_BUCKETS.into(), ) @@ -260,7 +249,7 @@ pub(crate) static GET_VECTORED_LATENCY: Lazy = Lazy::new(|| pub(crate) static SCAN_LATENCY: Lazy = Lazy::new(|| { let inner = register_histogram_vec!( "pageserver_scan_seconds", - "Time spent in scan, excluding time spent in timeline_get_throttle.", + "Time spent in scan.", &["task_kind"], CRITICAL_OP_BUCKETS.into(), ) @@ -1216,28 +1205,33 @@ pub(crate) mod virtual_file_io_engine { }); } -struct GlobalAndPerTimelineHistogramTimer<'a, 'c> { - global_latency_histo: &'a Histogram, +pub(crate) struct SmgrOpTimer { + global_latency_histo: Histogram, // Optional because not all op types are tracked per-timeline - per_timeline_latency_histo: Option<&'a Histogram>, + per_timeline_latency_histo: Option, - ctx: &'c RequestContext, - start: std::time::Instant, + start: Instant, + throttled: Duration, op: SmgrQueryType, - count: usize, } -impl Drop for GlobalAndPerTimelineHistogramTimer<'_, '_> { +impl SmgrOpTimer { + pub(crate) fn deduct_throttle(&mut self, throttle: &Option) { + let Some(throttle) = throttle else { + return; + }; + self.throttled += *throttle; + } +} + +impl Drop for SmgrOpTimer { fn drop(&mut self) { let elapsed = self.start.elapsed(); - let ex_throttled = self - .ctx - .micros_spent_throttled - .close_and_checked_sub_from(elapsed); - let ex_throttled = match ex_throttled { - Ok(res) => res, - Err(error) => { + + let elapsed = match elapsed.checked_sub(self.throttled) { + Some(elapsed) => elapsed, + None => { use utils::rate_limit::RateLimit; static LOGGED: Lazy>> = Lazy::new(|| { @@ -1248,18 +1242,17 @@ impl Drop for GlobalAndPerTimelineHistogramTimer<'_, '_> { let mut guard = LOGGED.lock().unwrap(); let rate_limit = &mut guard[self.op]; rate_limit.call(|| { - 
warn!(op=?self.op, error, "error deducting time spent throttled; this message is logged at a global rate limit"); + warn!(op=?self.op, ?elapsed, ?self.throttled, "implementation error: time spent throttled exceeds total request wall clock time"); }); - elapsed + elapsed // un-throttled time, more info than just saturating to 0 } }; - for _ in 0..self.count { - self.global_latency_histo - .observe(ex_throttled.as_secs_f64()); - if let Some(per_timeline_getpage_histo) = self.per_timeline_latency_histo { - per_timeline_getpage_histo.observe(ex_throttled.as_secs_f64()); - } + let elapsed = elapsed.as_secs_f64(); + + self.global_latency_histo.observe(elapsed); + if let Some(per_timeline_getpage_histo) = &self.per_timeline_latency_histo { + per_timeline_getpage_histo.observe(elapsed); } } } @@ -1289,6 +1282,8 @@ pub(crate) struct SmgrQueryTimePerTimeline { global_latency: [Histogram; SmgrQueryType::COUNT], per_timeline_getpage_started: IntCounter, per_timeline_getpage_latency: Histogram, + global_batch_size: Histogram, + per_timeline_batch_size: Histogram, } static SMGR_QUERY_STARTED_GLOBAL: Lazy = Lazy::new(|| { @@ -1381,6 +1376,76 @@ static SMGR_QUERY_TIME_GLOBAL: Lazy = Lazy::new(|| { .expect("failed to define a metric") }); +static PAGE_SERVICE_BATCH_SIZE_BUCKETS_GLOBAL: Lazy> = Lazy::new(|| { + (1..=u32::try_from(Timeline::MAX_GET_VECTORED_KEYS).unwrap()) + .map(|v| v.into()) + .collect() +}); + +static PAGE_SERVICE_BATCH_SIZE_GLOBAL: Lazy = Lazy::new(|| { + register_histogram!( + "pageserver_page_service_batch_size_global", + "Batch size of pageserver page service requests", + PAGE_SERVICE_BATCH_SIZE_BUCKETS_GLOBAL.clone(), + ) + .expect("failed to define a metric") +}); + +static PAGE_SERVICE_BATCH_SIZE_BUCKETS_PER_TIMELINE: Lazy> = Lazy::new(|| { + let mut buckets = Vec::new(); + for i in 0.. 
{ + let bucket = 1 << i; + if bucket > u32::try_from(Timeline::MAX_GET_VECTORED_KEYS).unwrap() { + break; + } + buckets.push(bucket.into()); + } + buckets +}); + +static PAGE_SERVICE_BATCH_SIZE_PER_TENANT_TIMELINE: Lazy = Lazy::new(|| { + register_histogram_vec!( + "pageserver_page_service_batch_size", + "Batch size of pageserver page service requests", + &["tenant_id", "shard_id", "timeline_id"], + PAGE_SERVICE_BATCH_SIZE_BUCKETS_PER_TIMELINE.clone() + ) + .expect("failed to define a metric") +}); + +pub(crate) static PAGE_SERVICE_CONFIG_MAX_BATCH_SIZE: Lazy = Lazy::new(|| { + register_int_gauge_vec!( + "pageserver_page_service_config_max_batch_size", + "Configured maximum batch size for the server-side batching functionality of page_service. \ + Labels expose more of the configuration parameters.", + &["mode", "execution"] + ) + .expect("failed to define a metric") +}); + +fn set_page_service_config_max_batch_size(conf: &PageServicePipeliningConfig) { + PAGE_SERVICE_CONFIG_MAX_BATCH_SIZE.reset(); + let (label_values, value) = match conf { + PageServicePipeliningConfig::Serial => (["serial", "-"], 1), + PageServicePipeliningConfig::Pipelined(PageServicePipeliningConfigPipelined { + max_batch_size, + execution, + }) => { + let mode = "pipelined"; + let execution = match execution { + PageServiceProtocolPipelinedExecutionStrategy::ConcurrentFutures => { + "concurrent-futures" + } + PageServiceProtocolPipelinedExecutionStrategy::Tasks => "tasks", + }; + ([mode, execution], max_batch_size.get()) + } + }; + PAGE_SERVICE_CONFIG_MAX_BATCH_SIZE + .with_label_values(&label_values) + .set(value.try_into().unwrap()); +} + impl SmgrQueryTimePerTimeline { pub(crate) fn new(tenant_shard_id: &TenantShardId, timeline_id: &TimelineId) -> Self { let tenant_id = tenant_shard_id.tenant_id.to_string(); @@ -1416,78 +1481,53 @@ impl SmgrQueryTimePerTimeline { ]) .unwrap(); + let global_batch_size = PAGE_SERVICE_BATCH_SIZE_GLOBAL.clone(); + let per_timeline_batch_size = 
PAGE_SERVICE_BATCH_SIZE_PER_TENANT_TIMELINE + .get_metric_with_label_values(&[&tenant_id, &shard_slug, &timeline_id]) + .unwrap(); + Self { global_started, global_latency, per_timeline_getpage_latency, per_timeline_getpage_started, + global_batch_size, + per_timeline_batch_size, } } - pub(crate) fn start_timer<'c: 'a, 'a>( - &'a self, - op: SmgrQueryType, - ctx: &'c RequestContext, - ) -> Option { - self.start_timer_many(op, 1, ctx) - } - pub(crate) fn start_timer_many<'c: 'a, 'a>( - &'a self, - op: SmgrQueryType, - count: usize, - ctx: &'c RequestContext, - ) -> Option { - let start = Instant::now(); - + pub(crate) fn start_smgr_op(&self, op: SmgrQueryType, started_at: Instant) -> SmgrOpTimer { self.global_started[op as usize].inc(); - // We subtract time spent throttled from the observed latency. - match ctx.micros_spent_throttled.open() { - Ok(()) => (), - Err(error) => { - use utils::rate_limit::RateLimit; - static LOGGED: Lazy>> = - Lazy::new(|| { - Mutex::new(enum_map::EnumMap::from_array(std::array::from_fn(|_| { - RateLimit::new(Duration::from_secs(10)) - }))) - }); - let mut guard = LOGGED.lock().unwrap(); - let rate_limit = &mut guard[op]; - rate_limit.call(|| { - warn!(?op, error, "error opening micros_spent_throttled; this message is logged at a global rate limit"); - }); - } - } - let per_timeline_latency_histo = if matches!(op, SmgrQueryType::GetPageAtLsn) { self.per_timeline_getpage_started.inc(); - Some(&self.per_timeline_getpage_latency) + Some(self.per_timeline_getpage_latency.clone()) } else { None }; - Some(GlobalAndPerTimelineHistogramTimer { - global_latency_histo: &self.global_latency[op as usize], + SmgrOpTimer { + global_latency_histo: self.global_latency[op as usize].clone(), per_timeline_latency_histo, - ctx, - start, + start: started_at, op, - count, - }) + throttled: Duration::ZERO, + } + } + + pub(crate) fn observe_getpage_batch_start(&self, batch_size: usize) { + self.global_batch_size.observe(batch_size as f64); + 
self.per_timeline_batch_size.observe(batch_size as f64); } } #[cfg(test)] mod smgr_query_time_tests { + use std::time::Instant; + use pageserver_api::shard::TenantShardId; use strum::IntoEnumIterator; use utils::id::{TenantId, TimelineId}; - use crate::{ - context::{DownloadBehavior, RequestContext}, - task_mgr::TaskKind, - }; - // Regression test, we used hard-coded string constants before using an enum. #[test] fn op_label_name() { @@ -1531,8 +1571,7 @@ mod smgr_query_time_tests { let (pre_global, pre_per_tenant_timeline) = get_counts(); assert_eq!(pre_per_tenant_timeline, 0); - let ctx = RequestContext::new(TaskKind::UnitTest, DownloadBehavior::Download); - let timer = metrics.start_timer(*op, &ctx); + let timer = metrics.start_smgr_op(*op, Instant::now()); drop(timer); let (post_global, post_per_tenant_timeline) = get_counts(); @@ -1579,58 +1618,24 @@ pub(crate) static BASEBACKUP_QUERY_TIME: Lazy = Lazy::new(| } }); -pub(crate) struct BasebackupQueryTimeOngoingRecording<'a, 'c> { +pub(crate) struct BasebackupQueryTimeOngoingRecording<'a> { parent: &'a BasebackupQueryTime, - ctx: &'c RequestContext, start: std::time::Instant, } impl BasebackupQueryTime { - pub(crate) fn start_recording<'c: 'a, 'a>( - &'a self, - ctx: &'c RequestContext, - ) -> BasebackupQueryTimeOngoingRecording<'a, 'a> { + pub(crate) fn start_recording(&self) -> BasebackupQueryTimeOngoingRecording<'_> { let start = Instant::now(); - match ctx.micros_spent_throttled.open() { - Ok(()) => (), - Err(error) => { - use utils::rate_limit::RateLimit; - static LOGGED: Lazy> = - Lazy::new(|| Mutex::new(RateLimit::new(Duration::from_secs(10)))); - let mut rate_limit = LOGGED.lock().unwrap(); - rate_limit.call(|| { - warn!(error, "error opening micros_spent_throttled; this message is logged at a global rate limit"); - }); - } - } BasebackupQueryTimeOngoingRecording { parent: self, - ctx, start, } } } -impl BasebackupQueryTimeOngoingRecording<'_, '_> { +impl BasebackupQueryTimeOngoingRecording<'_> { 
pub(crate) fn observe(self, res: &Result) { - let elapsed = self.start.elapsed(); - let ex_throttled = self - .ctx - .micros_spent_throttled - .close_and_checked_sub_from(elapsed); - let ex_throttled = match ex_throttled { - Ok(ex_throttled) => ex_throttled, - Err(error) => { - use utils::rate_limit::RateLimit; - static LOGGED: Lazy> = - Lazy::new(|| Mutex::new(RateLimit::new(Duration::from_secs(10)))); - let mut rate_limit = LOGGED.lock().unwrap(); - rate_limit.call(|| { - warn!(error, "error deducting time spent throttled; this message is logged at a global rate limit"); - }); - elapsed - } - }; + let elapsed = self.start.elapsed().as_secs_f64(); // If you want to change categorize of a specific error, also change it in `log_query_error`. let metric = match res { Ok(_) => &self.parent.ok, @@ -1641,7 +1646,7 @@ impl BasebackupQueryTimeOngoingRecording<'_, '_> { } Err(_) => &self.parent.error, }; - metric.observe(ex_throttled.as_secs_f64()); + metric.observe(elapsed); } } @@ -2722,6 +2727,11 @@ impl TimelineMetrics { shard_id, timeline_id, ]); + let _ = PAGE_SERVICE_BATCH_SIZE_PER_TENANT_TIMELINE.remove_label_values(&[ + tenant_id, + shard_id, + timeline_id, + ]); } } @@ -2747,10 +2757,12 @@ use std::sync::{Arc, Mutex}; use std::task::{Context, Poll}; use std::time::{Duration, Instant}; +use crate::config::PageServerConf; use crate::context::{PageContentKind, RequestContext}; use crate::task_mgr::TaskKind; use crate::tenant::mgr::TenantSlot; use crate::tenant::tasks::BackgroundLoopKind; +use crate::tenant::Timeline; /// Maintain a per timeline gauge in addition to the global gauge. 
pub(crate) struct PerTimelineRemotePhysicalSizeGauge { @@ -3307,7 +3319,7 @@ pub(crate) mod tenant_throttling { use once_cell::sync::Lazy; use utils::shard::TenantShardId; - use crate::tenant::{self, throttle::Metric}; + use crate::tenant::{self}; struct GlobalAndPerTenantIntCounter { global: IntCounter, @@ -3326,7 +3338,7 @@ pub(crate) mod tenant_throttling { } } - pub(crate) struct TimelineGet { + pub(crate) struct Metrics { count_accounted_start: GlobalAndPerTenantIntCounter, count_accounted_finish: GlobalAndPerTenantIntCounter, wait_time: GlobalAndPerTenantIntCounter, @@ -3399,40 +3411,41 @@ pub(crate) mod tenant_throttling { .unwrap() }); - const KIND: &str = "timeline_get"; + const KINDS: &[&str] = &["pagestream"]; + pub type Pagestream = Metrics<0>; - impl TimelineGet { + impl Metrics { pub(crate) fn new(tenant_shard_id: &TenantShardId) -> Self { let per_tenant_label_values = &[ - KIND, + KINDS[KIND], &tenant_shard_id.tenant_id.to_string(), &tenant_shard_id.shard_slug().to_string(), ]; - TimelineGet { + Metrics { count_accounted_start: { GlobalAndPerTenantIntCounter { - global: COUNT_ACCOUNTED_START.with_label_values(&[KIND]), + global: COUNT_ACCOUNTED_START.with_label_values(&[KINDS[KIND]]), per_tenant: COUNT_ACCOUNTED_START_PER_TENANT .with_label_values(per_tenant_label_values), } }, count_accounted_finish: { GlobalAndPerTenantIntCounter { - global: COUNT_ACCOUNTED_FINISH.with_label_values(&[KIND]), + global: COUNT_ACCOUNTED_FINISH.with_label_values(&[KINDS[KIND]]), per_tenant: COUNT_ACCOUNTED_FINISH_PER_TENANT .with_label_values(per_tenant_label_values), } }, wait_time: { GlobalAndPerTenantIntCounter { - global: WAIT_USECS.with_label_values(&[KIND]), + global: WAIT_USECS.with_label_values(&[KINDS[KIND]]), per_tenant: WAIT_USECS_PER_TENANT .with_label_values(per_tenant_label_values), } }, count_throttled: { GlobalAndPerTenantIntCounter { - global: WAIT_COUNT.with_label_values(&[KIND]), + global: WAIT_COUNT.with_label_values(&[KINDS[KIND]]), per_tenant: 
WAIT_COUNT_PER_TENANT .with_label_values(per_tenant_label_values), } @@ -3455,15 +3468,17 @@ pub(crate) mod tenant_throttling { &WAIT_USECS_PER_TENANT, &WAIT_COUNT_PER_TENANT, ] { - let _ = m.remove_label_values(&[ - KIND, - &tenant_shard_id.tenant_id.to_string(), - &tenant_shard_id.shard_slug().to_string(), - ]); + for kind in KINDS { + let _ = m.remove_label_values(&[ + kind, + &tenant_shard_id.tenant_id.to_string(), + &tenant_shard_id.shard_slug().to_string(), + ]); + } } } - impl Metric for TimelineGet { + impl tenant::throttle::Metric for Metrics { #[inline(always)] fn accounting_start(&self) { self.count_accounted_start.inc(); @@ -3562,7 +3577,9 @@ pub(crate) fn set_tokio_runtime_setup(setup: &str, num_threads: NonZeroUsize) { .set(u64::try_from(num_threads.get()).unwrap()); } -pub fn preinitialize_metrics() { +pub fn preinitialize_metrics(conf: &'static PageServerConf) { + set_page_service_config_max_batch_size(&conf.page_service_pipelining); + // Python tests need these and on some we do alerting. 
// // FIXME(4813): make it so that we have no top level metrics as this fn will easily fall out of @@ -3630,6 +3647,7 @@ pub fn preinitialize_metrics() { &WAL_REDO_RECORDS_HISTOGRAM, &WAL_REDO_BYTES_HISTOGRAM, &WAL_REDO_PROCESS_LAUNCH_DURATION_HISTOGRAM, + &PAGE_SERVICE_BATCH_SIZE_GLOBAL, ] .into_iter() .for_each(|h| { diff --git a/pageserver/src/page_service.rs b/pageserver/src/page_service.rs index 5fd02d8749..7026df9527 100644 --- a/pageserver/src/page_service.rs +++ b/pageserver/src/page_service.rs @@ -7,6 +7,10 @@ use bytes::Buf; use futures::FutureExt; use itertools::Itertools; use once_cell::sync::OnceCell; +use pageserver_api::config::{ + PageServicePipeliningConfig, PageServicePipeliningConfigPipelined, + PageServiceProtocolPipelinedExecutionStrategy, +}; use pageserver_api::models::{self, TenantState}; use pageserver_api::models::{ PagestreamBeMessage, PagestreamDbSizeRequest, PagestreamDbSizeResponse, @@ -16,12 +20,15 @@ use pageserver_api::models::{ PagestreamProtocolVersion, }; use pageserver_api::shard::TenantShardId; -use postgres_backend::{is_expected_io_error, AuthType, PostgresBackend, QueryError}; +use postgres_backend::{ + is_expected_io_error, AuthType, PostgresBackend, PostgresBackendReader, QueryError, +}; use pq_proto::framed::ConnectionError; use pq_proto::FeStartupPacket; use pq_proto::{BeMessage, FeMessage, RowDescriptor}; use std::borrow::Cow; use std::io; +use std::num::NonZeroUsize; use std::str; use std::str::FromStr; use std::sync::Arc; @@ -32,6 +39,7 @@ use tokio::io::{AsyncWriteExt, BufWriter}; use tokio::task::JoinHandle; use tokio_util::sync::CancellationToken; use tracing::*; +use utils::sync::spsc_fold; use utils::{ auth::{Claims, Scope, SwappableJwtAuth}, id::{TenantId, TimelineId}, @@ -40,11 +48,10 @@ use utils::{ }; use crate::auth::check_permission; -use crate::basebackup; use crate::basebackup::BasebackupError; use crate::config::PageServerConf; use crate::context::{DownloadBehavior, RequestContext}; -use 
crate::metrics::{self}; +use crate::metrics::{self, SmgrOpTimer}; use crate::metrics::{ComputeCommandKind, COMPUTE_COMMANDS_COUNTERS, LIVE_CONNECTIONS}; use crate::pgdatadir_mapping::Version; use crate::span::debug_assert_current_span_has_tenant_and_timeline_id; @@ -58,6 +65,7 @@ use crate::tenant::timeline::{self, WaitLsnError}; use crate::tenant::GetTimelineError; use crate::tenant::PageReconstructError; use crate::tenant::Timeline; +use crate::{basebackup, timed_after_cancellation}; use pageserver_api::key::rel_block_to_key; use pageserver_api::reltag::{BlockNumber, RelTag, SlruKind}; use postgres_ffi::pg_constants::DEFAULTTABLESPACE_OID; @@ -105,7 +113,7 @@ pub fn spawn( pg_auth, tcp_listener, conf.pg_auth_type, - conf.server_side_batch_timeout, + conf.page_service_pipelining.clone(), libpq_ctx, cancel.clone(), ) @@ -154,7 +162,7 @@ pub async fn libpq_listener_main( auth: Option>, listener: tokio::net::TcpListener, auth_type: AuthType, - server_side_batch_timeout: Option, + pipelining_config: PageServicePipeliningConfig, listener_ctx: RequestContext, listener_cancel: CancellationToken, ) -> Connections { @@ -185,7 +193,7 @@ pub async fn libpq_listener_main( local_auth, socket, auth_type, - server_side_batch_timeout, + pipelining_config.clone(), connection_ctx, connections_cancel.child_token(), )); @@ -213,7 +221,7 @@ async fn page_service_conn_main( auth: Option>, socket: tokio::net::TcpStream, auth_type: AuthType, - server_side_batch_timeout: Option, + pipelining_config: PageServicePipeliningConfig, connection_ctx: RequestContext, cancel: CancellationToken, ) -> ConnectionHandlerResult { @@ -256,7 +264,7 @@ async fn page_service_conn_main( // a while: we will tear down this PageServerHandler and instantiate a new one if/when // they reconnect. 
socket.set_timeout(Some(std::time::Duration::from_millis(socket_timeout_ms))); - let socket = std::pin::pin!(socket); + let socket = Box::pin(socket); fail::fail_point!("ps::connection-start::pre-login"); @@ -267,7 +275,7 @@ async fn page_service_conn_main( let mut conn_handler = PageServerHandler::new( tenant_manager, auth, - server_side_batch_timeout, + pipelining_config, connection_ctx, cancel.clone(), ); @@ -283,7 +291,7 @@ async fn page_service_conn_main( info!("Postgres client disconnected ({io_error})"); Ok(()) } else { - let tenant_id = conn_handler.timeline_handles.tenant_id(); + let tenant_id = conn_handler.timeline_handles.as_ref().unwrap().tenant_id(); Err(io_error).context(format!( "Postgres connection error for tenant_id={:?} client at peer_addr={}", tenant_id, peer_addr @@ -291,7 +299,7 @@ async fn page_service_conn_main( } } other => { - let tenant_id = conn_handler.timeline_handles.tenant_id(); + let tenant_id = conn_handler.timeline_handles.as_ref().unwrap().tenant_id(); other.context(format!( "Postgres query error for tenant_id={:?} client peer_addr={}", tenant_id, peer_addr @@ -312,13 +320,10 @@ struct PageServerHandler { cancel: CancellationToken, - timeline_handles: TimelineHandles, + /// None only while pagestream protocol is being processed. 
+ timeline_handles: Option, - /// Messages queued up for the next processing batch - next_batch: Option, - - /// See [`PageServerConf::server_side_batch_timeout`] - server_side_batch_timeout: Option, + pipelining_config: PageServicePipeliningConfig, } struct TimelineHandles { @@ -535,24 +540,32 @@ impl From for QueryError { enum BatchedFeMessage { Exists { span: Span, + timer: SmgrOpTimer, + shard: timeline::handle::Handle, req: models::PagestreamExistsRequest, }, Nblocks { span: Span, + timer: SmgrOpTimer, + shard: timeline::handle::Handle, req: models::PagestreamNblocksRequest, }, GetPage { span: Span, shard: timeline::handle::Handle, effective_request_lsn: Lsn, - pages: smallvec::SmallVec<[(RelTag, BlockNumber); 1]>, + pages: smallvec::SmallVec<[(RelTag, BlockNumber, SmgrOpTimer); 1]>, }, DbSize { span: Span, + timer: SmgrOpTimer, + shard: timeline::handle::Handle, req: models::PagestreamDbSizeRequest, }, GetSlruSegment { span: Span, + timer: SmgrOpTimer, + shard: timeline::handle::Handle, req: models::PagestreamGetSlruSegmentRequest, }, RespondError { @@ -561,18 +574,46 @@ enum BatchedFeMessage { }, } -enum BatchOrEof { - /// In the common case, this has one entry. - /// At most, it has two entries: the first is the leftover batch, the second is an error. - Batch(smallvec::SmallVec<[BatchedFeMessage; 1]>), - Eof, +impl BatchedFeMessage { + async fn throttle(&mut self, cancel: &CancellationToken) -> Result<(), QueryError> { + let (shard, tokens, timers) = match self { + BatchedFeMessage::Exists { shard, timer, .. } + | BatchedFeMessage::Nblocks { shard, timer, .. } + | BatchedFeMessage::DbSize { shard, timer, .. } + | BatchedFeMessage::GetSlruSegment { shard, timer, .. } => { + ( + shard, + // 1 token is probably under-estimating because these + // request handlers typically do several Timeline::get calls. + 1, + itertools::Either::Left(std::iter::once(timer)), + ) + } + BatchedFeMessage::GetPage { shard, pages, .. 
} => ( + shard, + pages.len(), + itertools::Either::Right(pages.iter_mut().map(|(_, _, timer)| timer)), + ), + BatchedFeMessage::RespondError { .. } => return Ok(()), + }; + let throttled = tokio::select! { + throttled = shard.pagestream_throttle.throttle(tokens) => { throttled } + _ = cancel.cancelled() => { + return Err(QueryError::Shutdown); + } + }; + for timer in timers { + timer.deduct_throttle(&throttled); + } + Ok(()) + } } impl PageServerHandler { pub fn new( tenant_manager: Arc, auth: Option>, - server_side_batch_timeout: Option, + pipelining_config: PageServicePipeliningConfig, connection_ctx: RequestContext, cancel: CancellationToken, ) -> Self { @@ -580,10 +621,9 @@ impl PageServerHandler { auth, claims: None, connection_ctx, - timeline_handles: TimelineHandles::new(tenant_manager), + timeline_handles: Some(TimelineHandles::new(tenant_manager)), cancel, - next_batch: None, - server_side_batch_timeout, + pipelining_config, } } @@ -611,219 +651,428 @@ impl PageServerHandler { ) } - async fn read_batch_from_connection( - &mut self, - pgb: &mut PostgresBackend, - tenant_id: &TenantId, - timeline_id: &TimelineId, + async fn pagestream_read_message( + pgb: &mut PostgresBackendReader, + tenant_id: TenantId, + timeline_id: TimelineId, + timeline_handles: &mut TimelineHandles, + cancel: &CancellationToken, ctx: &RequestContext, - ) -> Result, QueryError> + parent_span: Span, + ) -> Result, QueryError> + where + IO: AsyncRead + AsyncWrite + Send + Sync + Unpin + 'static, + { + let msg = tokio::select! { + biased; + _ = cancel.cancelled() => { + return Err(QueryError::Shutdown) + } + msg = pgb.read_message() => { msg } + }; + + let received_at = Instant::now(); + + let copy_data_bytes = match msg? 
{ + Some(FeMessage::CopyData(bytes)) => bytes, + Some(FeMessage::Terminate) => { + return Ok(None); + } + Some(m) => { + return Err(QueryError::Other(anyhow::anyhow!( + "unexpected message: {m:?} during COPY" + ))); + } + None => { + return Ok(None); + } // client disconnected + }; + trace!("query: {copy_data_bytes:?}"); + + fail::fail_point!("ps::handle-pagerequest-message"); + + // parse request + let neon_fe_msg = PagestreamFeMessage::parse(&mut copy_data_bytes.reader())?; + + let batched_msg = match neon_fe_msg { + PagestreamFeMessage::Exists(req) => { + let span = tracing::info_span!(parent: parent_span, "handle_get_rel_exists_request", rel = %req.rel, req_lsn = %req.request_lsn); + let shard = timeline_handles + .get(tenant_id, timeline_id, ShardSelector::Zero) + .instrument(span.clone()) // sets `shard_id` field + .await?; + let timer = shard + .query_metrics + .start_smgr_op(metrics::SmgrQueryType::GetRelExists, received_at); + BatchedFeMessage::Exists { + span, + timer, + shard, + req, + } + } + PagestreamFeMessage::Nblocks(req) => { + let span = tracing::info_span!(parent: parent_span, "handle_get_nblocks_request", rel = %req.rel, req_lsn = %req.request_lsn); + let shard = timeline_handles + .get(tenant_id, timeline_id, ShardSelector::Zero) + .instrument(span.clone()) // sets `shard_id` field + .await?; + let timer = shard + .query_metrics + .start_smgr_op(metrics::SmgrQueryType::GetRelSize, received_at); + BatchedFeMessage::Nblocks { + span, + timer, + shard, + req, + } + } + PagestreamFeMessage::DbSize(req) => { + let span = tracing::info_span!(parent: parent_span, "handle_db_size_request", dbnode = %req.dbnode, req_lsn = %req.request_lsn); + let shard = timeline_handles + .get(tenant_id, timeline_id, ShardSelector::Zero) + .instrument(span.clone()) // sets `shard_id` field + .await?; + let timer = shard + .query_metrics + .start_smgr_op(metrics::SmgrQueryType::GetDbSize, received_at); + BatchedFeMessage::DbSize { + span, + timer, + shard, + req, + } + 
} + PagestreamFeMessage::GetSlruSegment(req) => { + let span = tracing::info_span!(parent: parent_span, "handle_get_slru_segment_request", kind = %req.kind, segno = %req.segno, req_lsn = %req.request_lsn); + let shard = timeline_handles + .get(tenant_id, timeline_id, ShardSelector::Zero) + .instrument(span.clone()) // sets `shard_id` field + .await?; + let timer = shard + .query_metrics + .start_smgr_op(metrics::SmgrQueryType::GetSlruSegment, received_at); + BatchedFeMessage::GetSlruSegment { + span, + timer, + shard, + req, + } + } + PagestreamFeMessage::GetPage(PagestreamGetPageRequest { + request_lsn, + not_modified_since, + rel, + blkno, + }) => { + let span = tracing::info_span!(parent: parent_span, "handle_get_page_at_lsn_request_batched", req_lsn = %request_lsn); + + macro_rules! respond_error { + ($error:expr) => {{ + let error = BatchedFeMessage::RespondError { + span, + error: $error, + }; + Ok(Some(error)) + }}; + } + + let key = rel_block_to_key(rel, blkno); + let shard = match timeline_handles + .get(tenant_id, timeline_id, ShardSelector::Page(key)) + .instrument(span.clone()) // sets `shard_id` field + .await + { + Ok(tl) => tl, + Err(GetActiveTimelineError::Tenant(GetActiveTenantError::NotFound(_))) => { + // We already know this tenant exists in general, because we resolved it at + // start of connection. Getting a NotFound here indicates that the shard containing + // the requested page is not present on this node: the client's knowledge of shard->pageserver + // mapping is out of date. + // + // Closing the connection by returning `PageStreamError::Reconnect` has the side effect of rate-limiting above message, via + // client's reconnect backoff, as well as hopefully prompting the client to load its updated configuration + // and talk to a different pageserver.
+ return respond_error!(PageStreamError::Reconnect( + "getpage@lsn request routed to wrong shard".into() + )); + } + Err(e) => { + return respond_error!(e.into()); + } + }; + + // It's important to start the timer before waiting for the LSN + // so that the _started counters are incremented before we do + // any serious waiting, e.g., for LSNs. + let timer = shard + .query_metrics + .start_smgr_op(metrics::SmgrQueryType::GetPageAtLsn, received_at); + + let effective_request_lsn = match Self::wait_or_get_last_lsn( + &shard, + request_lsn, + not_modified_since, + &shard.get_latest_gc_cutoff_lsn(), + ctx, + ) + // TODO: if we actually need to wait for lsn here, it delays the entire batch which doesn't need to wait + .await + { + Ok(lsn) => lsn, + Err(e) => { + return respond_error!(e); + } + }; + BatchedFeMessage::GetPage { + span, + shard, + effective_request_lsn, + pages: smallvec::smallvec![(rel, blkno, timer)], + } + } + }; + Ok(Some(batched_msg)) + } + + /// Post-condition: `batch` is Some() + #[instrument(skip_all, level = tracing::Level::TRACE)] + #[allow(clippy::boxed_local)] + fn pagestream_do_batch( + max_batch_size: NonZeroUsize, + batch: &mut Result, + this_msg: Result, + ) -> Result<(), Result> { + debug_assert_current_span_has_tenant_and_timeline_id_no_shard_id(); + + let this_msg = match this_msg { + Ok(this_msg) => this_msg, + Err(e) => return Err(Err(e)), + }; + + match (&mut *batch, this_msg) { + // something batched already, let's see if we can add this message to the batch + ( + Ok(BatchedFeMessage::GetPage { + span: _, + shard: accum_shard, + pages: ref mut accum_pages, + effective_request_lsn: accum_lsn, + }), + BatchedFeMessage::GetPage { + span: _, + shard: this_shard, + pages: this_pages, + effective_request_lsn: this_lsn, + }, + ) if (|| { + assert_eq!(this_pages.len(), 1); + if accum_pages.len() >= max_batch_size.get() { + trace!(%accum_lsn, %this_lsn, %max_batch_size, "stopping batching because of batch size"); + 
assert_eq!(accum_pages.len(), max_batch_size.get()); + return false; + } + if (accum_shard.tenant_shard_id, accum_shard.timeline_id) + != (this_shard.tenant_shard_id, this_shard.timeline_id) + { + trace!(%accum_lsn, %this_lsn, "stopping batching because timeline object mismatch"); + // TODO: we _could_ batch & execute each shard seperately (and in parallel). + // But the current logic for keeping responses in order does not support that. + return false; + } + // the vectored get currently only supports a single LSN, so, bounce as soon + // as the effective request_lsn changes + if *accum_lsn != this_lsn { + trace!(%accum_lsn, %this_lsn, "stopping batching because LSN changed"); + return false; + } + true + })() => + { + // ok to batch + accum_pages.extend(this_pages); + Ok(()) + } + // something batched already but this message is unbatchable + (_, this_msg) => { + // by default, don't continue batching + Err(Ok(this_msg)) + } + } + } + + #[instrument(level = tracing::Level::DEBUG, skip_all)] + async fn pagesteam_handle_batched_message( + &mut self, + pgb_writer: &mut PostgresBackend, + batch: BatchedFeMessage, + cancel: &CancellationToken, + ctx: &RequestContext, + ) -> Result<(), QueryError> where IO: AsyncRead + AsyncWrite + Send + Sync + Unpin, { - let mut batch = self.next_batch.take(); - let mut batch_started_at: Option = None; - - let next_batch: Option = loop { - let sleep_fut = match (self.server_side_batch_timeout, batch_started_at) { - (Some(batch_timeout), Some(started_at)) => futures::future::Either::Left( - tokio::time::sleep_until((started_at + batch_timeout).into()), - ), - _ => futures::future::Either::Right(futures::future::pending()), - }; - - let msg = tokio::select! { - biased; - _ = self.cancel.cancelled() => { - return Err(QueryError::Shutdown) - } - msg = pgb.read_message() => { - msg - } - _ = sleep_fut => { - assert!(batch.is_some()); - break None; - } - }; - let copy_data_bytes = match msg? 
{ - Some(FeMessage::CopyData(bytes)) => bytes, - Some(FeMessage::Terminate) => { - return Ok(Some(BatchOrEof::Eof)); - } - Some(m) => { - return Err(QueryError::Other(anyhow::anyhow!( - "unexpected message: {m:?} during COPY" - ))); - } - None => { - return Ok(Some(BatchOrEof::Eof)); - } // client disconnected - }; - trace!("query: {copy_data_bytes:?}"); - fail::fail_point!("ps::handle-pagerequest-message"); - - // parse request - let neon_fe_msg = PagestreamFeMessage::parse(&mut copy_data_bytes.reader())?; - - let this_msg = match neon_fe_msg { - PagestreamFeMessage::Exists(req) => BatchedFeMessage::Exists { - span: tracing::info_span!("handle_get_rel_exists_request", rel = %req.rel, req_lsn = %req.request_lsn), - req, - }, - PagestreamFeMessage::Nblocks(req) => BatchedFeMessage::Nblocks { - span: tracing::info_span!("handle_get_nblocks_request", rel = %req.rel, req_lsn = %req.request_lsn), - req, - }, - PagestreamFeMessage::DbSize(req) => BatchedFeMessage::DbSize { - span: tracing::info_span!("handle_db_size_request", dbnode = %req.dbnode, req_lsn = %req.request_lsn), - req, - }, - PagestreamFeMessage::GetSlruSegment(req) => BatchedFeMessage::GetSlruSegment { - span: tracing::info_span!("handle_get_slru_segment_request", kind = %req.kind, segno = %req.segno, req_lsn = %req.request_lsn), - req, - }, - PagestreamFeMessage::GetPage(PagestreamGetPageRequest { - request_lsn, - not_modified_since, - rel, - blkno, - }) => { - // shard_id is filled in by the handler - let span = tracing::info_span!( - "handle_get_page_at_lsn_request_batched", - %tenant_id, %timeline_id, shard_id = tracing::field::Empty, req_lsn = %request_lsn, - batch_size = tracing::field::Empty, batch_id = tracing::field::Empty - ); - - macro_rules! 
current_batch_and_error { - ($error:expr) => {{ - let error = BatchedFeMessage::RespondError { - span, - error: $error, - }; - let batch_and_error = match batch { - Some(b) => smallvec::smallvec![b, error], - None => smallvec::smallvec![error], - }; - Ok(Some(BatchOrEof::Batch(batch_and_error))) - }}; - } - - let key = rel_block_to_key(rel, blkno); - let shard = match self - .timeline_handles - .get(*tenant_id, *timeline_id, ShardSelector::Page(key)) + // invoke handler function + let (handler_results, span): ( + Vec>, + _, + ) = match batch { + BatchedFeMessage::Exists { + span, + timer, + shard, + req, + } => { + fail::fail_point!("ps::handle-pagerequest-message::exists"); + ( + vec![self + .handle_get_rel_exists_request(&shard, &req, ctx) .instrument(span.clone()) .await - { - Ok(tl) => tl, - Err(GetActiveTimelineError::Tenant(GetActiveTenantError::NotFound(_))) => { - // We already know this tenant exists in general, because we resolved it at - // start of connection. Getting a NotFound here indicates that the shard containing - // the requested page is not present on this node: the client's knowledge of shard->pageserver - // mapping is out of date. - // - // Closing the connection by returning ``::Reconnect` has the side effect of rate-limiting above message, via - // client's reconnect backoff, as well as hopefully prompting the client to load its updated configuration - // and talk to a different pageserver. 
- return current_batch_and_error!(PageStreamError::Reconnect( - "getpage@lsn request routed to wrong shard".into() - )); - } - Err(e) => { - return current_batch_and_error!(e.into()); - } - }; - let effective_request_lsn = match Self::wait_or_get_last_lsn( - &shard, - request_lsn, - not_modified_since, - &shard.get_latest_gc_cutoff_lsn(), - ctx, - ) - // TODO: if we actually need to wait for lsn here, it delays the entire batch which doesn't need to wait - .await - { - Ok(lsn) => lsn, - Err(e) => { - return current_batch_and_error!(e); - } - }; - BatchedFeMessage::GetPage { - span, - shard, - effective_request_lsn, - pages: smallvec::smallvec![(rel, blkno)], - } - } - }; - - let batch_timeout = match self.server_side_batch_timeout { - Some(value) => value, - None => { - // Batching is not enabled - stop on the first message. - return Ok(Some(BatchOrEof::Batch(smallvec::smallvec![this_msg]))); - } - }; - - // check if we can batch - match (&mut batch, this_msg) { - (None, this_msg) => { - batch = Some(this_msg); - } - ( - Some(BatchedFeMessage::GetPage { - span: _, - shard: accum_shard, - pages: accum_pages, - effective_request_lsn: accum_lsn, - }), - BatchedFeMessage::GetPage { - span: _, - shard: this_shard, - pages: this_pages, - effective_request_lsn: this_lsn, - }, - ) if async { - assert_eq!(this_pages.len(), 1); - if accum_pages.len() >= Timeline::MAX_GET_VECTORED_KEYS as usize { - assert_eq!(accum_pages.len(), Timeline::MAX_GET_VECTORED_KEYS as usize); - return false; - } - if (accum_shard.tenant_shard_id, accum_shard.timeline_id) - != (this_shard.tenant_shard_id, this_shard.timeline_id) - { - // TODO: we _could_ batch & execute each shard seperately (and in parallel). - // But the current logic for keeping responses in order does not support that. 
- return false; - } - // the vectored get currently only supports a single LSN, so, bounce as soon - // as the effective request_lsn changes - if *accum_lsn != this_lsn { - return false; - } - true - } - .await => - { - // ok to batch - accum_pages.extend(this_pages); - } - (Some(_), this_msg) => { - // by default, don't continue batching - break Some(this_msg); - } + .map(|msg| (msg, timer))], + span, + ) } - - // batching impl piece - let started_at = batch_started_at.get_or_insert_with(Instant::now); - if started_at.elapsed() > batch_timeout { - break None; + BatchedFeMessage::Nblocks { + span, + timer, + shard, + req, + } => { + fail::fail_point!("ps::handle-pagerequest-message::nblocks"); + ( + vec![self + .handle_get_nblocks_request(&shard, &req, ctx) + .instrument(span.clone()) + .await + .map(|msg| (msg, timer))], + span, + ) + } + BatchedFeMessage::GetPage { + span, + shard, + effective_request_lsn, + pages, + } => { + fail::fail_point!("ps::handle-pagerequest-message::getpage"); + ( + { + let npages = pages.len(); + trace!(npages, "handling getpage request"); + let res = self + .handle_get_page_at_lsn_request_batched( + &shard, + effective_request_lsn, + pages, + ctx, + ) + .instrument(span.clone()) + .await; + assert_eq!(res.len(), npages); + res + }, + span, + ) + } + BatchedFeMessage::DbSize { + span, + timer, + shard, + req, + } => { + fail::fail_point!("ps::handle-pagerequest-message::dbsize"); + ( + vec![self + .handle_db_size_request(&shard, &req, ctx) + .instrument(span.clone()) + .await + .map(|msg| (msg, timer))], + span, + ) + } + BatchedFeMessage::GetSlruSegment { + span, + timer, + shard, + req, + } => { + fail::fail_point!("ps::handle-pagerequest-message::slrusegment"); + ( + vec![self + .handle_get_slru_segment_request(&shard, &req, ctx) + .instrument(span.clone()) + .await + .map(|msg| (msg, timer))], + span, + ) + } + BatchedFeMessage::RespondError { span, error } => { + // We've already decided to respond with an error, so we don't need 
to + // call the handler. + (vec![Err(error)], span) } }; - self.next_batch = next_batch; - Ok(batch.map(|b| BatchOrEof::Batch(smallvec::smallvec![b]))) + // Map handler result to protocol behavior. + // Some handler errors cause exit from pagestream protocol. + // Other handler errors are sent back as an error message and we stay in pagestream protocol. + let mut timers: smallvec::SmallVec<[_; 1]> = + smallvec::SmallVec::with_capacity(handler_results.len()); + for handler_result in handler_results { + let response_msg = match handler_result { + Err(e) => match &e { + PageStreamError::Shutdown => { + // If we fail to fulfil a request during shutdown, which may be _because_ of + // shutdown, then do not send the error to the client. Instead just drop the + // connection. + span.in_scope(|| info!("dropping connection due to shutdown")); + return Err(QueryError::Shutdown); + } + PageStreamError::Reconnect(reason) => { + span.in_scope(|| info!("handler requested reconnect: {reason}")); + return Err(QueryError::Reconnect); + } + PageStreamError::Read(_) + | PageStreamError::LsnTimeout(_) + | PageStreamError::NotFound(_) + | PageStreamError::BadRequest(_) => { + // print the all details to the log with {:#}, but for the client the + // error message is enough. Do not log if shutting down, as the anyhow::Error + // here includes cancellation which is not an error. + let full = utils::error::report_compact_sources(&e); + span.in_scope(|| { + error!("error reading relation or page version: {full:#}") + }); + PagestreamBeMessage::Error(PagestreamErrorResponse { + message: e.to_string(), + }) + } + }, + Ok((response_msg, timer)) => { + // Extending the lifetime of the timers so observations on drop + // include the flush time. + timers.push(timer); + response_msg + } + }; + + // marshal & transmit response message + pgb_writer.write_message_noflush(&BeMessage::CopyData(&response_msg.serialize()))?; + } + tokio::select! 
{ + biased; + _ = cancel.cancelled() => { + // We were requested to shut down. + info!("shutdown request received in page handler"); + return Err(QueryError::Shutdown) + } + res = pgb_writer.flush() => { + res?; + } + } + drop(timers); + Ok(()) } /// Pagestream sub-protocol handler. @@ -845,7 +1094,7 @@ impl PageServerHandler { ctx: RequestContext, ) -> Result<(), QueryError> where - IO: AsyncRead + AsyncWrite + Send + Sync + Unpin, + IO: AsyncRead + AsyncWrite + Send + Sync + Unpin + 'static, { debug_assert_current_span_has_tenant_and_timeline_id_no_shard_id(); @@ -861,169 +1110,289 @@ impl PageServerHandler { } } - // If [`PageServerHandler`] is reused for multiple pagestreams, - // then make sure to not process requests from the previous ones. - self.next_batch = None; + let pgb_reader = pgb + .split() + .context("implementation error: split pgb into reader and writer")?; - loop { - let maybe_batched = self - .read_batch_from_connection(pgb, &tenant_id, &timeline_id, &ctx) - .await?; - let batched = match maybe_batched { - Some(BatchOrEof::Batch(b)) => b, - Some(BatchOrEof::Eof) => { - break; - } + let timeline_handles = self + .timeline_handles + .take() + .expect("implementation error: timeline_handles should not be locked"); + + let request_span = info_span!("request", shard_id = tracing::field::Empty); + let ((pgb_reader, timeline_handles), result) = match self.pipelining_config.clone() { + PageServicePipeliningConfig::Pipelined(pipelining_config) => { + self.handle_pagerequests_pipelined( + pgb, + pgb_reader, + tenant_id, + timeline_id, + timeline_handles, + request_span, + pipelining_config, + &ctx, + ) + .await + } + PageServicePipeliningConfig::Serial => { + self.handle_pagerequests_serial( + pgb, + pgb_reader, + tenant_id, + timeline_id, + timeline_handles, + request_span, + &ctx, + ) + .await + } + }; + + debug!("pagestream subprotocol shut down cleanly"); + + pgb.unsplit(pgb_reader) + .context("implementation error: unsplit pgb")?; + + let replaced = 
self.timeline_handles.replace(timeline_handles); + assert!(replaced.is_none()); + + result + } + + #[allow(clippy::too_many_arguments)] + async fn handle_pagerequests_serial( + &mut self, + pgb_writer: &mut PostgresBackend, + mut pgb_reader: PostgresBackendReader, + tenant_id: TenantId, + timeline_id: TimelineId, + mut timeline_handles: TimelineHandles, + request_span: Span, + ctx: &RequestContext, + ) -> ( + (PostgresBackendReader, TimelineHandles), + Result<(), QueryError>, + ) + where + IO: AsyncRead + AsyncWrite + Send + Sync + Unpin + 'static, + { + let cancel = self.cancel.clone(); + let err = loop { + let msg = Self::pagestream_read_message( + &mut pgb_reader, + tenant_id, + timeline_id, + &mut timeline_handles, + &cancel, + ctx, + request_span.clone(), + ) + .await; + let msg = match msg { + Ok(msg) => msg, + Err(e) => break e, + }; + let mut msg = match msg { + Some(msg) => msg, None => { - continue; + debug!("pagestream subprotocol end observed"); + return ((pgb_reader, timeline_handles), Ok(())); } }; - for batch in batched { - // invoke handler function - let (handler_results, span): ( - Vec>, - _, - ) = match batch { - BatchedFeMessage::Exists { span, req } => { - fail::fail_point!("ps::handle-pagerequest-message::exists"); - ( - vec![ - self.handle_get_rel_exists_request( - tenant_id, - timeline_id, - &req, - &ctx, - ) - .instrument(span.clone()) - .await, - ], - span, - ) - } - BatchedFeMessage::Nblocks { span, req } => { - fail::fail_point!("ps::handle-pagerequest-message::nblocks"); - ( - vec![ - self.handle_get_nblocks_request(tenant_id, timeline_id, &req, &ctx) - .instrument(span.clone()) - .await, - ], - span, - ) - } - BatchedFeMessage::GetPage { - span, - shard, - effective_request_lsn, - pages, - } => { - fail::fail_point!("ps::handle-pagerequest-message::getpage"); - ( - { - let npages = pages.len(); - let res = self - .handle_get_page_at_lsn_request_batched( - &shard, - effective_request_lsn, - pages, - &ctx, - ) - .instrument(span.clone()) 
- .await; - assert_eq!(res.len(), npages); - res - }, - span, - ) - } - BatchedFeMessage::DbSize { span, req } => { - fail::fail_point!("ps::handle-pagerequest-message::dbsize"); - ( - vec![ - self.handle_db_size_request(tenant_id, timeline_id, &req, &ctx) - .instrument(span.clone()) - .await, - ], - span, - ) - } - BatchedFeMessage::GetSlruSegment { span, req } => { - fail::fail_point!("ps::handle-pagerequest-message::slrusegment"); - ( - vec![ - self.handle_get_slru_segment_request( - tenant_id, - timeline_id, - &req, - &ctx, - ) - .instrument(span.clone()) - .await, - ], - span, - ) - } - BatchedFeMessage::RespondError { span, error } => { - // We've already decided to respond with an error, so we don't need to - // call the handler. - (vec![Err(error)], span) - } - }; + if let Err(cancelled) = msg.throttle(&self.cancel).await { + break cancelled; + } - // Map handler result to protocol behavior. - // Some handler errors cause exit from pagestream protocol. - // Other handler errors are sent back as an error message and we stay in pagestream protocol. - for handler_result in handler_results { - let response_msg = match handler_result { - Err(e) => match &e { - PageStreamError::Shutdown => { - // If we fail to fulfil a request during shutdown, which may be _because_ of - // shutdown, then do not send the error to the client. Instead just drop the - // connection. - span.in_scope(|| info!("dropping connection due to shutdown")); - return Err(QueryError::Shutdown); - } - PageStreamError::Reconnect(reason) => { - span.in_scope(|| info!("handler requested reconnect: {reason}")); - return Err(QueryError::Reconnect); - } - PageStreamError::Read(_) - | PageStreamError::LsnTimeout(_) - | PageStreamError::NotFound(_) - | PageStreamError::BadRequest(_) => { - // print the all details to the log with {:#}, but for the client the - // error message is enough. Do not log if shutting down, as the anyhow::Error - // here includes cancellation which is not an error. 
- let full = utils::error::report_compact_sources(&e); - span.in_scope(|| { - error!("error reading relation or page version: {full:#}") - }); - PagestreamBeMessage::Error(PagestreamErrorResponse { - message: e.to_string(), - }) - } - }, - Ok(response_msg) => response_msg, - }; + let err = self + .pagesteam_handle_batched_message(pgb_writer, msg, &cancel, ctx) + .await; + match err { + Ok(()) => {} + Err(e) => break e, + } + }; + ((pgb_reader, timeline_handles), Err(err)) + } - // marshal & transmit response message - pgb.write_message_noflush(&BeMessage::CopyData(&response_msg.serialize()))?; + /// # Cancel-Safety + /// + /// May leak tokio tasks if not polled to completion. + #[allow(clippy::too_many_arguments)] + async fn handle_pagerequests_pipelined( + &mut self, + pgb_writer: &mut PostgresBackend, + pgb_reader: PostgresBackendReader, + tenant_id: TenantId, + timeline_id: TimelineId, + mut timeline_handles: TimelineHandles, + request_span: Span, + pipelining_config: PageServicePipeliningConfigPipelined, + ctx: &RequestContext, + ) -> ( + (PostgresBackendReader, TimelineHandles), + Result<(), QueryError>, + ) + where + IO: AsyncRead + AsyncWrite + Send + Sync + Unpin + 'static, + { + // + // Pipelined pagestream handling consists of + // - a Batcher that reads requests off the wire and + // and batches them if possible, + // - an Executor that processes the batched requests. + // + // The batch is built up inside an `spsc_fold` channel, + // shared betwen Batcher (Sender) and Executor (Receiver). + // + // The Batcher continously folds client requests into the batch, + // while the Executor can at any time take out what's in the batch + // in order to process it. + // This means the next batch builds up while the Executor + // executes the last batch. + // + // CANCELLATION + // + // We run both Batcher and Executor futures to completion before + // returning from this function. 
+ // + // If Executor exits first, it signals cancellation to the Batcher + // via a CancellationToken that is child of `self.cancel`. + // If Batcher exits first, it signals cancellation to the Executor + // by dropping the spsc_fold channel Sender. + // + // CLEAN SHUTDOWN + // + // Clean shutdown means that the client ends the COPYBOTH session. + // In response to such a client message, the Batcher exits. + // The Executor continues to run, draining the spsc_fold channel. + // Once drained, the spsc_fold recv will fail with a distinct error + // indicating that the sender disconnected. + // The Executor exits with Ok(()) in response to that error. + // + // Server initiated shutdown is not clean shutdown, but instead + // is an error Err(QueryError::Shutdown) that is propagated through + // error propagation. + // + // ERROR PROPAGATION + // + // When the Batcher encounter an error, it sends it as a value + // through the spsc_fold channel and exits afterwards. + // When the Executor observes such an error in the channel, + // it exits returning that error value. + // + // This design ensures that the Executor stage will still process + // the batch that was in flight when the Batcher encountered an error, + // thereby beahving identical to a serial implementation. + + let PageServicePipeliningConfigPipelined { + max_batch_size, + execution, + } = pipelining_config; + + // Macro to _define_ a pipeline stage. + macro_rules! pipeline_stage { + ($name:literal, $cancel:expr, $make_fut:expr) => {{ + let cancel: CancellationToken = $cancel; + let stage_fut = $make_fut(cancel.clone()); + async move { + scopeguard::defer! { + debug!("exiting"); + } + timed_after_cancellation(stage_fut, $name, Duration::from_millis(100), &cancel) + .await } - tokio::select! { - biased; - _ = self.cancel.cancelled() => { - // We were requested to shut down. 
- info!("shutdown request received in page handler"); - return Err(QueryError::Shutdown) - } - res = pgb.flush() => { - res?; - } + .instrument(tracing::info_span!($name)) + }}; + } + + // + // Batcher + // + + let cancel_batcher = self.cancel.child_token(); + let (mut batch_tx, mut batch_rx) = spsc_fold::channel(); + let batcher = pipeline_stage!("batcher", cancel_batcher.clone(), move |cancel_batcher| { + let ctx = ctx.attached_child(); + async move { + let mut pgb_reader = pgb_reader; + let mut exit = false; + while !exit { + let read_res = Self::pagestream_read_message( + &mut pgb_reader, + tenant_id, + timeline_id, + &mut timeline_handles, + &cancel_batcher, + &ctx, + request_span.clone(), + ) + .await; + let Some(read_res) = read_res.transpose() else { + debug!("client-initiated shutdown"); + break; + }; + exit |= read_res.is_err(); + let could_send = batch_tx + .send(read_res, |batch, res| { + Self::pagestream_do_batch(max_batch_size, batch, res) + }) + .await; + exit |= could_send.is_err(); + } + (pgb_reader, timeline_handles) + } + }); + + // + // Executor + // + + let executor = pipeline_stage!("executor", self.cancel.clone(), move |cancel| { + let ctx = ctx.attached_child(); + async move { + let _cancel_batcher = cancel_batcher.drop_guard(); + loop { + let maybe_batch = batch_rx.recv().await; + let batch = match maybe_batch { + Ok(batch) => batch, + Err(spsc_fold::RecvError::SenderGone) => { + debug!("upstream gone"); + return Ok(()); + } + }; + let mut batch = match batch { + Ok(batch) => batch, + Err(e) => { + return Err(e); + } + }; + batch.throttle(&self.cancel).await?; + self.pagesteam_handle_batched_message(pgb_writer, batch, &cancel, &ctx) + .await?; } } + }); + + // + // Execute the stages. + // + + match execution { + PageServiceProtocolPipelinedExecutionStrategy::ConcurrentFutures => { + tokio::join!(batcher, executor) + } + PageServiceProtocolPipelinedExecutionStrategy::Tasks => { + // These tasks are not tracked anywhere. 
+ let read_messages_task = tokio::spawn(batcher); + let (read_messages_task_res, executor_res_) = + tokio::join!(read_messages_task, executor,); + ( + read_messages_task_res.expect("propagated panic from read_messages"), + executor_res_, + ) + } } - Ok(()) } /// Helper function to handle the LSN from client request. @@ -1131,6 +1500,8 @@ impl PageServerHandler { { let timeline = self .timeline_handles + .as_mut() + .unwrap() .get( tenant_shard_id.tenant_id, timeline_id, @@ -1165,22 +1536,13 @@ impl PageServerHandler { #[instrument(skip_all, fields(shard_id))] async fn handle_get_rel_exists_request( &mut self, - tenant_id: TenantId, - timeline_id: TimelineId, + timeline: &Timeline, req: &PagestreamExistsRequest, ctx: &RequestContext, ) -> Result { - let timeline = self - .timeline_handles - .get(tenant_id, timeline_id, ShardSelector::Zero) - .await?; - let _timer = timeline - .query_metrics - .start_timer(metrics::SmgrQueryType::GetRelExists, ctx); - let latest_gc_cutoff_lsn = timeline.get_latest_gc_cutoff_lsn(); let lsn = Self::wait_or_get_last_lsn( - &timeline, + timeline, req.request_lsn, req.not_modified_since, &latest_gc_cutoff_lsn, @@ -1200,23 +1562,13 @@ impl PageServerHandler { #[instrument(skip_all, fields(shard_id))] async fn handle_get_nblocks_request( &mut self, - tenant_id: TenantId, - timeline_id: TimelineId, + timeline: &Timeline, req: &PagestreamNblocksRequest, ctx: &RequestContext, ) -> Result { - let timeline = self - .timeline_handles - .get(tenant_id, timeline_id, ShardSelector::Zero) - .await?; - - let _timer = timeline - .query_metrics - .start_timer(metrics::SmgrQueryType::GetRelSize, ctx); - let latest_gc_cutoff_lsn = timeline.get_latest_gc_cutoff_lsn(); let lsn = Self::wait_or_get_last_lsn( - &timeline, + timeline, req.request_lsn, req.not_modified_since, &latest_gc_cutoff_lsn, @@ -1236,23 +1588,13 @@ impl PageServerHandler { #[instrument(skip_all, fields(shard_id))] async fn handle_db_size_request( &mut self, - tenant_id: TenantId, - 
timeline_id: TimelineId, + timeline: &Timeline, req: &PagestreamDbSizeRequest, ctx: &RequestContext, ) -> Result { - let timeline = self - .timeline_handles - .get(tenant_id, timeline_id, ShardSelector::Zero) - .await?; - - let _timer = timeline - .query_metrics - .start_timer(metrics::SmgrQueryType::GetDbSize, ctx); - let latest_gc_cutoff_lsn = timeline.get_latest_gc_cutoff_lsn(); let lsn = Self::wait_or_get_last_lsn( - &timeline, + timeline, req.request_lsn, req.not_modified_since, &latest_gc_cutoff_lsn, @@ -1275,48 +1617,53 @@ impl PageServerHandler { &mut self, timeline: &Timeline, effective_lsn: Lsn, - pages: smallvec::SmallVec<[(RelTag, BlockNumber); 1]>, + requests: smallvec::SmallVec<[(RelTag, BlockNumber, SmgrOpTimer); 1]>, ctx: &RequestContext, - ) -> Vec> { + ) -> Vec> { debug_assert_current_span_has_tenant_and_timeline_id(); - let _timer = timeline.query_metrics.start_timer_many( - metrics::SmgrQueryType::GetPageAtLsn, - pages.len(), - ctx, - ); - let pages = timeline - .get_rel_page_at_lsn_batched(pages, effective_lsn, ctx) + timeline + .query_metrics + .observe_getpage_batch_start(requests.len()); + + let results = timeline + .get_rel_page_at_lsn_batched( + requests.iter().map(|(reltag, blkno, _)| (reltag, blkno)), + effective_lsn, + ctx, + ) .await; + assert_eq!(results.len(), requests.len()); - Vec::from_iter(pages.into_iter().map(|page| { - page.map(|page| { - PagestreamBeMessage::GetPage(models::PagestreamGetPageResponse { page }) - }) - .map_err(PageStreamError::from) - })) + // TODO: avoid creating the new Vec here + Vec::from_iter( + requests + .into_iter() + .zip(results.into_iter()) + .map(|((_, _, timer), res)| { + res.map(|page| { + ( + PagestreamBeMessage::GetPage(models::PagestreamGetPageResponse { + page, + }), + timer, + ) + }) + .map_err(PageStreamError::from) + }), + ) } #[instrument(skip_all, fields(shard_id))] async fn handle_get_slru_segment_request( &mut self, - tenant_id: TenantId, - timeline_id: TimelineId, + timeline: 
&Timeline, req: &PagestreamGetSlruSegmentRequest, ctx: &RequestContext, ) -> Result { - let timeline = self - .timeline_handles - .get(tenant_id, timeline_id, ShardSelector::Zero) - .await?; - - let _timer = timeline - .query_metrics - .start_timer(metrics::SmgrQueryType::GetSlruSegment, ctx); - let latest_gc_cutoff_lsn = timeline.get_latest_gc_cutoff_lsn(); let lsn = Self::wait_or_get_last_lsn( - &timeline, + timeline, req.request_lsn, req.not_modified_since, &latest_gc_cutoff_lsn, @@ -1374,6 +1721,8 @@ impl PageServerHandler { let timeline = self .timeline_handles + .as_mut() + .unwrap() .get(tenant_id, timeline_id, ShardSelector::Zero) .await?; @@ -1716,7 +2065,7 @@ impl PageServiceCmd { impl postgres_backend::Handler for PageServerHandler where - IO: AsyncRead + AsyncWrite + Send + Sync + Unpin, + IO: AsyncRead + AsyncWrite + Send + Sync + Unpin + 'static, { fn check_auth_jwt( &mut self, @@ -1812,7 +2161,7 @@ where COMPUTE_COMMANDS_COUNTERS .for_command(ComputeCommandKind::Basebackup) .inc(); - let metric_recording = metrics::BASEBACKUP_QUERY_TIME.start_recording(&ctx); + let metric_recording = metrics::BASEBACKUP_QUERY_TIME.start_recording(); let res = async { self.handle_basebackup_request( pgb, diff --git a/pageserver/src/pgdatadir_mapping.rs b/pageserver/src/pgdatadir_mapping.rs index d48a1ba117..255bd01e25 100644 --- a/pageserver/src/pgdatadir_mapping.rs +++ b/pageserver/src/pgdatadir_mapping.rs @@ -203,9 +203,13 @@ impl Timeline { ) -> Result { match version { Version::Lsn(effective_lsn) => { - let pages = smallvec::smallvec![(tag, blknum)]; + let pages: smallvec::SmallVec<[_; 1]> = smallvec::smallvec![(tag, blknum)]; let res = self - .get_rel_page_at_lsn_batched(pages, effective_lsn, ctx) + .get_rel_page_at_lsn_batched( + pages.iter().map(|(tag, blknum)| (tag, blknum)), + effective_lsn, + ctx, + ) .await; assert_eq!(res.len(), 1); res.into_iter().next().unwrap() @@ -240,7 +244,7 @@ impl Timeline { /// The ordering of the returned vec corresponds to the 
ordering of `pages`. pub(crate) async fn get_rel_page_at_lsn_batched( &self, - pages: smallvec::SmallVec<[(RelTag, BlockNumber); 1]>, + pages: impl ExactSizeIterator, effective_lsn: Lsn, ctx: &RequestContext, ) -> Vec> { @@ -254,7 +258,7 @@ impl Timeline { let result_slots = result.spare_capacity_mut(); let mut keys_slots: BTreeMap> = BTreeMap::default(); - for (response_slot_idx, (tag, blknum)) in pages.into_iter().enumerate() { + for (response_slot_idx, (tag, blknum)) in pages.enumerate() { if tag.relnode == 0 { result_slots[response_slot_idx].write(Err(PageReconstructError::Other( RelationError::InvalidRelnode.into(), @@ -265,7 +269,7 @@ impl Timeline { } let nblocks = match self - .get_rel_size(tag, Version::Lsn(effective_lsn), ctx) + .get_rel_size(*tag, Version::Lsn(effective_lsn), ctx) .await { Ok(nblocks) => nblocks, @@ -276,7 +280,7 @@ impl Timeline { } }; - if blknum >= nblocks { + if *blknum >= nblocks { debug!( "read beyond EOF at {} blk {} at {}, size is {}: returning all-zeros page", tag, blknum, effective_lsn, nblocks @@ -286,7 +290,7 @@ impl Timeline { continue; } - let key = rel_block_to_key(tag, blknum); + let key = rel_block_to_key(*tag, *blknum); let key_slots = keys_slots.entry(key).or_default(); key_slots.push(response_slot_idx); @@ -526,6 +530,7 @@ impl Timeline { lsn: Lsn, ctx: &RequestContext, ) -> Result { + assert!(self.tenant_shard_id.is_shard_zero()); let n_blocks = self .get_slru_segment_size(kind, segno, Version::Lsn(lsn), ctx) .await?; @@ -548,6 +553,7 @@ impl Timeline { lsn: Lsn, ctx: &RequestContext, ) -> Result { + assert!(self.tenant_shard_id.is_shard_zero()); let key = slru_block_to_key(kind, segno, blknum); self.get(key, lsn, ctx).await } @@ -560,6 +566,7 @@ impl Timeline { version: Version<'_>, ctx: &RequestContext, ) -> Result { + assert!(self.tenant_shard_id.is_shard_zero()); let key = slru_segment_size_to_key(kind, segno); let mut buf = version.get(self, key, ctx).await?; Ok(buf.get_u32_le()) @@ -573,6 +580,7 @@ impl 
Timeline { version: Version<'_>, ctx: &RequestContext, ) -> Result { + assert!(self.tenant_shard_id.is_shard_zero()); // fetch directory listing let key = slru_dir_to_key(kind); let buf = version.get(self, key, ctx).await?; @@ -1043,26 +1051,28 @@ impl Timeline { } // Iterate SLRUs next - for kind in [ - SlruKind::Clog, - SlruKind::MultiXactMembers, - SlruKind::MultiXactOffsets, - ] { - let slrudir_key = slru_dir_to_key(kind); - result.add_key(slrudir_key); - let buf = self.get(slrudir_key, lsn, ctx).await?; - let dir = SlruSegmentDirectory::des(&buf)?; - let mut segments: Vec = dir.segments.iter().cloned().collect(); - segments.sort_unstable(); - for segno in segments { - let segsize_key = slru_segment_size_to_key(kind, segno); - let mut buf = self.get(segsize_key, lsn, ctx).await?; - let segsize = buf.get_u32_le(); + if self.tenant_shard_id.is_shard_zero() { + for kind in [ + SlruKind::Clog, + SlruKind::MultiXactMembers, + SlruKind::MultiXactOffsets, + ] { + let slrudir_key = slru_dir_to_key(kind); + result.add_key(slrudir_key); + let buf = self.get(slrudir_key, lsn, ctx).await?; + let dir = SlruSegmentDirectory::des(&buf)?; + let mut segments: Vec = dir.segments.iter().cloned().collect(); + segments.sort_unstable(); + for segno in segments { + let segsize_key = slru_segment_size_to_key(kind, segno); + let mut buf = self.get(segsize_key, lsn, ctx).await?; + let segsize = buf.get_u32_le(); - result.add_range( - slru_block_to_key(kind, segno, 0)..slru_block_to_key(kind, segno, segsize), - ); - result.add_key(segsize_key); + result.add_range( + slru_block_to_key(kind, segno, 0)..slru_block_to_key(kind, segno, segsize), + ); + result.add_key(segsize_key); + } } } @@ -1464,6 +1474,10 @@ impl<'a> DatadirModification<'a> { blknum: BlockNumber, rec: NeonWalRecord, ) -> anyhow::Result<()> { + if !self.tline.tenant_shard_id.is_shard_zero() { + return Ok(()); + } + self.put( slru_block_to_key(kind, segno, blknum), Value::WalRecord(rec), @@ -1497,6 +1511,8 @@ impl<'a> 
DatadirModification<'a> { blknum: BlockNumber, img: Bytes, ) -> anyhow::Result<()> { + assert!(self.tline.tenant_shard_id.is_shard_zero()); + let key = slru_block_to_key(kind, segno, blknum); if !key.is_valid_key_on_write_path() { anyhow::bail!( @@ -1538,6 +1554,7 @@ impl<'a> DatadirModification<'a> { segno: u32, blknum: BlockNumber, ) -> anyhow::Result<()> { + assert!(self.tline.tenant_shard_id.is_shard_zero()); let key = slru_block_to_key(kind, segno, blknum); if !key.is_valid_key_on_write_path() { anyhow::bail!( @@ -1849,6 +1866,8 @@ impl<'a> DatadirModification<'a> { nblocks: BlockNumber, ctx: &RequestContext, ) -> anyhow::Result<()> { + assert!(self.tline.tenant_shard_id.is_shard_zero()); + // Add it to the directory entry let dir_key = slru_dir_to_key(kind); let buf = self.get(dir_key, ctx).await?; @@ -1881,6 +1900,8 @@ impl<'a> DatadirModification<'a> { segno: u32, nblocks: BlockNumber, ) -> anyhow::Result<()> { + assert!(self.tline.tenant_shard_id.is_shard_zero()); + // Put size let size_key = slru_segment_size_to_key(kind, segno); let buf = nblocks.to_le_bytes(); diff --git a/pageserver/src/tenant.rs b/pageserver/src/tenant.rs index 339a3ca1bb..5a9e398586 100644 --- a/pageserver/src/tenant.rs +++ b/pageserver/src/tenant.rs @@ -357,8 +357,8 @@ pub struct Tenant { /// Throttle applied at the top of [`Timeline::get`]. /// All [`Tenant::timelines`] of a given [`Tenant`] instance share the same [`throttle::Throttle`] instance. - pub(crate) timeline_get_throttle: - Arc>, + pub(crate) pagestream_throttle: + Arc>, /// An ongoing timeline detach concurrency limiter. 
/// @@ -1678,7 +1678,7 @@ impl Tenant { remote_metadata, TimelineResources { remote_client, - timeline_get_throttle: self.timeline_get_throttle.clone(), + pagestream_throttle: self.pagestream_throttle.clone(), l0_flush_global_state: self.l0_flush_global_state.clone(), }, LoadTimelineCause::Attach, @@ -3422,7 +3422,7 @@ impl Tenant { r.map_err( |_e: tokio::sync::watch::error::RecvError| // Tenant existed but was dropped: report it as non-existent - GetActiveTenantError::NotFound(GetTenantError::NotFound(self.tenant_shard_id.tenant_id)) + GetActiveTenantError::NotFound(GetTenantError::ShardNotFound(self.tenant_shard_id)) )? } Err(TimeoutCancellableError::Cancelled) => { @@ -3835,7 +3835,7 @@ impl Tenant { } } - fn get_timeline_get_throttle_config( + fn get_pagestream_throttle_config( psconf: &'static PageServerConf, overrides: &TenantConfOpt, ) -> throttle::Config { @@ -3846,8 +3846,8 @@ impl Tenant { } pub(crate) fn tenant_conf_updated(&self, new_conf: &TenantConfOpt) { - let conf = Self::get_timeline_get_throttle_config(self.conf, new_conf); - self.timeline_get_throttle.reconfigure(conf) + let conf = Self::get_pagestream_throttle_config(self.conf, new_conf); + self.pagestream_throttle.reconfigure(conf) } /// Helper function to create a new Timeline struct. 
@@ -4009,9 +4009,9 @@ impl Tenant { attach_wal_lag_cooldown: Arc::new(std::sync::OnceLock::new()), cancel: CancellationToken::default(), gate: Gate::default(), - timeline_get_throttle: Arc::new(throttle::Throttle::new( - Tenant::get_timeline_get_throttle_config(conf, &attached_conf.tenant_conf), - crate::metrics::tenant_throttling::TimelineGet::new(&tenant_shard_id), + pagestream_throttle: Arc::new(throttle::Throttle::new( + Tenant::get_pagestream_throttle_config(conf, &attached_conf.tenant_conf), + crate::metrics::tenant_throttling::Metrics::new(&tenant_shard_id), )), tenant_conf: Arc::new(ArcSwap::from_pointee(attached_conf)), ongoing_timeline_detach: std::sync::Mutex::default(), @@ -4909,7 +4909,7 @@ impl Tenant { fn build_timeline_resources(&self, timeline_id: TimelineId) -> TimelineResources { TimelineResources { remote_client: self.build_timeline_remote_client(timeline_id), - timeline_get_throttle: self.timeline_get_throttle.clone(), + pagestream_throttle: self.pagestream_throttle.clone(), l0_flush_global_state: self.l0_flush_global_state.clone(), } } @@ -5423,6 +5423,7 @@ pub(crate) mod harness { local_path: remote_fs_dir.clone(), }, timeout: RemoteStorageConfig::DEFAULT_TIMEOUT, + small_timeout: RemoteStorageConfig::DEFAULT_SMALL_TIMEOUT, }; let remote_storage = GenericRemoteStorage::from_config(&config).await.unwrap(); let deletion_queue = MockDeletionQueue::new(Some(remote_storage.clone())); diff --git a/pageserver/src/tenant/ephemeral_file.rs b/pageserver/src/tenant/ephemeral_file.rs index de0abab4c0..aaec8a4c31 100644 --- a/pageserver/src/tenant/ephemeral_file.rs +++ b/pageserver/src/tenant/ephemeral_file.rs @@ -8,10 +8,8 @@ use crate::page_cache; use crate::tenant::storage_layer::inmemory_layer::vectored_dio_read::File; use crate::virtual_file::owned_buffers_io::io_buf_aligned::IoBufAlignedMut; use crate::virtual_file::owned_buffers_io::slice::SliceMutExt; -use crate::virtual_file::owned_buffers_io::util::size_tracking_writer; use 
crate::virtual_file::owned_buffers_io::write::Buffer; use crate::virtual_file::{self, owned_buffers_io, IoBufferMut, VirtualFile}; -use bytes::BytesMut; use camino::Utf8PathBuf; use num_traits::Num; use pageserver_api::shard::TenantShardId; @@ -20,6 +18,7 @@ use tracing::error; use std::io; use std::sync::atomic::AtomicU64; +use std::sync::Arc; use utils::id::TimelineId; pub struct EphemeralFile { @@ -27,10 +26,7 @@ pub struct EphemeralFile { _timeline_id: TimelineId, page_cache_file_id: page_cache::FileId, bytes_written: u64, - buffered_writer: owned_buffers_io::write::BufferedWriter< - BytesMut, - size_tracking_writer::Writer, - >, + buffered_writer: owned_buffers_io::write::BufferedWriter, /// Gate guard is held on as long as we need to do operations in the path (delete on drop) _gate_guard: utils::sync::gate::GateGuard, } @@ -42,9 +38,9 @@ impl EphemeralFile { conf: &PageServerConf, tenant_shard_id: TenantShardId, timeline_id: TimelineId, - gate_guard: utils::sync::gate::GateGuard, + gate: &utils::sync::gate::Gate, ctx: &RequestContext, - ) -> Result { + ) -> anyhow::Result { static NEXT_FILENAME: AtomicU64 = AtomicU64::new(1); let filename_disambiguator = NEXT_FILENAME.fetch_add(1, std::sync::atomic::Ordering::Relaxed); @@ -55,15 +51,17 @@ impl EphemeralFile { "ephemeral-{filename_disambiguator}" ))); - let file = VirtualFile::open_with_options( - &filename, - virtual_file::OpenOptions::new() - .read(true) - .write(true) - .create(true), - ctx, - ) - .await?; + let file = Arc::new( + VirtualFile::open_with_options_v2( + &filename, + virtual_file::OpenOptions::new() + .read(true) + .write(true) + .create(true), + ctx, + ) + .await?, + ); let page_cache_file_id = page_cache::next_file_id(); // XXX get rid, we're not page-caching anymore @@ -73,10 +71,12 @@ impl EphemeralFile { page_cache_file_id, bytes_written: 0, buffered_writer: owned_buffers_io::write::BufferedWriter::new( - size_tracking_writer::Writer::new(file), - BytesMut::with_capacity(TAIL_SZ), + file, 
+ || IoBufferMut::with_capacity(TAIL_SZ), + gate.enter()?, + ctx, ), - _gate_guard: gate_guard, + _gate_guard: gate.enter()?, }) } } @@ -85,7 +85,7 @@ impl Drop for EphemeralFile { fn drop(&mut self) { // unlink the file // we are clear to do this, because we have entered a gate - let path = self.buffered_writer.as_inner().as_inner().path(); + let path = self.buffered_writer.as_inner().path(); let res = std::fs::remove_file(path); if let Err(e) = res { if e.kind() != std::io::ErrorKind::NotFound { @@ -132,6 +132,18 @@ impl EphemeralFile { srcbuf: &[u8], ctx: &RequestContext, ) -> std::io::Result { + let (pos, control) = self.write_raw_controlled(srcbuf, ctx).await?; + if let Some(control) = control { + control.release().await; + } + Ok(pos) + } + + async fn write_raw_controlled( + &mut self, + srcbuf: &[u8], + ctx: &RequestContext, + ) -> std::io::Result<(u64, Option)> { let pos = self.bytes_written; let new_bytes_written = pos.checked_add(srcbuf.len().into_u64()).ok_or_else(|| { @@ -145,9 +157,9 @@ impl EphemeralFile { })?; // Write the payload - let nwritten = self + let (nwritten, control) = self .buffered_writer - .write_buffered_borrowed(srcbuf, ctx) + .write_buffered_borrowed_controlled(srcbuf, ctx) .await?; assert_eq!( nwritten, @@ -157,7 +169,7 @@ impl EphemeralFile { self.bytes_written = new_bytes_written; - Ok(pos) + Ok((pos, control)) } } @@ -168,11 +180,12 @@ impl super::storage_layer::inmemory_layer::vectored_dio_read::File for Ephemeral dst: tokio_epoll_uring::Slice, ctx: &'a RequestContext, ) -> std::io::Result<(tokio_epoll_uring::Slice, usize)> { - let file_size_tracking_writer = self.buffered_writer.as_inner(); - let flushed_offset = file_size_tracking_writer.bytes_written(); + let submitted_offset = self.buffered_writer.bytes_submitted(); - let buffer = self.buffered_writer.inspect_buffer(); - let buffered = &buffer[0..buffer.pending()]; + let mutable = self.buffered_writer.inspect_mutable(); + let mutable = &mutable[0..mutable.pending()]; + + let 
maybe_flushed = self.buffered_writer.inspect_maybe_flushed(); let dst_cap = dst.bytes_total().into_u64(); let end = { @@ -197,11 +210,42 @@ impl super::storage_layer::inmemory_layer::vectored_dio_read::File for Ephemeral } } } - let written_range = Range(start, std::cmp::min(end, flushed_offset)); - let buffered_range = Range(std::cmp::max(start, flushed_offset), end); + + let (written_range, maybe_flushed_range) = { + if maybe_flushed.is_some() { + // [ written ][ maybe_flushed ][ mutable ] + // <- TAIL_SZ -><- TAIL_SZ -> + // ^ + // `submitted_offset` + // <++++++ on disk +++++++????????????????> + ( + Range( + start, + std::cmp::min(end, submitted_offset.saturating_sub(TAIL_SZ as u64)), + ), + Range( + std::cmp::max(start, submitted_offset.saturating_sub(TAIL_SZ as u64)), + std::cmp::min(end, submitted_offset), + ), + ) + } else { + // [ written ][ mutable ] + // <- TAIL_SZ -> + // ^ + // `submitted_offset` + // <++++++ on disk +++++++++++++++++++++++> + ( + Range(start, std::cmp::min(end, submitted_offset)), + // zero len + Range(submitted_offset, u64::MIN), + ) + } + }; + + let mutable_range = Range(std::cmp::max(start, submitted_offset), end); let dst = if written_range.len() > 0 { - let file: &VirtualFile = file_size_tracking_writer.as_inner(); + let file: &VirtualFile = self.buffered_writer.as_inner(); let bounds = dst.bounds(); let slice = file .read_exact_at(dst.slice(0..written_range.len().into_usize()), start, ctx) @@ -211,19 +255,21 @@ impl super::storage_layer::inmemory_layer::vectored_dio_read::File for Ephemeral dst }; - let dst = if buffered_range.len() > 0 { - let offset_in_buffer = buffered_range + let dst = if maybe_flushed_range.len() > 0 { + let offset_in_buffer = maybe_flushed_range .0 - .checked_sub(flushed_offset) + .checked_sub(submitted_offset.saturating_sub(TAIL_SZ as u64)) .unwrap() .into_usize(); - let to_copy = - &buffered[offset_in_buffer..(offset_in_buffer + buffered_range.len().into_usize())]; + // Checked previously the buffer is 
Some. + let maybe_flushed = maybe_flushed.unwrap(); + let to_copy = &maybe_flushed + [offset_in_buffer..(offset_in_buffer + maybe_flushed_range.len().into_usize())]; let bounds = dst.bounds(); let mut view = dst.slice({ let start = written_range.len().into_usize(); let end = start - .checked_add(buffered_range.len().into_usize()) + .checked_add(maybe_flushed_range.len().into_usize()) .unwrap(); start..end }); @@ -234,6 +280,28 @@ impl super::storage_layer::inmemory_layer::vectored_dio_read::File for Ephemeral dst }; + let dst = if mutable_range.len() > 0 { + let offset_in_buffer = mutable_range + .0 + .checked_sub(submitted_offset) + .unwrap() + .into_usize(); + let to_copy = + &mutable[offset_in_buffer..(offset_in_buffer + mutable_range.len().into_usize())]; + let bounds = dst.bounds(); + let mut view = dst.slice({ + let start = + written_range.len().into_usize() + maybe_flushed_range.len().into_usize(); + let end = start.checked_add(mutable_range.len().into_usize()).unwrap(); + start..end + }); + view.as_mut_rust_slice_full_zeroed() + .copy_from_slice(to_copy); + Slice::from_buf_bounds(Slice::into_inner(view), bounds) + } else { + dst + }; + // TODO: in debug mode, randomize the remaining bytes in `dst` to catch bugs Ok((dst, (end - start).into_usize())) @@ -295,7 +363,7 @@ mod tests { let gate = utils::sync::gate::Gate::default(); - let file = EphemeralFile::create(conf, tenant_id, timeline_id, gate.enter().unwrap(), &ctx) + let file = EphemeralFile::create(conf, tenant_id, timeline_id, &gate, &ctx) .await .unwrap(); @@ -326,14 +394,15 @@ mod tests { let gate = utils::sync::gate::Gate::default(); - let mut file = - EphemeralFile::create(conf, tenant_id, timeline_id, gate.enter().unwrap(), &ctx) - .await - .unwrap(); + let mut file = EphemeralFile::create(conf, tenant_id, timeline_id, &gate, &ctx) + .await + .unwrap(); - let cap = file.buffered_writer.inspect_buffer().capacity(); + let mutable = file.buffered_writer.inspect_mutable(); + let cap = 
mutable.capacity(); + let align = mutable.align(); - let write_nbytes = cap + cap / 2; + let write_nbytes = cap * 2 + cap / 2; let content: Vec = rand::thread_rng() .sample_iter(rand::distributions::Standard) @@ -341,30 +410,39 @@ mod tests { .collect(); let mut value_offsets = Vec::new(); - for i in 0..write_nbytes { - let off = file.write_raw(&content[i..i + 1], &ctx).await.unwrap(); + for range in (0..write_nbytes) + .step_by(align) + .map(|start| start..(start + align).min(write_nbytes)) + { + let off = file.write_raw(&content[range], &ctx).await.unwrap(); value_offsets.push(off); } - assert!(file.len() as usize == write_nbytes); - for i in 0..write_nbytes { - assert_eq!(value_offsets[i], i.into_u64()); - let buf = IoBufferMut::with_capacity(1); + assert_eq!(file.len() as usize, write_nbytes); + for (i, range) in (0..write_nbytes) + .step_by(align) + .map(|start| start..(start + align).min(write_nbytes)) + .enumerate() + { + assert_eq!(value_offsets[i], range.start.into_u64()); + let buf = IoBufferMut::with_capacity(range.len()); let (buf_slice, nread) = file - .read_exact_at_eof_ok(i.into_u64(), buf.slice_full(), &ctx) + .read_exact_at_eof_ok(range.start.into_u64(), buf.slice_full(), &ctx) .await .unwrap(); let buf = buf_slice.into_inner(); - assert_eq!(nread, 1); - assert_eq!(&buf, &content[i..i + 1]); + assert_eq!(nread, range.len()); + assert_eq!(&buf, &content[range]); } - let file_contents = - std::fs::read(file.buffered_writer.as_inner().as_inner().path()).unwrap(); - assert_eq!(file_contents, &content[0..cap]); + let file_contents = std::fs::read(file.buffered_writer.as_inner().path()).unwrap(); + assert!(file_contents == content[0..cap * 2]); - let buffer_contents = file.buffered_writer.inspect_buffer(); - assert_eq!(buffer_contents, &content[cap..write_nbytes]); + let maybe_flushed_buffer_contents = file.buffered_writer.inspect_maybe_flushed().unwrap(); + assert_eq!(&maybe_flushed_buffer_contents[..], &content[cap..cap * 2]); + + let 
mutable_buffer_contents = file.buffered_writer.inspect_mutable(); + assert_eq!(mutable_buffer_contents, &content[cap * 2..write_nbytes]); } #[tokio::test] @@ -373,16 +451,16 @@ mod tests { let gate = utils::sync::gate::Gate::default(); - let mut file = - EphemeralFile::create(conf, tenant_id, timeline_id, gate.enter().unwrap(), &ctx) - .await - .unwrap(); + let mut file = EphemeralFile::create(conf, tenant_id, timeline_id, &gate, &ctx) + .await + .unwrap(); - let cap = file.buffered_writer.inspect_buffer().capacity(); + // mutable buffer and maybe_flushed buffer each has `cap` bytes. + let cap = file.buffered_writer.inspect_mutable().capacity(); let content: Vec = rand::thread_rng() .sample_iter(rand::distributions::Standard) - .take(cap + cap / 2) + .take(cap * 2 + cap / 2) .collect(); file.write_raw(&content, &ctx).await.unwrap(); @@ -390,23 +468,21 @@ mod tests { // assert the state is as this test expects it to be assert_eq!( &file.load_to_io_buf(&ctx).await.unwrap(), - &content[0..cap + cap / 2] + &content[0..cap * 2 + cap / 2] ); - let md = file - .buffered_writer - .as_inner() - .as_inner() - .path() - .metadata() - .unwrap(); + let md = file.buffered_writer.as_inner().path().metadata().unwrap(); assert_eq!( md.len(), - cap.into_u64(), - "buffered writer does one write if we write 1.5x buffer capacity" + 2 * cap.into_u64(), + "buffered writer requires one write to be flushed if we write 2.5x buffer capacity" ); assert_eq!( - &file.buffered_writer.inspect_buffer()[0..cap / 2], - &content[cap..cap + cap / 2] + &file.buffered_writer.inspect_maybe_flushed().unwrap()[0..cap], + &content[cap..cap * 2] + ); + assert_eq!( + &file.buffered_writer.inspect_mutable()[0..cap / 2], + &content[cap * 2..cap * 2 + cap / 2] ); } @@ -422,19 +498,19 @@ mod tests { let gate = utils::sync::gate::Gate::default(); - let mut file = - EphemeralFile::create(conf, tenant_id, timeline_id, gate.enter().unwrap(), &ctx) - .await - .unwrap(); - - let cap = 
file.buffered_writer.inspect_buffer().capacity(); + let mut file = EphemeralFile::create(conf, tenant_id, timeline_id, &gate, &ctx) + .await + .unwrap(); + let mutable = file.buffered_writer.inspect_mutable(); + let cap = mutable.capacity(); + let align = mutable.align(); let content: Vec = rand::thread_rng() .sample_iter(rand::distributions::Standard) - .take(cap + cap / 2) + .take(cap * 2 + cap / 2) .collect(); - file.write_raw(&content, &ctx).await.unwrap(); + let (_, control) = file.write_raw_controlled(&content, &ctx).await.unwrap(); let test_read = |start: usize, len: usize| { let file = &file; @@ -454,16 +530,38 @@ mod tests { } }; + let test_read_all_offset_combinations = || { + async move { + test_read(align, align).await; + // border onto edge of file + test_read(cap - align, align).await; + // read across file and buffer + test_read(cap - align, 2 * align).await; + // stay from start of maybe flushed buffer + test_read(cap, align).await; + // completely within maybe flushed buffer + test_read(cap + align, align).await; + // border onto edge of maybe flushed buffer. + test_read(cap * 2 - align, align).await; + // read across maybe flushed and mutable buffer + test_read(cap * 2 - align, 2 * align).await; + // read across three segments + test_read(cap - align, cap + 2 * align).await; + // completely within mutable buffer + test_read(cap * 2 + align, align).await; + } + }; + // completely within the file range - assert!(20 < cap, "test assumption"); - test_read(10, 10).await; - // border onto edge of file - test_read(cap - 10, 10).await; - // read across file and buffer - test_read(cap - 10, 20).await; - // stay from start of buffer - test_read(cap, 10).await; - // completely within buffer - test_read(cap + 10, 10).await; + assert!(align < cap, "test assumption"); + assert!(cap % align == 0); + + // test reads at different flush stages. 
+ let not_started = control.unwrap().into_not_started(); + test_read_all_offset_combinations().await; + let in_progress = not_started.ready_to_flush(); + test_read_all_offset_combinations().await; + in_progress.wait_until_flush_is_done().await; + test_read_all_offset_combinations().await; } } diff --git a/pageserver/src/tenant/mgr.rs b/pageserver/src/tenant/mgr.rs index eb8191e43e..e8b0d1d4dd 100644 --- a/pageserver/src/tenant/mgr.rs +++ b/pageserver/src/tenant/mgr.rs @@ -347,7 +347,7 @@ async fn init_load_generations( ); emergency_generations(tenant_confs) } else if let Some(client) = ControllerUpcallClient::new(conf, cancel) { - info!("Calling control plane API to re-attach tenants"); + info!("Calling {} API to re-attach tenants", client.base_url()); // If we are configured to use the control plane API, then it is the source of truth for what tenants to load. match client.re_attach(conf).await { Ok(tenants) => tenants @@ -894,7 +894,7 @@ impl TenantManager { Some(TenantSlot::Attached(tenant)) => Ok(Arc::clone(tenant)), Some(TenantSlot::InProgress(_)) => Err(GetTenantError::NotActive(tenant_shard_id)), None | Some(TenantSlot::Secondary(_)) => { - Err(GetTenantError::NotFound(tenant_shard_id.tenant_id)) + Err(GetTenantError::ShardNotFound(tenant_shard_id)) } } } @@ -2258,6 +2258,9 @@ pub(crate) enum GetTenantError { #[error("Tenant {0} not found")] NotFound(TenantId), + #[error("Tenant {0} not found")] + ShardNotFound(TenantShardId), + #[error("Tenant {0} is not active")] NotActive(TenantShardId), diff --git a/pageserver/src/tenant/remote_timeline_client.rs b/pageserver/src/tenant/remote_timeline_client.rs index 007bd3eef0..89b935947d 100644 --- a/pageserver/src/tenant/remote_timeline_client.rs +++ b/pageserver/src/tenant/remote_timeline_client.rs @@ -681,6 +681,7 @@ impl RemoteTimelineClient { layer_file_name: &LayerName, layer_metadata: &LayerFileMetadata, local_path: &Utf8Path, + gate: &utils::sync::gate::Gate, cancel: &CancellationToken, ctx: &RequestContext, ) 
-> Result { @@ -700,6 +701,7 @@ impl RemoteTimelineClient { layer_file_name, layer_metadata, local_path, + gate, cancel, ctx, ) @@ -2564,9 +2566,9 @@ pub fn parse_remote_index_path(path: RemotePath) -> Option { } /// Given the key of a tenant manifest, parse out the generation number -pub(crate) fn parse_remote_tenant_manifest_path(path: RemotePath) -> Option { +pub fn parse_remote_tenant_manifest_path(path: RemotePath) -> Option { static RE: OnceLock = OnceLock::new(); - let re = RE.get_or_init(|| Regex::new(r".+tenant-manifest-([0-9a-f]{8}).json").unwrap()); + let re = RE.get_or_init(|| Regex::new(r".*tenant-manifest-([0-9a-f]{8}).json").unwrap()); re.captures(path.get_path().as_str()) .and_then(|c| c.get(1)) .and_then(|m| Generation::parse_suffix(m.as_str())) diff --git a/pageserver/src/tenant/remote_timeline_client/download.rs b/pageserver/src/tenant/remote_timeline_client/download.rs index d632e595ad..d15f161fb6 100644 --- a/pageserver/src/tenant/remote_timeline_client/download.rs +++ b/pageserver/src/tenant/remote_timeline_client/download.rs @@ -26,11 +26,11 @@ use crate::span::{ use crate::tenant::remote_timeline_client::{remote_layer_path, remote_timelines_path}; use crate::tenant::storage_layer::LayerName; use crate::tenant::Generation; -#[cfg_attr(target_os = "macos", allow(unused_imports))] -use crate::virtual_file::owned_buffers_io::io_buf_ext::IoBufExt; use crate::virtual_file::{on_fatal_io_error, MaybeFatalIo, VirtualFile}; use crate::TEMP_FILE_SUFFIX; -use remote_storage::{DownloadError, DownloadOpts, GenericRemoteStorage, ListingMode, RemotePath}; +use remote_storage::{ + DownloadError, DownloadKind, DownloadOpts, GenericRemoteStorage, ListingMode, RemotePath, +}; use utils::crashsafe::path_with_suffix_extension; use utils::id::{TenantId, TimelineId}; use utils::pausable_failpoint; @@ -58,6 +58,7 @@ pub async fn download_layer_file<'a>( layer_file_name: &'a LayerName, layer_metadata: &'a LayerFileMetadata, local_path: &Utf8Path, + gate: 
&utils::sync::gate::Gate, cancel: &CancellationToken, ctx: &RequestContext, ) -> Result { @@ -86,7 +87,9 @@ pub async fn download_layer_file<'a>( let temp_file_path = path_with_suffix_extension(local_path, TEMP_DOWNLOAD_EXTENSION); let bytes_amount = download_retry( - || async { download_object(storage, &remote_path, &temp_file_path, cancel, ctx).await }, + || async { + download_object(storage, &remote_path, &temp_file_path, gate, cancel, ctx).await + }, &format!("download {remote_path:?}"), cancel, ) @@ -146,6 +149,7 @@ async fn download_object<'a>( storage: &'a GenericRemoteStorage, src_path: &RemotePath, dst_path: &Utf8PathBuf, + #[cfg_attr(target_os = "macos", allow(unused_variables))] gate: &utils::sync::gate::Gate, cancel: &CancellationToken, #[cfg_attr(target_os = "macos", allow(unused_variables))] ctx: &RequestContext, ) -> Result { @@ -203,13 +207,18 @@ async fn download_object<'a>( } #[cfg(target_os = "linux")] crate::virtual_file::io_engine::IoEngine::TokioEpollUring => { - use crate::virtual_file::owned_buffers_io::{self, util::size_tracking_writer}; - use bytes::BytesMut; + use crate::virtual_file::owned_buffers_io; + use crate::virtual_file::IoBufferMut; + use std::sync::Arc; async { - let destination_file = VirtualFile::create(dst_path, ctx) - .await - .with_context(|| format!("create a destination file for layer '{dst_path}'")) - .map_err(DownloadError::Other)?; + let destination_file = Arc::new( + VirtualFile::create(dst_path, ctx) + .await + .with_context(|| { + format!("create a destination file for layer '{dst_path}'") + }) + .map_err(DownloadError::Other)?, + ); let mut download = storage .download(src_path, &DownloadOpts::default(), cancel) @@ -217,14 +226,16 @@ async fn download_object<'a>( pausable_failpoint!("before-downloading-layer-stream-pausable"); + let mut buffered = owned_buffers_io::write::BufferedWriter::::new( + destination_file, + || IoBufferMut::with_capacity(super::BUFFER_SIZE), + gate.enter().map_err(|_| 
DownloadError::Cancelled)?, + ctx, + ); + // TODO: use vectored write (writev) once supported by tokio-epoll-uring. // There's chunks_vectored() on the stream. let (bytes_amount, destination_file) = async { - let size_tracking = size_tracking_writer::Writer::new(destination_file); - let mut buffered = owned_buffers_io::write::BufferedWriter::::new( - size_tracking, - BytesMut::with_capacity(super::BUFFER_SIZE), - ); while let Some(res) = futures::StreamExt::next(&mut download.download_stream).await { @@ -232,10 +243,10 @@ async fn download_object<'a>( Ok(chunk) => chunk, Err(e) => return Err(e), }; - buffered.write_buffered(chunk.slice_len(), ctx).await?; + buffered.write_buffered_borrowed(&chunk, ctx).await?; } - let size_tracking = buffered.flush_and_into_inner(ctx).await?; - Ok(size_tracking.into_inner()) + let inner = buffered.flush_and_into_inner(ctx).await?; + Ok(inner) } .await?; @@ -345,12 +356,13 @@ pub async fn list_remote_timelines( async fn do_download_remote_path_retry_forever( storage: &GenericRemoteStorage, remote_path: &RemotePath, + download_opts: DownloadOpts, cancel: &CancellationToken, ) -> Result<(Vec, SystemTime), DownloadError> { download_retry_forever( || async { let download = storage - .download(remote_path, &DownloadOpts::default(), cancel) + .download(remote_path, &download_opts, cancel) .await?; let mut bytes = Vec::new(); @@ -377,8 +389,13 @@ async fn do_download_tenant_manifest( ) -> Result<(TenantManifest, Generation, SystemTime), DownloadError> { let remote_path = remote_tenant_manifest_path(tenant_shard_id, generation); + let download_opts = DownloadOpts { + kind: DownloadKind::Small, + ..Default::default() + }; + let (manifest_bytes, manifest_bytes_mtime) = - do_download_remote_path_retry_forever(storage, &remote_path, cancel).await?; + do_download_remote_path_retry_forever(storage, &remote_path, download_opts, cancel).await?; let tenant_manifest = TenantManifest::from_json_bytes(&manifest_bytes) .with_context(|| 
format!("deserialize tenant manifest file at {remote_path:?}")) @@ -398,8 +415,13 @@ async fn do_download_index_part( timeline_id.expect("A timeline ID is always provided when downloading an index"); let remote_path = remote_index_path(tenant_shard_id, timeline_id, index_generation); + let download_opts = DownloadOpts { + kind: DownloadKind::Small, + ..Default::default() + }; + let (index_part_bytes, index_part_mtime) = - do_download_remote_path_retry_forever(storage, &remote_path, cancel).await?; + do_download_remote_path_retry_forever(storage, &remote_path, download_opts, cancel).await?; let index_part: IndexPart = serde_json::from_slice(&index_part_bytes) .with_context(|| format!("deserialize index part file at {remote_path:?}")) diff --git a/pageserver/src/tenant/remote_timeline_client/manifest.rs b/pageserver/src/tenant/remote_timeline_client/manifest.rs index c4382cb648..2029847a12 100644 --- a/pageserver/src/tenant/remote_timeline_client/manifest.rs +++ b/pageserver/src/tenant/remote_timeline_client/manifest.rs @@ -43,7 +43,7 @@ impl TenantManifest { offloaded_timelines: vec![], } } - pub(crate) fn from_json_bytes(bytes: &[u8]) -> Result { + pub fn from_json_bytes(bytes: &[u8]) -> Result { serde_json::from_slice::(bytes) } diff --git a/pageserver/src/tenant/secondary.rs b/pageserver/src/tenant/secondary.rs index 3df89a928c..4bc208331b 100644 --- a/pageserver/src/tenant/secondary.rs +++ b/pageserver/src/tenant/secondary.rs @@ -22,6 +22,7 @@ use super::{ mgr::TenantManager, span::debug_assert_current_span_has_tenant_id, storage_layer::LayerName, + GetTenantError, }; use crate::metrics::SECONDARY_RESIDENT_PHYSICAL_SIZE; @@ -66,7 +67,21 @@ struct CommandRequest { } struct CommandResponse { - result: anyhow::Result<()>, + result: Result<(), SecondaryTenantError>, +} + +#[derive(thiserror::Error, Debug)] +pub(crate) enum SecondaryTenantError { + #[error("{0}")] + GetTenant(GetTenantError), + #[error("shutting down")] + ShuttingDown, +} + +impl From for 
SecondaryTenantError { + fn from(gte: GetTenantError) -> Self { + Self::GetTenant(gte) + } } // Whereas [`Tenant`] represents an attached tenant, this type represents the work @@ -285,7 +300,7 @@ impl SecondaryController { &self, queue: &tokio::sync::mpsc::Sender>, payload: T, - ) -> anyhow::Result<()> { + ) -> Result<(), SecondaryTenantError> { let (response_tx, response_rx) = tokio::sync::oneshot::channel(); queue @@ -294,20 +309,26 @@ impl SecondaryController { response_tx, }) .await - .map_err(|_| anyhow::anyhow!("Receiver shut down"))?; + .map_err(|_| SecondaryTenantError::ShuttingDown)?; let response = response_rx .await - .map_err(|_| anyhow::anyhow!("Request dropped"))?; + .map_err(|_| SecondaryTenantError::ShuttingDown)?; response.result } - pub async fn upload_tenant(&self, tenant_shard_id: TenantShardId) -> anyhow::Result<()> { + pub(crate) async fn upload_tenant( + &self, + tenant_shard_id: TenantShardId, + ) -> Result<(), SecondaryTenantError> { self.dispatch(&self.upload_req_tx, UploadCommand::Upload(tenant_shard_id)) .await } - pub async fn download_tenant(&self, tenant_shard_id: TenantShardId) -> anyhow::Result<()> { + pub(crate) async fn download_tenant( + &self, + tenant_shard_id: TenantShardId, + ) -> Result<(), SecondaryTenantError> { self.dispatch( &self.download_req_tx, DownloadCommand::Download(tenant_shard_id), diff --git a/pageserver/src/tenant/secondary/downloader.rs b/pageserver/src/tenant/secondary/downloader.rs index 7443261a9c..395e34e404 100644 --- a/pageserver/src/tenant/secondary/downloader.rs +++ b/pageserver/src/tenant/secondary/downloader.rs @@ -35,7 +35,7 @@ use super::{ self, period_jitter, period_warmup, Completion, JobGenerator, SchedulingResult, TenantBackgroundJobs, }, - SecondaryTenant, + GetTenantError, SecondaryTenant, SecondaryTenantError, }; use crate::tenant::{ @@ -49,7 +49,7 @@ use futures::Future; use metrics::UIntGauge; use pageserver_api::models::SecondaryProgress; use pageserver_api::shard::TenantShardId; -use 
remote_storage::{DownloadError, DownloadOpts, Etag, GenericRemoteStorage}; +use remote_storage::{DownloadError, DownloadKind, DownloadOpts, Etag, GenericRemoteStorage}; use tokio_util::sync::CancellationToken; use tracing::{info_span, instrument, warn, Instrument}; @@ -470,15 +470,16 @@ impl JobGenerator anyhow::Result { + fn on_command( + &mut self, + command: DownloadCommand, + ) -> Result { let tenant_shard_id = command.get_tenant_shard_id(); let tenant = self .tenant_manager - .get_secondary_tenant_shard(*tenant_shard_id); - let Some(tenant) = tenant else { - return Err(anyhow::anyhow!("Not found or not in Secondary mode")); - }; + .get_secondary_tenant_shard(*tenant_shard_id) + .ok_or(GetTenantError::ShardNotFound(*tenant_shard_id))?; Ok(PendingDownload { target_time: None, @@ -946,6 +947,7 @@ impl<'a> TenantDownloader<'a> { let cancel = &self.secondary_state.cancel; let opts = DownloadOpts { etag: prev_etag.cloned(), + kind: DownloadKind::Small, ..Default::default() }; @@ -1181,6 +1183,7 @@ impl<'a> TenantDownloader<'a> { &layer.name, &layer.metadata, &local_path, + &self.secondary_state.gate, &self.secondary_state.cancel, ctx, ) diff --git a/pageserver/src/tenant/secondary/heatmap_uploader.rs b/pageserver/src/tenant/secondary/heatmap_uploader.rs index e680fd705b..c5e5e04945 100644 --- a/pageserver/src/tenant/secondary/heatmap_uploader.rs +++ b/pageserver/src/tenant/secondary/heatmap_uploader.rs @@ -28,7 +28,7 @@ use super::{ self, period_jitter, period_warmup, JobGenerator, RunningJob, SchedulingResult, TenantBackgroundJobs, }, - CommandRequest, UploadCommand, + CommandRequest, SecondaryTenantError, UploadCommand, }; use tokio_util::sync::CancellationToken; use tracing::{info_span, instrument, Instrument}; @@ -279,7 +279,10 @@ impl JobGenerator }.instrument(info_span!(parent: None, "heatmap_upload", tenant_id=%tenant_shard_id.tenant_id, shard_id=%tenant_shard_id.shard_slug())))) } - fn on_command(&mut self, command: UploadCommand) -> anyhow::Result { + fn 
on_command( + &mut self, + command: UploadCommand, + ) -> Result { let tenant_shard_id = command.get_tenant_shard_id(); tracing::info!( @@ -287,8 +290,7 @@ impl JobGenerator "Starting heatmap write on command"); let tenant = self .tenant_manager - .get_attached_tenant_shard(*tenant_shard_id) - .map_err(|e| anyhow::anyhow!(e))?; + .get_attached_tenant_shard(*tenant_shard_id)?; if !tenant.is_active() { return Err(GetTenantError::NotActive(*tenant_shard_id).into()); } diff --git a/pageserver/src/tenant/secondary/scheduler.rs b/pageserver/src/tenant/secondary/scheduler.rs index 28cf2125df..e963c722b9 100644 --- a/pageserver/src/tenant/secondary/scheduler.rs +++ b/pageserver/src/tenant/secondary/scheduler.rs @@ -12,7 +12,7 @@ use tokio::task::JoinSet; use tokio_util::sync::CancellationToken; use utils::{completion::Barrier, yielding_loop::yielding_loop}; -use super::{CommandRequest, CommandResponse}; +use super::{CommandRequest, CommandResponse, SecondaryTenantError}; /// Scheduling interval is the time between calls to JobGenerator::schedule. /// When we schedule jobs, the job generator may provide a hint of its preferred @@ -112,7 +112,7 @@ where /// Called when a command is received. A job will be spawned immediately if the return /// value is Some, ignoring concurrency limits and the pending queue. 
- fn on_command(&mut self, cmd: CMD) -> anyhow::Result; + fn on_command(&mut self, cmd: CMD) -> Result; } /// [`JobGenerator`] returns this to provide pending jobs, and hints about scheduling diff --git a/pageserver/src/tenant/storage_layer/inmemory_layer.rs b/pageserver/src/tenant/storage_layer/inmemory_layer.rs index af6112d535..71e53da20f 100644 --- a/pageserver/src/tenant/storage_layer/inmemory_layer.rs +++ b/pageserver/src/tenant/storage_layer/inmemory_layer.rs @@ -555,13 +555,12 @@ impl InMemoryLayer { timeline_id: TimelineId, tenant_shard_id: TenantShardId, start_lsn: Lsn, - gate_guard: utils::sync::gate::GateGuard, + gate: &utils::sync::gate::Gate, ctx: &RequestContext, ) -> Result { trace!("initializing new empty InMemoryLayer for writing on timeline {timeline_id} at {start_lsn}"); - let file = - EphemeralFile::create(conf, tenant_shard_id, timeline_id, gate_guard, ctx).await?; + let file = EphemeralFile::create(conf, tenant_shard_id, timeline_id, gate, ctx).await?; let key = InMemoryLayerFileId(file.page_cache_file_id()); Ok(InMemoryLayer { diff --git a/pageserver/src/tenant/storage_layer/layer.rs b/pageserver/src/tenant/storage_layer/layer.rs index a9f1189b41..8933e8ceb1 100644 --- a/pageserver/src/tenant/storage_layer/layer.rs +++ b/pageserver/src/tenant/storage_layer/layer.rs @@ -1149,6 +1149,7 @@ impl LayerInner { &self.desc.layer_name(), &self.metadata(), &self.path, + &timeline.gate, &timeline.cancel, ctx, ) diff --git a/pageserver/src/tenant/tasks.rs b/pageserver/src/tenant/tasks.rs index 16dac10dca..0118a5ce5f 100644 --- a/pageserver/src/tenant/tasks.rs +++ b/pageserver/src/tenant/tasks.rs @@ -471,14 +471,14 @@ async fn ingest_housekeeping_loop(tenant: Arc, cancel: CancellationToken // TODO: rename the background loop kind to something more generic, like, tenant housekeeping. // Or just spawn another background loop for this throttle, it's not like it's super costly. 
- info_span!(parent: None, "timeline_get_throttle", tenant_id=%tenant.tenant_shard_id, shard_id=%tenant.tenant_shard_id.shard_slug()).in_scope(|| { + info_span!(parent: None, "pagestream_throttle", tenant_id=%tenant.tenant_shard_id, shard_id=%tenant.tenant_shard_id.shard_slug()).in_scope(|| { let now = Instant::now(); let prev = std::mem::replace(&mut last_throttle_flag_reset_at, now); - let Stats { count_accounted_start, count_accounted_finish, count_throttled, sum_throttled_usecs} = tenant.timeline_get_throttle.reset_stats(); + let Stats { count_accounted_start, count_accounted_finish, count_throttled, sum_throttled_usecs} = tenant.pagestream_throttle.reset_stats(); if count_throttled == 0 { return; } - let allowed_rps = tenant.timeline_get_throttle.steady_rps(); + let allowed_rps = tenant.pagestream_throttle.steady_rps(); let delta = now - prev; info!( n_seconds=%format_args!("{:.3}", delta.as_secs_f64()), diff --git a/pageserver/src/tenant/throttle.rs b/pageserver/src/tenant/throttle.rs index 6a80953901..54c0e59daa 100644 --- a/pageserver/src/tenant/throttle.rs +++ b/pageserver/src/tenant/throttle.rs @@ -1,19 +1,14 @@ use std::{ - str::FromStr, sync::{ atomic::{AtomicU64, Ordering}, - Arc, Mutex, + Arc, }, time::{Duration, Instant}, }; use arc_swap::ArcSwap; -use enumset::EnumSet; -use tracing::{error, warn}; use utils::leaky_bucket::{LeakyBucketConfig, RateLimiter}; -use crate::{context::RequestContext, task_mgr::TaskKind}; - /// Throttle for `async` functions. /// /// Runtime reconfigurable. 
@@ -35,7 +30,7 @@ pub struct Throttle { } pub struct Inner { - task_kinds: EnumSet, + enabled: bool, rate_limiter: Arc, } @@ -79,26 +74,12 @@ where } fn new_inner(config: Config) -> Inner { let Config { - task_kinds, + enabled, initial, refill_interval, refill_amount, max, } = config; - let task_kinds: EnumSet = task_kinds - .iter() - .filter_map(|s| match TaskKind::from_str(s) { - Ok(v) => Some(v), - Err(e) => { - // TODO: avoid this failure mode - error!( - "cannot parse task kind, ignoring for rate limiting {}", - utils::error::report_compact_sources(&e) - ); - None - } - }) - .collect(); // steady rate, we expect `refill_amount` requests per `refill_interval`. // dividing gives us the rps. @@ -112,7 +93,7 @@ where let rate_limiter = RateLimiter::with_initial_tokens(config, f64::from(initial_tokens)); Inner { - task_kinds, + enabled: enabled.is_enabled(), rate_limiter: Arc::new(rate_limiter), } } @@ -141,11 +122,13 @@ where self.inner.load().rate_limiter.steady_rps() } - pub async fn throttle(&self, ctx: &RequestContext, key_count: usize) -> Option { + pub async fn throttle(&self, key_count: usize) -> Option { let inner = self.inner.load_full(); // clones the `Inner` Arc - if !inner.task_kinds.contains(ctx.task_kind()) { + + if !inner.enabled { return None; - }; + } + let start = std::time::Instant::now(); self.metric.accounting_start(); @@ -162,19 +145,6 @@ where .fetch_add(wait_time.as_micros() as u64, Ordering::Relaxed); let observation = Observation { wait_time }; self.metric.observe_throttling(&observation); - match ctx.micros_spent_throttled.add(wait_time) { - Ok(res) => res, - Err(error) => { - use once_cell::sync::Lazy; - use utils::rate_limit::RateLimit; - static WARN_RATE_LIMIT: Lazy> = - Lazy::new(|| Mutex::new(RateLimit::new(Duration::from_secs(10)))); - let mut guard = WARN_RATE_LIMIT.lock().unwrap(); - guard.call(move || { - warn!(error, "error adding time spent throttled; this message is logged at a global rate limit"); - }); - } - } 
Some(wait_time) } else { None diff --git a/pageserver/src/tenant/timeline.rs b/pageserver/src/tenant/timeline.rs index 730477a7f4..fc741826ab 100644 --- a/pageserver/src/tenant/timeline.rs +++ b/pageserver/src/tenant/timeline.rs @@ -208,8 +208,8 @@ fn drop_wlock(rlock: tokio::sync::RwLockWriteGuard<'_, T>) { /// The outward-facing resources required to build a Timeline pub struct TimelineResources { pub remote_client: RemoteTimelineClient, - pub timeline_get_throttle: - Arc>, + pub pagestream_throttle: + Arc>, pub l0_flush_global_state: l0_flush::L0FlushGlobalState, } @@ -411,9 +411,9 @@ pub struct Timeline { /// Timeline deletion will acquire both compaction and gc locks in whatever order. gc_lock: tokio::sync::Mutex<()>, - /// Cloned from [`super::Tenant::timeline_get_throttle`] on construction. - timeline_get_throttle: - Arc>, + /// Cloned from [`super::Tenant::pagestream_throttle`] on construction. + pub(crate) pagestream_throttle: + Arc>, /// Size estimator for aux file v2 pub(crate) aux_file_size_estimator: AuxFileSizeEstimator, @@ -949,7 +949,7 @@ impl Timeline { /// If a remote layer file is needed, it is downloaded as part of this /// call. /// - /// This method enforces [`Self::timeline_get_throttle`] internally. + /// This method enforces [`Self::pagestream_throttle`] internally. /// /// NOTE: It is considered an error to 'get' a key that doesn't exist. The /// abstraction above this needs to store suitable metadata to track what @@ -977,8 +977,6 @@ impl Timeline { // page_service. 
debug_assert!(!self.shard_identity.is_key_disposable(&key)); - self.timeline_get_throttle.throttle(ctx, 1).await; - let keyspace = KeySpace { ranges: vec![key..key.next()], }; @@ -1058,13 +1056,6 @@ impl Timeline { .for_task_kind(ctx.task_kind()) .map(|metric| (metric, Instant::now())); - // start counting after throttle so that throttle time - // is always less than observation time - let throttled = self - .timeline_get_throttle - .throttle(ctx, key_count as usize) - .await; - let res = self .get_vectored_impl( keyspace.clone(), @@ -1076,23 +1067,7 @@ impl Timeline { if let Some((metric, start)) = start { let elapsed = start.elapsed(); - let ex_throttled = if let Some(throttled) = throttled { - elapsed.checked_sub(throttled) - } else { - Some(elapsed) - }; - - if let Some(ex_throttled) = ex_throttled { - metric.observe(ex_throttled.as_secs_f64()); - } else { - use utils::rate_limit::RateLimit; - static LOGGED: Lazy> = - Lazy::new(|| Mutex::new(RateLimit::new(Duration::from_secs(10)))); - let mut rate_limit = LOGGED.lock().unwrap(); - rate_limit.call(|| { - warn!("error deducting time spent throttled; this message is logged at a global rate limit"); - }); - } + metric.observe(elapsed.as_secs_f64()); } res @@ -1137,14 +1112,6 @@ impl Timeline { .for_task_kind(ctx.task_kind()) .map(ScanLatencyOngoingRecording::start_recording); - // start counting after throttle so that throttle time - // is always less than observation time - let throttled = self - .timeline_get_throttle - // assume scan = 1 quota for now until we find a better way to process this - .throttle(ctx, 1) - .await; - let vectored_res = self .get_vectored_impl( keyspace.clone(), @@ -1155,7 +1122,7 @@ impl Timeline { .await; if let Some(recording) = start { - recording.observe(throttled); + recording.observe(); } vectored_res @@ -2371,7 +2338,7 @@ impl Timeline { standby_horizon: AtomicLsn::new(0), - timeline_get_throttle: resources.timeline_get_throttle, + pagestream_throttle: 
resources.pagestream_throttle, aux_file_size_estimator: AuxFileSizeEstimator::new(aux_file_metrics), @@ -3488,7 +3455,6 @@ impl Timeline { ctx: &RequestContext, ) -> anyhow::Result> { let mut guard = self.layers.write().await; - let gate_guard = self.gate.enter().context("enter gate for inmem layer")?; let last_record_lsn = self.get_last_record_lsn(); ensure!( @@ -3505,7 +3471,7 @@ impl Timeline { self.conf, self.timeline_id, self.tenant_shard_id, - gate_guard, + &self.gate, ctx, ) .await?; diff --git a/pageserver/src/tenant/timeline/delete.rs b/pageserver/src/tenant/timeline/delete.rs index 67fc710c44..47a93b19d2 100644 --- a/pageserver/src/tenant/timeline/delete.rs +++ b/pageserver/src/tenant/timeline/delete.rs @@ -298,7 +298,7 @@ impl DeleteTimelineFlow { None, // Ancestor is not needed for deletion. TimelineResources { remote_client, - timeline_get_throttle: tenant.timeline_get_throttle.clone(), + pagestream_throttle: tenant.pagestream_throttle.clone(), l0_flush_global_state: tenant.l0_flush_global_state.clone(), }, // Important. We dont pass ancestor above because it can be missing. 
diff --git a/pageserver/src/tenant/timeline/import_pgdata/flow.rs b/pageserver/src/tenant/timeline/import_pgdata/flow.rs index cbd4168c06..4388072606 100644 --- a/pageserver/src/tenant/timeline/import_pgdata/flow.rs +++ b/pageserver/src/tenant/timeline/import_pgdata/flow.rs @@ -129,22 +129,23 @@ impl Flow { } // Import SLRUs - - // pg_xact (01:00 keyspace) - self.import_slru(SlruKind::Clog, &self.storage.pgdata().join("pg_xact")) + if self.timeline.tenant_shard_id.is_shard_zero() { + // pg_xact (01:00 keyspace) + self.import_slru(SlruKind::Clog, &self.storage.pgdata().join("pg_xact")) + .await?; + // pg_multixact/members (01:01 keyspace) + self.import_slru( + SlruKind::MultiXactMembers, + &self.storage.pgdata().join("pg_multixact/members"), + ) .await?; - // pg_multixact/members (01:01 keyspace) - self.import_slru( - SlruKind::MultiXactMembers, - &self.storage.pgdata().join("pg_multixact/members"), - ) - .await?; - // pg_multixact/offsets (01:02 keyspace) - self.import_slru( - SlruKind::MultiXactOffsets, - &self.storage.pgdata().join("pg_multixact/offsets"), - ) - .await?; + // pg_multixact/offsets (01:02 keyspace) + self.import_slru( + SlruKind::MultiXactOffsets, + &self.storage.pgdata().join("pg_multixact/offsets"), + ) + .await?; + } // Import pg_twophase. 
// TODO: as empty @@ -302,6 +303,8 @@ impl Flow { } async fn import_slru(&mut self, kind: SlruKind, path: &RemotePath) -> anyhow::Result<()> { + assert!(self.timeline.tenant_shard_id.is_shard_zero()); + let segments = self.storage.listfilesindir(path).await?; let segments: Vec<(String, u32, usize)> = segments .into_iter() @@ -337,7 +340,6 @@ impl Flow { debug!(%p, segno=%segno, %size, %start_key, %end_key, "scheduling SLRU segment"); self.tasks .push(AnyImportTask::SlruBlocks(ImportSlruBlocksTask::new( - *self.timeline.get_shard_identity(), start_key..end_key, &p, self.storage.clone(), @@ -631,21 +633,14 @@ impl ImportTask for ImportRelBlocksTask { } struct ImportSlruBlocksTask { - shard_identity: ShardIdentity, key_range: Range, path: RemotePath, storage: RemoteStorageWrapper, } impl ImportSlruBlocksTask { - fn new( - shard_identity: ShardIdentity, - key_range: Range, - path: &RemotePath, - storage: RemoteStorageWrapper, - ) -> Self { + fn new(key_range: Range, path: &RemotePath, storage: RemoteStorageWrapper) -> Self { ImportSlruBlocksTask { - shard_identity, key_range, path: path.clone(), storage, @@ -673,17 +668,13 @@ impl ImportTask for ImportSlruBlocksTask { let mut file_offset = 0; while blknum < end_blk { let key = slru_block_to_key(kind, segno, blknum); - assert!( - !self.shard_identity.is_key_disposable(&key), - "SLRU keys need to go into every shard" - ); let buf = &buf[file_offset..(file_offset + 8192)]; file_offset += 8192; layer_writer .put_image(key, Bytes::copy_from_slice(buf), ctx) .await?; - blknum += 1; nimages += 1; + blknum += 1; } Ok(nimages) } diff --git a/pageserver/src/tenant/timeline/import_pgdata/importbucket_client.rs b/pageserver/src/tenant/timeline/import_pgdata/importbucket_client.rs index 8d5ab1780f..bc4d148a29 100644 --- a/pageserver/src/tenant/timeline/import_pgdata/importbucket_client.rs +++ b/pageserver/src/tenant/timeline/import_pgdata/importbucket_client.rs @@ -4,7 +4,8 @@ use anyhow::Context; use bytes::Bytes; use 
postgres_ffi::ControlFileData; use remote_storage::{ - Download, DownloadError, DownloadOpts, GenericRemoteStorage, Listing, ListingObject, RemotePath, + Download, DownloadError, DownloadKind, DownloadOpts, GenericRemoteStorage, Listing, + ListingObject, RemotePath, }; use serde::de::DeserializeOwned; use tokio_util::sync::CancellationToken; @@ -239,6 +240,7 @@ impl RemoteStorageWrapper { .download( path, &DownloadOpts { + kind: DownloadKind::Large, etag: None, byte_start: Bound::Included(start_inclusive), byte_end: Bound::Excluded(end_exclusive) diff --git a/pageserver/src/tenant/timeline/layer_manager.rs b/pageserver/src/tenant/timeline/layer_manager.rs index 4293a44dca..3888e7f86a 100644 --- a/pageserver/src/tenant/timeline/layer_manager.rs +++ b/pageserver/src/tenant/timeline/layer_manager.rs @@ -182,7 +182,7 @@ impl OpenLayerManager { conf: &'static PageServerConf, timeline_id: TimelineId, tenant_shard_id: TenantShardId, - gate_guard: utils::sync::gate::GateGuard, + gate: &utils::sync::gate::Gate, ctx: &RequestContext, ) -> anyhow::Result> { ensure!(lsn.is_aligned()); @@ -212,15 +212,9 @@ impl OpenLayerManager { lsn ); - let new_layer = InMemoryLayer::create( - conf, - timeline_id, - tenant_shard_id, - start_lsn, - gate_guard, - ctx, - ) - .await?; + let new_layer = + InMemoryLayer::create(conf, timeline_id, tenant_shard_id, start_lsn, gate, ctx) + .await?; let layer = Arc::new(new_layer); self.layer_map.open_layer = Some(layer.clone()); diff --git a/pageserver/src/tenant/timeline/walreceiver/walreceiver_connection.rs b/pageserver/src/tenant/timeline/walreceiver/walreceiver_connection.rs index 31cf1b6307..d90ffbfa2c 100644 --- a/pageserver/src/tenant/timeline/walreceiver/walreceiver_connection.rs +++ b/pageserver/src/tenant/timeline/walreceiver/walreceiver_connection.rs @@ -454,6 +454,10 @@ pub(super) async fn handle_walreceiver_connection( timeline.get_last_record_lsn() ); + if let Some(lsn) = next_record_lsn { + last_rec_lsn = lsn; + } + Some(streaming_lsn) 
} diff --git a/pageserver/src/virtual_file.rs b/pageserver/src/virtual_file.rs index b9f8c7ea20..8a7f4a4bf5 100644 --- a/pageserver/src/virtual_file.rs +++ b/pageserver/src/virtual_file.rs @@ -20,7 +20,7 @@ use camino::{Utf8Path, Utf8PathBuf}; use once_cell::sync::OnceCell; use owned_buffers_io::aligned_buffer::buffer::AlignedBuffer; use owned_buffers_io::aligned_buffer::{AlignedBufferMut, AlignedSlice, ConstAlign}; -use owned_buffers_io::io_buf_aligned::IoBufAlignedMut; +use owned_buffers_io::io_buf_aligned::{IoBufAligned, IoBufAlignedMut}; use owned_buffers_io::io_buf_ext::FullSlice; use pageserver_api::config::defaults::DEFAULT_IO_BUFFER_ALIGNMENT; use pageserver_api::shard::TenantShardId; @@ -63,9 +63,6 @@ pub(crate) mod owned_buffers_io { pub(crate) mod io_buf_ext; pub(crate) mod slice; pub(crate) mod write; - pub(crate) mod util { - pub(crate) mod size_tracking_writer; - } } #[derive(Debug)] @@ -221,7 +218,7 @@ impl VirtualFile { self.inner.read_exact_at_page(page, offset, ctx).await } - pub async fn write_all_at( + pub async fn write_all_at( &self, buf: FullSlice, offset: u64, @@ -1325,14 +1322,14 @@ impl Drop for VirtualFileInner { } impl OwnedAsyncWriter for VirtualFile { - #[inline(always)] - async fn write_all( - &mut self, + async fn write_all_at( + &self, buf: FullSlice, + offset: u64, ctx: &RequestContext, - ) -> std::io::Result<(usize, FullSlice)> { - let (buf, res) = VirtualFile::write_all(self, buf, ctx).await; - res.map(move |v| (v, buf)) + ) -> std::io::Result> { + let (buf, res) = VirtualFile::write_all_at(self, buf, offset, ctx).await; + res.map(|_| buf) } } @@ -1451,7 +1448,7 @@ mod tests { } } } - async fn write_all_at( + async fn write_all_at( &self, buf: FullSlice, offset: u64, @@ -1594,6 +1591,7 @@ mod tests { &ctx, ) .await?; + file_a .write_all(b"foobar".to_vec().slice_len(), &ctx) .await?; @@ -1652,10 +1650,10 @@ mod tests { ) .await?; file_b - .write_all_at(b"BAR".to_vec().slice_len(), 3, &ctx) + 
.write_all_at(IoBuffer::from(b"BAR").slice_len(), 3, &ctx) .await?; file_b - .write_all_at(b"FOO".to_vec().slice_len(), 0, &ctx) + .write_all_at(IoBuffer::from(b"FOO").slice_len(), 0, &ctx) .await?; assert_eq!(file_b.read_string_at(2, 3, &ctx).await?, "OBA"); diff --git a/pageserver/src/virtual_file/owned_buffers_io/aligned_buffer/alignment.rs b/pageserver/src/virtual_file/owned_buffers_io/aligned_buffer/alignment.rs index 933b78a13b..6b9992643f 100644 --- a/pageserver/src/virtual_file/owned_buffers_io/aligned_buffer/alignment.rs +++ b/pageserver/src/virtual_file/owned_buffers_io/aligned_buffer/alignment.rs @@ -4,7 +4,7 @@ pub trait Alignment: std::marker::Unpin + 'static { } /// Alignment at compile time. -#[derive(Debug)] +#[derive(Debug, Clone, Copy)] pub struct ConstAlign; impl Alignment for ConstAlign { @@ -14,7 +14,7 @@ impl Alignment for ConstAlign { } /// Alignment at run time. -#[derive(Debug)] +#[derive(Debug, Clone, Copy)] pub struct RuntimeAlign { align: usize, } diff --git a/pageserver/src/virtual_file/owned_buffers_io/aligned_buffer/buffer.rs b/pageserver/src/virtual_file/owned_buffers_io/aligned_buffer/buffer.rs index 2fba6d699b..a5c26cd746 100644 --- a/pageserver/src/virtual_file/owned_buffers_io/aligned_buffer/buffer.rs +++ b/pageserver/src/virtual_file/owned_buffers_io/aligned_buffer/buffer.rs @@ -3,9 +3,10 @@ use std::{ sync::Arc, }; -use super::{alignment::Alignment, raw::RawAlignedBuffer}; +use super::{alignment::Alignment, raw::RawAlignedBuffer, AlignedBufferMut, ConstAlign}; /// An shared, immutable aligned buffer type. +#[derive(Clone, Debug)] pub struct AlignedBuffer { /// Shared raw buffer. raw: Arc>, @@ -86,6 +87,13 @@ impl AlignedBuffer { range: begin..end, } } + + /// Returns the mutable aligned buffer, if the immutable aligned buffer + /// has exactly one strong reference. Otherwise returns `None`. 
+ pub fn into_mut(self) -> Option> { + let raw = Arc::into_inner(self.raw)?; + Some(AlignedBufferMut::from_raw(raw)) + } } impl Deref for AlignedBuffer { @@ -108,6 +116,14 @@ impl PartialEq<[u8]> for AlignedBuffer { } } +impl From<&[u8; N]> for AlignedBuffer> { + fn from(value: &[u8; N]) -> Self { + let mut buf = AlignedBufferMut::with_capacity(N); + buf.extend_from_slice(value); + buf.freeze() + } +} + /// SAFETY: the underlying buffer references a stable memory region. unsafe impl tokio_epoll_uring::IoBuf for AlignedBuffer { fn stable_ptr(&self) -> *const u8 { diff --git a/pageserver/src/virtual_file/owned_buffers_io/aligned_buffer/buffer_mut.rs b/pageserver/src/virtual_file/owned_buffers_io/aligned_buffer/buffer_mut.rs index b3675d1aea..d2f5e206bb 100644 --- a/pageserver/src/virtual_file/owned_buffers_io/aligned_buffer/buffer_mut.rs +++ b/pageserver/src/virtual_file/owned_buffers_io/aligned_buffer/buffer_mut.rs @@ -1,4 +1,7 @@ -use std::ops::{Deref, DerefMut}; +use std::{ + mem::MaybeUninit, + ops::{Deref, DerefMut}, +}; use super::{ alignment::{Alignment, ConstAlign}, @@ -46,6 +49,11 @@ impl AlignedBufferMut> { } impl AlignedBufferMut { + /// Constructs a mutable aligned buffer from raw. + pub(super) fn from_raw(raw: RawAlignedBuffer) -> Self { + AlignedBufferMut { raw } + } + /// Returns the total number of bytes the buffer can hold. #[inline] pub fn capacity(&self) -> usize { @@ -128,6 +136,39 @@ impl AlignedBufferMut { let len = self.len(); AlignedBuffer::from_raw(self.raw, 0..len) } + + /// Clones and appends all elements in a slice to the buffer. Reserves additional capacity as needed. + #[inline] + pub fn extend_from_slice(&mut self, extend: &[u8]) { + let cnt = extend.len(); + self.reserve(cnt); + + // SAFETY: we already reserved additional `cnt` bytes, safe to perform memcpy. 
+ unsafe { + let dst = self.spare_capacity_mut(); + // Reserved above + debug_assert!(dst.len() >= cnt); + + core::ptr::copy_nonoverlapping(extend.as_ptr(), dst.as_mut_ptr().cast(), cnt); + } + // SAFETY: We do have at least `cnt` bytes remaining before advance. + unsafe { + bytes::BufMut::advance_mut(self, cnt); + } + } + + /// Returns the remaining spare capacity of the vector as a slice of `MaybeUninit`. + #[inline] + fn spare_capacity_mut(&mut self) -> &mut [MaybeUninit] { + // SAFETY: we guarantees that the `Self::capacity()` bytes from + // `Self::as_mut_ptr()` are allocated. + unsafe { + let ptr = self.as_mut_ptr().add(self.len()); + let len = self.capacity() - self.len(); + + core::slice::from_raw_parts_mut(ptr.cast(), len) + } + } } impl Deref for AlignedBufferMut { diff --git a/pageserver/src/virtual_file/owned_buffers_io/io_buf_aligned.rs b/pageserver/src/virtual_file/owned_buffers_io/io_buf_aligned.rs index dba695196e..4ea6b17744 100644 --- a/pageserver/src/virtual_file/owned_buffers_io/io_buf_aligned.rs +++ b/pageserver/src/virtual_file/owned_buffers_io/io_buf_aligned.rs @@ -1,9 +1,15 @@ -use tokio_epoll_uring::IoBufMut; +use tokio_epoll_uring::{IoBuf, IoBufMut}; -use crate::virtual_file::{IoBufferMut, PageWriteGuardBuf}; +use crate::virtual_file::{IoBuffer, IoBufferMut, PageWriteGuardBuf}; +/// A marker trait for a mutable aligned buffer type. pub trait IoBufAlignedMut: IoBufMut {} +/// A marker trait for an aligned buffer type. 
+pub trait IoBufAligned: IoBuf {} + impl IoBufAlignedMut for IoBufferMut {} +impl IoBufAligned for IoBuffer {} + impl IoBufAlignedMut for PageWriteGuardBuf {} diff --git a/pageserver/src/virtual_file/owned_buffers_io/io_buf_ext.rs b/pageserver/src/virtual_file/owned_buffers_io/io_buf_ext.rs index c3940cf6ce..525f447b6d 100644 --- a/pageserver/src/virtual_file/owned_buffers_io/io_buf_ext.rs +++ b/pageserver/src/virtual_file/owned_buffers_io/io_buf_ext.rs @@ -5,6 +5,8 @@ use bytes::{Bytes, BytesMut}; use std::ops::{Deref, Range}; use tokio_epoll_uring::{BoundedBuf, IoBuf, Slice}; +use super::write::CheapCloneForRead; + /// The true owned equivalent for Rust [`slice`]. Use this for the write path. /// /// Unlike [`tokio_epoll_uring::Slice`], which we unfortunately inherited from `tokio-uring`, @@ -43,6 +45,18 @@ where } } +impl CheapCloneForRead for FullSlice +where + B: IoBuf + CheapCloneForRead, +{ + fn cheap_clone(&self) -> Self { + let bounds = self.slice.bounds(); + let clone = self.slice.get_ref().cheap_clone(); + let slice = clone.slice(bounds); + Self { slice } + } +} + pub(crate) trait IoBufExt { /// Get a [`FullSlice`] for the entire buffer, i.e., `self[..]` or `self[0..self.len()]`. 
fn slice_len(self) -> FullSlice diff --git a/pageserver/src/virtual_file/owned_buffers_io/util/size_tracking_writer.rs b/pageserver/src/virtual_file/owned_buffers_io/util/size_tracking_writer.rs deleted file mode 100644 index efcb61ba65..0000000000 --- a/pageserver/src/virtual_file/owned_buffers_io/util/size_tracking_writer.rs +++ /dev/null @@ -1,50 +0,0 @@ -use crate::{ - context::RequestContext, - virtual_file::owned_buffers_io::{io_buf_ext::FullSlice, write::OwnedAsyncWriter}, -}; -use tokio_epoll_uring::IoBuf; - -pub struct Writer { - dst: W, - bytes_amount: u64, -} - -impl Writer { - pub fn new(dst: W) -> Self { - Self { - dst, - bytes_amount: 0, - } - } - - pub fn bytes_written(&self) -> u64 { - self.bytes_amount - } - - pub fn as_inner(&self) -> &W { - &self.dst - } - - /// Returns the wrapped `VirtualFile` object as well as the number - /// of bytes that were written to it through this object. - #[cfg_attr(target_os = "macos", allow(dead_code))] - pub fn into_inner(self) -> (u64, W) { - (self.bytes_amount, self.dst) - } -} - -impl OwnedAsyncWriter for Writer -where - W: OwnedAsyncWriter, -{ - #[inline(always)] - async fn write_all( - &mut self, - buf: FullSlice, - ctx: &RequestContext, - ) -> std::io::Result<(usize, FullSlice)> { - let (nwritten, buf) = self.dst.write_all(buf, ctx).await?; - self.bytes_amount += u64::try_from(nwritten).unwrap(); - Ok((nwritten, buf)) - } -} diff --git a/pageserver/src/virtual_file/owned_buffers_io/write.rs b/pageserver/src/virtual_file/owned_buffers_io/write.rs index 568cf62e56..7299d83703 100644 --- a/pageserver/src/virtual_file/owned_buffers_io/write.rs +++ b/pageserver/src/virtual_file/owned_buffers_io/write.rs @@ -1,55 +1,88 @@ -use bytes::BytesMut; +mod flush; +use std::sync::Arc; + +use flush::FlushHandle; use tokio_epoll_uring::IoBuf; -use crate::context::RequestContext; +use crate::{ + context::RequestContext, + virtual_file::{IoBuffer, IoBufferMut}, +}; -use super::io_buf_ext::{FullSlice, IoBufExt}; +use super::{ + 
io_buf_aligned::IoBufAligned, + io_buf_ext::{FullSlice, IoBufExt}, +}; + +pub(crate) use flush::FlushControl; + +pub(crate) trait CheapCloneForRead { + /// Returns a cheap clone of the buffer. + fn cheap_clone(&self) -> Self; +} + +impl CheapCloneForRead for IoBuffer { + fn cheap_clone(&self) -> Self { + // Cheap clone over an `Arc`. + self.clone() + } +} /// A trait for doing owned-buffer write IO. /// Think [`tokio::io::AsyncWrite`] but with owned buffers. +/// The owned buffers need to be aligned due to Direct IO requirements. pub trait OwnedAsyncWriter { - async fn write_all( - &mut self, + fn write_all_at( + &self, buf: FullSlice, + offset: u64, ctx: &RequestContext, - ) -> std::io::Result<(usize, FullSlice)>; + ) -> impl std::future::Future>> + Send; } /// A wrapper aorund an [`OwnedAsyncWriter`] that uses a [`Buffer`] to batch /// small writes into larger writes of size [`Buffer::cap`]. -/// -/// # Passthrough Of Large Writers -/// -/// Calls to [`BufferedWriter::write_buffered`] that are larger than [`Buffer::cap`] -/// cause the internal buffer to be flushed prematurely so that the large -/// buffered write is passed through to the underlying [`OwnedAsyncWriter`]. -/// -/// This pass-through is generally beneficial for throughput, but if -/// the storage backend of the [`OwnedAsyncWriter`] is a shared resource, -/// unlimited large writes may cause latency or fairness issues. -/// -/// In such cases, a different implementation that always buffers in memory -/// may be preferable. -pub struct BufferedWriter { - writer: W, +// TODO(yuchen): For large write, implementing buffer bypass for aligned parts of the write could be beneficial to throughput, +// since we would avoid copying majority of the data into the internal buffer. 
+pub struct BufferedWriter { + writer: Arc, /// invariant: always remains Some(buf) except /// - while IO is ongoing => goes back to Some() once the IO completed successfully /// - after an IO error => stays `None` forever /// /// In these exceptional cases, it's `None`. - buf: Option, + mutable: Option, + /// A handle to the background flush task for writting data to disk. + flush_handle: FlushHandle, + /// The number of bytes submitted to the background task. + bytes_submitted: u64, } impl BufferedWriter where - B: Buffer + Send, - Buf: IoBuf + Send, - W: OwnedAsyncWriter, + B: Buffer + Send + 'static, + Buf: IoBufAligned + Send + Sync + CheapCloneForRead, + W: OwnedAsyncWriter + Send + Sync + 'static + std::fmt::Debug, { - pub fn new(writer: W, buf: B) -> Self { + /// Creates a new buffered writer. + /// + /// The `buf_new` function provides a way to initialize the owned buffers used by this writer. + pub fn new( + writer: Arc, + buf_new: impl Fn() -> B, + gate_guard: utils::sync::gate::GateGuard, + ctx: &RequestContext, + ) -> Self { Self { - writer, - buf: Some(buf), + writer: writer.clone(), + mutable: Some(buf_new()), + flush_handle: FlushHandle::spawn_new( + writer, + buf_new(), + gate_guard, + ctx.attached_child(), + ), + bytes_submitted: 0, } } @@ -57,87 +90,71 @@ where &self.writer } + /// Returns the number of bytes submitted to the background flush task. + pub fn bytes_submitted(&self) -> u64 { + self.bytes_submitted + } + /// Panics if used after any of the write paths returned an error - pub fn inspect_buffer(&self) -> &B { - self.buf() + pub fn inspect_mutable(&self) -> &B { + self.mutable() + } + + /// Gets a reference to the maybe flushed read-only buffer. + /// Returns `None` if the writer has not submitted any flush request. 
+ pub fn inspect_maybe_flushed(&self) -> Option<&FullSlice> { + self.flush_handle.maybe_flushed.as_ref() } #[cfg_attr(target_os = "macos", allow(dead_code))] - pub async fn flush_and_into_inner(mut self, ctx: &RequestContext) -> std::io::Result { + pub async fn flush_and_into_inner( + mut self, + ctx: &RequestContext, + ) -> std::io::Result<(u64, Arc)> { self.flush(ctx).await?; - let Self { buf, writer } = self; + let Self { + mutable: buf, + writer, + mut flush_handle, + bytes_submitted: bytes_amount, + } = self; + flush_handle.shutdown().await?; assert!(buf.is_some()); - Ok(writer) + Ok((bytes_amount, writer)) } + /// Gets a reference to the mutable in-memory buffer. #[inline(always)] - fn buf(&self) -> &B { - self.buf + fn mutable(&self) -> &B { + self.mutable .as_ref() .expect("must not use after we returned an error") } - /// Guarantees that if Ok() is returned, all bytes in `chunk` have been accepted. #[cfg_attr(target_os = "macos", allow(dead_code))] - pub async fn write_buffered( + pub async fn write_buffered_borrowed( &mut self, - chunk: FullSlice, + chunk: &[u8], ctx: &RequestContext, - ) -> std::io::Result<(usize, FullSlice)> { - let chunk = chunk.into_raw_slice(); - - let chunk_len = chunk.len(); - // avoid memcpy for the middle of the chunk - if chunk.len() >= self.buf().cap() { - self.flush(ctx).await?; - // do a big write, bypassing `buf` - assert_eq!( - self.buf - .as_ref() - .expect("must not use after an error") - .pending(), - 0 - ); - let (nwritten, chunk) = self - .writer - .write_all(FullSlice::must_new(chunk), ctx) - .await?; - assert_eq!(nwritten, chunk_len); - return Ok((nwritten, chunk)); + ) -> std::io::Result { + let (len, control) = self.write_buffered_borrowed_controlled(chunk, ctx).await?; + if let Some(control) = control { + control.release().await; } - // in-memory copy the < BUFFER_SIZED tail of the chunk - assert!(chunk.len() < self.buf().cap()); - let mut slice = &chunk[..]; - while !slice.is_empty() { - let buf = 
self.buf.as_mut().expect("must not use after an error"); - let need = buf.cap() - buf.pending(); - let have = slice.len(); - let n = std::cmp::min(need, have); - buf.extend_from_slice(&slice[..n]); - slice = &slice[n..]; - if buf.pending() >= buf.cap() { - assert_eq!(buf.pending(), buf.cap()); - self.flush(ctx).await?; - } - } - assert!(slice.is_empty(), "by now we should have drained the chunk"); - Ok((chunk_len, FullSlice::must_new(chunk))) + Ok(len) } - /// Strictly less performant variant of [`Self::write_buffered`] that allows writing borrowed data. - /// - /// It is less performant because we always have to copy the borrowed data into the internal buffer - /// before we can do the IO. The [`Self::write_buffered`] can avoid this, which is more performant - /// for large writes. - pub async fn write_buffered_borrowed( + /// In addition to bytes submitted in this write, also returns a handle that can control the flush behavior. + pub(crate) async fn write_buffered_borrowed_controlled( &mut self, mut chunk: &[u8], ctx: &RequestContext, - ) -> std::io::Result { + ) -> std::io::Result<(usize, Option)> { let chunk_len = chunk.len(); + let mut control: Option = None; while !chunk.is_empty() { - let buf = self.buf.as_mut().expect("must not use after an error"); + let buf = self.mutable.as_mut().expect("must not use after an error"); let need = buf.cap() - buf.pending(); let have = chunk.len(); let n = std::cmp::min(need, have); @@ -145,26 +162,27 @@ where chunk = &chunk[n..]; if buf.pending() >= buf.cap() { assert_eq!(buf.pending(), buf.cap()); - self.flush(ctx).await?; + if let Some(control) = control.take() { + control.release().await; + } + control = self.flush(ctx).await?; } } - Ok(chunk_len) + Ok((chunk_len, control)) } - async fn flush(&mut self, ctx: &RequestContext) -> std::io::Result<()> { - let buf = self.buf.take().expect("must not use after an error"); + #[must_use = "caller must explcitly check the flush control"] + async fn flush(&mut self, _ctx: 
&RequestContext) -> std::io::Result> { + let buf = self.mutable.take().expect("must not use after an error"); let buf_len = buf.pending(); if buf_len == 0 { - self.buf = Some(buf); - return Ok(()); + self.mutable = Some(buf); + return Ok(None); } - let slice = buf.flush(); - let (nwritten, slice) = self.writer.write_all(slice, ctx).await?; - assert_eq!(nwritten, buf_len); - self.buf = Some(Buffer::reuse_after_flush( - slice.into_raw_slice().into_inner(), - )); - Ok(()) + let (recycled, flush_control) = self.flush_handle.flush(buf, self.bytes_submitted).await?; + self.bytes_submitted += u64::try_from(buf_len).unwrap(); + self.mutable = Some(recycled); + Ok(Some(flush_control)) } } @@ -192,64 +210,77 @@ pub trait Buffer { fn reuse_after_flush(iobuf: Self::IoBuf) -> Self; } -impl Buffer for BytesMut { - type IoBuf = BytesMut; +impl Buffer for IoBufferMut { + type IoBuf = IoBuffer; - #[inline(always)] fn cap(&self) -> usize { self.capacity() } fn extend_from_slice(&mut self, other: &[u8]) { - BytesMut::extend_from_slice(self, other) + if self.len() + other.len() > self.cap() { + panic!("Buffer capacity exceeded"); + } + + IoBufferMut::extend_from_slice(self, other); } - #[inline(always)] fn pending(&self) -> usize { self.len() } - fn flush(self) -> FullSlice { - self.slice_len() + fn flush(self) -> FullSlice { + self.freeze().slice_len() } - fn reuse_after_flush(mut iobuf: BytesMut) -> Self { - iobuf.clear(); - iobuf - } -} - -impl OwnedAsyncWriter for Vec { - async fn write_all( - &mut self, - buf: FullSlice, - _: &RequestContext, - ) -> std::io::Result<(usize, FullSlice)> { - self.extend_from_slice(&buf[..]); - Ok((buf.len(), buf)) + /// Caller should make sure that `iobuf` only have one strong reference before invoking this method. 
+ fn reuse_after_flush(iobuf: Self::IoBuf) -> Self { + let mut recycled = iobuf + .into_mut() + .expect("buffer should only have one strong reference"); + recycled.clear(); + recycled } } #[cfg(test)] mod tests { - use bytes::BytesMut; + use std::sync::Mutex; use super::*; use crate::context::{DownloadBehavior, RequestContext}; use crate::task_mgr::TaskKind; - #[derive(Default)] + #[derive(Default, Debug)] struct RecorderWriter { - writes: Vec>, + /// record bytes and write offsets. + writes: Mutex, u64)>>, } + + impl RecorderWriter { + /// Gets recorded bytes and write offsets. + fn get_writes(&self) -> Vec> { + self.writes + .lock() + .unwrap() + .iter() + .map(|(buf, _)| buf.clone()) + .collect() + } + } + impl OwnedAsyncWriter for RecorderWriter { - async fn write_all( - &mut self, + async fn write_all_at( + &self, buf: FullSlice, + offset: u64, _: &RequestContext, - ) -> std::io::Result<(usize, FullSlice)> { - self.writes.push(Vec::from(&buf[..])); - Ok((buf.len(), buf)) + ) -> std::io::Result> { + self.writes + .lock() + .unwrap() + .push((Vec::from(&buf[..]), offset)); + Ok(buf) } } @@ -257,71 +288,21 @@ mod tests { RequestContext::new(TaskKind::UnitTest, DownloadBehavior::Error) } - macro_rules! 
write { - ($writer:ident, $data:literal) => {{ - $writer - .write_buffered(::bytes::Bytes::from_static($data).slice_len(), &test_ctx()) - .await?; - }}; - } - #[tokio::test] - async fn test_buffered_writes_only() -> std::io::Result<()> { - let recorder = RecorderWriter::default(); - let mut writer = BufferedWriter::new(recorder, BytesMut::with_capacity(2)); - write!(writer, b"a"); - write!(writer, b"b"); - write!(writer, b"c"); - write!(writer, b"d"); - write!(writer, b"e"); - let recorder = writer.flush_and_into_inner(&test_ctx()).await?; - assert_eq!( - recorder.writes, - vec![Vec::from(b"ab"), Vec::from(b"cd"), Vec::from(b"e")] - ); - Ok(()) - } - - #[tokio::test] - async fn test_passthrough_writes_only() -> std::io::Result<()> { - let recorder = RecorderWriter::default(); - let mut writer = BufferedWriter::new(recorder, BytesMut::with_capacity(2)); - write!(writer, b"abc"); - write!(writer, b"de"); - write!(writer, b""); - write!(writer, b"fghijk"); - let recorder = writer.flush_and_into_inner(&test_ctx()).await?; - assert_eq!( - recorder.writes, - vec![Vec::from(b"abc"), Vec::from(b"de"), Vec::from(b"fghijk")] - ); - Ok(()) - } - - #[tokio::test] - async fn test_passthrough_write_with_nonempty_buffer() -> std::io::Result<()> { - let recorder = RecorderWriter::default(); - let mut writer = BufferedWriter::new(recorder, BytesMut::with_capacity(2)); - write!(writer, b"a"); - write!(writer, b"bc"); - write!(writer, b"d"); - write!(writer, b"e"); - let recorder = writer.flush_and_into_inner(&test_ctx()).await?; - assert_eq!( - recorder.writes, - vec![Vec::from(b"a"), Vec::from(b"bc"), Vec::from(b"de")] - ); - Ok(()) - } - - #[tokio::test] - async fn test_write_all_borrowed_always_goes_through_buffer() -> std::io::Result<()> { + async fn test_write_all_borrowed_always_goes_through_buffer() -> anyhow::Result<()> { let ctx = test_ctx(); let ctx = &ctx; - let recorder = RecorderWriter::default(); - let mut writer = BufferedWriter::new(recorder, 
BytesMut::with_capacity(2)); + let recorder = Arc::new(RecorderWriter::default()); + let gate = utils::sync::gate::Gate::default(); + let mut writer = BufferedWriter::<_, RecorderWriter>::new( + recorder, + || IoBufferMut::with_capacity(2), + gate.enter()?, + ctx, + ); writer.write_buffered_borrowed(b"abc", ctx).await?; + writer.write_buffered_borrowed(b"", ctx).await?; writer.write_buffered_borrowed(b"d", ctx).await?; writer.write_buffered_borrowed(b"e", ctx).await?; writer.write_buffered_borrowed(b"fg", ctx).await?; @@ -329,9 +310,9 @@ mod tests { writer.write_buffered_borrowed(b"j", ctx).await?; writer.write_buffered_borrowed(b"klmno", ctx).await?; - let recorder = writer.flush_and_into_inner(ctx).await?; + let (_, recorder) = writer.flush_and_into_inner(ctx).await?; assert_eq!( - recorder.writes, + recorder.get_writes(), { let expect: &[&[u8]] = &[b"ab", b"cd", b"ef", b"gh", b"ij", b"kl", b"mn", b"o"]; expect diff --git a/pageserver/src/virtual_file/owned_buffers_io/write/flush.rs b/pageserver/src/virtual_file/owned_buffers_io/write/flush.rs new file mode 100644 index 0000000000..9ce8b311bb --- /dev/null +++ b/pageserver/src/virtual_file/owned_buffers_io/write/flush.rs @@ -0,0 +1,314 @@ +use std::sync::Arc; + +use utils::sync::duplex; + +use crate::{ + context::RequestContext, + virtual_file::owned_buffers_io::{io_buf_aligned::IoBufAligned, io_buf_ext::FullSlice}, +}; + +use super::{Buffer, CheapCloneForRead, OwnedAsyncWriter}; + +/// A handle to the flush task. +pub struct FlushHandle { + inner: Option>, + /// Immutable buffer for serving tail reads. + /// `None` if no flush request has been submitted. + pub(super) maybe_flushed: Option>, +} + +pub struct FlushHandleInner { + /// A bi-directional channel that sends (buffer, offset) for writes, + /// and receives recyled buffer. + channel: duplex::mpsc::Duplex, FullSlice>, + /// Join handle for the background flush task. 
+ join_handle: tokio::task::JoinHandle>>, +} + +struct FlushRequest { + slice: FullSlice, + offset: u64, + #[cfg(test)] + ready_to_flush_rx: tokio::sync::oneshot::Receiver<()>, + #[cfg(test)] + done_flush_tx: tokio::sync::oneshot::Sender<()>, +} + +/// Constructs a request and a control object for a new flush operation. +#[cfg(not(test))] +fn new_flush_op(slice: FullSlice, offset: u64) -> (FlushRequest, FlushControl) { + let request = FlushRequest { slice, offset }; + let control = FlushControl::untracked(); + + (request, control) +} + +/// Constructs a request and a control object for a new flush operation. +#[cfg(test)] +fn new_flush_op(slice: FullSlice, offset: u64) -> (FlushRequest, FlushControl) { + let (ready_to_flush_tx, ready_to_flush_rx) = tokio::sync::oneshot::channel(); + let (done_flush_tx, done_flush_rx) = tokio::sync::oneshot::channel(); + let control = FlushControl::not_started(ready_to_flush_tx, done_flush_rx); + + let request = FlushRequest { + slice, + offset, + ready_to_flush_rx, + done_flush_tx, + }; + (request, control) +} + +/// A handle to a `FlushRequest` that allows unit tests precise control over flush behavior. +#[cfg(test)] +pub(crate) struct FlushControl { + not_started: FlushNotStarted, +} + +#[cfg(not(test))] +pub(crate) struct FlushControl; + +impl FlushControl { + #[cfg(test)] + fn not_started( + ready_to_flush_tx: tokio::sync::oneshot::Sender<()>, + done_flush_rx: tokio::sync::oneshot::Receiver<()>, + ) -> Self { + FlushControl { + not_started: FlushNotStarted { + ready_to_flush_tx, + done_flush_rx, + }, + } + } + + #[cfg(not(test))] + fn untracked() -> Self { + FlushControl + } + + /// In tests, turn flush control into a not started state. + #[cfg(test)] + pub(crate) fn into_not_started(self) -> FlushNotStarted { + self.not_started + } + + /// Release control to the submitted buffer. + /// + /// In `cfg(test)` environment, the buffer is guranteed to be flushed to disk after [`FlushControl::release`] is finishes execution. 
+ pub async fn release(self) { + #[cfg(test)] + { + self.not_started + .ready_to_flush() + .wait_until_flush_is_done() + .await; + } + } +} + +impl FlushHandle +where + Buf: IoBufAligned + Send + Sync + CheapCloneForRead, + W: OwnedAsyncWriter + Send + Sync + 'static + std::fmt::Debug, +{ + /// Spawns a new background flush task and obtains a handle. + /// + /// Note: The background task so we do not need to explicitly maintain a queue of buffers. + pub fn spawn_new( + file: Arc, + buf: B, + gate_guard: utils::sync::gate::GateGuard, + ctx: RequestContext, + ) -> Self + where + B: Buffer + Send + 'static, + { + // It is fine to buffer up to only 1 message. We only 1 message in-flight at a time. + let (front, back) = duplex::mpsc::channel(1); + + let join_handle = tokio::spawn(async move { + FlushBackgroundTask::new(back, file, gate_guard, ctx) + .run(buf.flush()) + .await + }); + + FlushHandle { + inner: Some(FlushHandleInner { + channel: front, + join_handle, + }), + maybe_flushed: None, + } + } + + /// Submits a buffer to be flushed in the background task. + /// Returns a buffer that completed flushing for re-use, length reset to 0, capacity unchanged. + /// If `save_buf_for_read` is true, then we save the buffer in `Self::maybe_flushed`, otherwise + /// clear `maybe_flushed`. + pub async fn flush(&mut self, buf: B, offset: u64) -> std::io::Result<(B, FlushControl)> + where + B: Buffer + Send + 'static, + { + let slice = buf.flush(); + + // Saves a buffer for read while flushing. This also removes reference to the old buffer. + self.maybe_flushed = Some(slice.cheap_clone()); + + let (request, flush_control) = new_flush_op(slice, offset); + + // Submits the buffer to the background task. + let submit = self.inner_mut().channel.send(request).await; + if submit.is_err() { + return self.handle_error().await; + } + + // Wait for an available buffer from the background flush task. 
+ // This is the BACKPRESSURE mechanism: if the flush task can't keep up, + // then the write path will eventually wait for it here. + let Some(recycled) = self.inner_mut().channel.recv().await else { + return self.handle_error().await; + }; + + // The only other place that could hold a reference to the recycled buffer + // is in `Self::maybe_flushed`, but we have already replace it with the new buffer. + let recycled = Buffer::reuse_after_flush(recycled.into_raw_slice().into_inner()); + Ok((recycled, flush_control)) + } + + async fn handle_error(&mut self) -> std::io::Result { + Err(self + .shutdown() + .await + .expect_err("flush task only disconnects duplex if it exits with an error")) + } + + /// Cleans up the channel, join the flush task. + pub async fn shutdown(&mut self) -> std::io::Result> { + let handle = self + .inner + .take() + .expect("must not use after we returned an error"); + drop(handle.channel.tx); + handle.join_handle.await.unwrap() + } + + /// Gets a mutable reference to the inner handle. Panics if [`Self::inner`] is `None`. + /// This only happens if the handle is used after an error. + fn inner_mut(&mut self) -> &mut FlushHandleInner { + self.inner + .as_mut() + .expect("must not use after we returned an error") + } +} + +/// A background task for flushing data to disk. +pub struct FlushBackgroundTask { + /// A bi-directional channel that receives (buffer, offset) for writes, + /// and send back recycled buffer. + channel: duplex::mpsc::Duplex, FlushRequest>, + /// A writter for persisting data to disk. + writer: Arc, + ctx: RequestContext, + /// Prevent timeline from shuting down until the flush background task finishes flushing all remaining buffers to disk. + _gate_guard: utils::sync::gate::GateGuard, +} + +impl FlushBackgroundTask +where + Buf: IoBufAligned + Send + Sync, + W: OwnedAsyncWriter + Sync + 'static, +{ + /// Creates a new background flush task. 
+ fn new( + channel: duplex::mpsc::Duplex, FlushRequest>, + file: Arc, + gate_guard: utils::sync::gate::GateGuard, + ctx: RequestContext, + ) -> Self { + FlushBackgroundTask { + channel, + writer: file, + _gate_guard: gate_guard, + ctx, + } + } + + /// Runs the background flush task. + /// The passed in slice is immediately sent back to the flush handle through the duplex channel. + async fn run(mut self, slice: FullSlice) -> std::io::Result> { + // Sends the extra buffer back to the handle. + self.channel.send(slice).await.map_err(|_| { + std::io::Error::new(std::io::ErrorKind::BrokenPipe, "flush handle closed early") + })?; + + // Exit condition: channel is closed and there is no remaining buffer to be flushed + while let Some(request) = self.channel.recv().await { + #[cfg(test)] + { + // In test, wait for control to signal that we are ready to flush. + if request.ready_to_flush_rx.await.is_err() { + tracing::debug!("control dropped"); + } + } + + // Write slice to disk at `offset`. + let slice = self + .writer + .write_all_at(request.slice, request.offset, &self.ctx) + .await?; + + #[cfg(test)] + { + // In test, tell control we are done flushing buffer. + if request.done_flush_tx.send(()).is_err() { + tracing::debug!("control dropped"); + } + } + + // Sends the buffer back to the handle for reuse. The handle is in charged of cleaning the buffer. + if self.channel.send(slice).await.is_err() { + // Although channel is closed. Still need to finish flushing the remaining buffers. + continue; + } + } + + Ok(self.writer) + } +} + +#[cfg(test)] +pub(crate) struct FlushNotStarted { + ready_to_flush_tx: tokio::sync::oneshot::Sender<()>, + done_flush_rx: tokio::sync::oneshot::Receiver<()>, +} + +#[cfg(test)] +pub(crate) struct FlushInProgress { + done_flush_rx: tokio::sync::oneshot::Receiver<()>, +} + +#[cfg(test)] +pub(crate) struct FlushDone; + +#[cfg(test)] +impl FlushNotStarted { + /// Signals the background task the buffer is ready to flush to disk. 
+ pub fn ready_to_flush(self) -> FlushInProgress { + self.ready_to_flush_tx + .send(()) + .map(|_| FlushInProgress { + done_flush_rx: self.done_flush_rx, + }) + .unwrap() + } +} + +#[cfg(test)] +impl FlushInProgress { + /// Waits until background flush is done. + pub async fn wait_until_flush_is_done(self) -> FlushDone { + self.done_flush_rx.await.unwrap(); + FlushDone + } +} diff --git a/pageserver/src/walingest.rs b/pageserver/src/walingest.rs index d568da596a..93ae88936f 100644 --- a/pageserver/src/walingest.rs +++ b/pageserver/src/walingest.rs @@ -1392,6 +1392,10 @@ impl WalIngest { img: Bytes, ctx: &RequestContext, ) -> Result<()> { + if !self.shard.is_shard_zero() { + return Ok(()); + } + self.handle_slru_extend(modification, kind, segno, blknum, ctx) .await?; modification.put_slru_page_image(kind, segno, blknum, img)?; diff --git a/pgxn/neon/neon.c b/pgxn/neon/neon.c index 51b9f58bbc..ff08f9164d 100644 --- a/pgxn/neon/neon.c +++ b/pgxn/neon/neon.c @@ -15,6 +15,9 @@ #include "access/subtrans.h" #include "access/twophase.h" #include "access/xlog.h" +#if PG_MAJORVERSION_NUM >= 15 +#include "access/xlogrecovery.h" +#endif #include "replication/logical.h" #include "replication/slot.h" #include "replication/walsender.h" @@ -432,6 +435,16 @@ _PG_init(void) restore_running_xacts_callback = RestoreRunningXactsFromClog; + DefineCustomBoolVariable( + "neon.allow_replica_misconfig", + "Allow replica startup when some critical GUCs have smaller value than on primary node", + NULL, + &allowReplicaMisconfig, + true, + PGC_POSTMASTER, + 0, + NULL, NULL, NULL); + DefineCustomEnumVariable( "neon.running_xacts_overflow_policy", "Action performed on snapshot overflow when restoring runnings xacts from CLOG", diff --git a/pgxn/neon/pagestore_smgr.c b/pgxn/neon/pagestore_smgr.c index cbb0e2ae6d..a5e0c402fb 100644 --- a/pgxn/neon/pagestore_smgr.c +++ b/pgxn/neon/pagestore_smgr.c @@ -439,6 +439,8 @@ readahead_buffer_resize(int newsize, void *extra) newPState->ring_unused = newsize; 
newPState->ring_receive = newsize; newPState->ring_flush = newsize; + newPState->max_shard_no = MyPState->max_shard_no; + memcpy(newPState->shard_bitmap, MyPState->shard_bitmap, sizeof(MyPState->shard_bitmap)); /* * Copy over the prefetches. @@ -495,7 +497,11 @@ readahead_buffer_resize(int newsize, void *extra) for (; end >= MyPState->ring_last && end != UINT64_MAX; end -= 1) { - prefetch_set_unused(end); + PrefetchRequest *slot = GetPrfSlot(end); + if (slot->status == PRFS_RECEIVED) + { + pfree(slot->response); + } } prfh_destroy(MyPState->prf_hash); @@ -944,6 +950,9 @@ Retry: Assert(entry == NULL); Assert(slot == NULL); + /* There should be no buffer overflow */ + Assert(MyPState->ring_last + readahead_buffer_size >= MyPState->ring_unused); + /* * If the prefetch queue is full, we need to make room by clearing the * oldest slot. If the oldest slot holds a buffer that was already @@ -958,7 +967,7 @@ Retry: * a prefetch request kind of goes against the principles of * prefetching) */ - if (MyPState->ring_last + readahead_buffer_size - 1 == MyPState->ring_unused) + if (MyPState->ring_last + readahead_buffer_size == MyPState->ring_unused) { uint64 cleanup_index = MyPState->ring_last; diff --git a/poetry.lock b/poetry.lock index e2fca7be47..59ae5cf1ca 100644 --- a/poetry.lock +++ b/poetry.lock @@ -2563,18 +2563,18 @@ pytest = "*" [[package]] name = "pytest-rerunfailures" -version = "13.0" +version = "15.0" description = "pytest plugin to re-run tests to eliminate flaky failures" optional = false -python-versions = ">=3.7" +python-versions = ">=3.9" files = [ - {file = "pytest-rerunfailures-13.0.tar.gz", hash = "sha256:e132dbe420bc476f544b96e7036edd0a69707574209b6677263c950d19b09199"}, - {file = "pytest_rerunfailures-13.0-py3-none-any.whl", hash = "sha256:34919cb3fcb1f8e5d4b940aa75ccdea9661bade925091873b7c6fa5548333069"}, + {file = "pytest-rerunfailures-15.0.tar.gz", hash = "sha256:2d9ac7baf59f4c13ac730b47f6fa80e755d1ba0581da45ce30b72fb3542b4474"}, + {file = 
"pytest_rerunfailures-15.0-py3-none-any.whl", hash = "sha256:dd150c4795c229ef44320adc9a0c0532c51b78bb7a6843a8c53556b9a611df1a"}, ] [package.dependencies] packaging = ">=17.1" -pytest = ">=7" +pytest = ">=7.4,<8.2.2 || >8.2.2" [[package]] name = "pytest-split" @@ -3524,4 +3524,4 @@ cffi = ["cffi (>=1.11)"] [metadata] lock-version = "2.0" python-versions = "^3.11" -content-hash = "21debe1116843e5d14bdf37d6e265c68c63a98a64ba04ec8b8a02af2e8d9f486" +content-hash = "426c385df93f578ba3537c40a269535e27fbcca1978b3cf266096ecbc298c6a9" diff --git a/proxy/Cargo.toml b/proxy/Cargo.toml index 1665d6361a..2f63ee3acc 100644 --- a/proxy/Cargo.toml +++ b/proxy/Cargo.toml @@ -6,7 +6,7 @@ license.workspace = true [features] default = [] -testing = [] +testing = ["dep:tokio-postgres"] [dependencies] ahash.workspace = true @@ -55,6 +55,8 @@ parquet.workspace = true parquet_derive.workspace = true pin-project-lite.workspace = true postgres_backend.workspace = true +postgres-client = { package = "tokio-postgres2", path = "../libs/proxy/tokio-postgres2" } +postgres-protocol = { package = "postgres-protocol2", path = "../libs/proxy/postgres-protocol2" } pq_proto.workspace = true prometheus.workspace = true rand.workspace = true @@ -80,8 +82,7 @@ subtle.workspace = true thiserror.workspace = true tikv-jemallocator.workspace = true tikv-jemalloc-ctl = { workspace = true, features = ["use_std"] } -tokio-postgres = { workspace = true, features = ["with-serde_json-1"] } -tokio-postgres-rustls.workspace = true +tokio-postgres = { workspace = true, optional = true } tokio-rustls.workspace = true tokio-util.workspace = true tokio = { workspace = true, features = ["signal"] } @@ -96,7 +97,6 @@ utils.workspace = true uuid.workspace = true rustls-native-certs.workspace = true x509-parser.workspace = true -postgres-protocol.workspace = true redis.workspace = true zerocopy.workspace = true @@ -113,10 +113,11 @@ workspace_hack.workspace = true [dev-dependencies] camino-tempfile.workspace = true 
fallible-iterator.workspace = true +flate2.workspace = true tokio-tungstenite.workspace = true pbkdf2 = { workspace = true, features = ["simple", "std"] } rcgen.workspace = true rstest.workspace = true -tokio-postgres-rustls.workspace = true walkdir.workspace = true rand_distr = "0.4" +tokio-postgres.workspace = true diff --git a/proxy/src/auth/backend/classic.rs b/proxy/src/auth/backend/classic.rs index 491b272ac4..5e494dfdd6 100644 --- a/proxy/src/auth/backend/classic.rs +++ b/proxy/src/auth/backend/classic.rs @@ -66,7 +66,7 @@ pub(super) async fn authenticate( Ok(ComputeCredentials { info: creds, - keys: ComputeCredentialKeys::AuthKeys(tokio_postgres::config::AuthKeys::ScramSha256( + keys: ComputeCredentialKeys::AuthKeys(postgres_client::config::AuthKeys::ScramSha256( scram_keys, )), }) diff --git a/proxy/src/auth/backend/console_redirect.rs b/proxy/src/auth/backend/console_redirect.rs index bf7a1cb070..575d60be85 100644 --- a/proxy/src/auth/backend/console_redirect.rs +++ b/proxy/src/auth/backend/console_redirect.rs @@ -1,8 +1,8 @@ use async_trait::async_trait; +use postgres_client::config::SslMode; use pq_proto::BeMessage as Be; use thiserror::Error; use tokio::io::{AsyncRead, AsyncWrite}; -use tokio_postgres::config::SslMode; use tracing::{info, info_span}; use super::ComputeCredentialKeys; @@ -49,13 +49,19 @@ impl ReportableError for ConsoleRedirectError { } } -fn hello_message(redirect_uri: &reqwest::Url, session_id: &str) -> String { +fn hello_message( + redirect_uri: &reqwest::Url, + session_id: &str, + duration: std::time::Duration, +) -> String { + let formatted_duration = humantime::format_duration(duration).to_string(); format!( concat![ "Welcome to Neon!\n", - "Authenticate by visiting:\n", + "Authenticate by visiting (will expire in {duration}):\n", " {redirect_uri}{session_id}\n\n", ], + duration = formatted_duration, redirect_uri = redirect_uri, session_id = session_id, ) @@ -118,7 +124,11 @@ async fn authenticate( }; let span = 
info_span!("console_redirect", psql_session_id = &psql_session_id); - let greeting = hello_message(link_uri, &psql_session_id); + let greeting = hello_message( + link_uri, + &psql_session_id, + auth_config.console_redirect_confirmation_timeout, + ); // Give user a URL to spawn a new database. info!(parent: &span, "sending the auth URL to the user"); @@ -151,12 +161,8 @@ async fn authenticate( // This config should be self-contained, because we won't // take username or dbname from client's startup message. - let mut config = compute::ConnCfg::new(); - config - .host(&db_info.host) - .port(db_info.port) - .dbname(&db_info.dbname) - .user(&db_info.user); + let mut config = compute::ConnCfg::new(db_info.host.to_string(), db_info.port); + config.dbname(&db_info.dbname).user(&db_info.user); ctx.set_dbname(db_info.dbname.into()); ctx.set_user(db_info.user.into()); diff --git a/proxy/src/auth/backend/jwt.rs b/proxy/src/auth/backend/jwt.rs index 517d4fd34b..a258090b15 100644 --- a/proxy/src/auth/backend/jwt.rs +++ b/proxy/src/auth/backend/jwt.rs @@ -350,6 +350,13 @@ impl JwkCacheEntryLock { let header = base64::decode_config(header, base64::URL_SAFE_NO_PAD)?; let header = serde_json::from_slice::>(&header)?; + let payloadb = base64::decode_config(payload, base64::URL_SAFE_NO_PAD)?; + let payload = serde_json::from_slice::>(&payloadb)?; + + if let Some(iss) = &payload.issuer { + ctx.set_jwt_issuer(iss.as_ref().to_owned()); + } + let sig = base64::decode_config(signature, base64::URL_SAFE_NO_PAD)?; let kid = header.key_id.ok_or(JwtError::MissingKeyId)?; @@ -388,9 +395,6 @@ impl JwkCacheEntryLock { key => return Err(JwtError::UnsupportedKeyType(key.into())), }; - let payloadb = base64::decode_config(payload, base64::URL_SAFE_NO_PAD)?; - let payload = serde_json::from_slice::>(&payloadb)?; - tracing::debug!(?payload, "JWT signature valid with claims"); if let Some(aud) = expected_audience { diff --git a/proxy/src/auth/backend/local.rs b/proxy/src/auth/backend/local.rs index 
32e0f53615..d4273fb521 100644 --- a/proxy/src/auth/backend/local.rs +++ b/proxy/src/auth/backend/local.rs @@ -29,12 +29,7 @@ impl LocalBackend { api: http::Endpoint::new(compute_ctl, http::new_client()), }, node_info: NodeInfo { - config: { - let mut cfg = ConnCfg::new(); - cfg.host(&postgres_addr.ip().to_string()); - cfg.port(postgres_addr.port()); - cfg - }, + config: ConnCfg::new(postgres_addr.ip().to_string(), postgres_addr.port()), // TODO(conrad): make this better reflect compute info rather than endpoint info. aux: MetricsAuxInfo { endpoint_id: EndpointIdTag::get_interner().get_or_intern("local"), diff --git a/proxy/src/auth/backend/mod.rs b/proxy/src/auth/backend/mod.rs index 7e1b26a11a..1bad7b3086 100644 --- a/proxy/src/auth/backend/mod.rs +++ b/proxy/src/auth/backend/mod.rs @@ -11,8 +11,8 @@ pub use console_redirect::ConsoleRedirectBackend; pub(crate) use console_redirect::ConsoleRedirectError; use ipnet::{Ipv4Net, Ipv6Net}; use local::LocalBackend; +use postgres_client::config::AuthKeys; use tokio::io::{AsyncRead, AsyncWrite}; -use tokio_postgres::config::AuthKeys; use tracing::{debug, info, warn}; use crate::auth::credentials::check_peer_addr_is_in_list; @@ -70,6 +70,10 @@ impl std::fmt::Display for Backend<'_, ()> { fn fmt(&self, fmt: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { match self { Self::ControlPlane(api, ()) => match &**api { + ControlPlaneClient::ProxyV1(endpoint) => fmt + .debug_tuple("ControlPlane::ProxyV1") + .field(&endpoint.url()) + .finish(), ControlPlaneClient::Neon(endpoint) => fmt .debug_tuple("ControlPlane::Neon") .field(&endpoint.url()) diff --git a/proxy/src/auth/flow.rs b/proxy/src/auth/flow.rs index 9c6ce151cb..60d1962d7f 100644 --- a/proxy/src/auth/flow.rs +++ b/proxy/src/auth/flow.rs @@ -227,7 +227,7 @@ pub(crate) async fn validate_password_and_exchange( }; Ok(sasl::Outcome::Success(ComputeCredentialKeys::AuthKeys( - tokio_postgres::config::AuthKeys::ScramSha256(keys), + 
postgres_client::config::AuthKeys::ScramSha256(keys), ))) } } diff --git a/proxy/src/bin/proxy.rs b/proxy/src/bin/proxy.rs index a935378162..99144acef0 100644 --- a/proxy/src/bin/proxy.rs +++ b/proxy/src/bin/proxy.rs @@ -3,14 +3,6 @@ use std::pin::pin; use std::sync::Arc; use anyhow::bail; -use aws_config::environment::EnvironmentVariableCredentialsProvider; -use aws_config::imds::credentials::ImdsCredentialsProvider; -use aws_config::meta::credentials::CredentialsProviderChain; -use aws_config::meta::region::RegionProviderChain; -use aws_config::profile::ProfileFileCredentialsProvider; -use aws_config::provider_config::ProviderConfig; -use aws_config::web_identity_token::WebIdentityTokenCredentialsProvider; -use aws_config::Region; use futures::future::Either; use proxy::auth::backend::jwt::JwkCache; use proxy::auth::backend::{AuthRateLimiter, ConsoleRedirectBackend, MaybeOwned}; @@ -54,6 +46,9 @@ enum AuthBackendType { #[value(name("console"), alias("cplane"))] ControlPlane, + #[value(name("cplane-v1"), alias("control-plane"))] + ControlPlaneV1, + #[value(name("link"), alias("control-redirect"))] ConsoleRedirect, @@ -314,39 +309,7 @@ async fn main() -> anyhow::Result<()> { }; info!("Using region: {}", args.aws_region); - let region_provider = - RegionProviderChain::default_provider().or_else(Region::new(args.aws_region.clone())); - let provider_conf = - ProviderConfig::without_region().with_region(region_provider.region().await); - let aws_credentials_provider = { - // uses "AWS_ACCESS_KEY_ID", "AWS_SECRET_ACCESS_KEY" - CredentialsProviderChain::first_try("env", EnvironmentVariableCredentialsProvider::new()) - // uses "AWS_PROFILE" / `aws sso login --profile ` - .or_else( - "profile-sso", - ProfileFileCredentialsProvider::builder() - .configure(&provider_conf) - .build(), - ) - // uses "AWS_WEB_IDENTITY_TOKEN_FILE", "AWS_ROLE_ARN", "AWS_ROLE_SESSION_NAME" - // needed to access remote extensions bucket - .or_else( - "token", - 
WebIdentityTokenCredentialsProvider::builder() - .configure(&provider_conf) - .build(), - ) - // uses imds v2 - .or_else("imds", ImdsCredentialsProvider::builder().build()) - }; - let elasticache_credentials_provider = Arc::new(elasticache::CredentialsProvider::new( - elasticache::AWSIRSAConfig::new( - args.aws_region.clone(), - args.redis_cluster_name, - args.redis_user_id, - ), - aws_credentials_provider, - )); + // TODO: untangle the config args let regional_redis_client = match (args.redis_auth_type.as_str(), &args.redis_notifications) { ("plain", redis_url) => match redis_url { None => { @@ -361,7 +324,12 @@ async fn main() -> anyhow::Result<()> { ConnectionWithCredentialsProvider::new_with_credentials_provider( host.to_string(), port, - elasticache_credentials_provider.clone(), + elasticache::CredentialsProvider::new( + args.aws_region, + args.redis_cluster_name, + args.redis_user_id, + ) + .await, ), ), (None, None) => { @@ -517,10 +485,6 @@ async fn main() -> anyhow::Result<()> { if let Some(metrics_config) = &config.metric_collection { // TODO: Add gc regardles of the metric collection being enabled. 
maintenance_tasks.spawn(usage_metrics::task_main(metrics_config)); - client_tasks.spawn(usage_metrics::task_backup( - &metrics_config.backup_metric_collection_config, - cancellation_token.clone(), - )); } if let Either::Left(auth::Backend::ControlPlane(api, _)) = &auth_backend { @@ -557,6 +521,39 @@ async fn main() -> anyhow::Result<()> { .instrument(span), ); } + } else if let proxy::control_plane::client::ControlPlaneClient::ProxyV1(api) = &**api { + match (redis_notifications_client, regional_redis_client.clone()) { + (None, None) => {} + (client1, client2) => { + let cache = api.caches.project_info.clone(); + if let Some(client) = client1 { + maintenance_tasks.spawn(notifications::task_main( + client, + cache.clone(), + cancel_map.clone(), + args.region.clone(), + )); + } + if let Some(client) = client2 { + maintenance_tasks.spawn(notifications::task_main( + client, + cache.clone(), + cancel_map.clone(), + args.region.clone(), + )); + } + maintenance_tasks.spawn(async move { cache.clone().gc_worker().await }); + } + } + if let Some(regional_redis_client) = regional_redis_client { + let cache = api.caches.endpoints_cache.clone(); + let con = regional_redis_client; + let span = tracing::info_span!("endpoints_cache"); + maintenance_tasks.spawn( + async move { cache.do_read(con, cancellation_token.clone()).await } + .instrument(span), + ); + } } } @@ -701,6 +698,65 @@ fn build_auth_backend( args: &ProxyCliArgs, ) -> anyhow::Result, &'static ConsoleRedirectBackend>> { match &args.auth_backend { + AuthBackendType::ControlPlaneV1 => { + let wake_compute_cache_config: CacheOptions = args.wake_compute_cache.parse()?; + let project_info_cache_config: ProjectInfoCacheOptions = + args.project_info_cache.parse()?; + let endpoint_cache_config: config::EndpointCacheConfig = + args.endpoint_cache_config.parse()?; + + info!("Using NodeInfoCache (wake_compute) with options={wake_compute_cache_config:?}"); + info!( + "Using AllowedIpsCache (wake_compute) with 
options={project_info_cache_config:?}" + ); + info!("Using EndpointCacheConfig with options={endpoint_cache_config:?}"); + let caches = Box::leak(Box::new(control_plane::caches::ApiCaches::new( + wake_compute_cache_config, + project_info_cache_config, + endpoint_cache_config, + ))); + + let config::ConcurrencyLockOptions { + shards, + limiter, + epoch, + timeout, + } = args.wake_compute_lock.parse()?; + info!(?limiter, shards, ?epoch, "Using NodeLocks (wake_compute)"); + let locks = Box::leak(Box::new(control_plane::locks::ApiLocks::new( + "wake_compute_lock", + limiter, + shards, + timeout, + epoch, + &Metrics::get().wake_compute_lock, + )?)); + tokio::spawn(locks.garbage_collect_worker()); + + let url: proxy::url::ApiUrl = args.auth_endpoint.parse()?; + + let endpoint = http::Endpoint::new(url, http::new_client()); + + let mut wake_compute_rps_limit = args.wake_compute_limit.clone(); + RateBucketInfo::validate(&mut wake_compute_rps_limit)?; + let wake_compute_endpoint_rate_limiter = + Arc::new(WakeComputeRateLimiter::new(wake_compute_rps_limit)); + + let api = control_plane::client::cplane_proxy_v1::NeonControlPlaneClient::new( + endpoint, + args.control_plane_token.clone(), + caches, + locks, + wake_compute_endpoint_rate_limiter, + ); + + let api = control_plane::client::ControlPlaneClient::ProxyV1(api); + let auth_backend = auth::Backend::ControlPlane(MaybeOwned::Owned(api), ()); + let config = Box::leak(Box::new(auth_backend)); + + Ok(Either::Left(config)) + } + AuthBackendType::ControlPlane => { let wake_compute_cache_config: CacheOptions = args.wake_compute_cache.parse()?; let project_info_cache_config: ProjectInfoCacheOptions = @@ -736,13 +792,15 @@ fn build_auth_backend( )?)); tokio::spawn(locks.garbage_collect_worker()); - let url = args.auth_endpoint.parse()?; + let url: proxy::url::ApiUrl = args.auth_endpoint.parse()?; + let endpoint = http::Endpoint::new(url, http::new_client()); let mut wake_compute_rps_limit = args.wake_compute_limit.clone(); 
RateBucketInfo::validate(&mut wake_compute_rps_limit)?; let wake_compute_endpoint_rate_limiter = Arc::new(WakeComputeRateLimiter::new(wake_compute_rps_limit)); + let api = control_plane::client::neon::NeonControlPlaneClient::new( endpoint, args.control_plane_token.clone(), diff --git a/proxy/src/cancellation.rs b/proxy/src/cancellation.rs index 74415f1ffe..7bc5587a25 100644 --- a/proxy/src/cancellation.rs +++ b/proxy/src/cancellation.rs @@ -3,11 +3,11 @@ use std::sync::Arc; use dashmap::DashMap; use ipnet::{IpNet, Ipv4Net, Ipv6Net}; +use postgres_client::{CancelToken, NoTls}; use pq_proto::CancelKeyData; use thiserror::Error; use tokio::net::TcpStream; use tokio::sync::Mutex; -use tokio_postgres::{CancelToken, NoTls}; use tracing::{debug, info}; use uuid::Uuid; @@ -44,7 +44,7 @@ pub(crate) enum CancelError { IO(#[from] std::io::Error), #[error("{0}")] - Postgres(#[from] tokio_postgres::Error), + Postgres(#[from] postgres_client::Error), #[error("rate limit exceeded")] RateLimit, @@ -70,11 +70,12 @@ impl ReportableError for CancelError { impl CancellationHandler

{ /// Run async action within an ephemeral session identified by [`CancelKeyData`]. pub(crate) fn get_session(self: Arc) -> Session

{ - // HACK: We'd rather get the real backend_pid but tokio_postgres doesn't - // expose it and we don't want to do another roundtrip to query - // for it. The client will be able to notice that this is not the - // actual backend_pid, but backend_pid is not used for anything - // so it doesn't matter. + // we intentionally generate a random "backend pid" and "secret key" here. + // we use the corresponding u64 as an identifier for the + // actual endpoint+pid+secret for postgres/pgbouncer. + // + // if we forwarded the backend_pid from postgres to the client, there would be a lot + // of overlap between our computes as most pids are small (~100). let key = loop { let key = rand::random(); @@ -99,16 +100,17 @@ impl CancellationHandler

{ /// Try to cancel a running query for the corresponding connection. /// If the cancellation key is not found, it will be published to Redis. /// check_allowed - if true, check if the IP is allowed to cancel the query + /// return Result primarily for tests pub(crate) async fn cancel_session( &self, key: CancelKeyData, session_id: Uuid, - peer_addr: &IpAddr, + peer_addr: IpAddr, check_allowed: bool, ) -> Result<(), CancelError> { // TODO: check for unspecified address is only for backward compatibility, should be removed if !peer_addr.is_unspecified() { - let subnet_key = match *peer_addr { + let subnet_key = match peer_addr { IpAddr::V4(ip) => IpNet::V4(Ipv4Net::new_assert(ip, 24).trunc()), // use defaut mask here IpAddr::V6(ip) => IpNet::V6(Ipv6Net::new_assert(ip, 64).trunc()), }; @@ -141,9 +143,11 @@ impl CancellationHandler

{ return Ok(()); } - match self.client.try_publish(key, session_id, *peer_addr).await { + match self.client.try_publish(key, session_id, peer_addr).await { Ok(()) => {} // do nothing Err(e) => { + // log it here since cancel_session could be spawned in a task + tracing::error!("failed to publish cancellation key: {key}, error: {e}"); return Err(CancelError::IO(std::io::Error::new( std::io::ErrorKind::Other, e.to_string(), @@ -154,8 +158,10 @@ impl CancellationHandler

{ }; if check_allowed - && !check_peer_addr_is_in_list(peer_addr, cancel_closure.ip_allowlist.as_slice()) + && !check_peer_addr_is_in_list(&peer_addr, cancel_closure.ip_allowlist.as_slice()) { + // log it here since cancel_session could be spawned in a task + tracing::warn!("IP is not allowed to cancel the query: {key}"); return Err(CancelError::IpNotAllowed); } @@ -306,7 +312,7 @@ mod tests { cancel_key: 0, }, Uuid::new_v4(), - &("127.0.0.1".parse().unwrap()), + "127.0.0.1".parse().unwrap(), true, ) .await diff --git a/proxy/src/compute.rs b/proxy/src/compute.rs index 8408d4720b..4113b5bb80 100644 --- a/proxy/src/compute.rs +++ b/proxy/src/compute.rs @@ -6,14 +6,15 @@ use std::time::Duration; use futures::{FutureExt, TryFutureExt}; use itertools::Itertools; use once_cell::sync::OnceCell; +use postgres_client::tls::MakeTlsConnect; +use postgres_client::{CancelToken, RawConnection}; +use postgres_protocol::message::backend::NoticeResponseBody; use pq_proto::StartupMessageParams; use rustls::client::danger::ServerCertVerifier; use rustls::crypto::ring; use rustls::pki_types::InvalidDnsNameError; use thiserror::Error; use tokio::net::TcpStream; -use tokio_postgres::tls::MakeTlsConnect; -use tokio_postgres_rustls::MakeRustlsConnect; use tracing::{debug, error, info, warn}; use crate::auth::parse_endpoint_param; @@ -24,6 +25,7 @@ use crate::control_plane::errors::WakeComputeError; use crate::control_plane::messages::MetricsAuxInfo; use crate::error::{ReportableError, UserFacingError}; use crate::metrics::{Metrics, NumDbConnectionsGuard}; +use crate::postgres_rustls::MakeRustlsConnect; use crate::proxy::neon_option; use crate::types::Host; @@ -32,9 +34,9 @@ pub const COULD_NOT_CONNECT: &str = "Couldn't connect to compute node"; #[derive(Debug, Error)] pub(crate) enum ConnectionError { /// This error doesn't seem to reveal any secrets; for instance, - /// `tokio_postgres::error::Kind` doesn't contain ip addresses and such. 
+ /// `postgres_client::error::Kind` doesn't contain ip addresses and such. #[error("{COULD_NOT_CONNECT}: {0}")] - Postgres(#[from] tokio_postgres::Error), + Postgres(#[from] postgres_client::Error), #[error("{COULD_NOT_CONNECT}: {0}")] CouldNotConnect(#[from] io::Error), @@ -97,18 +99,18 @@ impl ReportableError for ConnectionError { } /// A pair of `ClientKey` & `ServerKey` for `SCRAM-SHA-256`. -pub(crate) type ScramKeys = tokio_postgres::config::ScramKeys<32>; +pub(crate) type ScramKeys = postgres_client::config::ScramKeys<32>; /// A config for establishing a connection to compute node. -/// Eventually, `tokio_postgres` will be replaced with something better. +/// Eventually, `postgres_client` will be replaced with something better. /// Newtype allows us to implement methods on top of it. -#[derive(Clone, Default)] -pub(crate) struct ConnCfg(Box); +#[derive(Clone)] +pub(crate) struct ConnCfg(Box); /// Creation and initialization routines. impl ConnCfg { - pub(crate) fn new() -> Self { - Self::default() + pub(crate) fn new(host: String, port: u16) -> Self { + Self(Box::new(postgres_client::Config::new(host, port))) } /// Reuse password or auth keys from the other config. @@ -122,65 +124,49 @@ impl ConnCfg { } } - pub(crate) fn get_host(&self) -> Result { - match self.0.get_hosts() { - [tokio_postgres::config::Host::Tcp(s)] => Ok(s.into()), - // we should not have multiple address or unix addresses. - _ => Err(WakeComputeError::BadComputeAddress( - "invalid compute address".into(), - )), + pub(crate) fn get_host(&self) -> Host { + match self.0.get_host() { + postgres_client::config::Host::Tcp(s) => s.into(), } } /// Apply startup message params to the connection config. - pub(crate) fn set_startup_params(&mut self, params: &StartupMessageParams) { - // Only set `user` if it's not present in the config. - // Console redirect auth flow takes username from the console's response. 
- if let (None, Some(user)) = (self.get_user(), params.get("user")) { - self.user(user); + pub(crate) fn set_startup_params( + &mut self, + params: &StartupMessageParams, + arbitrary_params: bool, + ) { + if !arbitrary_params { + self.set_param("client_encoding", "UTF8"); } - - // Only set `dbname` if it's not present in the config. - // Console redirect auth flow takes dbname from the console's response. - if let (None, Some(dbname)) = (self.get_dbname(), params.get("database")) { - self.dbname(dbname); - } - - // Don't add `options` if they were only used for specifying a project. - // Connection pools don't support `options`, because they affect backend startup. - if let Some(options) = filtered_options(params) { - self.options(&options); - } - - if let Some(app_name) = params.get("application_name") { - self.application_name(app_name); - } - - // TODO: This is especially ugly... - if let Some(replication) = params.get("replication") { - use tokio_postgres::config::ReplicationMode; - match replication { - "true" | "on" | "yes" | "1" => { - self.replication_mode(ReplicationMode::Physical); + for (k, v) in params.iter() { + match k { + // Only set `user` if it's not present in the config. + // Console redirect auth flow takes username from the console's response. + "user" if self.user_is_set() => continue, + "database" if self.db_is_set() => continue, + "options" => { + if let Some(options) = filtered_options(v) { + self.set_param(k, &options); + } } - "database" => { - self.replication_mode(ReplicationMode::Logical); + "user" | "database" | "application_name" | "replication" => { + self.set_param(k, v); } - _other => {} + + // if we allow arbitrary params, then we forward them through. + // this is a flag for a period of backwards compatibility + k if arbitrary_params => { + self.set_param(k, v); + } + _ => {} } } - - // TODO: extend the list of the forwarded startup parameters. 
- // Currently, tokio-postgres doesn't allow us to pass - // arbitrary parameters, but the ones above are a good start. - // - // This and the reverse params problem can be better addressed - // in a bespoke connection machinery (a new library for that sake). } } impl std::ops::Deref for ConnCfg { - type Target = tokio_postgres::Config; + type Target = postgres_client::Config; fn deref(&self) -> &Self::Target { &self.0 @@ -197,7 +183,7 @@ impl std::ops::DerefMut for ConnCfg { impl ConnCfg { /// Establish a raw TCP connection to the compute node. async fn connect_raw(&self, timeout: Duration) -> io::Result<(SocketAddr, TcpStream, &str)> { - use tokio_postgres::config::Host; + use postgres_client::config::Host; // wrap TcpStream::connect with timeout let connect_with_timeout = |host, port| { @@ -222,47 +208,23 @@ impl ConnCfg { }) }; - // We can't reuse connection establishing logic from `tokio_postgres` here, + // We can't reuse connection establishing logic from `postgres_client` here, // because it has no means for extracting the underlying socket which we // require for our business. 
- let mut connection_error = None; - let ports = self.0.get_ports(); - let hosts = self.0.get_hosts(); - // the ports array is supposed to have 0 entries, 1 entry, or as many entries as in the hosts array - if ports.len() > 1 && ports.len() != hosts.len() { - return Err(io::Error::new( - io::ErrorKind::Other, - format!( - "bad compute config, \ - ports and hosts entries' count does not match: {:?}", - self.0 - ), - )); - } + let port = self.0.get_port(); + let host = self.0.get_host(); - for (i, host) in hosts.iter().enumerate() { - let port = ports.get(i).or_else(|| ports.first()).unwrap_or(&5432); - let host = match host { - Host::Tcp(host) => host.as_str(), - Host::Unix(_) => continue, // unix sockets are not welcome here - }; + let host = match host { + Host::Tcp(host) => host.as_str(), + }; - match connect_once(host, *port).await { - Ok((sockaddr, stream)) => return Ok((sockaddr, stream, host)), - Err(err) => { - // We can't throw an error here, as there might be more hosts to try. - warn!("couldn't connect to compute node at {host}:{port}: {err}"); - connection_error = Some(err); - } + match connect_once(host, port).await { + Ok((sockaddr, stream)) => Ok((sockaddr, stream, host)), + Err(err) => { + warn!("couldn't connect to compute node at {host}:{port}: {err}"); + Err(err) } } - - Err(connection_error.unwrap_or_else(|| { - io::Error::new( - io::ErrorKind::Other, - format!("bad compute config: {:?}", self.0), - ) - })) } } @@ -271,13 +233,15 @@ type RustlsStream = > pub(crate) struct PostgresConnection { /// Socket connected to a compute node. pub(crate) stream: - tokio_postgres::maybe_tls_stream::MaybeTlsStream, + postgres_client::maybe_tls_stream::MaybeTlsStream, /// PostgreSQL connection parameters. pub(crate) params: std::collections::HashMap, /// Query cancellation token. pub(crate) cancel_closure: CancelClosure, /// Labels for proxy's metrics. 
pub(crate) aux: MetricsAuxInfo, + /// Notices received from compute after authenticating + pub(crate) delayed_notice: Vec, _guage: NumDbConnectionsGuard<'static>, } @@ -315,7 +279,7 @@ impl ConnCfg { }; let client_config = client_config.with_no_client_auth(); - let mut mk_tls = tokio_postgres_rustls::MakeRustlsConnect::new(client_config); + let mut mk_tls = crate::postgres_rustls::MakeRustlsConnect::new(client_config); let tls = >::make_tls_connect( &mut mk_tls, host, @@ -323,10 +287,19 @@ impl ConnCfg { // connect_raw() will not use TLS if sslmode is "disable" let pause = ctx.latency_timer_pause(crate::metrics::Waiting::Compute); - let (client, connection) = self.0.connect_raw(stream, tls).await?; + let connection = self.0.connect_raw(stream, tls).await?; drop(pause); - tracing::Span::current().record("pid", tracing::field::display(client.get_process_id())); - let stream = connection.stream.into_inner(); + + let RawConnection { + stream, + parameters, + delayed_notice, + process_id, + secret_key, + } = connection; + + tracing::Span::current().record("pid", tracing::field::display(process_id)); + let stream = stream.into_inner(); // TODO: lots of useful info but maybe we can move it elsewhere (eg traces?) info!( @@ -335,18 +308,23 @@ impl ConnCfg { self.0.get_ssl_mode() ); - // This is very ugly but as of now there's no better way to - // extract the connection parameters from tokio-postgres' connection. - // TODO: solve this problem in a more elegant manner (e.g. the new library). - let params = connection.parameters; - // NB: CancelToken is supposed to hold socket_addr, but we use connect_raw. // Yet another reason to rework the connection establishing code. 
- let cancel_closure = CancelClosure::new(socket_addr, client.cancel_token(), vec![]); + let cancel_closure = CancelClosure::new( + socket_addr, + CancelToken { + socket_config: None, + ssl_mode: self.0.get_ssl_mode(), + process_id, + secret_key, + }, + vec![], + ); let connection = PostgresConnection { stream, - params, + params: parameters, + delayed_notice, cancel_closure, aux, _guage: Metrics::get().proxy.db_connections.guard(ctx.protocol()), @@ -357,10 +335,9 @@ impl ConnCfg { } /// Retrieve `options` from a startup message, dropping all proxy-secific flags. -fn filtered_options(params: &StartupMessageParams) -> Option { +fn filtered_options(options: &str) -> Option { #[allow(unstable_name_collisions)] - let options: String = params - .options_raw()? + let options: String = StartupMessageParams::parse_options_raw(options) .filter(|opt| parse_endpoint_param(opt).is_none() && neon_option(opt).is_none()) .intersperse(" ") // TODO: use impl from std once it's stabilized .collect(); @@ -437,27 +414,24 @@ mod tests { #[test] fn test_filtered_options() { // Empty options is unlikely to be useful anyway. - let params = StartupMessageParams::new([("options", "")]); - assert_eq!(filtered_options(¶ms), None); + let params = ""; + assert_eq!(filtered_options(params), None); // It's likely that clients will only use options to specify endpoint/project. - let params = StartupMessageParams::new([("options", "project=foo")]); - assert_eq!(filtered_options(¶ms), None); + let params = "project=foo"; + assert_eq!(filtered_options(params), None); // Same, because unescaped whitespaces are no-op. 
- let params = StartupMessageParams::new([("options", " project=foo ")]); - assert_eq!(filtered_options(¶ms).as_deref(), None); + let params = " project=foo "; + assert_eq!(filtered_options(params).as_deref(), None); - let params = StartupMessageParams::new([("options", r"\ project=foo \ ")]); - assert_eq!(filtered_options(¶ms).as_deref(), Some(r"\ \ ")); + let params = r"\ project=foo \ "; + assert_eq!(filtered_options(params).as_deref(), Some(r"\ \ ")); - let params = StartupMessageParams::new([("options", "project = foo")]); - assert_eq!(filtered_options(¶ms).as_deref(), Some("project = foo")); + let params = "project = foo"; + assert_eq!(filtered_options(params).as_deref(), Some("project = foo")); - let params = StartupMessageParams::new([( - "options", - "project = foo neon_endpoint_type:read_write neon_lsn:0/2", - )]); - assert_eq!(filtered_options(¶ms).as_deref(), Some("project = foo")); + let params = "project = foo neon_endpoint_type:read_write neon_lsn:0/2 neon_proxy_params_compat:true"; + assert_eq!(filtered_options(params).as_deref(), Some("project = foo")); } } diff --git a/proxy/src/console_redirect_proxy.rs b/proxy/src/console_redirect_proxy.rs index b910b524b1..7db1179eea 100644 --- a/proxy/src/console_redirect_proxy.rs +++ b/proxy/src/console_redirect_proxy.rs @@ -35,6 +35,7 @@ pub async fn task_main( socket2::SockRef::from(&listener).set_keepalive(true)?; let connections = tokio_util::task::task_tracker::TaskTracker::new(); + let cancellations = tokio_util::task::task_tracker::TaskTracker::new(); while let Some(accept_result) = run_until_cancelled(listener.accept(), &cancellation_token).await @@ -48,6 +49,7 @@ pub async fn task_main( let session_id = uuid::Uuid::new_v4(); let cancellation_handler = Arc::clone(&cancellation_handler); + let cancellations = cancellations.clone(); debug!(protocol = "tcp", %session_id, "accepted new TCP connection"); @@ -96,6 +98,7 @@ pub async fn task_main( cancellation_handler, socket, conn_gauge, + cancellations, ) 
.instrument(ctx.span()) .boxed() @@ -127,10 +130,12 @@ pub async fn task_main( } connections.close(); + cancellations.close(); drop(listener); // Drain connections connections.wait().await; + cancellations.wait().await; Ok(()) } @@ -142,6 +147,7 @@ pub(crate) async fn handle_client( cancellation_handler: Arc, stream: S, conn_gauge: NumClientConnectionsGuard<'static>, + cancellations: tokio_util::task::task_tracker::TaskTracker, ) -> Result>, ClientRequestError> { debug!( protocol = %ctx.protocol(), @@ -161,15 +167,26 @@ pub(crate) async fn handle_client( match tokio::time::timeout(config.handshake_timeout, do_handshake).await?? { HandshakeData::Startup(stream, params) => (stream, params), HandshakeData::Cancel(cancel_key_data) => { - return Ok(cancellation_handler - .cancel_session( - cancel_key_data, - ctx.session_id(), - &ctx.peer_addr(), - config.authentication_config.ip_allowlist_check_enabled, - ) - .await - .map(|()| None)?) + // spawn a task to cancel the session, but don't wait for it + cancellations.spawn({ + let cancellation_handler_clone = Arc::clone(&cancellation_handler); + let session_id = ctx.session_id(); + let peer_ip = ctx.peer_addr(); + async move { + drop( + cancellation_handler_clone + .cancel_session( + cancel_key_data, + session_id, + peer_ip, + config.authentication_config.ip_allowlist_check_enabled, + ) + .await, + ); + } + }); + + return Ok(None); } }; drop(pause); @@ -189,6 +206,7 @@ pub(crate) async fn handle_client( let mut node = connect_to_compute( ctx, &TcpMechanism { + params_compat: true, params: ¶ms, locks: &config.connect_compute_locks, }, diff --git a/proxy/src/context/mod.rs b/proxy/src/context/mod.rs index 5c19a23e36..a9fb513d3c 100644 --- a/proxy/src/context/mod.rs +++ b/proxy/src/context/mod.rs @@ -57,6 +57,7 @@ struct RequestContextInner { application: Option, error_kind: Option, pub(crate) auth_method: Option, + jwt_issuer: Option, success: bool, pub(crate) cold_start_info: ColdStartInfo, pg_options: Option, @@ -79,6 +80,7 
@@ pub(crate) enum AuthMethod { ScramSha256, ScramSha256Plus, Cleartext, + Jwt, } impl Clone for RequestContext { @@ -100,6 +102,7 @@ impl Clone for RequestContext { application: inner.application.clone(), error_kind: inner.error_kind, auth_method: inner.auth_method.clone(), + jwt_issuer: inner.jwt_issuer.clone(), success: inner.success, rejected: inner.rejected, cold_start_info: inner.cold_start_info, @@ -148,6 +151,7 @@ impl RequestContext { application: None, error_kind: None, auth_method: None, + jwt_issuer: None, success: false, rejected: None, cold_start_info: ColdStartInfo::Unknown, @@ -246,6 +250,11 @@ impl RequestContext { this.auth_method = Some(auth_method); } + pub(crate) fn set_jwt_issuer(&self, jwt_issuer: String) { + let mut this = self.0.try_lock().expect("should not deadlock"); + this.jwt_issuer = Some(jwt_issuer); + } + pub fn has_private_peer_addr(&self) -> bool { self.0 .try_lock() @@ -414,6 +423,7 @@ impl RequestContextInner { outcome, }); } + if let Some(tx) = self.sender.take() { // If type changes, this error handling needs to be updated. 
let tx: mpsc::UnboundedSender = tx; diff --git a/proxy/src/context/parquet.rs b/proxy/src/context/parquet.rs index e328c6de79..3105d08526 100644 --- a/proxy/src/context/parquet.rs +++ b/proxy/src/context/parquet.rs @@ -87,6 +87,8 @@ pub(crate) struct RequestData { branch: Option, pg_options: Option, auth_method: Option<&'static str>, + jwt_issuer: Option, + error: Option<&'static str>, /// Success is counted if we form a HTTP response with sql rows inside /// Or if we make it to proxy_pass @@ -138,7 +140,9 @@ impl From<&RequestContextInner> for RequestData { super::AuthMethod::ScramSha256 => "scram_sha_256", super::AuthMethod::ScramSha256Plus => "scram_sha_256_plus", super::AuthMethod::Cleartext => "cleartext", + super::AuthMethod::Jwt => "jwt", }), + jwt_issuer: value.jwt_issuer.clone(), protocol: value.protocol.as_str(), region: value.region, error: value.error_kind.as_ref().map(|e| e.to_metric_label()), @@ -486,6 +490,7 @@ mod tests { upload_storage_class: None, }), timeout: RemoteStorageConfig::DEFAULT_TIMEOUT, + small_timeout: RemoteStorageConfig::DEFAULT_SMALL_TIMEOUT, }) ); assert_eq!(parquet_upload.parquet_upload_row_group_size, 100); @@ -518,6 +523,7 @@ mod tests { branch: Some(hex::encode(rng.gen::<[u8; 16]>())), pg_options: None, auth_method: None, + jwt_issuer: None, protocol: ["tcp", "ws", "http"][rng.gen_range(0..3)], region: "us-east-1", error: None, @@ -545,6 +551,7 @@ mod tests { local_path: tmpdir.to_path_buf(), }, timeout: std::time::Duration::from_secs(120), + small_timeout: std::time::Duration::from_secs(30), }; let storage = GenericRemoteStorage::from_config(&remote_storage_config) .await @@ -597,15 +604,15 @@ mod tests { assert_eq!( file_stats, [ - (1312632, 3, 6000), - (1312621, 3, 6000), - (1312680, 3, 6000), - (1312637, 3, 6000), - (1312773, 3, 6000), - (1312610, 3, 6000), - (1312404, 3, 6000), - (1312639, 3, 6000), - (437848, 1, 2000) + (1313105, 3, 6000), + (1313094, 3, 6000), + (1313153, 3, 6000), + (1313110, 3, 6000), + (1313246, 3, 
6000), + (1313083, 3, 6000), + (1312877, 3, 6000), + (1313112, 3, 6000), + (438020, 1, 2000) ] ); @@ -637,11 +644,11 @@ mod tests { assert_eq!( file_stats, [ - (1203465, 5, 10000), - (1203189, 5, 10000), - (1203490, 5, 10000), - (1203475, 5, 10000), - (1203729, 5, 10000) + (1204324, 5, 10000), + (1204048, 5, 10000), + (1204349, 5, 10000), + (1204334, 5, 10000), + (1204588, 5, 10000) ] ); @@ -666,15 +673,15 @@ mod tests { assert_eq!( file_stats, [ - (1312632, 3, 6000), - (1312621, 3, 6000), - (1312680, 3, 6000), - (1312637, 3, 6000), - (1312773, 3, 6000), - (1312610, 3, 6000), - (1312404, 3, 6000), - (1312639, 3, 6000), - (437848, 1, 2000) + (1313105, 3, 6000), + (1313094, 3, 6000), + (1313153, 3, 6000), + (1313110, 3, 6000), + (1313246, 3, 6000), + (1313083, 3, 6000), + (1312877, 3, 6000), + (1313112, 3, 6000), + (438020, 1, 2000) ] ); @@ -711,7 +718,7 @@ mod tests { // files are smaller than the size threshold, but they took too long to fill so were flushed early assert_eq!( file_stats, - [(657696, 2, 3001), (657410, 2, 3000), (657206, 2, 2999)] + [(658014, 2, 3001), (657728, 2, 3000), (657524, 2, 2999)] ); tmpdir.close().unwrap(); diff --git a/proxy/src/control_plane/client/cplane_proxy_v1.rs b/proxy/src/control_plane/client/cplane_proxy_v1.rs new file mode 100644 index 0000000000..e33a37f643 --- /dev/null +++ b/proxy/src/control_plane/client/cplane_proxy_v1.rs @@ -0,0 +1,514 @@ +//! Production console backend. 
+ +use std::sync::Arc; +use std::time::Duration; + +use ::http::header::AUTHORIZATION; +use ::http::HeaderName; +use futures::TryFutureExt; +use postgres_client::config::SslMode; +use tokio::time::Instant; +use tracing::{debug, info, info_span, warn, Instrument}; + +use super::super::messages::{ControlPlaneErrorMessage, GetEndpointAccessControl, WakeCompute}; +use crate::auth::backend::jwt::AuthRule; +use crate::auth::backend::ComputeUserInfo; +use crate::cache::Cached; +use crate::context::RequestContext; +use crate::control_plane::caches::ApiCaches; +use crate::control_plane::errors::{ + ControlPlaneError, GetAuthInfoError, GetEndpointJwksError, WakeComputeError, +}; +use crate::control_plane::locks::ApiLocks; +use crate::control_plane::messages::{ColdStartInfo, EndpointJwksResponse, Reason}; +use crate::control_plane::{ + AuthInfo, AuthSecret, CachedAllowedIps, CachedNodeInfo, CachedRoleSecret, NodeInfo, +}; +use crate::metrics::{CacheOutcome, Metrics}; +use crate::rate_limiter::WakeComputeRateLimiter; +use crate::types::{EndpointCacheKey, EndpointId}; +use crate::{compute, http, scram}; + +const X_REQUEST_ID: HeaderName = HeaderName::from_static("x-request-id"); + +#[derive(Clone)] +pub struct NeonControlPlaneClient { + endpoint: http::Endpoint, + pub caches: &'static ApiCaches, + pub(crate) locks: &'static ApiLocks, + pub(crate) wake_compute_endpoint_rate_limiter: Arc, + // put in a shared ref so we don't copy secrets all over in memory + jwt: Arc, +} + +impl NeonControlPlaneClient { + /// Construct an API object containing the auth parameters. 
+ pub fn new( + endpoint: http::Endpoint, + jwt: Arc, + caches: &'static ApiCaches, + locks: &'static ApiLocks, + wake_compute_endpoint_rate_limiter: Arc, + ) -> Self { + Self { + endpoint, + caches, + locks, + wake_compute_endpoint_rate_limiter, + jwt, + } + } + + pub(crate) fn url(&self) -> &str { + self.endpoint.url().as_str() + } + + async fn do_get_auth_info( + &self, + ctx: &RequestContext, + user_info: &ComputeUserInfo, + ) -> Result { + if !self + .caches + .endpoints_cache + .is_valid(ctx, &user_info.endpoint.normalize()) + { + // TODO: refactor this because it's weird + // this is a failure to authenticate but we return Ok. + info!("endpoint is not valid, skipping the request"); + return Ok(AuthInfo::default()); + } + let request_id = ctx.session_id().to_string(); + let application_name = ctx.console_application_name(); + async { + let request = self + .endpoint + .get_path("get_endpoint_access_control") + .header(X_REQUEST_ID, &request_id) + .header(AUTHORIZATION, format!("Bearer {}", &self.jwt)) + .query(&[("session_id", ctx.session_id())]) + .query(&[ + ("application_name", application_name.as_str()), + ("endpointish", user_info.endpoint.as_str()), + ("role", user_info.user.as_str()), + ]) + .build()?; + + debug!(url = request.url().as_str(), "sending http request"); + let start = Instant::now(); + let pause = ctx.latency_timer_pause(crate::metrics::Waiting::Cplane); + let response = self.endpoint.execute(request).await?; + drop(pause); + info!(duration = ?start.elapsed(), "received http response"); + let body = match parse_body::(response).await { + Ok(body) => body, + // Error 404 is special: it's ok not to have a secret. + // TODO(anna): retry + Err(e) => { + return if e.get_reason().is_not_found() { + // TODO: refactor this because it's weird + // this is a failure to authenticate but we return Ok. 
+ Ok(AuthInfo::default()) + } else { + Err(e.into()) + }; + } + }; + + // Ivan: don't know where it will be used, so I leave it here + let _endpoint_vpc_ids = body.allowed_vpc_endpoint_ids.unwrap_or_default(); + + let secret = if body.role_secret.is_empty() { + None + } else { + let secret = scram::ServerSecret::parse(&body.role_secret) + .map(AuthSecret::Scram) + .ok_or(GetAuthInfoError::BadSecret)?; + Some(secret) + }; + let allowed_ips = body.allowed_ips.unwrap_or_default(); + Metrics::get() + .proxy + .allowed_ips_number + .observe(allowed_ips.len() as f64); + Ok(AuthInfo { + secret, + allowed_ips, + project_id: body.project_id, + }) + } + .inspect_err(|e| tracing::debug!(error = ?e)) + .instrument(info_span!("do_get_auth_info")) + .await + } + + async fn do_get_endpoint_jwks( + &self, + ctx: &RequestContext, + endpoint: EndpointId, + ) -> Result, GetEndpointJwksError> { + if !self + .caches + .endpoints_cache + .is_valid(ctx, &endpoint.normalize()) + { + return Err(GetEndpointJwksError::EndpointNotFound); + } + let request_id = ctx.session_id().to_string(); + async { + let request = self + .endpoint + .get_with_url(|url| { + url.path_segments_mut() + .push("endpoints") + .push(endpoint.as_str()) + .push("jwks"); + }) + .header(X_REQUEST_ID, &request_id) + .header(AUTHORIZATION, format!("Bearer {}", &self.jwt)) + .query(&[("session_id", ctx.session_id())]) + .build() + .map_err(GetEndpointJwksError::RequestBuild)?; + + debug!(url = request.url().as_str(), "sending http request"); + let start = Instant::now(); + let pause = ctx.latency_timer_pause(crate::metrics::Waiting::Cplane); + let response = self + .endpoint + .execute(request) + .await + .map_err(GetEndpointJwksError::RequestExecute)?; + drop(pause); + info!(duration = ?start.elapsed(), "received http response"); + + let body = parse_body::(response).await?; + + let rules = body + .jwks + .into_iter() + .map(|jwks| AuthRule { + id: jwks.id, + jwks_url: jwks.jwks_url, + audience: jwks.jwt_audience, + 
role_names: jwks.role_names, + }) + .collect(); + + Ok(rules) + } + .inspect_err(|e| tracing::debug!(error = ?e)) + .instrument(info_span!("do_get_endpoint_jwks")) + .await + } + + async fn do_wake_compute( + &self, + ctx: &RequestContext, + user_info: &ComputeUserInfo, + ) -> Result { + let request_id = ctx.session_id().to_string(); + let application_name = ctx.console_application_name(); + async { + let mut request_builder = self + .endpoint + .get_path("wake_compute") + .header("X-Request-ID", &request_id) + .header("Authorization", format!("Bearer {}", &self.jwt)) + .query(&[("session_id", ctx.session_id())]) + .query(&[ + ("application_name", application_name.as_str()), + ("endpointish", user_info.endpoint.as_str()), + ]); + + let options = user_info.options.to_deep_object(); + if !options.is_empty() { + request_builder = request_builder.query(&options); + } + + let request = request_builder.build()?; + + debug!(url = request.url().as_str(), "sending http request"); + let start = Instant::now(); + let pause = ctx.latency_timer_pause(crate::metrics::Waiting::Cplane); + let response = self.endpoint.execute(request).await?; + drop(pause); + info!(duration = ?start.elapsed(), "received http response"); + let body = parse_body::(response).await?; + + // Unfortunately, ownership won't let us use `Option::ok_or` here. + let (host, port) = match parse_host_port(&body.address) { + None => return Err(WakeComputeError::BadComputeAddress(body.address)), + Some(x) => x, + }; + + // Don't set anything but host and port! This config will be cached. + // We'll set username and such later using the startup message. + // TODO: add more type safety (in progress). + let mut config = compute::ConnCfg::new(host.to_owned(), port); + config.ssl_mode(SslMode::Disable); // TLS is not configured on compute nodes. 
+ + let node = NodeInfo { + config, + aux: body.aux, + allow_self_signed_compute: false, + }; + + Ok(node) + } + .inspect_err(|e| tracing::debug!(error = ?e)) + .instrument(info_span!("do_wake_compute")) + .await + } +} + +impl super::ControlPlaneApi for NeonControlPlaneClient { + #[tracing::instrument(skip_all)] + async fn get_role_secret( + &self, + ctx: &RequestContext, + user_info: &ComputeUserInfo, + ) -> Result { + let normalized_ep = &user_info.endpoint.normalize(); + let user = &user_info.user; + if let Some(role_secret) = self + .caches + .project_info + .get_role_secret(normalized_ep, user) + { + return Ok(role_secret); + } + let auth_info = self.do_get_auth_info(ctx, user_info).await?; + if let Some(project_id) = auth_info.project_id { + let normalized_ep_int = normalized_ep.into(); + self.caches.project_info.insert_role_secret( + project_id, + normalized_ep_int, + user.into(), + auth_info.secret.clone(), + ); + self.caches.project_info.insert_allowed_ips( + project_id, + normalized_ep_int, + Arc::new(auth_info.allowed_ips), + ); + ctx.set_project_id(project_id); + } + // When we just got a secret, we don't need to invalidate it. 
+ Ok(Cached::new_uncached(auth_info.secret)) + } + + async fn get_allowed_ips_and_secret( + &self, + ctx: &RequestContext, + user_info: &ComputeUserInfo, + ) -> Result<(CachedAllowedIps, Option), GetAuthInfoError> { + let normalized_ep = &user_info.endpoint.normalize(); + if let Some(allowed_ips) = self.caches.project_info.get_allowed_ips(normalized_ep) { + Metrics::get() + .proxy + .allowed_ips_cache_misses + .inc(CacheOutcome::Hit); + return Ok((allowed_ips, None)); + } + Metrics::get() + .proxy + .allowed_ips_cache_misses + .inc(CacheOutcome::Miss); + let auth_info = self.do_get_auth_info(ctx, user_info).await?; + let allowed_ips = Arc::new(auth_info.allowed_ips); + let user = &user_info.user; + if let Some(project_id) = auth_info.project_id { + let normalized_ep_int = normalized_ep.into(); + self.caches.project_info.insert_role_secret( + project_id, + normalized_ep_int, + user.into(), + auth_info.secret.clone(), + ); + self.caches.project_info.insert_allowed_ips( + project_id, + normalized_ep_int, + allowed_ips.clone(), + ); + ctx.set_project_id(project_id); + } + Ok(( + Cached::new_uncached(allowed_ips), + Some(Cached::new_uncached(auth_info.secret)), + )) + } + + #[tracing::instrument(skip_all)] + async fn get_endpoint_jwks( + &self, + ctx: &RequestContext, + endpoint: EndpointId, + ) -> Result, GetEndpointJwksError> { + self.do_get_endpoint_jwks(ctx, endpoint).await + } + + #[tracing::instrument(skip_all)] + async fn wake_compute( + &self, + ctx: &RequestContext, + user_info: &ComputeUserInfo, + ) -> Result { + let key = user_info.endpoint_cache_key(); + + macro_rules! 
check_cache { + () => { + if let Some(cached) = self.caches.node_info.get(&key) { + let (cached, info) = cached.take_value(); + let info = info.map_err(|c| { + info!(key = &*key, "found cached wake_compute error"); + WakeComputeError::ControlPlane(ControlPlaneError::Message(Box::new(*c))) + })?; + + debug!(key = &*key, "found cached compute node info"); + ctx.set_project(info.aux.clone()); + return Ok(cached.map(|()| info)); + } + }; + } + + // Every time we do a wakeup http request, the compute node will stay up + // for some time (highly depends on the console's scale-to-zero policy); + // The connection info remains the same during that period of time, + // which means that we might cache it to reduce the load and latency. + check_cache!(); + + let permit = self.locks.get_permit(&key).await?; + + // after getting back a permit - it's possible the cache was filled + // double check + if permit.should_check_cache() { + // TODO: if there is something in the cache, mark the permit as success. 
+ check_cache!(); + } + + // check rate limit + if !self + .wake_compute_endpoint_rate_limiter + .check(user_info.endpoint.normalize_intern(), 1) + { + return Err(WakeComputeError::TooManyConnections); + } + + let node = permit.release_result(self.do_wake_compute(ctx, user_info).await); + match node { + Ok(node) => { + ctx.set_project(node.aux.clone()); + debug!(key = &*key, "created a cache entry for woken compute node"); + + let mut stored_node = node.clone(); + // store the cached node as 'warm_cached' + stored_node.aux.cold_start_info = ColdStartInfo::WarmCached; + + let (_, cached) = self.caches.node_info.insert_unit(key, Ok(stored_node)); + + Ok(cached.map(|()| node)) + } + Err(err) => match err { + WakeComputeError::ControlPlane(ControlPlaneError::Message(err)) => { + let Some(status) = &err.status else { + return Err(WakeComputeError::ControlPlane(ControlPlaneError::Message( + err, + ))); + }; + + let reason = status + .details + .error_info + .map_or(Reason::Unknown, |x| x.reason); + + // if we can retry this error, do not cache it. + if reason.can_retry() { + return Err(WakeComputeError::ControlPlane(ControlPlaneError::Message( + err, + ))); + } + + // at this point, we should only have quota errors. + debug!( + key = &*key, + "created a cache entry for the wake compute error" + ); + + self.caches.node_info.insert_ttl( + key, + Err(err.clone()), + Duration::from_secs(30), + ); + + Err(WakeComputeError::ControlPlane(ControlPlaneError::Message( + err, + ))) + } + err => return Err(err), + }, + } + } +} + +/// Parse http response body, taking status code into account. +async fn parse_body serde::Deserialize<'a>>( + response: http::Response, +) -> Result { + let status = response.status(); + if status.is_success() { + // We shouldn't log raw body because it may contain secrets. 
+ info!("request succeeded, processing the body"); + return Ok(response.json().await?); + } + let s = response.bytes().await?; + // Log plaintext to be able to detect, whether there are some cases not covered by the error struct. + info!("response_error plaintext: {:?}", s); + + // Don't throw an error here because it's not as important + // as the fact that the request itself has failed. + let mut body = serde_json::from_slice(&s).unwrap_or_else(|e| { + warn!("failed to parse error body: {e}"); + ControlPlaneErrorMessage { + error: "reason unclear (malformed error message)".into(), + http_status_code: status, + status: None, + } + }); + body.http_status_code = status; + + warn!("console responded with an error ({status}): {body:?}"); + Err(ControlPlaneError::Message(Box::new(body))) +} + +fn parse_host_port(input: &str) -> Option<(&str, u16)> { + let (host, port) = input.rsplit_once(':')?; + let ipv6_brackets: &[_] = &['[', ']']; + Some((host.trim_matches(ipv6_brackets), port.parse().ok()?)) +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn test_parse_host_port_v4() { + let (host, port) = parse_host_port("127.0.0.1:5432").expect("failed to parse"); + assert_eq!(host, "127.0.0.1"); + assert_eq!(port, 5432); + } + + #[test] + fn test_parse_host_port_v6() { + let (host, port) = parse_host_port("[2001:db8::1]:5432").expect("failed to parse"); + assert_eq!(host, "2001:db8::1"); + assert_eq!(port, 5432); + } + + #[test] + fn test_parse_host_port_url() { + let (host, port) = parse_host_port("compute-foo-bar-1234.default.svc.cluster.local:5432") + .expect("failed to parse"); + assert_eq!(host, "compute-foo-bar-1234.default.svc.cluster.local"); + assert_eq!(port, 5432); + } +} diff --git a/proxy/src/control_plane/client/mock.rs b/proxy/src/control_plane/client/mock.rs index 9537d717a1..eaf692ab27 100644 --- a/proxy/src/control_plane/client/mock.rs +++ b/proxy/src/control_plane/client/mock.rs @@ -5,7 +5,6 @@ use std::sync::Arc; use futures::TryFutureExt; use 
thiserror::Error; -use tokio_postgres::config::SslMode; use tokio_postgres::Client; use tracing::{error, info, info_span, warn, Instrument}; @@ -161,11 +160,11 @@ impl MockControlPlane { } async fn do_wake_compute(&self) -> Result { - let mut config = compute::ConnCfg::new(); - config - .host(self.endpoint.host_str().unwrap_or("localhost")) - .port(self.endpoint.port().unwrap_or(5432)) - .ssl_mode(SslMode::Disable); + let mut config = compute::ConnCfg::new( + self.endpoint.host_str().unwrap_or("localhost").to_owned(), + self.endpoint.port().unwrap_or(5432), + ); + config.ssl_mode(postgres_client::config::SslMode::Disable); let node = NodeInfo { config, diff --git a/proxy/src/control_plane/client/mod.rs b/proxy/src/control_plane/client/mod.rs index f8f74372f0..7ef5a9c9fd 100644 --- a/proxy/src/control_plane/client/mod.rs +++ b/proxy/src/control_plane/client/mod.rs @@ -1,3 +1,4 @@ +pub mod cplane_proxy_v1; #[cfg(any(test, feature = "testing"))] pub mod mock; pub mod neon; @@ -27,6 +28,8 @@ use crate::types::EndpointId; #[non_exhaustive] #[derive(Clone)] pub enum ControlPlaneClient { + /// New Proxy V1 control plane API + ProxyV1(cplane_proxy_v1::NeonControlPlaneClient), /// Current Management API (V2). Neon(neon::NeonControlPlaneClient), /// Local mock control plane. 
@@ -45,6 +48,7 @@ impl ControlPlaneApi for ControlPlaneClient { user_info: &ComputeUserInfo, ) -> Result { match self { + Self::ProxyV1(api) => api.get_role_secret(ctx, user_info).await, Self::Neon(api) => api.get_role_secret(ctx, user_info).await, #[cfg(any(test, feature = "testing"))] Self::PostgresMock(api) => api.get_role_secret(ctx, user_info).await, @@ -61,6 +65,7 @@ impl ControlPlaneApi for ControlPlaneClient { user_info: &ComputeUserInfo, ) -> Result<(CachedAllowedIps, Option), errors::GetAuthInfoError> { match self { + Self::ProxyV1(api) => api.get_allowed_ips_and_secret(ctx, user_info).await, Self::Neon(api) => api.get_allowed_ips_and_secret(ctx, user_info).await, #[cfg(any(test, feature = "testing"))] Self::PostgresMock(api) => api.get_allowed_ips_and_secret(ctx, user_info).await, @@ -75,6 +80,7 @@ impl ControlPlaneApi for ControlPlaneClient { endpoint: EndpointId, ) -> Result, errors::GetEndpointJwksError> { match self { + Self::ProxyV1(api) => api.get_endpoint_jwks(ctx, endpoint).await, Self::Neon(api) => api.get_endpoint_jwks(ctx, endpoint).await, #[cfg(any(test, feature = "testing"))] Self::PostgresMock(api) => api.get_endpoint_jwks(ctx, endpoint).await, @@ -89,6 +95,7 @@ impl ControlPlaneApi for ControlPlaneClient { user_info: &ComputeUserInfo, ) -> Result { match self { + Self::ProxyV1(api) => api.wake_compute(ctx, user_info).await, Self::Neon(api) => api.wake_compute(ctx, user_info).await, #[cfg(any(test, feature = "testing"))] Self::PostgresMock(api) => api.wake_compute(ctx, user_info).await, diff --git a/proxy/src/control_plane/client/neon.rs b/proxy/src/control_plane/client/neon.rs index 2cad981d01..bf62c0d6ab 100644 --- a/proxy/src/control_plane/client/neon.rs +++ b/proxy/src/control_plane/client/neon.rs @@ -1,4 +1,4 @@ -//! Production console backend. +//! Stale console backend, remove after migrating to Proxy V1 API (#15245). 
use std::sync::Arc; use std::time::Duration; @@ -6,8 +6,8 @@ use std::time::Duration; use ::http::header::AUTHORIZATION; use ::http::HeaderName; use futures::TryFutureExt; +use postgres_client::config::SslMode; use tokio::time::Instant; -use tokio_postgres::config::SslMode; use tracing::{debug, info, info_span, warn, Instrument}; use super::super::messages::{ControlPlaneErrorMessage, GetRoleSecret, WakeCompute}; @@ -241,8 +241,8 @@ impl NeonControlPlaneClient { // Don't set anything but host and port! This config will be cached. // We'll set username and such later using the startup message. // TODO: add more type safety (in progress). - let mut config = compute::ConnCfg::new(); - config.host(host).port(port).ssl_mode(SslMode::Disable); // TLS is not configured on compute nodes. + let mut config = compute::ConnCfg::new(host.to_owned(), port); + config.ssl_mode(SslMode::Disable); // TLS is not configured on compute nodes. let node = NodeInfo { config, diff --git a/proxy/src/control_plane/messages.rs b/proxy/src/control_plane/messages.rs index 8762ba874b..2662ab85f9 100644 --- a/proxy/src/control_plane/messages.rs +++ b/proxy/src/control_plane/messages.rs @@ -230,6 +230,16 @@ pub(crate) struct GetRoleSecret { pub(crate) project_id: Option, } +/// Response which holds client's auth secret, e.g. [`crate::scram::ServerSecret`]. +/// Returned by the `/get_endpoint_access_control` API method. +#[derive(Deserialize)] +pub(crate) struct GetEndpointAccessControl { + pub(crate) role_secret: Box, + pub(crate) allowed_ips: Option>, + pub(crate) project_id: Option, + pub(crate) allowed_vpc_endpoint_ids: Option>, +} + // Manually implement debug to omit sensitive info. 
impl fmt::Debug for GetRoleSecret { fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { diff --git a/proxy/src/error.rs b/proxy/src/error.rs index 2221aac407..6a379499dc 100644 --- a/proxy/src/error.rs +++ b/proxy/src/error.rs @@ -84,7 +84,7 @@ pub(crate) trait ReportableError: fmt::Display + Send + 'static { fn get_error_kind(&self) -> ErrorKind; } -impl ReportableError for tokio_postgres::error::Error { +impl ReportableError for postgres_client::error::Error { fn get_error_kind(&self) -> ErrorKind { if self.as_db_error().is_some() { ErrorKind::Postgres diff --git a/proxy/src/lib.rs b/proxy/src/lib.rs index ad7e1d2771..ba69f9cf2d 100644 --- a/proxy/src/lib.rs +++ b/proxy/src/lib.rs @@ -88,6 +88,7 @@ pub mod jemalloc; pub mod logging; pub mod metrics; pub mod parse; +pub mod postgres_rustls; pub mod protocol2; pub mod proxy; pub mod rate_limiter; diff --git a/proxy/src/postgres_rustls/mod.rs b/proxy/src/postgres_rustls/mod.rs new file mode 100644 index 0000000000..5ef20991c3 --- /dev/null +++ b/proxy/src/postgres_rustls/mod.rs @@ -0,0 +1,158 @@ +use std::convert::TryFrom; +use std::sync::Arc; + +use postgres_client::tls::MakeTlsConnect; +use rustls::pki_types::ServerName; +use rustls::ClientConfig; +use tokio::io::{AsyncRead, AsyncWrite}; + +mod private { + use std::future::Future; + use std::io; + use std::pin::Pin; + use std::task::{Context, Poll}; + + use postgres_client::tls::{ChannelBinding, TlsConnect}; + use rustls::pki_types::ServerName; + use tokio::io::{AsyncRead, AsyncWrite, ReadBuf}; + use tokio_rustls::client::TlsStream; + use tokio_rustls::TlsConnector; + + use crate::config::TlsServerEndPoint; + + pub struct TlsConnectFuture { + inner: tokio_rustls::Connect, + } + + impl Future for TlsConnectFuture + where + S: AsyncRead + AsyncWrite + Unpin, + { + type Output = io::Result>; + + fn poll(mut self: Pin<&mut Self>, cx: &mut Context<'_>) -> Poll { + Pin::new(&mut self.inner).poll(cx).map_ok(RustlsStream) + } + } + + pub struct RustlsConnect(pub 
RustlsConnectData); + + pub struct RustlsConnectData { + pub hostname: ServerName<'static>, + pub connector: TlsConnector, + } + + impl TlsConnect for RustlsConnect + where + S: AsyncRead + AsyncWrite + Unpin + Send + 'static, + { + type Stream = RustlsStream; + type Error = io::Error; + type Future = TlsConnectFuture; + + fn connect(self, stream: S) -> Self::Future { + TlsConnectFuture { + inner: self.0.connector.connect(self.0.hostname, stream), + } + } + } + + pub struct RustlsStream(TlsStream); + + impl postgres_client::tls::TlsStream for RustlsStream + where + S: AsyncRead + AsyncWrite + Unpin, + { + fn channel_binding(&self) -> ChannelBinding { + let (_, session) = self.0.get_ref(); + match session.peer_certificates() { + Some([cert, ..]) => TlsServerEndPoint::new(cert) + .ok() + .and_then(|cb| match cb { + TlsServerEndPoint::Sha256(hash) => Some(hash), + TlsServerEndPoint::Undefined => None, + }) + .map_or_else(ChannelBinding::none, |hash| { + ChannelBinding::tls_server_end_point(hash.to_vec()) + }), + _ => ChannelBinding::none(), + } + } + } + + impl AsyncRead for RustlsStream + where + S: AsyncRead + AsyncWrite + Unpin, + { + fn poll_read( + mut self: Pin<&mut Self>, + cx: &mut Context<'_>, + buf: &mut ReadBuf<'_>, + ) -> Poll> { + Pin::new(&mut self.0).poll_read(cx, buf) + } + } + + impl AsyncWrite for RustlsStream + where + S: AsyncRead + AsyncWrite + Unpin, + { + fn poll_write( + mut self: Pin<&mut Self>, + cx: &mut Context<'_>, + buf: &[u8], + ) -> Poll> { + Pin::new(&mut self.0).poll_write(cx, buf) + } + + fn poll_flush( + mut self: Pin<&mut Self>, + cx: &mut Context<'_>, + ) -> Poll> { + Pin::new(&mut self.0).poll_flush(cx) + } + + fn poll_shutdown( + mut self: Pin<&mut Self>, + cx: &mut Context<'_>, + ) -> Poll> { + Pin::new(&mut self.0).poll_shutdown(cx) + } + } +} + +/// A `MakeTlsConnect` implementation using `rustls`. +/// +/// That way you can connect to PostgreSQL using `rustls` as the TLS stack. 
+#[derive(Clone)] +pub struct MakeRustlsConnect { + config: Arc, +} + +impl MakeRustlsConnect { + /// Creates a new `MakeRustlsConnect` from the provided `ClientConfig`. + #[must_use] + pub fn new(config: ClientConfig) -> Self { + Self { + config: Arc::new(config), + } + } +} + +impl MakeTlsConnect for MakeRustlsConnect +where + S: AsyncRead + AsyncWrite + Unpin + Send + 'static, +{ + type Stream = private::RustlsStream; + type TlsConnect = private::RustlsConnect; + type Error = rustls::pki_types::InvalidDnsNameError; + + fn make_tls_connect(&mut self, hostname: &str) -> Result { + ServerName::try_from(hostname).map(|dns_name| { + private::RustlsConnect(private::RustlsConnectData { + hostname: dns_name.to_owned(), + connector: Arc::clone(&self.config).into(), + }) + }) + } +} diff --git a/proxy/src/proxy/connect_compute.rs b/proxy/src/proxy/connect_compute.rs index 2e759b0894..a3027abd7c 100644 --- a/proxy/src/proxy/connect_compute.rs +++ b/proxy/src/proxy/connect_compute.rs @@ -66,6 +66,8 @@ pub(crate) trait ComputeConnectBackend { } pub(crate) struct TcpMechanism<'a> { + pub(crate) params_compat: bool, + /// KV-dictionary with PostgreSQL connection params. 
pub(crate) params: &'a StartupMessageParams, @@ -86,13 +88,13 @@ impl ConnectMechanism for TcpMechanism<'_> { node_info: &control_plane::CachedNodeInfo, timeout: time::Duration, ) -> Result { - let host = node_info.config.get_host()?; + let host = node_info.config.get_host(); let permit = self.locks.get_permit(&host).await?; permit.release_result(node_info.connect(ctx, timeout).await) } fn update_connect_config(&self, config: &mut compute::ConnCfg) { - config.set_startup_params(self.params); + config.set_startup_params(self.params, self.params_compat); } } diff --git a/proxy/src/proxy/mod.rs b/proxy/src/proxy/mod.rs index 7fe67e43de..f74eb5940f 100644 --- a/proxy/src/proxy/mod.rs +++ b/proxy/src/proxy/mod.rs @@ -69,6 +69,7 @@ pub async fn task_main( socket2::SockRef::from(&listener).set_keepalive(true)?; let connections = tokio_util::task::task_tracker::TaskTracker::new(); + let cancellations = tokio_util::task::task_tracker::TaskTracker::new(); while let Some(accept_result) = run_until_cancelled(listener.accept(), &cancellation_token).await @@ -82,6 +83,7 @@ pub async fn task_main( let session_id = uuid::Uuid::new_v4(); let cancellation_handler = Arc::clone(&cancellation_handler); + let cancellations = cancellations.clone(); debug!(protocol = "tcp", %session_id, "accepted new TCP connection"); let endpoint_rate_limiter2 = endpoint_rate_limiter.clone(); @@ -133,6 +135,7 @@ pub async fn task_main( ClientMode::Tcp, endpoint_rate_limiter2, conn_gauge, + cancellations, ) .instrument(ctx.span()) .boxed() @@ -164,10 +167,12 @@ pub async fn task_main( } connections.close(); + cancellations.close(); drop(listener); // Drain connections connections.wait().await; + cancellations.wait().await; Ok(()) } @@ -250,6 +255,7 @@ pub(crate) async fn handle_client( mode: ClientMode, endpoint_rate_limiter: Arc, conn_gauge: NumClientConnectionsGuard<'static>, + cancellations: tokio_util::task::task_tracker::TaskTracker, ) -> Result>, ClientRequestError> { debug!( protocol = 
%ctx.protocol(), @@ -270,15 +276,26 @@ pub(crate) async fn handle_client( match tokio::time::timeout(config.handshake_timeout, do_handshake).await?? { HandshakeData::Startup(stream, params) => (stream, params), HandshakeData::Cancel(cancel_key_data) => { - return Ok(cancellation_handler - .cancel_session( - cancel_key_data, - ctx.session_id(), - &ctx.peer_addr(), - config.authentication_config.ip_allowlist_check_enabled, - ) - .await - .map(|()| None)?) + // spawn a task to cancel the session, but don't wait for it + cancellations.spawn({ + let cancellation_handler_clone = Arc::clone(&cancellation_handler); + let session_id = ctx.session_id(); + let peer_ip = ctx.peer_addr(); + async move { + drop( + cancellation_handler_clone + .cancel_session( + cancel_key_data, + session_id, + peer_ip, + config.authentication_config.ip_allowlist_check_enabled, + ) + .await, + ); + } + }); + + return Ok(None); } }; drop(pause); @@ -321,9 +338,17 @@ pub(crate) async fn handle_client( } }; + let params_compat = match &user_info { + auth::Backend::ControlPlane(_, info) => { + info.info.options.get(NeonOptions::PARAMS_COMPAT).is_some() + } + auth::Backend::Local(_) => false, + }; + let mut node = connect_to_compute( ctx, &TcpMechanism { + params_compat, params: ¶ms, locks: &config.connect_compute_locks, }, @@ -367,11 +392,13 @@ pub(crate) async fn prepare_client_connection

( // The new token (cancel_key_data) will be sent to the client. let cancel_key_data = session.enable_query_cancellation(node.cancel_closure.clone()); + // Forward all deferred notices to the client. + for notice in &node.delayed_notice { + stream.write_message_noflush(&Be::Raw(b'N', notice.as_bytes()))?; + } + // Forward all postgres connection params to the client. - // Right now the implementation is very hacky and inefficent (ideally, - // we don't need an intermediate hashmap), but at least it should be correct. for (name, value) in &node.params { - // TODO: Theoretically, this could result in a big pile of params... stream.write_message_noflush(&Be::ParameterStatus { name: name.as_bytes(), value: value.as_bytes(), @@ -390,19 +417,47 @@ pub(crate) async fn prepare_client_connection

( pub(crate) struct NeonOptions(Vec<(SmolStr, SmolStr)>); impl NeonOptions { + // proxy options: + + /// `PARAMS_COMPAT` allows opting in to forwarding all startup parameters from client to compute. + const PARAMS_COMPAT: &str = "proxy_params_compat"; + + // cplane options: + + /// `LSN` allows provisioning an ephemeral compute with time-travel to the provided LSN. + const LSN: &str = "lsn"; + + /// `ENDPOINT_TYPE` allows configuring an ephemeral compute to be read_only or read_write. + const ENDPOINT_TYPE: &str = "endpoint_type"; + pub(crate) fn parse_params(params: &StartupMessageParams) -> Self { params .options_raw() .map(Self::parse_from_iter) .unwrap_or_default() } + pub(crate) fn parse_options_raw(options: &str) -> Self { Self::parse_from_iter(StartupMessageParams::parse_options_raw(options)) } + pub(crate) fn get(&self, key: &str) -> Option { + self.0 + .iter() + .find_map(|(k, v)| (k == key).then_some(v)) + .cloned() + } + pub(crate) fn is_ephemeral(&self) -> bool { - // Currently, neon endpoint options are all reserved for ephemeral endpoints. - !self.0.is_empty() + self.0.iter().any(|(k, _)| match &**k { + // This is not a cplane option, we know it does not create ephemeral computes. + Self::PARAMS_COMPAT => false, + Self::LSN => true, + Self::ENDPOINT_TYPE => true, + // err on the side of caution. any cplane options we don't know about + // might lead to ephemeral computes. 
+ _ => true, + }) } fn parse_from_iter<'a>(options: impl Iterator) -> Self { diff --git a/proxy/src/proxy/retry.rs b/proxy/src/proxy/retry.rs index d3f0c3e7d4..42d1491782 100644 --- a/proxy/src/proxy/retry.rs +++ b/proxy/src/proxy/retry.rs @@ -31,9 +31,9 @@ impl CouldRetry for io::Error { } } -impl CouldRetry for tokio_postgres::error::DbError { +impl CouldRetry for postgres_client::error::DbError { fn could_retry(&self) -> bool { - use tokio_postgres::error::SqlState; + use postgres_client::error::SqlState; matches!( self.code(), &SqlState::CONNECTION_FAILURE @@ -43,9 +43,9 @@ impl CouldRetry for tokio_postgres::error::DbError { ) } } -impl ShouldRetryWakeCompute for tokio_postgres::error::DbError { +impl ShouldRetryWakeCompute for postgres_client::error::DbError { fn should_retry_wake_compute(&self) -> bool { - use tokio_postgres::error::SqlState; + use postgres_client::error::SqlState; // Here are errors that happens after the user successfully authenticated to the database. // TODO: there are pgbouncer errors that should be retried, but they are not listed here. 
!matches!( @@ -61,21 +61,21 @@ impl ShouldRetryWakeCompute for tokio_postgres::error::DbError { } } -impl CouldRetry for tokio_postgres::Error { +impl CouldRetry for postgres_client::Error { fn could_retry(&self) -> bool { if let Some(io_err) = self.source().and_then(|x| x.downcast_ref()) { io::Error::could_retry(io_err) } else if let Some(db_err) = self.source().and_then(|x| x.downcast_ref()) { - tokio_postgres::error::DbError::could_retry(db_err) + postgres_client::error::DbError::could_retry(db_err) } else { false } } } -impl ShouldRetryWakeCompute for tokio_postgres::Error { +impl ShouldRetryWakeCompute for postgres_client::Error { fn should_retry_wake_compute(&self) -> bool { if let Some(db_err) = self.source().and_then(|x| x.downcast_ref()) { - tokio_postgres::error::DbError::should_retry_wake_compute(db_err) + postgres_client::error::DbError::should_retry_wake_compute(db_err) } else { // likely an IO error. Possible the compute has shutdown and the // cache is stale. diff --git a/proxy/src/proxy/tests/mitm.rs b/proxy/src/proxy/tests/mitm.rs index fe211adfeb..59c9ac27b8 100644 --- a/proxy/src/proxy/tests/mitm.rs +++ b/proxy/src/proxy/tests/mitm.rs @@ -8,9 +8,9 @@ use std::fmt::Debug; use bytes::{Bytes, BytesMut}; use futures::{SinkExt, StreamExt}; +use postgres_client::tls::TlsConnect; use postgres_protocol::message::frontend; use tokio::io::{AsyncReadExt, DuplexStream}; -use tokio_postgres::tls::TlsConnect; use tokio_util::codec::{Decoder, Encoder}; use super::*; @@ -55,7 +55,13 @@ async fn proxy_mitm( // give the end_server the startup parameters let mut buf = BytesMut::new(); - frontend::startup_message(startup.iter(), &mut buf).unwrap(); + frontend::startup_message( + &postgres_protocol::message::frontend::StartupMessageParams { + params: startup.params.into(), + }, + &mut buf, + ) + .unwrap(); end_server.send(buf.freeze()).await.unwrap(); // proxy messages between end_client and end_server @@ -158,8 +164,8 @@ async fn scram_auth_disable_channel_binding() 
-> anyhow::Result<()> { Scram::new("password").await?, )); - let _client_err = tokio_postgres::Config::new() - .channel_binding(tokio_postgres::config::ChannelBinding::Disable) + let _client_err = postgres_client::Config::new("test".to_owned(), 5432) + .channel_binding(postgres_client::config::ChannelBinding::Disable) .user("user") .dbname("db") .password("password") @@ -175,7 +181,7 @@ async fn scram_auth_disable_channel_binding() -> anyhow::Result<()> { async fn scram_auth_prefer_channel_binding() -> anyhow::Result<()> { connect_failure( Intercept::None, - tokio_postgres::config::ChannelBinding::Prefer, + postgres_client::config::ChannelBinding::Prefer, ) .await } @@ -185,7 +191,7 @@ async fn scram_auth_prefer_channel_binding() -> anyhow::Result<()> { async fn scram_auth_prefer_channel_binding_intercept() -> anyhow::Result<()> { connect_failure( Intercept::Methods, - tokio_postgres::config::ChannelBinding::Prefer, + postgres_client::config::ChannelBinding::Prefer, ) .await } @@ -195,7 +201,7 @@ async fn scram_auth_prefer_channel_binding_intercept() -> anyhow::Result<()> { async fn scram_auth_prefer_channel_binding_intercept_response() -> anyhow::Result<()> { connect_failure( Intercept::SASLResponse, - tokio_postgres::config::ChannelBinding::Prefer, + postgres_client::config::ChannelBinding::Prefer, ) .await } @@ -205,7 +211,7 @@ async fn scram_auth_prefer_channel_binding_intercept_response() -> anyhow::Resul async fn scram_auth_require_channel_binding() -> anyhow::Result<()> { connect_failure( Intercept::None, - tokio_postgres::config::ChannelBinding::Require, + postgres_client::config::ChannelBinding::Require, ) .await } @@ -215,7 +221,7 @@ async fn scram_auth_require_channel_binding() -> anyhow::Result<()> { async fn scram_auth_require_channel_binding_intercept() -> anyhow::Result<()> { connect_failure( Intercept::Methods, - tokio_postgres::config::ChannelBinding::Require, + postgres_client::config::ChannelBinding::Require, ) .await } @@ -225,14 +231,14 @@ 
async fn scram_auth_require_channel_binding_intercept() -> anyhow::Result<()> { async fn scram_auth_require_channel_binding_intercept_response() -> anyhow::Result<()> { connect_failure( Intercept::SASLResponse, - tokio_postgres::config::ChannelBinding::Require, + postgres_client::config::ChannelBinding::Require, ) .await } async fn connect_failure( intercept: Intercept, - channel_binding: tokio_postgres::config::ChannelBinding, + channel_binding: postgres_client::config::ChannelBinding, ) -> anyhow::Result<()> { let (server, client, client_config, server_config) = proxy_mitm(intercept).await; let proxy = tokio::spawn(dummy_proxy( @@ -241,7 +247,7 @@ async fn connect_failure( Scram::new("password").await?, )); - let _client_err = tokio_postgres::Config::new() + let _client_err = postgres_client::Config::new("test".to_owned(), 5432) .channel_binding(channel_binding) .user("user") .dbname("db") diff --git a/proxy/src/proxy/tests/mod.rs b/proxy/src/proxy/tests/mod.rs index 3de8ca8736..911b349416 100644 --- a/proxy/src/proxy/tests/mod.rs +++ b/proxy/src/proxy/tests/mod.rs @@ -7,14 +7,13 @@ use std::time::Duration; use anyhow::{bail, Context}; use async_trait::async_trait; use http::StatusCode; +use postgres_client::config::SslMode; +use postgres_client::tls::{MakeTlsConnect, NoTls}; use retry::{retry_after, ShouldRetryWakeCompute}; use rstest::rstest; use rustls::crypto::ring; use rustls::pki_types; use tokio::io::DuplexStream; -use tokio_postgres::config::SslMode; -use tokio_postgres::tls::{MakeTlsConnect, NoTls}; -use tokio_postgres_rustls::MakeRustlsConnect; use super::connect_compute::ConnectMechanism; use super::retry::CouldRetry; @@ -29,6 +28,7 @@ use crate::control_plane::{ self, CachedAllowedIps, CachedNodeInfo, CachedRoleSecret, NodeInfo, NodeInfoCache, }; use crate::error::ErrorKind; +use crate::postgres_rustls::MakeRustlsConnect; use crate::types::{BranchId, EndpointId, ProjectId}; use crate::{sasl, scram}; @@ -204,7 +204,7 @@ async fn 
handshake_tls_is_enforced_by_proxy() -> anyhow::Result<()> { let (_, server_config) = generate_tls_config("generic-project-name.localhost", "localhost")?; let proxy = tokio::spawn(dummy_proxy(client, Some(server_config), NoAuth)); - let client_err = tokio_postgres::Config::new() + let client_err = postgres_client::Config::new("test".to_owned(), 5432) .user("john_doe") .dbname("earth") .ssl_mode(SslMode::Disable) @@ -233,7 +233,7 @@ async fn handshake_tls() -> anyhow::Result<()> { generate_tls_config("generic-project-name.localhost", "localhost")?; let proxy = tokio::spawn(dummy_proxy(client, Some(server_config), NoAuth)); - let (_client, _conn) = tokio_postgres::Config::new() + let _conn = postgres_client::Config::new("test".to_owned(), 5432) .user("john_doe") .dbname("earth") .ssl_mode(SslMode::Require) @@ -249,10 +249,10 @@ async fn handshake_raw() -> anyhow::Result<()> { let proxy = tokio::spawn(dummy_proxy(client, None, NoAuth)); - let (_client, _conn) = tokio_postgres::Config::new() + let _conn = postgres_client::Config::new("test".to_owned(), 5432) .user("john_doe") .dbname("earth") - .options("project=generic-project-name") + .set_param("options", "project=generic-project-name") .ssl_mode(SslMode::Prefer) .connect_raw(server, NoTls) .await?; @@ -296,8 +296,8 @@ async fn scram_auth_good(#[case] password: &str) -> anyhow::Result<()> { Scram::new(password).await?, )); - let (_client, _conn) = tokio_postgres::Config::new() - .channel_binding(tokio_postgres::config::ChannelBinding::Require) + let _conn = postgres_client::Config::new("test".to_owned(), 5432) + .channel_binding(postgres_client::config::ChannelBinding::Require) .user("user") .dbname("db") .password(password) @@ -320,8 +320,8 @@ async fn scram_auth_disable_channel_binding() -> anyhow::Result<()> { Scram::new("password").await?, )); - let (_client, _conn) = tokio_postgres::Config::new() - .channel_binding(tokio_postgres::config::ChannelBinding::Disable) + let _conn = 
postgres_client::Config::new("test".to_owned(), 5432) + .channel_binding(postgres_client::config::ChannelBinding::Disable) .user("user") .dbname("db") .password("password") @@ -348,7 +348,7 @@ async fn scram_auth_mock() -> anyhow::Result<()> { .map(char::from) .collect(); - let _client_err = tokio_postgres::Config::new() + let _client_err = postgres_client::Config::new("test".to_owned(), 5432) .user("user") .dbname("db") .password(&password) // no password will match the mocked secret @@ -546,7 +546,7 @@ impl TestControlPlaneClient for TestConnectMechanism { fn helper_create_cached_node_info(cache: &'static NodeInfoCache) -> CachedNodeInfo { let node = NodeInfo { - config: compute::ConnCfg::new(), + config: compute::ConnCfg::new("test".to_owned(), 5432), aux: MetricsAuxInfo { endpoint_id: (&EndpointId::from("endpoint")).into(), project_id: (&ProjectId::from("project")).into(), diff --git a/proxy/src/redis/elasticache.rs b/proxy/src/redis/elasticache.rs index d118c8f412..bf6dde9332 100644 --- a/proxy/src/redis/elasticache.rs +++ b/proxy/src/redis/elasticache.rs @@ -1,6 +1,14 @@ +use std::sync::Arc; use std::time::{Duration, SystemTime}; +use aws_config::environment::EnvironmentVariableCredentialsProvider; +use aws_config::imds::credentials::ImdsCredentialsProvider; use aws_config::meta::credentials::CredentialsProviderChain; +use aws_config::meta::region::RegionProviderChain; +use aws_config::profile::ProfileFileCredentialsProvider; +use aws_config::provider_config::ProviderConfig; +use aws_config::web_identity_token::WebIdentityTokenCredentialsProvider; +use aws_config::Region; use aws_sdk_iam::config::ProvideCredentials; use aws_sigv4::http_request::{ self, SignableBody, SignableRequest, SignatureLocation, SigningSettings, @@ -45,12 +53,45 @@ pub struct CredentialsProvider { } impl CredentialsProvider { - pub fn new(config: AWSIRSAConfig, credentials_provider: CredentialsProviderChain) -> Self { - CredentialsProvider { - config, - credentials_provider, - } + pub 
async fn new( + aws_region: String, + redis_cluster_name: Option, + redis_user_id: Option, + ) -> Arc { + let region_provider = + RegionProviderChain::default_provider().or_else(Region::new(aws_region.clone())); + let provider_conf = + ProviderConfig::without_region().with_region(region_provider.region().await); + let aws_credentials_provider = { + // uses "AWS_ACCESS_KEY_ID", "AWS_SECRET_ACCESS_KEY" + CredentialsProviderChain::first_try( + "env", + EnvironmentVariableCredentialsProvider::new(), + ) + // uses "AWS_PROFILE" / `aws sso login --profile ` + .or_else( + "profile-sso", + ProfileFileCredentialsProvider::builder() + .configure(&provider_conf) + .build(), + ) + // uses "AWS_WEB_IDENTITY_TOKEN_FILE", "AWS_ROLE_ARN", "AWS_ROLE_SESSION_NAME" + // needed to access remote extensions bucket + .or_else( + "token", + WebIdentityTokenCredentialsProvider::builder() + .configure(&provider_conf) + .build(), + ) + // uses imds v2 + .or_else("imds", ImdsCredentialsProvider::builder().build()) + }; + Arc::new(CredentialsProvider { + config: AWSIRSAConfig::new(aws_region, redis_cluster_name, redis_user_id), + credentials_provider: aws_credentials_provider, + }) } + pub(crate) async fn provide_credentials(&self) -> anyhow::Result<(String, String)> { let aws_credentials = self .credentials_provider diff --git a/proxy/src/redis/notifications.rs b/proxy/src/redis/notifications.rs index 65008ae943..9ac07b7e90 100644 --- a/proxy/src/redis/notifications.rs +++ b/proxy/src/redis/notifications.rs @@ -149,7 +149,7 @@ impl MessageHandler { .cancel_session( cancel_session.cancel_key_data, uuid::Uuid::nil(), - &peer_addr, + peer_addr, cancel_session.peer_addr.is_some(), ) .await diff --git a/proxy/src/serverless/backend.rs b/proxy/src/serverless/backend.rs index 3037e20888..251aa47084 100644 --- a/proxy/src/serverless/backend.rs +++ b/proxy/src/serverless/backend.rs @@ -37,9 +37,9 @@ use crate::types::{EndpointId, Host, LOCAL_PROXY_SUFFIX}; pub(crate) struct PoolingBackend { pub(crate) 
http_conn_pool: Arc>>, - pub(crate) local_pool: Arc>, + pub(crate) local_pool: Arc>, pub(crate) pool: - Arc>>, + Arc>>, pub(crate) config: &'static ProxyConfig, pub(crate) auth_backend: &'static crate::auth::Backend<'static, ()>, @@ -53,6 +53,8 @@ impl PoolingBackend { user_info: &ComputeUserInfo, password: &[u8], ) -> Result { + ctx.set_auth_method(crate::context::AuthMethod::Cleartext); + let user_info = user_info.clone(); let backend = self.auth_backend.as_ref().map(|()| user_info.clone()); let (allowed_ips, maybe_secret) = backend.get_allowed_ips_and_secret(ctx).await?; @@ -115,6 +117,8 @@ impl PoolingBackend { user_info: &ComputeUserInfo, jwt: String, ) -> Result { + ctx.set_auth_method(crate::context::AuthMethod::Jwt); + match &self.auth_backend { crate::auth::Backend::ControlPlane(console, ()) => { self.config @@ -166,7 +170,7 @@ impl PoolingBackend { conn_info: ConnInfo, keys: ComputeCredentials, force_new: bool, - ) -> Result, HttpConnError> { + ) -> Result, HttpConnError> { let maybe_client = if force_new { debug!("pool: pool is disabled"); None @@ -252,7 +256,7 @@ impl PoolingBackend { &self, ctx: &RequestContext, conn_info: ConnInfo, - ) -> Result, HttpConnError> { + ) -> Result, HttpConnError> { if let Some(client) = self.local_pool.get(ctx, &conn_info)? 
{ return Ok(client); } @@ -305,13 +309,16 @@ impl PoolingBackend { .config .user(&conn_info.user_info.user) .dbname(&conn_info.dbname) - .options(&format!( - "-c pg_session_jwt.jwk={}", - serde_json::to_string(&jwk).expect("serializing jwk to json should not fail") - )); + .set_param( + "options", + &format!( + "-c pg_session_jwt.jwk={}", + serde_json::to_string(&jwk).expect("serializing jwk to json should not fail") + ), + ); let pause = ctx.latency_timer_pause(crate::metrics::Waiting::Compute); - let (client, connection) = config.connect(tokio_postgres::NoTls).await?; + let (client, connection) = config.connect(postgres_client::NoTls).await?; drop(pause); let pid = client.get_process_id(); @@ -333,7 +340,7 @@ impl PoolingBackend { debug!("setting up backend session state"); // initiates the auth session - if let Err(e) = client.query("select auth.init()", &[]).await { + if let Err(e) = client.execute("select auth.init()", &[]).await { discard.discard(); return Err(e.into()); } @@ -356,7 +363,7 @@ pub(crate) enum HttpConnError { #[error("pooled connection closed at inconsistent state")] ConnectionClosedAbruptly(#[from] tokio::sync::watch::error::SendError), #[error("could not connection to postgres in compute")] - PostgresConnectionError(#[from] tokio_postgres::Error), + PostgresConnectionError(#[from] postgres_client::Error), #[error("could not connection to local-proxy in compute")] LocalProxyConnectionError(#[from] LocalProxyConnError), #[error("could not parse JWT payload")] @@ -475,7 +482,7 @@ impl ShouldRetryWakeCompute for LocalProxyConnError { } struct TokioMechanism { - pool: Arc>>, + pool: Arc>>, conn_info: ConnInfo, conn_id: uuid::Uuid, @@ -485,7 +492,7 @@ struct TokioMechanism { #[async_trait] impl ConnectMechanism for TokioMechanism { - type Connection = Client; + type Connection = Client; type ConnectError = HttpConnError; type Error = HttpConnError; @@ -495,7 +502,7 @@ impl ConnectMechanism for TokioMechanism { node_info: &CachedNodeInfo, timeout: 
Duration, ) -> Result { - let host = node_info.config.get_host()?; + let host = node_info.config.get_host(); let permit = self.locks.get_permit(&host).await?; let mut config = (*node_info.config).clone(); @@ -505,7 +512,7 @@ impl ConnectMechanism for TokioMechanism { .connect_timeout(timeout); let pause = ctx.latency_timer_pause(crate::metrics::Waiting::Compute); - let res = config.connect(tokio_postgres::NoTls).await; + let res = config.connect(postgres_client::NoTls).await; drop(pause); let (client, connection) = permit.release_result(res)?; @@ -545,16 +552,12 @@ impl ConnectMechanism for HyperMechanism { node_info: &CachedNodeInfo, timeout: Duration, ) -> Result { - let host = node_info.config.get_host()?; + let host = node_info.config.get_host(); let permit = self.locks.get_permit(&host).await?; let pause = ctx.latency_timer_pause(crate::metrics::Waiting::Compute); - let port = *node_info.config.get_ports().first().ok_or_else(|| { - HttpConnError::WakeCompute(WakeComputeError::BadComputeAddress( - "local-proxy port missing on compute address".into(), - )) - })?; + let port = node_info.config.get_port(); let res = connect_http2(&host, port, timeout).await; drop(pause); let (client, connection) = permit.release_result(res)?; diff --git a/proxy/src/serverless/conn_pool.rs b/proxy/src/serverless/conn_pool.rs index bd262f45ed..cac5a173cb 100644 --- a/proxy/src/serverless/conn_pool.rs +++ b/proxy/src/serverless/conn_pool.rs @@ -5,10 +5,11 @@ use std::task::{ready, Poll}; use futures::future::poll_fn; use futures::Future; +use postgres_client::tls::NoTlsStream; +use postgres_client::AsyncMessage; use smallvec::SmallVec; +use tokio::net::TcpStream; use tokio::time::Instant; -use tokio_postgres::tls::NoTlsStream; -use tokio_postgres::{AsyncMessage, Socket}; use tokio_util::sync::CancellationToken; use tracing::{error, info, info_span, warn, Instrument}; #[cfg(test)] @@ -57,7 +58,7 @@ pub(crate) fn poll_client( ctx: &RequestContext, conn_info: ConnInfo, client: C, - mut 
connection: tokio_postgres::Connection, + mut connection: postgres_client::Connection, conn_id: uuid::Uuid, aux: MetricsAuxInfo, ) -> Client { diff --git a/proxy/src/serverless/conn_pool_lib.rs b/proxy/src/serverless/conn_pool_lib.rs index fe1d2563bc..2a46c8f9c5 100644 --- a/proxy/src/serverless/conn_pool_lib.rs +++ b/proxy/src/serverless/conn_pool_lib.rs @@ -7,8 +7,8 @@ use std::time::Duration; use dashmap::DashMap; use parking_lot::RwLock; +use postgres_client::ReadyForQueryStatus; use rand::Rng; -use tokio_postgres::ReadyForQueryStatus; use tracing::{debug, info, Span}; use super::backend::HttpConnError; @@ -683,7 +683,7 @@ pub(crate) trait ClientInnerExt: Sync + Send + 'static { fn get_process_id(&self) -> i32; } -impl ClientInnerExt for tokio_postgres::Client { +impl ClientInnerExt for postgres_client::Client { fn is_closed(&self) -> bool { self.is_closed() } diff --git a/proxy/src/serverless/json.rs b/proxy/src/serverless/json.rs index 569e2da571..25b25c66d3 100644 --- a/proxy/src/serverless/json.rs +++ b/proxy/src/serverless/json.rs @@ -1,6 +1,6 @@ +use postgres_client::types::{Kind, Type}; +use postgres_client::Row; use serde_json::{Map, Value}; -use tokio_postgres::types::{Kind, Type}; -use tokio_postgres::Row; // // Convert json non-string types to strings, so that they can be passed to Postgres @@ -61,7 +61,7 @@ fn json_array_to_pg_array(value: &Value) -> Option { #[derive(Debug, thiserror::Error)] pub(crate) enum JsonConversionError { #[error("internal error compute returned invalid data: {0}")] - AsTextError(tokio_postgres::Error), + AsTextError(postgres_client::Error), #[error("parse int error: {0}")] ParseIntError(#[from] std::num::ParseIntError), #[error("parse float error: {0}")] diff --git a/proxy/src/serverless/local_conn_pool.rs b/proxy/src/serverless/local_conn_pool.rs index 9abe35db08..b84cde9e25 100644 --- a/proxy/src/serverless/local_conn_pool.rs +++ b/proxy/src/serverless/local_conn_pool.rs @@ -22,12 +22,13 @@ use indexmap::IndexMap; use 
jose_jwk::jose_b64::base64ct::{Base64UrlUnpadded, Encoding}; use p256::ecdsa::{Signature, SigningKey}; use parking_lot::RwLock; +use postgres_client::tls::NoTlsStream; +use postgres_client::types::ToSql; +use postgres_client::AsyncMessage; use serde_json::value::RawValue; use signature::Signer; +use tokio::net::TcpStream; use tokio::time::Instant; -use tokio_postgres::tls::NoTlsStream; -use tokio_postgres::types::ToSql; -use tokio_postgres::{AsyncMessage, Socket}; use tokio_util::sync::CancellationToken; use tracing::{debug, error, info, info_span, warn, Instrument}; @@ -163,7 +164,7 @@ pub(crate) fn poll_client( ctx: &RequestContext, conn_info: ConnInfo, client: C, - mut connection: tokio_postgres::Connection, + mut connection: postgres_client::Connection, key: SigningKey, conn_id: uuid::Uuid, aux: MetricsAuxInfo, @@ -279,18 +280,18 @@ pub(crate) fn poll_client( ) } -impl ClientInnerCommon { +impl ClientInnerCommon { pub(crate) async fn set_jwt_session(&mut self, payload: &[u8]) -> Result<(), HttpConnError> { if let ClientDataEnum::Local(local_data) = &mut self.data { local_data.jti += 1; let token = resign_jwt(&local_data.key, payload, local_data.jti)?; // initiates the auth session - self.inner.simple_query("discard all").await?; + self.inner.batch_execute("discard all").await?; self.inner - .query( + .execute( "select auth.jwt_session_init($1)", - &[&token as &(dyn ToSql + Sync)], + &[&&*token as &(dyn ToSql + Sync)], ) .await?; diff --git a/proxy/src/serverless/mod.rs b/proxy/src/serverless/mod.rs index 77025f419d..80b42f9e55 100644 --- a/proxy/src/serverless/mod.rs +++ b/proxy/src/serverless/mod.rs @@ -132,6 +132,7 @@ pub async fn task_main( let connections = tokio_util::task::task_tracker::TaskTracker::new(); connections.close(); // allows `connections.wait to complete` + let cancellations = tokio_util::task::task_tracker::TaskTracker::new(); while let Some(res) = run_until_cancelled(ws_listener.accept(), &cancellation_token).await { let (conn, peer_addr) = 
res.context("could not accept TCP stream")?; if let Err(e) = conn.set_nodelay(true) { @@ -160,6 +161,7 @@ pub async fn task_main( let connections2 = connections.clone(); let cancellation_handler = cancellation_handler.clone(); let endpoint_rate_limiter = endpoint_rate_limiter.clone(); + let cancellations = cancellations.clone(); connections.spawn( async move { let conn_token2 = conn_token.clone(); @@ -188,6 +190,7 @@ pub async fn task_main( config, backend, connections2, + cancellations, cancellation_handler, endpoint_rate_limiter, conn_token, @@ -313,6 +316,7 @@ async fn connection_handler( config: &'static ProxyConfig, backend: Arc, connections: TaskTracker, + cancellations: TaskTracker, cancellation_handler: Arc, endpoint_rate_limiter: Arc, cancellation_token: CancellationToken, @@ -353,6 +357,7 @@ async fn connection_handler( // `request_handler` is not cancel safe. It expects to be cancelled only at specific times. // By spawning the future, we ensure it never gets cancelled until it decides to. + let cancellations = cancellations.clone(); let handler = connections.spawn( request_handler( req, @@ -364,6 +369,7 @@ async fn connection_handler( conn_info2.clone(), http_request_token, endpoint_rate_limiter.clone(), + cancellations, ) .in_current_span() .map_ok_or_else(api_error_into_response, |r| r), @@ -411,6 +417,7 @@ async fn request_handler( // used to cancel in-flight HTTP requests. 
not used to cancel websockets http_cancellation_token: CancellationToken, endpoint_rate_limiter: Arc, + cancellations: TaskTracker, ) -> Result>, ApiError> { let host = request .headers() @@ -436,6 +443,7 @@ async fn request_handler( let (response, websocket) = framed_websockets::upgrade::upgrade(&mut request) .map_err(|e| ApiError::BadRequest(e.into()))?; + let cancellations = cancellations.clone(); ws_connections.spawn( async move { if let Err(e) = websocket::serve_websocket( @@ -446,6 +454,7 @@ async fn request_handler( cancellation_handler, endpoint_rate_limiter, host, + cancellations, ) .await { diff --git a/proxy/src/serverless/sql_over_http.rs b/proxy/src/serverless/sql_over_http.rs index afd93d02f0..5e85f5ec40 100644 --- a/proxy/src/serverless/sql_over_http.rs +++ b/proxy/src/serverless/sql_over_http.rs @@ -11,12 +11,12 @@ use http_body_util::{BodyExt, Full}; use hyper::body::Incoming; use hyper::http::{HeaderName, HeaderValue}; use hyper::{header, HeaderMap, Request, Response, StatusCode}; +use postgres_client::error::{DbError, ErrorPosition, SqlState}; +use postgres_client::{GenericClient, IsolationLevel, NoTls, ReadyForQueryStatus, Transaction}; use pq_proto::StartupMessageParamsBuilder; use serde::Serialize; use serde_json::Value; use tokio::time::{self, Instant}; -use tokio_postgres::error::{DbError, ErrorPosition, SqlState}; -use tokio_postgres::{GenericClient, IsolationLevel, NoTls, ReadyForQueryStatus, Transaction}; use tokio_util::sync::CancellationToken; use tracing::{debug, error, info}; use typed_json::json; @@ -139,9 +139,6 @@ fn get_conn_info( headers: &HeaderMap, tls: Option<&TlsConfig>, ) -> Result { - // HTTP only uses cleartext (for now and likely always) - ctx.set_auth_method(crate::context::AuthMethod::Cleartext); - let connection_string = headers .get(&CONN_STRING) .ok_or(ConnInfoError::InvalidHeader(&CONN_STRING))? 
@@ -364,7 +361,7 @@ pub(crate) enum SqlOverHttpError { #[error("invalid isolation level")] InvalidIsolationLevel, #[error("{0}")] - Postgres(#[from] tokio_postgres::Error), + Postgres(#[from] postgres_client::Error), #[error("{0}")] JsonConversion(#[from] JsonConversionError), #[error("{0}")] @@ -989,7 +986,7 @@ async fn query_to_json( // Manually drain the stream into a vector to leave row_stream hanging // around to get a command tag. Also check that the response is not too // big. - let mut rows: Vec = Vec::new(); + let mut rows: Vec = Vec::new(); while let Some(row) = row_stream.next().await { let row = row?; *current_size += row.body_len(); @@ -1066,13 +1063,13 @@ async fn query_to_json( } enum Client { - Remote(conn_pool_lib::Client), - Local(conn_pool_lib::Client), + Remote(conn_pool_lib::Client), + Local(conn_pool_lib::Client), } enum Discard<'a> { - Remote(conn_pool_lib::Discard<'a, tokio_postgres::Client>), - Local(conn_pool_lib::Discard<'a, tokio_postgres::Client>), + Remote(conn_pool_lib::Discard<'a, postgres_client::Client>), + Local(conn_pool_lib::Discard<'a, postgres_client::Client>), } impl Client { @@ -1083,7 +1080,7 @@ impl Client { } } - fn inner(&mut self) -> (&mut tokio_postgres::Client, Discard<'_>) { + fn inner(&mut self) -> (&mut postgres_client::Client, Discard<'_>) { match self { Client::Remote(client) => { let (c, d) = client.inner(); diff --git a/proxy/src/serverless/websocket.rs b/proxy/src/serverless/websocket.rs index 4088fea835..bdb83fe6be 100644 --- a/proxy/src/serverless/websocket.rs +++ b/proxy/src/serverless/websocket.rs @@ -123,6 +123,7 @@ impl AsyncBufRead for WebSocketRw { } } +#[allow(clippy::too_many_arguments)] pub(crate) async fn serve_websocket( config: &'static ProxyConfig, auth_backend: &'static crate::auth::Backend<'static, ()>, @@ -131,6 +132,7 @@ pub(crate) async fn serve_websocket( cancellation_handler: Arc, endpoint_rate_limiter: Arc, hostname: Option, + cancellations: tokio_util::task::task_tracker::TaskTracker, ) 
-> anyhow::Result<()> { let websocket = websocket.await?; let websocket = WebSocketServer::after_handshake(TokioIo::new(websocket)); @@ -149,6 +151,7 @@ pub(crate) async fn serve_websocket( ClientMode::Websockets { hostname }, endpoint_rate_limiter, conn_gauge, + cancellations, )) .await; diff --git a/proxy/src/usage_metrics.rs b/proxy/src/usage_metrics.rs index c5e8588623..65e74466f2 100644 --- a/proxy/src/usage_metrics.rs +++ b/proxy/src/usage_metrics.rs @@ -1,19 +1,18 @@ //! Periodically collect proxy consumption metrics //! and push them to a HTTP endpoint. +use std::borrow::Cow; use std::convert::Infallible; -use std::pin::pin; use std::sync::atomic::{AtomicU64, AtomicUsize, Ordering}; use std::sync::Arc; use std::time::Duration; -use anyhow::Context; +use anyhow::{bail, Context}; use async_compression::tokio::write::GzipEncoder; use bytes::Bytes; use chrono::{DateTime, Datelike, Timelike, Utc}; use consumption_metrics::{idempotency_key, Event, EventChunk, EventType, CHUNK_SIZE}; use dashmap::mapref::entry::Entry; use dashmap::DashMap; -use futures::future::select; use once_cell::sync::Lazy; use remote_storage::{GenericRemoteStorage, RemotePath, TimeoutOrCancel}; use serde::{Deserialize, Serialize}; @@ -23,7 +22,7 @@ use tracing::{error, info, instrument, trace, warn}; use utils::backoff; use uuid::{NoContext, Timestamp}; -use crate::config::{MetricBackupCollectionConfig, MetricCollectionConfig}; +use crate::config::MetricCollectionConfig; use crate::context::parquet::{FAILED_UPLOAD_MAX_RETRIES, FAILED_UPLOAD_WARN_THRESHOLD}; use crate::http; use crate::intern::{BranchIdInt, EndpointIdInt}; @@ -58,55 +57,21 @@ trait MetricCounterReporter { fn move_metrics(&self) -> (u64, usize); } -#[derive(Debug)] -struct MetricBackupCounter { - transmitted: AtomicU64, - opened_connections: AtomicUsize, -} - -impl MetricCounterRecorder for MetricBackupCounter { - fn record_egress(&self, bytes: u64) { - self.transmitted.fetch_add(bytes, Ordering::AcqRel); - } - - fn 
record_connection(&self, count: usize) { - self.opened_connections.fetch_add(count, Ordering::AcqRel); - } -} - -impl MetricCounterReporter for MetricBackupCounter { - fn get_metrics(&mut self) -> (u64, usize) { - ( - *self.transmitted.get_mut(), - *self.opened_connections.get_mut(), - ) - } - fn move_metrics(&self) -> (u64, usize) { - ( - self.transmitted.swap(0, Ordering::AcqRel), - self.opened_connections.swap(0, Ordering::AcqRel), - ) - } -} - #[derive(Debug)] pub(crate) struct MetricCounter { transmitted: AtomicU64, opened_connections: AtomicUsize, - backup: Arc, } impl MetricCounterRecorder for MetricCounter { /// Record that some bytes were sent from the proxy to the client fn record_egress(&self, bytes: u64) { - self.transmitted.fetch_add(bytes, Ordering::AcqRel); - self.backup.record_egress(bytes); + self.transmitted.fetch_add(bytes, Ordering::Relaxed); } /// Record that some connections were opened fn record_connection(&self, count: usize) { - self.opened_connections.fetch_add(count, Ordering::AcqRel); - self.backup.record_connection(count); + self.opened_connections.fetch_add(count, Ordering::Relaxed); } } @@ -119,8 +84,8 @@ impl MetricCounterReporter for MetricCounter { } fn move_metrics(&self) -> (u64, usize) { ( - self.transmitted.swap(0, Ordering::AcqRel), - self.opened_connections.swap(0, Ordering::AcqRel), + self.transmitted.swap(0, Ordering::Relaxed), + self.opened_connections.swap(0, Ordering::Relaxed), ) } } @@ -173,26 +138,11 @@ type FastHasher = std::hash::BuildHasherDefault; #[derive(Default)] pub(crate) struct Metrics { endpoints: DashMap, FastHasher>, - backup_endpoints: DashMap, FastHasher>, } impl Metrics { /// Register a new byte metrics counter for this endpoint pub(crate) fn register(&self, ids: Ids) -> Arc { - let backup = if let Some(entry) = self.backup_endpoints.get(&ids) { - entry.clone() - } else { - self.backup_endpoints - .entry(ids.clone()) - .or_insert_with(|| { - Arc::new(MetricBackupCounter { - transmitted: 
AtomicU64::new(0), - opened_connections: AtomicUsize::new(0), - }) - }) - .clone() - }; - let entry = if let Some(entry) = self.endpoints.get(&ids) { entry.clone() } else { @@ -202,7 +152,6 @@ impl Metrics { Arc::new(MetricCounter { transmitted: AtomicU64::new(0), opened_connections: AtomicUsize::new(0), - backup: backup.clone(), }) }) .clone() @@ -227,6 +176,21 @@ pub async fn task_main(config: &MetricCollectionConfig) -> anyhow::Result anyhow::Result( now: DateTime, chunk_size: usize, ) -> impl Iterator>> + 'a { - // Split into chunks of 1000 metrics to avoid exceeding the max request size metrics_to_send .chunks(chunk_size) .map(move |chunk| EventChunk { @@ -303,11 +268,14 @@ fn create_event_chunks<'a>( }) } +#[expect(clippy::too_many_arguments)] #[instrument(skip_all)] async fn collect_metrics_iteration( endpoints: &DashMap, FastHasher>, client: &http::ClientWithMiddleware, metric_collection_endpoint: &reqwest::Url, + storage: Option<&GenericRemoteStorage>, + outer_chunk_size: usize, hostname: &str, prev: DateTime, now: DateTime, @@ -323,17 +291,54 @@ async fn collect_metrics_iteration( trace!("no new metrics to send"); } + let cancel = CancellationToken::new(); + let path_prefix = create_remote_path_prefix(now); + // Send metrics. 
- for chunk in create_event_chunks(&metrics_to_send, hostname, prev, now, CHUNK_SIZE) { + for chunk in create_event_chunks(&metrics_to_send, hostname, prev, now, outer_chunk_size) { + tokio::join!( + upload_main_events_chunked(client, metric_collection_endpoint, &chunk, CHUNK_SIZE), + async { + if let Err(e) = upload_backup_events(storage, &chunk, &path_prefix, &cancel).await { + error!("failed to upload consumption events to remote storage: {e:?}"); + } + } + ); + } +} + +fn create_remote_path_prefix(now: DateTime) -> String { + format!( + "year={year:04}/month={month:02}/day={day:02}/{hour:02}:{minute:02}:{second:02}Z", + year = now.year(), + month = now.month(), + day = now.day(), + hour = now.hour(), + minute = now.minute(), + second = now.second(), + ) +} + +async fn upload_main_events_chunked( + client: &http::ClientWithMiddleware, + metric_collection_endpoint: &reqwest::Url, + chunk: &EventChunk<'_, Event>, + subchunk_size: usize, +) { + // Split into smaller chunks to avoid exceeding the max request size + for subchunk in chunk.events.chunks(subchunk_size).map(|c| EventChunk { + events: Cow::Borrowed(c), + }) { let res = client .post(metric_collection_endpoint.clone()) - .json(&chunk) + .json(&subchunk) .send() .await; let res = match res { Ok(x) => x, Err(err) => { + // TODO: retry? 
error!("failed to send metrics: {:?}", err); continue; } @@ -341,7 +346,7 @@ async fn collect_metrics_iteration( if !res.status().is_success() { error!("metrics endpoint refused the sent metrics: {:?}", res); - for metric in chunk.events.iter().filter(|e| e.value > (1u64 << 40)) { + for metric in subchunk.events.iter().filter(|e| e.value > (1u64 << 40)) { // Report if the metric value is suspiciously large warn!("potentially abnormal metric value: {:?}", metric); } @@ -349,113 +354,34 @@ async fn collect_metrics_iteration( } } -pub async fn task_backup( - backup_config: &MetricBackupCollectionConfig, - cancellation_token: CancellationToken, -) -> anyhow::Result<()> { - info!("metrics backup config: {backup_config:?}"); - scopeguard::defer! { - info!("metrics backup has shut down"); - } - // Even if the remote storage is not configured, we still want to clear the metrics. - let storage = if let Some(config) = backup_config.remote_storage_config.as_ref() { - Some( - GenericRemoteStorage::from_config(config) - .await - .context("remote storage init")?, - ) - } else { - None - }; - let mut ticker = tokio::time::interval(backup_config.interval); - let mut prev = Utc::now(); - let hostname = hostname::get()?.as_os_str().to_string_lossy().into_owned(); - loop { - select(pin!(ticker.tick()), pin!(cancellation_token.cancelled())).await; - let now = Utc::now(); - collect_metrics_backup_iteration( - &USAGE_METRICS.backup_endpoints, - storage.as_ref(), - &hostname, - prev, - now, - backup_config.chunk_size, - ) - .await; - - prev = now; - if cancellation_token.is_cancelled() { - info!("metrics backup has been cancelled"); - break; - } - } - Ok(()) -} - -#[instrument(skip_all)] -async fn collect_metrics_backup_iteration( - endpoints: &DashMap, FastHasher>, +async fn upload_backup_events( storage: Option<&GenericRemoteStorage>, - hostname: &str, - prev: DateTime, - now: DateTime, - chunk_size: usize, -) { - let year = now.year(); - let month = now.month(); - let day = now.day(); 
- let hour = now.hour(); - let minute = now.minute(); - let second = now.second(); - let cancel = CancellationToken::new(); - - info!("starting collect_metrics_backup_iteration"); - - let metrics_to_send = collect_and_clear_metrics(endpoints); - - if metrics_to_send.is_empty() { - trace!("no new metrics to send"); - } - - // Send metrics. - for chunk in create_event_chunks(&metrics_to_send, hostname, prev, now, chunk_size) { - let real_now = Utc::now(); - let id = uuid::Uuid::new_v7(Timestamp::from_unix( - NoContext, - real_now.second().into(), - real_now.nanosecond(), - )); - let path = format!("year={year:04}/month={month:02}/day={day:02}/{hour:02}:{minute:02}:{second:02}Z_{id}.json.gz"); - let remote_path = match RemotePath::from_string(&path) { - Ok(remote_path) => remote_path, - Err(e) => { - error!("failed to create remote path from str {path}: {:?}", e); - continue; - } - }; - - let res = upload_events_chunk(storage, chunk, &remote_path, &cancel).await; - - if let Err(e) = res { - error!( - "failed to upload consumption events to remote storage: {:?}", - e - ); - } - } -} - -async fn upload_events_chunk( - storage: Option<&GenericRemoteStorage>, - chunk: EventChunk<'_, Event>, - remote_path: &RemotePath, + chunk: &EventChunk<'_, Event>, + path_prefix: &str, cancel: &CancellationToken, ) -> anyhow::Result<()> { let Some(storage) = storage else { - error!("no remote storage configured"); + warn!("no remote storage configured"); return Ok(()); }; - let data = serde_json::to_vec(&chunk).context("serialize metrics")?; + + let real_now = Utc::now(); + let id = uuid::Uuid::new_v7(Timestamp::from_unix( + NoContext, + real_now.second().into(), + real_now.nanosecond(), + )); + let path = format!("{path_prefix}_{id}.json.gz"); + let remote_path = match RemotePath::from_string(&path) { + Ok(remote_path) => remote_path, + Err(e) => { + bail!("failed to create remote path from str {path}: {:?}", e); + } + }; + + // TODO: This is async compression from Vec to Vec. 
Rewrite as byte stream. + // Use sync compression in blocking threadpool. + let data = serde_json::to_vec(chunk).context("serialize metrics")?; let mut encoder = GzipEncoder::new(Vec::new()); encoder.write_all(&data).await.context("compress metrics")?; encoder.shutdown().await.context("compress metrics")?; @@ -464,7 +390,7 @@ async fn upload_events_chunk( || async { let stream = futures::stream::once(futures::future::ready(Ok(compressed_data.clone()))); storage - .upload(stream, compressed_data.len(), remote_path, None, cancel) + .upload(stream, compressed_data.len(), &remote_path, None, cancel) .await }, TimeoutOrCancel::caused_by_cancel, @@ -482,9 +408,12 @@ async fn upload_events_chunk( #[cfg(test)] mod tests { + use std::fs; + use std::io::BufReader; use std::sync::{Arc, Mutex}; use anyhow::Error; + use camino_tempfile::tempdir; use chrono::Utc; use consumption_metrics::{Event, EventChunk}; use http_body_util::BodyExt; @@ -493,6 +422,7 @@ mod tests { use hyper::service::service_fn; use hyper::{Request, Response}; use hyper_util::rt::TokioIo; + use remote_storage::{RemoteStorageConfig, RemoteStorageKind}; use tokio::net::TcpListener; use url::Url; @@ -538,8 +468,34 @@ mod tests { let endpoint = Url::parse(&format!("http://{addr}")).unwrap(); let now = Utc::now(); + let storage_test_dir = tempdir().unwrap(); + let local_fs_path = storage_test_dir.path().join("usage_metrics"); + fs::create_dir_all(&local_fs_path).unwrap(); + let storage = GenericRemoteStorage::from_config(&RemoteStorageConfig { + storage: RemoteStorageKind::LocalFs { + local_path: local_fs_path.clone(), + }, + timeout: Duration::from_secs(10), + small_timeout: Duration::from_secs(1), + }) + .await + .unwrap(); + + let mut pushed_chunks: Vec = Vec::new(); + let mut stored_chunks: Vec = Vec::new(); + // no counters have been registered - collect_metrics_iteration(&metrics.endpoints, &client, &endpoint, "foo", now, now).await; + collect_metrics_iteration( + &metrics.endpoints, + &client, + &endpoint, 
+ Some(&storage), + 1000, + "foo", + now, + now, + ) + .await; let r = std::mem::take(&mut *reports.lock().unwrap()); assert!(r.is_empty()); @@ -551,39 +507,84 @@ mod tests { }); // the counter should be observed despite 0 egress - collect_metrics_iteration(&metrics.endpoints, &client, &endpoint, "foo", now, now).await; + collect_metrics_iteration( + &metrics.endpoints, + &client, + &endpoint, + Some(&storage), + 1000, + "foo", + now, + now, + ) + .await; let r = std::mem::take(&mut *reports.lock().unwrap()); assert_eq!(r.len(), 1); assert_eq!(r[0].events.len(), 1); assert_eq!(r[0].events[0].value, 0); + pushed_chunks.extend(r); // record egress counter.record_egress(1); // egress should be observered - collect_metrics_iteration(&metrics.endpoints, &client, &endpoint, "foo", now, now).await; + collect_metrics_iteration( + &metrics.endpoints, + &client, + &endpoint, + Some(&storage), + 1000, + "foo", + now, + now, + ) + .await; let r = std::mem::take(&mut *reports.lock().unwrap()); assert_eq!(r.len(), 1); assert_eq!(r[0].events.len(), 1); assert_eq!(r[0].events[0].value, 1); + pushed_chunks.extend(r); // release counter drop(counter); // we do not observe the counter - collect_metrics_iteration(&metrics.endpoints, &client, &endpoint, "foo", now, now).await; + collect_metrics_iteration( + &metrics.endpoints, + &client, + &endpoint, + Some(&storage), + 1000, + "foo", + now, + now, + ) + .await; let r = std::mem::take(&mut *reports.lock().unwrap()); assert!(r.is_empty()); // counter is unregistered assert!(metrics.endpoints.is_empty()); - collect_metrics_backup_iteration(&metrics.backup_endpoints, None, "foo", now, now, 1000) - .await; - assert!(!metrics.backup_endpoints.is_empty()); - collect_metrics_backup_iteration(&metrics.backup_endpoints, None, "foo", now, now, 1000) - .await; - // backup counter is unregistered after the second iteration - assert!(metrics.backup_endpoints.is_empty()); + let path_prefix = create_remote_path_prefix(now); + for entry in 
walkdir::WalkDir::new(&local_fs_path) + .into_iter() + .filter_map(|e| e.ok()) + { + let path = local_fs_path.join(&path_prefix).to_string(); + if entry.path().to_str().unwrap().starts_with(&path) { + let chunk = serde_json::from_reader(flate2::bufread::GzDecoder::new( + BufReader::new(fs::File::open(entry.into_path()).unwrap()), + )) + .unwrap(); + stored_chunks.push(chunk); + } + } + storage_test_dir.close().ok(); + + // sort by first event's idempotency key because the order of files is nondeterministic + pushed_chunks.sort_by_cached_key(|c| c.events[0].idempotency_key.clone()); + stored_chunks.sort_by_cached_key(|c| c.events[0].idempotency_key.clone()); + assert_eq!(pushed_chunks, stored_chunks); } } diff --git a/pyproject.toml b/pyproject.toml index ccd3ab1864..01d15ee6bb 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -33,7 +33,7 @@ types-psutil = "^5.9.5.12" types-toml = "^0.10.8.6" pytest-httpserver = "^1.0.8" aiohttp = "3.10.11" -pytest-rerunfailures = "^13.0" +pytest-rerunfailures = "^15.0" types-pytest-lazy-fixture = "^0.6.3.3" pytest-split = "^0.8.1" zstandard = "^0.21.0" diff --git a/rust-toolchain.toml b/rust-toolchain.toml index 92b7929c7f..f0661a32e0 100644 --- a/rust-toolchain.toml +++ b/rust-toolchain.toml @@ -1,5 +1,5 @@ [toolchain] -channel = "1.82.0" +channel = "1.83.0" profile = "default" # The default profile includes rustc, rust-std, cargo, rust-docs, rustfmt and clippy. 
# https://rust-lang.github.io/rustup/concepts/profiles.html diff --git a/safekeeper/Cargo.toml b/safekeeper/Cargo.toml index 635a9222e1..0422c46ab1 100644 --- a/safekeeper/Cargo.toml +++ b/safekeeper/Cargo.toml @@ -41,6 +41,7 @@ serde_json.workspace = true strum.workspace = true strum_macros.workspace = true thiserror.workspace = true +tikv-jemallocator.workspace = true tokio = { workspace = true, features = ["fs"] } tokio-util = { workspace = true } tokio-io-timeout.workspace = true diff --git a/safekeeper/benches/receive_wal.rs b/safekeeper/benches/receive_wal.rs index c637b4fb24..313d945b94 100644 --- a/safekeeper/benches/receive_wal.rs +++ b/safekeeper/benches/receive_wal.rs @@ -6,6 +6,7 @@ mod benchutils; use std::io::Write as _; use benchutils::Env; +use bytes::BytesMut; use camino_tempfile::tempfile; use criterion::{criterion_group, criterion_main, BatchSize, Bencher, Criterion}; use itertools::Itertools as _; @@ -23,6 +24,15 @@ const KB: usize = 1024; const MB: usize = 1024 * KB; const GB: usize = 1024 * MB; +/// Use jemalloc, and configure it to sample allocations for profiles every 1 MB. +/// This mirrors the configuration in bin/safekeeper.rs. +#[global_allocator] +static GLOBAL: tikv_jemallocator::Jemalloc = tikv_jemallocator::Jemalloc; + +#[allow(non_upper_case_globals)] +#[export_name = "malloc_conf"] +pub static malloc_conf: &[u8] = b"prof:true,prof_active:true,lg_prof_sample:20\0"; + // Register benchmarks with Criterion. criterion_group!( name = benches; @@ -30,7 +40,8 @@ criterion_group!( targets = bench_process_msg, bench_wal_acceptor, bench_wal_acceptor_throughput, - bench_file_write + bench_file_write, + bench_bytes_reserve, ); criterion_main!(benches); @@ -341,3 +352,26 @@ fn bench_file_write(c: &mut Criterion) { Ok(()) } } + +/// Benchmarks the cost of memory allocations when receiving WAL messages. This emulates the logic +/// in FeMessage::parse, which extends the read buffer. It is primarily intended to test jemalloc. 
+fn bench_bytes_reserve(c: &mut Criterion) { + let mut g = c.benchmark_group("bytes_reserve"); + for size in [1, 64, KB, 8 * KB, 128 * KB] { + g.throughput(criterion::Throughput::Bytes(size as u64)); + g.bench_function(format!("size={size}"), |b| run_bench(b, size).unwrap()); + } + + fn run_bench(b: &mut Bencher, size: usize) -> anyhow::Result<()> { + let mut bytes = BytesMut::new(); + let data = vec![0; size]; + + b.iter(|| { + bytes.reserve(size); + bytes.extend_from_slice(&data); + bytes.split_to(size).freeze(); + }); + + Ok(()) + } +} diff --git a/safekeeper/spec/.gitignore b/safekeeper/spec/.gitignore new file mode 100644 index 0000000000..7233153039 --- /dev/null +++ b/safekeeper/spec/.gitignore @@ -0,0 +1,3 @@ +*TTrace* +*.toolbox/ +states/ diff --git a/safekeeper/spec/MCProposerAcceptorStatic.tla b/safekeeper/spec/MCProposerAcceptorStatic.tla new file mode 100644 index 0000000000..be3d99c697 --- /dev/null +++ b/safekeeper/spec/MCProposerAcceptorStatic.tla @@ -0,0 +1,31 @@ +---- MODULE MCProposerAcceptorStatic ---- +EXTENDS TLC, ProposerAcceptorStatic + +\* Augments the spec with model checking constraints. + +\* For model checking. +CONSTANTS + max_entries, \* model constraint: max log entries acceptor/proposer can hold + max_term \* model constraint: max allowed term + +ASSUME max_entries \in Nat /\ max_term \in Nat + +\* Model space constraint. +StateConstraint == \A p \in proposers: + /\ prop_state[p].term <= max_term + /\ Len(prop_state[p].wal) <= max_entries +\* Sets of proposers and acceptors are symmetric because we don't take any +\* actions depending on some concrete proposer/acceptor (like IF p = p1 THEN +\* ...) +ProposerAcceptorSymmetry == Permutations(proposers) \union Permutations(acceptors) + +\* enforce order of the vars in the error trace with ALIAS +\* Note that ALIAS is supported only since version 1.8.0 which is pre-release +\* as of writing this. 
+Alias == [ + prop_state |-> prop_state, + acc_state |-> acc_state, + committed |-> committed + ] + +==== diff --git a/safekeeper/spec/ProposerAcceptorConsensus.cfg b/safekeeper/spec/ProposerAcceptorConsensus.cfg deleted file mode 100644 index 989c86e47d..0000000000 --- a/safekeeper/spec/ProposerAcceptorConsensus.cfg +++ /dev/null @@ -1,34 +0,0 @@ -\* MV CONSTANT declarations -CONSTANT NULL = NULL -CONSTANTS -p1 = p1 -p2 = p2 -p3 = p3 -a1 = a1 -a2 = a2 -a3 = a3 -\* MV CONSTANT definitions -CONSTANT -proposers = {p1, p2} -acceptors = {a1, a2, a3} -\* SYMMETRY definition -SYMMETRY perms -\* CONSTANT definitions -CONSTANT -max_term = 3 -CONSTANT -max_entries = 3 -\* INIT definition -INIT -Init -\* NEXT definition -NEXT -Next -\* INVARIANT definition -INVARIANT -TypeOk -ElectionSafety -LogIsMonotonic -LogSafety -CommittedNotOverwritten -CHECK_DEADLOCK FALSE \ No newline at end of file diff --git a/safekeeper/spec/ProposerAcceptorConsensus.tla b/safekeeper/spec/ProposerAcceptorConsensus.tla deleted file mode 100644 index e5f0bb270f..0000000000 --- a/safekeeper/spec/ProposerAcceptorConsensus.tla +++ /dev/null @@ -1,363 +0,0 @@ ----- MODULE ProposerAcceptorConsensus ---- - -\* Differences from current implementation: -\* - unified not-globally-unique epoch & term (node_id) -\* Simplifications: -\* - instant message delivery -\* - feedback is not modeled separately, commit_lsn is updated directly - -EXTENDS Integers, Sequences, FiniteSets, TLC - -VARIABLES - prop_state, \* prop_state[p] is state of proposer p - acc_state, \* acc_state[a] is state of acceptor a - commit_lsns \* map of acceptor -> commit_lsn - -CONSTANT - acceptors, - proposers, - max_entries, \* model constraint: max log entries acceptor/proposer can hold - max_term \* model constraint: max allowed term - -CONSTANT NULL - -ASSUME max_entries \in Nat /\ max_term \in Nat - -\* For specifying symmetry set in manual cfg file, see -\* https://github.com/tlaplus/tlaplus/issues/404 -perms == 
Permutations(proposers) \union Permutations(acceptors) - -\******************************************************************************** -\* Helpers -\******************************************************************************** - -Maximum(S) == - (*************************************************************************) - (* If S is a set of numbers, then this define Maximum(S) to be the *) - (* maximum of those numbers, or -1 if S is empty. *) - (*************************************************************************) - IF S = {} THEN -1 - ELSE CHOOSE n \in S : \A m \in S : n \geq m - -\* minimum of numbers in the set, error if set is empty -Minimum(S) == - CHOOSE min \in S : \A n \in S : min <= n - -\* Min of two numbers -Min(a, b) == IF a < b THEN a ELSE b - -\* Set of values of function f. XXX is there a such builtin? -FValues(f) == {f[a] : a \in DOMAIN f} - -\* Sort of 0 for functions -EmptyF == [x \in {} |-> 42] -IsEmptyF(f) == DOMAIN f = {} - -\* Next entry proposer p will push to acceptor a or NULL. -NextEntry(p, a) == - IF Len(prop_state[p].wal) >= prop_state[p].next_send_lsn[a] THEN - CHOOSE r \in FValues(prop_state[p].wal) : r.lsn = prop_state[p].next_send_lsn[a] - ELSE - NULL - - -\***************** - -NumAccs == Cardinality(acceptors) - -\* does acc_set form the quorum? -Quorum(acc_set) == Cardinality(acc_set) >= (NumAccs \div 2 + 1) -\* all quorums of acceptors -Quorums == {subset \in SUBSET acceptors: Quorum(subset)} - -\* flush_lsn of acceptor a. 
-FlushLsn(a) == Len(acc_state[a].wal) - - -\******************************************************************************** -\* Type assertion -\******************************************************************************** -\* Defining sets of all possible tuples and using them in TypeOk in usual -\* all-tuples constructor is not practical because such definitions force -\* TLC to enumerate them, while they are are horribly enormous -\* (TLC screams "Attempted to construct a set with too many elements"). -\* So instead check types manually. -TypeOk == - /\ \A p \in proposers: - /\ DOMAIN prop_state[p] = {"state", "term", "votes", "donor_epoch", "vcl", "wal", "next_send_lsn"} - \* in campaign proposer sends RequestVote and waits for acks; - \* in leader he is elected - /\ prop_state[p].state \in {"campaign", "leader"} - \* 0..max_term should be actually Nat in the unbounded model, but TLC won't - \* swallow it - /\ prop_state[p].term \in 0..max_term - \* votes received - /\ \A voter \in DOMAIN prop_state[p].votes: - /\ voter \in acceptors - /\ prop_state[p].votes[voter] \in [epoch: 0..max_term, flush_lsn: 0..max_entries] - /\ prop_state[p].donor_epoch \in 0..max_term - \* wal is sequence of just records - /\ \A i \in DOMAIN prop_state[p].wal: - prop_state[p].wal[i] \in [lsn: 1..max_entries, epoch: 1..max_term] - \* Following implementation, we skew the original Aurora meaning of this; - \* here it is lsn of highest definitely committed record as set by proposer - \* when it is elected; it doesn't change since then - /\ prop_state[p].vcl \in 0..max_entries - \* map of acceptor -> next lsn to send - /\ \A a \in DOMAIN prop_state[p].next_send_lsn: - /\ a \in acceptors - /\ prop_state[p].next_send_lsn[a] \in 1..(max_entries + 1) - /\ \A a \in acceptors: - /\ DOMAIN acc_state[a] = {"term", "epoch", "wal"} - /\ acc_state[a].term \in 0..max_term - /\ acc_state[a].epoch \in 0..max_term - /\ \A i \in DOMAIN acc_state[a].wal: - acc_state[a].wal[i] \in [lsn: 
1..max_entries, epoch: 1..max_term] - /\ \A a \in DOMAIN commit_lsns: - /\ a \in acceptors - /\ commit_lsns[a] \in 0..max_entries - -\******************************************************************************** -\* Initial -\******************************************************************************** - -Init == - /\ prop_state = [p \in proposers |-> [ - state |-> "campaign", - term |-> 1, - votes |-> EmptyF, - donor_epoch |-> 0, - vcl |-> 0, - wal |-> << >>, - next_send_lsn |-> EmptyF - ]] - /\ acc_state = [a \in acceptors |-> [ - \* there will be no leader in this term, 1 is the first real - term |-> 0, - epoch |-> 0, - wal |-> << >> - ]] - /\ commit_lsns = [a \in acceptors |-> 0] - - -\******************************************************************************** -\* Actions -\******************************************************************************** - -\* Proposer loses all state. -\* For simplicity (and to reduct state space), we assume it immediately gets -\* current state from quorum q of acceptors determining the term he will request -\* to vote for. -RestartProposer(p, q) == - /\ Quorum(q) - /\ LET - new_term == Maximum({acc_state[a].term : a \in q}) + 1 - IN - /\ new_term <= max_term - /\ prop_state' = [prop_state EXCEPT ![p].state = "campaign", - ![p].term = new_term, - ![p].votes = EmptyF, - ![p].donor_epoch = 0, - ![p].vcl = 0, - ![p].wal = << >>, - ![p].next_send_lsn = EmptyF] - /\ UNCHANGED <> - -\* Acceptor a immediately votes for proposer p. -Vote(p, a) == - /\ prop_state[p].state = "campaign" - /\ acc_state[a].term < prop_state[p].term \* main voting condition - /\ acc_state' = [acc_state EXCEPT ![a].term = prop_state[p].term] - /\ LET - vote == [epoch |-> acc_state[a].epoch, flush_lsn |-> FlushLsn(a)] - IN - prop_state' = [prop_state EXCEPT ![p].votes = prop_state[p].votes @@ (a :> vote)] - /\ UNCHANGED <> - - -\* Proposer p gets elected. 
-BecomeLeader(p) == - /\ prop_state[p].state = "campaign" - /\ Quorum(DOMAIN prop_state[p].votes) - /\ LET - max_epoch == Maximum({v.epoch : v \in FValues(prop_state[p].votes)}) - max_epoch_votes == {v \in FValues(prop_state[p].votes) : v.epoch = max_epoch} - donor == CHOOSE dv \in DOMAIN prop_state[p].votes : - /\ prop_state[p].votes[dv].epoch = max_epoch - /\ \A v \in max_epoch_votes: - prop_state[p].votes[dv].flush_lsn >= v.flush_lsn - max_vote == prop_state[p].votes[donor] - \* Establish lsn to stream from for voters. - \* At some point it seemed like we can regard log as correct and only - \* append to it if has in the max_epoch, however TLC showed that's not - \* the case; we must always stream since first not matching record. - next_send_lsn == [voter \in DOMAIN prop_state[p].votes |-> 1] - IN - \* we fetch log from the most advanced node (this is separate - \* roundtrip), make sure node is still on one term with us - /\ acc_state[donor].term = prop_state[p].term - /\ prop_state' = [prop_state EXCEPT ![p].state = "leader", - \* fetch the log from donor - ![p].wal = acc_state[donor].wal, - ![p].donor_epoch = max_epoch, - ![p].vcl = max_vote.flush_lsn, - ![p].next_send_lsn = next_send_lsn] - /\ UNCHANGED <> - - -\* acceptor a learns about elected proposer p's term. 
-UpdateTerm(p, a) == - /\ prop_state[p].state = "leader" - /\ acc_state[a].term < prop_state[p].term - /\ acc_state' = [acc_state EXCEPT ![a].term = prop_state[p].term] - /\ UNCHANGED <> - - -\* Acceptor a which didn't participate in voting connects to elected proposer p -\* and p sets the streaming point -HandshakeWithLeader(p, a) == - /\ prop_state[p].state = "leader" - /\ acc_state[a].term = prop_state[p].term - /\ a \notin DOMAIN prop_state[p].next_send_lsn - /\ LET - next_send_lsn == prop_state[p].next_send_lsn @@ (a :> 1) - IN - prop_state' = [prop_state EXCEPT ![p].next_send_lsn = next_send_lsn] - /\ UNCHANGED <> - - -\* Append new log entry to elected proposer -NewEntry(p) == - /\ prop_state[p].state = "leader" - /\ Len(prop_state[p].wal) < max_entries \* model constraint - /\ LET - new_lsn == IF Len(prop_state[p].wal) = 0 THEN - prop_state[p].vcl + 1 - ELSE - \* lsn of last record + 1 - prop_state[p].wal[Len(prop_state[p].wal)].lsn + 1 - new_entry == [lsn |-> new_lsn, epoch |-> prop_state[p].term] - IN - /\ prop_state' = [prop_state EXCEPT ![p].wal = Append(prop_state[p].wal, new_entry)] - /\ UNCHANGED <> - - -\* Write entry new_e to log wal, rolling back all higher entries if e is different. -\* If bump_epoch is TRUE, it means we get record with lsn=vcl and going to update -\* the epoch. Truncate log in this case as well, as we might have correct <= vcl -\* part and some outdated entries behind it which we want to purge before -\* declaring us as recovered. Another way to accomplish this (in previous commit) -\* is wait for first-entry-from-new-epoch before bumping it. -WriteEntry(wal, new_e, bump_epoch) == - (new_e.lsn :> new_e) @@ - \* If wal has entry with such lsn and it is different, truncate all higher log. 
- IF \/ (new_e.lsn \in DOMAIN wal /\ wal[new_e.lsn] /= new_e) - \/ bump_epoch THEN - SelectSeq(wal, LAMBDA e: e.lsn < new_e.lsn) - ELSE - wal - - -\* Try to transfer entry from elected proposer p to acceptor a -TransferEntry(p, a) == - /\ prop_state[p].state = "leader" - /\ prop_state[p].term = acc_state[a].term - /\ a \in DOMAIN prop_state[p].next_send_lsn - /\ LET - next_e == NextEntry(p, a) - IN - /\ next_e /= NULL - /\ LET - \* Consider bumping epoch if getting this entry recovers the acceptor, - \* that is, we reach first record behind VCL. - new_epoch == - IF /\ acc_state[a].epoch < prop_state[p].term - /\ next_e.lsn >= prop_state[p].vcl - THEN - prop_state[p].term - ELSE - acc_state[a].epoch - \* Also check whether this entry allows to advance commit_lsn and - \* if so, bump it where possible. Modeling this as separate action - \* significantly bloats the space (5m vs 15m on max_entries=3 max_term=3, - \* so act immediately. - entry_owners == {o \in acceptors: - /\ o /= a - \* only recovered acceptors advance commit_lsn - /\ acc_state[o].epoch = prop_state[p].term - /\ next_e \in FValues(acc_state[o].wal)} \cup {a} - IN - /\ acc_state' = [acc_state EXCEPT ![a].wal = WriteEntry(acc_state[a].wal, next_e, new_epoch /= acc_state[a].epoch), - ![a].epoch = new_epoch] - /\ prop_state' = [prop_state EXCEPT ![p].next_send_lsn[a] = - prop_state[p].next_send_lsn[a] + 1] - /\ commit_lsns' = IF /\ new_epoch = prop_state[p].term - /\ Quorum(entry_owners) - THEN - [acc \in acceptors |-> - IF /\ acc \in entry_owners - /\ next_e.lsn > commit_lsns[acc] - THEN - next_e.lsn - ELSE - commit_lsns[acc]] - ELSE - commit_lsns - - -\******************************************************************************* -\* Final spec -\******************************************************************************* - -Next == - \/ \E q \in Quorums: \E p \in proposers: RestartProposer(p, q) - \/ \E p \in proposers: \E a \in acceptors: Vote(p, a) - \/ \E p \in proposers: BecomeLeader(p) - \/ \E 
p \in proposers: \E a \in acceptors: UpdateTerm(p, a) - \/ \E p \in proposers: \E a \in acceptors: HandshakeWithLeader(p, a) - \/ \E p \in proposers: NewEntry(p) - \/ \E p \in proposers: \E a \in acceptors: TransferEntry(p, a) - -Spec == Init /\ [][Next]_<> - - -\******************************************************************************** -\* Invariants -\******************************************************************************** - -\* we don't track history, but this property is fairly convincing anyway -ElectionSafety == - \A p1, p2 \in proposers: - (/\ prop_state[p1].state = "leader" - /\ prop_state[p2].state = "leader" - /\ prop_state[p1].term = prop_state[p2].term) => (p1 = p2) - -LogIsMonotonic == - \A a \in acceptors: - \A i \in DOMAIN acc_state[a].wal: \A j \in DOMAIN acc_state[a].wal: - (i > j) => (/\ acc_state[a].wal[i].lsn > acc_state[a].wal[j].lsn - /\ acc_state[a].wal[i].epoch >= acc_state[a].wal[j].epoch) - -\* Main invariant: log under commit_lsn must match everywhere. -LogSafety == - \A a1 \in acceptors: \A a2 \in acceptors: - LET - common_len == Min(commit_lsns[a1], commit_lsns[a2]) - IN - SubSeq(acc_state[a1].wal, 1, common_len) = SubSeq(acc_state[a2].wal, 1, common_len) - -\* Next record we are going to push to acceptor must never overwrite committed -\* different record. 
-CommittedNotOverwritten == - \A p \in proposers: \A a \in acceptors: - (/\ prop_state[p].state = "leader" - /\ prop_state[p].term = acc_state[a].term - /\ a \in DOMAIN prop_state[p].next_send_lsn) => - LET - next_e == NextEntry(p, a) - IN - (next_e /= NULL) => - ((commit_lsns[a] >= next_e.lsn) => (acc_state[a].wal[next_e.lsn] = next_e)) - - -==== \ No newline at end of file diff --git a/safekeeper/spec/ProposerAcceptorStatic.tla b/safekeeper/spec/ProposerAcceptorStatic.tla new file mode 100644 index 0000000000..b2d2f005db --- /dev/null +++ b/safekeeper/spec/ProposerAcceptorStatic.tla @@ -0,0 +1,449 @@ +---- MODULE ProposerAcceptorStatic ---- + +(* + The protocol is very similar to Raft. The key differences are: + - Leaders (proposers) are separated from storage nodes (acceptors), which has + been already an established way to think about Paxos. + - We don't want to stamp each log record with term, so instead carry around + term histories which are sequences of pairs. + As a bonus (and subtlety) this allows the proposer to commit entries from + previous terms without writing new records -- if acceptor's log is caught + up, update of term history on it updates last_log_term as well. +*) + +\* Model simplifications: +\* - Instant message delivery. Notably, ProposerElected message (TruncateWal action) is not +\* delayed, so we don't attempt to truncate WAL when the same wp already appended something +\* on the acceptor since common point had been calculated (this should be rejected). +\* - old WAL is immediately copied to proposer on its election, without on-demand fetch later. + +\* Some ideas how to break it to play around to get a feeling: +\* - replace Quorums with BadQuorums. +\* - remove 'don't commit entries from previous terms separately' rule in +\* CommitEntries and observe figure 8 from the raft paper. +\* With p2a3t4l4 32 steps error was found in 1h on 80 cores. 
+ +EXTENDS Integers, Sequences, FiniteSets, TLC + +VARIABLES + prop_state, \* prop_state[p] is state of proposer p + acc_state, \* acc_state[a] is state of acceptor a + committed, \* bag (set) of ever committed <> entries + elected_history \* counter for elected terms, see TypeOk for details + +CONSTANT + acceptors, + proposers + +CONSTANT NULL + +\******************************************************************************** +\* Helpers +\******************************************************************************** + +Maximum(S) == + (*************************************************************************) + (* If S is a set of numbers, then this define Maximum(S) to be the *) + (* maximum of those numbers, or -1 if S is empty. *) + (*************************************************************************) + IF S = {} THEN -1 ELSE CHOOSE n \in S : \A m \in S : n \geq m + +\* minimum of numbers in the set, error if set is empty +Minimum(S) == CHOOSE min \in S : \A n \in S : min <= n + +\* Min of two numbers +Min(a, b) == IF a < b THEN a ELSE b + +\* Sort of 0 for functions +EmptyF == [x \in {} |-> 42] +IsEmptyF(f) == DOMAIN f = {} + +\* Set of values (image) of the function f. Apparently no such builtin. +Range(f) == {f[x] : x \in DOMAIN f} + +\* If key k is in function f, map it using l, otherwise insert v. Returns the +\* updated function. +Upsert(f, k, v, l(_)) == + LET new_val == IF k \in DOMAIN f THEN l(f[k]) ELSE v IN + (k :> new_val) @@ f + +\***************** + +NumAccs == Cardinality(acceptors) + +\* does acc_set form the quorum? +Quorum(acc_set) == Cardinality(acc_set) >= (NumAccs \div 2 + 1) +\* all quorums of acceptors +Quorums == {subset \in SUBSET acceptors: Quorum(subset)} + +\* For substituting Quorums and seeing what happens. +BadQuorum(acc_set) == Cardinality(acc_set) >= (NumAccs \div 2) +BadQuorums == {subset \in SUBSET acceptors: BadQuorum(subset)} + +\* flushLsn (end of WAL, i.e. index of next entry) of acceptor a. 
+FlushLsn(a) == Len(acc_state[a].wal) + 1 + +\* Typedefs. Note that TLA+ Nat includes zero. +Terms == Nat +Lsns == Nat + +\******************************************************************************** +\* Type assertion +\******************************************************************************** +\* Defining sets of all possible tuples and using them in TypeOk in usual +\* all-tuples constructor is not practical because such definitions force +\* TLC to enumerate them, while they are horribly enormous +\* (TLC screams "Attempted to construct a set with too many elements"). +\* So instead check types manually. + + +\* Term history is a sequence of <term, lsn> pairs. +IsTermHistory(th) == + \A th_entry \in Range(th): th_entry.term \in Terms /\ th_entry.lsn \in Lsns + +IsWal(w) == + \A i \in DOMAIN w: + /\ i \in Lsns + /\ w[i] \in Terms + +TypeOk == + /\ \A p \in proposers: + \* '_' in field names hinders pretty printing + \* https://github.com/tlaplus/tlaplus/issues/1051 + \* so use camel case. + /\ DOMAIN prop_state[p] = {"state", "term", "votes", "termHistory", "wal", "nextSendLsn"} + \* In campaign proposer sends RequestVote and waits for acks; + \* in leader he is elected. + /\ prop_state[p].state \in {"campaign", "leader"} + \* term for which it will campaign, or won term in leader state + /\ prop_state[p].term \in Terms + \* votes received + /\ \A voter \in DOMAIN prop_state[p].votes: voter \in acceptors + /\ \A vote \in Range(prop_state[p].votes): + /\ IsTermHistory(vote.termHistory) + /\ vote.flushLsn \in Lsns + \* Proposer's term history. Empty while proposer is in "campaign". + /\ IsTermHistory(prop_state[p].termHistory) + \* In the model we identify WAL entries only by <term, lsn> pairs + \* without additional unique id, which is enough for its purposes. + \* It means that with term history fully modeled wal becomes + \* redundant as it can be computed from term history + WAL length.
+ \* However, we still keep it here and at acceptors as explicit sequence + \* where index is LSN and value is the term to avoid artificial mapping to + \* figure out real entries. It shouldn't bloat model much because this + \* doesn't increase number of distinct states. + /\ IsWal(prop_state[p].wal) + \* Map of acceptor -> next lsn to send. It is set when truncate_wal is + \* done so sending entries is allowed only after that. In the impl TCP + \* ensures this ordering. + /\ \A a \in DOMAIN prop_state[p].nextSendLsn: + /\ a \in acceptors + /\ prop_state[p].nextSendLsn[a] \in Lsns + /\ \A a \in acceptors: + /\ DOMAIN acc_state[a] = {"term", "termHistory", "wal"} + /\ acc_state[a].term \in Terms + /\ IsTermHistory(acc_state[a].termHistory) + /\ IsWal(acc_state[a].wal) + /\ \A c \in committed: + /\ c.term \in Terms + /\ c.lsn \in Lsns + \* elected_history is a retrospective map of term -> number of times it was + \* elected, for use in ElectionSafetyFull invariant. For static spec it is + \* fairly convincing that it holds, but with membership change it is less + \* trivial. And as we identify log entries only with <term, lsn>, importance + \* of it is quite high as violation of log safety might go undetected if + \* election safety is violated. Note though that this is not always the + \* case, i.e. you can imagine (and TLC should find) schedule where log + \* safety violation is still detected because two leaders with the same term + \* commit histories which are different in previous terms, so it is not that + \* crucial. Plus if spec allows ElectionSafetyFull violation, likely + \* ElectionSafety will also be violated in some schedules. But it + \* shouldn't bloat the model too much either.
+ /\ \A term \in DOMAIN elected_history: + /\ term \in Terms + /\ elected_history[term] \in Nat + +\******************************************************************************** +\* Initial +\******************************************************************************** + +Init == + /\ prop_state = [p \in proposers |-> [ + state |-> "campaign", + term |-> 1, + votes |-> EmptyF, + termHistory |-> << >>, + wal |-> << >>, + nextSendLsn |-> EmptyF + ]] + /\ acc_state = [a \in acceptors |-> [ + \* There will be no leader in zero term, 1 is the first + \* real. + term |-> 0, + \* Again, leader in term 0 doesn't exist, but we initialize + \* term histories with it to always have common point in + \* them. Lsn is 1 because TLA+ sequences are indexed from 1 + \* (we don't want to truncate WAL out of range). + termHistory |-> << [term |-> 0, lsn |-> 1] >>, + wal |-> << >> + ]] + /\ committed = {} + /\ elected_history = EmptyF + + +\******************************************************************************** +\* Actions +\******************************************************************************** + +\* Proposer loses all state. +\* For simplicity (and to reduce state space), we assume it immediately gets +\* current state from quorum q of acceptors determining the term he will request +\* to vote for. +RestartProposer(p, q) == + /\ Quorum(q) + /\ LET new_term == Maximum({acc_state[a].term : a \in q}) + 1 IN + /\ prop_state' = [prop_state EXCEPT ![p].state = "campaign", + ![p].term = new_term, + ![p].votes = EmptyF, + ![p].termHistory = << >>, + ![p].wal = << >>, + ![p].nextSendLsn = EmptyF] + /\ UNCHANGED <> + +\* Term history of acceptor a's WAL: the one saved truncated to contain only <= +\* local FlushLsn entries. +AcceptorTermHistory(a) == + SelectSeq(acc_state[a].termHistory, LAMBDA th_entry: th_entry.lsn <= FlushLsn(a)) + +\* Acceptor a immediately votes for proposer p.
+Vote(p, a) == + /\ prop_state[p].state = "campaign" + /\ acc_state[a].term < prop_state[p].term \* main voting condition + /\ acc_state' = [acc_state EXCEPT ![a].term = prop_state[p].term] + /\ LET + vote == [termHistory |-> AcceptorTermHistory(a), flushLsn |-> FlushLsn(a)] + IN + prop_state' = [prop_state EXCEPT ![p].votes = (a :> vote) @@ prop_state[p].votes] + /\ UNCHANGED <> + + +\* Get lastLogTerm from term history th. +LastLogTerm(th) == th[Len(th)].term + +\* Proposer p gets elected. +BecomeLeader(p) == + /\ prop_state[p].state = "campaign" + /\ Quorum(DOMAIN prop_state[p].votes) + /\ LET + \* Find acceptor with the highest vote. + max_vote_acc == + CHOOSE a \in DOMAIN prop_state[p].votes: + LET v == prop_state[p].votes[a] + IN \A v2 \in Range(prop_state[p].votes): + /\ LastLogTerm(v.termHistory) >= LastLogTerm(v2.termHistory) + /\ (LastLogTerm(v.termHistory) = LastLogTerm(v2.termHistory) => v.flushLsn >= v2.flushLsn) + max_vote == prop_state[p].votes[max_vote_acc] + prop_th == Append(max_vote.termHistory, [term |-> prop_state[p].term, lsn |-> max_vote.flushLsn]) + IN + \* We copy all log preceding proposer's term from the max vote node so + \* make sure it is still on one term with us. This is a model + \* simplification which can be removed, in impl we fetch WAL on demand + \* from safekeeper which has it later. Note though that in case of on + \* demand fetch we must check on donor not only term match, but that + \* truncate_wal had already been done (if it is not max_vote_acc). + /\ acc_state[max_vote_acc].term = prop_state[p].term + /\ prop_state' = [prop_state EXCEPT ![p].state = "leader", + ![p].termHistory = prop_th, + ![p].wal = acc_state[max_vote_acc].wal + ] + /\ elected_history' = Upsert(elected_history, prop_state[p].term, 1, LAMBDA c: c + 1) + /\ UNCHANGED <> + + +\* Acceptor a learns about elected proposer p's term. 
In impl it corresponds to the +\* VoteRequest/VoteResponse exchange when leader is already elected and is not +\* interested in the vote result. +UpdateTerm(p, a) == + /\ prop_state[p].state = "leader" + /\ acc_state[a].term < prop_state[p].term + /\ acc_state' = [acc_state EXCEPT ![a].term = prop_state[p].term] + /\ UNCHANGED <> + +\* Find highest common point (LSN of the first divergent record) in the logs of +\* proposer p and acceptor a. Returns <term, lsn> of the highest common point. +FindHighestCommonPoint(prop_th, acc_th, acc_flush_lsn) == + LET + \* First find index of the highest common term. + \* It must exist because we initialize th with <0, 1>. + last_common_idx == Maximum({i \in 1..Min(Len(prop_th), Len(acc_th)): prop_th[i].term = acc_th[i].term}) + last_common_term == prop_th[last_common_idx].term + \* Now find where it ends at both prop and acc and take min. End of term + \* is the start of the next unless it is the last one; there it is + \* flush_lsn in case of acceptor. In case of proposer it is the current + \* writing position, but it can't be less than flush_lsn, so we + \* take flush_lsn. + acc_common_term_end == IF last_common_idx = Len(acc_th) THEN acc_flush_lsn ELSE acc_th[last_common_idx + 1].lsn + prop_common_term_end == IF last_common_idx = Len(prop_th) THEN acc_flush_lsn ELSE prop_th[last_common_idx + 1].lsn + IN + [term |-> last_common_term, lsn |-> Min(acc_common_term_end, prop_common_term_end)] + +\* Elected proposer p immediately truncates WAL (and term history) of acceptor a +\* before starting streaming. Establishes nextSendLsn for a. +\* +\* In impl this happens at each reconnection, here we also allow to do it multiple times.
+TruncateWal(p, a) == + /\ prop_state[p].state = "leader" + /\ acc_state[a].term = prop_state[p].term + /\ LET + hcp == FindHighestCommonPoint(prop_state[p].termHistory, AcceptorTermHistory(a), FlushLsn(a)) + next_send_lsn == (a :> hcp.lsn) @@ prop_state[p].nextSendLsn + IN + \* Acceptor persists full history immediately; reads adjust it to the + \* really existing wal with AcceptorTermHistory. + /\ acc_state' = [acc_state EXCEPT ![a].termHistory = prop_state[p].termHistory, + \* note: SubSeq is inclusive, hence -1. + ![a].wal = SubSeq(acc_state[a].wal, 1, hcp.lsn - 1) + ] + /\ prop_state' = [prop_state EXCEPT ![p].nextSendLsn = next_send_lsn] + /\ UNCHANGED <> + +\* Append new log entry to elected proposer +NewEntry(p) == + /\ prop_state[p].state = "leader" + /\ LET + \* entry consists only of term, index serves as LSN. + new_entry == prop_state[p].term + IN + /\ prop_state' = [prop_state EXCEPT ![p].wal = Append(prop_state[p].wal, new_entry)] + /\ UNCHANGED <> + +\* Immediately append next entry from elected proposer to acceptor a. +AppendEntry(p, a) == + /\ prop_state[p].state = "leader" + /\ acc_state[a].term = prop_state[p].term + /\ a \in DOMAIN prop_state[p].nextSendLsn \* did TruncateWal + /\ prop_state[p].nextSendLsn[a] <= Len(prop_state[p].wal) \* have smth to send + /\ LET + send_lsn == prop_state[p].nextSendLsn[a] + entry == prop_state[p].wal[send_lsn] + \* Since message delivery is instant we don't check that send_lsn follows + \* the last acc record, it must always be true. + IN + /\ prop_state' = [prop_state EXCEPT ![p].nextSendLsn[a] = send_lsn + 1] + /\ acc_state' = [acc_state EXCEPT ![a].wal = Append(acc_state[a].wal, entry)] + /\ UNCHANGED <> + +\* LSN where elected proposer p starts writing its records. +PropStartLsn(p) == + IF prop_state[p].state = "leader" THEN prop_state[p].termHistory[Len(prop_state[p].termHistory)].lsn ELSE NULL + +\* Proposer p commits all entries it can using quorum q. 
Note that unlike +\* will62794/logless-reconfig this allows to commit entries from previous terms +\* (when conditions for that are met). +CommitEntries(p, q) == + /\ prop_state[p].state = "leader" + /\ \A a \in q: + /\ acc_state[a].term = prop_state[p].term + \* nextSendLsn existence means TruncateWal has happened, it ensures + \* acceptor's WAL (and FlushLsn) are from proper proposer's history. + \* Alternatively we could compare LastLogTerm here, but that's closer to + \* what we do in the impl (we check flushLsn in AppendResponse, but + \* AppendRequest is processed only if HandleElected handling was good). + /\ a \in DOMAIN prop_state[p].nextSendLsn + \* Now find the LSN present on all the quorum. + /\ LET quorum_lsn == Minimum({FlushLsn(a): a \in q}) IN + \* This is the basic Raft rule of not committing entries from previous + \* terms except along with current term entry (commit them only when + \* quorum recovers, i.e. last_log_term on it reaches leader's term). + /\ quorum_lsn >= PropStartLsn(p) + /\ committed' = committed \cup {[term |-> prop_state[p].wal[lsn], lsn |-> lsn]: lsn \in 1..(quorum_lsn - 1)} + /\ UNCHANGED <> + +\******************************************************************************* +\* Final spec +\******************************************************************************* + +Next == + \/ \E q \in Quorums: \E p \in proposers: RestartProposer(p, q) + \/ \E p \in proposers: \E a \in acceptors: Vote(p, a) + \/ \E p \in proposers: BecomeLeader(p) + \/ \E p \in proposers: \E a \in acceptors: UpdateTerm(p, a) + \/ \E p \in proposers: \E a \in acceptors: TruncateWal(p, a) + \/ \E p \in proposers: NewEntry(p) + \/ \E p \in proposers: \E a \in acceptors: AppendEntry(p, a) + \/ \E q \in Quorums: \E p \in proposers: CommitEntries(p, q) + +Spec == Init /\ [][Next]_<> + + +\******************************************************************************** +\* Invariants 
+\******************************************************************************** + +\* Lighter version of ElectionSafetyFull which doesn't require elected_history. +ElectionSafety == + \A p1, p2 \in proposers: + (/\ prop_state[p1].state = "leader" + /\ prop_state[p2].state = "leader" + /\ prop_state[p1].term = prop_state[p2].term) => (p1 = p2) + +\* Single term must never be elected more than once. +ElectionSafetyFull == \A term \in DOMAIN elected_history: elected_history[term] <= 1 + +\* Log is expected to be monotonic by <term, lsn> comparison. This is not true +\* in variants of multi Paxos, but in Raft (and here) it is. +LogIsMonotonic == + \A a \in acceptors: + \A i, j \in DOMAIN acc_state[a].wal: + (i > j) => (acc_state[a].wal[i] >= acc_state[a].wal[j]) + +\* Main invariant: If two entries are committed at the same LSN, they must be +\* the same entry. +LogSafety == + \A c1, c2 \in committed: (c1.lsn = c2.lsn) => (c1 = c2) + + +\******************************************************************************** +\* Invariants which don't need to hold, but useful for playing/debugging. +\******************************************************************************** + +\* Limits term of elected proposers +MaxTerm == \A p \in proposers: (prop_state[p].state = "leader" => prop_state[p].term < 2) + +MaxAccWalLen == \A a \in acceptors: Len(acc_state[a].wal) < 2 + +\* Limits max number of committed entries. That way we can check that we are +\* actually committing something. +MaxCommitLsn == Cardinality(committed) < 2 + +\* How many records with different terms can be removed in single WAL +\* truncation.
+MaxTruncatedTerms == + \A p \in proposers: \A a \in acceptors: + (/\ prop_state[p].state = "leader" + /\ prop_state[p].term = acc_state[a].term) => + LET + hcp == FindHighestCommonPoint(prop_state[p].termHistory, AcceptorTermHistory(a), FlushLsn(a)) + truncated_lsns == {lsn \in DOMAIN acc_state[a].wal: lsn >= hcp.lsn} + truncated_records_terms == {acc_state[a].wal[lsn]: lsn \in truncated_lsns} + IN + Cardinality(truncated_records_terms) < 2 + +\* Check that TruncateWal never deletes committed record. +\* It might seem that this should be an invariant, but it is not. +\* With 5 nodes, it is legit to truncate record which had been +\* globally committed: e.g. nodes abc can commit record of term 1 in +\* term 3, and after that leader of term 2 can delete such record +\* on d. On 10 cores TLC can find such a trace in ~7 hours. +CommittedNotTruncated == + \A p \in proposers: \A a \in acceptors: + (/\ prop_state[p].state = "leader" + /\ prop_state[p].term = acc_state[a].term) => + LET + hcp == FindHighestCommonPoint(prop_state[p].termHistory, AcceptorTermHistory(a), FlushLsn(a)) + truncated_lsns == {lsn \in DOMAIN acc_state[a].wal: lsn >= hcp.lsn} + truncated_records == {[term |-> acc_state[a].wal[lsn], lsn |-> lsn]: lsn \in truncated_lsns} + IN + \A r \in truncated_records: r \notin committed + +==== diff --git a/safekeeper/spec/modelcheck.sh b/safekeeper/spec/modelcheck.sh new file mode 100755 index 0000000000..21ead7dad8 --- /dev/null +++ b/safekeeper/spec/modelcheck.sh @@ -0,0 +1,49 @@ +#!/bin/bash + +# Usage: ./modelcheck.sh <config_file> <spec_file>, e.g. +# ./modelcheck.sh models/MCProposerAcceptorStatic_p2_a3_t3_l3.cfg MCProposerAcceptorStatic.tla +CONFIG=$1 +SPEC=$2 + +MEM=7G +TOOLSPATH="/opt/TLA+Toolbox/tla2tools.jar" + +mkdir -p "tlc-results" +CONFIG_FILE=$(basename -- "$CONFIG") +outfilename="$SPEC-${CONFIG_FILE}-$(date --utc +%Y-%m-%d--%H-%M-%S)".log +outfile="tlc-results/$outfilename" +touch $outfile + +# Save some info about the run.
+GIT_REV=`git rev-parse --short HEAD` +INFO=`uname -a` + +# First for Linux, second for Mac. +CPUNAMELinux=$(lscpu | grep 'Model name' | cut -f 2 -d ":" | awk '{$1=$1}1') +CPUCORESLinux=`nproc` +CPUNAMEMac=`sysctl -n machdep.cpu.brand_string` +CPUCORESMac=`sysctl -n machdep.cpu.thread_count` + +echo "git revision: $GIT_REV" >> $outfile +echo "Platform: $INFO" >> $outfile +echo "CPU Info Linux: $CPUNAMELinux" >> $outfile +echo "CPU Cores Linux: $CPUCORESLinux" >> $outfile +echo "CPU Info Mac: $CPUNAMEMac" >> $outfile +echo "CPU Cores Mac: $CPUCORESMac" >> $outfile +echo "Spec: $SPEC" >> $outfile +echo "Config: $CONFIG" >> $outfile +echo "----" >> $outfile +cat $CONFIG >> $outfile +echo "" >> $outfile +echo "----" >> $outfile +echo "" >> $outfile + +# see +# https://lamport.azurewebsites.net/tla/current-tools.pdf +# for TLC options. +# OffHeapDiskFPSet is the optimal fingerprint set implementation +# https://docs.tlapl.us/codebase:architecture#fingerprint_sets_fpsets +# +# Add -simulate to run in infinite simulation mode. +java -Xmx$MEM -XX:MaxDirectMemorySize=$MEM -XX:+UseParallelGC -Dtlc2.tool.fp.FPSet.impl=tlc2.tool.fp.OffHeapDiskFPSet \ + -cp "${TOOLSPATH}" tlc2.TLC $SPEC -config $CONFIG -workers auto -gzip | tee -a $outfile diff --git a/safekeeper/spec/models/MCProposerAcceptorStatic_p2_a3_t2_l2.cfg b/safekeeper/spec/models/MCProposerAcceptorStatic_p2_a3_t2_l2.cfg new file mode 100644 index 0000000000..c06109c601 --- /dev/null +++ b/safekeeper/spec/models/MCProposerAcceptorStatic_p2_a3_t2_l2.cfg @@ -0,0 +1,19 @@ +\* A very small model just to play. 
+CONSTANTS +NULL = NULL +proposers = {p1, p2} +acceptors = {a1, a2, a3} +max_term = 2 +max_entries = 2 +SPECIFICATION Spec +CONSTRAINT StateConstraint +INVARIANT +TypeOk +ElectionSafetyFull +LogIsMonotonic +LogSafety +CommittedNotTruncated +SYMMETRY ProposerAcceptorSymmetry +CHECK_DEADLOCK FALSE +ALIAS Alias + diff --git a/safekeeper/spec/models/MCProposerAcceptorStatic_p2_a3_t3_l2.cfg b/safekeeper/spec/models/MCProposerAcceptorStatic_p2_a3_t3_l2.cfg new file mode 100644 index 0000000000..5d10fa960f --- /dev/null +++ b/safekeeper/spec/models/MCProposerAcceptorStatic_p2_a3_t3_l2.cfg @@ -0,0 +1,19 @@ +\* A model next to the smallest one. +CONSTANTS +NULL = NULL +proposers = {p1, p2} +acceptors = {a1, a2, a3} +max_term = 3 +max_entries = 2 +SPECIFICATION Spec +CONSTRAINT StateConstraint +INVARIANT +TypeOk +ElectionSafetyFull +LogIsMonotonic +LogSafety +CommittedNotTruncated +SYMMETRY ProposerAcceptorSymmetry +CHECK_DEADLOCK FALSE +ALIAS Alias + diff --git a/safekeeper/spec/models/MCProposerAcceptorStatic_p2_a3_t3_l3.cfg b/safekeeper/spec/models/MCProposerAcceptorStatic_p2_a3_t3_l3.cfg new file mode 100644 index 0000000000..8ba8ce95a4 --- /dev/null +++ b/safekeeper/spec/models/MCProposerAcceptorStatic_p2_a3_t3_l3.cfg @@ -0,0 +1,17 @@ +CONSTANTS +NULL = NULL +proposers = {p1, p2} +acceptors = {a1, a2, a3} +max_term = 3 +max_entries = 3 +SPECIFICATION Spec +CONSTRAINT StateConstraint +INVARIANT +TypeOk +ElectionSafety +LogIsMonotonic +LogSafety +CommittedNotTruncated +SYMMETRY ProposerAcceptorSymmetry +CHECK_DEADLOCK FALSE +ALIAS Alias diff --git a/safekeeper/spec/models/MCProposerAcceptorStatic_p2_a3_t4_l4.cfg b/safekeeper/spec/models/MCProposerAcceptorStatic_p2_a3_t4_l4.cfg new file mode 100644 index 0000000000..4763a34ec4 --- /dev/null +++ b/safekeeper/spec/models/MCProposerAcceptorStatic_p2_a3_t4_l4.cfg @@ -0,0 +1,17 @@ +CONSTANTS +NULL = NULL +proposers = {p1, p2} +acceptors = {a1, a2, a3} +max_term = 4 +max_entries = 4 +SPECIFICATION Spec +CONSTRAINT 
StateConstraint +INVARIANT +TypeOk +ElectionSafety +LogIsMonotonic +LogSafety +CommittedNotTruncated +SYMMETRY ProposerAcceptorSymmetry +CHECK_DEADLOCK FALSE +ALIAS Alias diff --git a/safekeeper/spec/models/MCProposerAcceptorStatic_p2_a5_t2_l2.cfg b/safekeeper/spec/models/MCProposerAcceptorStatic_p2_a5_t2_l2.cfg new file mode 100644 index 0000000000..ebf4724633 --- /dev/null +++ b/safekeeper/spec/models/MCProposerAcceptorStatic_p2_a5_t2_l2.cfg @@ -0,0 +1,16 @@ +CONSTANTS +NULL = NULL +proposers = {p1, p2} +acceptors = {a1, a2, a3, a4, a5} +max_term = 2 +max_entries = 2 +SPECIFICATION Spec +CONSTRAINT StateConstraint +INVARIANT +TypeOk +ElectionSafety +LogIsMonotonic +LogSafety +SYMMETRY ProposerAcceptorSymmetry +CHECK_DEADLOCK FALSE +ALIAS Alias diff --git a/safekeeper/spec/models/MCProposerAcceptorStatic_p2_a5_t3_l3.cfg b/safekeeper/spec/models/MCProposerAcceptorStatic_p2_a5_t3_l3.cfg new file mode 100644 index 0000000000..bb77350c58 --- /dev/null +++ b/safekeeper/spec/models/MCProposerAcceptorStatic_p2_a5_t3_l3.cfg @@ -0,0 +1,16 @@ +CONSTANTS +NULL = NULL +proposers = {p1, p2} +acceptors = {a1, a2, a3, a4, a5} +max_term = 3 +max_entries = 3 +SPECIFICATION Spec +CONSTRAINT StateConstraint +INVARIANT +TypeOk +ElectionSafety +LogIsMonotonic +LogSafety +SYMMETRY ProposerAcceptorSymmetry +CHECK_DEADLOCK FALSE +ALIAS Alias diff --git a/safekeeper/spec/models/MCProposerAcceptorStatic_p2_a5_t4_l3.cfg b/safekeeper/spec/models/MCProposerAcceptorStatic_p2_a5_t4_l3.cfg new file mode 100644 index 0000000000..9a5e142f99 --- /dev/null +++ b/safekeeper/spec/models/MCProposerAcceptorStatic_p2_a5_t4_l3.cfg @@ -0,0 +1,16 @@ +CONSTANTS +NULL = NULL +proposers = {p1, p2} +acceptors = {a1, a2, a3, a4, a5} +max_term = 4 +max_entries = 3 +SPECIFICATION Spec +CONSTRAINT StateConstraint +INVARIANT +TypeOk +ElectionSafety +LogIsMonotonic +LogSafety +SYMMETRY ProposerAcceptorSymmetry +CHECK_DEADLOCK FALSE +ALIAS Alias diff --git a/safekeeper/spec/readme.md b/safekeeper/spec/readme.md new 
file mode 100644 index 0000000000..ec2649d87d --- /dev/null +++ b/safekeeper/spec/readme.md @@ -0,0 +1,12 @@ +The specifications, models, and results of running them for the compute <-> +safekeepers consensus algorithm for committing WAL on the fleet of safekeepers. +Following Paxos parlance, compute which writes WAL is called (WAL) proposer here +and safekeepers which persist it are called (WAL) acceptors. + +Directory structure: +- Use modelcheck.sh to run TLC. +- MC*.tla files contain the bits of TLA+ needed for TLC, such as constraints on the state space; models/ contains the actual models. +- Other .tla files are the actual specs. + +Structure is partially borrowed from +[logless-reconfig](https://github.com/will62794/logless-reconfig), thanks to it. diff --git a/safekeeper/spec/tlc-results/MCProposerAcceptorStatic.tla-MCProposerAcceptorStatic_p2_a3_t2_l2.cfg-2024-11-06--13-44-17.log b/safekeeper/spec/tlc-results/MCProposerAcceptorStatic.tla-MCProposerAcceptorStatic_p2_a3_t2_l2.cfg-2024-11-06--13-44-17.log new file mode 100644 index 0000000000..768722b1eb --- /dev/null +++ b/safekeeper/spec/tlc-results/MCProposerAcceptorStatic.tla-MCProposerAcceptorStatic_p2_a3_t2_l2.cfg-2024-11-06--13-44-17.log @@ -0,0 +1,63 @@ +git revision: 864f4667d +Platform: Linux neon-dev-arm64-1 6.8.0-48-generic #48-Ubuntu SMP PREEMPT_DYNAMIC Fri Sep 27 14:35:45 UTC 2024 aarch64 aarch64 aarch64 GNU/Linux +CPU Info Linux: Neoverse-N1 +CPU Cores Linux: 80 +CPU Info Mac: +CPU Cores Mac: +Spec: MCProposerAcceptorStatic.tla +Config: models/MCProposerAcceptorStatic_p2_a3_t2_l2.cfg +---- +\* A very small model just to play. +CONSTANTS +NULL = NULL +proposers = {p1, p2} +acceptors = {a1, a2, a3} +max_term = 2 +max_entries = 2 +SPECIFICATION Spec +CONSTRAINT StateConstraint +INVARIANT +TypeOk +ElectionSafety +LogIsMonotonic +LogSafety +CommittedNotTruncated +SYMMETRY ProposerAcceptorSymmetry +CHECK_DEADLOCK FALSE +ALIAS Alias + + +---- + +TLC2 Version 2.20 of Day Month 20??
(rev: f68cb71) +Running breadth-first search Model-Checking with fp 110 and seed 3949669318051689745 with 80 workers on 80 cores with 54613MB heap and 61440MB offheap memory [pid: 46037] (Linux 6.8.0-48-generic aarch64, Ubuntu 21.0.4 x86_64, OffHeapDiskFPSet, DiskStateQueue). +Parsing file /home/arseny/neon/safekeeper/spec/MCProposerAcceptorStatic.tla +Parsing file /tmp/tlc-11123278435718411444/TLC.tla (jar:file:/home/arseny/tla2tools.jar!/tla2sany/StandardModules/TLC.tla) +Parsing file /home/arseny/neon/safekeeper/spec/ProposerAcceptorStatic.tla +Parsing file /tmp/tlc-11123278435718411444/_TLCTrace.tla (jar:file:/home/arseny/tla2tools.jar!/tla2sany/StandardModules/_TLCTrace.tla) +Parsing file /tmp/tlc-11123278435718411444/Integers.tla (jar:file:/home/arseny/tla2tools.jar!/tla2sany/StandardModules/Integers.tla) +Parsing file /tmp/tlc-11123278435718411444/Sequences.tla (jar:file:/home/arseny/tla2tools.jar!/tla2sany/StandardModules/Sequences.tla) +Parsing file /tmp/tlc-11123278435718411444/FiniteSets.tla (jar:file:/home/arseny/tla2tools.jar!/tla2sany/StandardModules/FiniteSets.tla) +Parsing file /tmp/tlc-11123278435718411444/Naturals.tla (jar:file:/home/arseny/tla2tools.jar!/tla2sany/StandardModules/Naturals.tla) +Parsing file /tmp/tlc-11123278435718411444/TLCExt.tla (jar:file:/home/arseny/tla2tools.jar!/tla2sany/StandardModules/TLCExt.tla) +Semantic processing of module Naturals +Semantic processing of module Sequences +Semantic processing of module FiniteSets +Semantic processing of module TLC +Semantic processing of module Integers +Semantic processing of module ProposerAcceptorStatic +Semantic processing of module TLCExt +Semantic processing of module _TLCTrace +Semantic processing of module MCProposerAcceptorStatic +Starting... (2024-11-06 13:44:18) +Computing initial states... +Finished computing initial states: 1 distinct state generated at 2024-11-06 13:44:20. +Model checking completed. No error has been found. 
+ Estimates of the probability that TLC did not check all reachable states + because two distinct states had the same fingerprint: + calculated (optimistic): val = 2.9E-9 + based on the actual fingerprints: val = 4.1E-10 +922134 states generated, 61249 distinct states found, 0 states left on queue. +The depth of the complete state graph search is 31. +The average outdegree of the complete state graph is 1 (minimum is 0, the maximum 6 and the 95th percentile is 3). +Finished in 11s at (2024-11-06 13:44:28) diff --git a/safekeeper/spec/tlc-results/MCProposerAcceptorStatic.tla-MCProposerAcceptorStatic_p2_a3_t3_l2.cfg-2024-11-15--09-09-58.log b/safekeeper/spec/tlc-results/MCProposerAcceptorStatic.tla-MCProposerAcceptorStatic_p2_a3_t3_l2.cfg-2024-11-15--09-09-58.log new file mode 100644 index 0000000000..ae3ba98da6 --- /dev/null +++ b/safekeeper/spec/tlc-results/MCProposerAcceptorStatic.tla-MCProposerAcceptorStatic_p2_a3_t3_l2.cfg-2024-11-15--09-09-58.log @@ -0,0 +1,69 @@ +git revision: bcbff084a +Platform: Linux nonlibrem 6.10.11-amd64 #1 SMP PREEMPT_DYNAMIC Debian 6.10.11-1 (2024-09-22) x86_64 GNU/Linux +CPU Info Linux: 13th Gen Intel(R) Core(TM) i7-1355U +CPU Cores Linux: 10 +CPU Info Mac: +CPU Cores Mac: +Spec: MCProposerAcceptorStatic.tla +Config: models/MCProposerAcceptorStatic_p2_a3_t3_l2.cfg +---- +\* A model next to the smallest one. +CONSTANTS +NULL = NULL +proposers = {p1, p2} +acceptors = {a1, a2, a3} +max_term = 3 +max_entries = 2 +SPECIFICATION Spec +CONSTRAINT StateConstraint +INVARIANT +TypeOk +ElectionSafety +LogIsMonotonic +LogSafety +CommittedNotTruncated +SYMMETRY ProposerAcceptorSymmetry +CHECK_DEADLOCK FALSE +ALIAS Alias + + +---- + +TLC2 Version 2.20 of Day Month 20?? (rev: cc65eef) +Running breadth-first search Model-Checking with fp 41 and seed -3061068726727581619 with 10 workers on 10 cores with 6372MB heap and 7168MB offheap memory [pid: 1250346] (Linux 6.10.11-amd64 amd64, Debian 21.0.5 x86_64, OffHeapDiskFPSet, DiskStateQueue). 
+Parsing file /home/ars/neon/neon/safekeeper/spec/MCProposerAcceptorStatic.tla +Parsing file /tmp/tlc-3023124431504466774/TLC.tla (jar:file:/opt/TLA+Toolbox/tla2tools.jar!/tla2sany/StandardModules/TLC.tla) +Parsing file /home/ars/neon/neon/safekeeper/spec/ProposerAcceptorStatic.tla +Parsing file /tmp/tlc-3023124431504466774/_TLCTrace.tla (jar:file:/opt/TLA+Toolbox/tla2tools.jar!/tla2sany/StandardModules/_TLCTrace.tla) +Parsing file /tmp/tlc-3023124431504466774/Integers.tla (jar:file:/opt/TLA+Toolbox/tla2tools.jar!/tla2sany/StandardModules/Integers.tla) +Parsing file /tmp/tlc-3023124431504466774/Sequences.tla (jar:file:/opt/TLA+Toolbox/tla2tools.jar!/tla2sany/StandardModules/Sequences.tla) +Parsing file /tmp/tlc-3023124431504466774/FiniteSets.tla (jar:file:/opt/TLA+Toolbox/tla2tools.jar!/tla2sany/StandardModules/FiniteSets.tla) +Parsing file /tmp/tlc-3023124431504466774/Naturals.tla (jar:file:/opt/TLA+Toolbox/tla2tools.jar!/tla2sany/StandardModules/Naturals.tla) +Parsing file /tmp/tlc-3023124431504466774/TLCExt.tla (jar:file:/opt/TLA+Toolbox/tla2tools.jar!/tla2sany/StandardModules/TLCExt.tla) +Semantic processing of module Naturals +Semantic processing of module Sequences +Semantic processing of module FiniteSets +Semantic processing of module TLC +Semantic processing of module Integers +Semantic processing of module ProposerAcceptorStatic +Semantic processing of module TLCExt +Semantic processing of module _TLCTrace +Semantic processing of module MCProposerAcceptorStatic +Starting... (2024-11-15 12:09:59) +Computing initial states... +Finished computing initial states: 1 distinct state generated at 2024-11-15 12:10:00. +Progress(19) at 2024-11-15 12:10:03: 464,696 states generated (464,696 s/min), 57,859 distinct states found (57,859 ds/min), 21,435 states left on queue. +Progress(26) at 2024-11-15 12:11:03: 8,813,399 states generated (8,348,703 s/min), 877,254 distinct states found (819,395 ds/min), 214,794 states left on queue. 
+Progress(27) at 2024-11-15 12:12:03: 16,121,858 states generated (7,308,459 s/min), 1,464,707 distinct states found (587,453 ds/min), 274,230 states left on queue. +Progress(29) at 2024-11-15 12:13:03: 23,073,903 states generated (6,952,045 s/min), 1,948,802 distinct states found (484,095 ds/min), 263,697 states left on queue. +Progress(31) at 2024-11-15 12:14:03: 29,740,681 states generated (6,666,778 s/min), 2,331,052 distinct states found (382,250 ds/min), 185,484 states left on queue. +Progress(34) at 2024-11-15 12:15:03: 36,085,876 states generated (6,345,195 s/min), 2,602,370 distinct states found (271,318 ds/min), 31,659 states left on queue. +Model checking completed. No error has been found. + Estimates of the probability that TLC did not check all reachable states + because two distinct states had the same fingerprint: + calculated (optimistic): val = 4.9E-6 + based on the actual fingerprints: val = 6.9E-7 +36896322 states generated, 2623542 distinct states found, 0 states left on queue. +The depth of the complete state graph search is 39. +The average outdegree of the complete state graph is 1 (minimum is 0, the maximum 7 and the 95th percentile is 3). 
+Finished in 05min 14s at (2024-11-15 12:15:13) diff --git a/safekeeper/spec/tlc-results/MCProposerAcceptorStatic.tla-MCProposerAcceptorStatic_p2_a3_t3_l3.cfg-2024-11-06--13-03-51.log b/safekeeper/spec/tlc-results/MCProposerAcceptorStatic.tla-MCProposerAcceptorStatic_p2_a3_t3_l3.cfg-2024-11-06--13-03-51.log new file mode 100644 index 0000000000..46f21cee72 --- /dev/null +++ b/safekeeper/spec/tlc-results/MCProposerAcceptorStatic.tla-MCProposerAcceptorStatic_p2_a3_t3_l3.cfg-2024-11-06--13-03-51.log @@ -0,0 +1,72 @@ +git revision: 864f4667d +Platform: Linux neon-dev-arm64-1 6.8.0-48-generic #48-Ubuntu SMP PREEMPT_DYNAMIC Fri Sep 27 14:35:45 UTC 2024 aarch64 aarch64 aarch64 GNU/Linux +CPU Info Linux: Neoverse-N1 +CPU Cores Linux: 80 +CPU Info Mac: +CPU Cores Mac: +Spec: MCProposerAcceptorStatic.tla +Config: models/MCProposerAcceptorStatic_p2_a3_t3_l3.cfg +---- +CONSTANTS +NULL = NULL +proposers = {p1, p2} +acceptors = {a1, a2, a3} +max_term = 3 +max_entries = 3 +SPECIFICATION Spec +CONSTRAINT StateConstraint +INVARIANT +TypeOk +ElectionSafety +LogIsMonotonic +LogSafety +CommittedNotTruncated +SYMMETRY ProposerAcceptorSymmetry +CHECK_DEADLOCK FALSE +ALIAS Alias + +---- + +TLC2 Version 2.20 of Day Month 20?? (rev: f68cb71) +Running breadth-first search Model-Checking with fp 126 and seed 2302892334567572769 with 80 workers on 80 cores with 54613MB heap and 61440MB offheap memory [pid: 39701] (Linux 6.8.0-48-generic aarch64, Ubuntu 21.0.4 x86_64, OffHeapDiskFPSet, DiskStateQueue). 
+Parsing file /home/arseny/neon/safekeeper/spec/MCProposerAcceptorStatic.tla +Parsing file /tmp/tlc-15178810317173795942/TLC.tla (jar:file:/home/arseny/tla2tools.jar!/tla2sany/StandardModules/TLC.tla) +Parsing file /home/arseny/neon/safekeeper/spec/ProposerAcceptorStatic.tla +Parsing file /tmp/tlc-15178810317173795942/_TLCTrace.tla (jar:file:/home/arseny/tla2tools.jar!/tla2sany/StandardModules/_TLCTrace.tla) +Parsing file /tmp/tlc-15178810317173795942/Integers.tla (jar:file:/home/arseny/tla2tools.jar!/tla2sany/StandardModules/Integers.tla) +Parsing file /tmp/tlc-15178810317173795942/Sequences.tla (jar:file:/home/arseny/tla2tools.jar!/tla2sany/StandardModules/Sequences.tla) +Parsing file /tmp/tlc-15178810317173795942/FiniteSets.tla (jar:file:/home/arseny/tla2tools.jar!/tla2sany/StandardModules/FiniteSets.tla) +Parsing file /tmp/tlc-15178810317173795942/Naturals.tla (jar:file:/home/arseny/tla2tools.jar!/tla2sany/StandardModules/Naturals.tla) +Parsing file /tmp/tlc-15178810317173795942/TLCExt.tla (jar:file:/home/arseny/tla2tools.jar!/tla2sany/StandardModules/TLCExt.tla) +Semantic processing of module Naturals +Semantic processing of module Sequences +Semantic processing of module FiniteSets +Semantic processing of module TLC +Semantic processing of module Integers +Semantic processing of module ProposerAcceptorStatic +Semantic processing of module TLCExt +Semantic processing of module _TLCTrace +Semantic processing of module MCProposerAcceptorStatic +Starting... (2024-11-06 13:03:52) +Computing initial states... +Finished computing initial states: 1 distinct state generated at 2024-11-06 13:03:55. +Progress(21) at 2024-11-06 13:03:58: 846,240 states generated (846,240 s/min), 106,298 distinct states found (106,298 ds/min), 41,028 states left on queue. +Progress(28) at 2024-11-06 13:04:58: 27,538,211 states generated (26,691,971 s/min), 2,768,793 distinct states found (2,662,495 ds/min), 782,984 states left on queue. 
+Progress(30) at 2024-11-06 13:05:58: 54,048,763 states generated (26,510,552 s/min), 5,076,745 distinct states found (2,307,952 ds/min), 1,241,301 states left on queue. +Progress(31) at 2024-11-06 13:06:58: 80,554,724 states generated (26,505,961 s/min), 7,199,201 distinct states found (2,122,456 ds/min), 1,541,574 states left on queue. +Progress(32) at 2024-11-06 13:07:58: 106,991,261 states generated (26,436,537 s/min), 9,121,549 distinct states found (1,922,348 ds/min), 1,686,289 states left on queue. +Progress(33) at 2024-11-06 13:08:58: 133,354,665 states generated (26,363,404 s/min), 10,935,451 distinct states found (1,813,902 ds/min), 1,739,977 states left on queue. +Progress(34) at 2024-11-06 13:09:58: 159,631,385 states generated (26,276,720 s/min), 12,605,372 distinct states found (1,669,921 ds/min), 1,677,447 states left on queue. +Progress(35) at 2024-11-06 13:10:58: 185,862,196 states generated (26,230,811 s/min), 14,138,409 distinct states found (1,533,037 ds/min), 1,501,760 states left on queue. +Progress(36) at 2024-11-06 13:11:58: 212,021,688 states generated (26,159,492 s/min), 15,538,990 distinct states found (1,400,581 ds/min), 1,216,621 states left on queue. +Progress(37) at 2024-11-06 13:12:58: 238,046,160 states generated (26,024,472 s/min), 16,778,583 distinct states found (1,239,593 ds/min), 797,230 states left on queue. +Progress(39) at 2024-11-06 13:13:58: 263,931,163 states generated (25,885,003 s/min), 17,820,786 distinct states found (1,042,203 ds/min), 209,400 states left on queue. +Model checking completed. No error has been found. + Estimates of the probability that TLC did not check all reachable states + because two distinct states had the same fingerprint: + calculated (optimistic): val = 2.5E-4 + based on the actual fingerprints: val = 7.9E-5 +270257170 states generated, 18005639 distinct states found, 0 states left on queue. +The depth of the complete state graph search is 47. 
+The average outdegree of the complete state graph is 1 (minimum is 0, the maximum 7 and the 95th percentile is 3). +Finished in 10min 25s at (2024-11-06 13:14:17) diff --git a/safekeeper/spec/tlc-results/MCProposerAcceptorStatic.tla-MCProposerAcceptorStatic_p2_a3_t4_l4.cfg-2024-11-06--14-20-25.log b/safekeeper/spec/tlc-results/MCProposerAcceptorStatic.tla-MCProposerAcceptorStatic_p2_a3_t4_l4.cfg-2024-11-06--14-20-25.log new file mode 100644 index 0000000000..c7cc853af0 --- /dev/null +++ b/safekeeper/spec/tlc-results/MCProposerAcceptorStatic.tla-MCProposerAcceptorStatic_p2_a3_t4_l4.cfg-2024-11-06--14-20-25.log @@ -0,0 +1,1466 @@ +# Shows LogSafety violation when "don't commit separately entries from previous terms" check is disabled. +git revision: 4f1ee6331 +Platform: Linux neon-dev-arm64-1 6.8.0-48-generic #48-Ubuntu SMP PREEMPT_DYNAMIC Fri Sep 27 14:35:45 UTC 2024 aarch64 aarch64 aarch64 GNU/Linux +CPU Info Linux: Neoverse-N1 +CPU Cores Linux: 80 +CPU Info Mac: +CPU Cores Mac: +Spec: MCProposerAcceptorStatic.tla +Config: models/MCProposerAcceptorStatic_p2_a3_t4_l4.cfg +---- +CONSTANTS +NULL = NULL +proposers = {p1, p2} +acceptors = {a1, a2, a3} +max_term = 4 +max_entries = 4 +SPECIFICATION Spec +CONSTRAINT StateConstraint +INVARIANT +TypeOk +ElectionSafety +LogIsMonotonic +LogSafety +SYMMETRY ProposerAcceptorSymmetry +CHECK_DEADLOCK FALSE +ALIAS Alias + +---- + +TLC2 Version 2.20 of Day Month 20?? (rev: f68cb71) +Running breadth-first search Model-Checking with fp 12 and seed -5379034126224420237 with 80 workers on 80 cores with 54613MB heap and 61440MB offheap memory [pid: 52295] (Linux 6.8.0-48-generic aarch64, Ubuntu 21.0.4 x86_64, OffHeapDiskFPSet, DiskStateQueue). 
+Parsing file /home/arseny/neon/safekeeper/spec/MCProposerAcceptorStatic.tla +Parsing file /tmp/tlc-4533438058229992850/TLC.tla (jar:file:/home/arseny/tla2tools.jar!/tla2sany/StandardModules/TLC.tla) +Parsing file /home/arseny/neon/safekeeper/spec/ProposerAcceptorStatic.tla +Parsing file /tmp/tlc-4533438058229992850/_TLCTrace.tla (jar:file:/home/arseny/tla2tools.jar!/tla2sany/StandardModules/_TLCTrace.tla) +Parsing file /tmp/tlc-4533438058229992850/Integers.tla (jar:file:/home/arseny/tla2tools.jar!/tla2sany/StandardModules/Integers.tla) +Parsing file /tmp/tlc-4533438058229992850/Sequences.tla (jar:file:/home/arseny/tla2tools.jar!/tla2sany/StandardModules/Sequences.tla) +Parsing file /tmp/tlc-4533438058229992850/FiniteSets.tla (jar:file:/home/arseny/tla2tools.jar!/tla2sany/StandardModules/FiniteSets.tla) +Parsing file /tmp/tlc-4533438058229992850/Naturals.tla (jar:file:/home/arseny/tla2tools.jar!/tla2sany/StandardModules/Naturals.tla) +Parsing file /tmp/tlc-4533438058229992850/TLCExt.tla (jar:file:/home/arseny/tla2tools.jar!/tla2sany/StandardModules/TLCExt.tla) +Semantic processing of module Naturals +Semantic processing of module Sequences +Semantic processing of module FiniteSets +Semantic processing of module TLC +Semantic processing of module Integers +Semantic processing of module ProposerAcceptorStatic +Semantic processing of module TLCExt +Semantic processing of module _TLCTrace +Semantic processing of module MCProposerAcceptorStatic +Starting... (2024-11-06 14:20:26) +Computing initial states... +Finished computing initial states: 1 distinct state generated at 2024-11-06 14:20:29. +Progress(20) at 2024-11-06 14:20:32: 1,011,898 states generated (1,011,898 s/min), 140,947 distinct states found (140,947 ds/min), 60,535 states left on queue. +Progress(26) at 2024-11-06 14:21:32: 30,146,518 states generated (29,134,620 s/min), 3,742,736 distinct states found (3,601,789 ds/min), 1,438,779 states left on queue. 
+Progress(27) at 2024-11-06 14:22:32: 59,362,708 states generated (29,216,190 s/min), 7,210,233 distinct states found (3,467,497 ds/min), 2,708,295 states left on queue. +Progress(28) at 2024-11-06 14:23:32: 88,589,291 states generated (29,226,583 s/min), 10,552,781 distinct states found (3,342,548 ds/min), 3,874,296 states left on queue. +Progress(29) at 2024-11-06 14:24:32: 117,894,209 states generated (29,304,918 s/min), 13,932,498 distinct states found (3,379,717 ds/min), 5,069,960 states left on queue. +Progress(29) at 2024-11-06 14:25:32: 147,338,882 states generated (29,444,673 s/min), 17,180,069 distinct states found (3,247,571 ds/min), 6,146,371 states left on queue. +Progress(29) at 2024-11-06 14:26:32: 176,498,135 states generated (29,159,253 s/min), 20,547,926 distinct states found (3,367,857 ds/min), 7,338,835 states left on queue. +Progress(30) at 2024-11-06 14:27:32: 205,957,044 states generated (29,458,909 s/min), 23,661,090 distinct states found (3,113,164 ds/min), 8,293,570 states left on queue. +Progress(30) at 2024-11-06 14:28:32: 235,390,133 states generated (29,433,089 s/min), 26,892,306 distinct states found (3,231,216 ds/min), 9,369,229 states left on queue. +Progress(30) at 2024-11-06 14:29:32: 264,571,938 states generated (29,181,805 s/min), 30,176,971 distinct states found (3,284,665 ds/min), 10,493,429 states left on queue. +Progress(31) at 2024-11-06 14:30:32: 293,928,191 states generated (29,356,253 s/min), 33,296,160 distinct states found (3,119,189 ds/min), 11,463,686 states left on queue. +Progress(31) at 2024-11-06 14:31:32: 323,436,668 states generated (29,508,477 s/min), 36,347,973 distinct states found (3,051,813 ds/min), 12,365,578 states left on queue. +Progress(31) at 2024-11-06 14:32:32: 352,943,790 states generated (29,507,122 s/min), 39,465,244 distinct states found (3,117,271 ds/min), 13,349,544 states left on queue. 
+Progress(31) at 2024-11-06 14:33:32: 382,292,863 states generated (29,349,073 s/min), 42,654,621 distinct states found (3,189,377 ds/min), 14,384,363 states left on queue. +Progress(31) at 2024-11-06 14:34:32: 411,385,854 states generated (29,092,991 s/min), 45,941,145 distinct states found (3,286,524 ds/min), 15,509,450 states left on queue. +Progress(31) at 2024-11-06 14:35:32: 440,738,756 states generated (29,352,902 s/min), 48,984,566 distinct states found (3,043,421 ds/min), 16,419,882 states left on queue. +Progress(32) at 2024-11-06 14:36:32: 470,251,558 states generated (29,512,802 s/min), 51,925,693 distinct states found (2,941,127 ds/min), 17,211,457 states left on queue. +Progress(32) at 2024-11-06 14:37:32: 499,714,013 states generated (29,462,455 s/min), 54,955,581 distinct states found (3,029,888 ds/min), 18,114,624 states left on queue. +Progress(32) at 2024-11-06 14:38:32: 529,254,608 states generated (29,540,595 s/min), 57,938,914 distinct states found (2,983,333 ds/min), 18,996,128 states left on queue. +Progress(32) at 2024-11-06 14:39:32: 558,774,398 states generated (29,519,790 s/min), 61,072,943 distinct states found (3,134,029 ds/min), 19,975,689 states left on queue. +Progress(32) at 2024-11-06 14:40:32: 588,134,665 states generated (29,360,267 s/min), 64,148,888 distinct states found (3,075,945 ds/min), 20,922,407 states left on queue. +Progress(32) at 2024-11-06 14:41:32: 617,464,374 states generated (29,329,709 s/min), 67,306,855 distinct states found (3,157,967 ds/min), 21,928,799 states left on queue. +Progress(32) at 2024-11-06 14:42:32: 646,525,281 states generated (29,060,907 s/min), 70,425,194 distinct states found (3,118,339 ds/min), 22,895,971 states left on queue. +Progress(32) at 2024-11-06 14:43:32: 676,054,893 states generated (29,529,612 s/min), 73,351,905 distinct states found (2,926,711 ds/min), 23,703,779 states left on queue. 
+Progress(33) at 2024-11-06 14:44:32: 705,581,782 states generated (29,526,889 s/min), 76,200,615 distinct states found (2,848,710 ds/min), 24,414,094 states left on queue. +Progress(33) at 2024-11-06 14:45:32: 735,069,836 states generated (29,488,054 s/min), 79,168,244 distinct states found (2,967,629 ds/min), 25,255,224 states left on queue. +Progress(33) at 2024-11-06 14:46:32: 764,659,188 states generated (29,589,352 s/min), 82,024,430 distinct states found (2,856,186 ds/min), 26,011,047 states left on queue. +Progress(33) at 2024-11-06 14:47:32: 794,276,423 states generated (29,617,235 s/min), 84,974,312 distinct states found (2,949,882 ds/min), 26,868,750 states left on queue. +Progress(33) at 2024-11-06 14:48:32: 823,875,831 states generated (29,599,408 s/min), 88,004,386 distinct states found (3,030,074 ds/min), 27,771,984 states left on queue. +Progress(33) at 2024-11-06 14:49:32: 853,138,894 states generated (29,263,063 s/min), 91,006,890 distinct states found (3,002,504 ds/min), 28,636,661 states left on queue. +Checkpointing of run states/24-11-06-14-20-25.868 +Checkpointing completed at (2024-11-06 14:50:32) +Progress(33) at 2024-11-06 14:50:32: 882,514,167 states generated (29,375,273 s/min), 94,011,000 distinct states found (3,004,110 ds/min), 29,534,516 states left on queue. +Progress(33) at 2024-11-06 14:51:32: 911,838,377 states generated (29,324,210 s/min), 97,108,937 distinct states found (3,097,937 ds/min), 30,498,587 states left on queue. +Progress(33) at 2024-11-06 14:52:32: 940,646,920 states generated (28,808,543 s/min), 100,248,865 distinct states found (3,139,928 ds/min), 31,472,191 states left on queue. +Progress(33) at 2024-11-06 14:53:32: 970,074,175 states generated (29,427,255 s/min), 103,170,815 distinct states found (2,921,950 ds/min), 32,265,691 states left on queue. 
+Progress(33) at 2024-11-06 14:54:32: 999,627,974 states generated (29,553,799 s/min), 106,004,823 distinct states found (2,834,008 ds/min), 33,009,618 states left on queue. +Progress(34) at 2024-11-06 14:55:32: 1,029,148,983 states generated (29,521,009 s/min), 108,740,783 distinct states found (2,735,960 ds/min), 33,616,222 states left on queue. +Progress(34) at 2024-11-06 14:56:32: 1,058,582,001 states generated (29,433,018 s/min), 111,612,965 distinct states found (2,872,182 ds/min), 34,375,212 states left on queue. +Progress(34) at 2024-11-06 14:57:32: 1,088,123,602 states generated (29,541,601 s/min), 114,464,196 distinct states found (2,851,231 ds/min), 35,116,195 states left on queue. +Progress(34) at 2024-11-06 14:58:32: 1,117,684,936 states generated (29,561,334 s/min), 117,252,198 distinct states found (2,788,002 ds/min), 35,817,205 states left on queue. +Progress(34) at 2024-11-06 14:59:32: 1,147,356,249 states generated (29,671,313 s/min), 120,014,476 distinct states found (2,762,278 ds/min), 36,517,255 states left on queue. +Progress(34) at 2024-11-06 15:00:32: 1,176,921,098 states generated (29,564,849 s/min), 122,859,312 distinct states found (2,844,836 ds/min), 37,291,096 states left on queue. +Progress(34) at 2024-11-06 15:01:32: 1,206,454,440 states generated (29,533,342 s/min), 125,830,942 distinct states found (2,971,630 ds/min), 38,147,762 states left on queue. +Progress(34) at 2024-11-06 15:02:32: 1,235,721,673 states generated (29,267,233 s/min), 128,869,493 distinct states found (3,038,551 ds/min), 39,035,481 states left on queue. +Progress(34) at 2024-11-06 15:03:32: 1,265,097,779 states generated (29,376,106 s/min), 131,669,552 distinct states found (2,800,059 ds/min), 39,746,864 states left on queue. +Progress(34) at 2024-11-06 15:04:32: 1,294,408,098 states generated (29,310,319 s/min), 134,604,630 distinct states found (2,935,078 ds/min), 40,584,235 states left on queue. 
+Progress(34) at 2024-11-06 15:05:32: 1,323,792,755 states generated (29,384,657 s/min), 137,579,390 distinct states found (2,974,760 ds/min), 41,446,478 states left on queue. +Progress(34) at 2024-11-06 15:06:32: 1,353,085,163 states generated (29,292,408 s/min), 140,575,723 distinct states found (2,996,333 ds/min), 42,309,510 states left on queue. +Progress(34) at 2024-11-06 15:07:32: 1,381,809,417 states generated (28,724,254 s/min), 143,655,566 distinct states found (3,079,843 ds/min), 43,220,682 states left on queue. +Progress(34) at 2024-11-06 15:08:32: 1,411,255,848 states generated (29,446,431 s/min), 146,482,192 distinct states found (2,826,626 ds/min), 43,944,938 states left on queue. +Progress(34) at 2024-11-06 15:09:32: 1,440,646,323 states generated (29,390,475 s/min), 149,419,989 distinct states found (2,937,797 ds/min), 44,763,293 states left on queue. +Progress(34) at 2024-11-06 15:10:32: 1,470,298,568 states generated (29,652,245 s/min), 152,041,419 distinct states found (2,621,430 ds/min), 45,311,911 states left on queue. +Progress(35) at 2024-11-06 15:11:32: 1,499,747,712 states generated (29,449,144 s/min), 154,696,867 distinct states found (2,655,448 ds/min), 45,842,895 states left on queue. +Progress(35) at 2024-11-06 15:12:32: 1,529,256,993 states generated (29,509,281 s/min), 157,493,365 distinct states found (2,796,498 ds/min), 46,535,472 states left on queue. +Progress(35) at 2024-11-06 15:13:32: 1,558,829,306 states generated (29,572,313 s/min), 160,256,575 distinct states found (2,763,210 ds/min), 47,212,471 states left on queue. +Progress(35) at 2024-11-06 15:14:32: 1,588,345,878 states generated (29,516,572 s/min), 163,002,602 distinct states found (2,746,027 ds/min), 47,862,117 states left on queue. +Progress(35) at 2024-11-06 15:15:32: 1,617,885,675 states generated (29,539,797 s/min), 165,699,121 distinct states found (2,696,519 ds/min), 48,472,896 states left on queue. 
+Progress(35) at 2024-11-06 15:16:32: 1,647,559,965 states generated (29,674,290 s/min), 168,343,286 distinct states found (2,644,165 ds/min), 49,065,377 states left on queue. +Progress(35) at 2024-11-06 15:17:32: 1,677,033,250 states generated (29,473,285 s/min), 171,134,409 distinct states found (2,791,123 ds/min), 49,823,330 states left on queue. +Progress(35) at 2024-11-06 15:18:32: 1,706,730,266 states generated (29,697,016 s/min), 173,860,974 distinct states found (2,726,565 ds/min), 50,493,221 states left on queue. +Error: Invariant LogSafety is violated. +Error: The behavior up to this point is: +State 1: +/\ prop_state = ( p1 :> + [ term |-> 1, + wal |-> <<>>, + state |-> "campaign", + votes |-> <<>>, + termHistory |-> <<>>, + nextSendLsn |-> <<>> ] @@ + p2 :> + [ term |-> 1, + wal |-> <<>>, + state |-> "campaign", + votes |-> <<>>, + termHistory |-> <<>>, + nextSendLsn |-> <<>> ] ) +/\ acc_state = ( a1 :> + [term |-> 0, wal |-> <<>>, termHistory |-> <<[term |-> 0, lsn |-> 1]>>] @@ + a2 :> + [term |-> 0, wal |-> <<>>, termHistory |-> <<[term |-> 0, lsn |-> 1]>>] @@ + a3 :> + [term |-> 0, wal |-> <<>>, termHistory |-> <<[term |-> 0, lsn |-> 1]>>] ) +/\ committed = {} + +State 2: +/\ prop_state = ( p1 :> + [ term |-> 1, + wal |-> <<>>, + state |-> "campaign", + votes |-> + ( a1 :> + [ termHistory |-> <<[term |-> 0, lsn |-> 1]>>, + flushLsn |-> 1 ] ), + termHistory |-> <<>>, + nextSendLsn |-> <<>> ] @@ + p2 :> + [ term |-> 1, + wal |-> <<>>, + state |-> "campaign", + votes |-> <<>>, + termHistory |-> <<>>, + nextSendLsn |-> <<>> ] ) +/\ acc_state = ( a1 :> + [term |-> 1, wal |-> <<>>, termHistory |-> <<[term |-> 0, lsn |-> 1]>>] @@ + a2 :> + [term |-> 0, wal |-> <<>>, termHistory |-> <<[term |-> 0, lsn |-> 1]>>] @@ + a3 :> + [term |-> 0, wal |-> <<>>, termHistory |-> <<[term |-> 0, lsn |-> 1]>>] ) +/\ committed = {} + +State 3: +/\ prop_state = ( p1 :> + [ term |-> 1, + wal |-> <<>>, + state |-> "campaign", + votes |-> + ( a1 :> + [ termHistory |-> <<[term 
|-> 0, lsn |-> 1]>>, + flushLsn |-> 1 ] ), + termHistory |-> <<>>, + nextSendLsn |-> <<>> ] @@ + p2 :> + [ term |-> 2, + wal |-> <<>>, + state |-> "campaign", + votes |-> <<>>, + termHistory |-> <<>>, + nextSendLsn |-> <<>> ] ) +/\ acc_state = ( a1 :> + [term |-> 1, wal |-> <<>>, termHistory |-> <<[term |-> 0, lsn |-> 1]>>] @@ + a2 :> + [term |-> 0, wal |-> <<>>, termHistory |-> <<[term |-> 0, lsn |-> 1]>>] @@ + a3 :> + [term |-> 0, wal |-> <<>>, termHistory |-> <<[term |-> 0, lsn |-> 1]>>] ) +/\ committed = {} + +State 4: +/\ prop_state = ( p1 :> + [ term |-> 1, + wal |-> <<>>, + state |-> "campaign", + votes |-> + ( a1 :> + [ termHistory |-> <<[term |-> 0, lsn |-> 1]>>, + flushLsn |-> 1 ] @@ + a2 :> + [ termHistory |-> <<[term |-> 0, lsn |-> 1]>>, + flushLsn |-> 1 ] ), + termHistory |-> <<>>, + nextSendLsn |-> <<>> ] @@ + p2 :> + [ term |-> 2, + wal |-> <<>>, + state |-> "campaign", + votes |-> <<>>, + termHistory |-> <<>>, + nextSendLsn |-> <<>> ] ) +/\ acc_state = ( a1 :> + [term |-> 1, wal |-> <<>>, termHistory |-> <<[term |-> 0, lsn |-> 1]>>] @@ + a2 :> + [term |-> 1, wal |-> <<>>, termHistory |-> <<[term |-> 0, lsn |-> 1]>>] @@ + a3 :> + [term |-> 0, wal |-> <<>>, termHistory |-> <<[term |-> 0, lsn |-> 1]>>] ) +/\ committed = {} + +State 5: +/\ prop_state = ( p1 :> + [ term |-> 1, + wal |-> <<>>, + state |-> "leader", + votes |-> + ( a1 :> + [ termHistory |-> <<[term |-> 0, lsn |-> 1]>>, + flushLsn |-> 1 ] @@ + a2 :> + [ termHistory |-> <<[term |-> 0, lsn |-> 1]>>, + flushLsn |-> 1 ] ), + termHistory |-> <<[term |-> 0, lsn |-> 1], [term |-> 1, lsn |-> 1]>>, + nextSendLsn |-> <<>> ] @@ + p2 :> + [ term |-> 2, + wal |-> <<>>, + state |-> "campaign", + votes |-> <<>>, + termHistory |-> <<>>, + nextSendLsn |-> <<>> ] ) +/\ acc_state = ( a1 :> + [term |-> 1, wal |-> <<>>, termHistory |-> <<[term |-> 0, lsn |-> 1]>>] @@ + a2 :> + [term |-> 1, wal |-> <<>>, termHistory |-> <<[term |-> 0, lsn |-> 1]>>] @@ + a3 :> + [term |-> 0, wal |-> <<>>, termHistory |-> <<[term 
|-> 0, lsn |-> 1]>>] ) +/\ committed = {} + +State 6: +/\ prop_state = ( p1 :> + [ term |-> 1, + wal |-> <<>>, + state |-> "leader", + votes |-> + ( a1 :> + [ termHistory |-> <<[term |-> 0, lsn |-> 1]>>, + flushLsn |-> 1 ] @@ + a2 :> + [ termHistory |-> <<[term |-> 0, lsn |-> 1]>>, + flushLsn |-> 1 ] ), + termHistory |-> <<[term |-> 0, lsn |-> 1], [term |-> 1, lsn |-> 1]>>, + nextSendLsn |-> <<>> ] @@ + p2 :> + [ term |-> 2, + wal |-> <<>>, + state |-> "campaign", + votes |-> + ( a1 :> + [ termHistory |-> <<[term |-> 0, lsn |-> 1]>>, + flushLsn |-> 1 ] ), + termHistory |-> <<>>, + nextSendLsn |-> <<>> ] ) +/\ acc_state = ( a1 :> + [term |-> 2, wal |-> <<>>, termHistory |-> <<[term |-> 0, lsn |-> 1]>>] @@ + a2 :> + [term |-> 1, wal |-> <<>>, termHistory |-> <<[term |-> 0, lsn |-> 1]>>] @@ + a3 :> + [term |-> 0, wal |-> <<>>, termHistory |-> <<[term |-> 0, lsn |-> 1]>>] ) +/\ committed = {} + +State 7: +/\ prop_state = ( p1 :> + [ term |-> 1, + wal |-> <<>>, + state |-> "leader", + votes |-> + ( a1 :> + [ termHistory |-> <<[term |-> 0, lsn |-> 1]>>, + flushLsn |-> 1 ] @@ + a2 :> + [ termHistory |-> <<[term |-> 0, lsn |-> 1]>>, + flushLsn |-> 1 ] ), + termHistory |-> <<[term |-> 0, lsn |-> 1], [term |-> 1, lsn |-> 1]>>, + nextSendLsn |-> (a2 :> 1) ] @@ + p2 :> + [ term |-> 2, + wal |-> <<>>, + state |-> "campaign", + votes |-> + ( a1 :> + [ termHistory |-> <<[term |-> 0, lsn |-> 1]>>, + flushLsn |-> 1 ] ), + termHistory |-> <<>>, + nextSendLsn |-> <<>> ] ) +/\ acc_state = ( a1 :> + [term |-> 2, wal |-> <<>>, termHistory |-> <<[term |-> 0, lsn |-> 1]>>] @@ + a2 :> + [ term |-> 1, + wal |-> <<>>, + termHistory |-> + <<[term |-> 0, lsn |-> 1], [term |-> 1, lsn |-> 1]>> ] @@ + a3 :> + [term |-> 0, wal |-> <<>>, termHistory |-> <<[term |-> 0, lsn |-> 1]>>] ) +/\ committed = {} + +State 8: +/\ prop_state = ( p1 :> + [ term |-> 1, + wal |-> <<>>, + state |-> "leader", + votes |-> + ( a1 :> + [ termHistory |-> <<[term |-> 0, lsn |-> 1]>>, + flushLsn |-> 1 ] @@ + a2 :> + [ 
termHistory |-> <<[term |-> 0, lsn |-> 1]>>, + flushLsn |-> 1 ] ), + termHistory |-> <<[term |-> 0, lsn |-> 1], [term |-> 1, lsn |-> 1]>>, + nextSendLsn |-> (a2 :> 1) ] @@ + p2 :> + [ term |-> 2, + wal |-> <<>>, + state |-> "campaign", + votes |-> + ( a1 :> + [ termHistory |-> <<[term |-> 0, lsn |-> 1]>>, + flushLsn |-> 1 ] @@ + a3 :> + [ termHistory |-> <<[term |-> 0, lsn |-> 1]>>, + flushLsn |-> 1 ] ), + termHistory |-> <<>>, + nextSendLsn |-> <<>> ] ) +/\ acc_state = ( a1 :> + [term |-> 2, wal |-> <<>>, termHistory |-> <<[term |-> 0, lsn |-> 1]>>] @@ + a2 :> + [ term |-> 1, + wal |-> <<>>, + termHistory |-> + <<[term |-> 0, lsn |-> 1], [term |-> 1, lsn |-> 1]>> ] @@ + a3 :> + [term |-> 2, wal |-> <<>>, termHistory |-> <<[term |-> 0, lsn |-> 1]>>] ) +/\ committed = {} + +State 9: +/\ prop_state = ( p1 :> + [ term |-> 1, + wal |-> <<>>, + state |-> "leader", + votes |-> + ( a1 :> + [ termHistory |-> <<[term |-> 0, lsn |-> 1]>>, + flushLsn |-> 1 ] @@ + a2 :> + [ termHistory |-> <<[term |-> 0, lsn |-> 1]>>, + flushLsn |-> 1 ] ), + termHistory |-> <<[term |-> 0, lsn |-> 1], [term |-> 1, lsn |-> 1]>>, + nextSendLsn |-> (a2 :> 1) ] @@ + p2 :> + [ term |-> 2, + wal |-> <<>>, + state |-> "leader", + votes |-> + ( a1 :> + [ termHistory |-> <<[term |-> 0, lsn |-> 1]>>, + flushLsn |-> 1 ] @@ + a3 :> + [ termHistory |-> <<[term |-> 0, lsn |-> 1]>>, + flushLsn |-> 1 ] ), + termHistory |-> <<[term |-> 0, lsn |-> 1], [term |-> 2, lsn |-> 1]>>, + nextSendLsn |-> <<>> ] ) +/\ acc_state = ( a1 :> + [term |-> 2, wal |-> <<>>, termHistory |-> <<[term |-> 0, lsn |-> 1]>>] @@ + a2 :> + [ term |-> 1, + wal |-> <<>>, + termHistory |-> + <<[term |-> 0, lsn |-> 1], [term |-> 1, lsn |-> 1]>> ] @@ + a3 :> + [term |-> 2, wal |-> <<>>, termHistory |-> <<[term |-> 0, lsn |-> 1]>>] ) +/\ committed = {} + +State 10: +/\ prop_state = ( p1 :> + [ term |-> 1, + wal |-> <<>>, + state |-> "leader", + votes |-> + ( a1 :> + [ termHistory |-> <<[term |-> 0, lsn |-> 1]>>, + flushLsn |-> 1 ] @@ + a2 :> + 
[ termHistory |-> <<[term |-> 0, lsn |-> 1]>>, + flushLsn |-> 1 ] ), + termHistory |-> <<[term |-> 0, lsn |-> 1], [term |-> 1, lsn |-> 1]>>, + nextSendLsn |-> (a2 :> 1) ] @@ + p2 :> + [ term |-> 2, + wal |-> <<>>, + state |-> "leader", + votes |-> + ( a1 :> + [ termHistory |-> <<[term |-> 0, lsn |-> 1]>>, + flushLsn |-> 1 ] @@ + a3 :> + [ termHistory |-> <<[term |-> 0, lsn |-> 1]>>, + flushLsn |-> 1 ] ), + termHistory |-> <<[term |-> 0, lsn |-> 1], [term |-> 2, lsn |-> 1]>>, + nextSendLsn |-> (a1 :> 1) ] ) +/\ acc_state = ( a1 :> + [ term |-> 2, + wal |-> <<>>, + termHistory |-> + <<[term |-> 0, lsn |-> 1], [term |-> 2, lsn |-> 1]>> ] @@ + a2 :> + [ term |-> 1, + wal |-> <<>>, + termHistory |-> + <<[term |-> 0, lsn |-> 1], [term |-> 1, lsn |-> 1]>> ] @@ + a3 :> + [term |-> 2, wal |-> <<>>, termHistory |-> <<[term |-> 0, lsn |-> 1]>>] ) +/\ committed = {} + +State 11: +/\ prop_state = ( p1 :> + [ term |-> 1, + wal |-> <<>>, + state |-> "leader", + votes |-> + ( a1 :> + [ termHistory |-> <<[term |-> 0, lsn |-> 1]>>, + flushLsn |-> 1 ] @@ + a2 :> + [ termHistory |-> <<[term |-> 0, lsn |-> 1]>>, + flushLsn |-> 1 ] ), + termHistory |-> <<[term |-> 0, lsn |-> 1], [term |-> 1, lsn |-> 1]>>, + nextSendLsn |-> (a2 :> 1) ] @@ + p2 :> + [ term |-> 3, + wal |-> <<>>, + state |-> "campaign", + votes |-> <<>>, + termHistory |-> <<>>, + nextSendLsn |-> <<>> ] ) +/\ acc_state = ( a1 :> + [ term |-> 2, + wal |-> <<>>, + termHistory |-> + <<[term |-> 0, lsn |-> 1], [term |-> 2, lsn |-> 1]>> ] @@ + a2 :> + [ term |-> 1, + wal |-> <<>>, + termHistory |-> + <<[term |-> 0, lsn |-> 1], [term |-> 1, lsn |-> 1]>> ] @@ + a3 :> + [term |-> 2, wal |-> <<>>, termHistory |-> <<[term |-> 0, lsn |-> 1]>>] ) +/\ committed = {} + +State 12: +/\ prop_state = ( p1 :> + [ term |-> 1, + wal |-> <<>>, + state |-> "leader", + votes |-> + ( a1 :> + [ termHistory |-> <<[term |-> 0, lsn |-> 1]>>, + flushLsn |-> 1 ] @@ + a2 :> + [ termHistory |-> <<[term |-> 0, lsn |-> 1]>>, + flushLsn |-> 1 ] ), + 
termHistory |-> <<[term |-> 0, lsn |-> 1], [term |-> 1, lsn |-> 1]>>, + nextSendLsn |-> (a2 :> 1) ] @@ + p2 :> + [ term |-> 3, + wal |-> <<>>, + state |-> "campaign", + votes |-> + ( a3 :> + [ termHistory |-> <<[term |-> 0, lsn |-> 1]>>, + flushLsn |-> 1 ] ), + termHistory |-> <<>>, + nextSendLsn |-> <<>> ] ) +/\ acc_state = ( a1 :> + [ term |-> 2, + wal |-> <<>>, + termHistory |-> + <<[term |-> 0, lsn |-> 1], [term |-> 2, lsn |-> 1]>> ] @@ + a2 :> + [ term |-> 1, + wal |-> <<>>, + termHistory |-> + <<[term |-> 0, lsn |-> 1], [term |-> 1, lsn |-> 1]>> ] @@ + a3 :> + [term |-> 3, wal |-> <<>>, termHistory |-> <<[term |-> 0, lsn |-> 1]>>] ) +/\ committed = {} + +State 13: +/\ prop_state = ( p1 :> + [ term |-> 1, + wal |-> <<1>>, + state |-> "leader", + votes |-> + ( a1 :> + [ termHistory |-> <<[term |-> 0, lsn |-> 1]>>, + flushLsn |-> 1 ] @@ + a2 :> + [ termHistory |-> <<[term |-> 0, lsn |-> 1]>>, + flushLsn |-> 1 ] ), + termHistory |-> <<[term |-> 0, lsn |-> 1], [term |-> 1, lsn |-> 1]>>, + nextSendLsn |-> (a2 :> 1) ] @@ + p2 :> + [ term |-> 3, + wal |-> <<>>, + state |-> "campaign", + votes |-> + ( a3 :> + [ termHistory |-> <<[term |-> 0, lsn |-> 1]>>, + flushLsn |-> 1 ] ), + termHistory |-> <<>>, + nextSendLsn |-> <<>> ] ) +/\ acc_state = ( a1 :> + [ term |-> 2, + wal |-> <<>>, + termHistory |-> + <<[term |-> 0, lsn |-> 1], [term |-> 2, lsn |-> 1]>> ] @@ + a2 :> + [ term |-> 1, + wal |-> <<>>, + termHistory |-> + <<[term |-> 0, lsn |-> 1], [term |-> 1, lsn |-> 1]>> ] @@ + a3 :> + [term |-> 3, wal |-> <<>>, termHistory |-> <<[term |-> 0, lsn |-> 1]>>] ) +/\ committed = {} + +State 14: +/\ prop_state = ( p1 :> + [ term |-> 1, + wal |-> <<1, 1>>, + state |-> "leader", + votes |-> + ( a1 :> + [ termHistory |-> <<[term |-> 0, lsn |-> 1]>>, + flushLsn |-> 1 ] @@ + a2 :> + [ termHistory |-> <<[term |-> 0, lsn |-> 1]>>, + flushLsn |-> 1 ] ), + termHistory |-> <<[term |-> 0, lsn |-> 1], [term |-> 1, lsn |-> 1]>>, + nextSendLsn |-> (a2 :> 1) ] @@ + p2 :> + [ term |-> 3, + 
wal |-> <<>>, + state |-> "campaign", + votes |-> + ( a3 :> + [ termHistory |-> <<[term |-> 0, lsn |-> 1]>>, + flushLsn |-> 1 ] ), + termHistory |-> <<>>, + nextSendLsn |-> <<>> ] ) +/\ acc_state = ( a1 :> + [ term |-> 2, + wal |-> <<>>, + termHistory |-> + <<[term |-> 0, lsn |-> 1], [term |-> 2, lsn |-> 1]>> ] @@ + a2 :> + [ term |-> 1, + wal |-> <<>>, + termHistory |-> + <<[term |-> 0, lsn |-> 1], [term |-> 1, lsn |-> 1]>> ] @@ + a3 :> + [term |-> 3, wal |-> <<>>, termHistory |-> <<[term |-> 0, lsn |-> 1]>>] ) +/\ committed = {} + +State 15: +/\ prop_state = ( p1 :> + [ term |-> 1, + wal |-> <<1, 1>>, + state |-> "leader", + votes |-> + ( a1 :> + [ termHistory |-> <<[term |-> 0, lsn |-> 1]>>, + flushLsn |-> 1 ] @@ + a2 :> + [ termHistory |-> <<[term |-> 0, lsn |-> 1]>>, + flushLsn |-> 1 ] ), + termHistory |-> <<[term |-> 0, lsn |-> 1], [term |-> 1, lsn |-> 1]>>, + nextSendLsn |-> (a2 :> 2) ] @@ + p2 :> + [ term |-> 3, + wal |-> <<>>, + state |-> "campaign", + votes |-> + ( a3 :> + [ termHistory |-> <<[term |-> 0, lsn |-> 1]>>, + flushLsn |-> 1 ] ), + termHistory |-> <<>>, + nextSendLsn |-> <<>> ] ) +/\ acc_state = ( a1 :> + [ term |-> 2, + wal |-> <<>>, + termHistory |-> + <<[term |-> 0, lsn |-> 1], [term |-> 2, lsn |-> 1]>> ] @@ + a2 :> + [ term |-> 1, + wal |-> <<1>>, + termHistory |-> + <<[term |-> 0, lsn |-> 1], [term |-> 1, lsn |-> 1]>> ] @@ + a3 :> + [term |-> 3, wal |-> <<>>, termHistory |-> <<[term |-> 0, lsn |-> 1]>>] ) +/\ committed = {} + +State 16: +/\ prop_state = ( p1 :> + [ term |-> 1, + wal |-> <<1, 1>>, + state |-> "leader", + votes |-> + ( a1 :> + [ termHistory |-> <<[term |-> 0, lsn |-> 1]>>, + flushLsn |-> 1 ] @@ + a2 :> + [ termHistory |-> <<[term |-> 0, lsn |-> 1]>>, + flushLsn |-> 1 ] ), + termHistory |-> <<[term |-> 0, lsn |-> 1], [term |-> 1, lsn |-> 1]>>, + nextSendLsn |-> (a2 :> 3) ] @@ + p2 :> + [ term |-> 3, + wal |-> <<>>, + state |-> "campaign", + votes |-> + ( a3 :> + [ termHistory |-> <<[term |-> 0, lsn |-> 1]>>, + flushLsn |-> 1 
] ), + termHistory |-> <<>>, + nextSendLsn |-> <<>> ] ) +/\ acc_state = ( a1 :> + [ term |-> 2, + wal |-> <<>>, + termHistory |-> + <<[term |-> 0, lsn |-> 1], [term |-> 2, lsn |-> 1]>> ] @@ + a2 :> + [ term |-> 1, + wal |-> <<1, 1>>, + termHistory |-> + <<[term |-> 0, lsn |-> 1], [term |-> 1, lsn |-> 1]>> ] @@ + a3 :> + [term |-> 3, wal |-> <<>>, termHistory |-> <<[term |-> 0, lsn |-> 1]>>] ) +/\ committed = {} + +State 17: +/\ prop_state = ( p1 :> + [ term |-> 4, + wal |-> <<>>, + state |-> "campaign", + votes |-> <<>>, + termHistory |-> <<>>, + nextSendLsn |-> <<>> ] @@ + p2 :> + [ term |-> 3, + wal |-> <<>>, + state |-> "campaign", + votes |-> + ( a3 :> + [ termHistory |-> <<[term |-> 0, lsn |-> 1]>>, + flushLsn |-> 1 ] ), + termHistory |-> <<>>, + nextSendLsn |-> <<>> ] ) +/\ acc_state = ( a1 :> + [ term |-> 2, + wal |-> <<>>, + termHistory |-> + <<[term |-> 0, lsn |-> 1], [term |-> 2, lsn |-> 1]>> ] @@ + a2 :> + [ term |-> 1, + wal |-> <<1, 1>>, + termHistory |-> + <<[term |-> 0, lsn |-> 1], [term |-> 1, lsn |-> 1]>> ] @@ + a3 :> + [term |-> 3, wal |-> <<>>, termHistory |-> <<[term |-> 0, lsn |-> 1]>>] ) +/\ committed = {} + +State 18: +/\ prop_state = ( p1 :> + [ term |-> 4, + wal |-> <<>>, + state |-> "campaign", + votes |-> + ( a1 :> + [ termHistory |-> + <<[term |-> 0, lsn |-> 1], [term |-> 2, lsn |-> 1]>>, + flushLsn |-> 1 ] ), + termHistory |-> <<>>, + nextSendLsn |-> <<>> ] @@ + p2 :> + [ term |-> 3, + wal |-> <<>>, + state |-> "campaign", + votes |-> + ( a3 :> + [ termHistory |-> <<[term |-> 0, lsn |-> 1]>>, + flushLsn |-> 1 ] ), + termHistory |-> <<>>, + nextSendLsn |-> <<>> ] ) +/\ acc_state = ( a1 :> + [ term |-> 4, + wal |-> <<>>, + termHistory |-> + <<[term |-> 0, lsn |-> 1], [term |-> 2, lsn |-> 1]>> ] @@ + a2 :> + [ term |-> 1, + wal |-> <<1, 1>>, + termHistory |-> + <<[term |-> 0, lsn |-> 1], [term |-> 1, lsn |-> 1]>> ] @@ + a3 :> + [term |-> 3, wal |-> <<>>, termHistory |-> <<[term |-> 0, lsn |-> 1]>>] ) +/\ committed = {} + +State 19: +/\ 
prop_state = ( p1 :> + [ term |-> 4, + wal |-> <<>>, + state |-> "campaign", + votes |-> + ( a1 :> + [ termHistory |-> + <<[term |-> 0, lsn |-> 1], [term |-> 2, lsn |-> 1]>>, + flushLsn |-> 1 ] ), + termHistory |-> <<>>, + nextSendLsn |-> <<>> ] @@ + p2 :> + [ term |-> 3, + wal |-> <<>>, + state |-> "campaign", + votes |-> + ( a2 :> + [ termHistory |-> + <<[term |-> 0, lsn |-> 1], [term |-> 1, lsn |-> 1]>>, + flushLsn |-> 3 ] @@ + a3 :> + [ termHistory |-> <<[term |-> 0, lsn |-> 1]>>, + flushLsn |-> 1 ] ), + termHistory |-> <<>>, + nextSendLsn |-> <<>> ] ) +/\ acc_state = ( a1 :> + [ term |-> 4, + wal |-> <<>>, + termHistory |-> + <<[term |-> 0, lsn |-> 1], [term |-> 2, lsn |-> 1]>> ] @@ + a2 :> + [ term |-> 3, + wal |-> <<1, 1>>, + termHistory |-> + <<[term |-> 0, lsn |-> 1], [term |-> 1, lsn |-> 1]>> ] @@ + a3 :> + [term |-> 3, wal |-> <<>>, termHistory |-> <<[term |-> 0, lsn |-> 1]>>] ) +/\ committed = {} + +State 20: +/\ prop_state = ( p1 :> + [ term |-> 4, + wal |-> <<>>, + state |-> "campaign", + votes |-> + ( a1 :> + [ termHistory |-> + <<[term |-> 0, lsn |-> 1], [term |-> 2, lsn |-> 1]>>, + flushLsn |-> 1 ] ), + termHistory |-> <<>>, + nextSendLsn |-> <<>> ] @@ + p2 :> + [ term |-> 3, + wal |-> <<1, 1>>, + state |-> "leader", + votes |-> + ( a2 :> + [ termHistory |-> + <<[term |-> 0, lsn |-> 1], [term |-> 1, lsn |-> 1]>>, + flushLsn |-> 3 ] @@ + a3 :> + [ termHistory |-> <<[term |-> 0, lsn |-> 1]>>, + flushLsn |-> 1 ] ), + termHistory |-> + << [term |-> 0, lsn |-> 1], + [term |-> 1, lsn |-> 1], + [term |-> 3, lsn |-> 3] >>, + nextSendLsn |-> <<>> ] ) +/\ acc_state = ( a1 :> + [ term |-> 4, + wal |-> <<>>, + termHistory |-> + <<[term |-> 0, lsn |-> 1], [term |-> 2, lsn |-> 1]>> ] @@ + a2 :> + [ term |-> 3, + wal |-> <<1, 1>>, + termHistory |-> + <<[term |-> 0, lsn |-> 1], [term |-> 1, lsn |-> 1]>> ] @@ + a3 :> + [term |-> 3, wal |-> <<>>, termHistory |-> <<[term |-> 0, lsn |-> 1]>>] ) +/\ committed = {} + +State 21: +/\ prop_state = ( p1 :> + [ term |-> 4, + 
wal |-> <<>>, + state |-> "campaign", + votes |-> + ( a1 :> + [ termHistory |-> + <<[term |-> 0, lsn |-> 1], [term |-> 2, lsn |-> 1]>>, + flushLsn |-> 1 ] ), + termHistory |-> <<>>, + nextSendLsn |-> <<>> ] @@ + p2 :> + [ term |-> 3, + wal |-> <<1, 1>>, + state |-> "leader", + votes |-> + ( a2 :> + [ termHistory |-> + <<[term |-> 0, lsn |-> 1], [term |-> 1, lsn |-> 1]>>, + flushLsn |-> 3 ] @@ + a3 :> + [ termHistory |-> <<[term |-> 0, lsn |-> 1]>>, + flushLsn |-> 1 ] ), + termHistory |-> + << [term |-> 0, lsn |-> 1], + [term |-> 1, lsn |-> 1], + [term |-> 3, lsn |-> 3] >>, + nextSendLsn |-> (a2 :> 3) ] ) +/\ acc_state = ( a1 :> + [ term |-> 4, + wal |-> <<>>, + termHistory |-> + <<[term |-> 0, lsn |-> 1], [term |-> 2, lsn |-> 1]>> ] @@ + a2 :> + [ term |-> 3, + wal |-> <<1, 1>>, + termHistory |-> + << [term |-> 0, lsn |-> 1], + [term |-> 1, lsn |-> 1], + [term |-> 3, lsn |-> 3] >> ] @@ + a3 :> + [term |-> 3, wal |-> <<>>, termHistory |-> <<[term |-> 0, lsn |-> 1]>>] ) +/\ committed = {} + +State 22: +/\ prop_state = ( p1 :> + [ term |-> 4, + wal |-> <<>>, + state |-> "campaign", + votes |-> + ( a1 :> + [ termHistory |-> + <<[term |-> 0, lsn |-> 1], [term |-> 2, lsn |-> 1]>>, + flushLsn |-> 1 ] ), + termHistory |-> <<>>, + nextSendLsn |-> <<>> ] @@ + p2 :> + [ term |-> 3, + wal |-> <<1, 1>>, + state |-> "leader", + votes |-> + ( a2 :> + [ termHistory |-> + <<[term |-> 0, lsn |-> 1], [term |-> 1, lsn |-> 1]>>, + flushLsn |-> 3 ] @@ + a3 :> + [ termHistory |-> <<[term |-> 0, lsn |-> 1]>>, + flushLsn |-> 1 ] ), + termHistory |-> + << [term |-> 0, lsn |-> 1], + [term |-> 1, lsn |-> 1], + [term |-> 3, lsn |-> 3] >>, + nextSendLsn |-> (a2 :> 3 @@ a3 :> 1) ] ) +/\ acc_state = ( a1 :> + [ term |-> 4, + wal |-> <<>>, + termHistory |-> + <<[term |-> 0, lsn |-> 1], [term |-> 2, lsn |-> 1]>> ] @@ + a2 :> + [ term |-> 3, + wal |-> <<1, 1>>, + termHistory |-> + << [term |-> 0, lsn |-> 1], + [term |-> 1, lsn |-> 1], + [term |-> 3, lsn |-> 3] >> ] @@ + a3 :> + [ term |-> 3, + wal 
|-> <<>>, + termHistory |-> + << [term |-> 0, lsn |-> 1], + [term |-> 1, lsn |-> 1], + [term |-> 3, lsn |-> 3] >> ] ) +/\ committed = {} + +State 23: +/\ prop_state = ( p1 :> + [ term |-> 4, + wal |-> <<>>, + state |-> "campaign", + votes |-> + ( a1 :> + [ termHistory |-> + <<[term |-> 0, lsn |-> 1], [term |-> 2, lsn |-> 1]>>, + flushLsn |-> 1 ] ), + termHistory |-> <<>>, + nextSendLsn |-> <<>> ] @@ + p2 :> + [ term |-> 3, + wal |-> <<1, 1>>, + state |-> "leader", + votes |-> + ( a2 :> + [ termHistory |-> + <<[term |-> 0, lsn |-> 1], [term |-> 1, lsn |-> 1]>>, + flushLsn |-> 3 ] @@ + a3 :> + [ termHistory |-> <<[term |-> 0, lsn |-> 1]>>, + flushLsn |-> 1 ] ), + termHistory |-> + << [term |-> 0, lsn |-> 1], + [term |-> 1, lsn |-> 1], + [term |-> 3, lsn |-> 3] >>, + nextSendLsn |-> (a2 :> 3 @@ a3 :> 2) ] ) +/\ acc_state = ( a1 :> + [ term |-> 4, + wal |-> <<>>, + termHistory |-> + <<[term |-> 0, lsn |-> 1], [term |-> 2, lsn |-> 1]>> ] @@ + a2 :> + [ term |-> 3, + wal |-> <<1, 1>>, + termHistory |-> + << [term |-> 0, lsn |-> 1], + [term |-> 1, lsn |-> 1], + [term |-> 3, lsn |-> 3] >> ] @@ + a3 :> + [ term |-> 3, + wal |-> <<1>>, + termHistory |-> + << [term |-> 0, lsn |-> 1], + [term |-> 1, lsn |-> 1], + [term |-> 3, lsn |-> 3] >> ] ) +/\ committed = {} + +State 24: +/\ prop_state = ( p1 :> + [ term |-> 4, + wal |-> <<>>, + state |-> "campaign", + votes |-> + ( a1 :> + [ termHistory |-> + <<[term |-> 0, lsn |-> 1], [term |-> 2, lsn |-> 1]>>, + flushLsn |-> 1 ] ), + termHistory |-> <<>>, + nextSendLsn |-> <<>> ] @@ + p2 :> + [ term |-> 3, + wal |-> <<1, 1>>, + state |-> "leader", + votes |-> + ( a2 :> + [ termHistory |-> + <<[term |-> 0, lsn |-> 1], [term |-> 1, lsn |-> 1]>>, + flushLsn |-> 3 ] @@ + a3 :> + [ termHistory |-> <<[term |-> 0, lsn |-> 1]>>, + flushLsn |-> 1 ] ), + termHistory |-> + << [term |-> 0, lsn |-> 1], + [term |-> 1, lsn |-> 1], + [term |-> 3, lsn |-> 3] >>, + nextSendLsn |-> (a2 :> 3 @@ a3 :> 2) ] ) +/\ acc_state = ( a1 :> + [ term |-> 4, + wal |-> 
<<>>, + termHistory |-> + <<[term |-> 0, lsn |-> 1], [term |-> 2, lsn |-> 1]>> ] @@ + a2 :> + [ term |-> 3, + wal |-> <<1, 1>>, + termHistory |-> + << [term |-> 0, lsn |-> 1], + [term |-> 1, lsn |-> 1], + [term |-> 3, lsn |-> 3] >> ] @@ + a3 :> + [ term |-> 3, + wal |-> <<1>>, + termHistory |-> + << [term |-> 0, lsn |-> 1], + [term |-> 1, lsn |-> 1], + [term |-> 3, lsn |-> 3] >> ] ) +/\ committed = {[term |-> 1, lsn |-> 1]} + +State 25: +/\ prop_state = ( p1 :> + [ term |-> 4, + wal |-> <<>>, + state |-> "campaign", + votes |-> + ( a1 :> + [ termHistory |-> + <<[term |-> 0, lsn |-> 1], [term |-> 2, lsn |-> 1]>>, + flushLsn |-> 1 ] @@ + a3 :> + [ termHistory |-> + <<[term |-> 0, lsn |-> 1], [term |-> 1, lsn |-> 1]>>, + flushLsn |-> 2 ] ), + termHistory |-> <<>>, + nextSendLsn |-> <<>> ] @@ + p2 :> + [ term |-> 3, + wal |-> <<1, 1>>, + state |-> "leader", + votes |-> + ( a2 :> + [ termHistory |-> + <<[term |-> 0, lsn |-> 1], [term |-> 1, lsn |-> 1]>>, + flushLsn |-> 3 ] @@ + a3 :> + [ termHistory |-> <<[term |-> 0, lsn |-> 1]>>, + flushLsn |-> 1 ] ), + termHistory |-> + << [term |-> 0, lsn |-> 1], + [term |-> 1, lsn |-> 1], + [term |-> 3, lsn |-> 3] >>, + nextSendLsn |-> (a2 :> 3 @@ a3 :> 2) ] ) +/\ acc_state = ( a1 :> + [ term |-> 4, + wal |-> <<>>, + termHistory |-> + <<[term |-> 0, lsn |-> 1], [term |-> 2, lsn |-> 1]>> ] @@ + a2 :> + [ term |-> 3, + wal |-> <<1, 1>>, + termHistory |-> + << [term |-> 0, lsn |-> 1], + [term |-> 1, lsn |-> 1], + [term |-> 3, lsn |-> 3] >> ] @@ + a3 :> + [ term |-> 4, + wal |-> <<1>>, + termHistory |-> + << [term |-> 0, lsn |-> 1], + [term |-> 1, lsn |-> 1], + [term |-> 3, lsn |-> 3] >> ] ) +/\ committed = {[term |-> 1, lsn |-> 1]} + +State 26: +/\ prop_state = ( p1 :> + [ term |-> 4, + wal |-> <<>>, + state |-> "leader", + votes |-> + ( a1 :> + [ termHistory |-> + <<[term |-> 0, lsn |-> 1], [term |-> 2, lsn |-> 1]>>, + flushLsn |-> 1 ] @@ + a3 :> + [ termHistory |-> + <<[term |-> 0, lsn |-> 1], [term |-> 1, lsn |-> 1]>>, + flushLsn 
|-> 2 ] ), + termHistory |-> + << [term |-> 0, lsn |-> 1], + [term |-> 2, lsn |-> 1], + [term |-> 4, lsn |-> 1] >>, + nextSendLsn |-> <<>> ] @@ + p2 :> + [ term |-> 3, + wal |-> <<1, 1>>, + state |-> "leader", + votes |-> + ( a2 :> + [ termHistory |-> + <<[term |-> 0, lsn |-> 1], [term |-> 1, lsn |-> 1]>>, + flushLsn |-> 3 ] @@ + a3 :> + [ termHistory |-> <<[term |-> 0, lsn |-> 1]>>, + flushLsn |-> 1 ] ), + termHistory |-> + << [term |-> 0, lsn |-> 1], + [term |-> 1, lsn |-> 1], + [term |-> 3, lsn |-> 3] >>, + nextSendLsn |-> (a2 :> 3 @@ a3 :> 2) ] ) +/\ acc_state = ( a1 :> + [ term |-> 4, + wal |-> <<>>, + termHistory |-> + <<[term |-> 0, lsn |-> 1], [term |-> 2, lsn |-> 1]>> ] @@ + a2 :> + [ term |-> 3, + wal |-> <<1, 1>>, + termHistory |-> + << [term |-> 0, lsn |-> 1], + [term |-> 1, lsn |-> 1], + [term |-> 3, lsn |-> 3] >> ] @@ + a3 :> + [ term |-> 4, + wal |-> <<1>>, + termHistory |-> + << [term |-> 0, lsn |-> 1], + [term |-> 1, lsn |-> 1], + [term |-> 3, lsn |-> 3] >> ] ) +/\ committed = {[term |-> 1, lsn |-> 1]} + +State 27: +/\ prop_state = ( p1 :> + [ term |-> 4, + wal |-> <<>>, + state |-> "leader", + votes |-> + ( a1 :> + [ termHistory |-> + <<[term |-> 0, lsn |-> 1], [term |-> 2, lsn |-> 1]>>, + flushLsn |-> 1 ] @@ + a3 :> + [ termHistory |-> + <<[term |-> 0, lsn |-> 1], [term |-> 1, lsn |-> 1]>>, + flushLsn |-> 2 ] ), + termHistory |-> + << [term |-> 0, lsn |-> 1], + [term |-> 2, lsn |-> 1], + [term |-> 4, lsn |-> 1] >>, + nextSendLsn |-> (a3 :> 1) ] @@ + p2 :> + [ term |-> 3, + wal |-> <<1, 1>>, + state |-> "leader", + votes |-> + ( a2 :> + [ termHistory |-> + <<[term |-> 0, lsn |-> 1], [term |-> 1, lsn |-> 1]>>, + flushLsn |-> 3 ] @@ + a3 :> + [ termHistory |-> <<[term |-> 0, lsn |-> 1]>>, + flushLsn |-> 1 ] ), + termHistory |-> + << [term |-> 0, lsn |-> 1], + [term |-> 1, lsn |-> 1], + [term |-> 3, lsn |-> 3] >>, + nextSendLsn |-> (a2 :> 3 @@ a3 :> 2) ] ) +/\ acc_state = ( a1 :> + [ term |-> 4, + wal |-> <<>>, + termHistory |-> + <<[term |-> 0, lsn 
|-> 1], [term |-> 2, lsn |-> 1]>> ] @@ + a2 :> + [ term |-> 3, + wal |-> <<1, 1>>, + termHistory |-> + << [term |-> 0, lsn |-> 1], + [term |-> 1, lsn |-> 1], + [term |-> 3, lsn |-> 3] >> ] @@ + a3 :> + [ term |-> 4, + wal |-> <<>>, + termHistory |-> + << [term |-> 0, lsn |-> 1], + [term |-> 2, lsn |-> 1], + [term |-> 4, lsn |-> 1] >> ] ) +/\ committed = {[term |-> 1, lsn |-> 1]} + +State 28: +/\ prop_state = ( p1 :> + [ term |-> 4, + wal |-> <<4>>, + state |-> "leader", + votes |-> + ( a1 :> + [ termHistory |-> + <<[term |-> 0, lsn |-> 1], [term |-> 2, lsn |-> 1]>>, + flushLsn |-> 1 ] @@ + a3 :> + [ termHistory |-> + <<[term |-> 0, lsn |-> 1], [term |-> 1, lsn |-> 1]>>, + flushLsn |-> 2 ] ), + termHistory |-> + << [term |-> 0, lsn |-> 1], + [term |-> 2, lsn |-> 1], + [term |-> 4, lsn |-> 1] >>, + nextSendLsn |-> (a3 :> 1) ] @@ + p2 :> + [ term |-> 3, + wal |-> <<1, 1>>, + state |-> "leader", + votes |-> + ( a2 :> + [ termHistory |-> + <<[term |-> 0, lsn |-> 1], [term |-> 1, lsn |-> 1]>>, + flushLsn |-> 3 ] @@ + a3 :> + [ termHistory |-> <<[term |-> 0, lsn |-> 1]>>, + flushLsn |-> 1 ] ), + termHistory |-> + << [term |-> 0, lsn |-> 1], + [term |-> 1, lsn |-> 1], + [term |-> 3, lsn |-> 3] >>, + nextSendLsn |-> (a2 :> 3 @@ a3 :> 2) ] ) +/\ acc_state = ( a1 :> + [ term |-> 4, + wal |-> <<>>, + termHistory |-> + <<[term |-> 0, lsn |-> 1], [term |-> 2, lsn |-> 1]>> ] @@ + a2 :> + [ term |-> 3, + wal |-> <<1, 1>>, + termHistory |-> + << [term |-> 0, lsn |-> 1], + [term |-> 1, lsn |-> 1], + [term |-> 3, lsn |-> 3] >> ] @@ + a3 :> + [ term |-> 4, + wal |-> <<>>, + termHistory |-> + << [term |-> 0, lsn |-> 1], + [term |-> 2, lsn |-> 1], + [term |-> 4, lsn |-> 1] >> ] ) +/\ committed = {[term |-> 1, lsn |-> 1]} + +State 29: +/\ prop_state = ( p1 :> + [ term |-> 4, + wal |-> <<4>>, + state |-> "leader", + votes |-> + ( a1 :> + [ termHistory |-> + <<[term |-> 0, lsn |-> 1], [term |-> 2, lsn |-> 1]>>, + flushLsn |-> 1 ] @@ + a3 :> + [ termHistory |-> + <<[term |-> 0, lsn |-> 1], 
[term |-> 1, lsn |-> 1]>>, + flushLsn |-> 2 ] ), + termHistory |-> + << [term |-> 0, lsn |-> 1], + [term |-> 2, lsn |-> 1], + [term |-> 4, lsn |-> 1] >>, + nextSendLsn |-> (a3 :> 2) ] @@ + p2 :> + [ term |-> 3, + wal |-> <<1, 1>>, + state |-> "leader", + votes |-> + ( a2 :> + [ termHistory |-> + <<[term |-> 0, lsn |-> 1], [term |-> 1, lsn |-> 1]>>, + flushLsn |-> 3 ] @@ + a3 :> + [ termHistory |-> <<[term |-> 0, lsn |-> 1]>>, + flushLsn |-> 1 ] ), + termHistory |-> + << [term |-> 0, lsn |-> 1], + [term |-> 1, lsn |-> 1], + [term |-> 3, lsn |-> 3] >>, + nextSendLsn |-> (a2 :> 3 @@ a3 :> 2) ] ) +/\ acc_state = ( a1 :> + [ term |-> 4, + wal |-> <<>>, + termHistory |-> + <<[term |-> 0, lsn |-> 1], [term |-> 2, lsn |-> 1]>> ] @@ + a2 :> + [ term |-> 3, + wal |-> <<1, 1>>, + termHistory |-> + << [term |-> 0, lsn |-> 1], + [term |-> 1, lsn |-> 1], + [term |-> 3, lsn |-> 3] >> ] @@ + a3 :> + [ term |-> 4, + wal |-> <<4>>, + termHistory |-> + << [term |-> 0, lsn |-> 1], + [term |-> 2, lsn |-> 1], + [term |-> 4, lsn |-> 1] >> ] ) +/\ committed = {[term |-> 1, lsn |-> 1]} + +State 30: +/\ prop_state = ( p1 :> + [ term |-> 4, + wal |-> <<4>>, + state |-> "leader", + votes |-> + ( a1 :> + [ termHistory |-> + <<[term |-> 0, lsn |-> 1], [term |-> 2, lsn |-> 1]>>, + flushLsn |-> 1 ] @@ + a3 :> + [ termHistory |-> + <<[term |-> 0, lsn |-> 1], [term |-> 1, lsn |-> 1]>>, + flushLsn |-> 2 ] ), + termHistory |-> + << [term |-> 0, lsn |-> 1], + [term |-> 2, lsn |-> 1], + [term |-> 4, lsn |-> 1] >>, + nextSendLsn |-> (a1 :> 1 @@ a3 :> 2) ] @@ + p2 :> + [ term |-> 3, + wal |-> <<1, 1>>, + state |-> "leader", + votes |-> + ( a2 :> + [ termHistory |-> + <<[term |-> 0, lsn |-> 1], [term |-> 1, lsn |-> 1]>>, + flushLsn |-> 3 ] @@ + a3 :> + [ termHistory |-> <<[term |-> 0, lsn |-> 1]>>, + flushLsn |-> 1 ] ), + termHistory |-> + << [term |-> 0, lsn |-> 1], + [term |-> 1, lsn |-> 1], + [term |-> 3, lsn |-> 3] >>, + nextSendLsn |-> (a2 :> 3 @@ a3 :> 2) ] ) +/\ acc_state = ( a1 :> + [ term |-> 4, 
+ wal |-> <<>>, + termHistory |-> + << [term |-> 0, lsn |-> 1], + [term |-> 2, lsn |-> 1], + [term |-> 4, lsn |-> 1] >> ] @@ + a2 :> + [ term |-> 3, + wal |-> <<1, 1>>, + termHistory |-> + << [term |-> 0, lsn |-> 1], + [term |-> 1, lsn |-> 1], + [term |-> 3, lsn |-> 3] >> ] @@ + a3 :> + [ term |-> 4, + wal |-> <<4>>, + termHistory |-> + << [term |-> 0, lsn |-> 1], + [term |-> 2, lsn |-> 1], + [term |-> 4, lsn |-> 1] >> ] ) +/\ committed = {[term |-> 1, lsn |-> 1]} + +State 31: +/\ prop_state = ( p1 :> + [ term |-> 4, + wal |-> <<4>>, + state |-> "leader", + votes |-> + ( a1 :> + [ termHistory |-> + <<[term |-> 0, lsn |-> 1], [term |-> 2, lsn |-> 1]>>, + flushLsn |-> 1 ] @@ + a3 :> + [ termHistory |-> + <<[term |-> 0, lsn |-> 1], [term |-> 1, lsn |-> 1]>>, + flushLsn |-> 2 ] ), + termHistory |-> + << [term |-> 0, lsn |-> 1], + [term |-> 2, lsn |-> 1], + [term |-> 4, lsn |-> 1] >>, + nextSendLsn |-> (a1 :> 2 @@ a3 :> 2) ] @@ + p2 :> + [ term |-> 3, + wal |-> <<1, 1>>, + state |-> "leader", + votes |-> + ( a2 :> + [ termHistory |-> + <<[term |-> 0, lsn |-> 1], [term |-> 1, lsn |-> 1]>>, + flushLsn |-> 3 ] @@ + a3 :> + [ termHistory |-> <<[term |-> 0, lsn |-> 1]>>, + flushLsn |-> 1 ] ), + termHistory |-> + << [term |-> 0, lsn |-> 1], + [term |-> 1, lsn |-> 1], + [term |-> 3, lsn |-> 3] >>, + nextSendLsn |-> (a2 :> 3 @@ a3 :> 2) ] ) +/\ acc_state = ( a1 :> + [ term |-> 4, + wal |-> <<4>>, + termHistory |-> + << [term |-> 0, lsn |-> 1], + [term |-> 2, lsn |-> 1], + [term |-> 4, lsn |-> 1] >> ] @@ + a2 :> + [ term |-> 3, + wal |-> <<1, 1>>, + termHistory |-> + << [term |-> 0, lsn |-> 1], + [term |-> 1, lsn |-> 1], + [term |-> 3, lsn |-> 3] >> ] @@ + a3 :> + [ term |-> 4, + wal |-> <<4>>, + termHistory |-> + << [term |-> 0, lsn |-> 1], + [term |-> 2, lsn |-> 1], + [term |-> 4, lsn |-> 1] >> ] ) +/\ committed = {[term |-> 1, lsn |-> 1]} + +State 32: +/\ prop_state = ( p1 :> + [ term |-> 4, + wal |-> <<4>>, + state |-> "leader", + votes |-> + ( a1 :> + [ termHistory |-> + 
<<[term |-> 0, lsn |-> 1], [term |-> 2, lsn |-> 1]>>, + flushLsn |-> 1 ] @@ + a3 :> + [ termHistory |-> + <<[term |-> 0, lsn |-> 1], [term |-> 1, lsn |-> 1]>>, + flushLsn |-> 2 ] ), + termHistory |-> + << [term |-> 0, lsn |-> 1], + [term |-> 2, lsn |-> 1], + [term |-> 4, lsn |-> 1] >>, + nextSendLsn |-> (a1 :> 2 @@ a3 :> 2) ] @@ + p2 :> + [ term |-> 3, + wal |-> <<1, 1>>, + state |-> "leader", + votes |-> + ( a2 :> + [ termHistory |-> + <<[term |-> 0, lsn |-> 1], [term |-> 1, lsn |-> 1]>>, + flushLsn |-> 3 ] @@ + a3 :> + [ termHistory |-> <<[term |-> 0, lsn |-> 1]>>, + flushLsn |-> 1 ] ), + termHistory |-> + << [term |-> 0, lsn |-> 1], + [term |-> 1, lsn |-> 1], + [term |-> 3, lsn |-> 3] >>, + nextSendLsn |-> (a2 :> 3 @@ a3 :> 2) ] ) +/\ acc_state = ( a1 :> + [ term |-> 4, + wal |-> <<4>>, + termHistory |-> + << [term |-> 0, lsn |-> 1], + [term |-> 2, lsn |-> 1], + [term |-> 4, lsn |-> 1] >> ] @@ + a2 :> + [ term |-> 3, + wal |-> <<1, 1>>, + termHistory |-> + << [term |-> 0, lsn |-> 1], + [term |-> 1, lsn |-> 1], + [term |-> 3, lsn |-> 3] >> ] @@ + a3 :> + [ term |-> 4, + wal |-> <<4>>, + termHistory |-> + << [term |-> 0, lsn |-> 1], + [term |-> 2, lsn |-> 1], + [term |-> 4, lsn |-> 1] >> ] ) +/\ committed = {[term |-> 1, lsn |-> 1], [term |-> 4, lsn |-> 1]} + +1712918117 states generated, 174460942 distinct states found, 50658619 states left on queue. +The depth of the complete state graph search is 35. 
+Finished in 58min 19s at (2024-11-06 15:18:45) +Trace exploration spec path: ./MCProposerAcceptorStatic_TTrace_1730902825.tla diff --git a/safekeeper/spec/tlc-results/MCProposerAcceptorStatic.tla-MCProposerAcceptorStatic_p2_a3_t4_l4.cfg-2024-11-06--15-30-45.log b/safekeeper/spec/tlc-results/MCProposerAcceptorStatic.tla-MCProposerAcceptorStatic_p2_a3_t4_l4.cfg-2024-11-06--15-30-45.log new file mode 100644 index 0000000000..8248240ded --- /dev/null +++ b/safekeeper/spec/tlc-results/MCProposerAcceptorStatic.tla-MCProposerAcceptorStatic_p2_a3_t4_l4.cfg-2024-11-06--15-30-45.log @@ -0,0 +1,1374 @@ +git revision: 4f1ee6331 +Platform: Linux neon-dev-arm64-1 6.8.0-48-generic #48-Ubuntu SMP PREEMPT_DYNAMIC Fri Sep 27 14:35:45 UTC 2024 aarch64 aarch64 aarch64 GNU/Linux +CPU Info Linux: Neoverse-N1 +CPU Cores Linux: 80 +CPU Info Mac: +CPU Cores Mac: +Spec: MCProposerAcceptorStatic.tla +Config: models/MCProposerAcceptorStatic_p2_a3_t4_l4.cfg +---- +CONSTANTS +NULL = NULL +proposers = {p1, p2} +acceptors = {a1, a2, a3} +max_term = 4 +max_entries = 4 +SPECIFICATION Spec +CONSTRAINT StateConstraint +INVARIANT +TypeOk +ElectionSafety +LogIsMonotonic +LogSafety +SYMMETRY ProposerAcceptorSymmetry +CHECK_DEADLOCK FALSE +ALIAS Alias + +---- + +TLC2 Version 2.20 of Day Month 20?? (rev: f68cb71) +Running breadth-first search Model-Checking with fp 84 and seed -1069171980999686913 with 80 workers on 80 cores with 54613MB heap and 61440MB offheap memory [pid: 62544] (Linux 6.8.0-48-generic aarch64, Ubuntu 21.0.4 x86_64, OffHeapDiskFPSet, DiskStateQueue). 
+Parsing file /home/arseny/neon/safekeeper/spec/MCProposerAcceptorStatic.tla +Parsing file /tmp/tlc-6542850091824737097/TLC.tla (jar:file:/home/arseny/tla2tools.jar!/tla2sany/StandardModules/TLC.tla) +Parsing file /home/arseny/neon/safekeeper/spec/ProposerAcceptorStatic.tla +Parsing file /tmp/tlc-6542850091824737097/_TLCTrace.tla (jar:file:/home/arseny/tla2tools.jar!/tla2sany/StandardModules/_TLCTrace.tla) +Parsing file /tmp/tlc-6542850091824737097/Integers.tla (jar:file:/home/arseny/tla2tools.jar!/tla2sany/StandardModules/Integers.tla) +Parsing file /tmp/tlc-6542850091824737097/Sequences.tla (jar:file:/home/arseny/tla2tools.jar!/tla2sany/StandardModules/Sequences.tla) +Parsing file /tmp/tlc-6542850091824737097/FiniteSets.tla (jar:file:/home/arseny/tla2tools.jar!/tla2sany/StandardModules/FiniteSets.tla) +Parsing file /tmp/tlc-6542850091824737097/Naturals.tla (jar:file:/home/arseny/tla2tools.jar!/tla2sany/StandardModules/Naturals.tla) +Parsing file /tmp/tlc-6542850091824737097/TLCExt.tla (jar:file:/home/arseny/tla2tools.jar!/tla2sany/StandardModules/TLCExt.tla) +Semantic processing of module Naturals +Semantic processing of module Sequences +Semantic processing of module FiniteSets +Semantic processing of module TLC +Semantic processing of module Integers +Semantic processing of module ProposerAcceptorStatic +Semantic processing of module TLCExt +Semantic processing of module _TLCTrace +Semantic processing of module MCProposerAcceptorStatic +Starting... (2024-11-06 15:30:45) +Computing initial states... +Finished computing initial states: 1 distinct state generated at 2024-11-06 15:30:48. +Progress(20) at 2024-11-06 15:30:51: 956,386 states generated (956,386 s/min), 134,121 distinct states found (134,121 ds/min), 57,996 states left on queue. +Progress(27) at 2024-11-06 15:31:51: 30,048,294 states generated (29,091,908 s/min), 3,778,849 distinct states found (3,644,728 ds/min), 1,463,715 states left on queue. 
+Progress(28) at 2024-11-06 15:32:51: 59,092,248 states generated (29,043,954 s/min), 7,282,332 distinct states found (3,503,483 ds/min), 2,750,944 states left on queue. +Progress(29) at 2024-11-06 15:33:51: 88,333,136 states generated (29,240,888 s/min), 10,694,325 distinct states found (3,411,993 ds/min), 3,955,744 states left on queue. +Progress(29) at 2024-11-06 15:34:51: 117,708,994 states generated (29,375,858 s/min), 14,000,885 distinct states found (3,306,560 ds/min), 5,067,487 states left on queue. +Progress(30) at 2024-11-06 15:35:51: 146,847,667 states generated (29,138,673 s/min), 17,407,824 distinct states found (3,406,939 ds/min), 6,258,337 states left on queue. +Progress(30) at 2024-11-06 15:36:51: 176,211,801 states generated (29,364,134 s/min), 20,626,933 distinct states found (3,219,109 ds/min), 7,302,661 states left on queue. +Progress(31) at 2024-11-06 15:37:51: 205,665,438 states generated (29,453,637 s/min), 23,877,622 distinct states found (3,250,689 ds/min), 8,361,004 states left on queue. +Progress(31) at 2024-11-06 15:38:51: 234,757,357 states generated (29,091,919 s/min), 27,246,813 distinct states found (3,369,191 ds/min), 9,511,916 states left on queue. +Progress(31) at 2024-11-06 15:39:51: 264,154,436 states generated (29,397,079 s/min), 30,383,069 distinct states found (3,136,256 ds/min), 10,494,238 states left on queue. +Progress(31) at 2024-11-06 15:40:51: 293,638,121 states generated (29,483,685 s/min), 33,498,433 distinct states found (3,115,364 ds/min), 11,429,812 states left on queue. +Progress(32) at 2024-11-06 15:41:51: 323,039,991 states generated (29,401,870 s/min), 36,709,338 distinct states found (3,210,905 ds/min), 12,463,752 states left on queue. +Progress(32) at 2024-11-06 15:42:51: 352,081,458 states generated (29,041,467 s/min), 39,979,938 distinct states found (3,270,600 ds/min), 13,531,461 states left on queue. 
+Progress(32) at 2024-11-06 15:43:51: 381,472,323 states generated (29,390,865 s/min), 43,147,359 distinct states found (3,167,421 ds/min), 14,513,444 states left on queue. +Progress(32) at 2024-11-06 15:44:51: 410,911,764 states generated (29,439,441 s/min), 46,200,793 distinct states found (3,053,434 ds/min), 15,418,951 states left on queue. +Progress(32) at 2024-11-06 15:45:51: 440,514,627 states generated (29,602,863 s/min), 49,210,279 distinct states found (3,009,486 ds/min), 16,263,879 states left on queue. +Progress(33) at 2024-11-06 15:46:51: 470,070,180 states generated (29,555,553 s/min), 52,317,535 distinct states found (3,107,256 ds/min), 17,200,875 states left on queue. +Progress(33) at 2024-11-06 15:47:51: 499,387,268 states generated (29,317,088 s/min), 55,489,376 distinct states found (3,171,841 ds/min), 18,196,719 states left on queue. +Progress(33) at 2024-11-06 15:48:51: 528,308,354 states generated (28,921,086 s/min), 58,716,400 distinct states found (3,227,024 ds/min), 19,225,822 states left on queue. +Progress(33) at 2024-11-06 15:49:51: 557,626,508 states generated (29,318,154 s/min), 61,861,039 distinct states found (3,144,639 ds/min), 20,172,391 states left on queue. +Progress(33) at 2024-11-06 15:50:51: 587,011,551 states generated (29,385,043 s/min), 64,911,520 distinct states found (3,050,481 ds/min), 21,068,246 states left on queue. +Progress(33) at 2024-11-06 15:51:51: 616,469,665 states generated (29,458,114 s/min), 67,862,377 distinct states found (2,950,857 ds/min), 21,888,495 states left on queue. +Progress(33) at 2024-11-06 15:52:51: 646,037,901 states generated (29,568,236 s/min), 70,774,601 distinct states found (2,912,224 ds/min), 22,642,487 states left on queue. +Progress(33) at 2024-11-06 15:53:51: 675,679,292 states generated (29,641,391 s/min), 73,753,124 distinct states found (2,978,523 ds/min), 23,459,982 states left on queue. 
+Progress(34) at 2024-11-06 15:54:51: 705,213,119 states generated (29,533,827 s/min), 76,751,356 distinct states found (2,998,232 ds/min), 24,319,315 states left on queue. +Progress(34) at 2024-11-06 15:55:51: 734,548,637 states generated (29,335,518 s/min), 79,865,504 distinct states found (3,114,148 ds/min), 25,270,867 states left on queue. +Progress(34) at 2024-11-06 15:56:51: 763,724,351 states generated (29,175,714 s/min), 82,969,406 distinct states found (3,103,902 ds/min), 26,203,099 states left on queue. +Progress(34) at 2024-11-06 15:57:51: 792,795,916 states generated (29,071,565 s/min), 86,092,913 distinct states found (3,123,507 ds/min), 27,124,641 states left on queue. +Progress(34) at 2024-11-06 15:58:51: 822,084,221 states generated (29,288,305 s/min), 89,196,548 distinct states found (3,103,635 ds/min), 28,028,058 states left on queue. +Progress(34) at 2024-11-06 15:59:51: 851,516,510 states generated (29,432,289 s/min), 92,135,078 distinct states found (2,938,530 ds/min), 28,822,750 states left on queue. +Checkpointing of run states/24-11-06-15-30-45.354 +Checkpointing completed at (2024-11-06 16:00:51) +Progress(34) at 2024-11-06 16:00:51: 880,891,436 states generated (29,374,926 s/min), 95,133,622 distinct states found (2,998,544 ds/min), 29,669,470 states left on queue. +Progress(34) at 2024-11-06 16:01:51: 910,262,536 states generated (29,371,100 s/min), 98,019,631 distinct states found (2,886,009 ds/min), 30,433,293 states left on queue. +Progress(34) at 2024-11-06 16:02:51: 939,689,255 states generated (29,426,719 s/min), 100,814,884 distinct states found (2,795,253 ds/min), 31,083,132 states left on queue. +Progress(34) at 2024-11-06 16:03:51: 969,299,651 states generated (29,610,396 s/min), 103,664,772 distinct states found (2,849,888 ds/min), 31,821,093 states left on queue. 
+Progress(34) at 2024-11-06 16:04:51: 999,051,292 states generated (29,751,641 s/min), 106,544,287 distinct states found (2,879,515 ds/min), 32,536,946 states left on queue. +Progress(35) at 2024-11-06 16:05:51: 1,028,690,576 states generated (29,639,284 s/min), 109,444,362 distinct states found (2,900,075 ds/min), 33,326,316 states left on queue. +Progress(35) at 2024-11-06 16:06:51: 1,058,155,400 states generated (29,464,824 s/min), 112,439,937 distinct states found (2,995,575 ds/min), 34,167,604 states left on queue. +Progress(35) at 2024-11-06 16:07:51: 1,087,496,744 states generated (29,341,344 s/min), 115,461,649 distinct states found (3,021,712 ds/min), 35,032,974 states left on queue. +Progress(35) at 2024-11-06 16:08:51: 1,116,663,767 states generated (29,167,023 s/min), 118,482,838 distinct states found (3,021,189 ds/min), 35,902,651 states left on queue. +Progress(35) at 2024-11-06 16:09:51: 1,145,439,918 states generated (28,776,151 s/min), 121,562,159 distinct states found (3,079,321 ds/min), 36,785,088 states left on queue. +Progress(35) at 2024-11-06 16:10:51: 1,174,812,354 states generated (29,372,436 s/min), 124,511,721 distinct states found (2,949,562 ds/min), 37,555,204 states left on queue. +Progress(35) at 2024-11-06 16:11:51: 1,204,150,178 states generated (29,337,824 s/min), 127,579,155 distinct states found (3,067,434 ds/min), 38,425,790 states left on queue. +Progress(35) at 2024-11-06 16:12:51: 1,233,620,353 states generated (29,470,175 s/min), 130,490,427 distinct states found (2,911,272 ds/min), 39,188,412 states left on queue. +Progress(35) at 2024-11-06 16:13:51: 1,263,022,331 states generated (29,401,978 s/min), 133,317,160 distinct states found (2,826,733 ds/min), 39,893,070 states left on queue. +Progress(35) at 2024-11-06 16:14:51: 1,292,411,979 states generated (29,389,648 s/min), 136,229,817 distinct states found (2,912,657 ds/min), 40,666,029 states left on queue. 
+Progress(35) at 2024-11-06 16:15:51: 1,321,695,856 states generated (29,283,877 s/min), 139,081,910 distinct states found (2,852,093 ds/min), 41,389,715 states left on queue. +Progress(35) at 2024-11-06 16:16:51: 1,351,045,560 states generated (29,349,704 s/min), 141,811,662 distinct states found (2,729,752 ds/min), 41,999,267 states left on queue. +Progress(35) at 2024-11-06 16:17:51: 1,380,677,436 states generated (29,631,876 s/min), 144,516,072 distinct states found (2,704,410 ds/min), 42,579,779 states left on queue. +Progress(35) at 2024-11-06 16:18:51: 1,410,332,660 states generated (29,655,224 s/min), 147,269,848 distinct states found (2,753,776 ds/min), 43,232,732 states left on queue. +Progress(35) at 2024-11-06 16:19:51: 1,440,071,594 states generated (29,738,934 s/min), 150,116,683 distinct states found (2,846,835 ds/min), 43,917,859 states left on queue. +Progress(35) at 2024-11-06 16:20:51: 1,469,737,942 states generated (29,666,348 s/min), 152,881,605 distinct states found (2,764,922 ds/min), 44,594,909 states left on queue. +Progress(36) at 2024-11-06 16:21:51: 1,499,124,482 states generated (29,386,540 s/min), 155,722,313 distinct states found (2,840,708 ds/min), 45,306,186 states left on queue. +Progress(36) at 2024-11-06 16:22:51: 1,528,616,635 states generated (29,492,153 s/min), 158,643,911 distinct states found (2,921,598 ds/min), 46,098,600 states left on queue. +Progress(36) at 2024-11-06 16:23:51: 1,557,820,328 states generated (29,203,693 s/min), 161,651,516 distinct states found (3,007,605 ds/min), 46,958,572 states left on queue. +Progress(36) at 2024-11-06 16:24:51: 1,587,341,565 states generated (29,521,237 s/min), 164,469,424 distinct states found (2,817,908 ds/min), 47,648,932 states left on queue. +Progress(36) at 2024-11-06 16:25:51: 1,616,246,807 states generated (28,905,242 s/min), 167,471,199 distinct states found (3,001,775 ds/min), 48,496,844 states left on queue. 
+Progress(36) at 2024-11-06 16:26:51: 1,645,107,613 states generated (28,860,806 s/min), 170,454,103 distinct states found (2,982,904 ds/min), 49,283,244 states left on queue. +Progress(36) at 2024-11-06 16:27:51: 1,674,492,314 states generated (29,384,701 s/min), 173,343,045 distinct states found (2,888,942 ds/min), 50,006,895 states left on queue. +Progress(36) at 2024-11-06 16:28:51: 1,703,875,027 states generated (29,382,713 s/min), 176,157,623 distinct states found (2,814,578 ds/min), 50,662,128 states left on queue. +Progress(36) at 2024-11-06 16:29:51: 1,733,099,131 states generated (29,224,104 s/min), 179,186,519 distinct states found (3,028,896 ds/min), 51,498,029 states left on queue. +Checkpointing of run states/24-11-06-15-30-45.354 +Checkpointing completed at (2024-11-06 16:30:52) +Progress(36) at 2024-11-06 16:30:52: 1,762,724,622 states generated (29,625,491 s/min), 181,958,595 distinct states found (2,772,076 ds/min), 52,142,450 states left on queue. +Progress(36) at 2024-11-06 16:31:52: 1,792,118,288 states generated (29,393,666 s/min), 184,725,090 distinct states found (2,766,495 ds/min), 52,785,705 states left on queue. +Progress(36) at 2024-11-06 16:32:52: 1,821,258,069 states generated (29,139,781 s/min), 187,681,452 distinct states found (2,956,362 ds/min), 53,592,610 states left on queue. +Progress(36) at 2024-11-06 16:33:52: 1,850,729,054 states generated (29,470,985 s/min), 190,451,722 distinct states found (2,770,270 ds/min), 54,239,919 states left on queue. +Progress(36) at 2024-11-06 16:34:52: 1,879,860,913 states generated (29,131,859 s/min), 193,207,770 distinct states found (2,756,048 ds/min), 54,886,748 states left on queue. +Progress(36) at 2024-11-06 16:35:52: 1,909,200,565 states generated (29,339,652 s/min), 195,832,123 distinct states found (2,624,353 ds/min), 55,404,535 states left on queue. 
+Progress(36) at 2024-11-06 16:36:52: 1,938,403,873 states generated (29,203,308 s/min), 198,569,916 distinct states found (2,737,793 ds/min), 55,993,675 states left on queue. +Progress(36) at 2024-11-06 16:37:52: 1,968,097,695 states generated (29,693,822 s/min), 201,148,799 distinct states found (2,578,883 ds/min), 56,501,179 states left on queue. +Progress(36) at 2024-11-06 16:38:52: 1,997,628,304 states generated (29,530,609 s/min), 203,860,765 distinct states found (2,711,966 ds/min), 57,133,283 states left on queue. +Progress(36) at 2024-11-06 16:39:52: 2,027,338,755 states generated (29,710,451 s/min), 206,496,491 distinct states found (2,635,726 ds/min), 57,649,914 states left on queue. +Progress(36) at 2024-11-06 16:40:52: 2,057,072,538 states generated (29,733,783 s/min), 209,189,488 distinct states found (2,692,997 ds/min), 58,229,449 states left on queue. +Progress(36) at 2024-11-06 16:41:52: 2,086,549,250 states generated (29,476,712 s/min), 211,909,869 distinct states found (2,720,381 ds/min), 58,875,611 states left on queue. +Progress(37) at 2024-11-06 16:42:52: 2,115,953,926 states generated (29,404,676 s/min), 214,630,876 distinct states found (2,721,007 ds/min), 59,494,220 states left on queue. +Progress(37) at 2024-11-06 16:43:52: 2,145,423,196 states generated (29,469,270 s/min), 217,412,888 distinct states found (2,782,012 ds/min), 60,176,423 states left on queue. +Progress(37) at 2024-11-06 16:44:52: 2,174,796,796 states generated (29,373,600 s/min), 220,316,140 distinct states found (2,903,252 ds/min), 60,925,815 states left on queue. +Progress(37) at 2024-11-06 16:45:52: 2,203,907,384 states generated (29,110,588 s/min), 223,255,125 distinct states found (2,938,985 ds/min), 61,739,564 states left on queue. +Progress(37) at 2024-11-06 16:46:52: 2,233,378,272 states generated (29,470,888 s/min), 225,995,858 distinct states found (2,740,733 ds/min), 62,364,627 states left on queue. 
+Progress(37) at 2024-11-06 16:47:52: 2,262,648,334 states generated (29,270,062 s/min), 228,738,653 distinct states found (2,742,795 ds/min), 63,003,155 states left on queue. +Progress(37) at 2024-11-06 16:48:52: 2,291,309,648 states generated (28,661,314 s/min), 231,720,498 distinct states found (2,981,845 ds/min), 63,816,162 states left on queue. +Progress(37) at 2024-11-06 16:49:52: 2,320,153,384 states generated (28,843,736 s/min), 234,599,475 distinct states found (2,878,977 ds/min), 64,513,886 states left on queue. +Progress(37) at 2024-11-06 16:50:52: 2,349,538,907 states generated (29,385,523 s/min), 237,330,640 distinct states found (2,731,165 ds/min), 65,105,576 states left on queue. +Progress(37) at 2024-11-06 16:51:52: 2,379,015,082 states generated (29,476,175 s/min), 240,064,625 distinct states found (2,733,985 ds/min), 65,704,108 states left on queue. +Progress(37) at 2024-11-06 16:52:52: 2,408,376,582 states generated (29,361,500 s/min), 242,869,889 distinct states found (2,805,264 ds/min), 66,339,299 states left on queue. +Progress(37) at 2024-11-06 16:53:52: 2,437,554,516 states generated (29,177,934 s/min), 245,844,106 distinct states found (2,974,217 ds/min), 67,125,834 states left on queue. +Progress(37) at 2024-11-06 16:54:52: 2,466,925,193 states generated (29,370,677 s/min), 248,540,587 distinct states found (2,696,481 ds/min), 67,707,623 states left on queue. +Progress(37) at 2024-11-06 16:55:52: 2,496,386,977 states generated (29,461,784 s/min), 251,318,893 distinct states found (2,778,306 ds/min), 68,345,796 states left on queue. +Progress(37) at 2024-11-06 16:56:52: 2,525,837,965 states generated (29,450,988 s/min), 253,918,986 distinct states found (2,600,093 ds/min), 68,851,521 states left on queue. +Progress(37) at 2024-11-06 16:57:52: 2,555,073,687 states generated (29,235,722 s/min), 256,806,753 distinct states found (2,887,767 ds/min), 69,596,597 states left on queue. 
+Progress(37) at 2024-11-06 16:58:52: 2,584,381,294 states generated (29,307,607 s/min), 259,714,054 distinct states found (2,907,301 ds/min), 70,335,539 states left on queue. +Progress(37) at 2024-11-06 16:59:52: 2,613,557,081 states generated (29,175,787 s/min), 262,407,462 distinct states found (2,693,408 ds/min), 70,920,265 states left on queue. +Checkpointing of run states/24-11-06-15-30-45.354 +Checkpointing completed at (2024-11-06 17:00:53) +Progress(37) at 2024-11-06 17:00:53: 2,643,168,141 states generated (29,611,060 s/min), 264,973,171 distinct states found (2,565,709 ds/min), 71,384,749 states left on queue. +Progress(37) at 2024-11-06 17:01:53: 2,672,453,868 states generated (29,285,727 s/min), 267,551,971 distinct states found (2,578,800 ds/min), 71,854,220 states left on queue. +Progress(37) at 2024-11-06 17:02:53: 2,701,696,399 states generated (29,242,531 s/min), 270,233,135 distinct states found (2,681,164 ds/min), 72,406,567 states left on queue. +Progress(37) at 2024-11-06 17:03:53: 2,731,216,488 states generated (29,520,089 s/min), 272,711,390 distinct states found (2,478,255 ds/min), 72,805,269 states left on queue. +Progress(37) at 2024-11-06 17:04:53: 2,760,788,758 states generated (29,572,270 s/min), 275,307,217 distinct states found (2,595,827 ds/min), 73,313,123 states left on queue. +Progress(37) at 2024-11-06 17:05:53: 2,790,339,552 states generated (29,550,794 s/min), 277,881,113 distinct states found (2,573,896 ds/min), 73,833,900 states left on queue. +Progress(37) at 2024-11-06 17:06:53: 2,820,046,206 states generated (29,706,654 s/min), 280,371,086 distinct states found (2,489,973 ds/min), 74,231,258 states left on queue. +Progress(37) at 2024-11-06 17:07:53: 2,849,787,753 states generated (29,741,547 s/min), 283,097,131 distinct states found (2,726,045 ds/min), 74,814,735 states left on queue. 
+Progress(37) at 2024-11-06 17:08:53: 2,879,520,949 states generated (29,733,196 s/min), 285,608,053 distinct states found (2,510,922 ds/min), 75,293,894 states left on queue. +Progress(37) at 2024-11-06 17:09:53: 2,908,889,760 states generated (29,368,811 s/min), 288,274,872 distinct states found (2,666,819 ds/min), 75,880,480 states left on queue. +Progress(38) at 2024-11-06 17:10:53: 2,938,412,523 states generated (29,522,763 s/min), 290,877,598 distinct states found (2,602,726 ds/min), 76,391,156 states left on queue. +Progress(38) at 2024-11-06 17:11:53: 2,967,963,455 states generated (29,550,932 s/min), 293,492,146 distinct states found (2,614,548 ds/min), 76,932,124 states left on queue. +Progress(38) at 2024-11-06 17:12:53: 2,997,327,370 states generated (29,363,915 s/min), 296,353,306 distinct states found (2,861,160 ds/min), 77,659,606 states left on queue. +Progress(38) at 2024-11-06 17:13:53: 3,026,713,138 states generated (29,385,768 s/min), 299,173,963 distinct states found (2,820,657 ds/min), 78,342,645 states left on queue. +Progress(38) at 2024-11-06 17:14:53: 3,055,986,492 states generated (29,273,354 s/min), 302,024,049 distinct states found (2,850,086 ds/min), 79,071,501 states left on queue. +Progress(38) at 2024-11-06 17:15:53: 3,085,491,974 states generated (29,505,482 s/min), 304,668,970 distinct states found (2,644,921 ds/min), 79,608,084 states left on queue. +Progress(38) at 2024-11-06 17:16:53: 3,114,898,266 states generated (29,406,292 s/min), 307,272,526 distinct states found (2,603,556 ds/min), 80,132,575 states left on queue. +Progress(38) at 2024-11-06 17:17:53: 3,144,023,490 states generated (29,125,224 s/min), 310,022,073 distinct states found (2,749,547 ds/min), 80,777,238 states left on queue. +Progress(38) at 2024-11-06 17:18:53: 3,172,762,795 states generated (28,739,305 s/min), 312,891,905 distinct states found (2,869,832 ds/min), 81,497,739 states left on queue. 
+Progress(38) at 2024-11-06 17:19:53: 3,201,314,425 states generated (28,551,630 s/min), 315,766,566 distinct states found (2,874,661 ds/min), 82,171,729 states left on queue. +Progress(38) at 2024-11-06 17:20:53: 3,230,713,777 states generated (29,399,352 s/min), 318,365,612 distinct states found (2,599,046 ds/min), 82,638,018 states left on queue. +Progress(38) at 2024-11-06 17:21:53: 3,260,188,634 states generated (29,474,857 s/min), 321,040,810 distinct states found (2,675,198 ds/min), 83,185,708 states left on queue. +Progress(38) at 2024-11-06 17:22:53: 3,289,654,456 states generated (29,465,822 s/min), 323,660,313 distinct states found (2,619,503 ds/min), 83,689,075 states left on queue. +Progress(38) at 2024-11-06 17:23:53: 3,319,003,677 states generated (29,349,221 s/min), 326,391,347 distinct states found (2,731,034 ds/min), 84,261,368 states left on queue. +Progress(38) at 2024-11-06 17:24:53: 3,348,330,685 states generated (29,327,008 s/min), 329,204,934 distinct states found (2,813,587 ds/min), 84,925,046 states left on queue. +Progress(38) at 2024-11-06 17:25:53: 3,377,572,946 states generated (29,242,261 s/min), 331,997,887 distinct states found (2,792,953 ds/min), 85,533,473 states left on queue. +Progress(38) at 2024-11-06 17:26:53: 3,406,881,714 states generated (29,308,768 s/min), 334,599,745 distinct states found (2,601,858 ds/min), 86,047,276 states left on queue. +Progress(38) at 2024-11-06 17:27:53: 3,436,375,389 states generated (29,493,675 s/min), 337,261,572 distinct states found (2,661,827 ds/min), 86,591,357 states left on queue. +Progress(38) at 2024-11-06 17:28:53: 3,465,811,732 states generated (29,436,343 s/min), 339,829,613 distinct states found (2,568,041 ds/min), 87,057,550 states left on queue. +Progress(38) at 2024-11-06 17:29:53: 3,495,144,983 states generated (29,333,251 s/min), 342,566,275 distinct states found (2,736,662 ds/min), 87,671,131 states left on queue. 
+Checkpointing of run states/24-11-06-15-30-45.354 +Checkpointing completed at (2024-11-06 17:30:53) +Progress(38) at 2024-11-06 17:30:53: 3,524,611,246 states generated (29,466,263 s/min), 345,366,358 distinct states found (2,800,083 ds/min), 88,316,673 states left on queue. +Progress(38) at 2024-11-06 17:31:53: 3,553,819,331 states generated (29,208,085 s/min), 348,291,666 distinct states found (2,925,308 ds/min), 89,059,679 states left on queue. +Progress(38) at 2024-11-06 17:32:53: 3,583,208,821 states generated (29,389,490 s/min), 350,796,636 distinct states found (2,504,970 ds/min), 89,478,521 states left on queue. +Progress(38) at 2024-11-06 17:33:53: 3,612,329,910 states generated (29,121,089 s/min), 353,414,448 distinct states found (2,617,812 ds/min), 90,008,568 states left on queue. +Progress(38) at 2024-11-06 17:34:53: 3,641,485,253 states generated (29,155,343 s/min), 356,010,441 distinct states found (2,595,993 ds/min), 90,486,313 states left on queue. +Progress(38) at 2024-11-06 17:35:53: 3,670,761,645 states generated (29,276,392 s/min), 358,411,973 distinct states found (2,401,532 ds/min), 90,799,029 states left on queue. +Progress(38) at 2024-11-06 17:36:53: 3,700,008,207 states generated (29,246,562 s/min), 360,943,422 distinct states found (2,531,449 ds/min), 91,235,694 states left on queue. +Progress(38) at 2024-11-06 17:37:53: 3,729,045,761 states generated (29,037,554 s/min), 363,523,499 distinct states found (2,580,077 ds/min), 91,685,579 states left on queue. +Progress(38) at 2024-11-06 17:38:53: 3,758,697,262 states generated (29,651,501 s/min), 365,860,396 distinct states found (2,336,897 ds/min), 92,003,313 states left on queue. +Progress(38) at 2024-11-06 17:39:53: 3,788,188,489 states generated (29,491,227 s/min), 368,369,398 distinct states found (2,509,002 ds/min), 92,452,083 states left on queue. 
+Progress(38) at 2024-11-06 17:40:53: 3,817,718,772 states generated (29,530,283 s/min), 370,855,965 distinct states found (2,486,567 ds/min), 92,899,812 states left on queue. +Progress(38) at 2024-11-06 17:41:53: 3,847,372,748 states generated (29,653,976 s/min), 373,231,774 distinct states found (2,375,809 ds/min), 93,202,503 states left on queue. +Progress(38) at 2024-11-06 17:42:53: 3,877,091,950 states generated (29,719,202 s/min), 375,934,374 distinct states found (2,702,600 ds/min), 93,775,105 states left on queue. +Progress(38) at 2024-11-06 17:43:53: 3,906,843,295 states generated (29,751,345 s/min), 378,304,497 distinct states found (2,370,123 ds/min), 94,098,611 states left on queue. +Progress(38) at 2024-11-06 17:44:53: 3,936,304,033 states generated (29,460,738 s/min), 380,793,774 distinct states found (2,489,277 ds/min), 94,560,398 states left on queue. +Progress(38) at 2024-11-06 17:45:53: 3,965,687,311 states generated (29,383,278 s/min), 383,366,376 distinct states found (2,572,602 ds/min), 95,062,163 states left on queue. +Progress(38) at 2024-11-06 17:46:53: 3,995,264,758 states generated (29,577,447 s/min), 385,832,314 distinct states found (2,465,938 ds/min), 95,460,777 states left on queue. +Progress(38) at 2024-11-06 17:47:53: 4,024,519,333 states generated (29,254,575 s/min), 388,384,282 distinct states found (2,551,968 ds/min), 95,931,698 states left on queue. +Progress(38) at 2024-11-06 17:48:53: 4,054,053,752 states generated (29,534,419 s/min), 390,990,581 distinct states found (2,606,299 ds/min), 96,493,705 states left on queue. +Progress(38) at 2024-11-06 17:49:53: 4,083,403,606 states generated (29,349,854 s/min), 393,717,328 distinct states found (2,726,747 ds/min), 97,099,592 states left on queue. +Progress(38) at 2024-11-06 17:50:53: 4,112,753,694 states generated (29,350,088 s/min), 396,441,909 distinct states found (2,724,581 ds/min), 97,694,523 states left on queue. 
+Progress(38) at 2024-11-06 17:51:53: 4,141,940,951 states generated (29,187,257 s/min), 399,238,612 distinct states found (2,796,703 ds/min), 98,387,103 states left on queue. +Progress(38) at 2024-11-06 17:52:53: 4,171,185,273 states generated (29,244,322 s/min), 401,861,376 distinct states found (2,622,764 ds/min), 98,900,168 states left on queue. +Progress(38) at 2024-11-06 17:53:53: 4,200,735,055 states generated (29,549,782 s/min), 404,419,627 distinct states found (2,558,251 ds/min), 99,388,507 states left on queue. +Progress(38) at 2024-11-06 17:54:53: 4,230,057,902 states generated (29,322,847 s/min), 406,926,477 distinct states found (2,506,850 ds/min), 99,826,562 states left on queue. +Progress(38) at 2024-11-06 17:55:53: 4,259,279,515 states generated (29,221,613 s/min), 409,512,606 distinct states found (2,586,129 ds/min), 100,340,214 states left on queue. +Progress(38) at 2024-11-06 17:56:53: 4,288,265,663 states generated (28,986,148 s/min), 412,254,402 distinct states found (2,741,796 ds/min), 100,966,036 states left on queue. +Progress(38) at 2024-11-06 17:57:53: 4,316,798,413 states generated (28,532,750 s/min), 415,047,481 distinct states found (2,793,079 ds/min), 101,589,869 states left on queue. +Progress(38) at 2024-11-06 17:58:53: 4,345,527,290 states generated (28,728,877 s/min), 417,768,588 distinct states found (2,721,107 ds/min), 102,133,503 states left on queue. +Progress(38) at 2024-11-06 17:59:53: 4,374,924,942 states generated (29,397,652 s/min), 420,254,082 distinct states found (2,485,494 ds/min), 102,500,461 states left on queue. +Checkpointing of run states/24-11-06-15-30-45.354 +Checkpointing completed at (2024-11-06 18:00:54) +Progress(38) at 2024-11-06 18:00:54: 4,404,604,911 states generated (29,679,969 s/min), 422,801,691 distinct states found (2,547,609 ds/min), 102,936,440 states left on queue. 
+Progress(38) at 2024-11-06 18:01:54: 4,434,018,901 states generated (29,413,990 s/min), 425,477,119 distinct states found (2,675,428 ds/min), 103,472,987 states left on queue. +Progress(38) at 2024-11-06 18:02:54: 4,463,498,297 states generated (29,479,396 s/min), 427,949,289 distinct states found (2,472,170 ds/min), 103,858,839 states left on queue. +Progress(38) at 2024-11-06 18:03:54: 4,492,775,931 states generated (29,277,634 s/min), 430,592,094 distinct states found (2,642,805 ds/min), 104,353,609 states left on queue. +Progress(38) at 2024-11-06 18:04:54: 4,522,002,300 states generated (29,226,369 s/min), 433,322,584 distinct states found (2,730,490 ds/min), 104,949,753 states left on queue. +Progress(38) at 2024-11-06 18:05:54: 4,551,375,180 states generated (29,372,880 s/min), 436,005,138 distinct states found (2,682,554 ds/min), 105,482,546 states left on queue. +Progress(38) at 2024-11-06 18:06:54: 4,580,718,169 states generated (29,342,989 s/min), 438,516,579 distinct states found (2,511,441 ds/min), 105,868,435 states left on queue. +Progress(38) at 2024-11-06 18:07:54: 4,609,859,344 states generated (29,141,175 s/min), 441,134,700 distinct states found (2,618,121 ds/min), 106,390,335 states left on queue. +Progress(38) at 2024-11-06 18:08:54: 4,639,331,150 states generated (29,471,806 s/min), 443,662,679 distinct states found (2,527,979 ds/min), 106,821,264 states left on queue. +Progress(38) at 2024-11-06 18:09:54: 4,668,696,820 states generated (29,365,670 s/min), 446,222,969 distinct states found (2,560,290 ds/min), 107,277,508 states left on queue. +Progress(38) at 2024-11-06 18:10:54: 4,698,140,829 states generated (29,444,009 s/min), 448,693,022 distinct states found (2,470,053 ds/min), 107,654,262 states left on queue. +Progress(38) at 2024-11-06 18:11:54: 4,727,380,985 states generated (29,240,156 s/min), 451,459,276 distinct states found (2,766,254 ds/min), 108,284,101 states left on queue. 
+Progress(38) at 2024-11-06 18:12:54: 4,756,654,088 states generated (29,273,103 s/min), 454,180,180 distinct states found (2,720,904 ds/min), 108,879,205 states left on queue. +Progress(38) at 2024-11-06 18:13:54: 4,785,893,104 states generated (29,239,016 s/min), 457,001,077 distinct states found (2,820,897 ds/min), 109,511,015 states left on queue. +Progress(38) at 2024-11-06 18:14:54: 4,815,289,339 states generated (29,396,235 s/min), 459,530,340 distinct states found (2,529,263 ds/min), 109,951,588 states left on queue. +Progress(38) at 2024-11-06 18:15:54: 4,844,354,767 states generated (29,065,428 s/min), 462,144,567 distinct states found (2,614,227 ds/min), 110,455,692 states left on queue. +Progress(38) at 2024-11-06 18:16:54: 4,873,381,465 states generated (29,026,698 s/min), 464,718,128 distinct states found (2,573,561 ds/min), 110,936,992 states left on queue. +Progress(38) at 2024-11-06 18:17:54: 4,902,616,179 states generated (29,234,714 s/min), 467,171,620 distinct states found (2,453,492 ds/min), 111,288,450 states left on queue. +Progress(38) at 2024-11-06 18:18:54: 4,931,808,383 states generated (29,192,204 s/min), 469,593,253 distinct states found (2,421,633 ds/min), 111,607,240 states left on queue. +Progress(38) at 2024-11-06 18:19:54: 4,961,319,800 states generated (29,511,417 s/min), 471,795,067 distinct states found (2,201,814 ds/min), 111,770,077 states left on queue. +Progress(38) at 2024-11-06 18:20:54: 4,990,051,892 states generated (28,732,092 s/min), 474,595,717 distinct states found (2,800,650 ds/min), 112,380,795 states left on queue. +Progress(38) at 2024-11-06 18:21:54: 5,019,620,389 states generated (29,568,497 s/min), 476,860,178 distinct states found (2,264,461 ds/min), 112,610,789 states left on queue. +Progress(38) at 2024-11-06 18:22:54: 5,049,176,225 states generated (29,555,836 s/min), 479,117,000 distinct states found (2,256,822 ds/min), 112,849,809 states left on queue. 
+Progress(38) at 2024-11-06 18:23:54: 5,078,659,511 states generated (29,483,286 s/min), 481,552,566 distinct states found (2,435,566 ds/min), 113,238,679 states left on queue. +Progress(38) at 2024-11-06 18:24:54: 5,108,186,428 states generated (29,526,917 s/min), 483,970,290 distinct states found (2,417,724 ds/min), 113,645,974 states left on queue. +Progress(38) at 2024-11-06 18:25:54: 5,137,766,496 states generated (29,580,068 s/min), 486,204,445 distinct states found (2,234,155 ds/min), 113,816,273 states left on queue. +Progress(38) at 2024-11-06 18:26:54: 5,167,429,477 states generated (29,662,981 s/min), 488,726,479 distinct states found (2,522,034 ds/min), 114,265,425 states left on queue. +Progress(38) at 2024-11-06 18:27:54: 5,197,227,715 states generated (29,798,238 s/min), 491,213,848 distinct states found (2,487,369 ds/min), 114,645,624 states left on queue. +Progress(38) at 2024-11-06 18:28:54: 5,226,883,420 states generated (29,655,705 s/min), 493,480,968 distinct states found (2,267,120 ds/min), 114,901,786 states left on queue. +Progress(38) at 2024-11-06 18:29:54: 5,256,355,905 states generated (29,472,485 s/min), 495,866,549 distinct states found (2,385,581 ds/min), 115,277,276 states left on queue. +Checkpointing of run states/24-11-06-15-30-45.354 +Checkpointing completed at (2024-11-06 18:30:55) +Progress(38) at 2024-11-06 18:30:55: 5,286,035,252 states generated (29,679,347 s/min), 498,324,679 distinct states found (2,458,130 ds/min), 115,663,015 states left on queue. +Progress(38) at 2024-11-06 18:31:55: 5,315,467,724 states generated (29,432,472 s/min), 500,723,577 distinct states found (2,398,898 ds/min), 116,023,619 states left on queue. +Progress(38) at 2024-11-06 18:32:55: 5,344,728,453 states generated (29,260,729 s/min), 503,156,876 distinct states found (2,433,299 ds/min), 116,384,801 states left on queue. 
+Progress(38) at 2024-11-06 18:33:55: 5,374,055,231 states generated (29,326,778 s/min), 505,588,957 distinct states found (2,432,081 ds/min), 116,786,679 states left on queue. +Progress(38) at 2024-11-06 18:34:55: 5,403,566,278 states generated (29,511,047 s/min), 508,096,703 distinct states found (2,507,746 ds/min), 117,258,425 states left on queue. +Progress(38) at 2024-11-06 18:35:55: 5,432,770,932 states generated (29,204,654 s/min), 510,765,370 distinct states found (2,668,667 ds/min), 117,821,443 states left on queue. +Progress(38) at 2024-11-06 18:36:55: 5,462,325,607 states generated (29,554,675 s/min), 513,306,027 distinct states found (2,540,657 ds/min), 118,252,946 states left on queue. +Progress(38) at 2024-11-06 18:37:55: 5,491,531,381 states generated (29,205,774 s/min), 516,017,383 distinct states found (2,711,356 ds/min), 118,857,035 states left on queue. +Progress(38) at 2024-11-06 18:38:55: 5,520,744,572 states generated (29,213,191 s/min), 518,696,783 distinct states found (2,679,400 ds/min), 119,445,954 states left on queue. +Progress(38) at 2024-11-06 18:39:55: 5,549,903,819 states generated (29,159,247 s/min), 521,329,662 distinct states found (2,632,879 ds/min), 119,977,569 states left on queue. +Progress(38) at 2024-11-06 18:40:55: 5,579,474,839 states generated (29,571,020 s/min), 523,702,578 distinct states found (2,372,916 ds/min), 120,289,041 states left on queue. +Progress(38) at 2024-11-06 18:41:55: 5,608,757,550 states generated (29,282,711 s/min), 526,191,629 distinct states found (2,489,051 ds/min), 120,719,632 states left on queue. +Progress(38) at 2024-11-06 18:42:55: 5,638,085,090 states generated (29,327,540 s/min), 528,478,505 distinct states found (2,286,876 ds/min), 120,990,568 states left on queue. +Progress(38) at 2024-11-06 18:43:55: 5,667,141,833 states generated (29,056,743 s/min), 531,035,593 distinct states found (2,557,088 ds/min), 121,480,763 states left on queue. 
+Progress(38) at 2024-11-06 18:44:55: 5,696,139,104 states generated (28,997,271 s/min), 533,684,330 distinct states found (2,648,737 ds/min), 122,027,516 states left on queue. +Progress(38) at 2024-11-06 18:45:55: 5,724,868,902 states generated (28,729,798 s/min), 536,316,715 distinct states found (2,632,385 ds/min), 122,548,317 states left on queue. +Progress(38) at 2024-11-06 18:46:55: 5,753,438,871 states generated (28,569,969 s/min), 539,001,028 distinct states found (2,684,313 ds/min), 123,041,578 states left on queue. +Progress(38) at 2024-11-06 18:47:55: 5,782,391,778 states generated (28,952,907 s/min), 541,537,259 distinct states found (2,536,231 ds/min), 123,436,184 states left on queue. +Progress(38) at 2024-11-06 18:48:55: 5,811,823,996 states generated (29,432,218 s/min), 543,896,432 distinct states found (2,359,173 ds/min), 123,698,698 states left on queue. +Progress(38) at 2024-11-06 18:49:55: 5,841,258,941 states generated (29,434,945 s/min), 546,273,191 distinct states found (2,376,759 ds/min), 124,012,754 states left on queue. +Progress(38) at 2024-11-06 18:50:55: 5,870,667,995 states generated (29,409,054 s/min), 548,835,686 distinct states found (2,562,495 ds/min), 124,450,482 states left on queue. +Progress(38) at 2024-11-06 18:51:55: 5,900,038,718 states generated (29,370,723 s/min), 551,304,457 distinct states found (2,468,771 ds/min), 124,805,220 states left on queue. +Progress(38) at 2024-11-06 18:52:55: 5,929,442,421 states generated (29,403,703 s/min), 553,776,296 distinct states found (2,471,839 ds/min), 125,178,608 states left on queue. +Progress(38) at 2024-11-06 18:53:55: 5,958,838,496 states generated (29,396,075 s/min), 556,289,762 distinct states found (2,513,466 ds/min), 125,588,158 states left on queue. +Progress(38) at 2024-11-06 18:54:55: 5,988,187,325 states generated (29,348,829 s/min), 558,898,224 distinct states found (2,608,462 ds/min), 126,074,377 states left on queue. 
+Progress(38) at 2024-11-06 18:55:55: 6,017,546,111 states generated (29,358,786 s/min), 561,530,468 distinct states found (2,632,244 ds/min), 126,579,784 states left on queue. +Progress(38) at 2024-11-06 18:56:55: 6,046,777,143 states generated (29,231,032 s/min), 564,182,546 distinct states found (2,652,078 ds/min), 127,037,883 states left on queue. +Progress(39) at 2024-11-06 18:57:55: 6,076,111,479 states generated (29,334,336 s/min), 566,509,898 distinct states found (2,327,352 ds/min), 127,319,036 states left on queue. +Progress(39) at 2024-11-06 18:58:55: 6,105,215,668 states generated (29,104,189 s/min), 569,000,954 distinct states found (2,491,056 ds/min), 127,724,185 states left on queue. +Progress(39) at 2024-11-06 18:59:55: 6,134,619,650 states generated (29,403,982 s/min), 571,444,199 distinct states found (2,443,245 ds/min), 128,083,849 states left on queue. +Checkpointing of run states/24-11-06-15-30-45.354 +Checkpointing completed at (2024-11-06 19:00:55) +Progress(39) at 2024-11-06 19:00:55: 6,164,303,226 states generated (29,683,576 s/min), 574,046,920 distinct states found (2,602,721 ds/min), 128,537,330 states left on queue. +Progress(39) at 2024-11-06 19:01:55: 6,193,710,515 states generated (29,407,289 s/min), 576,294,161 distinct states found (2,247,241 ds/min), 128,749,186 states left on queue. +Progress(39) at 2024-11-06 19:02:55: 6,223,050,437 states generated (29,339,922 s/min), 578,840,811 distinct states found (2,546,650 ds/min), 129,198,375 states left on queue. +Progress(39) at 2024-11-06 19:03:55: 6,252,273,339 states generated (29,222,902 s/min), 581,530,481 distinct states found (2,689,670 ds/min), 129,745,195 states left on queue. +Progress(39) at 2024-11-06 19:04:55: 6,281,535,213 states generated (29,261,874 s/min), 584,206,969 distinct states found (2,676,488 ds/min), 130,306,182 states left on queue. 
+Progress(39) at 2024-11-06 19:05:55: 6,310,569,147 states generated (29,033,934 s/min), 587,031,959 distinct states found (2,824,990 ds/min), 130,922,629 states left on queue. +Progress(39) at 2024-11-06 19:06:55: 6,339,951,741 states generated (29,382,594 s/min), 589,709,668 distinct states found (2,677,709 ds/min), 131,483,555 states left on queue. +Progress(39) at 2024-11-06 19:07:55: 6,369,354,481 states generated (29,402,740 s/min), 591,964,654 distinct states found (2,254,986 ds/min), 131,688,532 states left on queue. +Progress(39) at 2024-11-06 19:08:55: 6,398,254,591 states generated (28,900,110 s/min), 594,604,924 distinct states found (2,640,270 ds/min), 132,195,069 states left on queue. +Progress(39) at 2024-11-06 19:09:55: 6,427,422,756 states generated (29,168,165 s/min), 597,059,083 distinct states found (2,454,159 ds/min), 132,571,626 states left on queue. +Progress(39) at 2024-11-06 19:10:55: 6,456,469,721 states generated (29,046,965 s/min), 599,400,317 distinct states found (2,341,234 ds/min), 132,826,474 states left on queue. +Progress(39) at 2024-11-06 19:11:55: 6,485,733,442 states generated (29,263,721 s/min), 602,040,336 distinct states found (2,640,019 ds/min), 133,286,664 states left on queue. +Progress(39) at 2024-11-06 19:12:55: 6,515,001,998 states generated (29,268,556 s/min), 604,003,958 distinct states found (1,963,622 ds/min), 133,255,252 states left on queue. +Progress(39) at 2024-11-06 19:13:55: 6,544,172,146 states generated (29,170,148 s/min), 606,473,164 distinct states found (2,469,206 ds/min), 133,627,323 states left on queue. +Progress(39) at 2024-11-06 19:14:55: 6,572,975,355 states generated (28,803,209 s/min), 609,043,606 distinct states found (2,570,442 ds/min), 134,023,262 states left on queue. +Progress(39) at 2024-11-06 19:15:55: 6,602,534,934 states generated (29,559,579 s/min), 611,212,652 distinct states found (2,169,046 ds/min), 134,205,070 states left on queue. 
+Progress(39) at 2024-11-06 19:16:55: 6,632,044,851 states generated (29,509,917 s/min), 613,377,378 distinct states found (2,164,726 ds/min), 134,360,577 states left on queue. +Progress(39) at 2024-11-06 19:17:55: 6,661,465,356 states generated (29,420,505 s/min), 615,729,605 distinct states found (2,352,227 ds/min), 134,679,148 states left on queue. +Progress(39) at 2024-11-06 19:18:55: 6,690,848,776 states generated (29,383,420 s/min), 618,034,126 distinct states found (2,304,521 ds/min), 134,989,999 states left on queue. +Progress(39) at 2024-11-06 19:19:55: 6,720,362,641 states generated (29,513,865 s/min), 620,264,990 distinct states found (2,230,864 ds/min), 135,213,527 states left on queue. +Progress(39) at 2024-11-06 19:20:55: 6,749,995,972 states generated (29,633,331 s/min), 622,424,423 distinct states found (2,159,433 ds/min), 135,336,269 states left on queue. +Progress(39) at 2024-11-06 19:21:55: 6,779,641,479 states generated (29,645,507 s/min), 624,953,002 distinct states found (2,528,579 ds/min), 135,781,717 states left on queue. +Progress(39) at 2024-11-06 19:22:55: 6,809,496,805 states generated (29,855,326 s/min), 627,297,563 distinct states found (2,344,561 ds/min), 136,040,988 states left on queue. +Progress(39) at 2024-11-06 19:23:55: 6,839,096,708 states generated (29,599,903 s/min), 629,464,688 distinct states found (2,167,125 ds/min), 136,210,971 states left on queue. +Progress(39) at 2024-11-06 19:24:55: 6,868,614,311 states generated (29,517,603 s/min), 631,704,627 distinct states found (2,239,939 ds/min), 136,469,731 states left on queue. +Progress(39) at 2024-11-06 19:25:55: 6,897,932,930 states generated (29,318,619 s/min), 633,961,042 distinct states found (2,256,415 ds/min), 136,714,912 states left on queue. +Progress(39) at 2024-11-06 19:26:55: 6,927,200,602 states generated (29,267,672 s/min), 636,414,800 distinct states found (2,453,758 ds/min), 137,101,547 states left on queue. 
+Progress(39) at 2024-11-06 19:27:55: 6,956,755,074 states generated (29,554,472 s/min), 638,616,489 distinct states found (2,201,689 ds/min), 137,285,238 states left on queue. +Progress(39) at 2024-11-06 19:28:55: 6,985,926,285 states generated (29,171,211 s/min), 640,970,274 distinct states found (2,353,785 ds/min), 137,592,586 states left on queue. +Progress(39) at 2024-11-06 19:29:55: 7,015,240,294 states generated (29,314,009 s/min), 643,310,280 distinct states found (2,340,006 ds/min), 137,914,322 states left on queue. +Checkpointing of run states/24-11-06-15-30-45.354 +Checkpointing completed at (2024-11-06 19:30:56) +Progress(39) at 2024-11-06 19:30:56: 7,045,112,039 states generated (29,871,745 s/min), 645,650,251 distinct states found (2,339,971 ds/min), 138,248,533 states left on queue. +Progress(39) at 2024-11-06 19:31:56: 7,074,347,122 states generated (29,235,083 s/min), 648,286,341 distinct states found (2,636,090 ds/min), 138,800,606 states left on queue. +Progress(39) at 2024-11-06 19:32:56: 7,103,701,427 states generated (29,354,305 s/min), 650,776,754 distinct states found (2,490,413 ds/min), 139,200,935 states left on queue. +Progress(39) at 2024-11-06 19:33:56: 7,133,125,574 states generated (29,424,147 s/min), 653,222,778 distinct states found (2,446,024 ds/min), 139,553,972 states left on queue. +Progress(39) at 2024-11-06 19:34:56: 7,162,393,954 states generated (29,268,380 s/min), 655,812,815 distinct states found (2,590,037 ds/min), 140,051,736 states left on queue. +Progress(39) at 2024-11-06 19:35:56: 7,191,614,309 states generated (29,220,355 s/min), 658,388,779 distinct states found (2,575,964 ds/min), 140,550,430 states left on queue. +Progress(39) at 2024-11-06 19:36:56: 7,220,841,977 states generated (29,227,668 s/min), 660,885,901 distinct states found (2,497,122 ds/min), 140,973,038 states left on queue. 
+Progress(39) at 2024-11-06 19:37:56: 7,250,020,241 states generated (29,178,264 s/min), 663,335,701 distinct states found (2,449,800 ds/min), 141,327,800 states left on queue. +Progress(39) at 2024-11-06 19:38:56: 7,279,545,923 states generated (29,525,682 s/min), 665,706,252 distinct states found (2,370,551 ds/min), 141,666,628 states left on queue. +Progress(39) at 2024-11-06 19:39:56: 7,308,806,585 states generated (29,260,662 s/min), 668,059,763 distinct states found (2,353,511 ds/min), 141,985,139 states left on queue. +Progress(39) at 2024-11-06 19:40:56: 7,338,028,888 states generated (29,222,303 s/min), 670,241,848 distinct states found (2,182,085 ds/min), 142,169,842 states left on queue. +Progress(39) at 2024-11-06 19:41:56: 7,367,241,753 states generated (29,212,865 s/min), 672,613,255 distinct states found (2,371,407 ds/min), 142,507,724 states left on queue. +Progress(39) at 2024-11-06 19:42:56: 7,396,269,434 states generated (29,027,681 s/min), 675,112,517 distinct states found (2,499,262 ds/min), 142,941,967 states left on queue. +Progress(39) at 2024-11-06 19:43:56: 7,425,237,701 states generated (28,968,267 s/min), 677,646,850 distinct states found (2,534,333 ds/min), 143,388,301 states left on queue. +Progress(39) at 2024-11-06 19:44:56: 7,453,929,312 states generated (28,691,611 s/min), 680,183,486 distinct states found (2,536,636 ds/min), 143,823,998 states left on queue. +Progress(39) at 2024-11-06 19:45:56: 7,482,605,282 states generated (28,675,970 s/min), 682,751,269 distinct states found (2,567,783 ds/min), 144,211,694 states left on queue. +Progress(39) at 2024-11-06 19:46:56: 7,511,402,194 states generated (28,796,912 s/min), 685,177,338 distinct states found (2,426,069 ds/min), 144,502,576 states left on queue. +Progress(39) at 2024-11-06 19:47:56: 7,540,667,315 states generated (29,265,121 s/min), 687,470,422 distinct states found (2,293,084 ds/min), 144,717,485 states left on queue. 
+Progress(39) at 2024-11-06 19:48:56: 7,570,065,371 states generated (29,398,056 s/min), 689,724,172 distinct states found (2,253,750 ds/min), 144,895,541 states left on queue. +Progress(39) at 2024-11-06 19:49:56: 7,599,596,791 states generated (29,531,420 s/min), 692,064,101 distinct states found (2,339,929 ds/min), 145,171,911 states left on queue. +Progress(39) at 2024-11-06 19:50:56: 7,629,011,363 states generated (29,414,572 s/min), 694,540,161 distinct states found (2,476,060 ds/min), 145,540,423 states left on queue. +Progress(39) at 2024-11-06 19:51:56: 7,658,453,965 states generated (29,442,602 s/min), 696,912,122 distinct states found (2,371,961 ds/min), 145,809,567 states left on queue. +Progress(39) at 2024-11-06 19:52:56: 7,687,913,137 states generated (29,459,172 s/min), 699,240,630 distinct states found (2,328,508 ds/min), 146,098,273 states left on queue. +Progress(39) at 2024-11-06 19:53:56: 7,717,161,254 states generated (29,248,117 s/min), 701,789,915 distinct states found (2,549,285 ds/min), 146,502,121 states left on queue. +Progress(39) at 2024-11-06 19:54:56: 7,746,587,948 states generated (29,426,694 s/min), 704,037,014 distinct states found (2,247,099 ds/min), 146,684,369 states left on queue. +Progress(39) at 2024-11-06 19:55:56: 7,775,767,241 states generated (29,179,293 s/min), 706,750,225 distinct states found (2,713,211 ds/min), 147,270,858 states left on queue. +Progress(39) at 2024-11-06 19:56:56: 7,805,143,313 states generated (29,376,072 s/min), 709,214,940 distinct states found (2,464,715 ds/min), 147,627,166 states left on queue. +Progress(39) at 2024-11-06 19:57:56: 7,834,403,478 states generated (29,260,165 s/min), 711,759,633 distinct states found (2,544,693 ds/min), 147,996,842 states left on queue. +Progress(40) at 2024-11-06 19:58:56: 7,863,785,909 states generated (29,382,431 s/min), 713,915,903 distinct states found (2,156,270 ds/min), 148,107,480 states left on queue. 
+Progress(40) at 2024-11-06 19:59:56: 7,892,661,923 states generated (28,876,014 s/min), 716,529,052 distinct states found (2,613,149 ds/min), 148,615,346 states left on queue. +Checkpointing of run states/24-11-06-15-30-45.354 +Checkpointing completed at (2024-11-06 20:00:57) +Progress(40) at 2024-11-06 20:00:57: 7,922,354,868 states generated (29,692,945 s/min), 718,724,840 distinct states found (2,195,788 ds/min), 148,760,464 states left on queue. +Progress(40) at 2024-11-06 20:01:57: 7,951,821,345 states generated (29,466,477 s/min), 721,199,790 distinct states found (2,474,950 ds/min), 149,133,458 states left on queue. +Progress(40) at 2024-11-06 20:02:57: 7,981,212,562 states generated (29,391,217 s/min), 723,637,084 distinct states found (2,437,294 ds/min), 149,453,388 states left on queue. +Progress(40) at 2024-11-06 20:03:57: 8,010,639,344 states generated (29,426,782 s/min), 725,776,597 distinct states found (2,139,513 ds/min), 149,580,205 states left on queue. +Progress(40) at 2024-11-06 20:04:57: 8,039,970,078 states generated (29,330,734 s/min), 728,145,896 distinct states found (2,369,299 ds/min), 149,873,787 states left on queue. +Progress(40) at 2024-11-06 20:05:57: 8,069,221,501 states generated (29,251,423 s/min), 730,835,980 distinct states found (2,690,084 ds/min), 150,431,663 states left on queue. +Progress(40) at 2024-11-06 20:06:57: 8,098,568,645 states generated (29,347,144 s/min), 733,266,238 distinct states found (2,430,258 ds/min), 150,772,190 states left on queue. +Progress(40) at 2024-11-06 20:07:57: 8,127,646,970 states generated (29,078,325 s/min), 736,001,441 distinct states found (2,735,203 ds/min), 151,368,297 states left on queue. +Progress(40) at 2024-11-06 20:08:57: 8,156,755,007 states generated (29,108,037 s/min), 738,759,675 distinct states found (2,758,234 ds/min), 151,912,929 states left on queue. 
+Progress(40) at 2024-11-06 20:09:57: 8,186,234,810 states generated (29,479,803 s/min), 741,336,146 distinct states found (2,576,471 ds/min), 152,376,828 states left on queue. +Progress(40) at 2024-11-06 20:10:57: 8,215,641,994 states generated (29,407,184 s/min), 743,647,353 distinct states found (2,311,207 ds/min), 152,617,899 states left on queue. +Progress(40) at 2024-11-06 20:11:57: 8,244,746,445 states generated (29,104,451 s/min), 746,080,007 distinct states found (2,432,654 ds/min), 152,939,104 states left on queue. +Progress(40) at 2024-11-06 20:12:57: 8,273,514,095 states generated (28,767,650 s/min), 748,726,701 distinct states found (2,646,694 ds/min), 153,445,645 states left on queue. +Progress(40) at 2024-11-06 20:13:57: 8,302,647,011 states generated (29,132,916 s/min), 751,041,420 distinct states found (2,314,719 ds/min), 153,711,631 states left on queue. +Progress(40) at 2024-11-06 20:14:57: 8,331,785,512 states generated (29,138,501 s/min), 753,262,324 distinct states found (2,220,904 ds/min), 153,861,206 states left on queue. +Progress(40) at 2024-11-06 20:15:57: 8,361,058,813 states generated (29,273,301 s/min), 755,881,803 distinct states found (2,619,479 ds/min), 154,293,451 states left on queue. +Progress(40) at 2024-11-06 20:16:57: 8,390,323,842 states generated (29,265,029 s/min), 757,769,813 distinct states found (1,888,010 ds/min), 154,184,183 states left on queue. +Progress(40) at 2024-11-06 20:17:57: 8,419,579,524 states generated (29,255,682 s/min), 760,009,795 distinct states found (2,239,982 ds/min), 154,382,656 states left on queue. +Progress(40) at 2024-11-06 20:18:57: 8,448,394,343 states generated (28,814,819 s/min), 762,597,225 distinct states found (2,587,430 ds/min), 154,795,314 states left on queue. +Progress(40) at 2024-11-06 20:19:57: 8,477,530,142 states generated (29,135,799 s/min), 764,903,184 distinct states found (2,305,959 ds/min), 154,997,361 states left on queue. 
+Progress(40) at 2024-11-06 20:20:57: 8,507,035,930 states generated (29,505,788 s/min), 766,887,142 distinct states found (1,983,958 ds/min), 155,034,831 states left on queue. +Progress(40) at 2024-11-06 20:21:57: 8,536,505,703 states generated (29,469,773 s/min), 769,048,483 distinct states found (2,161,341 ds/min), 155,183,742 states left on queue. +Progress(40) at 2024-11-06 20:22:57: 8,565,867,584 states generated (29,361,881 s/min), 771,258,076 distinct states found (2,209,593 ds/min), 155,385,262 states left on queue. +Progress(40) at 2024-11-06 20:23:57: 8,595,185,764 states generated (29,318,180 s/min), 773,454,985 distinct states found (2,196,909 ds/min), 155,614,111 states left on queue. +Progress(40) at 2024-11-06 20:24:57: 8,624,496,269 states generated (29,310,505 s/min), 775,619,630 distinct states found (2,164,645 ds/min), 155,798,174 states left on queue. +Progress(40) at 2024-11-06 20:25:57: 8,654,080,073 states generated (29,583,804 s/min), 777,637,410 distinct states found (2,017,780 ds/min), 155,782,045 states left on queue. +Progress(40) at 2024-11-06 20:26:57: 8,683,722,009 states generated (29,641,936 s/min), 779,940,399 distinct states found (2,302,989 ds/min), 156,073,330 states left on queue. +Progress(40) at 2024-11-06 20:27:57: 8,713,410,725 states generated (29,688,716 s/min), 782,406,987 distinct states found (2,466,588 ds/min), 156,445,902 states left on queue. +Progress(40) at 2024-11-06 20:28:57: 8,743,158,002 states generated (29,747,277 s/min), 784,542,609 distinct states found (2,135,622 ds/min), 156,539,841 states left on queue. +Progress(40) at 2024-11-06 20:29:57: 8,772,688,809 states generated (29,530,807 s/min), 786,583,608 distinct states found (2,040,999 ds/min), 156,630,041 states left on queue. 
+Checkpointing of run states/24-11-06-15-30-45.354 +Checkpointing completed at (2024-11-06 20:30:57) +Progress(40) at 2024-11-06 20:30:57: 8,802,299,219 states generated (29,610,410 s/min), 788,709,007 distinct states found (2,125,399 ds/min), 156,780,966 states left on queue. +Progress(40) at 2024-11-06 20:31:57: 8,831,545,663 states generated (29,246,444 s/min), 790,874,634 distinct states found (2,165,627 ds/min), 156,943,688 states left on queue. +Progress(40) at 2024-11-06 20:32:57: 8,860,742,526 states generated (29,196,863 s/min), 793,218,612 distinct states found (2,343,978 ds/min), 157,247,738 states left on queue. +Progress(40) at 2024-11-06 20:33:57: 8,890,145,689 states generated (29,403,163 s/min), 795,347,746 distinct states found (2,129,134 ds/min), 157,376,715 states left on queue. +Progress(40) at 2024-11-06 20:34:57: 8,919,277,440 states generated (29,131,751 s/min), 797,557,991 distinct states found (2,210,245 ds/min), 157,566,508 states left on queue. +Progress(40) at 2024-11-06 20:35:57: 8,948,368,355 states generated (29,090,915 s/min), 799,870,441 distinct states found (2,312,450 ds/min), 157,825,337 states left on queue. +Progress(40) at 2024-11-06 20:36:57: 8,977,811,769 states generated (29,443,414 s/min), 801,992,418 distinct states found (2,121,977 ds/min), 158,015,008 states left on queue. +Progress(40) at 2024-11-06 20:37:57: 9,007,285,675 states generated (29,473,906 s/min), 804,250,024 distinct states found (2,257,606 ds/min), 158,295,507 states left on queue. +Progress(40) at 2024-11-06 20:38:57: 9,036,450,953 states generated (29,165,278 s/min), 806,795,860 distinct states found (2,545,836 ds/min), 158,767,907 states left on queue. +Progress(40) at 2024-11-06 20:39:57: 9,065,704,268 states generated (29,253,315 s/min), 809,198,438 distinct states found (2,402,578 ds/min), 159,105,121 states left on queue. 
+Progress(40) at 2024-11-06 20:40:57: 9,095,165,427 states generated (29,461,159 s/min), 811,512,584 distinct states found (2,314,146 ds/min), 159,345,117 states left on queue. +Progress(40) at 2024-11-06 20:41:57: 9,124,541,297 states generated (29,375,870 s/min), 813,905,920 distinct states found (2,393,336 ds/min), 159,672,325 states left on queue. +Progress(40) at 2024-11-06 20:42:57: 9,153,712,591 states generated (29,171,294 s/min), 816,392,570 distinct states found (2,486,650 ds/min), 160,082,547 states left on queue. +Progress(40) at 2024-11-06 20:43:57: 9,182,920,866 states generated (29,208,275 s/min), 818,845,538 distinct states found (2,452,968 ds/min), 160,476,056 states left on queue. +Progress(40) at 2024-11-06 20:44:57: 9,212,093,614 states generated (29,172,748 s/min), 821,212,595 distinct states found (2,367,057 ds/min), 160,787,698 states left on queue. +Progress(40) at 2024-11-06 20:45:57: 9,241,177,362 states generated (29,083,748 s/min), 823,731,111 distinct states found (2,518,516 ds/min), 161,227,975 states left on queue. +Progress(40) at 2024-11-06 20:46:57: 9,270,666,448 states generated (29,489,086 s/min), 825,877,262 distinct states found (2,146,151 ds/min), 161,339,209 states left on queue. +Progress(40) at 2024-11-06 20:47:57: 9,299,985,513 states generated (29,319,065 s/min), 828,195,512 distinct states found (2,318,250 ds/min), 161,644,069 states left on queue. +Progress(40) at 2024-11-06 20:48:57: 9,329,155,005 states generated (29,169,492 s/min), 830,386,518 distinct states found (2,191,006 ds/min), 161,807,802 states left on queue. +Progress(40) at 2024-11-06 20:49:57: 9,358,433,771 states generated (29,278,766 s/min), 832,419,931 distinct states found (2,033,413 ds/min), 161,882,018 states left on queue. +Progress(40) at 2024-11-06 20:50:57: 9,387,665,287 states generated (29,231,516 s/min), 834,751,267 distinct states found (2,331,336 ds/min), 162,183,217 states left on queue. 
+Progress(40) at 2024-11-06 20:51:57: 9,416,697,647 states generated (29,032,360 s/min), 837,127,657 distinct states found (2,376,390 ds/min), 162,511,558 states left on queue. +Progress(40) at 2024-11-06 20:52:57: 9,445,747,666 states generated (29,050,019 s/min), 839,556,372 distinct states found (2,428,715 ds/min), 162,873,418 states left on queue. +Progress(40) at 2024-11-06 20:53:57: 9,474,599,613 states generated (28,851,947 s/min), 841,985,780 distinct states found (2,429,408 ds/min), 163,231,531 states left on queue. +Progress(40) at 2024-11-06 20:54:57: 9,503,408,525 states generated (28,808,912 s/min), 844,368,680 distinct states found (2,382,900 ds/min), 163,533,407 states left on queue. +Progress(40) at 2024-11-06 20:55:57: 9,532,128,492 states generated (28,719,967 s/min), 846,804,519 distinct states found (2,435,839 ds/min), 163,787,695 states left on queue. +Progress(40) at 2024-11-06 20:56:57: 9,560,935,598 states generated (28,807,106 s/min), 849,075,143 distinct states found (2,270,624 ds/min), 163,946,240 states left on queue. +Progress(40) at 2024-11-06 20:57:57: 9,590,127,374 states generated (29,191,776 s/min), 851,260,378 distinct states found (2,185,235 ds/min), 164,077,372 states left on queue. +Progress(40) at 2024-11-06 20:58:57: 9,619,514,341 states generated (29,386,967 s/min), 853,352,738 distinct states found (2,092,360 ds/min), 164,118,186 states left on queue. +Progress(40) at 2024-11-06 20:59:57: 9,648,985,302 states generated (29,470,961 s/min), 855,543,408 distinct states found (2,190,670 ds/min), 164,279,076 states left on queue. +Checkpointing of run states/24-11-06-15-30-45.354 +Checkpointing completed at (2024-11-06 21:00:58) +Progress(40) at 2024-11-06 21:00:58: 9,678,677,722 states generated (29,692,420 s/min), 857,894,775 distinct states found (2,351,367 ds/min), 164,516,395 states left on queue. 
+Progress(40) at 2024-11-06 21:01:58: 9,708,095,509 states generated (29,417,787 s/min), 860,383,155 distinct states found (2,488,380 ds/min), 164,898,153 states left on queue. +Progress(40) at 2024-11-06 21:02:58: 9,737,488,378 states generated (29,392,869 s/min), 862,497,194 distinct states found (2,114,039 ds/min), 164,966,010 states left on queue. +Progress(40) at 2024-11-06 21:03:58: 9,766,895,552 states generated (29,407,174 s/min), 864,819,525 distinct states found (2,322,331 ds/min), 165,232,701 states left on queue. +Progress(40) at 2024-11-06 21:04:58: 9,796,208,300 states generated (29,312,748 s/min), 867,276,841 distinct states found (2,457,316 ds/min), 165,568,613 states left on queue. +Progress(40) at 2024-11-06 21:05:58: 9,825,603,726 states generated (29,395,426 s/min), 869,434,526 distinct states found (2,157,685 ds/min), 165,685,610 states left on queue. +Progress(40) at 2024-11-06 21:06:58: 9,854,789,772 states generated (29,186,046 s/min), 871,934,034 distinct states found (2,499,508 ds/min), 166,084,000 states left on queue. +Progress(40) at 2024-11-06 21:07:58: 9,884,028,390 states generated (29,238,618 s/min), 874,443,659 distinct states found (2,509,625 ds/min), 166,483,652 states left on queue. +Progress(40) at 2024-11-06 21:08:58: 9,913,377,669 states generated (29,349,279 s/min), 876,803,913 distinct states found (2,360,254 ds/min), 166,740,702 states left on queue. +Progress(40) at 2024-11-06 21:09:58: 9,942,721,749 states generated (29,344,080 s/min), 879,187,270 distinct states found (2,383,357 ds/min), 166,953,562 states left on queue. +Progress(41) at 2024-11-06 21:10:58: 9,972,078,704 states generated (29,356,955 s/min), 881,233,361 distinct states found (2,046,091 ds/min), 166,999,841 states left on queue. +Progress(41) at 2024-11-06 21:11:58: 10,000,914,792 states generated (28,836,088 s/min), 883,811,441 distinct states found (2,578,080 ds/min), 167,466,583 states left on queue. 
+Progress(41) at 2024-11-06 21:12:58: 10,030,210,434 states generated (29,295,642 s/min), 885,899,950 distinct states found (2,088,509 ds/min), 167,531,826 states left on queue. +Progress(41) at 2024-11-06 21:13:58: 10,059,587,070 states generated (29,376,636 s/min), 888,188,669 distinct states found (2,288,719 ds/min), 167,753,242 states left on queue. +Progress(41) at 2024-11-06 21:14:58: 10,089,078,901 states generated (29,491,831 s/min), 890,649,997 distinct states found (2,461,328 ds/min), 168,098,890 states left on queue. +Progress(41) at 2024-11-06 21:15:58: 10,118,348,352 states generated (29,269,451 s/min), 892,695,892 distinct states found (2,045,895 ds/min), 168,141,532 states left on queue. +Progress(41) at 2024-11-06 21:16:58: 10,147,644,676 states generated (29,296,324 s/min), 894,823,997 distinct states found (2,128,105 ds/min), 168,231,032 states left on queue. +Progress(41) at 2024-11-06 21:17:58: 10,176,967,773 states generated (29,323,097 s/min), 897,225,523 distinct states found (2,401,526 ds/min), 168,555,740 states left on queue. +Progress(41) at 2024-11-06 21:18:58: 10,206,275,174 states generated (29,307,401 s/min), 899,814,626 distinct states found (2,589,103 ds/min), 169,020,971 states left on queue. +Progress(41) at 2024-11-06 21:19:58: 10,235,593,993 states generated (29,318,819 s/min), 902,141,356 distinct states found (2,326,730 ds/min), 169,267,251 states left on queue. +Progress(41) at 2024-11-06 21:20:58: 10,264,799,049 states generated (29,205,056 s/min), 904,746,333 distinct states found (2,604,977 ds/min), 169,758,459 states left on queue. +Progress(41) at 2024-11-06 21:21:58: 10,293,910,586 states generated (29,111,537 s/min), 907,433,182 distinct states found (2,686,849 ds/min), 170,277,176 states left on queue. +Progress(41) at 2024-11-06 21:22:58: 10,323,190,750 states generated (29,280,164 s/min), 910,052,108 distinct states found (2,618,926 ds/min), 170,695,212 states left on queue. 
+Progress(41) at 2024-11-06 21:23:58: 10,352,580,182 states generated (29,389,432 s/min), 912,516,064 distinct states found (2,463,956 ds/min), 171,083,771 states left on queue. +Progress(41) at 2024-11-06 21:24:58: 10,381,951,479 states generated (29,371,297 s/min), 914,781,443 distinct states found (2,265,379 ds/min), 171,281,545 states left on queue. +Progress(41) at 2024-11-06 21:25:58: 10,411,026,945 states generated (29,075,466 s/min), 917,078,052 distinct states found (2,296,609 ds/min), 171,498,613 states left on queue. +Progress(41) at 2024-11-06 21:26:58: 10,439,904,441 states generated (28,877,496 s/min), 919,547,808 distinct states found (2,469,756 ds/min), 171,860,589 states left on queue. +Progress(41) at 2024-11-06 21:27:58: 10,469,008,600 states generated (29,104,159 s/min), 921,912,547 distinct states found (2,364,739 ds/min), 172,121,551 states left on queue. +Progress(41) at 2024-11-06 21:28:58: 10,497,834,986 states generated (28,826,386 s/min), 924,235,840 distinct states found (2,323,293 ds/min), 172,353,661 states left on queue. +Progress(41) at 2024-11-06 21:29:58: 10,527,064,696 states generated (29,229,710 s/min), 926,456,744 distinct states found (2,220,904 ds/min), 172,508,439 states left on queue. +Checkpointing of run states/24-11-06-15-30-45.354 +Checkpointing completed at (2024-11-06 21:30:59) +Progress(41) at 2024-11-06 21:30:59: 10,556,579,142 states generated (29,514,446 s/min), 928,988,872 distinct states found (2,532,128 ds/min), 172,833,183 states left on queue. +Progress(41) at 2024-11-06 21:31:59: 10,585,719,909 states generated (29,140,767 s/min), 930,745,149 distinct states found (1,756,277 ds/min), 172,622,496 states left on queue. +Progress(41) at 2024-11-06 21:32:59: 10,614,881,115 states generated (29,161,206 s/min), 932,948,083 distinct states found (2,202,934 ds/min), 172,792,818 states left on queue. 
+Progress(41) at 2024-11-06 21:33:59: 10,643,693,909 states generated (28,812,794 s/min), 935,441,862 distinct states found (2,493,779 ds/min), 173,119,721 states left on queue. +Progress(41) at 2024-11-06 21:34:59: 10,672,671,166 states generated (28,977,257 s/min), 937,653,961 distinct states found (2,212,099 ds/min), 173,216,843 states left on queue. +Progress(41) at 2024-11-06 21:35:59: 10,702,072,440 states generated (29,401,274 s/min), 939,638,920 distinct states found (1,984,959 ds/min), 173,254,076 states left on queue. +Progress(41) at 2024-11-06 21:36:59: 10,731,415,292 states generated (29,342,852 s/min), 941,583,653 distinct states found (1,944,733 ds/min), 173,229,968 states left on queue. +Progress(41) at 2024-11-06 21:37:59: 10,760,802,656 states generated (29,387,364 s/min), 943,770,610 distinct states found (2,186,957 ds/min), 173,412,799 states left on queue. +Progress(41) at 2024-11-06 21:38:59: 10,789,961,996 states generated (29,159,340 s/min), 945,790,519 distinct states found (2,019,909 ds/min), 173,482,204 states left on queue. +Progress(41) at 2024-11-06 21:39:59: 10,819,303,972 states generated (29,341,976 s/min), 947,902,156 distinct states found (2,111,637 ds/min), 173,640,941 states left on queue. +Progress(41) at 2024-11-06 21:40:59: 10,848,636,471 states generated (29,332,499 s/min), 949,908,145 distinct states found (2,005,989 ds/min), 173,684,074 states left on queue. +Progress(41) at 2024-11-06 21:41:59: 10,878,207,345 states generated (29,570,874 s/min), 951,870,784 distinct states found (1,962,639 ds/min), 173,648,255 states left on queue. +Progress(41) at 2024-11-06 21:42:59: 10,907,777,091 states generated (29,569,746 s/min), 954,123,321 distinct states found (2,252,537 ds/min), 173,881,583 states left on queue. +Progress(41) at 2024-11-06 21:43:59: 10,937,383,465 states generated (29,606,374 s/min), 956,486,701 distinct states found (2,363,380 ds/min), 174,173,694 states left on queue. 
+Progress(41) at 2024-11-06 21:44:59: 10,967,070,713 states generated (29,687,248 s/min), 958,539,717 distinct states found (2,053,016 ds/min), 174,194,592 states left on queue. +Progress(41) at 2024-11-06 21:45:59: 10,996,524,132 states generated (29,453,419 s/min), 960,439,766 distinct states found (1,900,049 ds/min), 174,165,777 states left on queue. +Progress(41) at 2024-11-06 21:46:59: 11,025,919,452 states generated (29,395,320 s/min), 962,518,661 distinct states found (2,078,895 ds/min), 174,284,642 states left on queue. +Progress(41) at 2024-11-06 21:47:59: 11,055,087,136 states generated (29,167,684 s/min), 964,440,130 distinct states found (1,921,469 ds/min), 174,253,951 states left on queue. +Progress(41) at 2024-11-06 21:48:59: 11,084,346,164 states generated (29,259,028 s/min), 966,652,841 distinct states found (2,212,711 ds/min), 174,452,762 states left on queue. +Progress(41) at 2024-11-06 21:49:59: 11,113,503,996 states generated (29,157,832 s/min), 968,786,590 distinct states found (2,133,749 ds/min), 174,578,147 states left on queue. +Progress(41) at 2024-11-06 21:50:59: 11,142,862,327 states generated (29,358,331 s/min), 970,780,918 distinct states found (1,994,328 ds/min), 174,585,050 states left on queue. +Progress(41) at 2024-11-06 21:51:59: 11,171,907,560 states generated (29,045,233 s/min), 972,924,432 distinct states found (2,143,514 ds/min), 174,718,189 states left on queue. +Progress(41) at 2024-11-06 21:52:59: 11,201,055,602 states generated (29,148,042 s/min), 975,106,131 distinct states found (2,181,699 ds/min), 174,874,035 states left on queue. +Progress(41) at 2024-11-06 21:53:59: 11,230,576,268 states generated (29,520,666 s/min), 977,176,048 distinct states found (2,069,917 ds/min), 175,042,666 states left on queue. +Progress(41) at 2024-11-06 21:54:59: 11,259,928,257 states generated (29,351,989 s/min), 979,337,351 distinct states found (2,161,303 ds/min), 175,248,665 states left on queue. 
+Progress(41) at 2024-11-06 21:55:59: 11,289,190,366 states generated (29,262,109 s/min), 981,837,130 distinct states found (2,499,779 ds/min), 175,680,736 states left on queue. +Progress(41) at 2024-11-06 21:56:59: 11,318,399,828 states generated (29,209,462 s/min), 984,112,195 distinct states found (2,275,065 ds/min), 175,913,580 states left on queue. +Progress(41) at 2024-11-06 21:57:59: 11,347,862,845 states generated (29,463,017 s/min), 986,368,069 distinct states found (2,255,874 ds/min), 176,126,523 states left on queue. +Progress(41) at 2024-11-06 21:58:59: 11,377,318,937 states generated (29,456,092 s/min), 988,548,686 distinct states found (2,180,617 ds/min), 176,253,552 states left on queue. +Progress(41) at 2024-11-06 21:59:59: 11,406,551,913 states generated (29,232,976 s/min), 990,875,071 distinct states found (2,326,385 ds/min), 176,528,465 states left on queue. +Checkpointing of run states/24-11-06-15-30-45.354 +Checkpointing completed at (2024-11-06 22:00:59) +Progress(41) at 2024-11-06 22:00:59: 11,436,006,666 states generated (29,454,753 s/min), 993,234,999 distinct states found (2,359,928 ds/min), 176,816,755 states left on queue. +Progress(41) at 2024-11-06 22:01:59: 11,465,207,151 states generated (29,200,485 s/min), 995,557,179 distinct states found (2,322,180 ds/min), 177,094,397 states left on queue. +Progress(41) at 2024-11-06 22:02:59: 11,494,298,575 states generated (29,091,424 s/min), 997,927,812 distinct states found (2,370,633 ds/min), 177,411,890 states left on queue. +Progress(41) at 2024-11-06 22:03:59: 11,523,576,632 states generated (29,278,057 s/min), 1,000,196,030 distinct states found (2,268,218 ds/min), 177,640,656 states left on queue. +Progress(41) at 2024-11-06 22:04:59: 11,552,734,483 states generated (29,157,851 s/min), 1,002,452,277 distinct states found (2,256,247 ds/min), 177,827,247 states left on queue. 
+Progress(41) at 2024-11-06 22:05:59: 11,582,200,298 states generated (29,465,815 s/min), 1,004,593,818 distinct states found (2,141,541 ds/min), 177,983,707 states left on queue. +Progress(41) at 2024-11-06 22:06:59: 11,611,484,149 states generated (29,283,851 s/min), 1,006,774,383 distinct states found (2,180,565 ds/min), 178,161,577 states left on queue. +Progress(41) at 2024-11-06 22:07:59: 11,640,449,232 states generated (28,965,083 s/min), 1,008,870,356 distinct states found (2,095,973 ds/min), 178,245,657 states left on queue. +Progress(41) at 2024-11-06 22:08:59: 11,669,695,402 states generated (29,246,170 s/min), 1,010,743,262 distinct states found (1,872,906 ds/min), 178,199,630 states left on queue. +Progress(41) at 2024-11-06 22:09:59: 11,698,855,657 states generated (29,160,255 s/min), 1,012,993,163 distinct states found (2,249,901 ds/min), 178,433,806 states left on queue. +Progress(41) at 2024-11-06 22:10:59: 11,727,873,536 states generated (29,017,879 s/min), 1,015,222,628 distinct states found (2,229,465 ds/min), 178,645,315 states left on queue. +Progress(41) at 2024-11-06 22:11:59: 11,756,910,696 states generated (29,037,160 s/min), 1,017,493,811 distinct states found (2,271,183 ds/min), 178,885,854 states left on queue. +Progress(41) at 2024-11-06 22:12:59: 11,785,841,957 states generated (28,931,261 s/min), 1,019,798,730 distinct states found (2,304,919 ds/min), 179,138,831 states left on queue. +Progress(41) at 2024-11-06 22:13:59: 11,814,627,351 states generated (28,785,394 s/min), 1,022,115,935 distinct states found (2,317,205 ds/min), 179,401,355 states left on queue. +Progress(41) at 2024-11-06 22:14:59: 11,843,482,288 states generated (28,854,937 s/min), 1,024,372,991 distinct states found (2,257,056 ds/min), 179,570,167 states left on queue. +Progress(41) at 2024-11-06 22:15:59: 11,872,232,503 states generated (28,750,215 s/min), 1,026,655,919 distinct states found (2,282,928 ds/min), 179,704,400 states left on queue. 
+Progress(41) at 2024-11-06 22:16:59: 11,901,011,327 states generated (28,778,824 s/min), 1,028,780,151 distinct states found (2,124,232 ds/min), 179,744,822 states left on queue. +Progress(41) at 2024-11-06 22:17:59: 11,930,078,061 states generated (29,066,734 s/min), 1,030,863,673 distinct states found (2,083,522 ds/min), 179,790,662 states left on queue. +Progress(41) at 2024-11-06 22:18:59: 11,959,463,901 states generated (29,385,840 s/min), 1,032,840,344 distinct states found (1,976,671 ds/min), 179,738,442 states left on queue. +Progress(41) at 2024-11-06 22:19:59: 11,988,811,132 states generated (29,347,231 s/min), 1,034,897,049 distinct states found (2,056,705 ds/min), 179,788,782 states left on queue. +Progress(41) at 2024-11-06 22:20:59: 12,018,335,911 states generated (29,524,779 s/min), 1,037,158,579 distinct states found (2,261,530 ds/min), 179,978,226 states left on queue. +Progress(41) at 2024-11-06 22:21:59: 12,047,755,593 states generated (29,419,682 s/min), 1,039,437,623 distinct states found (2,279,044 ds/min), 180,177,371 states left on queue. +Progress(41) at 2024-11-06 22:22:59: 12,077,111,001 states generated (29,355,408 s/min), 1,041,672,961 distinct states found (2,235,338 ds/min), 180,336,777 states left on queue. +Progress(41) at 2024-11-06 22:23:59: 12,106,556,177 states generated (29,445,176 s/min), 1,043,675,880 distinct states found (2,002,919 ds/min), 180,345,759 states left on queue. +Progress(41) at 2024-11-06 22:24:59: 12,135,797,446 states generated (29,241,269 s/min), 1,045,966,606 distinct states found (2,290,726 ds/min), 180,552,887 states left on queue. +Progress(41) at 2024-11-06 22:25:59: 12,165,143,756 states generated (29,346,310 s/min), 1,048,373,643 distinct states found (2,407,037 ds/min), 180,860,142 states left on queue. +Progress(41) at 2024-11-06 22:26:59: 12,194,478,236 states generated (29,334,480 s/min), 1,050,403,560 distinct states found (2,029,917 ds/min), 180,873,811 states left on queue. 
+Progress(41) at 2024-11-06 22:27:59: 12,223,653,080 states generated (29,174,844 s/min), 1,052,798,502 distinct states found (2,394,942 ds/min), 181,184,025 states left on queue. +Progress(41) at 2024-11-06 22:28:59: 12,252,926,784 states generated (29,273,704 s/min), 1,055,243,990 distinct states found (2,445,488 ds/min), 181,542,525 states left on queue. +Progress(41) at 2024-11-06 22:29:59: 12,282,176,071 states generated (29,249,287 s/min), 1,057,488,489 distinct states found (2,244,499 ds/min), 181,704,266 states left on queue. +Checkpointing of run states/24-11-06-15-30-45.354 +Checkpointing completed at (2024-11-06 22:31:00) +Progress(41) at 2024-11-06 22:31:00: 12,311,654,529 states generated (29,478,458 s/min), 1,059,789,296 distinct states found (2,300,807 ds/min), 181,875,392 states left on queue. +Progress(41) at 2024-11-06 22:32:00: 12,340,837,903 states generated (29,183,374 s/min), 1,061,857,294 distinct states found (2,067,998 ds/min), 181,860,690 states left on queue. +Progress(41) at 2024-11-06 22:33:00: 12,369,978,352 states generated (29,140,449 s/min), 1,063,943,173 distinct states found (2,085,879 ds/min), 181,951,091 states left on queue. +Progress(41) at 2024-11-06 22:34:00: 12,398,820,660 states generated (28,842,308 s/min), 1,066,384,327 distinct states found (2,441,154 ds/min), 182,284,376 states left on queue. +Progress(41) at 2024-11-06 22:35:00: 12,427,966,245 states generated (29,145,585 s/min), 1,068,376,116 distinct states found (1,991,789 ds/min), 182,275,982 states left on queue. +Progress(41) at 2024-11-06 22:36:00: 12,457,300,671 states generated (29,334,426 s/min), 1,070,596,949 distinct states found (2,220,833 ds/min), 182,442,278 states left on queue. +Progress(41) at 2024-11-06 22:37:00: 12,486,769,483 states generated (29,468,812 s/min), 1,072,968,640 distinct states found (2,371,691 ds/min), 182,718,485 states left on queue. 
+Progress(41) at 2024-11-06 22:38:00: 12,516,031,360 states generated (29,261,877 s/min), 1,075,001,378 distinct states found (2,032,738 ds/min), 182,729,966 states left on queue. +Progress(41) at 2024-11-06 22:39:00: 12,545,265,331 states generated (29,233,971 s/min), 1,076,880,794 distinct states found (1,879,416 ds/min), 182,634,798 states left on queue. +Progress(41) at 2024-11-06 22:40:00: 12,574,495,559 states generated (29,230,228 s/min), 1,079,123,856 distinct states found (2,243,062 ds/min), 182,812,322 states left on queue. +Progress(41) at 2024-11-06 22:41:00: 12,603,757,387 states generated (29,261,828 s/min), 1,081,610,769 distinct states found (2,486,913 ds/min), 183,219,247 states left on queue. +Progress(41) at 2024-11-06 22:42:00: 12,632,909,026 states generated (29,151,639 s/min), 1,083,967,637 distinct states found (2,356,868 ds/min), 183,478,879 states left on queue. +Progress(41) at 2024-11-06 22:43:00: 12,662,254,981 states generated (29,345,955 s/min), 1,086,272,935 distinct states found (2,305,298 ds/min), 183,726,701 states left on queue. +Progress(41) at 2024-11-06 22:44:00: 12,691,400,218 states generated (29,145,237 s/min), 1,088,778,928 distinct states found (2,505,993 ds/min), 184,128,274 states left on queue. +Progress(41) at 2024-11-06 22:45:00: 12,720,528,098 states generated (29,127,880 s/min), 1,091,335,929 distinct states found (2,557,001 ds/min), 184,556,078 states left on queue. +Progress(41) at 2024-11-06 22:46:00: 12,749,701,886 states generated (29,173,788 s/min), 1,093,889,510 distinct states found (2,553,581 ds/min), 184,916,391 states left on queue. +Progress(41) at 2024-11-06 22:47:00: 12,779,153,937 states generated (29,452,051 s/min), 1,096,185,973 distinct states found (2,296,463 ds/min), 185,115,877 states left on queue. +Progress(41) at 2024-11-06 22:48:00: 12,808,440,971 states generated (29,287,034 s/min), 1,098,733,865 distinct states found (2,547,892 ds/min), 185,564,617 states left on queue. 
+Progress(41) at 2024-11-06 22:49:00: 12,837,695,256 states generated (29,254,285 s/min), 1,100,705,460 distinct states found (1,971,595 ds/min), 185,532,558 states left on queue. +Progress(41) at 2024-11-06 22:50:00: 12,866,801,129 states generated (29,105,873 s/min), 1,103,074,603 distinct states found (2,369,143 ds/min), 185,770,427 states left on queue. +Progress(41) at 2024-11-06 22:51:00: 12,895,682,870 states generated (28,881,741 s/min), 1,105,437,747 distinct states found (2,363,144 ds/min), 186,049,274 states left on queue. +Progress(41) at 2024-11-06 22:52:00: 12,924,655,990 states generated (28,973,120 s/min), 1,107,853,554 distinct states found (2,415,807 ds/min), 186,325,129 states left on queue. +Progress(41) at 2024-11-06 22:53:00: 12,953,616,826 states generated (28,960,836 s/min), 1,110,097,321 distinct states found (2,243,767 ds/min), 186,509,276 states left on queue. +Progress(41) at 2024-11-06 22:54:00: 12,982,711,068 states generated (29,094,242 s/min), 1,112,146,097 distinct states found (2,048,776 ds/min), 186,507,356 states left on queue. +Progress(41) at 2024-11-06 22:55:00: 13,011,962,667 states generated (29,251,599 s/min), 1,114,530,785 distinct states found (2,384,688 ds/min), 186,758,016 states left on queue. +Progress(41) at 2024-11-06 22:56:00: 13,041,163,382 states generated (29,200,715 s/min), 1,116,566,038 distinct states found (2,035,253 ds/min), 186,702,453 states left on queue. +Progress(41) at 2024-11-06 22:57:00: 13,070,416,604 states generated (29,253,222 s/min), 1,118,433,735 distinct states found (1,867,697 ds/min), 186,595,926 states left on queue. +Progress(41) at 2024-11-06 22:58:00: 13,099,393,765 states generated (28,977,161 s/min), 1,120,727,626 distinct states found (2,293,891 ds/min), 186,785,521 states left on queue. +Progress(41) at 2024-11-06 22:59:00: 13,128,309,003 states generated (28,915,238 s/min), 1,123,075,278 distinct states found (2,347,652 ds/min), 186,977,496 states left on queue. 
+Progress(42) at 2024-11-06 23:00:00: 13,157,492,254 states generated (29,183,251 s/min), 1,125,164,050 distinct states found (2,088,772 ds/min), 186,994,591 states left on queue. +Checkpointing of run states/24-11-06-15-30-45.354 +Checkpointing completed at (2024-11-06 23:01:01) +Progress(42) at 2024-11-06 23:01:01: 13,187,099,442 states generated (29,607,188 s/min), 1,126,955,828 distinct states found (1,791,778 ds/min), 186,860,457 states left on queue. +Progress(42) at 2024-11-06 23:02:01: 13,216,408,249 states generated (29,308,807 s/min), 1,128,852,586 distinct states found (1,896,758 ds/min), 186,800,218 states left on queue. +Progress(42) at 2024-11-06 23:03:01: 13,245,736,139 states generated (29,327,890 s/min), 1,130,960,381 distinct states found (2,107,795 ds/min), 186,911,118 states left on queue. +Progress(42) at 2024-11-06 23:04:01: 13,274,893,464 states generated (29,157,325 s/min), 1,132,863,930 distinct states found (1,903,549 ds/min), 186,892,129 states left on queue. +Progress(42) at 2024-11-06 23:05:01: 13,304,183,990 states generated (29,290,526 s/min), 1,134,876,306 distinct states found (2,012,376 ds/min), 186,965,153 states left on queue. +Progress(42) at 2024-11-06 23:06:01: 13,333,457,770 states generated (29,273,780 s/min), 1,136,812,237 distinct states found (1,935,931 ds/min), 186,957,506 states left on queue. +Progress(42) at 2024-11-06 23:07:01: 13,362,984,994 states generated (29,527,224 s/min), 1,138,649,876 distinct states found (1,837,639 ds/min), 186,823,887 states left on queue. +Progress(42) at 2024-11-06 23:08:01: 13,392,550,733 states generated (29,565,739 s/min), 1,140,795,722 distinct states found (2,145,846 ds/min), 186,974,795 states left on queue. +Progress(42) at 2024-11-06 23:09:01: 13,422,111,300 states generated (29,560,567 s/min), 1,143,038,611 distinct states found (2,242,889 ds/min), 187,179,197 states left on queue. 
+Progress(42) at 2024-11-06 23:10:01: 13,451,822,496 states generated (29,711,196 s/min), 1,145,071,502 distinct states found (2,032,891 ds/min), 187,190,480 states left on queue. +Progress(42) at 2024-11-06 23:11:01: 13,481,293,484 states generated (29,470,988 s/min), 1,146,905,806 distinct states found (1,834,304 ds/min), 187,079,661 states left on queue. +Progress(42) at 2024-11-06 23:12:01: 13,510,659,679 states generated (29,366,195 s/min), 1,148,841,643 distinct states found (1,935,837 ds/min), 187,082,815 states left on queue. +Progress(42) at 2024-11-06 23:13:01: 13,539,730,883 states generated (29,071,204 s/min), 1,150,715,436 distinct states found (1,873,793 ds/min), 187,013,975 states left on queue. +Progress(42) at 2024-11-06 23:14:01: 13,568,973,308 states generated (29,242,425 s/min), 1,152,689,735 distinct states found (1,974,299 ds/min), 187,016,208 states left on queue. +Progress(42) at 2024-11-06 23:15:01: 13,598,106,627 states generated (29,133,319 s/min), 1,154,829,869 distinct states found (2,140,134 ds/min), 187,147,884 states left on queue. +Progress(42) at 2024-11-06 23:16:01: 13,627,319,459 states generated (29,212,832 s/min), 1,156,740,070 distinct states found (1,910,201 ds/min), 187,086,942 states left on queue. +Progress(42) at 2024-11-06 23:17:01: 13,656,462,121 states generated (29,142,662 s/min), 1,158,698,307 distinct states found (1,958,237 ds/min), 187,072,201 states left on queue. +Progress(42) at 2024-11-06 23:18:01: 13,685,545,941 states generated (29,083,820 s/min), 1,160,688,939 distinct states found (1,990,632 ds/min), 187,078,553 states left on queue. +Progress(42) at 2024-11-06 23:19:01: 13,714,652,628 states generated (29,106,687 s/min), 1,162,748,633 distinct states found (2,059,694 ds/min), 187,157,229 states left on queue. +Progress(42) at 2024-11-06 23:20:01: 13,744,105,986 states generated (29,453,358 s/min), 1,164,748,782 distinct states found (2,000,149 ds/min), 187,275,480 states left on queue. 
+Progress(42) at 2024-11-06 23:21:01: 13,773,414,393 states generated (29,308,407 s/min), 1,166,804,740 distinct states found (2,055,958 ds/min), 187,393,312 states left on queue. +Progress(42) at 2024-11-06 23:22:01: 13,802,600,069 states generated (29,185,676 s/min), 1,169,251,493 distinct states found (2,446,753 ds/min), 187,781,298 states left on queue. +Progress(42) at 2024-11-06 23:23:01: 13,831,830,649 states generated (29,230,580 s/min), 1,171,412,176 distinct states found (2,160,683 ds/min), 187,932,991 states left on queue. +Progress(42) at 2024-11-06 23:24:01: 13,861,152,221 states generated (29,321,572 s/min), 1,173,582,994 distinct states found (2,170,818 ds/min), 188,078,037 states left on queue. +Progress(42) at 2024-11-06 23:25:01: 13,890,538,756 states generated (29,386,535 s/min), 1,175,642,901 distinct states found (2,059,907 ds/min), 188,116,794 states left on queue. +Progress(42) at 2024-11-06 23:26:01: 13,919,812,820 states generated (29,274,064 s/min), 1,177,743,048 distinct states found (2,100,147 ds/min), 188,189,399 states left on queue. +Progress(42) at 2024-11-06 23:27:01: 13,948,903,585 states generated (29,090,765 s/min), 1,179,980,470 distinct states found (2,237,422 ds/min), 188,388,309 states left on queue. +Progress(42) at 2024-11-06 23:28:01: 13,978,138,385 states generated (29,234,800 s/min), 1,182,134,981 distinct states found (2,154,511 ds/min), 188,526,735 states left on queue. +Progress(42) at 2024-11-06 23:29:01: 14,007,310,151 states generated (29,171,766 s/min), 1,184,360,360 distinct states found (2,225,379 ds/min), 188,718,575 states left on queue. +Progress(42) at 2024-11-06 23:30:01: 14,036,411,110 states generated (29,100,959 s/min), 1,186,617,835 distinct states found (2,257,475 ds/min), 188,941,068 states left on queue. 
+Checkpointing of run states/24-11-06-15-30-45.354 +Checkpointing completed at (2024-11-06 23:31:01) +Progress(42) at 2024-11-06 23:31:01: 14,065,894,113 states generated (29,483,003 s/min), 1,188,743,048 distinct states found (2,125,213 ds/min), 189,035,636 states left on queue. +Progress(42) at 2024-11-06 23:32:01: 14,094,909,096 states generated (29,014,983 s/min), 1,191,096,961 distinct states found (2,353,913 ds/min), 189,332,174 states left on queue. +Progress(42) at 2024-11-06 23:33:01: 14,124,212,567 states generated (29,303,471 s/min), 1,193,012,997 distinct states found (1,916,036 ds/min), 189,266,016 states left on queue. +Progress(42) at 2024-11-06 23:34:01: 14,153,428,768 states generated (29,216,201 s/min), 1,195,170,448 distinct states found (2,157,451 ds/min), 189,430,881 states left on queue. +Progress(42) at 2024-11-06 23:35:01: 14,182,568,290 states generated (29,139,522 s/min), 1,197,127,126 distinct states found (1,956,678 ds/min), 189,423,769 states left on queue. +Progress(42) at 2024-11-06 23:36:01: 14,211,602,024 states generated (29,033,734 s/min), 1,199,044,612 distinct states found (1,917,486 ds/min), 189,380,199 states left on queue. +Progress(42) at 2024-11-06 23:37:01: 14,240,593,845 states generated (28,991,821 s/min), 1,200,900,028 distinct states found (1,855,416 ds/min), 189,324,925 states left on queue. +Progress(42) at 2024-11-06 23:38:01: 14,269,687,808 states generated (29,093,963 s/min), 1,203,034,598 distinct states found (2,134,570 ds/min), 189,466,947 states left on queue. +Progress(42) at 2024-11-06 23:39:01: 14,298,626,140 states generated (28,938,332 s/min), 1,205,190,806 distinct states found (2,156,208 ds/min), 189,608,794 states left on queue. +Progress(42) at 2024-11-06 23:40:01: 14,327,587,116 states generated (28,960,976 s/min), 1,207,339,559 distinct states found (2,148,753 ds/min), 189,750,359 states left on queue. 
+Progress(42) at 2024-11-06 23:41:01: 14,356,469,494 states generated (28,882,378 s/min), 1,209,518,146 distinct states found (2,178,587 ds/min), 189,892,036 states left on queue. +Progress(42) at 2024-11-06 23:42:01: 14,385,314,696 states generated (28,845,202 s/min), 1,211,701,473 distinct states found (2,183,327 ds/min), 190,050,090 states left on queue. +Progress(42) at 2024-11-06 23:43:01: 14,414,142,550 states generated (28,827,854 s/min), 1,213,859,919 distinct states found (2,158,446 ds/min), 190,161,804 states left on queue. +Progress(42) at 2024-11-06 23:44:01: 14,442,945,644 states generated (28,803,094 s/min), 1,216,005,127 distinct states found (2,145,208 ds/min), 190,173,898 states left on queue. +Progress(42) at 2024-11-06 23:45:01: 14,471,693,798 states generated (28,748,154 s/min), 1,218,030,292 distinct states found (2,025,165 ds/min), 190,127,864 states left on queue. +Progress(42) at 2024-11-06 23:46:01: 14,500,599,025 states generated (28,905,227 s/min), 1,219,996,243 distinct states found (1,965,951 ds/min), 190,069,034 states left on queue. +Progress(42) at 2024-11-06 23:47:01: 14,529,770,118 states generated (29,171,093 s/min), 1,221,890,284 distinct states found (1,894,041 ds/min), 189,948,701 states left on queue. +Progress(42) at 2024-11-06 23:48:01: 14,559,044,399 states generated (29,274,281 s/min), 1,223,772,100 distinct states found (1,881,816 ds/min), 189,844,417 states left on queue. +Progress(42) at 2024-11-06 23:49:01: 14,588,505,088 states generated (29,460,689 s/min), 1,225,870,790 distinct states found (2,098,690 ds/min), 189,921,025 states left on queue. +Progress(42) at 2024-11-06 23:50:01: 14,618,007,797 states generated (29,502,709 s/min), 1,227,944,381 distinct states found (2,073,591 ds/min), 189,947,590 states left on queue. +Progress(42) at 2024-11-06 23:51:01: 14,647,405,532 states generated (29,397,735 s/min), 1,230,287,712 distinct states found (2,343,331 ds/min), 190,200,223 states left on queue. 
+Progress(42) at 2024-11-06 23:52:01: 14,676,733,478 states generated (29,327,946 s/min), 1,232,303,440 distinct states found (2,015,728 ds/min), 190,178,290 states left on queue. +Progress(42) at 2024-11-06 23:53:01: 14,706,089,483 states generated (29,356,005 s/min), 1,234,269,055 distinct states found (1,965,615 ds/min), 190,175,215 states left on queue. +Progress(42) at 2024-11-06 23:54:01: 14,735,226,809 states generated (29,137,326 s/min), 1,236,451,189 distinct states found (2,182,134 ds/min), 190,293,853 states left on queue. +Progress(42) at 2024-11-06 23:55:01: 14,764,611,146 states generated (29,384,337 s/min), 1,238,780,557 distinct states found (2,329,368 ds/min), 190,528,991 states left on queue. +Progress(42) at 2024-11-06 23:56:01: 14,793,911,038 states generated (29,299,892 s/min), 1,240,745,156 distinct states found (1,964,599 ds/min), 190,493,881 states left on queue. +Progress(42) at 2024-11-06 23:57:01: 14,823,113,635 states generated (29,202,597 s/min), 1,242,984,781 distinct states found (2,239,625 ds/min), 190,675,723 states left on queue. +Progress(42) at 2024-11-06 23:58:01: 14,852,208,056 states generated (29,094,421 s/min), 1,245,341,804 distinct states found (2,357,023 ds/min), 190,959,027 states left on queue. +Progress(42) at 2024-11-06 23:59:01: 14,881,390,523 states generated (29,182,467 s/min), 1,247,530,823 distinct states found (2,189,019 ds/min), 191,085,175 states left on queue. +Progress(42) at 2024-11-07 00:00:01: 14,910,709,837 states generated (29,319,314 s/min), 1,249,665,632 distinct states found (2,134,809 ds/min), 191,148,911 states left on queue. +Checkpointing of run states/24-11-06-15-30-45.354 +Checkpointing completed at (2024-11-07 00:01:02) +Progress(42) at 2024-11-07 00:01:02: 14,940,301,722 states generated (29,591,885 s/min), 1,251,820,098 distinct states found (2,154,466 ds/min), 191,164,099 states left on queue. 
+Progress(42) at 2024-11-07 00:02:02: 14,969,468,946 states generated (29,167,224 s/min), 1,253,608,374 distinct states found (1,788,276 ds/min), 190,977,899 states left on queue. +Progress(42) at 2024-11-07 00:03:02: 14,998,469,861 states generated (29,000,915 s/min), 1,255,846,206 distinct states found (2,237,832 ds/min), 191,179,932 states left on queue. +Progress(42) at 2024-11-07 00:04:02: 15,027,424,344 states generated (28,954,483 s/min), 1,258,012,253 distinct states found (2,166,047 ds/min), 191,269,006 states left on queue. +Progress(42) at 2024-11-07 00:05:02: 15,056,595,053 states generated (29,170,709 s/min), 1,259,974,817 distinct states found (1,962,564 ds/min), 191,232,379 states left on queue. +Progress(42) at 2024-11-07 00:06:02: 15,085,857,792 states generated (29,262,739 s/min), 1,262,139,752 distinct states found (2,164,935 ds/min), 191,351,326 states left on queue. +Progress(42) at 2024-11-07 00:07:02: 15,115,386,019 states generated (29,528,227 s/min), 1,264,425,723 distinct states found (2,285,971 ds/min), 191,549,077 states left on queue. +Progress(42) at 2024-11-07 00:08:02: 15,144,705,784 states generated (29,319,765 s/min), 1,266,390,816 distinct states found (1,965,093 ds/min), 191,495,454 states left on queue. +Progress(42) at 2024-11-07 00:09:02: 15,173,877,454 states generated (29,171,670 s/min), 1,268,144,487 distinct states found (1,753,671 ds/min), 191,300,959 states left on queue. +Progress(42) at 2024-11-07 00:10:02: 15,203,080,845 states generated (29,203,391 s/min), 1,270,256,870 distinct states found (2,112,383 ds/min), 191,363,085 states left on queue. +Progress(42) at 2024-11-07 00:11:02: 15,232,426,418 states generated (29,345,573 s/min), 1,272,624,413 distinct states found (2,367,543 ds/min), 191,673,032 states left on queue. +Progress(42) at 2024-11-07 00:12:02: 15,261,677,209 states generated (29,250,791 s/min), 1,274,995,857 distinct states found (2,371,444 ds/min), 191,960,618 states left on queue. 
+Progress(42) at 2024-11-07 00:13:02: 15,290,882,314 states generated (29,205,105 s/min), 1,277,269,501 distinct states found (2,273,644 ds/min), 192,155,220 states left on queue. +Progress(42) at 2024-11-07 00:14:02: 15,320,166,816 states generated (29,284,502 s/min), 1,279,524,897 distinct states found (2,255,396 ds/min), 192,367,797 states left on queue. +Progress(42) at 2024-11-07 00:15:02: 15,349,391,017 states generated (29,224,201 s/min), 1,281,912,896 distinct states found (2,387,999 ds/min), 192,657,361 states left on queue. +Progress(42) at 2024-11-07 00:16:02: 15,378,510,873 states generated (29,119,856 s/min), 1,284,352,819 distinct states found (2,439,923 ds/min), 192,982,001 states left on queue. +Progress(42) at 2024-11-07 00:17:02: 15,407,729,690 states generated (29,218,817 s/min), 1,286,798,116 distinct states found (2,445,297 ds/min), 193,251,888 states left on queue. +Progress(42) at 2024-11-07 00:18:02: 15,437,122,682 states generated (29,392,992 s/min), 1,289,060,398 distinct states found (2,262,282 ds/min), 193,393,686 states left on queue. +Progress(42) at 2024-11-07 00:19:02: 15,466,437,919 states generated (29,315,237 s/min), 1,291,390,007 distinct states found (2,329,609 ds/min), 193,674,611 states left on queue. +Progress(42) at 2024-11-07 00:20:02: 15,495,795,434 states generated (29,357,515 s/min), 1,293,625,999 distinct states found (2,235,992 ds/min), 193,855,148 states left on queue. +Progress(42) at 2024-11-07 00:21:02: 15,524,856,146 states generated (29,060,712 s/min), 1,295,675,220 distinct states found (2,049,221 ds/min), 193,858,347 states left on queue. +Progress(42) at 2024-11-07 00:22:02: 15,553,951,279 states generated (29,095,133 s/min), 1,297,806,219 distinct states found (2,130,999 ds/min), 193,910,330 states left on queue. +Progress(42) at 2024-11-07 00:23:02: 15,582,781,229 states generated (28,829,950 s/min), 1,300,215,254 distinct states found (2,409,035 ds/min), 194,211,020 states left on queue. 
+Progress(42) at 2024-11-07 00:24:02: 15,611,889,872 states generated (29,108,643 s/min), 1,302,431,347 distinct states found (2,216,093 ds/min), 194,324,070 states left on queue. +Progress(42) at 2024-11-07 00:25:02: 15,640,778,210 states generated (28,888,338 s/min), 1,304,674,839 distinct states found (2,243,492 ds/min), 194,483,563 states left on queue. +Progress(42) at 2024-11-07 00:26:02: 15,669,830,004 states generated (29,051,794 s/min), 1,306,661,103 distinct states found (1,986,264 ds/min), 194,429,101 states left on queue. +Progress(42) at 2024-11-07 00:27:02: 15,699,049,213 states generated (29,219,209 s/min), 1,308,920,712 distinct states found (2,259,609 ds/min), 194,577,576 states left on queue. +Progress(42) at 2024-11-07 00:28:02: 15,728,283,982 states generated (29,234,769 s/min), 1,310,924,780 distinct states found (2,004,068 ds/min), 194,488,601 states left on queue. +Progress(42) at 2024-11-07 00:29:02: 15,757,507,793 states generated (29,223,811 s/min), 1,312,729,390 distinct states found (1,804,610 ds/min), 194,321,454 states left on queue. +Progress(42) at 2024-11-07 00:30:02: 15,786,513,733 states generated (29,005,940 s/min), 1,314,926,573 distinct states found (2,197,183 ds/min), 194,422,995 states left on queue. +Checkpointing of run states/24-11-06-15-30-45.354 +Checkpointing completed at (2024-11-07 00:31:03) +Progress(42) at 2024-11-07 00:31:03: 15,815,683,048 states generated (29,169,315 s/min), 1,317,135,461 distinct states found (2,208,888 ds/min), 194,492,192 states left on queue. +Progress(42) at 2024-11-07 00:32:03: 15,844,758,678 states generated (29,075,630 s/min), 1,319,144,875 distinct states found (2,009,414 ds/min), 194,413,387 states left on queue. +Progress(42) at 2024-11-07 00:33:03: 15,873,998,157 states generated (29,239,479 s/min), 1,320,932,025 distinct states found (1,787,150 ds/min), 194,281,981 states left on queue. 
+Progress(42) at 2024-11-07 00:34:03: 15,903,205,479 states generated (29,207,322 s/min), 1,322,654,400 distinct states found (1,722,375 ds/min), 194,091,121 states left on queue. +Progress(42) at 2024-11-07 00:35:03: 15,932,501,264 states generated (29,295,785 s/min), 1,324,682,430 distinct states found (2,028,030 ds/min), 194,137,494 states left on queue. +Progress(42) at 2024-11-07 00:36:03: 15,961,589,919 states generated (29,088,655 s/min), 1,326,509,334 distinct states found (1,826,904 ds/min), 194,051,639 states left on queue. +Progress(42) at 2024-11-07 00:37:03: 15,990,668,327 states generated (29,078,408 s/min), 1,328,357,672 distinct states found (1,848,338 ds/min), 193,989,585 states left on queue. +Progress(42) at 2024-11-07 00:38:03: 16,019,782,313 states generated (29,113,986 s/min), 1,330,232,446 distinct states found (1,874,774 ds/min), 193,949,446 states left on queue. +Progress(42) at 2024-11-07 00:39:03: 16,049,252,200 states generated (29,469,887 s/min), 1,331,987,412 distinct states found (1,754,966 ds/min), 193,747,896 states left on queue. +Progress(42) at 2024-11-07 00:40:03: 16,078,692,514 states generated (29,440,314 s/min), 1,333,894,185 distinct states found (1,906,773 ds/min), 193,729,942 states left on queue. +Progress(42) at 2024-11-07 00:41:03: 16,108,160,136 states generated (29,467,622 s/min), 1,336,102,661 distinct states found (2,208,476 ds/min), 193,914,624 states left on queue. +Progress(42) at 2024-11-07 00:42:03: 16,137,813,382 states generated (29,653,246 s/min), 1,338,180,836 distinct states found (2,078,175 ds/min), 193,976,996 states left on queue. +Progress(43) at 2024-11-07 00:43:03: 16,167,357,885 states generated (29,544,503 s/min), 1,339,957,139 distinct states found (1,776,303 ds/min), 193,787,392 states left on queue. +Progress(43) at 2024-11-07 00:44:03: 16,196,650,450 states generated (29,292,565 s/min), 1,341,719,088 distinct states found (1,761,949 ds/min), 193,648,551 states left on queue. 
+Progress(43) at 2024-11-07 00:45:03: 16,225,735,286 states generated (29,084,836 s/min), 1,343,468,127 distinct states found (1,749,039 ds/min), 193,497,590 states left on queue. +Progress(43) at 2024-11-07 00:46:03: 16,254,805,612 states generated (29,070,326 s/min), 1,345,280,226 distinct states found (1,812,099 ds/min), 193,364,788 states left on queue. +Progress(43) at 2024-11-07 00:47:03: 16,283,933,423 states generated (29,127,811 s/min), 1,347,294,879 distinct states found (2,014,653 ds/min), 193,397,713 states left on queue. +Progress(43) at 2024-11-07 00:48:03: 16,312,911,730 states generated (28,978,307 s/min), 1,349,192,377 distinct states found (1,897,498 ds/min), 193,321,503 states left on queue. +Progress(43) at 2024-11-07 00:49:03: 16,342,115,657 states generated (29,203,927 s/min), 1,350,961,684 distinct states found (1,769,307 ds/min), 193,144,596 states left on queue. +Progress(43) at 2024-11-07 00:50:03: 16,370,988,391 states generated (28,872,734 s/min), 1,352,868,904 distinct states found (1,907,220 ds/min), 193,089,969 states left on queue. +Progress(43) at 2024-11-07 00:51:03: 16,400,089,208 states generated (29,100,817 s/min), 1,354,864,448 distinct states found (1,995,544 ds/min), 193,098,377 states left on queue. +Progress(43) at 2024-11-07 00:52:03: 16,429,331,456 states generated (29,242,248 s/min), 1,356,734,632 distinct states found (1,870,184 ds/min), 193,093,615 states left on queue. +Progress(43) at 2024-11-07 00:53:03: 16,458,648,761 states generated (29,317,305 s/min), 1,358,622,917 distinct states found (1,888,285 ds/min), 193,098,172 states left on queue. +Progress(43) at 2024-11-07 00:54:03: 16,487,874,773 states generated (29,226,012 s/min), 1,360,737,908 distinct states found (2,114,991 ds/min), 193,250,949 states left on queue. +Progress(43) at 2024-11-07 00:55:03: 16,517,101,401 states generated (29,226,628 s/min), 1,363,024,072 distinct states found (2,286,164 ds/min), 193,508,719 states left on queue. 
+Progress(43) at 2024-11-07 00:56:03: 16,546,231,362 states generated (29,129,961 s/min), 1,365,056,771 distinct states found (2,032,699 ds/min), 193,558,441 states left on queue. +Progress(43) at 2024-11-07 00:57:03: 16,575,532,837 states generated (29,301,475 s/min), 1,367,107,709 distinct states found (2,050,938 ds/min), 193,609,354 states left on queue. +Progress(43) at 2024-11-07 00:58:03: 16,604,872,137 states generated (29,339,300 s/min), 1,369,059,417 distinct states found (1,951,708 ds/min), 193,561,420 states left on queue. +Progress(43) at 2024-11-07 00:59:03: 16,634,070,732 states generated (29,198,595 s/min), 1,371,016,928 distinct states found (1,957,511 ds/min), 193,513,278 states left on queue. +Progress(43) at 2024-11-07 01:00:03: 16,663,158,113 states generated (29,087,381 s/min), 1,373,092,542 distinct states found (2,075,614 ds/min), 193,582,661 states left on queue. +Checkpointing of run states/24-11-06-15-30-45.354 +Checkpointing completed at (2024-11-07 01:01:03) +Progress(43) at 2024-11-07 01:01:03: 16,692,576,110 states generated (29,417,997 s/min), 1,375,200,108 distinct states found (2,107,566 ds/min), 193,664,621 states left on queue. +Progress(43) at 2024-11-07 01:02:03: 16,721,716,479 states generated (29,140,369 s/min), 1,377,247,529 distinct states found (2,047,421 ds/min), 193,708,538 states left on queue. +Progress(43) at 2024-11-07 01:03:03: 16,750,779,523 states generated (29,063,044 s/min), 1,379,368,087 distinct states found (2,120,558 ds/min), 193,813,065 states left on queue. +Progress(43) at 2024-11-07 01:04:03: 16,779,794,524 states generated (29,015,001 s/min), 1,381,371,287 distinct states found (2,003,200 ds/min), 193,825,465 states left on queue. +Progress(43) at 2024-11-07 01:05:03: 16,808,907,203 states generated (29,112,679 s/min), 1,383,515,008 distinct states found (2,143,721 ds/min), 193,953,821 states left on queue. 
+Progress(43) at 2024-11-07 01:06:03: 16,838,029,628 states generated (29,122,425 s/min), 1,385,629,882 distinct states found (2,114,874 ds/min), 194,038,163 states left on queue. +Progress(43) at 2024-11-07 01:07:03: 16,867,418,111 states generated (29,388,483 s/min), 1,387,561,049 distinct states found (1,931,167 ds/min), 194,004,058 states left on queue. +Progress(43) at 2024-11-07 01:08:03: 16,896,555,416 states generated (29,137,305 s/min), 1,389,592,238 distinct states found (2,031,189 ds/min), 194,058,208 states left on queue. +Progress(43) at 2024-11-07 01:09:03: 16,925,642,685 states generated (29,087,269 s/min), 1,391,404,896 distinct states found (1,812,658 ds/min), 193,924,951 states left on queue. +Progress(43) at 2024-11-07 01:10:03: 16,954,638,533 states generated (28,995,848 s/min), 1,393,186,525 distinct states found (1,781,629 ds/min), 193,784,358 states left on queue. +Progress(43) at 2024-11-07 01:11:03: 16,983,710,894 states generated (29,072,361 s/min), 1,395,018,264 distinct states found (1,831,739 ds/min), 193,697,690 states left on queue. +Progress(43) at 2024-11-07 01:12:03: 17,012,741,316 states generated (29,030,422 s/min), 1,397,039,325 distinct states found (2,021,061 ds/min), 193,755,919 states left on queue. +Progress(43) at 2024-11-07 01:13:03: 17,041,674,538 states generated (28,933,222 s/min), 1,399,086,352 distinct states found (2,047,027 ds/min), 193,799,420 states left on queue. +Progress(43) at 2024-11-07 01:14:03: 17,070,653,912 states generated (28,979,374 s/min), 1,401,092,312 distinct states found (2,005,960 ds/min), 193,820,018 states left on queue. +Progress(43) at 2024-11-07 01:15:03: 17,099,536,446 states generated (28,882,534 s/min), 1,403,159,743 distinct states found (2,067,431 ds/min), 193,867,947 states left on queue. +Progress(43) at 2024-11-07 01:16:03: 17,128,396,670 states generated (28,860,224 s/min), 1,405,244,280 distinct states found (2,084,537 ds/min), 193,945,380 states left on queue. 
+Progress(43) at 2024-11-07 01:17:03: 17,157,276,177 states generated (28,879,507 s/min), 1,407,274,748 distinct states found (2,030,468 ds/min), 193,944,077 states left on queue. +Progress(43) at 2024-11-07 01:18:03: 17,186,149,639 states generated (28,873,462 s/min), 1,409,283,088 distinct states found (2,008,340 ds/min), 193,881,792 states left on queue. +Progress(43) at 2024-11-07 01:19:03: 17,214,923,206 states generated (28,773,567 s/min), 1,411,167,065 distinct states found (1,883,977 ds/min), 193,711,394 states left on queue. +Progress(43) at 2024-11-07 01:20:03: 17,243,730,245 states generated (28,807,039 s/min), 1,413,023,763 distinct states found (1,856,698 ds/min), 193,546,054 states left on queue. +Progress(43) at 2024-11-07 01:21:03: 17,272,650,525 states generated (28,920,280 s/min), 1,414,802,171 distinct states found (1,778,408 ds/min), 193,345,308 states left on queue. +Progress(43) at 2024-11-07 01:22:03: 17,301,943,589 states generated (29,293,064 s/min), 1,416,599,440 distinct states found (1,797,269 ds/min), 193,158,676 states left on queue. +Progress(43) at 2024-11-07 01:23:03: 17,331,337,313 states generated (29,393,724 s/min), 1,418,547,450 distinct states found (1,948,010 ds/min), 193,112,883 states left on queue. +Progress(43) at 2024-11-07 01:24:03: 17,360,793,100 states generated (29,455,787 s/min), 1,420,576,018 distinct states found (2,028,568 ds/min), 193,100,476 states left on queue. +Progress(43) at 2024-11-07 01:25:03: 17,390,123,392 states generated (29,330,292 s/min), 1,422,693,479 distinct states found (2,117,461 ds/min), 193,171,748 states left on queue. +Progress(43) at 2024-11-07 01:26:03: 17,419,468,515 states generated (29,345,123 s/min), 1,424,783,244 distinct states found (2,089,765 ds/min), 193,228,274 states left on queue. +Progress(43) at 2024-11-07 01:27:03: 17,448,810,016 states generated (29,341,501 s/min), 1,426,560,811 distinct states found (1,777,567 ds/min), 193,036,459 states left on queue. 
+Progress(43) at 2024-11-07 01:28:03: 17,478,034,472 states generated (29,224,456 s/min), 1,428,663,374 distinct states found (2,102,563 ds/min), 193,125,616 states left on queue. +Progress(43) at 2024-11-07 01:29:03: 17,507,201,835 states generated (29,167,363 s/min), 1,430,735,910 distinct states found (2,072,536 ds/min), 193,146,850 states left on queue. +Progress(43) at 2024-11-07 01:30:03: 17,536,546,498 states generated (29,344,663 s/min), 1,432,877,950 distinct states found (2,142,040 ds/min), 193,230,645 states left on queue. +Checkpointing of run states/24-11-06-15-30-45.354 +Checkpointing completed at (2024-11-07 01:31:04) +Progress(43) at 2024-11-07 01:31:04: 17,566,061,546 states generated (29,515,048 s/min), 1,434,839,708 distinct states found (1,961,758 ds/min), 193,176,951 states left on queue. +Progress(43) at 2024-11-07 01:32:04: 17,595,015,993 states generated (28,954,447 s/min), 1,436,986,257 distinct states found (2,146,549 ds/min), 193,289,254 states left on queue. +Progress(43) at 2024-11-07 01:33:04: 17,624,137,153 states generated (29,121,160 s/min), 1,439,279,150 distinct states found (2,292,893 ds/min), 193,525,973 states left on queue. +Progress(43) at 2024-11-07 01:34:04: 17,653,328,248 states generated (29,191,095 s/min), 1,441,299,767 distinct states found (2,020,617 ds/min), 193,504,947 states left on queue. +Progress(43) at 2024-11-07 01:35:04: 17,682,562,562 states generated (29,234,314 s/min), 1,443,317,413 distinct states found (2,017,646 ds/min), 193,471,905 states left on queue. +Progress(43) at 2024-11-07 01:36:04: 17,711,829,397 states generated (29,266,835 s/min), 1,445,304,310 distinct states found (1,986,897 ds/min), 193,370,899 states left on queue. +Progress(43) at 2024-11-07 01:37:04: 17,740,910,347 states generated (29,080,950 s/min), 1,447,009,563 distinct states found (1,705,253 ds/min), 193,129,235 states left on queue. 
+Progress(43) at 2024-11-07 01:38:04: 17,769,836,321 states generated (28,925,974 s/min), 1,449,139,496 distinct states found (2,129,933 ds/min), 193,234,511 states left on queue. +Progress(43) at 2024-11-07 01:39:04: 17,798,713,067 states generated (28,876,746 s/min), 1,451,211,612 distinct states found (2,072,116 ds/min), 193,241,362 states left on queue. +Progress(43) at 2024-11-07 01:40:04: 17,827,794,691 states generated (29,081,624 s/min), 1,453,062,046 distinct states found (1,850,434 ds/min), 193,114,753 states left on queue. +Progress(43) at 2024-11-07 01:41:04: 17,856,974,014 states generated (29,179,323 s/min), 1,455,151,187 distinct states found (2,089,141 ds/min), 193,169,579 states left on queue. +Progress(43) at 2024-11-07 01:42:04: 17,886,446,666 states generated (29,472,652 s/min), 1,457,303,171 distinct states found (2,151,984 ds/min), 193,263,708 states left on queue. +Progress(43) at 2024-11-07 01:43:04: 17,915,744,840 states generated (29,298,174 s/min), 1,459,261,460 distinct states found (1,958,289 ds/min), 193,194,468 states left on queue. +Progress(43) at 2024-11-07 01:44:04: 17,944,793,057 states generated (29,048,217 s/min), 1,460,885,305 distinct states found (1,623,845 ds/min), 192,905,330 states left on queue. +Progress(43) at 2024-11-07 01:45:04: 17,973,952,967 states generated (29,159,910 s/min), 1,462,880,642 distinct states found (1,995,337 ds/min), 192,871,348 states left on queue. +Progress(43) at 2024-11-07 01:46:04: 18,003,158,344 states generated (29,205,377 s/min), 1,465,077,846 distinct states found (2,197,204 ds/min), 193,039,702 states left on queue. +Progress(43) at 2024-11-07 01:47:04: 18,032,464,087 states generated (29,305,743 s/min), 1,467,361,120 distinct states found (2,283,274 ds/min), 193,271,051 states left on queue. +Progress(43) at 2024-11-07 01:48:04: 18,061,597,682 states generated (29,133,595 s/min), 1,469,505,688 distinct states found (2,144,568 ds/min), 193,354,360 states left on queue. 
+Progress(43) at 2024-11-07 01:49:04: 18,090,888,515 states generated (29,290,833 s/min), 1,471,655,035 distinct states found (2,149,347 ds/min), 193,472,080 states left on queue. +Progress(43) at 2024-11-07 01:50:04: 18,119,855,749 states generated (28,967,234 s/min), 1,473,959,147 distinct states found (2,304,112 ds/min), 193,714,821 states left on queue. +Progress(43) at 2024-11-07 01:51:04: 18,149,035,954 states generated (29,180,205 s/min), 1,476,253,894 distinct states found (2,294,747 ds/min), 193,939,051 states left on queue. +Progress(43) at 2024-11-07 01:52:04: 18,178,210,402 states generated (29,174,448 s/min), 1,478,557,699 distinct states found (2,303,805 ds/min), 194,141,809 states left on queue. +Progress(43) at 2024-11-07 01:53:04: 18,207,377,534 states generated (29,167,132 s/min), 1,480,870,404 distinct states found (2,312,705 ds/min), 194,307,877 states left on queue. +Progress(43) at 2024-11-07 01:54:04: 18,236,577,989 states generated (29,200,455 s/min), 1,483,070,823 distinct states found (2,200,419 ds/min), 194,387,223 states left on queue. +Progress(43) at 2024-11-07 01:55:04: 18,265,859,163 states generated (29,281,174 s/min), 1,485,222,154 distinct states found (2,151,331 ds/min), 194,522,233 states left on queue. +Progress(43) at 2024-11-07 01:56:04: 18,295,148,797 states generated (29,289,634 s/min), 1,487,521,283 distinct states found (2,299,129 ds/min), 194,755,427 states left on queue. +Progress(43) at 2024-11-07 01:57:04: 18,324,289,175 states generated (29,140,378 s/min), 1,489,367,193 distinct states found (1,845,910 ds/min), 194,604,366 states left on queue. +Progress(43) at 2024-11-07 01:58:04: 18,353,385,770 states generated (29,096,595 s/min), 1,491,503,782 distinct states found (2,136,589 ds/min), 194,651,670 states left on queue. +Progress(43) at 2024-11-07 01:59:04: 18,382,277,307 states generated (28,891,537 s/min), 1,493,659,362 distinct states found (2,155,580 ds/min), 194,761,640 states left on queue. 
+Progress(43) at 2024-11-07 02:00:04: 18,411,146,853 states generated (28,869,546 s/min), 1,495,935,237 distinct states found (2,275,875 ds/min), 194,908,896 states left on queue. +Checkpointing of run states/24-11-06-15-30-45.354 +Checkpointing completed at (2024-11-07 02:01:05) +Progress(43) at 2024-11-07 02:01:05: 18,440,532,837 states generated (29,385,984 s/min), 1,497,937,249 distinct states found (2,002,012 ds/min), 194,874,096 states left on queue. +Progress(43) at 2024-11-07 02:02:05: 18,469,385,511 states generated (28,852,674 s/min), 1,500,062,084 distinct states found (2,124,835 ds/min), 194,900,930 states left on queue. +Progress(43) at 2024-11-07 02:03:05: 18,498,509,160 states generated (29,123,649 s/min), 1,502,077,387 distinct states found (2,015,303 ds/min), 194,871,250 states left on queue. +Progress(43) at 2024-11-07 02:04:05: 18,527,694,520 states generated (29,185,360 s/min), 1,504,242,136 distinct states found (2,164,749 ds/min), 194,924,748 states left on queue. +Progress(43) at 2024-11-07 02:05:05: 18,556,901,350 states generated (29,206,830 s/min), 1,506,034,687 distinct states found (1,792,551 ds/min), 194,670,609 states left on queue. +Progress(43) at 2024-11-07 02:06:05: 18,586,004,706 states generated (29,103,356 s/min), 1,507,879,191 distinct states found (1,844,504 ds/min), 194,551,782 states left on queue. +Progress(43) at 2024-11-07 02:07:05: 18,614,881,319 states generated (28,876,613 s/min), 1,510,019,997 distinct states found (2,140,806 ds/min), 194,594,957 states left on queue. +Progress(43) at 2024-11-07 02:08:05: 18,643,854,322 states generated (28,973,003 s/min), 1,512,074,165 distinct states found (2,054,168 ds/min), 194,532,832 states left on queue. +Progress(43) at 2024-11-07 02:09:05: 18,672,998,550 states generated (29,144,228 s/min), 1,513,943,120 distinct states found (1,868,955 ds/min), 194,368,599 states left on queue. 
+Progress(43) at 2024-11-07 02:10:05: 18,702,201,308 states generated (29,202,758 s/min), 1,515,546,068 distinct states found (1,602,948 ds/min), 194,090,755 states left on queue. +Progress(43) at 2024-11-07 02:11:05: 18,731,481,011 states generated (29,279,703 s/min), 1,517,343,788 distinct states found (1,797,720 ds/min), 193,942,961 states left on queue. +Progress(43) at 2024-11-07 02:12:05: 18,760,609,986 states generated (29,128,975 s/min), 1,519,160,050 distinct states found (1,816,262 ds/min), 193,815,502 states left on queue. +Progress(43) at 2024-11-07 02:13:05: 18,789,628,202 states generated (29,018,216 s/min), 1,520,860,123 distinct states found (1,700,073 ds/min), 193,642,399 states left on queue. +Progress(43) at 2024-11-07 02:14:05: 18,818,770,407 states generated (29,142,205 s/min), 1,522,616,126 distinct states found (1,756,003 ds/min), 193,516,180 states left on queue. +Progress(43) at 2024-11-07 02:15:05: 18,847,943,521 states generated (29,173,114 s/min), 1,524,373,878 distinct states found (1,757,752 ds/min), 193,352,389 states left on queue. +Progress(43) at 2024-11-07 02:16:05: 18,877,338,814 states generated (29,395,293 s/min), 1,526,022,199 distinct states found (1,648,321 ds/min), 193,099,089 states left on queue. +Progress(43) at 2024-11-07 02:17:05: 18,906,854,907 states generated (29,516,093 s/min), 1,528,057,287 distinct states found (2,035,088 ds/min), 193,164,007 states left on queue. +Progress(43) at 2024-11-07 02:18:05: 18,936,272,714 states generated (29,417,807 s/min), 1,530,070,868 distinct states found (2,013,581 ds/min), 193,195,191 states left on queue. +Progress(43) at 2024-11-07 02:19:05: 18,965,845,291 states generated (29,572,577 s/min), 1,531,953,514 distinct states found (1,882,646 ds/min), 193,094,610 states left on queue. +Progress(44) at 2024-11-07 02:20:05: 18,995,225,711 states generated (29,380,420 s/min), 1,533,586,486 distinct states found (1,632,972 ds/min), 192,813,292 states left on queue. 
+Progress(44) at 2024-11-07 02:21:05: 19,024,424,249 states generated (29,198,538 s/min), 1,535,341,846 distinct states found (1,755,360 ds/min), 192,665,431 states left on queue. +Progress(44) at 2024-11-07 02:22:05: 19,053,319,611 states generated (28,895,362 s/min), 1,536,913,652 distinct states found (1,571,806 ds/min), 192,336,687 states left on queue. +Progress(44) at 2024-11-07 02:23:05: 19,082,456,366 states generated (29,136,755 s/min), 1,538,781,638 distinct states found (1,867,986 ds/min), 192,258,068 states left on queue. +Progress(44) at 2024-11-07 02:24:05: 19,111,445,941 states generated (28,989,575 s/min), 1,540,696,734 distinct states found (1,915,096 ds/min), 192,193,602 states left on queue. +Progress(44) at 2024-11-07 02:25:05: 19,140,498,683 states generated (29,052,742 s/min), 1,542,368,994 distinct states found (1,672,260 ds/min), 191,938,239 states left on queue. +Progress(44) at 2024-11-07 02:26:05: 19,169,386,645 states generated (28,887,962 s/min), 1,544,099,236 distinct states found (1,730,242 ds/min), 191,741,059 states left on queue. +Progress(44) at 2024-11-07 02:27:05: 19,198,354,957 states generated (28,968,312 s/min), 1,545,891,836 distinct states found (1,792,600 ds/min), 191,577,211 states left on queue. +Progress(44) at 2024-11-07 02:28:05: 19,227,551,398 states generated (29,196,441 s/min), 1,547,751,807 distinct states found (1,859,971 ds/min), 191,530,291 states left on queue. +Progress(44) at 2024-11-07 02:29:05: 19,256,905,544 states generated (29,354,146 s/min), 1,549,562,753 distinct states found (1,810,946 ds/min), 191,492,536 states left on queue. +Progress(44) at 2024-11-07 02:30:05: 19,286,043,009 states generated (29,137,465 s/min), 1,551,387,062 distinct states found (1,824,309 ds/min), 191,432,131 states left on queue. 
+Checkpointing of run states/24-11-06-15-30-45.354 +Checkpointing completed at (2024-11-07 02:31:05) +Progress(44) at 2024-11-07 02:31:05: 19,315,478,636 states generated (29,435,627 s/min), 1,553,684,416 distinct states found (2,297,354 ds/min), 191,685,387 states left on queue. +Progress(44) at 2024-11-07 02:32:05: 19,344,574,433 states generated (29,095,797 s/min), 1,555,642,251 distinct states found (1,957,835 ds/min), 191,687,380 states left on queue. +Progress(44) at 2024-11-07 02:33:05: 19,373,560,321 states generated (28,985,888 s/min), 1,557,576,032 distinct states found (1,933,781 ds/min), 191,644,771 states left on queue. +Progress(44) at 2024-11-07 02:34:05: 19,402,882,849 states generated (29,322,528 s/min), 1,559,483,211 distinct states found (1,907,179 ds/min), 191,584,351 states left on queue. +Progress(44) at 2024-11-07 02:35:05: 19,432,084,827 states generated (29,201,978 s/min), 1,561,305,888 distinct states found (1,822,677 ds/min), 191,432,596 states left on queue. +Progress(44) at 2024-11-07 02:36:05: 19,461,112,335 states generated (29,027,508 s/min), 1,563,180,797 distinct states found (1,874,909 ds/min), 191,330,553 states left on queue. +Progress(44) at 2024-11-07 02:37:05: 19,490,043,498 states generated (28,931,163 s/min), 1,565,137,368 distinct states found (1,956,571 ds/min), 191,292,333 states left on queue. +Progress(44) at 2024-11-07 02:38:05: 19,519,153,014 states generated (29,109,516 s/min), 1,567,034,954 distinct states found (1,897,586 ds/min), 191,229,524 states left on queue. +Progress(44) at 2024-11-07 02:39:05: 19,548,204,678 states generated (29,051,664 s/min), 1,568,989,443 distinct states found (1,954,489 ds/min), 191,191,752 states left on queue. +Progress(44) at 2024-11-07 02:40:05: 19,577,227,470 states generated (29,022,792 s/min), 1,570,981,495 distinct states found (1,992,052 ds/min), 191,200,024 states left on queue. 
+Progress(44) at 2024-11-07 02:41:05: 19,606,172,601 states generated (28,945,131 s/min), 1,572,870,324 distinct states found (1,888,829 ds/min), 191,115,956 states left on queue. +Progress(44) at 2024-11-07 02:42:05: 19,635,167,481 states generated (28,994,880 s/min), 1,574,894,468 distinct states found (2,024,144 ds/min), 191,139,869 states left on queue. +Progress(44) at 2024-11-07 02:43:05: 19,664,339,049 states generated (29,171,568 s/min), 1,576,906,348 distinct states found (2,011,880 ds/min), 191,137,521 states left on queue. +Progress(44) at 2024-11-07 02:44:05: 19,693,639,689 states generated (29,300,640 s/min), 1,578,748,425 distinct states found (1,842,077 ds/min), 191,040,518 states left on queue. +Progress(44) at 2024-11-07 02:45:05: 19,722,704,536 states generated (29,064,847 s/min), 1,580,671,538 distinct states found (1,923,113 ds/min), 191,001,469 states left on queue. +Progress(44) at 2024-11-07 02:46:05: 19,751,627,669 states generated (28,923,133 s/min), 1,582,340,762 distinct states found (1,669,224 ds/min), 190,750,504 states left on queue. +Progress(44) at 2024-11-07 02:47:05: 19,780,532,535 states generated (28,904,866 s/min), 1,583,965,049 distinct states found (1,624,287 ds/min), 190,492,540 states left on queue. +Progress(44) at 2024-11-07 02:48:05: 19,809,548,743 states generated (29,016,208 s/min), 1,585,820,774 distinct states found (1,855,725 ds/min), 190,422,454 states left on queue. +Progress(44) at 2024-11-07 02:49:05: 19,838,541,075 states generated (28,992,332 s/min), 1,587,731,649 distinct states found (1,910,875 ds/min), 190,386,932 states left on queue. +Progress(44) at 2024-11-07 02:50:05: 19,867,458,320 states generated (28,917,245 s/min), 1,589,622,141 distinct states found (1,890,492 ds/min), 190,310,460 states left on queue. +Progress(44) at 2024-11-07 02:51:05: 19,896,287,158 states generated (28,828,838 s/min), 1,591,517,151 distinct states found (1,895,010 ds/min), 190,235,561 states left on queue. 
+Progress(44) at 2024-11-07 02:52:05: 19,925,117,820 states generated (28,830,662 s/min), 1,593,453,289 distinct states found (1,936,138 ds/min), 190,176,789 states left on queue. +Progress(44) at 2024-11-07 02:53:05: 19,953,949,651 states generated (28,831,831 s/min), 1,595,392,832 distinct states found (1,939,543 ds/min), 190,137,713 states left on queue. +Progress(44) at 2024-11-07 02:54:05: 19,982,791,590 states generated (28,841,939 s/min), 1,597,295,182 distinct states found (1,902,350 ds/min), 190,030,864 states left on queue. +Progress(44) at 2024-11-07 02:55:05: 20,011,631,796 states generated (28,840,206 s/min), 1,599,162,388 distinct states found (1,867,206 ds/min), 189,857,155 states left on queue. +Progress(44) at 2024-11-07 02:56:05: 20,040,350,017 states generated (28,718,221 s/min), 1,600,882,747 distinct states found (1,720,359 ds/min), 189,556,504 states left on queue. +Progress(44) at 2024-11-07 02:57:05: 20,069,048,267 states generated (28,698,250 s/min), 1,602,583,945 distinct states found (1,701,198 ds/min), 189,276,085 states left on queue. +Progress(44) at 2024-11-07 02:58:05: 20,098,037,079 states generated (28,988,812 s/min), 1,604,245,937 distinct states found (1,661,992 ds/min), 188,968,070 states left on queue. +Progress(44) at 2024-11-07 02:59:05: 20,127,216,730 states generated (29,179,651 s/min), 1,605,916,753 distinct states found (1,670,816 ds/min), 188,703,437 states left on queue. +Progress(44) at 2024-11-07 03:00:05: 20,156,712,917 states generated (29,496,187 s/min), 1,607,868,866 distinct states found (1,952,113 ds/min), 188,640,553 states left on queue. +Checkpointing of run states/24-11-06-15-30-45.354 +Checkpointing completed at (2024-11-07 03:01:06) +Progress(44) at 2024-11-07 03:01:06: 20,186,396,044 states generated (29,683,127 s/min), 1,609,765,772 distinct states found (1,896,906 ds/min), 188,510,848 states left on queue. 
+Progress(44) at 2024-11-07 03:02:06: 20,215,754,864 states generated (29,358,820 s/min), 1,611,924,139 distinct states found (2,158,367 ds/min), 188,607,723 states left on queue. +Progress(44) at 2024-11-07 03:03:06: 20,245,041,982 states generated (29,287,118 s/min), 1,613,794,702 distinct states found (1,870,563 ds/min), 188,472,700 states left on queue. +Progress(44) at 2024-11-07 03:04:06: 20,274,294,374 states generated (29,252,392 s/min), 1,615,566,733 distinct states found (1,772,031 ds/min), 188,311,061 states left on queue. +Progress(44) at 2024-11-07 03:05:06: 20,303,317,537 states generated (29,023,163 s/min), 1,617,541,966 distinct states found (1,975,233 ds/min), 188,275,227 states left on queue. +Progress(44) at 2024-11-07 03:06:06: 20,332,555,917 states generated (29,238,380 s/min), 1,619,626,477 distinct states found (2,084,511 ds/min), 188,311,129 states left on queue. +Progress(44) at 2024-11-07 03:07:06: 20,361,814,948 states generated (29,259,031 s/min), 1,621,498,944 distinct states found (1,872,467 ds/min), 188,187,982 states left on queue. +Progress(44) at 2024-11-07 03:08:06: 20,391,066,062 states generated (29,251,114 s/min), 1,623,499,145 distinct states found (2,000,201 ds/min), 188,184,372 states left on queue. +Progress(44) at 2024-11-07 03:09:06: 20,420,013,539 states generated (28,947,477 s/min), 1,625,534,256 distinct states found (2,035,111 ds/min), 188,202,174 states left on queue. +Progress(44) at 2024-11-07 03:10:06: 20,449,116,787 states generated (29,103,248 s/min), 1,627,670,135 distinct states found (2,135,879 ds/min), 188,303,061 states left on queue. +Progress(44) at 2024-11-07 03:11:06: 20,478,265,224 states generated (29,148,437 s/min), 1,629,558,947 distinct states found (1,888,812 ds/min), 188,171,995 states left on queue. +Progress(44) at 2024-11-07 03:12:06: 20,507,459,785 states generated (29,194,561 s/min), 1,631,460,915 distinct states found (1,901,968 ds/min), 188,044,516 states left on queue. 
+Progress(44) at 2024-11-07 03:13:06: 20,536,655,025 states generated (29,195,240 s/min), 1,633,292,515 distinct states found (1,831,600 ds/min), 187,823,678 states left on queue. +Progress(44) at 2024-11-07 03:14:06: 20,565,699,198 states generated (29,044,173 s/min), 1,634,967,122 distinct states found (1,674,607 ds/min), 187,564,357 states left on queue. +Progress(44) at 2024-11-07 03:15:06: 20,594,568,781 states generated (28,869,583 s/min), 1,636,996,440 distinct states found (2,029,318 ds/min), 187,577,506 states left on queue. +Progress(44) at 2024-11-07 03:16:06: 20,623,463,526 states generated (28,894,745 s/min), 1,638,870,718 distinct states found (1,874,278 ds/min), 187,429,057 states left on queue. +Progress(44) at 2024-11-07 03:17:06: 20,652,517,975 states generated (29,054,449 s/min), 1,640,608,054 distinct states found (1,737,336 ds/min), 187,198,996 states left on queue. +Progress(44) at 2024-11-07 03:18:06: 20,681,729,377 states generated (29,211,402 s/min), 1,642,682,611 distinct states found (2,074,557 ds/min), 187,238,673 states left on queue. +Progress(44) at 2024-11-07 03:19:06: 20,711,226,363 states generated (29,496,986 s/min), 1,644,764,480 distinct states found (2,081,869 ds/min), 187,269,746 states left on queue. +Progress(44) at 2024-11-07 03:20:06: 20,740,520,876 states generated (29,294,513 s/min), 1,646,565,948 distinct states found (1,801,468 ds/min), 187,085,841 states left on queue. +Progress(44) at 2024-11-07 03:21:06: 20,769,532,066 states generated (29,011,190 s/min), 1,648,139,570 distinct states found (1,573,622 ds/min), 186,737,971 states left on queue. +Progress(44) at 2024-11-07 03:22:06: 20,798,731,555 states generated (29,199,489 s/min), 1,650,061,318 distinct states found (1,921,748 ds/min), 186,652,080 states left on queue. +Progress(44) at 2024-11-07 03:23:06: 20,827,864,871 states generated (29,133,316 s/min), 1,652,217,368 distinct states found (2,156,050 ds/min), 186,786,338 states left on queue. 
+Progress(44) at 2024-11-07 03:24:06: 20,857,114,542 states generated (29,249,671 s/min), 1,654,404,059 distinct states found (2,186,691 ds/min), 186,937,127 states left on queue. +Progress(44) at 2024-11-07 03:25:06: 20,886,216,235 states generated (29,101,693 s/min), 1,656,424,687 distinct states found (2,020,628 ds/min), 186,925,384 states left on queue. +Progress(44) at 2024-11-07 03:26:06: 20,915,415,138 states generated (29,198,903 s/min), 1,658,503,968 distinct states found (2,079,281 ds/min), 186,988,200 states left on queue. +Progress(44) at 2024-11-07 03:27:06: 20,944,436,117 states generated (29,020,979 s/min), 1,660,708,925 distinct states found (2,204,957 ds/min), 187,151,771 states left on queue. +Progress(44) at 2024-11-07 03:28:06: 20,973,637,986 states generated (29,201,869 s/min), 1,662,812,161 distinct states found (2,103,236 ds/min), 187,208,363 states left on queue. +Progress(44) at 2024-11-07 03:29:06: 21,002,664,654 states generated (29,026,668 s/min), 1,665,077,078 distinct states found (2,264,917 ds/min), 187,398,168 states left on queue. +Progress(44) at 2024-11-07 03:30:06: 21,031,900,683 states generated (29,236,029 s/min), 1,667,241,517 distinct states found (2,164,439 ds/min), 187,444,342 states left on queue. +Checkpointing of run states/24-11-06-15-30-45.354 +Checkpointing completed at (2024-11-07 03:31:07) +Progress(44) at 2024-11-07 03:31:07: 21,061,190,967 states generated (29,290,284 s/min), 1,669,337,346 distinct states found (2,095,829 ds/min), 187,431,388 states left on queue. +Progress(44) at 2024-11-07 03:32:07: 21,090,368,622 states generated (29,177,655 s/min), 1,671,292,120 distinct states found (1,954,774 ds/min), 187,370,395 states left on queue. +Progress(44) at 2024-11-07 03:33:07: 21,119,546,588 states generated (29,177,966 s/min), 1,673,505,061 distinct states found (2,212,941 ds/min), 187,548,275 states left on queue. 
+Progress(44) at 2024-11-07 03:34:07: 21,148,770,544 states generated (29,223,956 s/min), 1,675,679,331 distinct states found (2,174,270 ds/min), 187,681,477 states left on queue. +Progress(44) at 2024-11-07 03:35:07: 21,177,752,842 states generated (28,982,298 s/min), 1,677,512,003 distinct states found (1,832,672 ds/min), 187,502,663 states left on queue. +Progress(44) at 2024-11-07 03:36:07: 21,206,777,989 states generated (29,025,147 s/min), 1,679,391,120 distinct states found (1,879,117 ds/min), 187,345,876 states left on queue. +Progress(44) at 2024-11-07 03:37:07: 21,235,634,562 states generated (28,856,573 s/min), 1,681,551,461 distinct states found (2,160,341 ds/min), 187,464,887 states left on queue. +Progress(44) at 2024-11-07 03:38:07: 21,264,448,690 states generated (28,814,128 s/min), 1,683,690,836 distinct states found (2,139,375 ds/min), 187,491,922 states left on queue. +Progress(44) at 2024-11-07 03:39:07: 21,293,469,454 states generated (29,020,764 s/min), 1,685,615,643 distinct states found (1,924,807 ds/min), 187,416,199 states left on queue. +Progress(44) at 2024-11-07 03:40:07: 21,322,287,082 states generated (28,817,628 s/min), 1,687,574,723 distinct states found (1,959,080 ds/min), 187,310,981 states left on queue. +Progress(44) at 2024-11-07 03:41:07: 21,351,396,680 states generated (29,109,598 s/min), 1,689,546,445 distinct states found (1,971,722 ds/min), 187,236,923 states left on queue. +Progress(44) at 2024-11-07 03:42:07: 21,380,557,165 states generated (29,160,485 s/min), 1,691,587,169 distinct states found (2,040,724 ds/min), 187,186,480 states left on queue. +Progress(44) at 2024-11-07 03:43:07: 21,409,627,333 states generated (29,070,168 s/min), 1,693,246,645 distinct states found (1,659,476 ds/min), 186,839,410 states left on queue. +Progress(44) at 2024-11-07 03:44:07: 21,438,692,500 states generated (29,065,167 s/min), 1,695,162,088 distinct states found (1,915,443 ds/min), 186,763,843 states left on queue. 
+Progress(44) at 2024-11-07 03:45:07: 21,467,558,980 states generated (28,866,480 s/min), 1,697,105,328 distinct states found (1,943,240 ds/min), 186,647,091 states left on queue. +Progress(44) at 2024-11-07 03:46:07: 21,496,459,596 states generated (28,900,616 s/min), 1,698,987,134 distinct states found (1,881,806 ds/min), 186,428,411 states left on queue. +Progress(44) at 2024-11-07 03:47:07: 21,525,539,564 states generated (29,079,968 s/min), 1,700,685,335 distinct states found (1,698,201 ds/min), 186,176,831 states left on queue. +Progress(44) at 2024-11-07 03:48:07: 21,554,716,115 states generated (29,176,551 s/min), 1,702,193,633 distinct states found (1,508,298 ds/min), 185,811,852 states left on queue. +Progress(44) at 2024-11-07 03:49:07: 21,583,930,332 states generated (29,214,217 s/min), 1,703,965,186 distinct states found (1,771,553 ds/min), 185,645,122 states left on queue. +Progress(44) at 2024-11-07 03:50:07: 21,612,870,304 states generated (28,939,972 s/min), 1,705,581,017 distinct states found (1,615,831 ds/min), 185,385,482 states left on queue. +Progress(44) at 2024-11-07 03:51:07: 21,641,828,993 states generated (28,958,689 s/min), 1,707,209,695 distinct states found (1,628,678 ds/min), 185,147,878 states left on queue. +Progress(44) at 2024-11-07 03:52:07: 21,670,879,227 states generated (29,050,234 s/min), 1,708,891,056 distinct states found (1,681,361 ds/min), 184,967,950 states left on queue. +Progress(44) at 2024-11-07 03:53:07: 21,700,175,853 states generated (29,296,626 s/min), 1,710,442,845 distinct states found (1,551,789 ds/min), 184,628,950 states left on queue. +Progress(44) at 2024-11-07 03:54:07: 21,729,661,920 states generated (29,486,067 s/min), 1,712,360,375 distinct states found (1,917,530 ds/min), 184,602,047 states left on queue. +Progress(44) at 2024-11-07 03:55:07: 21,759,015,470 states generated (29,353,550 s/min), 1,714,259,170 distinct states found (1,898,795 ds/min), 184,554,564 states left on queue. 
+Progress(44) at 2024-11-07 03:56:07: 21,788,534,088 states generated (29,518,618 s/min), 1,716,081,999 distinct states found (1,822,829 ds/min), 184,406,994 states left on queue. +Progress(44) at 2024-11-07 03:57:07: 21,817,875,474 states generated (29,341,386 s/min), 1,717,634,611 distinct states found (1,552,612 ds/min), 184,057,660 states left on queue. +Progress(44) at 2024-11-07 03:58:07: 21,847,006,510 states generated (29,131,036 s/min), 1,719,299,741 distinct states found (1,665,130 ds/min), 183,828,258 states left on queue. +Progress(44) at 2024-11-07 03:59:07: 21,875,869,357 states generated (28,862,847 s/min), 1,720,801,722 distinct states found (1,501,981 ds/min), 183,443,083 states left on queue. +Progress(44) at 2024-11-07 04:00:07: 21,904,922,732 states generated (29,053,375 s/min), 1,722,588,504 distinct states found (1,786,782 ds/min), 183,289,094 states left on queue. +Checkpointing of run states/24-11-06-15-30-45.354 +Checkpointing completed at (2024-11-07 04:01:07) +Progress(44) at 2024-11-07 04:01:07: 21,933,965,695 states generated (29,042,963 s/min), 1,724,285,279 distinct states found (1,696,775 ds/min), 183,029,310 states left on queue. +Progress(44) at 2024-11-07 04:02:07: 21,962,959,341 states generated (28,993,646 s/min), 1,725,868,213 distinct states found (1,582,934 ds/min), 182,699,155 states left on queue. +Progress(44) at 2024-11-07 04:03:07: 21,991,777,816 states generated (28,818,475 s/min), 1,727,519,032 distinct states found (1,650,819 ds/min), 182,433,701 states left on queue. +Progress(44) at 2024-11-07 04:04:07: 22,020,733,433 states generated (28,955,617 s/min), 1,729,219,503 distinct states found (1,700,471 ds/min), 182,216,615 states left on queue. +Progress(44) at 2024-11-07 04:05:07: 22,049,984,634 states generated (29,251,201 s/min), 1,730,967,606 distinct states found (1,748,103 ds/min), 182,140,987 states left on queue. 
+Progress(44) at 2024-11-07 04:06:07: 22,079,112,674 states generated (29,128,040 s/min), 1,732,648,368 distinct states found (1,680,762 ds/min), 181,963,576 states left on queue. +Progress(44) at 2024-11-07 04:07:07: 22,108,329,917 states generated (29,217,243 s/min), 1,734,711,160 distinct states found (2,062,792 ds/min), 182,074,434 states left on queue. +Progress(44) at 2024-11-07 04:08:07: 22,137,402,322 states generated (29,072,405 s/min), 1,736,773,111 distinct states found (2,061,951 ds/min), 182,163,318 states left on queue. +Progress(44) at 2024-11-07 04:09:07: 22,166,402,243 states generated (28,999,921 s/min), 1,738,573,615 distinct states found (1,800,504 ds/min), 182,034,194 states left on queue. +Progress(44) at 2024-11-07 04:10:07: 22,195,545,763 states generated (29,143,520 s/min), 1,740,349,901 distinct states found (1,776,286 ds/min), 181,869,339 states left on queue. +Progress(44) at 2024-11-07 04:11:07: 22,224,766,309 states generated (29,220,546 s/min), 1,742,110,577 distinct states found (1,760,676 ds/min), 181,671,885 states left on queue. +Progress(44) at 2024-11-07 04:12:07: 22,253,807,692 states generated (29,041,383 s/min), 1,743,796,752 distinct states found (1,686,175 ds/min), 181,407,584 states left on queue. +Progress(44) at 2024-11-07 04:13:07: 22,282,790,947 states generated (28,983,255 s/min), 1,745,617,175 distinct states found (1,820,423 ds/min), 181,265,096 states left on queue. +Progress(44) at 2024-11-07 04:14:07: 22,311,840,917 states generated (29,049,970 s/min), 1,747,424,658 distinct states found (1,807,483 ds/min), 181,110,335 states left on queue. +Progress(44) at 2024-11-07 04:15:07: 22,340,851,116 states generated (29,010,199 s/min), 1,749,204,899 distinct states found (1,780,241 ds/min), 180,933,264 states left on queue. +Progress(44) at 2024-11-07 04:16:07: 22,369,820,191 states generated (28,969,075 s/min), 1,751,058,290 distinct states found (1,853,391 ds/min), 180,819,450 states left on queue. 
+Progress(44) at 2024-11-07 04:17:07: 22,398,637,854 states generated (28,817,663 s/min), 1,752,838,012 distinct states found (1,779,722 ds/min), 180,641,066 states left on queue. +Progress(44) at 2024-11-07 04:18:07: 22,427,736,775 states generated (29,098,921 s/min), 1,754,678,716 distinct states found (1,840,704 ds/min), 180,523,907 states left on queue. +Progress(44) at 2024-11-07 04:19:07: 22,456,749,604 states generated (29,012,829 s/min), 1,756,653,204 distinct states found (1,974,488 ds/min), 180,502,441 states left on queue. +Progress(44) at 2024-11-07 04:20:07: 22,485,995,309 states generated (29,245,705 s/min), 1,758,406,219 distinct states found (1,753,015 ds/min), 180,303,710 states left on queue. +Progress(44) at 2024-11-07 04:21:07: 22,515,059,607 states generated (29,064,298 s/min), 1,760,239,858 distinct states found (1,833,639 ds/min), 180,203,277 states left on queue. +Progress(44) at 2024-11-07 04:22:07: 22,544,007,885 states generated (28,948,278 s/min), 1,761,871,023 distinct states found (1,631,165 ds/min), 179,919,396 states left on queue. +Progress(44) at 2024-11-07 04:23:07: 22,572,858,704 states generated (28,850,819 s/min), 1,763,420,170 distinct states found (1,549,147 ds/min), 179,579,696 states left on queue. +Progress(44) at 2024-11-07 04:24:07: 22,601,850,297 states generated (28,991,593 s/min), 1,765,118,103 distinct states found (1,697,933 ds/min), 179,386,571 states left on queue. +Progress(44) at 2024-11-07 04:25:07: 22,630,832,111 states generated (28,981,814 s/min), 1,766,934,802 distinct states found (1,816,699 ds/min), 179,271,264 states left on queue. +Progress(44) at 2024-11-07 04:26:07: 22,659,674,047 states generated (28,841,936 s/min), 1,768,697,425 distinct states found (1,762,623 ds/min), 179,093,059 states left on queue. +Progress(44) at 2024-11-07 04:27:07: 22,688,427,580 states generated (28,753,533 s/min), 1,770,450,184 distinct states found (1,752,759 ds/min), 178,899,489 states left on queue. 
+Progress(44) at 2024-11-07 04:28:07: 22,717,189,869 states generated (28,762,289 s/min), 1,772,256,239 distinct states found (1,806,055 ds/min), 178,731,640 states left on queue. +Progress(44) at 2024-11-07 04:29:07: 22,746,022,343 states generated (28,832,474 s/min), 1,774,044,050 distinct states found (1,787,811 ds/min), 178,570,129 states left on queue. +Progress(44) at 2024-11-07 04:30:07: 22,774,887,995 states generated (28,865,652 s/min), 1,775,840,059 distinct states found (1,796,009 ds/min), 178,368,886 states left on queue. +Checkpointing of run states/24-11-06-15-30-45.354 +Checkpointing completed at (2024-11-07 04:31:08) +Progress(44) at 2024-11-07 04:31:08: 22,803,877,345 states generated (28,989,350 s/min), 1,777,539,486 distinct states found (1,699,427 ds/min), 178,036,764 states left on queue. +Progress(44) at 2024-11-07 04:32:08: 22,832,344,161 states generated (28,466,816 s/min), 1,779,071,939 distinct states found (1,532,453 ds/min), 177,609,496 states left on queue. +Progress(44) at 2024-11-07 04:33:08: 22,860,965,708 states generated (28,621,547 s/min), 1,780,632,191 distinct states found (1,560,252 ds/min), 177,226,645 states left on queue. +Progress(44) at 2024-11-07 04:34:08: 22,890,116,212 states generated (29,150,504 s/min), 1,782,192,671 distinct states found (1,560,480 ds/min), 176,856,967 states left on queue. +Progress(44) at 2024-11-07 04:35:08: 22,919,394,798 states generated (29,278,586 s/min), 1,783,989,997 distinct states found (1,797,326 ds/min), 176,677,020 states left on queue. +Progress(44) at 2024-11-07 04:36:08: 22,948,717,272 states generated (29,322,474 s/min), 1,785,769,628 distinct states found (1,779,631 ds/min), 176,466,304 states left on queue. +Progress(44) at 2024-11-07 04:37:08: 22,978,008,874 states generated (29,291,602 s/min), 1,787,768,546 distinct states found (1,998,918 ds/min), 176,439,057 states left on queue. 
+Progress(45) at 2024-11-07 04:38:08: 23,007,259,342 states generated (29,250,468 s/min), 1,789,652,180 distinct states found (1,883,634 ds/min), 176,335,868 states left on queue. +Progress(45) at 2024-11-07 04:39:08: 23,036,414,234 states generated (29,154,892 s/min), 1,791,293,395 distinct states found (1,641,215 ds/min), 176,048,834 states left on queue. +Progress(45) at 2024-11-07 04:40:08: 23,065,467,218 states generated (29,052,984 s/min), 1,793,176,180 distinct states found (1,882,785 ds/min), 175,945,068 states left on queue. +Progress(45) at 2024-11-07 04:41:08: 23,094,601,413 states generated (29,134,195 s/min), 1,795,085,201 distinct states found (1,909,021 ds/min), 175,844,471 states left on queue. +Progress(45) at 2024-11-07 04:42:08: 23,123,835,299 states generated (29,233,886 s/min), 1,796,998,629 distinct states found (1,913,428 ds/min), 175,751,026 states left on queue. +Progress(45) at 2024-11-07 04:43:08: 23,153,014,383 states generated (29,179,084 s/min), 1,798,830,917 distinct states found (1,832,288 ds/min), 175,609,899 states left on queue. +Progress(45) at 2024-11-07 04:44:08: 23,181,848,791 states generated (28,834,408 s/min), 1,800,688,969 distinct states found (1,858,052 ds/min), 175,482,089 states left on queue. +Progress(45) at 2024-11-07 04:45:08: 23,210,960,242 states generated (29,111,451 s/min), 1,802,681,838 distinct states found (1,992,869 ds/min), 175,468,259 states left on queue. +Progress(45) at 2024-11-07 04:46:08: 23,239,931,898 states generated (28,971,656 s/min), 1,804,527,297 distinct states found (1,845,459 ds/min), 175,314,676 states left on queue. +Progress(45) at 2024-11-07 04:47:08: 23,269,110,236 states generated (29,178,338 s/min), 1,806,324,412 distinct states found (1,797,115 ds/min), 175,104,294 states left on queue. +Progress(45) at 2024-11-07 04:48:08: 23,298,261,893 states generated (29,151,657 s/min), 1,808,026,372 distinct states found (1,701,960 ds/min), 174,789,761 states left on queue. 
+Progress(45) at 2024-11-07 04:49:08: 23,327,194,301 states generated (28,932,408 s/min), 1,809,635,143 distinct states found (1,608,771 ds/min), 174,475,327 states left on queue. +Progress(45) at 2024-11-07 04:50:08: 23,356,033,807 states generated (28,839,506 s/min), 1,811,533,685 distinct states found (1,898,542 ds/min), 174,375,697 states left on queue. +Progress(45) at 2024-11-07 04:51:08: 23,384,783,950 states generated (28,750,143 s/min), 1,813,242,773 distinct states found (1,709,088 ds/min), 174,093,638 states left on queue. +Progress(45) at 2024-11-07 04:52:08: 23,413,868,078 states generated (29,084,128 s/min), 1,814,921,217 distinct states found (1,678,444 ds/min), 173,816,375 states left on queue. +Progress(45) at 2024-11-07 04:53:08: 23,443,072,326 states generated (29,204,248 s/min), 1,816,887,463 distinct states found (1,966,246 ds/min), 173,768,064 states left on queue. +Progress(45) at 2024-11-07 04:54:08: 23,472,531,302 states generated (29,458,976 s/min), 1,818,893,389 distinct states found (2,005,926 ds/min), 173,736,986 states left on queue. +Progress(45) at 2024-11-07 04:55:08: 23,501,670,169 states generated (29,138,867 s/min), 1,820,467,013 distinct states found (1,573,624 ds/min), 173,393,980 states left on queue. +Progress(45) at 2024-11-07 04:56:08: 23,530,619,816 states generated (28,949,647 s/min), 1,822,153,389 distinct states found (1,686,376 ds/min), 173,102,476 states left on queue. +Progress(45) at 2024-11-07 04:57:08: 23,559,730,839 states generated (29,111,023 s/min), 1,824,067,840 distinct states found (1,914,451 ds/min), 173,045,910 states left on queue. +Progress(45) at 2024-11-07 04:58:08: 23,588,956,543 states generated (29,225,704 s/min), 1,826,128,132 distinct states found (2,060,292 ds/min), 173,097,456 states left on queue. +Progress(45) at 2024-11-07 04:59:08: 23,617,943,385 states generated (28,986,842 s/min), 1,828,156,857 distinct states found (2,028,725 ds/min), 173,115,797 states left on queue. 
+Progress(45) at 2024-11-07 05:00:08: 23,647,052,247 states generated (29,108,862 s/min), 1,830,116,296 distinct states found (1,959,439 ds/min), 173,061,677 states left on queue. +Checkpointing of run states/24-11-06-15-30-45.354 +Checkpointing completed at (2024-11-07 05:01:09) +Progress(45) at 2024-11-07 05:01:09: 23,676,540,644 states generated (29,488,397 s/min), 1,832,081,172 distinct states found (1,964,876 ds/min), 173,019,523 states left on queue. +Progress(45) at 2024-11-07 05:02:09: 23,705,447,239 states generated (28,906,595 s/min), 1,834,157,962 distinct states found (2,076,790 ds/min), 173,069,444 states left on queue. +Progress(45) at 2024-11-07 05:03:09: 23,734,590,381 states generated (29,143,142 s/min), 1,836,148,599 distinct states found (1,990,637 ds/min), 173,037,041 states left on queue. +Progress(45) at 2024-11-07 05:04:09: 23,763,605,229 states generated (29,014,848 s/min), 1,838,302,051 distinct states found (2,153,452 ds/min), 173,135,339 states left on queue. +Progress(45) at 2024-11-07 05:05:09: 23,792,794,847 states generated (29,189,618 s/min), 1,840,318,078 distinct states found (2,016,027 ds/min), 173,064,676 states left on queue. +Progress(45) at 2024-11-07 05:06:09: 23,821,711,411 states generated (28,916,564 s/min), 1,842,248,819 distinct states found (1,930,741 ds/min), 172,938,116 states left on queue. +Progress(45) at 2024-11-07 05:07:09: 23,850,829,522 states generated (29,118,111 s/min), 1,844,084,520 distinct states found (1,835,701 ds/min), 172,779,569 states left on queue. +Progress(45) at 2024-11-07 05:08:09: 23,880,027,055 states generated (29,197,533 s/min), 1,846,207,907 distinct states found (2,123,387 ds/min), 172,876,875 states left on queue. +Progress(45) at 2024-11-07 05:09:09: 23,909,238,654 states generated (29,211,599 s/min), 1,848,275,162 distinct states found (2,067,255 ds/min), 172,917,710 states left on queue. 
+Progress(45) at 2024-11-07 05:10:09: 23,938,254,527 states generated (29,015,873 s/min), 1,850,062,508 distinct states found (1,787,346 ds/min), 172,709,939 states left on queue. +Progress(45) at 2024-11-07 05:11:09: 23,967,280,908 states generated (29,026,381 s/min), 1,851,840,844 distinct states found (1,778,336 ds/min), 172,472,809 states left on queue. +Progress(45) at 2024-11-07 05:12:09: 23,996,137,153 states generated (28,856,245 s/min), 1,853,907,711 distinct states found (2,066,867 ds/min), 172,514,422 states left on queue. +Progress(45) at 2024-11-07 05:13:09: 24,025,003,271 states generated (28,866,118 s/min), 1,855,881,596 distinct states found (1,973,885 ds/min), 172,410,581 states left on queue. +Progress(45) at 2024-11-07 05:14:09: 24,053,998,968 states generated (28,995,697 s/min), 1,857,730,142 distinct states found (1,848,546 ds/min), 172,259,468 states left on queue. +Progress(45) at 2024-11-07 05:15:09: 24,082,780,775 states generated (28,781,807 s/min), 1,859,612,879 distinct states found (1,882,737 ds/min), 172,097,889 states left on queue. +Progress(45) at 2024-11-07 05:16:09: 24,111,843,462 states generated (29,062,687 s/min), 1,861,479,353 distinct states found (1,866,474 ds/min), 171,938,834 states left on queue. +Progress(45) at 2024-11-07 05:17:09: 24,140,987,153 states generated (29,143,691 s/min), 1,863,390,493 distinct states found (1,911,140 ds/min), 171,786,752 states left on queue. +Progress(45) at 2024-11-07 05:18:09: 24,170,023,897 states generated (29,036,744 s/min), 1,864,965,603 distinct states found (1,575,110 ds/min), 171,386,848 states left on queue. +Progress(45) at 2024-11-07 05:19:09: 24,198,987,772 states generated (28,963,875 s/min), 1,866,820,638 distinct states found (1,855,035 ds/min), 171,238,575 states left on queue. +Progress(45) at 2024-11-07 05:20:09: 24,227,820,740 states generated (28,832,968 s/min), 1,868,623,853 distinct states found (1,803,215 ds/min), 171,005,974 states left on queue. 
+Progress(45) at 2024-11-07 05:21:09: 24,256,712,636 states generated (28,891,896 s/min), 1,870,265,139 distinct states found (1,641,286 ds/min), 170,619,838 states left on queue. +Progress(45) at 2024-11-07 05:22:09: 24,285,792,587 states generated (29,079,951 s/min), 1,871,770,548 distinct states found (1,505,409 ds/min), 170,247,019 states left on queue. +Progress(45) at 2024-11-07 05:23:09: 24,315,021,618 states generated (29,229,031 s/min), 1,873,433,426 distinct states found (1,662,878 ds/min), 169,986,497 states left on queue. +Progress(45) at 2024-11-07 05:24:09: 24,343,972,976 states generated (28,951,358 s/min), 1,874,958,509 distinct states found (1,525,083 ds/min), 169,639,357 states left on queue. +Progress(45) at 2024-11-07 05:25:09: 24,372,818,044 states generated (28,845,068 s/min), 1,876,461,909 distinct states found (1,503,400 ds/min), 169,298,313 states left on queue. +Progress(45) at 2024-11-07 05:26:09: 24,401,879,839 states generated (29,061,795 s/min), 1,878,043,093 distinct states found (1,581,184 ds/min), 169,034,999 states left on queue. +Progress(45) at 2024-11-07 05:27:09: 24,431,117,440 states generated (29,237,601 s/min), 1,879,528,913 distinct states found (1,485,820 ds/min), 168,669,766 states left on queue. +Progress(45) at 2024-11-07 05:28:09: 24,460,565,564 states generated (29,448,124 s/min), 1,881,382,841 distinct states found (1,853,928 ds/min), 168,585,549 states left on queue. +Progress(45) at 2024-11-07 05:29:09: 24,489,842,320 states generated (29,276,756 s/min), 1,883,163,526 distinct states found (1,780,685 ds/min), 168,440,866 states left on queue. +Progress(45) at 2024-11-07 05:30:09: 24,519,309,785 states generated (29,467,465 s/min), 1,884,840,978 distinct states found (1,677,452 ds/min), 168,176,100 states left on queue. 
+Checkpointing of run states/24-11-06-15-30-45.354 +Checkpointing completed at (2024-11-07 05:31:09) +Progress(45) at 2024-11-07 05:31:09: 24,548,699,426 states generated (29,389,641 s/min), 1,886,346,733 distinct states found (1,505,755 ds/min), 167,794,030 states left on queue. +Progress(45) at 2024-11-07 05:32:09: 24,577,454,860 states generated (28,755,434 s/min), 1,887,761,288 distinct states found (1,414,555 ds/min), 167,342,409 states left on queue. +Progress(45) at 2024-11-07 05:33:09: 24,606,401,929 states generated (28,947,069 s/min), 1,889,451,503 distinct states found (1,690,215 ds/min), 167,115,718 states left on queue. +Progress(45) at 2024-11-07 05:34:09: 24,635,080,181 states generated (28,678,252 s/min), 1,891,013,080 distinct states found (1,561,577 ds/min), 166,760,395 states left on queue. +Progress(45) at 2024-11-07 05:35:09: 24,663,912,233 states generated (28,832,052 s/min), 1,892,486,967 distinct states found (1,473,887 ds/min), 166,347,547 states left on queue. +Progress(45) at 2024-11-07 05:36:09: 24,692,601,003 states generated (28,688,770 s/min), 1,894,014,661 distinct states found (1,527,694 ds/min), 165,980,327 states left on queue. +Progress(45) at 2024-11-07 05:37:09: 24,721,596,280 states generated (28,995,277 s/min), 1,895,667,269 distinct states found (1,652,608 ds/min), 165,766,132 states left on queue. +Progress(45) at 2024-11-07 05:38:09: 24,750,737,270 states generated (29,140,990 s/min), 1,897,304,588 distinct states found (1,637,319 ds/min), 165,602,331 states left on queue. +Progress(45) at 2024-11-07 05:39:09: 24,779,762,621 states generated (29,025,351 s/min), 1,898,944,557 distinct states found (1,639,969 ds/min), 165,399,097 states left on queue. +Progress(45) at 2024-11-07 05:40:09: 24,808,890,636 states generated (29,128,015 s/min), 1,901,039,200 distinct states found (2,094,643 ds/min), 165,505,866 states left on queue. 
+Progress(45) at 2024-11-07 05:41:09: 24,837,834,330 states generated (28,943,694 s/min), 1,902,825,947 distinct states found (1,786,747 ds/min), 165,385,690 states left on queue. +Progress(45) at 2024-11-07 05:42:09: 24,866,749,194 states generated (28,914,864 s/min), 1,904,509,048 distinct states found (1,683,101 ds/min), 165,143,394 states left on queue. +Progress(45) at 2024-11-07 05:43:09: 24,895,891,462 states generated (29,142,268 s/min), 1,906,186,633 distinct states found (1,677,585 ds/min), 164,907,199 states left on queue. +Progress(45) at 2024-11-07 05:44:09: 24,924,929,592 states generated (29,038,130 s/min), 1,907,774,010 distinct states found (1,587,377 ds/min), 164,567,256 states left on queue. +Progress(45) at 2024-11-07 05:45:09: 24,953,854,731 states generated (28,925,139 s/min), 1,909,438,393 distinct states found (1,664,383 ds/min), 164,297,435 states left on queue. +Progress(45) at 2024-11-07 05:46:09: 24,982,773,173 states generated (28,918,442 s/min), 1,911,115,370 distinct states found (1,676,977 ds/min), 164,029,981 states left on queue. +Progress(45) at 2024-11-07 05:47:09: 25,011,681,639 states generated (28,908,466 s/min), 1,912,739,102 distinct states found (1,623,732 ds/min), 163,722,709 states left on queue. +Progress(45) at 2024-11-07 05:48:09: 25,040,624,886 states generated (28,943,247 s/min), 1,914,465,220 distinct states found (1,726,118 ds/min), 163,504,979 states left on queue. +Progress(45) at 2024-11-07 05:49:09: 25,069,369,631 states generated (28,744,745 s/min), 1,916,123,524 distinct states found (1,658,304 ds/min), 163,227,016 states left on queue. +Progress(45) at 2024-11-07 05:50:09: 25,098,381,973 states generated (29,012,342 s/min), 1,917,856,454 distinct states found (1,732,930 ds/min), 163,020,213 states left on queue. +Progress(45) at 2024-11-07 05:51:09: 25,127,432,010 states generated (29,050,037 s/min), 1,919,715,623 distinct states found (1,859,169 ds/min), 162,903,211 states left on queue. 
+Progress(45) at 2024-11-07 05:52:09: 25,156,554,852 states generated (29,122,842 s/min), 1,921,381,482 distinct states found (1,665,859 ds/min), 162,640,342 states left on queue. +Progress(45) at 2024-11-07 05:53:09: 25,185,439,752 states generated (28,884,900 s/min), 1,923,074,493 distinct states found (1,693,011 ds/min), 162,418,419 states left on queue. +Progress(45) at 2024-11-07 05:54:09: 25,214,250,620 states generated (28,810,868 s/min), 1,924,599,166 distinct states found (1,524,673 ds/min), 162,035,736 states left on queue. +Progress(45) at 2024-11-07 05:55:09: 25,243,065,684 states generated (28,815,064 s/min), 1,926,028,590 distinct states found (1,429,424 ds/min), 161,647,928 states left on queue. +Progress(45) at 2024-11-07 05:56:09: 25,272,074,106 states generated (29,008,422 s/min), 1,927,788,924 distinct states found (1,760,334 ds/min), 161,469,066 states left on queue. +Progress(45) at 2024-11-07 05:57:09: 25,300,916,527 states generated (28,842,421 s/min), 1,929,427,503 distinct states found (1,638,579 ds/min), 161,203,063 states left on queue. +Progress(45) at 2024-11-07 05:58:09: 25,329,617,957 states generated (28,701,430 s/min), 1,931,016,200 distinct states found (1,588,697 ds/min), 160,883,828 states left on queue. +Progress(45) at 2024-11-07 05:59:09: 25,358,305,874 states generated (28,687,917 s/min), 1,932,700,683 distinct states found (1,684,483 ds/min), 160,613,534 states left on queue. +Progress(45) at 2024-11-07 06:00:09: 25,387,060,807 states generated (28,754,933 s/min), 1,934,352,908 distinct states found (1,652,225 ds/min), 160,340,594 states left on queue. +Checkpointing of run states/24-11-06-15-30-45.354 +Checkpointing completed at (2024-11-07 06:01:10) +Progress(45) at 2024-11-07 06:01:10: 25,416,167,383 states generated (29,106,576 s/min), 1,936,031,185 distinct states found (1,678,277 ds/min), 160,024,096 states left on queue. 
+Progress(45) at 2024-11-07 06:02:10: 25,444,775,068 states generated (28,607,685 s/min), 1,937,531,864 distinct states found (1,500,679 ds/min), 159,558,759 states left on queue. +Progress(45) at 2024-11-07 06:03:10: 25,473,218,014 states generated (28,442,946 s/min), 1,938,932,593 distinct states found (1,400,729 ds/min), 159,031,186 states left on queue. +Progress(45) at 2024-11-07 06:04:10: 25,502,153,601 states generated (28,935,587 s/min), 1,940,366,906 distinct states found (1,434,313 ds/min), 158,550,067 states left on queue. +Progress(45) at 2024-11-07 06:05:10: 25,531,409,924 states generated (29,256,323 s/min), 1,942,031,081 distinct states found (1,664,175 ds/min), 158,260,393 states left on queue. +Progress(45) at 2024-11-07 06:06:10: 25,560,798,500 states generated (29,388,576 s/min), 1,943,755,697 distinct states found (1,724,616 ds/min), 158,001,838 states left on queue. +Progress(45) at 2024-11-07 06:07:10: 25,590,101,236 states generated (29,302,736 s/min), 1,945,659,191 distinct states found (1,903,494 ds/min), 157,894,541 states left on queue. +Progress(45) at 2024-11-07 06:08:10: 25,619,347,006 states generated (29,245,770 s/min), 1,947,436,584 distinct states found (1,777,393 ds/min), 157,703,839 states left on queue. +Progress(45) at 2024-11-07 06:09:10: 25,648,466,795 states generated (29,119,789 s/min), 1,949,039,117 distinct states found (1,602,533 ds/min), 157,391,298 states left on queue. +Progress(45) at 2024-11-07 06:10:10: 25,677,360,883 states generated (28,894,088 s/min), 1,950,787,656 distinct states found (1,748,539 ds/min), 157,176,854 states left on queue. +Progress(45) at 2024-11-07 06:11:10: 25,706,625,655 states generated (29,264,772 s/min), 1,952,700,166 distinct states found (1,912,510 ds/min), 157,069,408 states left on queue. +Progress(46) at 2024-11-07 06:12:10: 25,735,830,172 states generated (29,204,517 s/min), 1,954,444,069 distinct states found (1,743,903 ds/min), 156,852,227 states left on queue. 
+Progress(46) at 2024-11-07 06:13:10: 25,764,811,792 states generated (28,981,620 s/min), 1,956,165,433 distinct states found (1,721,364 ds/min), 156,618,900 states left on queue. +Progress(46) at 2024-11-07 06:14:10: 25,793,740,486 states generated (28,928,694 s/min), 1,957,961,862 distinct states found (1,796,429 ds/min), 156,441,787 states left on queue. +Progress(46) at 2024-11-07 06:15:10: 25,822,741,831 states generated (29,001,345 s/min), 1,959,749,416 distinct states found (1,787,554 ds/min), 156,253,838 states left on queue. +Progress(46) at 2024-11-07 06:16:10: 25,851,804,688 states generated (29,062,857 s/min), 1,961,466,422 distinct states found (1,717,006 ds/min), 155,977,351 states left on queue. +Progress(46) at 2024-11-07 06:17:10: 25,880,868,584 states generated (29,063,896 s/min), 1,963,090,742 distinct states found (1,624,320 ds/min), 155,628,145 states left on queue. +Progress(46) at 2024-11-07 06:18:10: 25,909,824,307 states generated (28,955,723 s/min), 1,964,570,100 distinct states found (1,479,358 ds/min), 155,182,107 states left on queue. +Progress(46) at 2024-11-07 06:19:10: 25,938,584,425 states generated (28,760,118 s/min), 1,966,303,642 distinct states found (1,733,542 ds/min), 154,946,766 states left on queue. +Progress(46) at 2024-11-07 06:20:10: 25,967,304,223 states generated (28,719,798 s/min), 1,967,883,207 distinct states found (1,579,565 ds/min), 154,558,935 states left on queue. +Progress(46) at 2024-11-07 06:21:10: 25,996,402,469 states generated (29,098,246 s/min), 1,969,591,000 distinct states found (1,707,793 ds/min), 154,302,069 states left on queue. +Progress(46) at 2024-11-07 06:22:10: 26,025,623,943 states generated (29,221,474 s/min), 1,971,434,403 distinct states found (1,843,403 ds/min), 154,157,059 states left on queue. +Progress(46) at 2024-11-07 06:23:10: 26,055,038,054 states generated (29,414,111 s/min), 1,973,261,720 distinct states found (1,827,317 ds/min), 153,981,317 states left on queue. 
+Progress(46) at 2024-11-07 06:24:10: 26,083,986,220 states generated (28,948,166 s/min), 1,974,670,648 distinct states found (1,408,928 ds/min), 153,508,388 states left on queue. +Progress(46) at 2024-11-07 06:25:10: 26,113,067,907 states generated (29,081,687 s/min), 1,976,391,547 distinct states found (1,720,899 ds/min), 153,263,845 states left on queue. +Progress(46) at 2024-11-07 06:26:10: 26,142,186,839 states generated (29,118,932 s/min), 1,978,379,881 distinct states found (1,988,334 ds/min), 153,253,200 states left on queue. +Progress(46) at 2024-11-07 06:27:10: 26,171,338,068 states generated (29,151,229 s/min), 1,980,293,569 distinct states found (1,913,688 ds/min), 153,185,559 states left on queue. +Progress(46) at 2024-11-07 06:28:10: 26,200,319,869 states generated (28,981,801 s/min), 1,982,130,034 distinct states found (1,836,465 ds/min), 153,039,826 states left on queue. +Progress(46) at 2024-11-07 06:29:10: 26,229,451,237 states generated (29,131,368 s/min), 1,984,117,981 distinct states found (1,987,947 ds/min), 153,030,792 states left on queue. +Progress(46) at 2024-11-07 06:30:10: 26,258,476,767 states generated (29,025,530 s/min), 1,985,981,073 distinct states found (1,863,092 ds/min), 152,917,939 states left on queue. +Checkpointing of run states/24-11-06-15-30-45.354 +Checkpointing completed at (2024-11-07 06:31:11) +Progress(46) at 2024-11-07 06:31:11: 26,287,657,848 states generated (29,181,081 s/min), 1,987,875,178 distinct states found (1,894,105 ds/min), 152,784,901 states left on queue. +Progress(46) at 2024-11-07 06:32:11: 26,316,549,803 states generated (28,891,955 s/min), 1,989,821,141 distinct states found (1,945,963 ds/min), 152,728,813 states left on queue. +Progress(46) at 2024-11-07 06:33:11: 26,345,570,902 states generated (29,021,099 s/min), 1,991,762,973 distinct states found (1,941,832 ds/min), 152,648,662 states left on queue. 
+Progress(46) at 2024-11-07 06:34:11: 26,374,519,051 states generated (28,948,149 s/min), 1,993,605,958 distinct states found (1,842,985 ds/min), 152,446,201 states left on queue. +Progress(46) at 2024-11-07 06:35:11: 26,403,403,284 states generated (28,884,233 s/min), 1,995,379,328 distinct states found (1,773,370 ds/min), 152,189,032 states left on queue. +Progress(46) at 2024-11-07 06:36:11: 26,432,512,518 states generated (29,109,234 s/min), 1,997,205,848 distinct states found (1,826,520 ds/min), 152,060,823 states left on queue. +Progress(46) at 2024-11-07 06:37:11: 26,461,635,963 states generated (29,123,445 s/min), 1,999,221,288 distinct states found (2,015,440 ds/min), 152,052,317 states left on queue. +Progress(46) at 2024-11-07 06:38:11: 26,490,692,408 states generated (29,056,445 s/min), 2,001,003,940 distinct states found (1,782,652 ds/min), 151,869,333 states left on queue. +Progress(46) at 2024-11-07 06:39:11: 26,519,611,691 states generated (28,919,283 s/min), 2,002,772,264 distinct states found (1,768,324 ds/min), 151,637,576 states left on queue. +Progress(46) at 2024-11-07 06:40:11: 26,548,405,773 states generated (28,794,082 s/min), 2,004,530,832 distinct states found (1,758,568 ds/min), 151,415,653 states left on queue. +Progress(46) at 2024-11-07 06:41:11: 26,577,168,173 states generated (28,762,400 s/min), 2,006,431,383 distinct states found (1,900,551 ds/min), 151,293,655 states left on queue. +Progress(46) at 2024-11-07 06:42:11: 26,606,013,565 states generated (28,845,392 s/min), 2,008,118,930 distinct states found (1,687,547 ds/min), 150,979,607 states left on queue. +Progress(46) at 2024-11-07 06:43:11: 26,634,840,454 states generated (28,826,889 s/min), 2,010,033,233 distinct states found (1,914,303 ds/min), 150,859,233 states left on queue. +Progress(46) at 2024-11-07 06:44:11: 26,663,791,564 states generated (28,951,110 s/min), 2,011,764,506 distinct states found (1,731,273 ds/min), 150,592,176 states left on queue. 
+Progress(46) at 2024-11-07 06:45:11: 26,692,845,560 states generated (29,053,996 s/min), 2,013,541,948 distinct states found (1,777,442 ds/min), 150,346,125 states left on queue. +Progress(46) at 2024-11-07 06:46:11: 26,721,838,462 states generated (28,992,902 s/min), 2,015,055,311 distinct states found (1,513,363 ds/min), 149,898,025 states left on queue. +Progress(46) at 2024-11-07 06:47:11: 26,750,784,724 states generated (28,946,262 s/min), 2,016,795,791 distinct states found (1,740,480 ds/min), 149,636,143 states left on queue. +Progress(46) at 2024-11-07 06:48:11: 26,779,537,729 states generated (28,753,005 s/min), 2,018,420,817 distinct states found (1,625,026 ds/min), 149,264,338 states left on queue. +Progress(46) at 2024-11-07 06:49:11: 26,808,414,064 states generated (28,876,335 s/min), 2,019,941,133 distinct states found (1,520,316 ds/min), 148,833,851 states left on queue. +Progress(46) at 2024-11-07 06:50:11: 26,837,552,895 states generated (29,138,831 s/min), 2,021,402,334 distinct states found (1,461,201 ds/min), 148,423,082 states left on queue. +Progress(46) at 2024-11-07 06:51:11: 26,866,488,521 states generated (28,935,626 s/min), 2,022,896,299 distinct states found (1,493,965 ds/min), 148,037,640 states left on queue. +Progress(46) at 2024-11-07 06:52:11: 26,895,259,654 states generated (28,771,133 s/min), 2,024,306,180 distinct states found (1,409,881 ds/min), 147,623,626 states left on queue. +Progress(46) at 2024-11-07 06:53:11: 26,924,324,639 states generated (29,064,985 s/min), 2,025,751,691 distinct states found (1,445,511 ds/min), 147,237,191 states left on queue. +Progress(46) at 2024-11-07 06:54:11: 26,953,575,306 states generated (29,250,667 s/min), 2,027,292,041 distinct states found (1,540,350 ds/min), 146,929,253 states left on queue. +Progress(46) at 2024-11-07 06:55:11: 26,982,863,734 states generated (29,288,428 s/min), 2,029,056,116 distinct states found (1,764,075 ds/min), 146,774,179 states left on queue. 
+Progress(46) at 2024-11-07 06:56:11: 27,012,217,899 states generated (29,354,165 s/min), 2,030,705,091 distinct states found (1,648,975 ds/min), 146,523,776 states left on queue. +Progress(46) at 2024-11-07 06:57:11: 27,041,431,406 states generated (29,213,507 s/min), 2,032,122,917 distinct states found (1,417,826 ds/min), 146,066,712 states left on queue. +Progress(46) at 2024-11-07 06:58:11: 27,070,230,233 states generated (28,798,827 s/min), 2,033,502,867 distinct states found (1,379,950 ds/min), 145,580,465 states left on queue. +Progress(46) at 2024-11-07 06:59:11: 27,099,119,410 states generated (28,889,177 s/min), 2,035,080,295 distinct states found (1,577,428 ds/min), 145,255,429 states left on queue. +Progress(46) at 2024-11-07 07:00:11: 27,127,802,546 states generated (28,683,136 s/min), 2,036,480,069 distinct states found (1,399,774 ds/min), 144,763,326 states left on queue. +Checkpointing of run states/24-11-06-15-30-45.354 +Checkpointing completed at (2024-11-07 07:01:11) +Progress(46) at 2024-11-07 07:01:11: 27,156,729,000 states generated (28,926,454 s/min), 2,037,888,171 distinct states found (1,408,102 ds/min), 144,282,188 states left on queue. +Progress(46) at 2024-11-07 07:02:11: 27,185,673,878 states generated (28,944,878 s/min), 2,039,404,499 distinct states found (1,516,328 ds/min), 143,933,899 states left on queue. +Progress(46) at 2024-11-07 07:03:11: 27,214,800,380 states generated (29,126,502 s/min), 2,040,991,907 distinct states found (1,587,408 ds/min), 143,736,528 states left on queue. +Progress(46) at 2024-11-07 07:04:11: 27,243,805,336 states generated (29,004,956 s/min), 2,042,560,493 distinct states found (1,568,586 ds/min), 143,474,607 states left on queue. +Progress(46) at 2024-11-07 07:05:11: 27,272,912,902 states generated (29,107,566 s/min), 2,044,549,687 distinct states found (1,989,194 ds/min), 143,501,934 states left on queue. 
+Progress(46) at 2024-11-07 07:06:11: 27,301,850,628 states generated (28,937,726 s/min), 2,046,213,816 distinct states found (1,664,129 ds/min), 143,280,971 states left on queue. +Progress(46) at 2024-11-07 07:07:11: 27,330,744,799 states generated (28,894,171 s/min), 2,047,777,121 distinct states found (1,563,305 ds/min), 142,943,602 states left on queue. +Progress(46) at 2024-11-07 07:08:11: 27,359,855,477 states generated (29,110,678 s/min), 2,049,356,015 distinct states found (1,578,894 ds/min), 142,631,188 states left on queue. +Progress(46) at 2024-11-07 07:09:11: 27,388,745,464 states generated (28,889,987 s/min), 2,050,822,496 distinct states found (1,466,481 ds/min), 142,190,439 states left on queue. +Progress(46) at 2024-11-07 07:10:11: 27,417,576,550 states generated (28,831,086 s/min), 2,052,379,523 distinct states found (1,557,027 ds/min), 141,821,153 states left on queue. +Progress(46) at 2024-11-07 07:11:11: 27,446,546,405 states generated (28,969,855 s/min), 2,053,934,499 distinct states found (1,554,976 ds/min), 141,462,097 states left on queue. +Progress(46) at 2024-11-07 07:12:11: 27,475,398,683 states generated (28,852,278 s/min), 2,055,510,649 distinct states found (1,576,150 ds/min), 141,116,110 states left on queue. +Progress(46) at 2024-11-07 07:13:11: 27,504,113,194 states generated (28,714,511 s/min), 2,057,051,677 distinct states found (1,541,028 ds/min), 140,743,906 states left on queue. +Progress(46) at 2024-11-07 07:14:11: 27,532,983,174 states generated (28,869,980 s/min), 2,058,669,649 distinct states found (1,617,972 ds/min), 140,436,853 states left on queue. +Progress(46) at 2024-11-07 07:15:11: 27,562,088,285 states generated (29,105,111 s/min), 2,060,404,146 distinct states found (1,734,497 ds/min), 140,213,296 states left on queue. +Progress(46) at 2024-11-07 07:16:11: 27,591,079,273 states generated (28,990,988 s/min), 2,061,979,907 distinct states found (1,575,761 ds/min), 139,895,056 states left on queue. 
+Progress(46) at 2024-11-07 07:17:11: 27,619,876,413 states generated (28,797,140 s/min), 2,063,482,225 distinct states found (1,502,318 ds/min), 139,506,174 states left on queue. +Progress(46) at 2024-11-07 07:18:11: 27,648,595,649 states generated (28,719,236 s/min), 2,064,847,355 distinct states found (1,365,130 ds/min), 139,035,783 states left on queue. +Progress(46) at 2024-11-07 07:19:11: 27,677,544,192 states generated (28,948,543 s/min), 2,066,507,355 distinct states found (1,660,000 ds/min), 138,783,592 states left on queue. +Progress(46) at 2024-11-07 07:20:11: 27,706,306,461 states generated (28,762,269 s/min), 2,068,019,192 distinct states found (1,511,837 ds/min), 138,418,256 states left on queue. +Progress(46) at 2024-11-07 07:21:11: 27,734,873,733 states generated (28,567,272 s/min), 2,069,467,142 distinct states found (1,447,950 ds/min), 137,977,630 states left on queue. +Progress(46) at 2024-11-07 07:22:11: 27,763,678,204 states generated (28,804,471 s/min), 2,071,034,824 distinct states found (1,567,682 ds/min), 137,622,296 states left on queue. +Progress(46) at 2024-11-07 07:23:11: 27,792,322,332 states generated (28,644,128 s/min), 2,072,586,226 distinct states found (1,551,402 ds/min), 137,231,762 states left on queue. +Progress(46) at 2024-11-07 07:24:11: 27,821,040,127 states generated (28,717,795 s/min), 2,074,030,831 distinct states found (1,444,605 ds/min), 136,731,600 states left on queue. +Progress(46) at 2024-11-07 07:25:11: 27,849,404,654 states generated (28,364,527 s/min), 2,075,273,409 distinct states found (1,242,578 ds/min), 136,082,131 states left on queue. +Progress(46) at 2024-11-07 07:26:11: 27,878,356,417 states generated (28,951,763 s/min), 2,076,656,601 distinct states found (1,383,192 ds/min), 135,570,796 states left on queue. +Progress(46) at 2024-11-07 07:27:11: 27,907,776,802 states generated (29,420,385 s/min), 2,078,383,391 distinct states found (1,726,790 ds/min), 135,306,248 states left on queue. 
+Progress(46) at 2024-11-07 07:28:11: 27,937,070,294 states generated (29,293,492 s/min), 2,080,076,828 distinct states found (1,693,437 ds/min), 135,034,380 states left on queue. +Progress(46) at 2024-11-07 07:29:11: 27,966,287,907 states generated (29,217,613 s/min), 2,081,855,223 distinct states found (1,778,395 ds/min), 134,839,763 states left on queue. +Progress(46) at 2024-11-07 07:30:11: 27,995,330,759 states generated (29,042,852 s/min), 2,083,372,197 distinct states found (1,516,974 ds/min), 134,461,641 states left on queue. +Checkpointing of run states/24-11-06-15-30-45.354 +Checkpointing completed at (2024-11-07 07:31:12) +Progress(46) at 2024-11-07 07:31:12: 28,024,387,579 states generated (29,056,820 s/min), 2,085,018,193 distinct states found (1,645,996 ds/min), 134,150,126 states left on queue. +Progress(46) at 2024-11-07 07:32:12: 28,053,564,379 states generated (29,176,800 s/min), 2,086,850,158 distinct states found (1,831,965 ds/min), 133,983,084 states left on queue. +Progress(46) at 2024-11-07 07:33:12: 28,082,556,747 states generated (28,992,368 s/min), 2,088,444,271 distinct states found (1,594,113 ds/min), 133,651,269 states left on queue. +Progress(47) at 2024-11-07 07:34:12: 28,111,323,007 states generated (28,766,260 s/min), 2,090,072,790 distinct states found (1,628,519 ds/min), 133,350,640 states left on queue. +Progress(47) at 2024-11-07 07:35:12: 28,140,191,163 states generated (28,868,156 s/min), 2,091,740,224 distinct states found (1,667,434 ds/min), 133,070,266 states left on queue. +Progress(47) at 2024-11-07 07:36:12: 28,169,054,601 states generated (28,863,438 s/min), 2,093,375,975 distinct states found (1,635,751 ds/min), 132,752,319 states left on queue. +Progress(47) at 2024-11-07 07:37:12: 28,197,994,162 states generated (28,939,561 s/min), 2,094,929,793 distinct states found (1,553,818 ds/min), 132,356,738 states left on queue. 
+Progress(47) at 2024-11-07 07:38:12: 28,226,808,491 states generated (28,814,329 s/min), 2,096,311,441 distinct states found (1,381,648 ds/min), 131,832,292 states left on queue. +Progress(47) at 2024-11-07 07:39:12: 28,255,451,016 states generated (28,642,525 s/min), 2,097,907,185 distinct states found (1,595,744 ds/min), 131,487,862 states left on queue. +Progress(47) at 2024-11-07 07:40:12: 28,284,015,286 states generated (28,564,270 s/min), 2,099,332,452 distinct states found (1,425,267 ds/min), 130,982,897 states left on queue. +Progress(47) at 2024-11-07 07:41:12: 28,313,051,806 states generated (29,036,520 s/min), 2,101,053,792 distinct states found (1,721,340 ds/min), 130,744,522 states left on queue. +Progress(47) at 2024-11-07 07:42:12: 28,342,348,160 states generated (29,296,354 s/min), 2,102,778,970 distinct states found (1,725,178 ds/min), 130,505,777 states left on queue. +Progress(47) at 2024-11-07 07:43:12: 28,371,533,935 states generated (29,185,775 s/min), 2,104,337,778 distinct states found (1,558,808 ds/min), 130,144,304 states left on queue. +Progress(47) at 2024-11-07 07:44:12: 28,400,351,066 states generated (28,817,131 s/min), 2,105,835,284 distinct states found (1,497,506 ds/min), 129,719,871 states left on queue. +Progress(47) at 2024-11-07 07:45:12: 28,429,411,463 states generated (29,060,397 s/min), 2,107,704,752 distinct states found (1,869,468 ds/min), 129,618,749 states left on queue. +Progress(47) at 2024-11-07 07:46:12: 28,458,488,093 states generated (29,076,630 s/min), 2,109,483,825 distinct states found (1,779,073 ds/min), 129,439,723 states left on queue. +Progress(47) at 2024-11-07 07:47:12: 28,487,338,391 states generated (28,850,298 s/min), 2,111,230,358 distinct states found (1,746,533 ds/min), 129,232,124 states left on queue. +Progress(47) at 2024-11-07 07:48:12: 28,516,411,931 states generated (29,073,540 s/min), 2,113,150,785 distinct states found (1,920,427 ds/min), 129,168,385 states left on queue. 
+Progress(47) at 2024-11-07 07:49:12: 28,545,299,037 states generated (28,887,106 s/min), 2,114,878,071 distinct states found (1,727,286 ds/min), 128,948,735 states left on queue. +Progress(47) at 2024-11-07 07:50:12: 28,574,186,091 states generated (28,887,054 s/min), 2,116,622,746 distinct states found (1,744,675 ds/min), 128,711,386 states left on queue. +Progress(47) at 2024-11-07 07:51:12: 28,603,057,442 states generated (28,871,351 s/min), 2,118,435,710 distinct states found (1,812,964 ds/min), 128,543,573 states left on queue. +Progress(47) at 2024-11-07 07:52:12: 28,632,042,720 states generated (28,985,278 s/min), 2,120,240,818 distinct states found (1,805,108 ds/min), 128,349,742 states left on queue. +Progress(47) at 2024-11-07 07:53:12: 28,660,885,097 states generated (28,842,377 s/min), 2,121,904,885 distinct states found (1,664,067 ds/min), 128,002,987 states left on queue. +Progress(47) at 2024-11-07 07:54:12: 28,689,690,902 states generated (28,805,805 s/min), 2,123,498,767 distinct states found (1,593,882 ds/min), 127,622,035 states left on queue. +Progress(47) at 2024-11-07 07:55:12: 28,718,827,206 states generated (29,136,304 s/min), 2,125,375,087 distinct states found (1,876,320 ds/min), 127,518,682 states left on queue. +Progress(47) at 2024-11-07 07:56:12: 28,747,988,287 states generated (29,161,081 s/min), 2,127,234,055 distinct states found (1,858,968 ds/min), 127,390,123 states left on queue. +Progress(47) at 2024-11-07 07:57:12: 28,776,918,449 states generated (28,930,162 s/min), 2,128,896,639 distinct states found (1,662,584 ds/min), 127,099,202 states left on queue. +Progress(47) at 2024-11-07 07:58:12: 28,805,826,521 states generated (28,908,072 s/min), 2,130,485,896 distinct states found (1,589,257 ds/min), 126,731,846 states left on queue. +Progress(47) at 2024-11-07 07:59:12: 28,834,550,061 states generated (28,723,540 s/min), 2,132,267,049 distinct states found (1,781,153 ds/min), 126,524,859 states left on queue. 
+Progress(47) at 2024-11-07 08:00:12: 28,863,218,037 states generated (28,667,976 s/min), 2,133,901,471 distinct states found (1,634,422 ds/min), 126,149,810 states left on queue. +Checkpointing of run states/24-11-06-15-30-45.354 +Checkpointing completed at (2024-11-07 08:01:13) +Progress(47) at 2024-11-07 08:01:13: 28,892,405,277 states generated (29,187,240 s/min), 2,135,683,266 distinct states found (1,781,795 ds/min), 125,938,046 states left on queue. +Progress(47) at 2024-11-07 08:02:13: 28,921,188,007 states generated (28,782,730 s/min), 2,137,299,589 distinct states found (1,616,323 ds/min), 125,575,223 states left on queue. +Progress(47) at 2024-11-07 08:03:13: 28,950,198,581 states generated (29,010,574 s/min), 2,138,945,715 distinct states found (1,646,126 ds/min), 125,225,825 states left on queue. +Progress(47) at 2024-11-07 08:04:13: 28,979,052,322 states generated (28,853,741 s/min), 2,140,384,312 distinct states found (1,438,597 ds/min), 124,739,890 states left on queue. +Progress(47) at 2024-11-07 08:05:13: 29,007,862,556 states generated (28,810,234 s/min), 2,142,020,690 distinct states found (1,636,378 ds/min), 124,389,570 states left on queue. +Progress(47) at 2024-11-07 08:06:13: 29,036,639,997 states generated (28,777,441 s/min), 2,143,436,769 distinct states found (1,416,079 ds/min), 123,853,456 states left on queue. +Progress(47) at 2024-11-07 08:07:13: 29,065,681,489 states generated (29,041,492 s/min), 2,144,841,718 distinct states found (1,404,949 ds/min), 123,385,042 states left on queue. +Progress(47) at 2024-11-07 08:08:13: 29,094,462,032 states generated (28,780,543 s/min), 2,146,214,867 distinct states found (1,373,149 ds/min), 122,908,921 states left on queue. +Progress(47) at 2024-11-07 08:09:13: 29,123,289,758 states generated (28,827,726 s/min), 2,147,553,984 distinct states found (1,339,117 ds/min), 122,446,193 states left on queue. 
+Progress(47) at 2024-11-07 08:10:13: 29,152,503,386 states generated (29,213,628 s/min), 2,148,942,911 distinct states found (1,388,927 ds/min), 122,030,640 states left on queue. +Progress(47) at 2024-11-07 08:11:13: 29,181,728,737 states generated (29,225,351 s/min), 2,150,631,619 distinct states found (1,688,708 ds/min), 121,816,919 states left on queue. +Progress(47) at 2024-11-07 08:12:13: 29,211,003,478 states generated (29,274,741 s/min), 2,152,175,254 distinct states found (1,543,635 ds/min), 121,489,774 states left on queue. +Progress(47) at 2024-11-07 08:13:13: 29,240,102,268 states generated (29,098,790 s/min), 2,153,537,952 distinct states found (1,362,698 ds/min), 120,992,206 states left on queue. +Progress(47) at 2024-11-07 08:14:13: 29,268,843,458 states generated (28,741,190 s/min), 2,154,896,522 distinct states found (1,358,570 ds/min), 120,481,830 states left on queue. +Progress(47) at 2024-11-07 08:15:13: 29,297,458,982 states generated (28,615,524 s/min), 2,156,228,693 distinct states found (1,332,171 ds/min), 119,935,590 states left on queue. +Progress(47) at 2024-11-07 08:16:13: 29,326,133,934 states generated (28,674,952 s/min), 2,157,558,222 distinct states found (1,329,529 ds/min), 119,402,611 states left on queue. +Progress(47) at 2024-11-07 08:17:13: 29,355,133,179 states generated (28,999,245 s/min), 2,159,036,229 distinct states found (1,478,007 ds/min), 119,059,305 states left on queue. +Progress(47) at 2024-11-07 08:18:13: 29,384,094,216 states generated (28,961,037 s/min), 2,160,401,726 distinct states found (1,365,497 ds/min), 118,659,528 states left on queue. +Progress(47) at 2024-11-07 08:19:13: 29,413,210,497 states generated (29,116,281 s/min), 2,162,252,062 distinct states found (1,850,336 ds/min), 118,605,990 states left on queue. +Progress(47) at 2024-11-07 08:20:13: 29,442,123,726 states generated (28,913,229 s/min), 2,163,968,572 distinct states found (1,716,510 ds/min), 118,430,828 states left on queue. 
+Progress(47) at 2024-11-07 08:21:13: 29,470,933,813 states generated (28,810,087 s/min), 2,165,411,802 distinct states found (1,443,230 ds/min), 118,017,068 states left on queue. +Progress(47) at 2024-11-07 08:22:13: 29,499,968,878 states generated (29,035,065 s/min), 2,166,884,069 distinct states found (1,472,267 ds/min), 117,620,342 states left on queue. +Progress(47) at 2024-11-07 08:23:13: 29,528,752,811 states generated (28,783,933 s/min), 2,168,252,577 distinct states found (1,368,508 ds/min), 117,101,560 states left on queue. +Progress(47) at 2024-11-07 08:24:13: 29,557,568,598 states generated (28,815,787 s/min), 2,169,705,158 distinct states found (1,452,581 ds/min), 116,651,662 states left on queue. +Progress(47) at 2024-11-07 08:25:13: 29,586,373,945 states generated (28,805,347 s/min), 2,171,138,563 distinct states found (1,433,405 ds/min), 116,184,414 states left on queue. +Progress(47) at 2024-11-07 08:26:13: 29,614,983,668 states generated (28,609,723 s/min), 2,172,585,802 distinct states found (1,447,239 ds/min), 115,737,683 states left on queue. +Progress(47) at 2024-11-07 08:27:13: 29,643,800,320 states generated (28,816,652 s/min), 2,174,078,381 distinct states found (1,492,579 ds/min), 115,326,021 states left on queue. +Progress(47) at 2024-11-07 08:28:13: 29,672,907,645 states generated (29,107,325 s/min), 2,175,677,520 distinct states found (1,599,139 ds/min), 114,997,885 states left on queue. +Progress(47) at 2024-11-07 08:29:13: 29,701,705,556 states generated (28,797,911 s/min), 2,177,190,856 distinct states found (1,513,336 ds/min), 114,628,087 states left on queue. +Progress(47) at 2024-11-07 08:30:13: 29,730,412,995 states generated (28,707,439 s/min), 2,178,512,609 distinct states found (1,321,753 ds/min), 114,087,841 states left on queue. 
+Checkpointing of run states/24-11-06-15-30-45.354 +Checkpointing completed at (2024-11-07 08:31:13) +Progress(47) at 2024-11-07 08:31:13: 29,759,387,383 states generated (28,974,388 s/min), 2,180,040,685 distinct states found (1,528,076 ds/min), 113,747,306 states left on queue. +Progress(47) at 2024-11-07 08:32:13: 29,788,001,065 states generated (28,613,682 s/min), 2,181,419,698 distinct states found (1,379,013 ds/min), 113,276,454 states left on queue. +Progress(47) at 2024-11-07 08:33:13: 29,816,483,253 states generated (28,482,188 s/min), 2,182,794,509 distinct states found (1,374,811 ds/min), 112,764,181 states left on queue. +Progress(47) at 2024-11-07 08:34:13: 29,845,032,133 states generated (28,548,880 s/min), 2,184,180,577 distinct states found (1,386,068 ds/min), 112,275,854 states left on queue. +Progress(47) at 2024-11-07 08:35:13: 29,873,704,121 states generated (28,671,988 s/min), 2,185,610,616 distinct states found (1,430,039 ds/min), 111,765,886 states left on queue. +Progress(47) at 2024-11-07 08:36:13: 29,901,983,007 states generated (28,278,886 s/min), 2,186,742,865 distinct states found (1,132,249 ds/min), 111,037,502 states left on queue. +Progress(47) at 2024-11-07 08:37:13: 29,931,128,222 states generated (29,145,215 s/min), 2,188,247,053 distinct states found (1,504,188 ds/min), 110,610,871 states left on queue. +Progress(47) at 2024-11-07 08:38:13: 29,960,291,600 states generated (29,163,378 s/min), 2,189,791,380 distinct states found (1,544,327 ds/min), 110,219,347 states left on queue. +Progress(47) at 2024-11-07 08:39:13: 29,989,426,093 states generated (29,134,493 s/min), 2,191,523,686 distinct states found (1,732,306 ds/min), 109,988,090 states left on queue. +Progress(47) at 2024-11-07 08:40:13: 30,018,419,613 states generated (28,993,520 s/min), 2,192,983,724 distinct states found (1,460,038 ds/min), 109,567,153 states left on queue. 
+Progress(47) at 2024-11-07 08:41:13: 30,047,169,261 states generated (28,749,648 s/min), 2,194,485,325 distinct states found (1,501,601 ds/min), 109,159,610 states left on queue. +Progress(47) at 2024-11-07 08:42:13: 30,076,320,011 states generated (29,150,750 s/min), 2,196,261,852 distinct states found (1,776,527 ds/min), 108,952,775 states left on queue. +Progress(47) at 2024-11-07 08:43:13: 30,105,246,939 states generated (28,926,928 s/min), 2,197,745,917 distinct states found (1,484,065 ds/min), 108,533,801 states left on queue. +Progress(47) at 2024-11-07 08:44:13: 30,134,017,722 states generated (28,770,783 s/min), 2,199,274,846 distinct states found (1,528,929 ds/min), 108,138,210 states left on queue. +Progress(48) at 2024-11-07 08:45:13: 30,162,850,009 states generated (28,832,287 s/min), 2,200,818,695 distinct states found (1,543,849 ds/min), 107,749,686 states left on queue. +Progress(48) at 2024-11-07 08:46:13: 30,191,763,541 states generated (28,913,532 s/min), 2,202,269,881 distinct states found (1,451,186 ds/min), 107,274,074 states left on queue. +Progress(48) at 2024-11-07 08:47:13: 30,220,450,821 states generated (28,687,280 s/min), 2,203,579,369 distinct states found (1,309,488 ds/min), 106,693,506 states left on queue. +Progress(48) at 2024-11-07 08:48:13: 30,249,109,647 states generated (28,658,826 s/min), 2,204,980,828 distinct states found (1,401,459 ds/min), 106,171,815 states left on queue. +Progress(48) at 2024-11-07 08:49:13: 30,278,004,502 states generated (28,894,855 s/min), 2,206,546,641 distinct states found (1,565,813 ds/min), 105,800,017 states left on queue. +Progress(48) at 2024-11-07 08:50:13: 30,307,176,628 states generated (29,172,126 s/min), 2,208,173,395 distinct states found (1,626,754 ds/min), 105,492,735 states left on queue. +Progress(48) at 2024-11-07 08:51:13: 30,336,267,563 states generated (29,090,935 s/min), 2,209,629,083 distinct states found (1,455,688 ds/min), 105,046,561 states left on queue. 
+Progress(48) at 2024-11-07 08:52:13: 30,365,237,300 states generated (28,969,737 s/min), 2,211,221,126 distinct states found (1,592,043 ds/min), 104,707,696 states left on queue. +Progress(48) at 2024-11-07 08:53:13: 30,394,270,909 states generated (29,033,609 s/min), 2,212,948,437 distinct states found (1,727,311 ds/min), 104,490,104 states left on queue. +Progress(48) at 2024-11-07 08:54:13: 30,423,140,115 states generated (28,869,206 s/min), 2,214,640,116 distinct states found (1,691,679 ds/min), 104,243,061 states left on queue. +Progress(48) at 2024-11-07 08:55:13: 30,452,062,605 states generated (28,922,490 s/min), 2,216,327,939 distinct states found (1,687,823 ds/min), 103,983,745 states left on queue. +Progress(48) at 2024-11-07 08:56:13: 30,481,071,056 states generated (29,008,451 s/min), 2,217,983,905 distinct states found (1,655,966 ds/min), 103,702,586 states left on queue. +Progress(48) at 2024-11-07 08:57:13: 30,509,808,031 states generated (28,736,975 s/min), 2,219,662,593 distinct states found (1,678,688 ds/min), 103,423,522 states left on queue. +Progress(48) at 2024-11-07 08:58:13: 30,538,616,862 states generated (28,808,831 s/min), 2,221,288,821 distinct states found (1,626,228 ds/min), 103,098,334 states left on queue. +Progress(48) at 2024-11-07 08:59:13: 30,567,539,949 states generated (28,923,087 s/min), 2,222,969,669 distinct states found (1,680,848 ds/min), 102,811,145 states left on queue. +Progress(48) at 2024-11-07 09:00:13: 30,596,220,572 states generated (28,680,623 s/min), 2,224,451,086 distinct states found (1,481,417 ds/min), 102,320,643 states left on queue. +Checkpointing of run states/24-11-06-15-30-45.354 +Checkpointing completed at (2024-11-07 09:01:14) +Progress(48) at 2024-11-07 09:01:14: 30,625,254,005 states generated (29,033,433 s/min), 2,225,971,213 distinct states found (1,520,127 ds/min), 101,895,678 states left on queue. 
+Progress(48) at 2024-11-07 09:02:14: 30,654,316,875 states generated (29,062,870 s/min), 2,227,776,007 distinct states found (1,804,794 ds/min), 101,720,925 states left on queue. +Progress(48) at 2024-11-07 09:03:14: 30,683,368,837 states generated (29,051,962 s/min), 2,229,520,592 distinct states found (1,744,585 ds/min), 101,516,049 states left on queue. +Progress(48) at 2024-11-07 09:04:14: 30,712,221,770 states generated (28,852,933 s/min), 2,231,006,576 distinct states found (1,485,984 ds/min), 101,059,951 states left on queue. +Progress(48) at 2024-11-07 09:05:14: 30,740,916,958 states generated (28,695,188 s/min), 2,232,634,565 distinct states found (1,627,989 ds/min), 100,742,863 states left on queue. +Progress(48) at 2024-11-07 09:06:14: 30,769,477,527 states generated (28,560,569 s/min), 2,234,099,495 distinct states found (1,464,930 ds/min), 100,237,731 states left on queue. +Progress(48) at 2024-11-07 09:07:14: 30,798,306,365 states generated (28,828,838 s/min), 2,235,757,510 distinct states found (1,658,015 ds/min), 99,936,798 states left on queue. +Progress(48) at 2024-11-07 09:08:14: 30,827,145,014 states generated (28,838,649 s/min), 2,237,323,374 distinct states found (1,565,864 ds/min), 99,542,928 states left on queue. +Progress(48) at 2024-11-07 09:09:14: 30,855,967,384 states generated (28,822,370 s/min), 2,238,712,445 distinct states found (1,389,071 ds/min), 98,994,892 states left on queue. +Progress(48) at 2024-11-07 09:10:14: 30,884,757,904 states generated (28,790,520 s/min), 2,240,211,537 distinct states found (1,499,092 ds/min), 98,555,003 states left on queue. +Progress(48) at 2024-11-07 09:11:14: 30,913,436,301 states generated (28,678,397 s/min), 2,241,549,402 distinct states found (1,337,865 ds/min), 97,972,368 states left on queue. +Progress(48) at 2024-11-07 09:12:14: 30,942,398,628 states generated (28,962,327 s/min), 2,242,894,478 distinct states found (1,345,076 ds/min), 97,450,191 states left on queue. 
+Progress(48) at 2024-11-07 09:13:14: 30,971,150,912 states generated (28,752,284 s/min), 2,244,149,533 distinct states found (1,255,055 ds/min), 96,915,440 states left on queue. +Progress(48) at 2024-11-07 09:14:14: 31,000,226,695 states generated (29,075,783 s/min), 2,245,486,253 distinct states found (1,336,720 ds/min), 96,453,711 states left on queue. +Progress(48) at 2024-11-07 09:15:14: 31,029,410,660 states generated (29,183,965 s/min), 2,247,033,348 distinct states found (1,547,095 ds/min), 96,134,910 states left on queue. +Progress(48) at 2024-11-07 09:16:14: 31,058,657,395 states generated (29,246,735 s/min), 2,248,447,081 distinct states found (1,413,733 ds/min), 95,701,875 states left on queue. +Progress(48) at 2024-11-07 09:17:14: 31,087,368,874 states generated (28,711,479 s/min), 2,249,703,997 distinct states found (1,256,916 ds/min), 95,112,797 states left on queue. +Progress(48) at 2024-11-07 09:18:14: 31,115,905,907 states generated (28,537,033 s/min), 2,250,949,093 distinct states found (1,245,096 ds/min), 94,499,889 states left on queue. +Progress(48) at 2024-11-07 09:19:14: 31,144,578,992 states generated (28,673,085 s/min), 2,252,226,995 distinct states found (1,277,902 ds/min), 93,927,098 states left on queue. +Progress(48) at 2024-11-07 09:20:14: 31,173,557,966 states generated (28,978,974 s/min), 2,253,602,196 distinct states found (1,375,201 ds/min), 93,561,559 states left on queue. +Progress(48) at 2024-11-07 09:21:14: 31,202,521,307 states generated (28,963,341 s/min), 2,255,224,149 distinct states found (1,621,953 ds/min), 93,337,000 states left on queue. +Progress(48) at 2024-11-07 09:22:14: 31,231,451,884 states generated (28,930,577 s/min), 2,256,879,564 distinct states found (1,655,415 ds/min), 93,119,996 states left on queue. +Progress(48) at 2024-11-07 09:23:14: 31,260,174,245 states generated (28,722,361 s/min), 2,258,206,514 distinct states found (1,326,950 ds/min), 92,610,216 states left on queue. 
+Progress(48) at 2024-11-07 09:24:14: 31,289,091,475 states generated (28,917,230 s/min), 2,259,564,810 distinct states found (1,358,296 ds/min), 92,123,452 states left on queue. +Progress(48) at 2024-11-07 09:25:14: 31,317,753,943 states generated (28,662,468 s/min), 2,260,868,559 distinct states found (1,303,749 ds/min), 91,550,997 states left on queue. +Progress(48) at 2024-11-07 09:26:14: 31,346,435,672 states generated (28,681,729 s/min), 2,262,197,433 distinct states found (1,328,874 ds/min), 91,002,731 states left on queue. +Progress(48) at 2024-11-07 09:27:14: 31,375,074,275 states generated (28,638,603 s/min), 2,263,549,308 distinct states found (1,351,875 ds/min), 90,479,028 states left on queue. +Progress(48) at 2024-11-07 09:28:14: 31,403,896,903 states generated (28,822,628 s/min), 2,264,999,048 distinct states found (1,449,740 ds/min), 90,030,284 states left on queue. +Progress(48) at 2024-11-07 09:29:14: 31,432,772,052 states generated (28,875,149 s/min), 2,266,431,878 distinct states found (1,432,830 ds/min), 89,580,165 states left on queue. +Progress(48) at 2024-11-07 09:30:14: 31,461,382,905 states generated (28,610,853 s/min), 2,267,701,315 distinct states found (1,269,437 ds/min), 89,008,135 states left on queue. +Checkpointing of run states/24-11-06-15-30-45.354 +Checkpointing completed at (2024-11-07 09:31:15) +Progress(48) at 2024-11-07 09:31:15: 31,490,350,002 states generated (28,967,097 s/min), 2,269,120,991 distinct states found (1,419,676 ds/min), 88,574,899 states left on queue. +Progress(48) at 2024-11-07 09:32:15: 31,518,738,286 states generated (28,388,284 s/min), 2,270,333,667 distinct states found (1,212,676 ds/min), 87,950,800 states left on queue. +Progress(48) at 2024-11-07 09:33:15: 31,547,227,429 states generated (28,489,143 s/min), 2,271,632,491 distinct states found (1,298,824 ds/min), 87,379,110 states left on queue. 
+Progress(48) at 2024-11-07 09:34:15: 31,575,696,846 states generated (28,469,417 s/min), 2,272,873,166 distinct states found (1,240,675 ds/min), 86,717,955 states left on queue. +Progress(48) at 2024-11-07 09:35:15: 31,604,509,248 states generated (28,812,402 s/min), 2,274,166,128 distinct states found (1,292,962 ds/min), 86,122,414 states left on queue. +Progress(48) at 2024-11-07 09:36:15: 31,633,623,894 states generated (29,114,646 s/min), 2,275,690,739 distinct states found (1,524,611 ds/min), 85,718,820 states left on queue. +Progress(48) at 2024-11-07 09:37:15: 31,662,734,164 states generated (29,110,270 s/min), 2,277,282,041 distinct states found (1,591,302 ds/min), 85,389,121 states left on queue. +Progress(48) at 2024-11-07 09:38:15: 31,691,488,753 states generated (28,754,589 s/min), 2,278,666,982 distinct states found (1,384,941 ds/min), 84,903,119 states left on queue. +Progress(48) at 2024-11-07 09:39:15: 31,720,428,706 states generated (28,939,953 s/min), 2,280,231,311 distinct states found (1,564,329 ds/min), 84,529,794 states left on queue. +Progress(48) at 2024-11-07 09:40:15: 31,749,336,886 states generated (28,908,180 s/min), 2,281,688,218 distinct states found (1,456,907 ds/min), 84,091,511 states left on queue. +Progress(48) at 2024-11-07 09:41:15: 31,778,054,342 states generated (28,717,456 s/min), 2,283,102,693 distinct states found (1,414,475 ds/min), 83,605,316 states left on queue. +Progress(49) at 2024-11-07 09:42:15: 31,806,874,604 states generated (28,820,262 s/min), 2,284,525,902 distinct states found (1,423,209 ds/min), 83,115,134 states left on queue. +Progress(49) at 2024-11-07 09:43:15: 31,835,557,645 states generated (28,683,041 s/min), 2,285,776,893 distinct states found (1,250,991 ds/min), 82,491,419 states left on queue. +Progress(49) at 2024-11-07 09:44:15: 31,864,075,450 states generated (28,517,805 s/min), 2,287,028,991 distinct states found (1,252,098 ds/min), 81,847,819 states left on queue. 
+Progress(49) at 2024-11-07 09:45:15: 31,892,999,186 states generated (28,923,736 s/min), 2,288,552,140 distinct states found (1,523,149 ds/min), 81,459,937 states left on queue. +Progress(49) at 2024-11-07 09:46:15: 31,922,276,996 states generated (29,277,810 s/min), 2,290,137,668 distinct states found (1,585,528 ds/min), 81,115,285 states left on queue. +Progress(49) at 2024-11-07 09:47:15: 31,951,109,751 states generated (28,832,755 s/min), 2,291,477,001 distinct states found (1,339,333 ds/min), 80,582,606 states left on queue. +Progress(49) at 2024-11-07 09:48:15: 31,980,103,122 states generated (28,993,371 s/min), 2,293,149,633 distinct states found (1,672,632 ds/min), 80,321,900 states left on queue. +Progress(49) at 2024-11-07 09:49:15: 32,008,927,227 states generated (28,824,105 s/min), 2,294,737,299 distinct states found (1,587,666 ds/min), 79,988,982 states left on queue. +Progress(49) at 2024-11-07 09:50:15: 32,037,912,405 states generated (28,985,178 s/min), 2,296,369,269 distinct states found (1,631,970 ds/min), 79,688,340 states left on queue. +Progress(49) at 2024-11-07 09:51:15: 32,066,650,871 states generated (28,738,466 s/min), 2,297,881,682 distinct states found (1,512,413 ds/min), 79,285,058 states left on queue. +Progress(49) at 2024-11-07 09:52:15: 32,095,474,869 states generated (28,823,998 s/min), 2,299,386,856 distinct states found (1,505,174 ds/min), 78,860,285 states left on queue. +Progress(49) at 2024-11-07 09:53:15: 32,124,254,306 states generated (28,779,437 s/min), 2,300,974,245 distinct states found (1,587,389 ds/min), 78,501,509 states left on queue. +Progress(49) at 2024-11-07 09:54:15: 32,152,874,934 states generated (28,620,628 s/min), 2,302,313,494 distinct states found (1,339,249 ds/min), 77,908,264 states left on queue. +Progress(49) at 2024-11-07 09:55:15: 32,181,625,656 states generated (28,750,722 s/min), 2,303,719,911 distinct states found (1,406,417 ds/min), 77,409,147 states left on queue. 
+Progress(49) at 2024-11-07 09:56:15: 32,210,690,682 states generated (29,065,026 s/min), 2,305,458,559 distinct states found (1,738,648 ds/min), 77,178,015 states left on queue. +Progress(49) at 2024-11-07 09:57:15: 32,239,586,160 states generated (28,895,478 s/min), 2,307,003,156 distinct states found (1,544,597 ds/min), 76,805,818 states left on queue. +Progress(49) at 2024-11-07 09:58:15: 32,268,327,819 states generated (28,741,659 s/min), 2,308,436,891 distinct states found (1,433,735 ds/min), 76,324,212 states left on queue. +Progress(49) at 2024-11-07 09:59:15: 32,296,829,379 states generated (28,501,560 s/min), 2,309,831,948 distinct states found (1,395,057 ds/min), 75,779,735 states left on queue. +Progress(49) at 2024-11-07 10:00:15: 32,325,628,397 states generated (28,799,018 s/min), 2,311,380,882 distinct states found (1,548,934 ds/min), 75,395,162 states left on queue. +Checkpointing of run states/24-11-06-15-30-45.354 +Checkpointing completed at (2024-11-07 10:01:15) +Progress(49) at 2024-11-07 10:01:15: 32,354,681,149 states generated (29,052,752 s/min), 2,312,867,979 distinct states found (1,487,097 ds/min), 74,928,503 states left on queue. +Progress(49) at 2024-11-07 10:02:15: 32,383,406,034 states generated (28,724,885 s/min), 2,314,202,265 distinct states found (1,334,286 ds/min), 74,352,680 states left on queue. +Progress(49) at 2024-11-07 10:03:15: 32,411,997,317 states generated (28,591,283 s/min), 2,315,435,082 distinct states found (1,232,817 ds/min), 73,700,708 states left on queue. +Progress(49) at 2024-11-07 10:04:15: 32,440,769,297 states generated (28,771,980 s/min), 2,316,687,791 distinct states found (1,252,709 ds/min), 73,114,003 states left on queue. +Progress(49) at 2024-11-07 10:05:15: 32,469,733,062 states generated (28,963,765 s/min), 2,317,885,762 distinct states found (1,197,971 ds/min), 72,558,372 states left on queue. 
+Progress(49) at 2024-11-07 10:06:15: 32,498,863,740 states generated (29,130,678 s/min), 2,319,353,511 distinct states found (1,467,749 ds/min), 72,186,248 states left on queue. +Progress(49) at 2024-11-07 10:07:15: 32,527,902,407 states generated (29,038,667 s/min), 2,320,635,445 distinct states found (1,281,934 ds/min), 71,639,893 states left on queue. +Progress(49) at 2024-11-07 10:08:15: 32,556,361,400 states generated (28,458,993 s/min), 2,321,793,726 distinct states found (1,158,281 ds/min), 70,954,333 states left on queue. +Progress(49) at 2024-11-07 10:09:15: 32,585,056,251 states generated (28,694,851 s/min), 2,323,009,155 distinct states found (1,215,429 ds/min), 70,362,671 states left on queue. +Progress(49) at 2024-11-07 10:10:15: 32,613,972,815 states generated (28,916,564 s/min), 2,324,321,084 distinct states found (1,311,929 ds/min), 69,935,186 states left on queue. +Progress(49) at 2024-11-07 10:11:15: 32,642,963,038 states generated (28,990,223 s/min), 2,325,997,874 distinct states found (1,676,790 ds/min), 69,730,871 states left on queue. +Progress(49) at 2024-11-07 10:12:15: 32,671,642,762 states generated (28,679,724 s/min), 2,327,294,217 distinct states found (1,296,343 ds/min), 69,221,413 states left on queue. +Progress(49) at 2024-11-07 10:13:15: 32,700,429,296 states generated (28,786,534 s/min), 2,328,535,742 distinct states found (1,241,525 ds/min), 68,635,066 states left on queue. +Progress(49) at 2024-11-07 10:14:15: 32,729,076,182 states generated (28,646,886 s/min), 2,329,760,071 distinct states found (1,224,329 ds/min), 67,997,735 states left on queue. +Progress(49) at 2024-11-07 10:15:15: 32,757,631,787 states generated (28,555,605 s/min), 2,331,002,517 distinct states found (1,242,446 ds/min), 67,379,374 states left on queue. +Progress(49) at 2024-11-07 10:16:15: 32,786,472,553 states generated (28,840,766 s/min), 2,332,364,440 distinct states found (1,361,923 ds/min), 66,856,953 states left on queue. 
+Progress(49) at 2024-11-07 10:17:15: 32,815,068,782 states generated (28,596,229 s/min), 2,333,629,799 distinct states found (1,265,359 ds/min), 66,266,973 states left on queue. +Progress(49) at 2024-11-07 10:18:15: 32,843,671,035 states generated (28,602,253 s/min), 2,334,875,787 distinct states found (1,245,988 ds/min), 65,714,901 states left on queue. +Progress(49) at 2024-11-07 10:19:15: 32,872,127,728 states generated (28,456,693 s/min), 2,336,030,334 distinct states found (1,154,547 ds/min), 65,023,805 states left on queue. +Progress(49) at 2024-11-07 10:20:15: 32,900,582,167 states generated (28,454,439 s/min), 2,337,180,611 distinct states found (1,150,277 ds/min), 64,304,348 states left on queue. +Progress(49) at 2024-11-07 10:21:15: 32,929,545,972 states generated (28,963,805 s/min), 2,338,488,833 distinct states found (1,308,222 ds/min), 63,715,470 states left on queue. +Progress(49) at 2024-11-07 10:22:15: 32,958,603,673 states generated (29,057,701 s/min), 2,339,992,330 distinct states found (1,503,497 ds/min), 63,307,968 states left on queue. +Progress(49) at 2024-11-07 10:23:15: 32,987,442,078 states generated (28,838,405 s/min), 2,341,335,966 distinct states found (1,343,636 ds/min), 62,792,292 states left on queue. +Progress(49) at 2024-11-07 10:24:15: 33,016,381,018 states generated (28,938,940 s/min), 2,342,828,482 distinct states found (1,492,516 ds/min), 62,365,394 states left on queue. +Progress(49) at 2024-11-07 10:25:15: 33,045,061,128 states generated (28,680,110 s/min), 2,344,118,515 distinct states found (1,290,033 ds/min), 61,789,542 states left on queue. +Progress(49) at 2024-11-07 10:26:15: 33,073,888,592 states generated (28,827,464 s/min), 2,345,475,829 distinct states found (1,357,314 ds/min), 61,253,128 states left on queue. +Progress(50) at 2024-11-07 10:27:15: 33,102,491,050 states generated (28,602,458 s/min), 2,346,652,625 distinct states found (1,176,796 ds/min), 60,570,177 states left on queue. 
+Progress(50) at 2024-11-07 10:28:15: 33,131,166,035 states generated (28,674,985 s/min), 2,347,941,873 distinct states found (1,289,248 ds/min), 59,969,815 states left on queue. +Progress(50) at 2024-11-07 10:29:15: 33,160,270,838 states generated (29,104,803 s/min), 2,349,441,004 distinct states found (1,499,131 ds/min), 59,570,847 states left on queue. +Progress(50) at 2024-11-07 10:30:15: 33,189,149,869 states generated (28,879,031 s/min), 2,350,812,706 distinct states found (1,371,702 ds/min), 59,068,202 states left on queue. +Checkpointing of run states/24-11-06-15-30-45.354 +Checkpointing completed at (2024-11-07 10:31:16) +Progress(50) at 2024-11-07 10:31:16: 33,218,286,121 states generated (29,136,252 s/min), 2,352,357,375 distinct states found (1,544,669 ds/min), 58,692,343 states left on queue. +Progress(50) at 2024-11-07 10:32:16: 33,246,927,616 states generated (28,641,495 s/min), 2,353,796,993 distinct states found (1,439,618 ds/min), 58,245,674 states left on queue. +Progress(50) at 2024-11-07 10:33:16: 33,275,692,609 states generated (28,764,993 s/min), 2,355,282,278 distinct states found (1,485,285 ds/min), 57,825,713 states left on queue. +Progress(50) at 2024-11-07 10:34:16: 33,304,267,545 states generated (28,574,936 s/min), 2,356,681,270 distinct states found (1,398,992 ds/min), 57,325,849 states left on queue. +Progress(50) at 2024-11-07 10:35:16: 33,332,888,163 states generated (28,620,618 s/min), 2,358,099,683 distinct states found (1,418,413 ds/min), 56,833,993 states left on queue. +Progress(50) at 2024-11-07 10:36:16: 33,361,236,042 states generated (28,347,879 s/min), 2,359,281,358 distinct states found (1,181,675 ds/min), 56,126,890 states left on queue. +Progress(50) at 2024-11-07 10:37:16: 33,390,140,655 states generated (28,904,613 s/min), 2,360,868,517 distinct states found (1,587,159 ds/min), 55,791,859 states left on queue. 
+Progress(50) at 2024-11-07 10:38:16: 33,418,998,816 states generated (28,858,161 s/min), 2,362,363,780 distinct states found (1,495,263 ds/min), 55,385,255 states left on queue. +Progress(50) at 2024-11-07 10:39:16: 33,447,612,810 states generated (28,613,994 s/min), 2,363,728,858 distinct states found (1,365,078 ds/min), 54,854,942 states left on queue. +Progress(50) at 2024-11-07 10:40:16: 33,476,162,070 states generated (28,549,260 s/min), 2,365,099,267 distinct states found (1,370,409 ds/min), 54,312,039 states left on queue. +Progress(50) at 2024-11-07 10:41:16: 33,504,811,505 states generated (28,649,435 s/min), 2,366,473,549 distinct states found (1,374,282 ds/min), 53,784,809 states left on queue. +Progress(50) at 2024-11-07 10:42:16: 33,533,403,252 states generated (28,591,747 s/min), 2,367,734,253 distinct states found (1,260,704 ds/min), 53,158,819 states left on queue. +Progress(50) at 2024-11-07 10:43:16: 33,561,952,889 states generated (28,549,637 s/min), 2,368,855,124 distinct states found (1,120,871 ds/min), 52,441,471 states left on queue. +Progress(50) at 2024-11-07 10:44:16: 33,590,825,690 states generated (28,872,801 s/min), 2,370,054,403 distinct states found (1,199,279 ds/min), 51,878,202 states left on queue. +Progress(50) at 2024-11-07 10:45:16: 33,619,895,477 states generated (29,069,787 s/min), 2,371,355,035 distinct states found (1,300,632 ds/min), 51,382,836 states left on queue. +Progress(50) at 2024-11-07 10:46:16: 33,648,391,719 states generated (28,496,242 s/min), 2,372,441,699 distinct states found (1,086,664 ds/min), 50,647,071 states left on queue. +Progress(50) at 2024-11-07 10:47:16: 33,677,074,147 states generated (28,682,428 s/min), 2,373,600,507 distinct states found (1,158,808 ds/min), 50,052,421 states left on queue. +Progress(50) at 2024-11-07 10:48:16: 33,705,980,713 states generated (28,906,566 s/min), 2,375,050,402 distinct states found (1,449,895 ds/min), 49,692,912 states left on queue. 
+Progress(50) at 2024-11-07 10:49:16: 33,734,700,309 states generated (28,719,596 s/min), 2,376,355,805 distinct states found (1,305,403 ds/min), 49,202,990 states left on queue. +Progress(50) at 2024-11-07 10:50:16: 33,763,294,505 states generated (28,594,196 s/min), 2,377,489,014 distinct states found (1,133,209 ds/min), 48,526,991 states left on queue. +Progress(50) at 2024-11-07 10:51:16: 33,791,781,835 states generated (28,487,330 s/min), 2,378,610,114 distinct states found (1,121,100 ds/min), 47,806,234 states left on queue. +Progress(50) at 2024-11-07 10:52:16: 33,820,496,936 states generated (28,715,101 s/min), 2,379,861,294 distinct states found (1,251,180 ds/min), 47,194,112 states left on queue. +Progress(50) at 2024-11-07 10:53:16: 33,848,955,580 states generated (28,458,644 s/min), 2,381,018,247 distinct states found (1,156,953 ds/min), 46,544,595 states left on queue. +Progress(50) at 2024-11-07 10:54:16: 33,877,358,985 states generated (28,403,405 s/min), 2,382,084,162 distinct states found (1,065,915 ds/min), 45,797,353 states left on queue. +Progress(50) at 2024-11-07 10:55:16: 33,905,938,026 states generated (28,579,041 s/min), 2,383,237,725 distinct states found (1,153,563 ds/min), 45,079,182 states left on queue. +Progress(50) at 2024-11-07 10:56:16: 33,934,925,952 states generated (28,987,926 s/min), 2,384,648,770 distinct states found (1,411,045 ds/min), 44,602,865 states left on queue. +Progress(50) at 2024-11-07 10:57:16: 33,963,625,658 states generated (28,699,706 s/min), 2,385,892,826 distinct states found (1,244,056 ds/min), 44,000,281 states left on queue. +Progress(50) at 2024-11-07 10:58:16: 33,992,548,128 states generated (28,922,470 s/min), 2,387,290,030 distinct states found (1,397,204 ds/min), 43,514,140 states left on queue. +Progress(51) at 2024-11-07 10:59:16: 34,021,202,960 states generated (28,654,832 s/min), 2,388,511,227 distinct states found (1,221,197 ds/min), 42,867,785 states left on queue. 
+Progress(51) at 2024-11-07 11:00:16: 34,049,640,853 states generated (28,437,893 s/min), 2,389,565,989 distinct states found (1,054,762 ds/min), 42,084,713 states left on queue. +Checkpointing of run states/24-11-06-15-30-45.354 +Checkpointing completed at (2024-11-07 11:01:17) +Progress(51) at 2024-11-07 11:01:17: 34,079,102,421 states generated (29,461,568 s/min), 2,391,039,395 distinct states found (1,473,406 ds/min), 41,644,463 states left on queue. +Progress(51) at 2024-11-07 11:02:17: 34,107,932,294 states generated (28,829,873 s/min), 2,392,415,920 distinct states found (1,376,525 ds/min), 41,153,999 states left on queue. +Progress(51) at 2024-11-07 11:03:17: 34,136,619,823 states generated (28,687,529 s/min), 2,393,784,341 distinct states found (1,368,421 ds/min), 40,648,398 states left on queue. +Progress(51) at 2024-11-07 11:04:17: 34,165,416,573 states generated (28,796,750 s/min), 2,395,186,568 distinct states found (1,402,227 ds/min), 40,162,223 states left on queue. +Progress(51) at 2024-11-07 11:05:17: 34,193,934,145 states generated (28,517,572 s/min), 2,396,461,207 distinct states found (1,274,639 ds/min), 39,558,749 states left on queue. +Progress(51) at 2024-11-07 11:06:17: 34,222,437,146 states generated (28,503,001 s/min), 2,397,667,005 distinct states found (1,205,798 ds/min), 38,877,170 states left on queue. +Progress(51) at 2024-11-07 11:07:17: 34,251,162,633 states generated (28,725,487 s/min), 2,399,047,586 distinct states found (1,380,581 ds/min), 38,366,536 states left on queue. +Progress(51) at 2024-11-07 11:08:17: 34,280,005,309 states generated (28,842,676 s/min), 2,400,476,715 distinct states found (1,429,129 ds/min), 37,912,093 states left on queue. +Progress(51) at 2024-11-07 11:09:17: 34,308,388,681 states generated (28,383,372 s/min), 2,401,648,509 distinct states found (1,171,794 ds/min), 37,215,479 states left on queue. 
+Progress(51) at 2024-11-07 11:10:17: 34,337,086,557 states generated (28,697,876 s/min), 2,403,035,913 distinct states found (1,387,404 ds/min), 36,712,331 states left on queue. +Progress(51) at 2024-11-07 11:11:17: 34,365,565,315 states generated (28,478,758 s/min), 2,404,187,792 distinct states found (1,151,879 ds/min), 36,008,223 states left on queue. +Progress(51) at 2024-11-07 11:12:17: 34,394,280,845 states generated (28,715,530 s/min), 2,405,264,161 distinct states found (1,076,369 ds/min), 35,318,651 states left on queue. +Progress(51) at 2024-11-07 11:13:17: 34,423,292,173 states generated (29,011,328 s/min), 2,406,461,030 distinct states found (1,196,869 ds/min), 34,731,310 states left on queue. +Progress(51) at 2024-11-07 11:14:17: 34,451,717,631 states generated (28,425,458 s/min), 2,407,470,263 distinct states found (1,009,233 ds/min), 33,977,845 states left on queue. +Progress(51) at 2024-11-07 11:15:17: 34,480,582,848 states generated (28,865,217 s/min), 2,408,844,472 distinct states found (1,374,209 ds/min), 33,563,385 states left on queue. +Progress(51) at 2024-11-07 11:16:17: 34,509,255,375 states generated (28,672,527 s/min), 2,409,992,223 distinct states found (1,147,751 ds/min), 32,948,371 states left on queue. +Progress(51) at 2024-11-07 11:17:17: 34,537,627,156 states generated (28,371,781 s/min), 2,411,007,744 distinct states found (1,015,521 ds/min), 32,138,450 states left on queue. +Progress(51) at 2024-11-07 11:18:17: 34,566,104,650 states generated (28,477,494 s/min), 2,412,094,834 distinct states found (1,087,090 ds/min), 31,405,790 states left on queue. +Progress(51) at 2024-11-07 11:19:17: 34,594,468,421 states generated (28,363,771 s/min), 2,413,136,514 distinct states found (1,041,680 ds/min), 30,631,648 states left on queue. +Progress(51) at 2024-11-07 11:20:17: 34,623,282,746 states generated (28,814,325 s/min), 2,414,376,756 distinct states found (1,240,242 ds/min), 30,011,457 states left on queue. 
+Progress(51) at 2024-11-07 11:21:17: 34,652,013,328 states generated (28,730,582 s/min), 2,415,631,977 distinct states found (1,255,221 ds/min), 29,420,035 states left on queue. +Progress(51) at 2024-11-07 11:22:17: 34,680,708,001 states generated (28,694,673 s/min), 2,416,841,149 distinct states found (1,209,172 ds/min), 28,780,239 states left on queue. +Progress(52) at 2024-11-07 11:23:17: 34,709,197,697 states generated (28,489,696 s/min), 2,417,931,157 distinct states found (1,090,008 ds/min), 28,033,256 states left on queue. +Progress(52) at 2024-11-07 11:24:17: 34,738,057,742 states generated (28,860,045 s/min), 2,419,214,866 distinct states found (1,283,709 ds/min), 27,476,210 states left on queue. +Progress(52) at 2024-11-07 11:25:17: 34,766,795,719 states generated (28,737,977 s/min), 2,420,575,203 distinct states found (1,360,337 ds/min), 26,973,510 states left on queue. +Progress(52) at 2024-11-07 11:26:17: 34,795,409,801 states generated (28,614,082 s/min), 2,421,852,170 distinct states found (1,276,967 ds/min), 26,383,152 states left on queue. +Progress(52) at 2024-11-07 11:27:17: 34,823,871,413 states generated (28,461,612 s/min), 2,423,018,118 distinct states found (1,165,948 ds/min), 25,687,358 states left on queue. +Progress(52) at 2024-11-07 11:28:17: 34,852,452,267 states generated (28,580,854 s/min), 2,424,258,491 distinct states found (1,240,373 ds/min), 25,061,677 states left on queue. +Progress(52) at 2024-11-07 11:29:17: 34,881,109,110 states generated (28,656,843 s/min), 2,425,536,450 distinct states found (1,277,959 ds/min), 24,485,682 states left on queue. +Progress(52) at 2024-11-07 11:30:17: 34,909,638,357 states generated (28,529,247 s/min), 2,426,766,241 distinct states found (1,229,791 ds/min), 23,851,800 states left on queue. 
+Checkpointing of run states/24-11-06-15-30-45.354 +Checkpointing completed at (2024-11-07 11:31:18) +Progress(52) at 2024-11-07 11:31:18: 34,938,217,205 states generated (28,578,848 s/min), 2,427,804,784 distinct states found (1,038,543 ds/min), 23,061,400 states left on queue. +Progress(52) at 2024-11-07 11:32:18: 34,967,089,391 states generated (28,872,186 s/min), 2,428,907,251 distinct states found (1,102,467 ds/min), 22,421,037 states left on queue. +Progress(52) at 2024-11-07 11:33:18: 34,995,531,710 states generated (28,442,319 s/min), 2,429,963,142 distinct states found (1,055,891 ds/min), 21,740,235 states left on queue. +Progress(52) at 2024-11-07 11:34:18: 35,024,141,172 states generated (28,609,462 s/min), 2,431,122,150 distinct states found (1,159,008 ds/min), 21,149,288 states left on queue. +Progress(52) at 2024-11-07 11:35:18: 35,052,351,960 states generated (28,210,788 s/min), 2,432,077,858 distinct states found (955,708 ds/min), 20,295,072 states left on queue. +Progress(52) at 2024-11-07 11:36:18: 35,080,654,028 states generated (28,302,068 s/min), 2,433,061,991 distinct states found (984,133 ds/min), 19,478,746 states left on queue. +Progress(52) at 2024-11-07 11:37:18: 35,109,293,099 states generated (28,639,071 s/min), 2,434,258,110 distinct states found (1,196,119 ds/min), 18,850,062 states left on queue. +Progress(53) at 2024-11-07 11:38:18: 35,137,874,307 states generated (28,581,208 s/min), 2,435,408,538 distinct states found (1,150,428 ds/min), 18,171,042 states left on queue. +Progress(53) at 2024-11-07 11:39:18: 35,166,493,712 states generated (28,619,405 s/min), 2,436,567,034 distinct states found (1,158,496 ds/min), 17,510,811 states left on queue. +Progress(53) at 2024-11-07 11:40:18: 35,195,076,188 states generated (28,582,476 s/min), 2,437,810,887 distinct states found (1,243,853 ds/min), 16,916,098 states left on queue. 
+Progress(53) at 2024-11-07 11:41:18: 35,223,492,769 states generated (28,416,581 s/min), 2,438,939,934 distinct states found (1,129,047 ds/min), 16,200,301 states left on queue. +Progress(53) at 2024-11-07 11:42:18: 35,252,026,035 states generated (28,533,266 s/min), 2,440,130,151 distinct states found (1,190,217 ds/min), 15,545,447 states left on queue. +Progress(53) at 2024-11-07 11:43:18: 35,280,482,465 states generated (28,456,430 s/min), 2,441,297,027 distinct states found (1,166,876 ds/min), 14,879,990 states left on queue. +Progress(53) at 2024-11-07 11:44:18: 35,308,940,796 states generated (28,458,331 s/min), 2,442,317,453 distinct states found (1,020,426 ds/min), 14,116,803 states left on queue. +Progress(53) at 2024-11-07 11:45:18: 35,337,597,306 states generated (28,656,510 s/min), 2,443,328,791 distinct states found (1,011,338 ds/min), 13,403,307 states left on queue. +Progress(53) at 2024-11-07 11:46:18: 35,366,058,165 states generated (28,460,859 s/min), 2,444,336,498 distinct states found (1,007,707 ds/min), 12,657,418 states left on queue. +Progress(53) at 2024-11-07 11:47:18: 35,394,499,327 states generated (28,441,162 s/min), 2,445,346,072 distinct states found (1,009,574 ds/min), 11,856,670 states left on queue. +Progress(53) at 2024-11-07 11:48:18: 35,423,058,448 states generated (28,559,121 s/min), 2,446,449,527 distinct states found (1,103,455 ds/min), 11,150,850 states left on queue. +Progress(54) at 2024-11-07 11:49:18: 35,451,714,950 states generated (28,656,502 s/min), 2,447,608,246 distinct states found (1,158,719 ds/min), 10,497,489 states left on queue. +Progress(54) at 2024-11-07 11:50:18: 35,480,075,027 states generated (28,360,077 s/min), 2,448,668,413 distinct states found (1,060,167 ds/min), 9,734,924 states left on queue. +Progress(54) at 2024-11-07 11:51:18: 35,508,544,241 states generated (28,469,214 s/min), 2,449,793,995 distinct states found (1,125,582 ds/min), 9,041,108 states left on queue. 
+Progress(54) at 2024-11-07 11:52:18: 35,537,058,894 states generated (28,514,653 s/min), 2,450,835,560 distinct states found (1,041,565 ds/min), 8,304,357 states left on queue. +Progress(54) at 2024-11-07 11:53:18: 35,565,617,770 states generated (28,558,876 s/min), 2,451,805,307 distinct states found (969,747 ds/min), 7,554,593 states left on queue. +Progress(54) at 2024-11-07 11:54:18: 35,594,096,319 states generated (28,478,549 s/min), 2,452,829,286 distinct states found (1,023,979 ds/min), 6,777,854 states left on queue. +Progress(55) at 2024-11-07 11:55:18: 35,622,658,049 states generated (28,561,730 s/min), 2,453,911,213 distinct states found (1,081,927 ds/min), 6,063,348 states left on queue. +Progress(55) at 2024-11-07 11:56:18: 35,651,019,108 states generated (28,361,059 s/min), 2,454,944,844 distinct states found (1,033,631 ds/min), 5,290,297 states left on queue. +Progress(55) at 2024-11-07 11:57:18: 35,679,577,103 states generated (28,557,995 s/min), 2,455,941,484 distinct states found (996,640 ds/min), 4,540,257 states left on queue. +Progress(55) at 2024-11-07 11:58:18: 35,708,050,230 states generated (28,473,127 s/min), 2,456,911,566 distinct states found (970,082 ds/min), 3,737,722 states left on queue. +Progress(55) at 2024-11-07 11:59:18: 35,736,484,911 states generated (28,434,681 s/min), 2,457,942,176 distinct states found (1,030,610 ds/min), 2,980,348 states left on queue. +Progress(56) at 2024-11-07 12:00:18: 35,765,029,620 states generated (28,544,709 s/min), 2,458,911,346 distinct states found (969,170 ds/min), 2,201,353 states left on queue. +Checkpointing of run states/24-11-06-15-30-45.354 +Checkpointing completed at (2024-11-07 12:01:18) +Progress(57) at 2024-11-07 12:01:18: 35,793,733,161 states generated (28,703,541 s/min), 2,459,897,228 distinct states found (985,882 ds/min), 1,411,705 states left on queue. 
+Progress(58) at 2024-11-07 12:02:18: 35,822,110,432 states generated (28,377,271 s/min), 2,460,820,961 distinct states found (923,733 ds/min), 587,430 states left on queue. +Model checking completed. No error has been found. + Estimates of the probability that TLC did not check all reachable states + because two distinct states had the same fingerprint: + calculated (optimistic): val = 4.5 + based on the actual fingerprints: val = .25 +35840434685 states generated, 2461362509 distinct states found, 0 states left on queue. +The depth of the complete state graph search is 67. +The average outdegree of the complete state graph is 1 (minimum is 0, the maximum 8 and the 95th percentile is 2). +Finished in 20h 32min at (2024-11-07 12:03:02) diff --git a/safekeeper/spec/tlc-results/MCProposerAcceptorStatic.tla-MCProposerAcceptorStatic_p2_a5_t2_l2.cfg-2024-11-06--12-09-32.log b/safekeeper/spec/tlc-results/MCProposerAcceptorStatic.tla-MCProposerAcceptorStatic_p2_a5_t2_l2.cfg-2024-11-06--12-09-32.log new file mode 100644 index 0000000000..c43d52302b --- /dev/null +++ b/safekeeper/spec/tlc-results/MCProposerAcceptorStatic.tla-MCProposerAcceptorStatic_p2_a5_t2_l2.cfg-2024-11-06--12-09-32.log @@ -0,0 +1,89 @@ +git revision: 864f4667d +Platform: Linux neon-dev-arm64-1 6.8.0-48-generic #48-Ubuntu SMP PREEMPT_DYNAMIC Fri Sep 27 14:35:45 UTC 2024 aarch64 aarch64 aarch64 GNU/Linux +CPU Info Linux: Neoverse-N1 +CPU Cores Linux: 80 +CPU Info Mac: +CPU Cores Mac: +Spec: MCProposerAcceptorStatic.tla +Config: models/MCProposerAcceptorStatic_p2_a5_t2_l2.cfg +---- +CONSTANTS +NULL = NULL +proposers = {p1, p2} +acceptors = {a1, a2, a3, a4, a5} +max_term = 2 +max_entries = 2 +SPECIFICATION Spec +CONSTRAINT StateConstraint +INVARIANT +TypeOk +ElectionSafety +LogIsMonotonic +LogSafety +SYMMETRY ProposerAcceptorSymmetry +CHECK_DEADLOCK FALSE +ALIAS Alias + +---- + +TLC2 Version 2.20 of Day Month 20?? 
(rev: f68cb71) +Running breadth-first search Model-Checking with fp 90 and seed 2164066158568118414 with 80 workers on 80 cores with 54613MB heap and 61440MB offheap memory [pid: 30788] (Linux 6.8.0-48-generic aarch64, Ubuntu 21.0.4 x86_64, OffHeapDiskFPSet, DiskStateQueue). +Parsing file /home/arseny/neon/safekeeper/spec/MCProposerAcceptorStatic.tla +Parsing file /tmp/tlc-13824636513165485309/TLC.tla (jar:file:/home/arseny/tla2tools.jar!/tla2sany/StandardModules/TLC.tla) +Parsing file /home/arseny/neon/safekeeper/spec/ProposerAcceptorStatic.tla +Parsing file /tmp/tlc-13824636513165485309/_TLCTrace.tla (jar:file:/home/arseny/tla2tools.jar!/tla2sany/StandardModules/_TLCTrace.tla) +Parsing file /tmp/tlc-13824636513165485309/Integers.tla (jar:file:/home/arseny/tla2tools.jar!/tla2sany/StandardModules/Integers.tla) +Parsing file /tmp/tlc-13824636513165485309/Sequences.tla (jar:file:/home/arseny/tla2tools.jar!/tla2sany/StandardModules/Sequences.tla) +Parsing file /tmp/tlc-13824636513165485309/FiniteSets.tla (jar:file:/home/arseny/tla2tools.jar!/tla2sany/StandardModules/FiniteSets.tla) +Parsing file /tmp/tlc-13824636513165485309/Naturals.tla (jar:file:/home/arseny/tla2tools.jar!/tla2sany/StandardModules/Naturals.tla) +Parsing file /tmp/tlc-13824636513165485309/TLCExt.tla (jar:file:/home/arseny/tla2tools.jar!/tla2sany/StandardModules/TLCExt.tla) +Semantic processing of module Naturals +Semantic processing of module Sequences +Semantic processing of module FiniteSets +Semantic processing of module TLC +Semantic processing of module Integers +Semantic processing of module ProposerAcceptorStatic +Semantic processing of module TLCExt +Semantic processing of module _TLCTrace +Semantic processing of module MCProposerAcceptorStatic +Starting... (2024-11-06 12:09:33) +Computing initial states... +Finished computing initial states: 1 distinct state generated at 2024-11-06 12:09:36. 
+Progress(16) at 2024-11-06 12:09:39: 405,675 states generated (405,675 s/min), 18,042 distinct states found (18,042 ds/min), 7,612 states left on queue. +Progress(23) at 2024-11-06 12:10:39: 12,449,257 states generated (12,043,582 s/min), 467,293 distinct states found (449,251 ds/min), 161,057 states left on queue. +Progress(25) at 2024-11-06 12:11:39: 24,461,332 states generated (12,012,075 s/min), 861,011 distinct states found (393,718 ds/min), 267,072 states left on queue. +Progress(26) at 2024-11-06 12:12:39: 36,440,377 states generated (11,979,045 s/min), 1,234,052 distinct states found (373,041 ds/min), 355,372 states left on queue. +Progress(26) at 2024-11-06 12:13:39: 48,327,873 states generated (11,887,496 s/min), 1,583,736 distinct states found (349,684 ds/min), 425,209 states left on queue. +Progress(27) at 2024-11-06 12:14:39: 60,246,136 states generated (11,918,263 s/min), 1,933,499 distinct states found (349,763 ds/min), 494,269 states left on queue. +Progress(28) at 2024-11-06 12:15:39: 71,977,716 states generated (11,731,580 s/min), 2,265,302 distinct states found (331,803 ds/min), 553,777 states left on queue. +Progress(28) at 2024-11-06 12:16:39: 83,644,537 states generated (11,666,821 s/min), 2,575,451 distinct states found (310,149 ds/min), 594,142 states left on queue. +Progress(29) at 2024-11-06 12:17:39: 95,287,089 states generated (11,642,552 s/min), 2,888,793 distinct states found (313,342 ds/min), 639,273 states left on queue. +Progress(29) at 2024-11-06 12:18:39: 107,000,972 states generated (11,713,883 s/min), 3,194,255 distinct states found (305,462 ds/min), 673,353 states left on queue. +Progress(29) at 2024-11-06 12:19:39: 118,305,248 states generated (11,304,276 s/min), 3,467,775 distinct states found (273,520 ds/min), 692,915 states left on queue. +Progress(29) at 2024-11-06 12:20:39: 129,954,327 states generated (11,649,079 s/min), 3,763,186 distinct states found (295,411 ds/min), 720,349 states left on queue. 
+Progress(29) at 2024-11-06 12:21:39: 141,251,359 states generated (11,297,032 s/min), 4,020,407 distinct states found (257,221 ds/min), 724,036 states left on queue. +Progress(30) at 2024-11-06 12:22:39: 152,551,873 states generated (11,300,514 s/min), 4,284,278 distinct states found (263,871 ds/min), 733,726 states left on queue. +Progress(30) at 2024-11-06 12:23:39: 164,324,788 states generated (11,772,915 s/min), 4,569,569 distinct states found (285,291 ds/min), 746,476 states left on queue. +Progress(30) at 2024-11-06 12:24:39: 175,121,317 states generated (10,796,529 s/min), 4,779,505 distinct states found (209,936 ds/min), 723,070 states left on queue. +Progress(31) at 2024-11-06 12:25:39: 186,238,236 states generated (11,116,919 s/min), 5,016,034 distinct states found (236,529 ds/min), 712,944 states left on queue. +Progress(31) at 2024-11-06 12:26:39: 197,884,578 states generated (11,646,342 s/min), 5,276,094 distinct states found (260,060 ds/min), 705,471 states left on queue. +Progress(31) at 2024-11-06 12:27:39: 208,535,096 states generated (10,650,518 s/min), 5,463,450 distinct states found (187,356 ds/min), 665,661 states left on queue. +Progress(32) at 2024-11-06 12:28:39: 219,424,829 states generated (10,889,733 s/min), 5,673,673 distinct states found (210,223 ds/min), 637,975 states left on queue. +Progress(32) at 2024-11-06 12:29:39: 230,906,372 states generated (11,481,543 s/min), 5,903,516 distinct states found (229,843 ds/min), 606,255 states left on queue. +Progress(33) at 2024-11-06 12:30:39: 241,261,887 states generated (10,355,515 s/min), 6,065,731 distinct states found (162,215 ds/min), 552,728 states left on queue. +Progress(33) at 2024-11-06 12:31:39: 252,028,921 states generated (10,767,034 s/min), 6,255,487 distinct states found (189,756 ds/min), 509,620 states left on queue. 
+Progress(33) at 2024-11-06 12:32:39: 262,856,171 states generated (10,827,250 s/min), 6,431,063 distinct states found (175,576 ds/min), 448,834 states left on queue. +Progress(34) at 2024-11-06 12:33:39: 273,211,882 states generated (10,355,711 s/min), 6,586,644 distinct states found (155,581 ds/min), 386,905 states left on queue. +Progress(34) at 2024-11-06 12:34:39: 283,843,415 states generated (10,631,533 s/min), 6,743,916 distinct states found (157,272 ds/min), 315,135 states left on queue. +Progress(35) at 2024-11-06 12:35:39: 293,931,115 states generated (10,087,700 s/min), 6,878,405 distinct states found (134,489 ds/min), 241,126 states left on queue. +Progress(36) at 2024-11-06 12:36:39: 303,903,441 states generated (9,972,326 s/min), 6,996,394 distinct states found (117,989 ds/min), 152,775 states left on queue. +Progress(37) at 2024-11-06 12:37:39: 313,501,886 states generated (9,598,445 s/min), 7,093,031 distinct states found (96,637 ds/min), 54,009 states left on queue. +Model checking completed. No error has been found. + Estimates of the probability that TLC did not check all reachable states + because two distinct states had the same fingerprint: + calculated (optimistic): val = 1.2E-4 + based on the actual fingerprints: val = 2.1E-6 +318172398 states generated, 7127950 distinct states found, 0 states left on queue. +The depth of the complete state graph search is 44. +The average outdegree of the complete state graph is 1 (minimum is 0, the maximum 9 and the 95th percentile is 3). +Finished in 28min 43s at (2024-11-06 12:38:16) diff --git a/safekeeper/src/bin/safekeeper.rs b/safekeeper/src/bin/safekeeper.rs index 1248428d33..4dc7edef37 100644 --- a/safekeeper/src/bin/safekeeper.rs +++ b/safekeeper/src/bin/safekeeper.rs @@ -48,6 +48,14 @@ use utils::{ tcp_listener, }; +#[global_allocator] +static GLOBAL: tikv_jemallocator::Jemalloc = tikv_jemallocator::Jemalloc; + +/// Configure jemalloc to sample allocations for profiles every 1 MB (1 << 20). 
+#[allow(non_upper_case_globals)] +#[export_name = "malloc_conf"] +pub static malloc_conf: &[u8] = b"prof:true,prof_active:true,lg_prof_sample:20\0"; + const PID_FILE_NAME: &str = "safekeeper.pid"; const ID_FILE_NAME: &str = "safekeeper.id"; diff --git a/safekeeper/src/handler.rs b/safekeeper/src/handler.rs index 22f33b17e0..8dd2929a03 100644 --- a/safekeeper/src/handler.rs +++ b/safekeeper/src/handler.rs @@ -212,8 +212,9 @@ impl postgres_backend::Handler ); if let Some(shard) = self.shard.as_ref() { - tracing::Span::current() - .record("shard", tracing::field::display(shard.shard_slug())); + if let Some(slug) = shard.shard_slug().strip_prefix("-") { + tracing::Span::current().record("shard", tracing::field::display(slug)); + } } Ok(()) diff --git a/safekeeper/src/http/client.rs b/safekeeper/src/http/client.rs index c56f7880d4..a166fc1ab9 100644 --- a/safekeeper/src/http/client.rs +++ b/safekeeper/src/http/client.rs @@ -8,6 +8,7 @@ //! etc. use reqwest::{IntoUrl, Method, StatusCode}; +use std::error::Error as _; use utils::{ http::error::HttpErrorBody, id::{NodeId, TenantId, TimelineId}, @@ -26,7 +27,7 @@ pub struct Client { #[derive(thiserror::Error, Debug)] pub enum Error { /// Failed to receive body (reqwest error). - #[error("receive body: {0}")] + #[error("receive body: {0}{}", .0.source().map(|e| format!(": {e}")).unwrap_or_default())] ReceiveBody(reqwest::Error), /// Status is not ok, but failed to parse body as `HttpErrorBody`. 
diff --git a/safekeeper/src/http/routes.rs b/safekeeper/src/http/routes.rs index 28294abdb9..69b775fd76 100644 --- a/safekeeper/src/http/routes.rs +++ b/safekeeper/src/http/routes.rs @@ -14,7 +14,8 @@ use tokio_util::sync::CancellationToken; use tracing::{info_span, Instrument}; use utils::failpoint_support::failpoints_handler; use utils::http::endpoint::{ - profile_cpu_handler, prometheus_metrics_handler, request_span, ChannelWriter, + profile_cpu_handler, profile_heap_handler, prometheus_metrics_handler, request_span, + ChannelWriter, }; use utils::http::request::parse_query_param; @@ -573,7 +574,8 @@ pub fn make_router(conf: SafeKeeperConf) -> RouterBuilder let mut router = endpoint::make_router(); if conf.http_auth.is_some() { router = router.middleware(auth_middleware(|request| { - const ALLOWLIST_ROUTES: &[&str] = &["/v1/status", "/metrics", "/profile/cpu"]; + const ALLOWLIST_ROUTES: &[&str] = + &["/v1/status", "/metrics", "/profile/cpu", "/profile/heap"]; if ALLOWLIST_ROUTES.contains(&request.uri().path()) { None } else { @@ -594,6 +596,7 @@ pub fn make_router(conf: SafeKeeperConf) -> RouterBuilder .data(auth) .get("/metrics", |r| request_span(r, prometheus_metrics_handler)) .get("/profile/cpu", |r| request_span(r, profile_cpu_handler)) + .get("/profile/heap", |r| request_span(r, profile_heap_handler)) .get("/v1/status", |r| request_span(r, status_handler)) .put("/v1/failpoints", |r| { request_span(r, move |r| async { diff --git a/safekeeper/src/wal_service.rs b/safekeeper/src/wal_service.rs index 1ab54d4cce..5248d545db 100644 --- a/safekeeper/src/wal_service.rs +++ b/safekeeper/src/wal_service.rs @@ -44,7 +44,7 @@ pub async fn task_main( error!("connection handler exited: {}", err); } } - .instrument(info_span!("", cid = %conn_id, ttid = field::Empty, application_name = field::Empty)), + .instrument(info_span!("", cid = %conn_id, ttid = field::Empty, application_name = field::Empty, shard = field::Empty)), ); } } diff --git a/scripts/flaky_tests.py
b/scripts/flaky_tests.py deleted file mode 100755 index 3fb668ed2d..0000000000 --- a/scripts/flaky_tests.py +++ /dev/null @@ -1,147 +0,0 @@ -#! /usr/bin/env python3 - -from __future__ import annotations - -import argparse -import json -import logging -import os -from collections import defaultdict -from typing import TYPE_CHECKING - -import psycopg2 -import psycopg2.extras -import toml - -if TYPE_CHECKING: - from typing import Any - -FLAKY_TESTS_QUERY = """ - SELECT - DISTINCT parent_suite, suite, name - FROM results - WHERE - started_at > CURRENT_DATE - INTERVAL '%s' day - AND ( - (status IN ('failed', 'broken') AND reference = 'refs/heads/main') - OR flaky - ) - ; -""" - - -def main(args: argparse.Namespace): - connstr = args.connstr - interval_days = args.days - output = args.output - - build_type = args.build_type - pg_version = args.pg_version - - res: defaultdict[str, defaultdict[str, dict[str, bool]]] - res = defaultdict(lambda: defaultdict(dict)) - - try: - logging.info("connecting to the database...") - with psycopg2.connect(connstr, connect_timeout=30) as conn: - with conn.cursor(cursor_factory=psycopg2.extras.DictCursor) as cur: - logging.info("fetching flaky tests...") - cur.execute(FLAKY_TESTS_QUERY, (interval_days,)) - rows = cur.fetchall() - except psycopg2.OperationalError as exc: - logging.error("cannot fetch flaky tests from the DB due to an error", exc) - rows = [] - - # If a test run has non-default PAGESERVER_VIRTUAL_FILE_IO_ENGINE (i.e. 
not empty, not tokio-epoll-uring), - # use it to parametrize test name along with build_type and pg_version - # - # See test_runner/fixtures/parametrize.py for details - if (io_engine := os.getenv("PAGESERVER_VIRTUAL_FILE_IO_ENGINE", "")) not in ( - "", - "tokio-epoll-uring", - ): - pageserver_virtual_file_io_engine_parameter = f"-{io_engine}" - else: - pageserver_virtual_file_io_engine_parameter = "" - - # re-use existing records of flaky tests from before parametrization by compaction_algorithm - def get_pageserver_default_tenant_config_compaction_algorithm() -> dict[str, Any] | None: - """Duplicated from parametrize.py""" - toml_table = os.getenv("PAGESERVER_DEFAULT_TENANT_CONFIG_COMPACTION_ALGORITHM") - if toml_table is None: - return None - v = toml.loads(toml_table) - assert isinstance(v, dict) - return v - - pageserver_default_tenant_config_compaction_algorithm_parameter = "" - if ( - explicit_default := get_pageserver_default_tenant_config_compaction_algorithm() - ) is not None: - pageserver_default_tenant_config_compaction_algorithm_parameter = ( - f"-{explicit_default['kind']}" - ) - - for row in rows: - # We don't want to automatically rerun tests in a performance suite - if row["parent_suite"] != "test_runner.regress": - continue - - if row["name"].endswith("]"): - parametrized_test = row["name"].replace( - "[", - f"[{build_type}-pg{pg_version}{pageserver_virtual_file_io_engine_parameter}{pageserver_default_tenant_config_compaction_algorithm_parameter}-", - ) - else: - parametrized_test = f"{row['name']}[{build_type}-pg{pg_version}{pageserver_virtual_file_io_engine_parameter}{pageserver_default_tenant_config_compaction_algorithm_parameter}]" - - res[row["parent_suite"]][row["suite"]][parametrized_test] = True - - logging.info( - f"\t{row['parent_suite'].replace('.', '/')}/{row['suite']}.py::{parametrized_test}" - ) - - logging.info(f"saving results to {output.name}") - json.dump(res, output, indent=2) - - -if __name__ == "__main__": - parser = 
argparse.ArgumentParser(description="Detect flaky tests in the last N days") - parser.add_argument( - "--output", - type=argparse.FileType("w"), - default="flaky.json", - help="path to output json file (default: flaky.json)", - ) - parser.add_argument( - "--days", - required=False, - default=10, - type=int, - help="how many days to look back for flaky tests (default: 10)", - ) - parser.add_argument( - "--build-type", - required=True, - type=str, - help="for which build type to create list of flaky tests (debug or release)", - ) - parser.add_argument( - "--pg-version", - required=True, - type=int, - help="for which Postgres version to create list of flaky tests (14, 15, etc.)", - ) - parser.add_argument( - "connstr", - help="connection string to the test results database", - ) - args = parser.parse_args() - - level = logging.INFO - logging.basicConfig( - format="%(message)s", - level=level, - ) - - main(args) diff --git a/storage_controller/src/compute_hook.rs b/storage_controller/src/compute_hook.rs index b63a322b87..2b2ece3f02 100644 --- a/storage_controller/src/compute_hook.rs +++ b/storage_controller/src/compute_hook.rs @@ -1,3 +1,4 @@ +use std::error::Error as _; use std::sync::Arc; use std::{collections::HashMap, time::Duration}; @@ -172,7 +173,7 @@ struct ComputeHookNotifyRequest { #[derive(thiserror::Error, Debug)] pub(crate) enum NotifyError { // Request was not send successfully, e.g. transport error - #[error("Sending request: {0}")] + #[error("Sending request: {0}{}", .0.source().map(|e| format!(": {e}")).unwrap_or_default())] Request(#[from] reqwest::Error), // Request could not be serviced right now due to ongoing Operation in control plane, but should be possible soon. 
#[error("Control plane tenant busy")] diff --git a/storage_controller/src/http.rs b/storage_controller/src/http.rs index 9b5d4caf31..39e078ba7c 100644 --- a/storage_controller/src/http.rs +++ b/storage_controller/src/http.rs @@ -1452,10 +1452,15 @@ async fn maybe_forward(req: Request) -> ForwardOutcome { let uri = req.uri().to_string(); let uri_for_forward = !NOT_FOR_FORWARD.contains(&uri.as_str()); + // Fast return before trying to take any Service locks, if we will never forward anyway + if !uri_for_forward { + return ForwardOutcome::NotForwarded(req); + } + let state = get_state(&req); let leadership_status = state.service.get_leadership_status(); - if leadership_status != LeadershipStatus::SteppedDown || !uri_for_forward { + if leadership_status != LeadershipStatus::SteppedDown { return ForwardOutcome::NotForwarded(req); } diff --git a/storage_controller/src/metrics.rs b/storage_controller/src/metrics.rs index a1f7bc2457..6d5885eba6 100644 --- a/storage_controller/src/metrics.rs +++ b/storage_controller/src/metrics.rs @@ -50,6 +50,12 @@ pub(crate) struct StorageControllerMetricGroup { /// Count of how many times we make an optimization change to a tenant's scheduling pub(crate) storage_controller_schedule_optimization: measured::Counter, + /// How many shards are not scheduled into their preferred AZ + pub(crate) storage_controller_schedule_az_violation: measured::Gauge, + + /// How many shards would like to reconcile but were blocked by concurrency limits + pub(crate) storage_controller_pending_reconciles: measured::Gauge, + /// HTTP request status counters for handled requests pub(crate) storage_controller_http_request_status: measured::CounterVec, diff --git a/storage_controller/src/peer_client.rs b/storage_controller/src/peer_client.rs index 3f8520fe55..ee4eb55294 100644 --- a/storage_controller/src/peer_client.rs +++ b/storage_controller/src/peer_client.rs @@ -1,7 +1,9 @@ use crate::tenant_shard::ObservedState; use pageserver_api::shard::TenantShardId; use 
serde::{Deserialize, Serialize}; -use std::{collections::HashMap, time::Duration}; +use std::collections::HashMap; +use std::error::Error as _; +use std::time::Duration; use tokio_util::sync::CancellationToken; use hyper::Uri; @@ -17,11 +19,14 @@ pub(crate) struct PeerClient { #[derive(thiserror::Error, Debug)] pub(crate) enum StorageControllerPeerError { - #[error("failed to deserialize error response with status code {0} at {1}: {2}")] + #[error( + "failed to deserialize error response with status code {0} at {1}: {2}{}", + .2.source().map(|e| format!(": {e}")).unwrap_or_default() + )] DeserializationError(StatusCode, Url, reqwest::Error), #[error("storage controller peer API error ({0}): {1}")] ApiError(StatusCode, String), - #[error("failed to send HTTP request: {0}")] + #[error("failed to send HTTP request: {0}{}", .0.source().map(|e| format!(": {e}")).unwrap_or_default())] SendError(reqwest::Error), #[error("Cancelled")] Cancelled, diff --git a/storage_controller/src/scheduler.rs b/storage_controller/src/scheduler.rs index 2414d95eb8..ecc6b11e47 100644 --- a/storage_controller/src/scheduler.rs +++ b/storage_controller/src/scheduler.rs @@ -305,7 +305,7 @@ impl std::ops::Add for AffinityScore { /// Hint for whether this is a sincere attempt to schedule, or a speculative /// check for where we _would_ schedule (done during optimization) -#[derive(Debug)] +#[derive(Debug, Clone)] pub(crate) enum ScheduleMode { Normal, Speculative, @@ -319,7 +319,7 @@ impl Default for ScheduleMode { // For carrying state between multiple calls to [`TenantShard::schedule`], e.g. when calling // it for many shards in the same tenant. 
-#[derive(Debug, Default)] +#[derive(Debug, Default, Clone)] pub(crate) struct ScheduleContext { /// Sparse map of nodes: omitting a node implicitly makes its affinity [`AffinityScore::FREE`] pub(crate) nodes: HashMap, @@ -331,6 +331,14 @@ pub(crate) struct ScheduleContext { } impl ScheduleContext { + pub(crate) fn new(mode: ScheduleMode) -> Self { + Self { + nodes: HashMap::new(), + attached_nodes: HashMap::new(), + mode, + } + } + /// Input is a list of nodes we would like to avoid using again within this context. The more /// times a node is passed into this call, the less inclined we are to use it. pub(crate) fn avoid(&mut self, nodes: &[NodeId]) { @@ -355,6 +363,11 @@ impl ScheduleContext { pub(crate) fn get_node_attachments(&self, node_id: NodeId) -> usize { self.attached_nodes.get(&node_id).copied().unwrap_or(0) } + + #[cfg(test)] + pub(crate) fn attach_count(&self) -> usize { + self.attached_nodes.values().sum() + } } pub(crate) enum RefCountUpdate { diff --git a/storage_controller/src/service.rs b/storage_controller/src/service.rs index 446c476b99..92ec58cb4d 100644 --- a/storage_controller/src/service.rs +++ b/storage_controller/src/service.rs @@ -1,3 +1,6 @@ +pub mod chaos_injector; +mod context_iterator; + use hyper::Uri; use std::{ borrow::Cow, @@ -41,12 +44,12 @@ use futures::{stream::FuturesUnordered, StreamExt}; use itertools::Itertools; use pageserver_api::{ controller_api::{ - MetadataHealthRecord, MetadataHealthUpdateRequest, NodeAvailability, NodeRegisterRequest, - NodeSchedulingPolicy, NodeShard, NodeShardResponse, PlacementPolicy, ShardSchedulingPolicy, - ShardsPreferredAzsRequest, ShardsPreferredAzsResponse, TenantCreateRequest, - TenantCreateResponse, TenantCreateResponseShard, TenantDescribeResponse, - TenantDescribeResponseShard, TenantLocateResponse, TenantPolicyRequest, - TenantShardMigrateRequest, TenantShardMigrateResponse, + AvailabilityZone, MetadataHealthRecord, MetadataHealthUpdateRequest, NodeAvailability, + NodeRegisterRequest, 
NodeSchedulingPolicy, NodeShard, NodeShardResponse, PlacementPolicy, + ShardSchedulingPolicy, ShardsPreferredAzsRequest, ShardsPreferredAzsResponse, + TenantCreateRequest, TenantCreateResponse, TenantCreateResponseShard, + TenantDescribeResponse, TenantDescribeResponseShard, TenantLocateResponse, + TenantPolicyRequest, TenantShardMigrateRequest, TenantShardMigrateResponse, }, models::{ SecondaryProgress, TenantConfigRequest, TimelineArchivalConfigRequest, @@ -95,7 +98,7 @@ use crate::{ }, }; -pub mod chaos_injector; +use context_iterator::TenantShardContextIterator; // For operations that should be quick, like attaching a new tenant const SHORT_RECONCILE_TIMEOUT: Duration = Duration::from_secs(5); @@ -465,6 +468,7 @@ struct ShardSplitParams { policy: PlacementPolicy, config: TenantConfig, shard_ident: ShardIdentity, + preferred_az_id: Option, } // When preparing for a shard split, we may either choose to proceed with the split, @@ -4100,7 +4104,7 @@ impl Service { for parent_id in parent_ids { let child_ids = parent_id.split(new_shard_count); - let (pageserver, generation, policy, parent_ident, config) = { + let (pageserver, generation, policy, parent_ident, config, preferred_az) = { let mut old_state = tenants .remove(&parent_id) .expect("It was present, we just split it"); @@ -4119,6 +4123,7 @@ impl Service { old_state.policy.clone(), old_state.shard, old_state.config.clone(), + old_state.preferred_az().cloned(), ) }; @@ -4151,6 +4156,9 @@ impl Service { }; child_state.generation = Some(generation); child_state.config = config.clone(); + if let Some(preferred_az) = &preferred_az { + child_state.set_preferred_az(preferred_az.clone()); + } // The child's TenantShard::splitting is intentionally left at the default value of Idle, // as at this point in the split process we have succeeded and this part is infallible: @@ -4343,6 +4351,7 @@ impl Service { let mut policy = None; let mut config = None; let mut shard_ident = None; + let mut preferred_az_id = None; // 
Validate input, and calculate which shards we will create let (old_shard_count, targets) = { @@ -4401,6 +4410,9 @@ impl Service { if config.is_none() { config = Some(shard.config.clone()); } + if preferred_az_id.is_none() { + preferred_az_id = shard.preferred_az().cloned(); + } if tenant_shard_id.shard_count.count() == split_req.new_shard_count { tracing::info!( @@ -4471,6 +4483,7 @@ impl Service { policy, config, shard_ident, + preferred_az_id, }))) } @@ -4493,6 +4506,7 @@ impl Service { policy, config, shard_ident, + preferred_az_id, } = *params; // Drop any secondary locations: pageservers do not support splitting these, and in any case the @@ -4566,7 +4580,7 @@ impl Service { // Scheduling policies and preferred AZ do not carry through to children scheduling_policy: serde_json::to_string(&ShardSchedulingPolicy::default()) .unwrap(), - preferred_az_id: None, + preferred_az_id: preferred_az_id.as_ref().map(|az| az.0.clone()), }); } @@ -4686,47 +4700,6 @@ impl Service { let (response, child_locations, waiters) = self.tenant_shard_split_commit_inmem(tenant_id, new_shard_count, new_stripe_size); - // Now that we have scheduled the child shards, attempt to set their preferred AZ - // to that of the pageserver they've been attached on. 
- let preferred_azs = { - let locked = self.inner.read().unwrap(); - child_locations - .iter() - .filter_map(|(tid, node_id, _stripe_size)| { - let az_id = locked - .nodes - .get(node_id) - .map(|n| n.get_availability_zone_id().clone())?; - - Some((*tid, az_id)) - }) - .collect::>() - }; - - let updated = self - .persistence - .set_tenant_shard_preferred_azs(preferred_azs) - .await - .map_err(|err| { - ApiError::InternalServerError(anyhow::anyhow!( - "Failed to persist preferred az ids: {err}" - )) - }); - - match updated { - Ok(updated) => { - let mut locked = self.inner.write().unwrap(); - for (tid, az_id) in updated { - if let Some(shard) = locked.tenants.get_mut(&tid) { - shard.set_preferred_az(az_id); - } - } - } - Err(err) => { - tracing::warn!("Failed to persist preferred AZs after split: {err}"); - } - } - // Send compute notifications for all the new shards let mut failed_notifications = Vec::new(); for (child_id, child_ps, stripe_size) in child_locations { @@ -5155,34 +5128,38 @@ impl Service { *nodes = Arc::new(nodes_mut); } - for (tenant_shard_id, shard) in tenants { - if shard.deref_node(node_id) { - // FIXME: we need to build a ScheduleContext that reflects this shard's peers, otherwise - // it won't properly do anti-affinity. 
- let mut schedule_context = ScheduleContext::default(); + for (_tenant_id, mut schedule_context, shards) in + TenantShardContextIterator::new(tenants, ScheduleMode::Normal) + { + for shard in shards { + if shard.deref_node(node_id) { + if let Err(e) = shard.schedule(scheduler, &mut schedule_context) { + // TODO: implement force flag to remove a node even if we can't reschedule + // a tenant + tracing::error!( + "Refusing to delete node, shard {} can't be rescheduled: {e}", + shard.tenant_shard_id + ); + return Err(e.into()); + } else { + tracing::info!( + "Rescheduled shard {} away from node during deletion", + shard.tenant_shard_id + ) + } - if let Err(e) = shard.schedule(scheduler, &mut schedule_context) { - // TODO: implement force flag to remove a node even if we can't reschedule - // a tenant - tracing::error!("Refusing to delete node, shard {tenant_shard_id} can't be rescheduled: {e}"); - return Err(e.into()); - } else { - tracing::info!( - "Rescheduled shard {tenant_shard_id} away from node during deletion" - ) + self.maybe_reconcile_shard(shard, nodes); } - self.maybe_reconcile_shard(shard, nodes); + // Here we remove an existing observed location for the node we're removing, and it will + // not be re-added by a reconciler's completion because we filter out removed nodes in + // process_result. + // + // Note that we update the shard's observed state _after_ calling maybe_reconcile_shard: that + // means any reconciles we spawned will know about the node we're deleting, enabling them + // to do live migrations if it's still online. + shard.observed.locations.remove(&node_id); } - - // Here we remove an existing observed location for the node we're removing, and it will - // not be re-added by a reconciler's completion because we filter out removed nodes in - // process_result. 
- // - // Note that we update the shard's observed state _after_ calling maybe_reconcile_shard: that - // means any reconciles we spawned will know about the node we're deleting, enabling them - // to do live migrations if it's still online. - shard.observed.locations.remove(&node_id); } scheduler.node_remove(node_id); @@ -5498,49 +5475,51 @@ impl Service { let mut tenants_affected: usize = 0; - for (tenant_shard_id, tenant_shard) in tenants { - if let Some(observed_loc) = tenant_shard.observed.locations.get_mut(&node_id) { - // When a node goes offline, we set its observed configuration to None, indicating unknown: we will - // not assume our knowledge of the node's configuration is accurate until it comes back online - observed_loc.conf = None; - } + for (_tenant_id, mut schedule_context, shards) in + TenantShardContextIterator::new(tenants, ScheduleMode::Normal) + { + for tenant_shard in shards { + let tenant_shard_id = tenant_shard.tenant_shard_id; + if let Some(observed_loc) = + tenant_shard.observed.locations.get_mut(&node_id) + { + // When a node goes offline, we set its observed configuration to None, indicating unknown: we will + // not assume our knowledge of the node's configuration is accurate until it comes back online + observed_loc.conf = None; + } - if nodes.len() == 1 { - // Special case for single-node cluster: there is no point trying to reschedule - // any tenant shards: avoid doing so, in order to avoid spewing warnings about - // failures to schedule them. - continue; - } + if nodes.len() == 1 { + // Special case for single-node cluster: there is no point trying to reschedule + // any tenant shards: avoid doing so, in order to avoid spewing warnings about + // failures to schedule them. + continue; + } - if !nodes - .values() - .any(|n| matches!(n.may_schedule(), MaySchedule::Yes(_))) - { - // Special case for when all nodes are unavailable and/or unschedulable: there is no point - // trying to reschedule since there's nowhere else to go. 
Without this - // branch we incorrectly detach tenants in response to node unavailability. - continue; - } + if !nodes + .values() + .any(|n| matches!(n.may_schedule(), MaySchedule::Yes(_))) + { + // Special case for when all nodes are unavailable and/or unschedulable: there is no point + // trying to reschedule since there's nowhere else to go. Without this + // branch we incorrectly detach tenants in response to node unavailability. + continue; + } - if tenant_shard.intent.demote_attached(scheduler, node_id) { - tenant_shard.sequence = tenant_shard.sequence.next(); + if tenant_shard.intent.demote_attached(scheduler, node_id) { + tenant_shard.sequence = tenant_shard.sequence.next(); - // TODO: populate a ScheduleContext including all shards in the same tenant_id (only matters - // for tenants without secondary locations: if they have a secondary location, then this - // schedule() call is just promoting an existing secondary) - let mut schedule_context = ScheduleContext::default(); - - match tenant_shard.schedule(scheduler, &mut schedule_context) { - Err(e) => { - // It is possible that some tenants will become unschedulable when too many pageservers - // go offline: in this case there isn't much we can do other than make the issue observable. - // TODO: give TenantShard a scheduling error attribute to be queried later. - tracing::warn!(%tenant_shard_id, "Scheduling error when marking pageserver {} offline: {e}", node_id); - } - Ok(()) => { - if self.maybe_reconcile_shard(tenant_shard, nodes).is_some() { - tenants_affected += 1; - }; + match tenant_shard.schedule(scheduler, &mut schedule_context) { + Err(e) => { + // It is possible that some tenants will become unschedulable when too many pageservers + // go offline: in this case there isn't much we can do other than make the issue observable. + // TODO: give TenantShard a scheduling error attribute to be queried later. 
+ tracing::warn!(%tenant_shard_id, "Scheduling error when marking pageserver {} offline: {e}", node_id); + } + Ok(()) => { + if self.maybe_reconcile_shard(tenant_shard, nodes).is_some() { + tenants_affected += 1; + }; + } } } } @@ -5702,7 +5681,7 @@ impl Service { } match node_policy { - NodeSchedulingPolicy::Active | NodeSchedulingPolicy::Pause => { + NodeSchedulingPolicy::Active => { self.node_configure(node_id, None, Some(NodeSchedulingPolicy::Draining)) .await?; @@ -6011,12 +5990,24 @@ impl Service { let (nodes, tenants, _scheduler) = locked.parts_mut(); let pageservers = nodes.clone(); - let mut schedule_context = ScheduleContext::default(); + // This function is an efficient place to update lazy statistics, since we are walking + // all tenants. + let mut pending_reconciles = 0; + let mut az_violations = 0; let mut reconciles_spawned = 0; - for (tenant_shard_id, shard) in tenants.iter_mut() { - if tenant_shard_id.is_shard_zero() { - schedule_context = ScheduleContext::default(); + for shard in tenants.values_mut() { + // Accumulate scheduling statistics + if let (Some(attached), Some(preferred)) = + (shard.intent.get_attached(), shard.preferred_az()) + { + let node_az = nodes + .get(attached) + .expect("Nodes exist if referenced") + .get_availability_zone_id(); + if node_az != preferred { + az_violations += 1; + } } // Skip checking if this shard is already enqueued for reconciliation @@ -6025,6 +6016,7 @@ impl Service { // callers like reconcile_all_now do not incorrectly get the impression // that the system is in a quiescent state. reconciles_spawned = std::cmp::max(1, reconciles_spawned); + pending_reconciles += 1; continue; } @@ -6032,11 +6024,22 @@ impl Service { // dirty, spawn another rone if self.maybe_reconcile_shard(shard, &pageservers).is_some() { reconciles_spawned += 1; + } else if shard.delayed_reconcile { + // Shard wanted to reconcile but for some reason couldn't. 
+ pending_reconciles += 1; } - - schedule_context.avoid(&shard.intent.all_pageservers()); } + metrics::METRICS_REGISTRY + .metrics_group + .storage_controller_schedule_az_violation + .set(az_violations as i64); + + metrics::METRICS_REGISTRY + .metrics_group + .storage_controller_pending_reconciles + .set(pending_reconciles as i64); + reconciles_spawned } @@ -6103,95 +6106,62 @@ impl Service { } fn optimize_all_plan(&self) -> Vec<(TenantShardId, ScheduleOptimization)> { - let mut schedule_context = ScheduleContext::default(); - - let mut tenant_shards: Vec<&TenantShard> = Vec::new(); - // How many candidate optimizations we will generate, before evaluating them for readniess: setting // this higher than the execution limit gives us a chance to execute some work even if the first // few optimizations we find are not ready. const MAX_OPTIMIZATIONS_PLAN_PER_PASS: usize = 8; let mut work = Vec::new(); - let mut locked = self.inner.write().unwrap(); let (nodes, tenants, scheduler) = locked.parts_mut(); - for (tenant_shard_id, shard) in tenants.iter() { - if tenant_shard_id.is_shard_zero() { - // Reset accumulators on the first shard in a tenant - schedule_context = ScheduleContext::default(); - schedule_context.mode = ScheduleMode::Speculative; - tenant_shards.clear(); - } - if work.len() >= MAX_OPTIMIZATIONS_PLAN_PER_PASS { - break; - } - - match shard.get_scheduling_policy() { - ShardSchedulingPolicy::Active => { - // Ok to do optimization + for (_tenant_id, schedule_context, shards) in + TenantShardContextIterator::new(tenants, ScheduleMode::Speculative) + { + for shard in shards { + if work.len() >= MAX_OPTIMIZATIONS_PLAN_PER_PASS { + break; } - ShardSchedulingPolicy::Essential - | ShardSchedulingPolicy::Pause - | ShardSchedulingPolicy::Stop => { - // Policy prevents optimizing this shard. 
- continue; + match shard.get_scheduling_policy() { + ShardSchedulingPolicy::Active => { + // Ok to do optimization + } + ShardSchedulingPolicy::Essential + | ShardSchedulingPolicy::Pause + | ShardSchedulingPolicy::Stop => { + // Policy prevents optimizing this shard. + continue; + } } - } - // Accumulate the schedule context for all the shards in a tenant: we must have - // the total view of all shards before we can try to optimize any of them. - schedule_context.avoid(&shard.intent.all_pageservers()); - if let Some(attached) = shard.intent.get_attached() { - schedule_context.push_attached(*attached); - } - tenant_shards.push(shard); - - // Once we have seen the last shard in the tenant, proceed to search across all shards - // in the tenant for optimizations - if shard.shard.number.0 == shard.shard.count.count() - 1 { - if tenant_shards.iter().any(|s| s.reconciler.is_some()) { + if !matches!(shard.splitting, SplitState::Idle) + || matches!(shard.policy, PlacementPolicy::Detached) + || shard.reconciler.is_some() + { // Do not start any optimizations while another change to the tenant is ongoing: this // is not necessary for correctness, but simplifies operations and implicitly throttles // optimization changes to happen in a "trickle" over time. continue; } - if tenant_shards.iter().any(|s| { - !matches!(s.splitting, SplitState::Idle) - || matches!(s.policy, PlacementPolicy::Detached) - }) { - // Never attempt to optimize a tenant that is currently being split, or - // a tenant that is meant to be detached - continue; - } - // TODO: optimization calculations are relatively expensive: create some fast-path for // the common idle case (avoiding the search on tenants that we have recently checked) - - for shard in &tenant_shards { - if let Some(optimization) = - // If idle, maybe ptimize attachments: if a shard has a secondary location that is preferable to - // its primary location based on soft constraints, cut it over. 
- shard.optimize_attachment(nodes, &schedule_context) - { - work.push((shard.tenant_shard_id, optimization)); - break; - } else if let Some(optimization) = - // If idle, maybe optimize secondary locations: if a shard has a secondary location that would be - // better placed on another node, based on ScheduleContext, then adjust it. This - // covers cases like after a shard split, where we might have too many shards - // in the same tenant with secondary locations on the node where they originally split. - shard.optimize_secondary(scheduler, &schedule_context) - { - work.push((shard.tenant_shard_id, optimization)); - break; - } - - // TODO: extend this mechanism to prefer attaching on nodes with fewer attached - // tenants (i.e. extend schedule state to distinguish attached from secondary counts), - // for the total number of attachments on a node (not just within a tenant.) + if let Some(optimization) = + // If idle, maybe ptimize attachments: if a shard has a secondary location that is preferable to + // its primary location based on soft constraints, cut it over. + shard.optimize_attachment(nodes, &schedule_context) + { + work.push((shard.tenant_shard_id, optimization)); + break; + } else if let Some(optimization) = + // If idle, maybe optimize secondary locations: if a shard has a secondary location that would be + // better placed on another node, based on ScheduleContext, then adjust it. This + // covers cases like after a shard split, where we might have too many shards + // in the same tenant with secondary locations on the node where they originally split. 
+ shard.optimize_secondary(scheduler, &schedule_context) + { + work.push((shard.tenant_shard_id, optimization)); + break; } } } @@ -6283,6 +6253,14 @@ impl Service { > DOWNLOAD_FRESHNESS_THRESHOLD { tracing::info!("Skipping migration of {tenant_shard_id} to {node} because secondary isn't ready: {progress:?}"); + + #[cfg(feature = "testing")] + if progress.heatmap_mtime.is_none() { + // No heatmap might mean the attached location has never uploaded one, or that + // the secondary download hasn't happened yet. This is relatively unusual in the field, + // but fairly common in tests. + self.kick_secondary_download(tenant_shard_id).await; + } } else { // Location looks ready: proceed tracing::info!( @@ -6297,6 +6275,58 @@ impl Service { validated_work } + /// Some aspects of scheduling optimisation wait for secondary locations to be warm. This + /// happens on multi-minute timescales in the field, which is fine because optimisation is meant + /// to be a lazy background thing. However, when testing, it is not practical to wait around, so + /// we have this helper to move things along faster. + #[cfg(feature = "testing")] + async fn kick_secondary_download(&self, tenant_shard_id: TenantShardId) { + let (attached_node, secondary_node) = { + let locked = self.inner.read().unwrap(); + let Some(shard) = locked.tenants.get(&tenant_shard_id) else { + return; + }; + let (Some(attached), Some(secondary)) = ( + shard.intent.get_attached(), + shard.intent.get_secondary().first(), + ) else { + return; + }; + ( + locked.nodes.get(attached).unwrap().clone(), + locked.nodes.get(secondary).unwrap().clone(), + ) + }; + + // Make remote API calls to upload + download heatmaps: we ignore errors because this is just + // a 'kick' to let scheduling optimisation run more promptly. 
+ attached_node + .with_client_retries( + |client| async move { client.tenant_heatmap_upload(tenant_shard_id).await }, + &self.config.jwt_token, + 3, + 10, + SHORT_RECONCILE_TIMEOUT, + &self.cancel, + ) + .await; + + secondary_node + .with_client_retries( + |client| async move { + client + .tenant_secondary_download(tenant_shard_id, Some(Duration::from_secs(1))) + .await + }, + &self.config.jwt_token, + 3, + 10, + SHORT_RECONCILE_TIMEOUT, + &self.cancel, + ) + .await; + } + /// Look for shards which are oversized and in need of splitting async fn autosplit_tenants(self: &Arc) { let Some(split_threshold) = self.config.split_threshold else { diff --git a/storage_controller/src/service/context_iterator.rs b/storage_controller/src/service/context_iterator.rs new file mode 100644 index 0000000000..d38010a27e --- /dev/null +++ b/storage_controller/src/service/context_iterator.rs @@ -0,0 +1,139 @@ +use std::collections::BTreeMap; + +use utils::id::TenantId; +use utils::shard::TenantShardId; + +use crate::scheduler::{ScheduleContext, ScheduleMode}; +use crate::tenant_shard::TenantShard; + +/// When making scheduling decisions, it is useful to have the ScheduleContext for a whole +/// tenant while considering the individual shards within it. This iterator is a helper +/// that gathers all the shards in a tenant and then yields them together with a ScheduleContext +/// for the tenant. 
+pub(super) struct TenantShardContextIterator<'a> { + schedule_mode: ScheduleMode, + inner: std::collections::btree_map::IterMut<'a, TenantShardId, TenantShard>, +} + +impl<'a> TenantShardContextIterator<'a> { + pub(super) fn new( + tenants: &'a mut BTreeMap<TenantShardId, TenantShard>, + schedule_mode: ScheduleMode, + ) -> Self { + Self { + schedule_mode, + inner: tenants.iter_mut(), + } + } +} + +impl<'a> Iterator for TenantShardContextIterator<'a> { + type Item = (TenantId, ScheduleContext, Vec<&'a mut TenantShard>); + + fn next(&mut self) -> Option<Self::Item> { + let mut tenant_shards = Vec::new(); + let mut schedule_context = ScheduleContext::new(self.schedule_mode.clone()); + loop { + let (tenant_shard_id, shard) = self.inner.next()?; + + if tenant_shard_id.is_shard_zero() { + // Cleared on last shard of previous tenant + assert!(tenant_shards.is_empty()); + } + + // Accumulate the schedule context for all the shards in a tenant + schedule_context.avoid(&shard.intent.all_pageservers()); + if let Some(attached) = shard.intent.get_attached() { + schedule_context.push_attached(*attached); + } + tenant_shards.push(shard); + + if tenant_shard_id.shard_number.0 == tenant_shard_id.shard_count.count() - 1 { + return Some((tenant_shard_id.tenant_id, schedule_context, tenant_shards)); + } + } + } +} + +#[cfg(test)] +mod tests { + use std::{collections::BTreeMap, str::FromStr}; + + use pageserver_api::controller_api::PlacementPolicy; + use utils::shard::{ShardCount, ShardNumber}; + + use crate::{ + scheduler::test_utils::make_test_nodes, service::Scheduler, + tenant_shard::tests::make_test_tenant_with_id, + }; + + use super::*; + + #[test] + fn test_context_iterator() { + // Hand-crafted tenant IDs to ensure they appear in the expected order when put into + // a btreemap & iterated + let mut t_1_shards = make_test_tenant_with_id( + TenantId::from_str("af0480929707ee75372337efaa5ecf96").unwrap(), + PlacementPolicy::Attached(1), + ShardCount(1), + None, + ); + let t_2_shards = make_test_tenant_with_id( +
TenantId::from_str("bf0480929707ee75372337efaa5ecf96").unwrap(), + PlacementPolicy::Attached(1), + ShardCount(4), + None, + ); + let mut t_3_shards = make_test_tenant_with_id( + TenantId::from_str("cf0480929707ee75372337efaa5ecf96").unwrap(), + PlacementPolicy::Attached(1), + ShardCount(1), + None, + ); + + let t1_id = t_1_shards[0].tenant_shard_id.tenant_id; + let t2_id = t_2_shards[0].tenant_shard_id.tenant_id; + let t3_id = t_3_shards[0].tenant_shard_id.tenant_id; + + let mut tenants = BTreeMap::new(); + tenants.insert(t_1_shards[0].tenant_shard_id, t_1_shards.pop().unwrap()); + for shard in t_2_shards { + tenants.insert(shard.tenant_shard_id, shard); + } + tenants.insert(t_3_shards[0].tenant_shard_id, t_3_shards.pop().unwrap()); + + let nodes = make_test_nodes(3, &[]); + let mut scheduler = Scheduler::new(nodes.values()); + let mut context = ScheduleContext::default(); + for shard in tenants.values_mut() { + shard.schedule(&mut scheduler, &mut context).unwrap(); + } + + let mut iter = TenantShardContextIterator::new(&mut tenants, ScheduleMode::Speculative); + let (tenant_id, context, shards) = iter.next().unwrap(); + assert_eq!(tenant_id, t1_id); + assert_eq!(shards[0].tenant_shard_id.shard_number, ShardNumber(0)); + assert_eq!(shards.len(), 1); + assert_eq!(context.attach_count(), 1); + + let (tenant_id, context, shards) = iter.next().unwrap(); + assert_eq!(tenant_id, t2_id); + assert_eq!(shards[0].tenant_shard_id.shard_number, ShardNumber(0)); + assert_eq!(shards[1].tenant_shard_id.shard_number, ShardNumber(1)); + assert_eq!(shards[2].tenant_shard_id.shard_number, ShardNumber(2)); + assert_eq!(shards[3].tenant_shard_id.shard_number, ShardNumber(3)); + assert_eq!(shards.len(), 4); + assert_eq!(context.attach_count(), 4); + + let (tenant_id, context, shards) = iter.next().unwrap(); + assert_eq!(tenant_id, t3_id); + assert_eq!(shards[0].tenant_shard_id.shard_number, ShardNumber(0)); + assert_eq!(shards.len(), 1); + assert_eq!(context.attach_count(), 1); + + for 
shard in tenants.values_mut() { + shard.intent.clear(&mut scheduler); + } + } +} diff --git a/storage_controller/src/tenant_shard.rs b/storage_controller/src/tenant_shard.rs index 27c97d3b86..2eb98ee825 100644 --- a/storage_controller/src/tenant_shard.rs +++ b/storage_controller/src/tenant_shard.rs @@ -1574,13 +1574,20 @@ pub(crate) mod tests { ) } - fn make_test_tenant( + pub(crate) fn make_test_tenant( policy: PlacementPolicy, shard_count: ShardCount, preferred_az: Option, ) -> Vec { - let tenant_id = TenantId::generate(); + make_test_tenant_with_id(TenantId::generate(), policy, shard_count, preferred_az) + } + pub(crate) fn make_test_tenant_with_id( + tenant_id: TenantId, + policy: PlacementPolicy, + shard_count: ShardCount, + preferred_az: Option, + ) -> Vec { (0..shard_count.count()) .map(|i| { let shard_number = ShardNumber(i); diff --git a/storage_scrubber/src/checks.rs b/storage_scrubber/src/checks.rs index 8d855d263c..1b4ff01a17 100644 --- a/storage_scrubber/src/checks.rs +++ b/storage_scrubber/src/checks.rs @@ -4,17 +4,21 @@ use itertools::Itertools; use pageserver::tenant::checks::check_valid_layermap; use pageserver::tenant::layer_map::LayerMap; use pageserver::tenant::remote_timeline_client::index::LayerFileMetadata; +use pageserver::tenant::remote_timeline_client::manifest::TenantManifest; use pageserver_api::shard::ShardIndex; use tokio_util::sync::CancellationToken; use tracing::{info, warn}; use utils::generation::Generation; use utils::id::TimelineId; +use utils::shard::TenantShardId; use crate::cloud_admin_api::BranchData; use crate::metadata_stream::stream_listing; use crate::{download_object_with_retries, RootTarget, TenantShardTimelineId}; use futures_util::StreamExt; -use pageserver::tenant::remote_timeline_client::{parse_remote_index_path, remote_layer_path}; +use pageserver::tenant::remote_timeline_client::{ + parse_remote_index_path, parse_remote_tenant_manifest_path, remote_layer_path, +}; use pageserver::tenant::storage_layer::LayerName; 
use pageserver::tenant::IndexPart; use remote_storage::{GenericRemoteStorage, ListingObject, RemotePath}; @@ -527,3 +531,132 @@ async fn list_timeline_blobs_impl( unknown_keys, })) } + +pub(crate) struct RemoteTenantManifestInfo { + pub(crate) latest_generation: Option<Generation>, + pub(crate) manifests: Vec<(Generation, ListingObject)>, +} + +pub(crate) enum ListTenantManifestResult { + WithErrors { + errors: Vec<(String, String)>, + #[allow(dead_code)] + unknown_keys: Vec<ListingObject>, + }, + NoErrors(RemoteTenantManifestInfo), +} + +/// Lists the tenant manifests in remote storage and parses the latest one, returning a [`ListTenantManifestResult`] object. +pub(crate) async fn list_tenant_manifests( + remote_client: &GenericRemoteStorage, + tenant_id: TenantShardId, + root_target: &RootTarget, +) -> anyhow::Result<ListTenantManifestResult> { + let mut errors = Vec::new(); + let mut unknown_keys = Vec::new(); + + let mut tenant_root_target = root_target.tenant_root(&tenant_id); + let original_prefix = tenant_root_target.prefix_in_bucket.clone(); + const TENANT_MANIFEST_STEM: &str = "tenant-manifest"; + tenant_root_target.prefix_in_bucket += TENANT_MANIFEST_STEM; + tenant_root_target.delimiter = String::new(); + + let mut manifests: Vec<(Generation, ListingObject)> = Vec::new(); + + let prefix_str = &original_prefix + .strip_prefix("/") + .unwrap_or(&original_prefix); + + let mut stream = std::pin::pin!(stream_listing(remote_client, &tenant_root_target)); + 'outer: while let Some(obj) = stream.next().await { + let (key, Some(obj)) = obj? else { + panic!("ListingObject not specified"); + }; + + 'err: { + // TODO a let chain would be nicer here.
+ let Some(name) = key.object_name() else { + break 'err; + }; + if !name.starts_with(TENANT_MANIFEST_STEM) { + break 'err; + } + let Some(generation) = parse_remote_tenant_manifest_path(key.clone()) else { + break 'err; + }; + tracing::debug!("tenant manifest {key}"); + manifests.push((generation, obj)); + continue 'outer; + } + tracing::info!("Listed an unknown key: {key}"); + unknown_keys.push(obj); + } + + if manifests.is_empty() { + tracing::debug!("No manifest for timeline."); + + return Ok(ListTenantManifestResult::WithErrors { + errors, + unknown_keys, + }); + } + if !unknown_keys.is_empty() { + errors.push(((*prefix_str).to_owned(), "unknown keys listed".to_string())); + + return Ok(ListTenantManifestResult::WithErrors { + errors, + unknown_keys, + }); + } + + // Find the manifest with the highest generation + let (latest_generation, latest_listing_object) = manifests + .iter() + .max_by_key(|i| i.0) + .map(|(g, obj)| (*g, obj.clone())) + .unwrap(); + + let manifest_bytes = + match download_object_with_retries(remote_client, &latest_listing_object.key).await { + Ok(bytes) => bytes, + Err(e) => { + // It is possible that the tenant gets deleted in-between we list the objects + // and we download the manifest file. 
+ errors.push(( + latest_listing_object.key.get_path().as_str().to_owned(), + format!("failed to download tenant-manifest.json: {e}"), + )); + return Ok(ListTenantManifestResult::WithErrors { + errors, + unknown_keys, + }); + } + }; + + match TenantManifest::from_json_bytes(&manifest_bytes) { + Ok(_manifest) => { + return Ok(ListTenantManifestResult::NoErrors( + RemoteTenantManifestInfo { + latest_generation: Some(latest_generation), + manifests, + }, + )); + } + Err(parse_error) => errors.push(( + latest_listing_object.key.get_path().as_str().to_owned(), + format!("tenant-manifest.json body parsing error: {parse_error}"), + )), + } + + if errors.is_empty() { + errors.push(( + (*prefix_str).to_owned(), + "Unexpected: no errors did not lead to a successfully parsed blob return".to_string(), + )); + } + + Ok(ListTenantManifestResult::WithErrors { + errors, + unknown_keys, + }) +} diff --git a/storage_scrubber/src/cloud_admin_api.rs b/storage_scrubber/src/cloud_admin_api.rs index c9a62cd256..b1dfe3a53f 100644 --- a/storage_scrubber/src/cloud_admin_api.rs +++ b/storage_scrubber/src/cloud_admin_api.rs @@ -1,3 +1,5 @@ +use std::error::Error as _; + use chrono::{DateTime, Utc}; use futures::Future; use hex::FromHex; @@ -30,14 +32,18 @@ impl std::fmt::Display for Error { match &self.kind { ErrorKind::RequestSend(e) => write!( f, - "Failed to send a request. Context: {}, error: {}", - self.context, e + "Failed to send a request. Context: {}, error: {}{}", + self.context, + e, + e.source().map(|e| format!(": {e}")).unwrap_or_default() ), ErrorKind::BodyRead(e) => { write!( f, - "Failed to read a request body. Context: {}, error: {}", - self.context, e + "Failed to read a request body. 
Context: {}, error: {}{}", + self.context, + e, + e.source().map(|e| format!(": {e}")).unwrap_or_default() ) } ErrorKind::ResponseStatus(status) => { diff --git a/storage_scrubber/src/pageserver_physical_gc.rs b/storage_scrubber/src/pageserver_physical_gc.rs index 1e69ddbf15..20cb9c3633 100644 --- a/storage_scrubber/src/pageserver_physical_gc.rs +++ b/storage_scrubber/src/pageserver_physical_gc.rs @@ -2,12 +2,16 @@ use std::collections::{BTreeMap, BTreeSet, HashMap}; use std::sync::Arc; use std::time::Duration; -use crate::checks::{list_timeline_blobs, BlobDataParseResult}; +use crate::checks::{ + list_tenant_manifests, list_timeline_blobs, BlobDataParseResult, ListTenantManifestResult, +}; use crate::metadata_stream::{stream_tenant_timelines, stream_tenants}; use crate::{init_remote, BucketConfig, NodeKind, RootTarget, TenantShardTimelineId, MAX_RETRIES}; use futures_util::{StreamExt, TryStreamExt}; use pageserver::tenant::remote_timeline_client::index::LayerFileMetadata; -use pageserver::tenant::remote_timeline_client::{parse_remote_index_path, remote_layer_path}; +use pageserver::tenant::remote_timeline_client::{ + parse_remote_index_path, parse_remote_tenant_manifest_path, remote_layer_path, +}; use pageserver::tenant::storage_layer::LayerName; use pageserver::tenant::IndexPart; use pageserver_api::controller_api::TenantDescribeResponse; @@ -25,6 +29,7 @@ use utils::id::{TenantId, TenantTimelineId}; #[derive(Serialize, Default)] pub struct GcSummary { indices_deleted: usize, + tenant_manifests_deleted: usize, remote_storage_errors: usize, controller_api_errors: usize, ancestor_layers_deleted: usize, @@ -34,12 +39,14 @@ impl GcSummary { fn merge(&mut self, other: Self) { let Self { indices_deleted, + tenant_manifests_deleted, remote_storage_errors, ancestor_layers_deleted, controller_api_errors, } = other; self.indices_deleted += indices_deleted; + self.tenant_manifests_deleted += tenant_manifests_deleted; self.remote_storage_errors += remote_storage_errors; 
self.ancestor_layers_deleted += ancestor_layers_deleted; self.controller_api_errors += controller_api_errors; @@ -352,6 +359,69 @@ async fn maybe_delete_index( } } +async fn maybe_delete_tenant_manifest( + remote_client: &GenericRemoteStorage, + min_age: &Duration, + latest_gen: Generation, + obj: &ListingObject, + mode: GcMode, + summary: &mut GcSummary, +) { + // Validation: we will only delete things that parse cleanly + let basename = obj.key.get_path().file_name().unwrap(); + let Some(candidate_generation) = + parse_remote_tenant_manifest_path(RemotePath::from_string(basename).unwrap()) + else { + // A strange key: we will not delete this because we don't understand it. + tracing::warn!("Bad index key"); + return; + }; + + // Validation: we will only delete manifests more than one generation old, and in fact we + // should never be called with such recent generations. + if candidate_generation >= latest_gen { + tracing::warn!("Deletion candidate is >= latest generation, this is a bug!"); + return; + } else if candidate_generation.next() == latest_gen { + tracing::warn!("Deletion candidate is >= latest generation - 1, this is a bug!"); + return; + } + + if !is_old_enough(min_age, obj, summary) { + return; + } + + if matches!(mode, GcMode::DryRun) { + tracing::info!("Dry run: would delete this key"); + return; + } + + // All validations passed: erase the object + let cancel = CancellationToken::new(); + match backoff::retry( + || remote_client.delete(&obj.key, &cancel), + |_| false, + 3, + MAX_RETRIES as u32, + "maybe_delete_tenant_manifest", + &cancel, + ) + .await + { + None => { + unreachable!("Using a dummy cancellation token"); + } + Some(Ok(_)) => { + tracing::info!("Successfully deleted tenant manifest"); + summary.tenant_manifests_deleted += 1; + } + Some(Err(e)) => { + tracing::warn!("Failed to delete tenant manifest: {e}"); + summary.remote_storage_errors += 1; + } + } +} + #[allow(clippy::too_many_arguments)] async fn gc_ancestor( remote_client: 
&GenericRemoteStorage, @@ -451,13 +521,100 @@ async fn gc_ancestor( Ok(()) } +async fn gc_tenant_manifests( + remote_client: &GenericRemoteStorage, + min_age: Duration, + target: &RootTarget, + mode: GcMode, + tenant_shard_id: TenantShardId, +) -> anyhow::Result<GcSummary> { + let mut gc_summary = GcSummary::default(); + match list_tenant_manifests(remote_client, tenant_shard_id, target).await? { + ListTenantManifestResult::WithErrors { + errors, + unknown_keys: _, + } => { + for (_key, error) in errors { + tracing::warn!(%tenant_shard_id, "list_tenant_manifests: {error}"); + } + } + ListTenantManifestResult::NoErrors(mut manifest_info) => { + let Some(latest_gen) = manifest_info.latest_generation else { + return Ok(gc_summary); + }; + manifest_info + .manifests + .sort_by_key(|(generation, _obj)| *generation); + // skip the two latest generations (they don't necessarily have to be 1 apart from each other) + let candidates = manifest_info.manifests.iter().rev().skip(2); + for (_generation, key) in candidates { + maybe_delete_tenant_manifest( + remote_client, + &min_age, + latest_gen, + key, + mode, + &mut gc_summary, + ) + .instrument( + info_span!("maybe_delete_tenant_manifest", %tenant_shard_id, ?latest_gen, %key.key), + ) + .await; + } + } + } + Ok(gc_summary) +} + +async fn gc_timeline( + remote_client: &GenericRemoteStorage, + min_age: &Duration, + target: &RootTarget, + mode: GcMode, + ttid: TenantShardTimelineId, + accumulator: &Arc<std::sync::Mutex<TenantRefAccumulator>>, +) -> anyhow::Result<GcSummary> { + let mut summary = GcSummary::default(); + let data = list_timeline_blobs(remote_client, ttid, target).await?; + + let (index_part, latest_gen, candidates) = match &data.blob_data { + BlobDataParseResult::Parsed { + index_part, + index_part_generation, + s3_layers: _s3_layers, + } => (index_part, *index_part_generation, data.unused_index_keys), + BlobDataParseResult::Relic => { + // Post-deletion tenant location: don't try and GC it.
+ return Ok(summary); + } + BlobDataParseResult::Incorrect { + errors, + s3_layers: _, + } => { + // Our primary purpose isn't to report on bad data, but log this rather than skipping silently + tracing::warn!("Skipping timeline {ttid}, bad metadata: {errors:?}"); + return Ok(summary); + } + }; + + accumulator.lock().unwrap().update(ttid, index_part); + + for key in candidates { + maybe_delete_index(remote_client, min_age, latest_gen, &key, mode, &mut summary) + .instrument(info_span!("maybe_delete_index", %ttid, ?latest_gen, %key.key)) + .await; + } + + Ok(summary) +} + /// Physical garbage collection: removing unused S3 objects. /// /// This is distinct from the garbage collection done inside the pageserver, which operates at a higher level /// (keys, layers). This type of garbage collection is about removing: /// - Objects that were uploaded but never referenced in the remote index (e.g. because of a shutdown between /// uploading a layer and uploading an index) -/// - Index objects from historic generations +/// - Index objects and tenant manifests from historic generations /// /// This type of GC is not necessary for correctness: rather it serves to reduce wasted storage capacity, and /// make sure that object listings don't get slowed down by large numbers of garbage objects. 
@@ -470,6 +627,7 @@ pub async fn pageserver_physical_gc( ) -> anyhow::Result { let (remote_client, target) = init_remote(bucket_config.clone(), NodeKind::Pageserver).await?; + let remote_client = Arc::new(remote_client); let tenants = if tenant_shard_ids.is_empty() { futures::future::Either::Left(stream_tenants(&remote_client, &target)) } else { @@ -484,59 +642,59 @@ pub async fn pageserver_physical_gc( let accumulator = Arc::new(std::sync::Mutex::new(TenantRefAccumulator::default())); // Generate a stream of TenantTimelineId - let timelines = tenants.map_ok(|t| stream_tenant_timelines(&remote_client, &target, t)); - let timelines = timelines.try_buffered(CONCURRENCY); - let timelines = timelines.try_flatten(); - - // Generate a stream of S3TimelineBlobData - async fn gc_timeline( - remote_client: &GenericRemoteStorage, - min_age: &Duration, - target: &RootTarget, - mode: GcMode, - ttid: TenantShardTimelineId, - accumulator: &Arc>, - ) -> anyhow::Result { - let mut summary = GcSummary::default(); - let data = list_timeline_blobs(remote_client, ttid, target).await?; - - let (index_part, latest_gen, candidates) = match &data.blob_data { - BlobDataParseResult::Parsed { - index_part, - index_part_generation, - s3_layers: _s3_layers, - } => (index_part, *index_part_generation, data.unused_index_keys), - BlobDataParseResult::Relic => { - // Post-deletion tenant location: don't try and GC it. 
- return Ok(summary); - } - BlobDataParseResult::Incorrect { - errors, - s3_layers: _, - } => { - // Our primary purpose isn't to report on bad data, but log this rather than skipping silently - tracing::warn!("Skipping timeline {ttid}, bad metadata: {errors:?}"); - return Ok(summary); - } - }; - - accumulator.lock().unwrap().update(ttid, index_part); - - for key in candidates { - maybe_delete_index(remote_client, min_age, latest_gen, &key, mode, &mut summary) - .instrument(info_span!("maybe_delete_index", %ttid, ?latest_gen, %key.key)) - .await; - } - - Ok(summary) + enum GcSummaryOrContent<T> { + Content(T), + GcSummary(GcSummary), } + let timelines = tenants.map_ok(|tenant_shard_id| { + let target_ref = &target; + let remote_client_ref = &remote_client; + async move { + let summaries_from_manifests = match gc_tenant_manifests( + remote_client_ref, + min_age, + target_ref, + mode, + tenant_shard_id, + ) + .await + { + Ok(gc_summary) => vec![Ok(GcSummaryOrContent::<TenantShardTimelineId>::GcSummary( + gc_summary, + ))], + Err(e) => { + tracing::warn!(%tenant_shard_id, "Error in gc_tenant_manifests: {e}"); + Vec::new() + } + }; + stream_tenant_timelines(remote_client_ref, target_ref, tenant_shard_id) + .await + .map(|stream| { + stream + .map_ok(GcSummaryOrContent::Content) + .chain(futures::stream::iter(summaries_from_manifests.into_iter())) + }) + } + }); + let timelines = std::pin::pin!(timelines.try_buffered(CONCURRENCY)); + let timelines = timelines.try_flatten(); let mut summary = GcSummary::default(); // Drain futures for per-shard GC, populating accumulator as a side effect { - let timelines = timelines.map_ok(|ttid| { - gc_timeline(&remote_client, &min_age, &target, mode, ttid, &accumulator) + let timelines = timelines.map_ok(|summary_or_ttid| match summary_or_ttid { + GcSummaryOrContent::Content(ttid) => futures::future::Either::Left(gc_timeline( + &remote_client, + &min_age, + &target, + mode, + ttid, + &accumulator, + )), + GcSummaryOrContent::GcSummary(gc_summary) => { +
futures::future::Either::Right(futures::future::ok(gc_summary)) + } }); let mut timelines = std::pin::pin!(timelines.try_buffered(CONCURRENCY)); diff --git a/test_runner/conftest.py b/test_runner/conftest.py index 84eda52d33..887bfef478 100644 --- a/test_runner/conftest.py +++ b/test_runner/conftest.py @@ -13,5 +13,5 @@ pytest_plugins = ( "fixtures.pg_stats", "fixtures.compare_fixtures", "fixtures.slow", - "fixtures.flaky", + "fixtures.reruns", ) diff --git a/test_runner/fixtures/benchmark_fixture.py b/test_runner/fixtures/benchmark_fixture.py index bb8e75902e..fa3747c08f 100644 --- a/test_runner/fixtures/benchmark_fixture.py +++ b/test_runner/fixtures/benchmark_fixture.py @@ -266,6 +266,16 @@ class NeonBenchmarker: name = f"{self.PROPERTY_PREFIX}_{metric_name}" if labels is None: labels = {} + + # Sometimes mypy can't catch non-numeric values, + # so adding a check here + try: + float(metric_value) + except ValueError as e: + raise ValueError( + f"`metric_value` (`{metric_value}`) must be a NUMERIC-friendly data type" + ) from e + self.property_recorder( name, { diff --git a/test_runner/fixtures/endpoint/http.py b/test_runner/fixtures/endpoint/http.py index db3723b7cc..1cd9158c68 100644 --- a/test_runner/fixtures/endpoint/http.py +++ b/test_runner/fixtures/endpoint/http.py @@ -1,5 +1,7 @@ from __future__ import annotations +import urllib.parse + import requests from requests.adapters import HTTPAdapter @@ -20,7 +22,9 @@ class EndpointHttpClient(requests.Session): return res.json() def database_schema(self, database: str): - res = self.get(f"http://localhost:{self.port}/database_schema?database={database}") + res = self.get( + f"http://localhost:{self.port}/database_schema?database={urllib.parse.quote(database, safe='')}" + ) res.raise_for_status() return res.text diff --git a/test_runner/fixtures/flaky.py b/test_runner/fixtures/flaky.py deleted file mode 100644 index 01634a29c5..0000000000 --- a/test_runner/fixtures/flaky.py +++ /dev/null @@ -1,78 +0,0 @@ -from 
__future__ import annotations - -import json -from collections.abc import MutableMapping -from pathlib import Path -from typing import TYPE_CHECKING, cast - -import pytest -from _pytest.config import Config -from _pytest.config.argparsing import Parser -from allure_commons.types import LabelType -from allure_pytest.utils import allure_name, allure_suite_labels - -from fixtures.log_helper import log - -if TYPE_CHECKING: - from collections.abc import MutableMapping - from typing import Any - - -""" -The plugin reruns flaky tests. -It uses `pytest.mark.flaky` provided by `pytest-rerunfailures` plugin and flaky tests detected by `scripts/flaky_tests.py` - -Note: the logic of getting flaky tests is extracted to a separate script to avoid running it for each of N xdist workers -""" - - -def pytest_addoption(parser: Parser): - parser.addoption( - "--flaky-tests-json", - action="store", - type=Path, - help="Path to json file with flaky tests generated by scripts/flaky_tests.py", - ) - - -def pytest_collection_modifyitems(config: Config, items: list[pytest.Item]): - if not config.getoption("--flaky-tests-json"): - return - - # Any error with getting flaky tests aren't critical, so just do not rerun any tests - flaky_json = config.getoption("--flaky-tests-json") - if not flaky_json.exists(): - return - - content = flaky_json.read_text() - try: - flaky_tests = json.loads(content) - except ValueError: - log.error(f"Can't parse {content} as json") - return - - for item in items: - # Use the same logic for constructing test name as Allure does (we store allure-provided data in DB) - # Ref https://github.com/allure-framework/allure-python/blob/2.13.1/allure-pytest/src/listener.py#L98-L100 - allure_labels = dict(allure_suite_labels(item)) - parent_suite = str(allure_labels.get(LabelType.PARENT_SUITE)) - suite = str(allure_labels.get(LabelType.SUITE)) - params = item.callspec.params if hasattr(item, "callspec") else {} - name = allure_name(item, params) - - if 
flaky_tests.get(parent_suite, {}).get(suite, {}).get(name, False): - # Rerun 3 times = 1 original run + 2 reruns - log.info(f"Marking {item.nodeid} as flaky. It will be rerun up to 3 times") - item.add_marker(pytest.mark.flaky(reruns=2)) - - # pytest-rerunfailures is not compatible with pytest-timeout (timeout is not set for reruns), - # we can workaround it by setting `timeout_func_only` to True[1]. - # Unfortunately, setting `timeout_func_only = True` globally in pytest.ini is broken[2], - # but we still can do it using pytest marker. - # - # - [1] https://github.com/pytest-dev/pytest-rerunfailures/issues/99 - # - [2] https://github.com/pytest-dev/pytest-timeout/issues/142 - timeout_marker = item.get_closest_marker("timeout") - if timeout_marker is not None: - kwargs = cast("MutableMapping[str, Any]", timeout_marker.kwargs) - kwargs["func_only"] = True diff --git a/test_runner/fixtures/metrics.py b/test_runner/fixtures/metrics.py index 3f90c233a6..ffdbd988a5 100644 --- a/test_runner/fixtures/metrics.py +++ b/test_runner/fixtures/metrics.py @@ -173,6 +173,7 @@ PAGESERVER_PER_TENANT_METRICS: tuple[str, ...] 
= ( counter("pageserver_tenant_throttling_count_accounted_finish"), counter("pageserver_tenant_throttling_wait_usecs_sum"), counter("pageserver_tenant_throttling_count"), + *histogram("pageserver_page_service_batch_size"), *PAGESERVER_PER_TENANT_REMOTE_TIMELINE_CLIENT_METRICS, # "pageserver_directory_entries_count", -- only used if above a certain threshold # "pageserver_broken_tenants_count" -- used only for broken diff --git a/test_runner/fixtures/neon_fixtures.py b/test_runner/fixtures/neon_fixtures.py index a45a311dc2..60c4a23936 100644 --- a/test_runner/fixtures/neon_fixtures.py +++ b/test_runner/fixtures/neon_fixtures.py @@ -269,7 +269,7 @@ class PgProtocol: for match in re.finditer(r"-c(\w*)=(\w*)", options): key = match.group(1) val = match.group(2) - if "server_options" in conn_options: + if "server_settings" in conn_options: conn_options["server_settings"].update({key: val}) else: conn_options["server_settings"] = {key: val} @@ -1095,6 +1095,17 @@ class NeonEnv: # the pageserver taking a long time to start up due to syncfs flushing other tests' data "no_sync": True, } + + # Batching (https://github.com/neondatabase/neon/issues/9377): + # enable batching by default in tests and benchmarks. + # Compat tests are exempt because old versions fail to parse the new config. 
+ if not config.compatibility_neon_binpath: + ps_cfg["page_service_pipelining"] = { + "mode": "pipelined", + "execution": "concurrent-futures", + "max_batch_size": 32, + } + if self.pageserver_virtual_file_io_engine is not None: ps_cfg["virtual_file_io_engine"] = self.pageserver_virtual_file_io_engine if config.pageserver_default_tenant_config_compaction_algorithm is not None: @@ -1736,7 +1747,7 @@ class NeonStorageController(MetricsGetter, LogUtils): def storage_controller_ready(): assert self.ready() is True - wait_until(30, 1, storage_controller_ready) + wait_until(storage_controller_ready) return time.time() - t1 def attach_hook_issue( @@ -2574,7 +2585,7 @@ class NeonPageserver(PgProtocol, LogUtils): log.info(f"any_unstable={any_unstable}") assert not any_unstable - wait_until(20, 0.5, complete) + wait_until(complete) def __enter__(self) -> Self: return self @@ -3801,12 +3812,11 @@ class Endpoint(PgProtocol, LogUtils): assert size_to_bytes(size) >= size_to_bytes( "1MB" ), "LFC size cannot be set less than 1MB" - # shared_buffers = 512kB to make postgres use LFC intensively - # neon.max_file_cache_size and neon.file_cache size limit are - # set to 1MB because small LFC is better for testing (helps to find more problems) + lfc_path_escaped = str(lfc_path).replace("'", "''") config_lines = [ - "shared_buffers = 512kB", - f"neon.file_cache_path = '{self.lfc_path()}'", + f"neon.file_cache_path = '{lfc_path_escaped}'", + # neon.max_file_cache_size and neon.file_cache size limits are + # set to 1MB because small LFC is better for testing (helps to find more problems) "neon.max_file_cache_size = 1MB", "neon.file_cache_size_limit = 1MB", ] + config_lines @@ -3934,6 +3944,35 @@ class Endpoint(PgProtocol, LogUtils): log.info(json.dumps(dict(data_dict, **kwargs))) json.dump(dict(data_dict, **kwargs), file, indent=4) + def respec_deep(self, **kwargs: Any) -> None: + """ + Update the endpoint.json file taking into account nested keys. + It does one level deep update. 
Should be enough for most cases. + This is a distinct method from respec() so as not to break existing functionality. + NOTE: This method also updates the spec.json file, not endpoint.json. + We need it because neon_local also writes to spec.json, so intended + use-case is i) start endpoint with some config, ii) respec_deep(), + iii) call reconfigure() to apply the changes. + """ + config_path = os.path.join(self.endpoint_path(), "spec.json") + with open(config_path) as f: + data_dict: dict[str, Any] = json.load(f) + + log.info("Current compute spec: %s", json.dumps(data_dict, indent=4)) + + for key, value in kwargs.items(): + if isinstance(value, dict): + if key not in data_dict: + data_dict[key] = value + else: + data_dict[key] = {**data_dict[key], **value} + else: + data_dict[key] = value + + with open(config_path, "w") as file: + log.info("Updating compute spec to: %s", json.dumps(data_dict, indent=4)) + json.dump(data_dict, file, indent=4) + # Please note: Migrations only run if pg_skip_catalog_updates is false def wait_for_migrations(self, num_migrations: int = 11): with self.cursor() as cur: @@ -3943,7 +3982,7 @@ class Endpoint(PgProtocol, LogUtils): migration_id: int = cur.fetchall()[0][0] assert migration_id >= num_migrations - wait_until(20, 0.5, check_migrations_done) + wait_until(check_migrations_done) # Mock the extension part of spec passed from control plane for local testing # endpooint.rs adds content of this file as a part of the spec.json @@ -4375,6 +4414,10 @@ class Safekeeper(LogUtils): log.info(f"sk {self.id} flush LSN: {flush_lsn}") return flush_lsn + def get_commit_lsn(self, tenant_id: TenantId, timeline_id: TimelineId) -> Lsn: + timeline_status = self.http_client().timeline_status(tenant_id, timeline_id) + return timeline_status.commit_lsn + def pull_timeline( self, srcs: list[Safekeeper], tenant_id: TenantId, timeline_id: TimelineId ) -> dict[str, Any]: @@ -4455,12 +4498,10 @@ class Safekeeper(LogUtils): ) assert stat.remote_consistent_lsn >= lsn and
stat.backup_lsn >= lsn.segment_lsn() - # xxx: max wait is long because we might be waiting for reconnection from - # pageserver to this safekeeper - wait_until(30, 1, are_lsns_advanced) + wait_until(are_lsns_advanced) client.checkpoint(tenant_id, timeline_id) if wait_wal_removal: - wait_until(30, 1, are_segments_removed) + wait_until(are_segments_removed) def wait_until_paused(self, failpoint: str): msg = f"at failpoint {failpoint}" @@ -4469,7 +4510,7 @@ class Safekeeper(LogUtils): log.info(f"waiting for hitting failpoint {failpoint}") self.assert_log_contains(msg) - wait_until(20, 0.5, paused) + wait_until(paused) class NeonBroker(LogUtils): @@ -4920,6 +4961,33 @@ def wait_for_last_flush_lsn( return min(results) +def wait_for_commit_lsn( + env: NeonEnv, + tenant: TenantId, + timeline: TimelineId, + lsn: Lsn, +) -> Lsn: + # TODO: it would be better to poll this in the compute, but there's no API for it. See: + # https://github.com/neondatabase/neon/issues/9758 + "Wait for the given LSN to be committed on any Safekeeper" + + max_commit_lsn = Lsn(0) + for i in range(1000): + for sk in env.safekeepers: + commit_lsn = sk.get_commit_lsn(tenant, timeline) + if commit_lsn >= lsn: + log.info(f"{tenant}/{timeline} at commit_lsn {commit_lsn}") + return commit_lsn + max_commit_lsn = max(max_commit_lsn, commit_lsn) + + if i % 10 == 0: + log.info( + f"{tenant}/{timeline} waiting for commit_lsn to reach {lsn}, now {max_commit_lsn}" + ) + time.sleep(0.1) + raise Exception(f"timed out while waiting for commit_lsn to reach {lsn}, was {max_commit_lsn}") + + def flush_ep_to_pageserver( env: NeonEnv, ep: Endpoint, diff --git a/test_runner/fixtures/pageserver/utils.py b/test_runner/fixtures/pageserver/utils.py index 46700e3fe3..7c10edc5fc 100644 --- a/test_runner/fixtures/pageserver/utils.py +++ b/test_runner/fixtures/pageserver/utils.py @@ -13,7 +13,7 @@ from mypy_boto3_s3.type_defs import ( from fixtures.common_types import Lsn, TenantId, TenantShardId, TimelineId from 
fixtures.log_helper import log from fixtures.pageserver.http import PageserverApiException, PageserverHttpClient -from fixtures.remote_storage import RemoteStorage, RemoteStorageKind, S3Storage +from fixtures.remote_storage import RemoteStorage, S3Storage from fixtures.utils import wait_until if TYPE_CHECKING: @@ -269,12 +269,7 @@ def wait_timeline_detail_404( pageserver_http: PageserverHttpClient, tenant_id: TenantId | TenantShardId, timeline_id: TimelineId, - iterations: int, - interval: float | None = None, ): - if interval is None: - interval = 0.25 - def timeline_is_missing(): data = {} try: @@ -287,19 +282,17 @@ def wait_timeline_detail_404( raise RuntimeError(f"Timeline exists state {data.get('state')}") - wait_until(iterations, interval, func=timeline_is_missing) + wait_until(timeline_is_missing) def timeline_delete_wait_completed( pageserver_http: PageserverHttpClient, tenant_id: TenantId | TenantShardId, timeline_id: TimelineId, - iterations: int = 20, - interval: float | None = None, **delete_args, ) -> None: pageserver_http.timeline_delete(tenant_id=tenant_id, timeline_id=timeline_id, **delete_args) - wait_timeline_detail_404(pageserver_http, tenant_id, timeline_id, iterations, interval) + wait_timeline_detail_404(pageserver_http, tenant_id, timeline_id) # remote_storage must not be None, but that's easier for callers to make mypy happy @@ -453,7 +446,3 @@ def many_small_layers_tenant_config() -> dict[str, Any]: "checkpoint_distance": 1024**2, "image_creation_threshold": 100, } - - -def poll_for_remote_storage_iterations(remote_storage_kind: RemoteStorageKind) -> int: - return 40 if remote_storage_kind is RemoteStorageKind.REAL_S3 else 15 diff --git a/test_runner/fixtures/paths.py b/test_runner/fixtures/paths.py index 1c71abea19..80777d65e9 100644 --- a/test_runner/fixtures/paths.py +++ b/test_runner/fixtures/paths.py @@ -30,7 +30,7 @@ def get_test_dir(request: FixtureRequest, top_output_dir: Path, prefix: str | No test_name = request.node.name test_dir 
= top_output_dir / f"{prefix or ''}{test_name.replace('/', '-')}" - # We rerun flaky tests multiple times, use a separate directory for each run. + # We rerun failed tests multiple times, use a separate directory for each run. if (suffix := getattr(request.node, "execution_count", None)) is not None: test_dir = test_dir.parent / f"{test_dir.name}-{suffix}" diff --git a/test_runner/fixtures/reruns.py b/test_runner/fixtures/reruns.py new file mode 100644 index 0000000000..f2a25ae8f6 --- /dev/null +++ b/test_runner/fixtures/reruns.py @@ -0,0 +1,31 @@ +from __future__ import annotations + +from collections.abc import MutableMapping +from typing import TYPE_CHECKING, cast + +import pytest + +if TYPE_CHECKING: + from collections.abc import MutableMapping + from typing import Any + + from _pytest.config import Config + + +def pytest_collection_modifyitems(config: Config, items: list[pytest.Item]): + # pytest-rerunfailures is not compatible with pytest-timeout (timeout is not set for reruns), + # we can workaround it by setting `timeout_func_only` to True[1]. + # Unfortunately, setting `timeout_func_only = True` globally in pytest.ini is broken[2], + # but we still can do it using pytest marker. 
+ # + # - [1] https://github.com/pytest-dev/pytest-rerunfailures/issues/99 + # - [2] https://github.com/pytest-dev/pytest-timeout/issues/142 + + if not config.getoption("--reruns"): + return + + for item in items: + timeout_marker = item.get_closest_marker("timeout") + if timeout_marker is not None: + kwargs = cast("MutableMapping[str, Any]", timeout_marker.kwargs) + kwargs["func_only"] = True diff --git a/test_runner/fixtures/safekeeper/http.py b/test_runner/fixtures/safekeeper/http.py index 094188c0b5..286f80ba69 100644 --- a/test_runner/fixtures/safekeeper/http.py +++ b/test_runner/fixtures/safekeeper/http.py @@ -175,7 +175,7 @@ class SafekeeperHttpClient(requests.Session, MetricsGetter): assert s > Lsn(0) return s - return wait_until(30, 1, timeline_start_lsn_non_zero) + return wait_until(timeline_start_lsn_non_zero) def get_commit_lsn(self, tenant_id: TenantId, timeline_id: TimelineId) -> Lsn: return self.timeline_status(tenant_id, timeline_id).commit_lsn diff --git a/test_runner/fixtures/safekeeper/utils.py b/test_runner/fixtures/safekeeper/utils.py index 0246916470..922cdedccc 100644 --- a/test_runner/fixtures/safekeeper/utils.py +++ b/test_runner/fixtures/safekeeper/utils.py @@ -19,4 +19,4 @@ def wait_walreceivers_absent( log.info(f"waiting for walreceivers to be gone, currently {status.walreceivers}") assert len(status.walreceivers) == 0 - wait_until(30, 0.5, walreceivers_absent) + wait_until(walreceivers_absent) diff --git a/test_runner/fixtures/utils.py b/test_runner/fixtures/utils.py index 04e98fe494..c34ac298d1 100644 --- a/test_runner/fixtures/utils.py +++ b/test_runner/fixtures/utils.py @@ -9,6 +9,7 @@ import tarfile import threading import time from collections.abc import Callable, Iterable +from datetime import datetime, timedelta from hashlib import sha256 from pathlib import Path from typing import TYPE_CHECKING, Any, TypeVar @@ -380,15 +381,10 @@ def start_in_background( if return_code is not None: error = f"expected subprocess to run but it 
exited with code {return_code}" else: - attempts = 10 try: - wait_until( - number_of_iterations=attempts, - interval=1, - func=is_started, - ) + wait_until(is_started, timeout=10) except Exception: - error = f"Failed to get correct status from subprocess in {attempts} attempts" + error = "Failed to get correct status from subprocess" except Exception as e: error = f"expected subprocess to start but it failed with exception: {e}" @@ -402,28 +398,31 @@ def start_in_background( def wait_until( - number_of_iterations: int, - interval: float, func: Callable[[], WaitUntilRet], - show_intermediate_error: bool = False, + name: str | None = None, + timeout: float = 20.0, # seconds + interval: float = 0.5, # seconds + status_interval: float = 1.0, # seconds ) -> WaitUntilRet: """ Wait until 'func' returns successfully, without exception. Returns the last return value from the function. """ + if name is None: + name = getattr(func, "__name__", repr(func)) + deadline = datetime.now() + timedelta(seconds=timeout) + next_status = datetime.now() last_exception = None - for i in range(number_of_iterations): + while datetime.now() <= deadline: try: - res = func() + return func() except Exception as e: - log.info("waiting for %s iteration %s failed: %s", func, i + 1, e) + if datetime.now() >= next_status: + log.info("waiting for %s: %s", name, e) + next_status = datetime.now() + timedelta(seconds=status_interval) last_exception = e - if show_intermediate_error: - log.info(e) time.sleep(interval) - continue - return res - raise Exception(f"timed out while waiting for {func}") from last_exception + raise Exception(f"timed out while waiting for {name}") from last_exception def assert_eq(a, b) -> None: diff --git a/test_runner/logical_repl/test_clickhouse.py b/test_runner/logical_repl/test_clickhouse.py index 8e03bbe5d4..6b522fa46d 100644 --- a/test_runner/logical_repl/test_clickhouse.py +++ b/test_runner/logical_repl/test_clickhouse.py @@ -60,24 +60,22 @@ def test_clickhouse(remote_pg: 
RemotePostgres): "SETTINGS materialized_postgresql_tables_list = 'table1';" ) wait_until( - 120, - 0.5, lambda: query_clickhouse( client, "select * from db1_postgres.table1 order by 1", "ee600d8f7cd05bd0b169fa81f44300a9dd10085a", ), + timeout=60, ) cur.execute("INSERT INTO table1 (id, column1) VALUES (3, 'ghi'), (4, 'jkl');") conn.commit() wait_until( - 120, - 0.5, lambda: query_clickhouse( client, "select * from db1_postgres.table1 order by 1", "9eba2daaf7e4d7d27ac849525f68b562ab53947d", ), + timeout=60, ) log.debug("Sleeping before final checking if Neon is still alive") time.sleep(3) diff --git a/test_runner/logical_repl/test_debezium.py b/test_runner/logical_repl/test_debezium.py index d2cb087c92..8023d64d3d 100644 --- a/test_runner/logical_repl/test_debezium.py +++ b/test_runner/logical_repl/test_debezium.py @@ -148,14 +148,12 @@ def test_debezium(debezium): ) conn.commit() wait_until( - 100, - 0.5, lambda: get_kafka_msg( consumer, ts_ms, after={"first_name": "John", "last_name": "Dow", "email": "johndow@example.com"}, ), - show_intermediate_error=True, + timeout=60, ) ts_ms = time.time() * 1000 log.info("Insert 2 ts_ms: %s", ts_ms) @@ -165,28 +163,24 @@ def test_debezium(debezium): ) conn.commit() wait_until( - 100, - 0.5, lambda: get_kafka_msg( consumer, ts_ms, after={"first_name": "Alex", "last_name": "Row", "email": "alexrow@example.com"}, ), - show_intermediate_error=True, + timeout=60, ) ts_ms = time.time() * 1000 log.info("Update ts_ms: %s", ts_ms) cur.execute("update inventory.customers set first_name = 'Alexander' where id = 2") conn.commit() wait_until( - 100, - 0.5, lambda: get_kafka_msg( consumer, ts_ms, after={"first_name": "Alexander"}, ), - show_intermediate_error=True, + timeout=60, ) time.sleep(3) cur.execute("select 1") diff --git a/test_runner/performance/pageserver/test_pageserver_getpage_merge.py b/test_runner/performance/pageserver/test_page_service_batching.py similarity index 60% rename from 
test_runner/performance/pageserver/test_pageserver_getpage_merge.py rename to test_runner/performance/pageserver/test_page_service_batching.py index 34cce9900b..2c27368001 100644 --- a/test_runner/performance/pageserver/test_pageserver_getpage_merge.py +++ b/test_runner/performance/pageserver/test_page_service_batching.py @@ -11,36 +11,95 @@ from fixtures.log_helper import log from fixtures.neon_fixtures import NeonEnvBuilder, PgBin, wait_for_last_flush_lsn from fixtures.utils import humantime_to_ms -TARGET_RUNTIME = 60 +TARGET_RUNTIME = 30 + + +@dataclass +class PageServicePipeliningConfig: + pass + + +@dataclass +class PageServicePipeliningConfigSerial(PageServicePipeliningConfig): + mode: str = "serial" + + +@dataclass +class PageServicePipeliningConfigPipelined(PageServicePipeliningConfig): + max_batch_size: int + execution: str + mode: str = "pipelined" + + +EXECUTION = ["concurrent-futures", "tasks"] + +NON_BATCHABLE: list[PageServicePipeliningConfig] = [PageServicePipeliningConfigSerial()] +for max_batch_size in [1, 32]: + for execution in EXECUTION: + NON_BATCHABLE.append(PageServicePipeliningConfigPipelined(max_batch_size, execution)) + +BATCHABLE: list[PageServicePipeliningConfig] = [PageServicePipeliningConfigSerial()] +for max_batch_size in [1, 2, 4, 8, 16, 32]: + for execution in EXECUTION: + BATCHABLE.append(PageServicePipeliningConfigPipelined(max_batch_size, execution)) -@pytest.mark.skip("See https://github.com/neondatabase/neon/pull/9820#issue-2675856095") @pytest.mark.parametrize( - "tablesize_mib, batch_timeout, target_runtime, effective_io_concurrency, readhead_buffer_size, name", + "tablesize_mib, pipelining_config, target_runtime, effective_io_concurrency, readhead_buffer_size, name", [ - # the next 4 cases demonstrate how not-batchable workloads suffer from batching timeout - (50, None, TARGET_RUNTIME, 1, 128, "not batchable no batching"), - (50, "10us", TARGET_RUNTIME, 1, 128, "not batchable 10us timeout"), - (50, "1ms", TARGET_RUNTIME, 1, 
128, "not batchable 1ms timeout"), - # the next 4 cases demonstrate how batchable workloads benefit from batching - (50, None, TARGET_RUNTIME, 100, 128, "batchable no batching"), - (50, "10us", TARGET_RUNTIME, 100, 128, "batchable 10us timeout"), - (50, "100us", TARGET_RUNTIME, 100, 128, "batchable 100us timeout"), - (50, "1ms", TARGET_RUNTIME, 100, 128, "batchable 1ms timeout"), + # non-batchable workloads + # (A separate benchmark will consider latency). + *[ + ( + 50, + config, + TARGET_RUNTIME, + 1, + 128, + f"not batchable {dataclasses.asdict(config)}", + ) + for config in NON_BATCHABLE + ], + # batchable workloads should show throughput and CPU efficiency improvements + *[ + ( + 50, + config, + TARGET_RUNTIME, + 100, + 128, + f"batchable {dataclasses.asdict(config)}", + ) + for config in BATCHABLE + ], ], ) -def test_getpage_merge_smoke( +def test_throughput( neon_env_builder: NeonEnvBuilder, zenbenchmark: NeonBenchmarker, tablesize_mib: int, - batch_timeout: str | None, + pipelining_config: PageServicePipeliningConfig, target_runtime: int, effective_io_concurrency: int, readhead_buffer_size: int, name: str, ): """ - Do a bunch of sequential scans and ensure that the pageserver does some merging. + Do a bunch of sequential scans with varying compute and pipelining configurations. + Primary performance metrics are the achieved batching factor and throughput (wall clock time). + Resource utilization is also interesting - we currently measure CPU time. + + The test is a fixed-runtime based type of test (target_runtime). + Hence, the results are normalized to the number of iterations completed within target runtime. + + If the compute doesn't provide pipeline depth (effective_io_concurrency=1), + performance should be about identical in all configurations. + Pipelining can still yield improvements in these scenarios because it parses the + next request while the current one is still being executed. 
+ + If the compute provides pipeline depth (effective_io_concurrency=100), then + pipelining configs, especially with max_batch_size>1 should yield dramatic improvements + in all performance metrics. """ # @@ -51,25 +110,24 @@ def test_getpage_merge_smoke( params.update( { "tablesize_mib": (tablesize_mib, {"unit": "MiB"}), - "batch_timeout": ( - -1 if batch_timeout is None else 1e3 * humantime_to_ms(batch_timeout), - {"unit": "us"}, - ), # target_runtime is just a polite ask to the workload to run for this long "effective_io_concurrency": (effective_io_concurrency, {}), "readhead_buffer_size": (readhead_buffer_size, {}), - # name is not a metric + # name is not a metric, we just use it to identify the test easily in the `test_...[...]`` notation } ) + # For storing configuration as a metric, insert a fake 0 with labels with actual data + params.update({"pipelining_config": (0, {"labels": dataclasses.asdict(pipelining_config)})}) log.info("params: %s", params) for param, (value, kwargs) in params.items(): zenbenchmark.record( param, - metric_value=value, + metric_value=float(value), unit=kwargs.pop("unit", ""), report=MetricReport.TEST_PARAM, + labels=kwargs.pop("labels", None), **kwargs, ) @@ -106,18 +164,18 @@ def test_getpage_merge_smoke( @dataclass class Metrics: time: float - pageserver_getpage_count: float - pageserver_vectored_get_count: float + pageserver_batch_size_histo_sum: float + pageserver_batch_size_histo_count: float compute_getpage_count: float pageserver_cpu_seconds_total: float def __sub__(self, other: "Metrics") -> "Metrics": return Metrics( time=self.time - other.time, - pageserver_getpage_count=self.pageserver_getpage_count - - other.pageserver_getpage_count, - pageserver_vectored_get_count=self.pageserver_vectored_get_count - - other.pageserver_vectored_get_count, + pageserver_batch_size_histo_sum=self.pageserver_batch_size_histo_sum + - other.pageserver_batch_size_histo_sum, + 
pageserver_batch_size_histo_count=self.pageserver_batch_size_histo_count + - other.pageserver_batch_size_histo_count, compute_getpage_count=self.compute_getpage_count - other.compute_getpage_count, pageserver_cpu_seconds_total=self.pageserver_cpu_seconds_total - other.pageserver_cpu_seconds_total, @@ -126,8 +184,8 @@ def test_getpage_merge_smoke( def normalize(self, by) -> "Metrics": return Metrics( time=self.time / by, - pageserver_getpage_count=self.pageserver_getpage_count / by, - pageserver_vectored_get_count=self.pageserver_vectored_get_count / by, + pageserver_batch_size_histo_sum=self.pageserver_batch_size_histo_sum / by, + pageserver_batch_size_histo_count=self.pageserver_batch_size_histo_count / by, compute_getpage_count=self.compute_getpage_count / by, pageserver_cpu_seconds_total=self.pageserver_cpu_seconds_total / by, ) @@ -141,11 +199,11 @@ def test_getpage_merge_smoke( pageserver_metrics = ps_http.get_metrics() return Metrics( time=time.time(), - pageserver_getpage_count=pageserver_metrics.query_one( - "pageserver_smgr_query_seconds_count", {"smgr_query_type": "get_page_at_lsn"} + pageserver_batch_size_histo_sum=pageserver_metrics.query_one( + "pageserver_page_service_batch_size_sum" ).value, - pageserver_vectored_get_count=pageserver_metrics.query_one( - "pageserver_get_vectored_seconds_count", {"task_kind": "PageRequestHandler"} + pageserver_batch_size_histo_count=pageserver_metrics.query_one( + "pageserver_page_service_batch_size_count" ).value, compute_getpage_count=compute_getpage_count, pageserver_cpu_seconds_total=pageserver_metrics.query_one( @@ -170,7 +228,9 @@ def test_getpage_merge_smoke( after = get_metrics() return (after - before).normalize(iters - 1) - env.pageserver.patch_config_toml_nonrecursive({"server_side_batch_timeout": batch_timeout}) + env.pageserver.patch_config_toml_nonrecursive( + {"page_service_pipelining": dataclasses.asdict(pipelining_config)} + ) env.pageserver.restart() metrics = workload() @@ -180,7 +240,7 @@ def 
test_getpage_merge_smoke( # Sanity-checks on the collected data # # assert that getpage counts roughly match between compute and ps - assert metrics.pageserver_getpage_count == pytest.approx( + assert metrics.pageserver_batch_size_histo_sum == pytest.approx( metrics.compute_getpage_count, rel=0.01 ) @@ -193,29 +253,36 @@ def test_getpage_merge_smoke( zenbenchmark.record( "perfmetric.batching_factor", - metrics.pageserver_getpage_count / metrics.pageserver_vectored_get_count, + metrics.pageserver_batch_size_histo_sum / metrics.pageserver_batch_size_histo_count, unit="", report=MetricReport.HIGHER_IS_BETTER, ) -@pytest.mark.skip("See https://github.com/neondatabase/neon/pull/9820#issue-2675856095") +PRECISION_CONFIGS: list[PageServicePipeliningConfig] = [PageServicePipeliningConfigSerial()] +for max_batch_size in [1, 32]: + for execution in EXECUTION: + PRECISION_CONFIGS.append(PageServicePipeliningConfigPipelined(max_batch_size, execution)) + + @pytest.mark.parametrize( - "batch_timeout", [None, "10us", "20us", "50us", "100us", "200us", "500us", "1ms"] + "pipelining_config,name", + [(config, f"{dataclasses.asdict(config)}") for config in PRECISION_CONFIGS], ) -def test_timer_precision( +def test_latency( neon_env_builder: NeonEnvBuilder, zenbenchmark: NeonBenchmarker, pg_bin: PgBin, - batch_timeout: str | None, + pipelining_config: PageServicePipeliningConfig, + name: str, ): """ - Determine the batching timeout precision (mean latency) and tail latency impact. + Measure the latency impact of pipelining in an un-batchable workloads. - The baseline is `None`; an ideal batching timeout implementation would increase - the mean latency by exactly `batch_timeout`. + An ideal implementation should not increase average or tail latencies for such workloads. - That is not the case with the current implementation, will be addressed in future changes. + We don't have support in pagebench to create queue depth yet. 
+ => https://github.com/neondatabase/neon/issues/9837 """ # @@ -223,7 +290,8 @@ def test_timer_precision( # def patch_ps_config(ps_config): - ps_config["server_side_batch_timeout"] = batch_timeout + if pipelining_config is not None: + ps_config["page_service_pipelining"] = dataclasses.asdict(pipelining_config) neon_env_builder.pageserver_config_override = patch_ps_config diff --git a/test_runner/performance/test_branch_creation.py b/test_runner/performance/test_branch_creation.py index c50c4ad432..cf2212d447 100644 --- a/test_runner/performance/test_branch_creation.py +++ b/test_runner/performance/test_branch_creation.py @@ -137,15 +137,14 @@ def test_branch_creation_many(neon_compare: NeonCompare, n_branches: int, shape: startup_line = "INFO version: git(-env)?:" # find the first line of the log file so we can find the next start later - _, first_start = wait_until(5, 1, lambda: env.pageserver.assert_log_contains(startup_line)) + _, first_start = wait_until(lambda: env.pageserver.assert_log_contains(startup_line)) # start without gc so we can time compaction with less noise; use shorter # period for compaction so it starts earlier def patch_default_tenant_config(config): - tenant_config = config.get("tenant_config", {}) + tenant_config = config.setdefault("tenant_config", {}) tenant_config["compaction_period"] = "3s" tenant_config["gc_period"] = "0s" - config["tenant_config"] = tenant_config env.pageserver.edit_config_toml(patch_default_tenant_config) env.pageserver.start( @@ -156,7 +155,7 @@ def test_branch_creation_many(neon_compare: NeonCompare, n_branches: int, shape: ) _, second_start = wait_until( - 5, 1, lambda: env.pageserver.assert_log_contains(startup_line, first_start) + lambda: env.pageserver.assert_log_contains(startup_line, first_start), ) env.pageserver.quiesce_tenants() @@ -164,8 +163,6 @@ def test_branch_creation_many(neon_compare: NeonCompare, n_branches: int, shape: # wait for compaction to complete, which most likely has already done so 
multiple times msg, _ = wait_until( - 30, - 1, lambda: env.pageserver.assert_log_contains( f".*tenant_id={env.initial_tenant}.*: compaction iteration complete.*", second_start ), @@ -205,7 +202,7 @@ def wait_and_record_startup_metrics( assert len(matching) == len(expected_labels) return matching - samples = wait_until(10, 1, metrics_are_filled) + samples = wait_until(metrics_are_filled) for sample in samples: phase = sample.labels["phase"] diff --git a/test_runner/performance/test_ingest_logical_message.py b/test_runner/performance/test_ingest_logical_message.py new file mode 100644 index 0000000000..d3118eb15a --- /dev/null +++ b/test_runner/performance/test_ingest_logical_message.py @@ -0,0 +1,101 @@ +from __future__ import annotations + +import pytest +from fixtures.benchmark_fixture import MetricReport, NeonBenchmarker +from fixtures.common_types import Lsn +from fixtures.log_helper import log +from fixtures.neon_fixtures import ( + NeonEnvBuilder, + wait_for_commit_lsn, + wait_for_last_flush_lsn, +) +from fixtures.pageserver.utils import wait_for_last_record_lsn + + +@pytest.mark.timeout(600) +@pytest.mark.parametrize("size", [1024, 8192, 131072]) +@pytest.mark.parametrize("fsync", [True, False], ids=["fsync", "nofsync"]) +def test_ingest_logical_message( + request: pytest.FixtureRequest, + neon_env_builder: NeonEnvBuilder, + zenbenchmark: NeonBenchmarker, + fsync: bool, + size: int, +): + """ + Benchmarks ingestion of 10 GB of logical message WAL. These are essentially noops, and don't + incur any pageserver writes. + """ + + VOLUME = 10 * 1024**3 + count = VOLUME // size + + neon_env_builder.safekeepers_enable_fsync = fsync + + env = neon_env_builder.init_start() + endpoint = env.endpoints.create_start( + "main", + config_lines=[ + f"fsync = {fsync}", + # Disable backpressure. We don't want to block on pageserver. 
+ "max_replication_apply_lag = 0", + "max_replication_flush_lag = 0", + "max_replication_write_lag = 0", + ], + ) + client = env.pageserver.http_client() + + # Wait for the timeline to be propagated to the pageserver. + wait_for_last_flush_lsn(env, endpoint, env.initial_tenant, env.initial_timeline) + + # Ingest data and measure durations. + start_lsn = Lsn(endpoint.safe_psql("select pg_current_wal_lsn()")[0][0]) + + with endpoint.cursor() as cur: + cur.execute("set statement_timeout = 0") + + # Postgres will return once the logical messages have been written to its local WAL, without + # waiting for Safekeeper commit. We measure ingestion time both for Postgres, Safekeeper, + # and Pageserver to detect bottlenecks. + log.info("Ingesting data") + with zenbenchmark.record_duration("pageserver_ingest"): + with zenbenchmark.record_duration("safekeeper_ingest"): + with zenbenchmark.record_duration("postgres_ingest"): + cur.execute(f""" + select pg_logical_emit_message(false, '', repeat('x', {size})) + from generate_series(1, {count}) + """) + + end_lsn = Lsn(endpoint.safe_psql("select pg_current_wal_lsn()")[0][0]) + + # Wait for Safekeeper. + log.info("Waiting for Safekeeper to catch up") + wait_for_commit_lsn(env, env.initial_tenant, env.initial_timeline, end_lsn) + + # Wait for Pageserver. + log.info("Waiting for Pageserver to catch up") + wait_for_last_record_lsn(client, env.initial_tenant, env.initial_timeline, end_lsn) + + # Now that all data is ingested, delete and recreate the tenant in the pageserver. This will + # reingest all the WAL from the safekeeper without any other constraints. This gives us a + # baseline of how fast the pageserver can ingest this WAL in isolation. 
+ status = env.storage_controller.inspect(tenant_shard_id=env.initial_tenant) + assert status is not None + + client.tenant_delete(env.initial_tenant) + env.pageserver.tenant_create(tenant_id=env.initial_tenant, generation=status[0]) + + with zenbenchmark.record_duration("pageserver_recover_ingest"): + log.info("Recovering WAL into pageserver") + client.timeline_create(env.pg_version, env.initial_tenant, env.initial_timeline) + wait_for_last_flush_lsn(env, endpoint, env.initial_tenant, env.initial_timeline) + + # Emit metrics. + wal_written_mb = round((end_lsn - start_lsn) / (1024 * 1024)) + zenbenchmark.record("wal_written", wal_written_mb, "MB", MetricReport.TEST_PARAM) + zenbenchmark.record("message_count", count, "messages", MetricReport.TEST_PARAM) + + props = {p["name"]: p["value"] for _, p in request.node.user_properties} + for name in ("postgres", "safekeeper", "pageserver", "pageserver_recover"): + throughput = int(wal_written_mb / props[f"{name}_ingest"]) + zenbenchmark.record(f"{name}_throughput", throughput, "MB/s", MetricReport.HIGHER_IS_BETTER) diff --git a/test_runner/performance/test_perf_ingest_using_pgcopydb.py b/test_runner/performance/test_perf_ingest_using_pgcopydb.py index 37f2e9db50..f0a0c1f5a2 100644 --- a/test_runner/performance/test_perf_ingest_using_pgcopydb.py +++ b/test_runner/performance/test_perf_ingest_using_pgcopydb.py @@ -60,13 +60,13 @@ def build_pgcopydb_command(pgcopydb_filter_file: Path, test_output_dir: Path): "--no-acl", "--skip-db-properties", "--table-jobs", - "8", + "4", "--index-jobs", - "8", + "4", "--restore-jobs", - "8", + "4", "--split-tables-larger-than", - "5GB", + "10GB", "--skip-extensions", "--use-copy-binary", "--filters", @@ -136,7 +136,7 @@ def run_command_and_log_output(command, log_file_path: Path): "LD_LIBRARY_PATH": f"{os.getenv('PGCOPYDB_LIB_PATH')}:{os.getenv('PG_16_LIB_PATH')}", "PGCOPYDB_SOURCE_PGURI": cast(str, os.getenv("BENCHMARK_INGEST_SOURCE_CONNSTR")), "PGCOPYDB_TARGET_PGURI": cast(str, 
os.getenv("BENCHMARK_INGEST_TARGET_CONNSTR")), - "PGOPTIONS": "-c maintenance_work_mem=8388608 -c max_parallel_maintenance_workers=16", + "PGOPTIONS": "-c maintenance_work_mem=8388608 -c max_parallel_maintenance_workers=7", } # Combine the current environment with custom variables env = os.environ.copy() @@ -184,7 +184,7 @@ def parse_log_and_report_metrics( for metric_name, pattern in metric_patterns.items(): if pattern.search(line): # Extract duration and convert it to seconds - duration_match = re.search(r"\d+h\d+m|\d+s|\d+ms|\d+\.\d+s", line) + duration_match = re.search(r"\d+h\d+m|\d+m\d+s|\d+s|\d+ms|\d+\.\d+s", line) if duration_match: duration_str = duration_match.group(0) parts = re.findall(r"\d+[a-zA-Z]+", duration_str) diff --git a/test_runner/performance/test_sharded_ingest.py b/test_runner/performance/test_sharded_ingest.py index 4c21e799c8..94fd54bade 100644 --- a/test_runner/performance/test_sharded_ingest.py +++ b/test_runner/performance/test_sharded_ingest.py @@ -90,6 +90,7 @@ def test_sharded_ingest( # Start the endpoint. endpoint = env.endpoints.create_start("main", tenant_id=tenant_id) start_lsn = Lsn(endpoint.safe_psql("select pg_current_wal_lsn()")[0][0]) + # Ingest data and measure WAL volume and duration. with closing(endpoint.connect()) as conn: with conn.cursor() as cur: @@ -104,6 +105,8 @@ def test_sharded_ingest( wait_for_last_flush_lsn(env, endpoint, tenant_id, timeline_id) end_lsn = Lsn(endpoint.safe_psql("select pg_current_wal_lsn()")[0][0]) + + # Record metrics. wal_written_mb = round((end_lsn - start_lsn) / (1024 * 1024)) zenbenchmark.record("wal_written", wal_written_mb, "MB", MetricReport.TEST_PARAM) @@ -152,3 +155,7 @@ def test_sharded_ingest( log.info(f"WAL ingested by each pageserver {ingested_by_ps}") assert tenant_get_shards(env, tenant_id) == shards, "shards moved" + + # The pageservers can take a long time to shut down gracefully, presumably due to the upload + # queue or compactions or something. 
Just stop them immediately, we don't care. + env.stop(immediate=True) diff --git a/test_runner/performance/test_storage_controller_scale.py b/test_runner/performance/test_storage_controller_scale.py index 142bd3d669..49f41483ec 100644 --- a/test_runner/performance/test_storage_controller_scale.py +++ b/test_runner/performance/test_storage_controller_scale.py @@ -72,7 +72,7 @@ def test_storage_controller_many_tenants( we don't fall over for a thousand shards. """ - neon_env_builder.num_pageservers = 5 + neon_env_builder.num_pageservers = 6 neon_env_builder.storage_controller_config = { # Default neon_local uses a small timeout: use a longer one to tolerate longer pageserver restarts. # TODO: tune this down as restarts get faster (https://github.com/neondatabase/neon/pull/7553), to @@ -84,6 +84,11 @@ def test_storage_controller_many_tenants( compute_reconfigure_listener.control_plane_compute_hook_api ) + AZS = ["alpha", "bravo", "charlie"] + neon_env_builder.pageserver_config_override = lambda ps_cfg: ps_cfg.update( + {"availability_zone": f"az-{AZS[ps_cfg['id'] % len(AZS)]}"} + ) + # A small sleep on each call into the notify hook, to simulate the latency of doing a database write compute_reconfigure_listener.register_on_notify(lambda body: time.sleep(0.01)) diff --git a/test_runner/regress/test_attach_tenant_config.py b/test_runner/regress/test_attach_tenant_config.py index 670c2698f5..45112fd67e 100644 --- a/test_runner/regress/test_attach_tenant_config.py +++ b/test_runner/regress/test_attach_tenant_config.py @@ -64,8 +64,6 @@ def negative_env(neon_env_builder: NeonEnvBuilder) -> Generator[NegativeTests, N ) wait_until( - 50, - 0.1, lambda: env.pageserver.assert_log_contains(".*Error processing HTTP request: Bad request"), ) diff --git a/test_runner/regress/test_compaction.py b/test_runner/regress/test_compaction.py index 302a8fd0d1..b6741aed68 100644 --- a/test_runner/regress/test_compaction.py +++ b/test_runner/regress/test_compaction.py @@ -385,7 +385,7 @@ def 
test_pageserver_compaction_circuit_breaker(neon_env_builder: NeonEnvBuilder) # Wait for enough failures to break the circuit breaker # This wait is fairly long because we back off on compaction failures, so 5 retries takes ~30s - wait_until(60, 1, assert_broken) + wait_until(assert_broken, timeout=60) # Sleep for a while, during which time we expect that compaction will _not_ be retried time.sleep(10) diff --git a/test_runner/regress/test_compute_catalog.py b/test_runner/regress/test_compute_catalog.py index d43c71ceac..b3719a45ed 100644 --- a/test_runner/regress/test_compute_catalog.py +++ b/test_runner/regress/test_compute_catalog.py @@ -3,13 +3,60 @@ from __future__ import annotations import requests from fixtures.neon_fixtures import NeonEnv +TEST_DB_NAMES = [ + { + "name": "neondb", + "owner": "cloud_admin", + }, + { + "name": "db with spaces", + "owner": "cloud_admin", + }, + { + "name": "db with%20spaces ", + "owner": "cloud_admin", + }, + { + "name": "db with whitespaces ", + "owner": "cloud_admin", + }, + { + "name": "injective db with spaces'; SELECT pg_sleep(10);", + "owner": "cloud_admin", + }, + { + "name": "db with #pound-sign and &ersands=true", + "owner": "cloud_admin", + }, + { + "name": "db with emoji 🌍", + "owner": "cloud_admin", + }, +] + def test_compute_catalog(neon_simple_env: NeonEnv): + """ + Create a bunch of databases with tricky names and test that we can list them + and dump via API. + """ env = neon_simple_env - endpoint = env.endpoints.create_start("main", config_lines=["log_min_messages=debug1"]) - client = endpoint.http_client() + endpoint = env.endpoints.create_start("main") + # Update the spec.json file to include new databases + # and reconfigure the endpoint to create some test databases. 
+ endpoint.respec_deep( + **{ + "skip_pg_catalog_updates": False, + "cluster": { + "databases": TEST_DB_NAMES, + }, + } + ) + endpoint.reconfigure() + + client = endpoint.http_client() objects = client.dbs_and_roles() # Assert that 'cloud_admin' role exists in the 'roles' list @@ -22,9 +69,24 @@ def test_compute_catalog(neon_simple_env: NeonEnv): db["name"] == "postgres" for db in objects["databases"] ), "The 'postgres' database is missing" - ddl = client.database_schema(database="postgres") + # Check other databases + for test_db in TEST_DB_NAMES: + db = next((db for db in objects["databases"] if db["name"] == test_db["name"]), None) + assert db is not None, f"The '{test_db['name']}' database is missing" + assert ( + db["owner"] == test_db["owner"] + ), f"The '{test_db['name']}' database has incorrect owner" - assert "-- PostgreSQL database dump" in ddl + ddl = client.database_schema(database=test_db["name"]) + + # Check that it looks like a valid PostgreSQL dump + assert "-- PostgreSQL database dump" in ddl + + # Check that it doesn't contain health_check and migration traces. + # They are only created in system `postgres` database, so by checking + # that we ensure that we dump right databases. + assert "health_check" not in ddl, f"The '{test_db['name']}' database contains health_check" + assert "migration" not in ddl, f"The '{test_db['name']}' database contains migrations data" try: client.database_schema(database="nonexistentdb") @@ -33,3 +95,44 @@ def test_compute_catalog(neon_simple_env: NeonEnv): assert ( e.response.status_code == 404 ), f"Expected 404 status code, but got {e.response.status_code}" + + +def test_compute_create_databases(neon_simple_env: NeonEnv): + """ + Test that compute_ctl can create and work with databases with special + characters (whitespaces, %, tabs, etc.) in the name. + """ + env = neon_simple_env + + # Create and start endpoint so that neon_local put all the generated + # stuff into the spec.json file. 
+ endpoint = env.endpoints.create_start("main") + + # Update the spec.json file to include new databases + # and reconfigure the endpoint to apply the changes. + endpoint.respec_deep( + **{ + "skip_pg_catalog_updates": False, + "cluster": { + "databases": TEST_DB_NAMES, + }, + } + ) + endpoint.reconfigure() + + for db in TEST_DB_NAMES: + # Check that database has a correct name in the system catalog + with endpoint.cursor() as cursor: + cursor.execute("SELECT datname FROM pg_database WHERE datname = %s", (db["name"],)) + catalog_db = cursor.fetchone() + assert catalog_db is not None + assert len(catalog_db) == 1 + assert catalog_db[0] == db["name"] + + # Check that we can connect to this database without any issues + with endpoint.cursor(dbname=db["name"]) as cursor: + cursor.execute("SELECT * FROM current_database()") + curr_db = cursor.fetchone() + assert curr_db is not None + assert len(curr_db) == 1 + assert curr_db[0] == db["name"] diff --git a/test_runner/regress/test_disk_usage_eviction.py b/test_runner/regress/test_disk_usage_eviction.py index 1807511008..954db914b9 100644 --- a/test_runner/regress/test_disk_usage_eviction.py +++ b/test_runner/regress/test_disk_usage_eviction.py @@ -62,9 +62,8 @@ def test_min_resident_size_override_handling( if config_level_override is not None: def set_min_resident_size(config): - tenant_config = config.get("tenant_config", {}) + tenant_config = config.setdefault("tenant_config", {}) tenant_config["min_resident_size_override"] = config_level_override - config["tenant_config"] = tenant_config env.pageserver.edit_config_toml(set_min_resident_size) env.pageserver.stop() @@ -211,7 +210,7 @@ class EvictionEnv: pageserver.assert_log_contains(".*running mocked statvfs.*") # we most likely have already completed multiple runs - wait_until(10, 1, statvfs_called) + wait_until(statvfs_called) def count_layers_per_tenant( @@ -772,14 +771,14 @@ def test_statvfs_pressure_usage(eviction_env: EvictionEnv): ) wait_until( - 10, 1, lambda: 
env.neon_env.pageserver.assert_log_contains(".*disk usage pressure relieved") + lambda: env.neon_env.pageserver.assert_log_contains(".*disk usage pressure relieved") ) def less_than_max_usage_pct(): post_eviction_total_size, _, _ = env.timelines_du(env.pageserver) assert post_eviction_total_size < 0.33 * total_size, "we requested max 33% usage" - wait_until(2, 2, less_than_max_usage_pct) + wait_until(less_than_max_usage_pct, timeout=5) # Disk usage candidate collection only takes into account active tenants. # However, the statvfs call takes into account the entire tenants directory, @@ -825,7 +824,7 @@ def test_statvfs_pressure_min_avail_bytes(eviction_env: EvictionEnv): ) wait_until( - 10, 1, lambda: env.neon_env.pageserver.assert_log_contains(".*disk usage pressure relieved") + lambda: env.neon_env.pageserver.assert_log_contains(".*disk usage pressure relieved"), ) def more_than_min_avail_bytes_freed(): @@ -834,7 +833,7 @@ def test_statvfs_pressure_min_avail_bytes(eviction_env: EvictionEnv): total_size - post_eviction_total_size >= min_avail_bytes ), f"we requested at least {min_avail_bytes} worth of free space" - wait_until(2, 2, more_than_min_avail_bytes_freed) + wait_until(more_than_min_avail_bytes_freed, timeout=5) def test_secondary_mode_eviction(eviction_env_ha: EvictionEnv): diff --git a/test_runner/regress/test_hot_standby.py b/test_runner/regress/test_hot_standby.py index 0b1ac11c16..4044f25b37 100644 --- a/test_runner/regress/test_hot_standby.py +++ b/test_runner/regress/test_hot_standby.py @@ -257,7 +257,7 @@ def test_hot_standby_feedback(neon_env_builder: NeonEnvBuilder, pg_bin: PgBin): # Wait until we see that the pgbench_accounts is created + filled on replica *and* # index is created. Otherwise index creation would conflict with # read queries and hs feedback won't save us. 
- wait_until(60, 1.0, partial(pgbench_accounts_initialized, secondary)) + wait_until(partial(pgbench_accounts_initialized, secondary), timeout=60) # Test should fail if hs feedback is disabled anyway, but cross # check that walproposer sets some xmin. @@ -269,7 +269,7 @@ def test_hot_standby_feedback(neon_env_builder: NeonEnvBuilder, pg_bin: PgBin): log.info(f"xmin is {slot_xmin}") assert int(slot_xmin) > 0 - wait_until(10, 1.0, xmin_is_not_null) + wait_until(xmin_is_not_null) for _ in range(1, 5): # in debug mode takes about 5-7s balance = secondary.safe_psql_scalar("select sum(abalance) from pgbench_accounts") @@ -286,7 +286,7 @@ def test_hot_standby_feedback(neon_env_builder: NeonEnvBuilder, pg_bin: PgBin): log.info(f"xmin is {slot_xmin}") assert slot_xmin is None - wait_until(10, 1.0, xmin_is_null) + wait_until(xmin_is_null) # Test race condition between WAL replay and backends performing queries diff --git a/test_runner/regress/test_layers_from_future.py b/test_runner/regress/test_layers_from_future.py index 761ec7568f..8818b40712 100644 --- a/test_runner/regress/test_layers_from_future.py +++ b/test_runner/regress/test_layers_from_future.py @@ -206,7 +206,7 @@ def test_issue_5878(neon_env_builder: NeonEnvBuilder, attach_mode: str): future_layers = set(get_future_layers()) assert future_layer not in future_layers - wait_until(10, 0.5, future_layer_is_gone_from_index_part) + wait_until(future_layer_is_gone_from_index_part) # We already make deletion stuck here, but we don't necessarily hit the failpoint # because deletions are batched. 
diff --git a/test_runner/regress/test_logging.py b/test_runner/regress/test_logging.py index f6fbdcabfd..d94c786f49 100644 --- a/test_runner/regress/test_logging.py +++ b/test_runner/regress/test_logging.py @@ -37,7 +37,7 @@ def test_logging_event_count(neon_env_builder: NeonEnvBuilder, level: str): return env.pageserver.assert_log_contains(f".*{msg_id}.*") - wait_until(10, 0.5, assert_logged) + wait_until(assert_logged) # make sure it's counted def assert_metric_value(): @@ -49,4 +49,4 @@ def test_logging_event_count(neon_env_builder: NeonEnvBuilder, level: str): log.info("libmetrics_tracing_event_count: %s", val) assert val > (before or 0.0) - wait_until(10, 1, assert_metric_value) + wait_until(assert_metric_value) diff --git a/test_runner/regress/test_logical_replication.py b/test_runner/regress/test_logical_replication.py index ba471b7147..db18e1758c 100644 --- a/test_runner/regress/test_logical_replication.py +++ b/test_runner/regress/test_logical_replication.py @@ -207,7 +207,7 @@ def test_obsolete_slot_drop(neon_simple_env: NeonEnv, vanilla_pg: VanillaPostgre log.info(f"ep connstr is {endpoint.connstr()}, subscriber connstr {vanilla_pg.connstr()}") vanilla_pg.safe_psql(f"create subscription sub1 connection '{connstr}' publication pub1") - wait_until(number_of_iterations=10, interval=2, func=partial(slot_removed, endpoint)) + wait_until(partial(slot_removed, endpoint)) def test_ondemand_wal_download_in_replication_slot_funcs(neon_env_builder: NeonEnvBuilder): @@ -519,7 +519,7 @@ def test_replication_shutdown(neon_simple_env: NeonEnv): assert len(res) == 4 assert [r[0] for r in res] == [10, 20, 30, 40] - wait_until(10, 0.5, check_that_changes_propagated) + wait_until(check_that_changes_propagated) def logical_replication_wait_flush_lsn_sync(publisher: PgProtocol) -> Lsn: @@ -549,7 +549,7 @@ select sent_lsn, flush_lsn, pg_current_wal_flush_lsn() from pg_stat_replication ) assert flush_lsn >= publisher_flush_lsn - wait_until(30, 0.5, check_caughtup) + 
wait_until(check_caughtup) return publisher_flush_lsn diff --git a/test_runner/regress/test_lsn_mapping.py b/test_runner/regress/test_lsn_mapping.py index 7f0b541128..e42e71646d 100644 --- a/test_runner/regress/test_lsn_mapping.py +++ b/test_runner/regress/test_lsn_mapping.py @@ -169,7 +169,7 @@ def test_get_lsn_by_timestamp_cancelled(neon_env_builder: NeonEnvBuilder): ) _, offset = wait_until( - 20, 0.5, lambda: env.pageserver.assert_log_contains(f"at failpoint {failpoint}") + lambda: env.pageserver.assert_log_contains(f"at failpoint {failpoint}") ) with pytest.raises(ReadTimeout): @@ -178,8 +178,6 @@ def test_get_lsn_by_timestamp_cancelled(neon_env_builder: NeonEnvBuilder): client.configure_failpoints((failpoint, "off")) _, offset = wait_until( - 20, - 0.5, lambda: env.pageserver.assert_log_contains( "Cancelled request finished with an error: Cancelled$", offset ), diff --git a/test_runner/regress/test_neon_superuser.py b/test_runner/regress/test_neon_superuser.py index 7118127a1f..49cd91906f 100644 --- a/test_runner/regress/test_neon_superuser.py +++ b/test_runner/regress/test_neon_superuser.py @@ -77,7 +77,7 @@ def test_neon_superuser(neon_simple_env: NeonEnv, pg_version: PgVersion): assert len(res) == 4 assert [r[0] for r in res] == [10, 20, 30, 40] - wait_until(10, 0.5, check_that_changes_propagated) + wait_until(check_that_changes_propagated) # Test that pg_monitor is working for neon_superuser role cur.execute("SELECT query from pg_stat_activity LIMIT 1") diff --git a/test_runner/regress/test_ondemand_download.py b/test_runner/regress/test_ondemand_download.py index e1caaeb6c1..028d1c2e49 100644 --- a/test_runner/regress/test_ondemand_download.py +++ b/test_runner/regress/test_ondemand_download.py @@ -256,7 +256,7 @@ def test_ondemand_download_timetravel(neon_env_builder: NeonEnvBuilder): ##### Second start, restore the data and ensure it's the same env.pageserver.start() - wait_until(10, 0.2, lambda: assert_tenant_state(client, tenant_id, "Active")) + 
wait_until(lambda: assert_tenant_state(client, tenant_id, "Active")) # The current_physical_size reports the sum of layers loaded in the layer # map, regardless of where the layer files are located. So even though we @@ -413,7 +413,7 @@ def test_download_remote_layers_api( ] ) - wait_until(10, 0.2, lambda: assert_tenant_state(client, tenant_id, "Active")) + wait_until(lambda: assert_tenant_state(client, tenant_id, "Active")) ###### Phase 1: exercise download error code path @@ -705,7 +705,7 @@ def test_layer_download_cancelled_by_config_location(neon_env_builder: NeonEnvBu ) _, offset = wait_until( - 20, 0.5, lambda: env.pageserver.assert_log_contains(f"at failpoint {failpoint}") + lambda: env.pageserver.assert_log_contains(f"at failpoint {failpoint}") ) location_conf = {"mode": "Detached", "tenant_conf": {}} @@ -713,8 +713,6 @@ def test_layer_download_cancelled_by_config_location(neon_env_builder: NeonEnvBu detach = exec.submit(client.tenant_location_conf, env.initial_tenant, location_conf) _, offset = wait_until( - 20, - 0.5, lambda: env.pageserver.assert_log_contains( "closing is taking longer than expected", offset ), @@ -734,8 +732,6 @@ def test_layer_download_cancelled_by_config_location(neon_env_builder: NeonEnvBu client.configure_failpoints((failpoint, "pause")) _, offset = wait_until( - 20, - 0.5, lambda: env.pageserver.assert_log_contains(f"cfg failpoint: {failpoint} pause", offset), ) @@ -750,8 +746,6 @@ def test_layer_download_cancelled_by_config_location(neon_env_builder: NeonEnvBu warmup = exec.submit(client.tenant_secondary_download, env.initial_tenant, wait_ms=30000) _, offset = wait_until( - 20, - 0.5, lambda: env.pageserver.assert_log_contains(f"at failpoint {failpoint}", offset), ) @@ -805,7 +799,7 @@ def test_layer_download_timeouted(neon_env_builder: NeonEnvBuilder): ) _, offset = wait_until( - 20, 0.5, lambda: env.pageserver.assert_log_contains(f"at failpoint {failpoint}") + lambda: env.pageserver.assert_log_contains(f"at failpoint 
{failpoint}") ) # ensure enough time while paused to trip the timeout time.sleep(2) @@ -824,8 +818,6 @@ def test_layer_download_timeouted(neon_env_builder: NeonEnvBuilder): # capture the next offset for a new synchronization with the failpoint _, offset = wait_until( - 20, - 0.5, lambda: env.pageserver.assert_log_contains(f"cfg failpoint: {failpoint} pause", offset), ) diff --git a/test_runner/regress/test_pageserver_api.py b/test_runner/regress/test_pageserver_api.py index 05e81b82e0..55fd7a8608 100644 --- a/test_runner/regress/test_pageserver_api.py +++ b/test_runner/regress/test_pageserver_api.py @@ -117,19 +117,11 @@ def test_pageserver_http_get_wal_receiver_success(neon_simple_env: NeonEnv): # We need to wait here because it's possible that we don't have access to # the latest WAL yet, when the `timeline_detail` API is first called. # See: https://github.com/neondatabase/neon/issues/1768. - lsn = wait_until( - number_of_iterations=5, - interval=1, - func=lambda: expect_updated_msg_lsn(client, tenant_id, timeline_id, None), - ) + lsn = wait_until(lambda: expect_updated_msg_lsn(client, tenant_id, timeline_id, None)) # Make a DB modification then expect getting a new WAL receiver's data. 
endpoint.safe_psql("INSERT INTO t VALUES (1, 'hey')") - wait_until( - number_of_iterations=5, - interval=1, - func=lambda: expect_updated_msg_lsn(client, tenant_id, timeline_id, lsn), - ) + wait_until(lambda: expect_updated_msg_lsn(client, tenant_id, timeline_id, lsn)) def test_pageserver_http_api_client(neon_simple_env: NeonEnv): diff --git a/test_runner/regress/test_pageserver_generations.py b/test_runner/regress/test_pageserver_generations.py index 6ba5753420..7e5bb45242 100644 --- a/test_runner/regress/test_pageserver_generations.py +++ b/test_runner/regress/test_pageserver_generations.py @@ -352,7 +352,7 @@ def test_deletion_queue_recovery( def assert_some_validations(): assert get_deletion_queue_validated(ps_http) > 0 - wait_until(20, 1, assert_some_validations) + wait_until(assert_some_validations) # The validatated keys statistic advances before the header is written, so we # also wait to see the header hit the disk: this seems paranoid but the race @@ -360,7 +360,7 @@ def test_deletion_queue_recovery( def assert_header_written(): assert (main_pageserver.workdir / "deletion" / "header-01").exists() - wait_until(20, 1, assert_header_written) + wait_until(assert_header_written) # If we will lose attachment, then our expectation on restart is that only the ones # we already validated will execute. Act like only those were present in the queue. @@ -382,11 +382,11 @@ def test_deletion_queue_recovery( # After restart, issue a flush to kick the deletion frontend to do recovery. # It should recover all the operations we submitted before the restart. 
ps_http.deletion_queue_flush(execute=False) - wait_until(20, 0.25, lambda: assert_deletions_submitted(before_restart_depth)) + wait_until(lambda: assert_deletions_submitted(before_restart_depth)) # The queue should drain through completely if we flush it ps_http.deletion_queue_flush(execute=True) - wait_until(10, 1, lambda: assert_deletion_queue(ps_http, lambda n: n == 0)) + wait_until(lambda: assert_deletion_queue(ps_http, lambda n: n == 0)) if keep_attachment == KeepAttachment.KEEP: # - If we kept the attachment, then our pre-restart deletions should execute @@ -564,7 +564,7 @@ def test_multi_attach( ) # Initially, the tenant will be attached to the first pageserver (first is default in our test harness) - wait_until(10, 0.2, lambda: assert_tenant_state(http_clients[0], tenant_id, "Active")) + wait_until(lambda: assert_tenant_state(http_clients[0], tenant_id, "Active")) _detail = http_clients[0].timeline_detail(tenant_id, timeline_id) with pytest.raises(PageserverApiException): http_clients[1].timeline_detail(tenant_id, timeline_id) @@ -579,8 +579,8 @@ def test_multi_attach( pageservers[1].tenant_attach(env.initial_tenant) pageservers[2].tenant_attach(env.initial_tenant) - wait_until(10, 0.2, lambda: assert_tenant_state(http_clients[1], tenant_id, "Active")) - wait_until(10, 0.2, lambda: assert_tenant_state(http_clients[2], tenant_id, "Active")) + wait_until(lambda: assert_tenant_state(http_clients[1], tenant_id, "Active")) + wait_until(lambda: assert_tenant_state(http_clients[2], tenant_id, "Active")) # Now they all have it attached _details = list([c.timeline_detail(tenant_id, timeline_id) for c in http_clients]) diff --git a/test_runner/regress/test_pageserver_getpage_throttle.py b/test_runner/regress/test_pageserver_getpage_throttle.py index f1aad85fe9..9644ebe3e2 100644 --- a/test_runner/regress/test_pageserver_getpage_throttle.py +++ b/test_runner/regress/test_pageserver_getpage_throttle.py @@ -4,6 +4,7 @@ import copy import json import uuid +import pytest 
from anyio import Path from fixtures.common_types import TenantId, TimelineId from fixtures.log_helper import log @@ -32,7 +33,9 @@ def test_pageserver_getpage_throttle(neon_env_builder: NeonEnvBuilder, pg_bin: P conf={ "compaction_period": f"{compaction_period}s", "timeline_get_throttle": { - "task_kinds": ["PageRequestHandler"], + "task_kinds": [ + "PageRequestHandler" + ], # any non-empty array will do here https://github.com/neondatabase/neon/pull/9962 "initial": 0, "refill_interval": "100ms", "refill_amount": int(rate_limit_rps / 10), @@ -70,20 +73,25 @@ def test_pageserver_getpage_throttle(neon_env_builder: NeonEnvBuilder, pg_bin: P log.info("warmup / make sure metrics are present") run_pagebench_at_max_speed_and_get_total_requests_completed(2) - metrics_query = { + smgr_metrics_query = { "tenant_id": str(tenant_id), "timeline_id": str(timeline_id), "smgr_query_type": "get_page_at_lsn", } - metric_name = "pageserver_smgr_query_seconds_sum" - smgr_query_seconds_pre = ps_http.get_metric_value(metric_name, metrics_query) + smgr_metric_name = "pageserver_smgr_query_seconds_sum" + throttle_metrics_query = { + "tenant_id": str(tenant_id), + } + throttle_metric_name = "pageserver_tenant_throttling_wait_usecs_sum_total" + + smgr_query_seconds_pre = ps_http.get_metric_value(smgr_metric_name, smgr_metrics_query) assert smgr_query_seconds_pre is not None + throttled_usecs_pre = ps_http.get_metric_value(throttle_metric_name, throttle_metrics_query) + assert throttled_usecs_pre is not None marker = uuid.uuid4().hex ps_http.post_tracing_event("info", marker) - _, marker_offset = wait_until( - 10, 0.5, lambda: env.pageserver.assert_log_contains(marker, offset=None) - ) + _, marker_offset = wait_until(lambda: env.pageserver.assert_log_contains(marker, offset=None)) log.info("run pagebench") duration_secs = 10 @@ -103,23 +111,31 @@ def test_pageserver_getpage_throttle(neon_env_builder: NeonEnvBuilder, pg_bin: P log.info("validate that we logged the throttling") wait_until( - 
10, - compaction_period / 10, lambda: env.pageserver.assert_log_contains( f".*{tenant_id}.*shard was throttled in the last n_seconds.*", offset=marker_offset, ), + timeout=compaction_period, ) - log.info("validate that the metric doesn't include throttle wait time") - smgr_query_seconds_post = ps_http.get_metric_value(metric_name, metrics_query) + smgr_query_seconds_post = ps_http.get_metric_value(smgr_metric_name, smgr_metrics_query) assert smgr_query_seconds_post is not None + throttled_usecs_post = ps_http.get_metric_value(throttle_metric_name, throttle_metrics_query) + assert throttled_usecs_post is not None actual_smgr_query_seconds = smgr_query_seconds_post - smgr_query_seconds_pre + actual_throttled_usecs = throttled_usecs_post - throttled_usecs_pre + actual_throttled_secs = actual_throttled_usecs / 1_000_000 + log.info("validate that the metric doesn't include throttle wait time") assert ( duration_secs >= 10 * actual_smgr_query_seconds ), "smgr metrics should not include throttle wait time" + log.info("validate that the throttling wait time metrics is correct") + assert ( + pytest.approx(actual_throttled_secs + actual_smgr_query_seconds, 0.1) == duration_secs + ), "most of the time in this test is spent throttled because the rate-limit's contribution to latency dominates" + throttle_config_with_field_fair_set = { "task_kinds": ["PageRequestHandler"], @@ -167,7 +183,8 @@ def test_throttle_fair_config_is_settable_but_ignored_in_config_toml( """ def set_tenant_config(ps_cfg): - ps_cfg["tenant_config"] = {"timeline_get_throttle": throttle_config_with_field_fair_set} + tenant_config = ps_cfg.setdefault("tenant_config", {}) + tenant_config["timeline_get_throttle"] = throttle_config_with_field_fair_set neon_env_builder.pageserver_config_override = set_tenant_config env = neon_env_builder.init_start() diff --git a/test_runner/regress/test_pageserver_layer_rolling.py b/test_runner/regress/test_pageserver_layer_rolling.py index f6a7bfa1ad..706da1e35e 100644 --- 
a/test_runner/regress/test_pageserver_layer_rolling.py +++ b/test_runner/regress/test_pageserver_layer_rolling.py @@ -84,7 +84,7 @@ def wait_for_wal_ingest_metric(pageserver_http: PageserverHttpClient) -> float: # The metric gets initialised on the first update. # Retry a few times, but return 0 if it's stable. try: - return float(wait_until(3, 0.5, query)) + return float(wait_until(query, timeout=2, interval=0.5)) except Exception: return 0 @@ -131,7 +131,7 @@ def test_pageserver_small_inmemory_layers( wait_until_pageserver_is_caught_up(env, last_flush_lsns) # We didn't write enough data to trigger a size-based checkpoint: we should see dirty data. - wait_until(10, 1, lambda: assert_dirty_bytes_nonzero(env)) + wait_until(lambda: assert_dirty_bytes_nonzero(env)) ps_http_client = env.pageserver.http_client() total_wal_ingested_before_restart = wait_for_wal_ingest_metric(ps_http_client) @@ -139,7 +139,7 @@ def test_pageserver_small_inmemory_layers( # Within ~ the checkpoint interval, all the ephemeral layers should be frozen and flushed, # such that there are zero bytes of ephemeral layer left on the pageserver log.info("Waiting for background checkpoints...") - wait_until(CHECKPOINT_TIMEOUT_SECONDS * 2, 1, lambda: assert_dirty_bytes(env, 0)) + wait_until(lambda: assert_dirty_bytes(env, 0), timeout=2 * CHECKPOINT_TIMEOUT_SECONDS) # Zero ephemeral layer bytes does not imply that all the frozen layers were uploaded: they # must be uploaded to remain visible to the pageserver after restart. @@ -180,7 +180,7 @@ def test_idle_checkpoints(neon_env_builder: NeonEnvBuilder): wait_until_pageserver_is_caught_up(env, last_flush_lsns) # We didn't write enough data to trigger a size-based checkpoint: we should see dirty data. 
- wait_until(10, 1, lambda: assert_dirty_bytes_nonzero(env)) + wait_until(lambda: assert_dirty_bytes_nonzero(env)) # Stop the safekeepers, so that we cannot have any more WAL receiver connections for sk in env.safekeepers: @@ -193,7 +193,7 @@ def test_idle_checkpoints(neon_env_builder: NeonEnvBuilder): # Within ~ the checkpoint interval, all the ephemeral layers should be frozen and flushed, # such that there are zero bytes of ephemeral layer left on the pageserver log.info("Waiting for background checkpoints...") - wait_until(CHECKPOINT_TIMEOUT_SECONDS * 2, 1, lambda: assert_dirty_bytes(env, 0)) + wait_until(lambda: assert_dirty_bytes(env, 0), timeout=2 * CHECKPOINT_TIMEOUT_SECONDS) # The code below verifies that we do not flush on the first write # after an idle period longer than the checkpoint timeout. @@ -210,7 +210,7 @@ def test_idle_checkpoints(neon_env_builder: NeonEnvBuilder): run_worker_for_tenant(env, 5, tenant_with_extra_writes, offset=ENTRIES_PER_TIMELINE) ) - dirty_after_write = wait_until(10, 1, lambda: assert_dirty_bytes_nonzero(env)) + dirty_after_write = wait_until(lambda: assert_dirty_bytes_nonzero(env)) # We shouldn't flush since we've just opened a new layer waited_for = 0 @@ -305,11 +305,11 @@ def test_total_size_limit(neon_env_builder: NeonEnvBuilder): # Wait until enough layers have rolled that the amount of dirty data is under the threshold. # We do this indirectly via layer maps, rather than the dirty bytes metric, to avoid false-passing # if that metric isn't updated quickly enough to reflect the dirty bytes exceeding the limit. 
- wait_until(compaction_period_s * 2, 1, assert_bytes_rolled) + wait_until(assert_bytes_rolled, timeout=2 * compaction_period_s) # The end state should also have the reported metric under the limit def assert_dirty_data_limited(): dirty_bytes = get_dirty_bytes(env) assert dirty_bytes < max_dirty_data - wait_until(compaction_period_s * 2, 1, lambda: assert_dirty_data_limited()) + wait_until(lambda: assert_dirty_data_limited(), timeout=2 * compaction_period_s) diff --git a/test_runner/regress/test_pageserver_restart.py b/test_runner/regress/test_pageserver_restart.py index 4bf5705517..835ccbd5d4 100644 --- a/test_runner/regress/test_pageserver_restart.py +++ b/test_runner/regress/test_pageserver_restart.py @@ -103,7 +103,7 @@ def test_pageserver_restart(neon_env_builder: NeonEnvBuilder): raise AssertionError("No 'complete' metric yet") - wait_until(30, 1.0, assert_complete) + wait_until(assert_complete) # Expectation callbacks: arg t is sample value, arg p is the previous phase's sample value expectations = [ diff --git a/test_runner/regress/test_pageserver_secondary.py b/test_runner/regress/test_pageserver_secondary.py index a264f4d3c9..1292682f9e 100644 --- a/test_runner/regress/test_pageserver_secondary.py +++ b/test_runner/regress/test_pageserver_secondary.py @@ -356,7 +356,7 @@ def test_live_migration(neon_env_builder: NeonEnvBuilder): ) assert destination_lsn >= origin_lsn - wait_until(100, 0.1, caught_up) + wait_until(caught_up) # The destination should accept writes workload.churn_rows(64, pageserver_b.id) @@ -411,7 +411,7 @@ def test_live_migration(neon_env_builder: NeonEnvBuilder): assert submitted is not None assert submitted > 0 - wait_until(10, 0.1, blocked_deletions_drained) + wait_until(blocked_deletions_drained) workload.churn_rows(64, pageserver_b.id) workload.validate(pageserver_b.id) @@ -702,7 +702,7 @@ def test_secondary_background_downloads(neon_env_builder: NeonEnvBuilder): else: timeout = int(deadline - now) + 1 try: - wait_until(timeout, 1, 
lambda: pageserver.assert_log_contains(expression)) + wait_until(lambda: pageserver.assert_log_contains(expression), timeout=timeout) except: log.error(f"Timed out waiting for '{expression}'") raise diff --git a/test_runner/regress/test_physical_replication.py b/test_runner/regress/test_physical_replication.py index 043aff686b..6cb11b825d 100644 --- a/test_runner/regress/test_physical_replication.py +++ b/test_runner/regress/test_physical_replication.py @@ -4,6 +4,10 @@ import random import time from typing import TYPE_CHECKING +import pytest +from fixtures.log_helper import log +from fixtures.neon_fixtures import wait_replica_caughtup + if TYPE_CHECKING: from fixtures.neon_fixtures import NeonEnv @@ -19,8 +23,8 @@ def test_physical_replication(neon_simple_env: NeonEnv): p_cur.execute( "CREATE TABLE t(pk bigint primary key, payload text default repeat('?',200))" ) - time.sleep(1) with env.endpoints.new_replica_start(origin=primary, endpoint_id="secondary") as secondary: + wait_replica_caughtup(primary, secondary) with primary.connect() as p_con: with p_con.cursor() as p_cur: with secondary.connect() as s_con: @@ -42,3 +46,218 @@ def test_physical_replication(neon_simple_env: NeonEnv): s_cur.execute( "select * from t where pk=%s", (random.randrange(1, 2 * pk),) ) + + +def test_physical_replication_config_mismatch_max_connections(neon_simple_env: NeonEnv): + """ + Test for primary and replica with different configuration settings (max_connections). + PostgreSQL enforces that settings that affect how many transactions can be open at the same time + have values equal to or higher in a hot standby replica than in the primary. If they don't, the replica refuses + to start up. If the settings are changed in the primary, it emits a WAL record with the new settings, and + when the replica sees that record it pauses the replay. 
+ + PostgreSQL enforces this to ensure that the replica can hold all the XIDs in the so-called + "known-assigned XIDs" array, which is a fixed-size array that needs to be allocated + upfront at server startup. That's pretty pessimistic, though; usually you can get + away with smaller settings, because we allocate space for 64 subtransactions per + transaction too. If you get unlucky and you run out of space, WAL redo dies with + "ERROR: too many KnownAssignedXids". It's better to take the chances than refuse + to start up, especially in Neon: if the WAL redo dies, the server is restarted, which is + no worse than refusing to start up in the first place. Furthermore, the control plane + tries to ensure that on restart, the settings are set high enough, so most likely it will + work after restart. Because of that, we have patched Postgres to disable the checks when + the `recovery_pause_on_misconfig` setting is set to `false` (which is the default on neon). + + This test tests all those cases of running out of space in the known-assigned XIDs array that + we can hit with `recovery_pause_on_misconfig=false`, which are unreachable in unpatched + Postgres. + There's a similar check for `max_locks_per_transaction` too, which is related to running out + of space in the lock manager rather than known-assigned XIDs. Similar story with that, although + running out of space in the lock manager is possible in unmodified Postgres too. Enforcing the + check for `max_locks_per_transaction` ensures that you don't run out of space in the lock manager + when there are no read-only queries holding locks in the replica, but you can still run out if you have + those.
+ """ + env = neon_simple_env + with env.endpoints.create_start( + branch_name="main", + endpoint_id="primary", + ) as primary: + with primary.connect() as p_con: + with p_con.cursor() as p_cur: + p_cur.execute( + "CREATE TABLE t(pk bigint primary key, payload text default repeat('?',200))" + ) + with env.endpoints.new_replica_start( + origin=primary, + endpoint_id="secondary", + config_lines=["max_connections=5"], + ) as secondary: + wait_replica_caughtup(primary, secondary) + with secondary.connect() as s_con: + with s_con.cursor() as s_cur: + cursors = [] + for i in range(10): + p_con = primary.connect() + p_cur = p_con.cursor() + p_cur.execute("begin") + p_cur.execute("insert into t (pk) values (%s)", (i,)) + cursors.append(p_cur) + + for p_cur in cursors: + p_cur.execute("commit") + + wait_replica_caughtup(primary, secondary) + s_cur.execute("select count(*) from t") + assert s_cur.fetchall()[0][0] == 10 + + +def test_physical_replication_config_mismatch_max_prepared(neon_simple_env: NeonEnv): + """ + Test for primary and replica with different configuration settings (max_prepared_transactions). + If number of transactions at primary exceeds its limit at replica then WAL replay is terminated. 
+ """ + env = neon_simple_env + primary = env.endpoints.create_start( + branch_name="main", + endpoint_id="primary", + config_lines=["max_prepared_transactions=10"], + ) + p_con = primary.connect() + p_cur = p_con.cursor() + p_cur.execute("CREATE TABLE t(pk bigint primary key, payload text default repeat('?',200))") + + secondary = env.endpoints.new_replica_start( + origin=primary, + endpoint_id="secondary", + config_lines=["max_prepared_transactions=5"], + ) + wait_replica_caughtup(primary, secondary) + + s_con = secondary.connect() + s_cur = s_con.cursor() + cursors = [] + for i in range(10): + p_con = primary.connect() + p_cur = p_con.cursor() + p_cur.execute("begin") + p_cur.execute("insert into t (pk) values (%s)", (i,)) + p_cur.execute(f"prepare transaction 't{i}'") + cursors.append(p_cur) + + for i in range(10): + cursors[i].execute(f"commit prepared 't{i}'") + + time.sleep(5) + with pytest.raises(Exception) as e: + s_cur.execute("select count(*) from t") + assert s_cur.fetchall()[0][0] == 10 + secondary.stop() + + log.info(f"Replica crashed with {e}") + assert secondary.log_contains("maximum number of prepared transactions reached") + + +def connect(ep): + max_reconnect_attempts = 10 + for _ in range(max_reconnect_attempts): + try: + return ep.connect() + except Exception as e: + log.info(f"Failed to connect with primary: {e}") + time.sleep(1) + + +def test_physical_replication_config_mismatch_too_many_known_xids(neon_simple_env: NeonEnv): + """ + Test for primary and replica with different configuration settings (max_connections). + In this case large difference in this setting and larger number of concurrent transactions at primary + # cause too many known xids error at replica. 
+ """ + env = neon_simple_env + primary = env.endpoints.create_start( + branch_name="main", + endpoint_id="primary", + config_lines=[ + "max_connections=1000", + "shared_buffers=128MB", # prevent "no unpinned buffers available" error + ], + ) + secondary = env.endpoints.new_replica_start( + origin=primary, + endpoint_id="secondary", + config_lines=[ + "max_connections=2", + "autovacuum_max_workers=1", + "max_worker_processes=5", + "max_wal_senders=1", + "superuser_reserved_connections=0", + ], + ) + + p_con = primary.connect() + p_cur = p_con.cursor() + p_cur.execute("CREATE TABLE t(x integer)") + + n_connections = 990 + cursors = [] + for i in range(n_connections): + p_con = connect(primary) + p_cur = p_con.cursor() + p_cur.execute("begin") + p_cur.execute(f"insert into t values({i})") + cursors.append(p_cur) + + for cur in cursors: + cur.execute("commit") + + time.sleep(5) + with pytest.raises(Exception) as e: + s_con = secondary.connect() + s_cur = s_con.cursor() + s_cur.execute("select count(*) from t") + assert s_cur.fetchall()[0][0] == n_connections + secondary.stop() + + log.info(f"Replica crashed with {e}") + assert secondary.log_contains("too many KnownAssignedXids") + + +def test_physical_replication_config_mismatch_max_locks_per_transaction(neon_simple_env: NeonEnv): + """ + Test for primary and replica with different configuration settings (max_locks_per_transaction). + In conjunction with different number of max_connections at primary and standby it can cause "out of shared memory" + error if the primary obtains more AccessExclusiveLocks than the standby can hold. 
+ """ + env = neon_simple_env + primary = env.endpoints.create_start( + branch_name="main", + endpoint_id="primary", + config_lines=[ + "max_locks_per_transaction = 100", + ], + ) + secondary = env.endpoints.new_replica_start( + origin=primary, + endpoint_id="secondary", + config_lines=[ + "max_connections=10", + "max_locks_per_transaction = 10", + ], + ) + + n_tables = 1000 + + p_con = primary.connect() + p_cur = p_con.cursor() + p_cur.execute("begin") + for i in range(n_tables): + p_cur.execute(f"CREATE TABLE t_{i}(x integer)") + p_cur.execute("commit") + + with pytest.raises(Exception) as e: + wait_replica_caughtup(primary, secondary) + secondary.stop() + + log.info(f"Replica crashed with {e}") + assert secondary.log_contains("You might need to increase") diff --git a/test_runner/regress/test_prefetch_buffer_resize.py b/test_runner/regress/test_prefetch_buffer_resize.py new file mode 100644 index 0000000000..7676b78b0e --- /dev/null +++ b/test_runner/regress/test_prefetch_buffer_resize.py @@ -0,0 +1,40 @@ +from __future__ import annotations + +import random + +import pytest +from fixtures.neon_fixtures import NeonEnvBuilder + + +@pytest.mark.parametrize("shard_count", [None, 4]) +@pytest.mark.timeout(600) +def test_prefetch(neon_env_builder: NeonEnvBuilder, shard_count: int | None): + if shard_count is not None: + neon_env_builder.num_pageservers = shard_count + env = neon_env_builder.init_start( + initial_tenant_shard_count=shard_count, + ) + n_iter = 10 + n_rec = 100000 + + endpoint = env.endpoints.create_start( + "main", + config_lines=[ + "shared_buffers=10MB", + ], + ) + + cur = endpoint.connect().cursor() + + cur.execute("CREATE TABLE t(pk integer, filler text default repeat('?', 200))") + cur.execute(f"insert into t (pk) values (generate_series(1,{n_rec}))") + + cur.execute("set statement_timeout=0") + cur.execute("set effective_io_concurrency=20") + cur.execute("set max_parallel_workers_per_gather=0") + + for _ in range(n_iter): + buf_size = 
random.randrange(16, 32) + cur.execute(f"set neon.readahead_buffer_size={buf_size}") + limit = random.randrange(1, n_rec) + cur.execute(f"select sum(pk) from (select pk from t limit {limit}) s") diff --git a/test_runner/regress/test_proxy.py b/test_runner/regress/test_proxy.py index 5a01d90d85..d8df2efc78 100644 --- a/test_runner/regress/test_proxy.py +++ b/test_runner/regress/test_proxy.py @@ -5,6 +5,7 @@ import json import subprocess import time import urllib.parse +from contextlib import closing from typing import TYPE_CHECKING import psycopg2 @@ -131,6 +132,24 @@ def test_proxy_options(static_proxy: NeonProxy, option_name: str): assert out[0][0] == " str" +@pytest.mark.asyncio +async def test_proxy_arbitrary_params(static_proxy: NeonProxy): + with closing( + await static_proxy.connect_async(server_settings={"IntervalStyle": "iso_8601"}) + ) as conn: + out = await conn.fetchval("select to_json('0 seconds'::interval)") + assert out == '"00:00:00"' + + options = "neon_proxy_params_compat:true" + with closing( + await static_proxy.connect_async( + server_settings={"IntervalStyle": "iso_8601", "options": options} + ) + ) as conn: + out = await conn.fetchval("select to_json('0 seconds'::interval)") + assert out == '"PT0S"' + + def test_auth_errors(static_proxy: NeonProxy): """ Check that we throw very specific errors in some unsuccessful auth scenarios. diff --git a/test_runner/regress/test_readonly_node.py b/test_runner/regress/test_readonly_node.py index 70d558ac5a..c13bea7ee1 100644 --- a/test_runner/regress/test_readonly_node.py +++ b/test_runner/regress/test_readonly_node.py @@ -215,8 +215,6 @@ def test_readonly_node_gc(neon_env_builder: NeonEnvBuilder): # wait for lease renewal before running query. 
_, offset = wait_until( - 20, - 0.5, lambda: ep_static.assert_log_contains( "lsn_lease_bg_task.*Request succeeded", offset=offset ), diff --git a/test_runner/regress/test_remote_storage.py b/test_runner/regress/test_remote_storage.py index 137e75f784..76a42ef4a2 100644 --- a/test_runner/regress/test_remote_storage.py +++ b/test_runner/regress/test_remote_storage.py @@ -300,9 +300,9 @@ def test_remote_storage_upload_queue_retries( print_gc_result(gc_result) assert gc_result["layers_removed"] > 0 - wait_until(2, 1, lambda: assert_eq(get_queued_count(file_kind="layer", op_kind="upload"), 0)) - wait_until(2, 1, lambda: assert_eq(get_queued_count(file_kind="index", op_kind="upload"), 0)) - wait_until(2, 1, lambda: assert_eq(get_queued_count(file_kind="layer", op_kind="delete"), 0)) + wait_until(lambda: assert_eq(get_queued_count(file_kind="layer", op_kind="upload"), 0)) + wait_until(lambda: assert_eq(get_queued_count(file_kind="index", op_kind="upload"), 0)) + wait_until(lambda: assert_eq(get_queued_count(file_kind="layer", op_kind="delete"), 0)) # let all future operations queue up configure_storage_sync_failpoints("return") @@ -333,16 +333,28 @@ def test_remote_storage_upload_queue_retries( # wait for churn thread's data to get stuck in the upload queue # Exponential back-off in upload queue, so, gracious timeouts. 
- wait_until(30, 1, lambda: assert_gt(get_queued_count(file_kind="layer", op_kind="upload"), 0)) - wait_until(30, 1, lambda: assert_ge(get_queued_count(file_kind="index", op_kind="upload"), 1)) - wait_until(30, 1, lambda: assert_eq(get_queued_count(file_kind="layer", op_kind="delete"), 0)) + wait_until( + lambda: assert_gt(get_queued_count(file_kind="layer", op_kind="upload"), 0), timeout=30 + ) + wait_until( + lambda: assert_ge(get_queued_count(file_kind="index", op_kind="upload"), 1), timeout=30 + ) + wait_until( + lambda: assert_eq(get_queued_count(file_kind="layer", op_kind="delete"), 0), timeout=30 + ) # unblock churn operations configure_storage_sync_failpoints("off") - wait_until(30, 1, lambda: assert_eq(get_queued_count(file_kind="layer", op_kind="upload"), 0)) - wait_until(30, 1, lambda: assert_eq(get_queued_count(file_kind="index", op_kind="upload"), 0)) - wait_until(30, 1, lambda: assert_eq(get_queued_count(file_kind="layer", op_kind="delete"), 0)) + wait_until( + lambda: assert_eq(get_queued_count(file_kind="layer", op_kind="upload"), 0), timeout=30 + ) + wait_until( + lambda: assert_eq(get_queued_count(file_kind="index", op_kind="upload"), 0), timeout=30 + ) + wait_until( + lambda: assert_eq(get_queued_count(file_kind="layer", op_kind="delete"), 0), timeout=30 + ) # The churn thread doesn't make progress once it blocks on the first wait_completion() call, # so, give it some time to wrap up. @@ -580,7 +592,7 @@ def test_timeline_deletion_with_files_stuck_in_upload_queue( > 0 ) - wait_until(200, 0.1, assert_compacted_and_uploads_queued) + wait_until(assert_compacted_and_uploads_queued) # Regardless, give checkpoint some time to block for good. # Not strictly necessary, but might help uncover failure modes in the future. 
@@ -598,9 +610,7 @@ def test_timeline_deletion_with_files_stuck_in_upload_queue( ] ) - # Generous timeout, because currently deletions can get blocked waiting for compaction - # This can be reduced when https://github.com/neondatabase/neon/issues/4998 is fixed. - timeline_delete_wait_completed(client, tenant_id, timeline_id, iterations=30, interval=1) + timeline_delete_wait_completed(client, tenant_id, timeline_id) assert not timeline_path.exists() @@ -826,22 +836,16 @@ def wait_upload_queue_empty( client: PageserverHttpClient, tenant_id: TenantId, timeline_id: TimelineId ): wait_until( - 2, - 1, lambda: assert_eq( get_queued_count(client, tenant_id, timeline_id, file_kind="layer", op_kind="upload"), 0 ), ) wait_until( - 2, - 1, lambda: assert_eq( get_queued_count(client, tenant_id, timeline_id, file_kind="index", op_kind="upload"), 0 ), ) wait_until( - 2, - 1, lambda: assert_eq( get_queued_count(client, tenant_id, timeline_id, file_kind="layer", op_kind="delete"), 0 ), diff --git a/test_runner/regress/test_replica_start.py b/test_runner/regress/test_replica_start.py index 8e7c01f950..e2a22cc769 100644 --- a/test_runner/regress/test_replica_start.py +++ b/test_runner/regress/test_replica_start.py @@ -378,7 +378,7 @@ def test_replica_too_many_known_assigned_xids(neon_simple_env: NeonEnv): return None raise RuntimeError("connection succeeded") - wait_until(20, 0.5, check_replica_crashed) + wait_until(check_replica_crashed) assert secondary.log_contains("too many KnownAssignedXids") # Replica is crashed, so ignore stop result diff --git a/test_runner/regress/test_sharding.py b/test_runner/regress/test_sharding.py index 411574bd86..30abf91d3a 100644 --- a/test_runner/regress/test_sharding.py +++ b/test_runner/regress/test_sharding.py @@ -519,6 +519,13 @@ def test_sharding_split_smoke( # We will have 2 shards per pageserver once done (including secondaries) neon_env_builder.num_pageservers = split_shard_count + # Two AZs + def assign_az(ps_cfg): + az = 
f"az-{(ps_cfg['id'] - 1) % 2}" + ps_cfg["availability_zone"] = az + + neon_env_builder.pageserver_config_override = assign_az + # 1MiB stripes: enable getting some meaningful data distribution without # writing large quantities of data in this test. The stripe size is given # in number of 8KiB pages. @@ -836,7 +843,7 @@ def test_sharding_split_stripe_size( assert len(notifications) == 3 assert notifications[2] == expect_after - wait_until(10, 1, assert_restart_notification) + wait_until(assert_restart_notification) # The quantity of data isn't huge, but debug can be _very_ slow, and the things we're @@ -1025,7 +1032,7 @@ def test_sharding_ingest_gaps( assert Lsn(timeline_detail["disk_consistent_lsn"]) >= expect_lsn # We set a short checkpoint timeout: expect things to get frozen+flushed within that - wait_until(checkpoint_interval_secs * 3, 1, assert_all_disk_consistent) + wait_until(assert_all_disk_consistent, timeout=3 * checkpoint_interval_secs) def assert_all_remote_consistent(): """ @@ -1037,7 +1044,7 @@ def test_sharding_ingest_gaps( assert Lsn(timeline_detail["remote_consistent_lsn"]) >= expect_lsn # We set a short checkpoint timeout: expect things to get frozen+flushed within that - wait_until(checkpoint_interval_secs * 3, 1, assert_all_remote_consistent) + wait_until(assert_all_remote_consistent, timeout=3 * checkpoint_interval_secs) workload.validate() @@ -1405,14 +1412,14 @@ def test_sharding_split_failures( # e.g. 
while waiting for a storage controller to re-attach a parent shard if we failed # inside the pageserver and the storage controller responds by detaching children and attaching # parents concurrently (https://github.com/neondatabase/neon/issues/7148) - wait_until(10, 1, lambda: workload.churn_rows(10, upload=False, ingest=False)) + wait_until(lambda: workload.churn_rows(10, upload=False, ingest=False)) workload.validate() if failure.fails_forward(env): log.info("Fail-forward failure, checking split eventually completes...") # A failure type which results in eventual completion of the split - wait_until(30, 1, assert_split_done) + wait_until(assert_split_done) elif failure.can_mitigate(): log.info("Mitigating failure...") # Mitigation phase: we expect to be able to proceed with a successful shard split @@ -1420,21 +1427,21 @@ def test_sharding_split_failures( # The split should appear to be rolled back from the point of view of all pageservers # apart from the one that is offline - wait_until(30, 1, lambda: assert_rolled_back(exclude_ps_id=failure.pageserver_id)) + wait_until(lambda: assert_rolled_back(exclude_ps_id=failure.pageserver_id)) finish_split() - wait_until(30, 1, lambda: assert_split_done(exclude_ps_id=failure.pageserver_id)) + wait_until(lambda: assert_split_done(exclude_ps_id=failure.pageserver_id)) # Having cleared the failure, everything should converge to a pristine state failure.clear(env) - wait_until(30, 1, assert_split_done) + wait_until(assert_split_done) else: # Once we restore the faulty pageserver's API to good health, rollback should # eventually complete. 
log.info("Clearing failure...") failure.clear(env) - wait_until(30, 1, assert_rolled_back) + wait_until(assert_rolled_back) # Having rolled back, the tenant should be working workload.churn_rows(10) diff --git a/test_runner/regress/test_storage_controller.py b/test_runner/regress/test_storage_controller.py index 13bc54a114..f878116d53 100644 --- a/test_runner/regress/test_storage_controller.py +++ b/test_runner/regress/test_storage_controller.py @@ -154,7 +154,7 @@ def test_storage_controller_smoke(neon_env_builder: NeonEnvBuilder, combination) counts = get_node_shard_counts(env, tenant_ids) assert counts[node_id] == 0 - wait_until(10, 1, lambda: node_evacuated(env.pageservers[0].id)) + wait_until(lambda: node_evacuated(env.pageservers[0].id)) # Let all the reconciliations after marking the node offline complete env.storage_controller.reconcile_until_idle() @@ -222,7 +222,7 @@ def test_node_status_after_restart( def is_ready(): assert env.storage_controller.ready() is True - wait_until(30, 1, is_ready) + wait_until(is_ready) # We loaded nodes from database on restart nodes = env.storage_controller.node_list() @@ -606,7 +606,7 @@ def test_storage_controller_compute_hook( counts = get_node_shard_counts(env, [env.initial_tenant]) assert counts[node_id] == 0 - wait_until(10, 1, lambda: node_evacuated(env.pageservers[0].id)) + wait_until(lambda: node_evacuated(env.pageservers[0].id)) # Additional notification from migration log.info(f"notifications: {notifications}") @@ -620,7 +620,7 @@ def test_storage_controller_compute_hook( assert len(notifications) == 2 assert notifications[1] == expect - wait_until(20, 0.25, received_migration_notification) + wait_until(received_migration_notification) # When we restart, we should re-emit notifications for all tenants env.storage_controller.stop() @@ -630,7 +630,7 @@ def test_storage_controller_compute_hook( assert len(notifications) == 3 assert notifications[2] == expect - wait_until(10, 1, received_restart_notification) + 
wait_until(received_restart_notification) # Splitting a tenant should cause its stripe size to become visible in the compute notification env.storage_controller.tenant_shard_split(env.initial_tenant, shard_count=2) @@ -647,7 +647,7 @@ def test_storage_controller_compute_hook( assert len(notifications) == 4 assert notifications[3] == expect - wait_until(10, 1, received_split_notification) + wait_until(received_split_notification) # If the compute hook is unavailable, that should not block creating a tenant and # creating a timeline. This simulates a control plane refusing to accept notifications @@ -736,7 +736,7 @@ def test_storage_controller_stuck_compute_hook( def logged_stuck(): env.storage_controller.assert_log_contains(NOTIFY_BLOCKED_LOG) - wait_until(10, 0.25, logged_stuck) + wait_until(logged_stuck) contains_r = env.storage_controller.log_contains(NOTIFY_BLOCKED_LOG) assert contains_r is not None # Appease mypy (_, log_cursor) = contains_r @@ -764,7 +764,7 @@ def test_storage_controller_stuck_compute_hook( def logged_stuck_again(): env.storage_controller.assert_log_contains(NOTIFY_BLOCKED_LOG, offset=log_cursor) - wait_until(10, 0.25, logged_stuck_again) + wait_until(logged_stuck_again) assert migrate_fut.running() # This time, the compute hook remains stuck, but we mark the origin node offline: this should @@ -865,7 +865,7 @@ def test_storage_controller_compute_hook_revert( assert latest["shards"] is not None assert latest["shards"][0]["node_id"] == ps_id - wait_until(30, 1, lambda: notified_ps(pageserver_a.id)) + wait_until(lambda: notified_ps(pageserver_a.id)) env.storage_controller.allowed_errors.append(NOTIFY_BLOCKED_LOG) env.storage_controller.allowed_errors.extend(NOTIFY_FAILURE_LOGS) @@ -880,7 +880,7 @@ def test_storage_controller_compute_hook_revert( # Although the migration API failed, the hook should still see pageserver B (it remembers what # was posted even when returning an error code) - wait_until(30, 1, lambda: notified_ps(pageserver_b.id)) + 
wait_until(lambda: notified_ps(pageserver_b.id)) # Although the migration API failed, the tenant should still have moved to the right pageserver assert len(pageserver_b.http_client().tenant_list()) == 1 @@ -898,7 +898,7 @@ def test_storage_controller_compute_hook_revert( def logged_giving_up(): env.storage_controller.assert_log_contains(".*Giving up on compute notification.*") - wait_until(30, 1, logged_giving_up) + wait_until(logged_giving_up) pageserver_a.start() @@ -919,7 +919,7 @@ def test_storage_controller_compute_hook_revert( handle_params["status"] = 200 env.storage_controller.tenant_shard_migrate(tenant_shard_id, pageserver_a.id) - wait_until(30, 1, lambda: notified_ps(pageserver_a.id)) + wait_until(lambda: notified_ps(pageserver_a.id)) def test_storage_controller_debug_apis(neon_env_builder: NeonEnvBuilder): @@ -1453,7 +1453,7 @@ def test_storage_controller_heartbeats( # Check that each node got one tenant assert all(len(ts) == 1 for ts in node_to_tenants.values()) - wait_until(10, 1, tenants_placed) + wait_until(tenants_placed) # ... then we apply the failure offline_node_ids = set(failure.nodes()) @@ -1476,7 +1476,7 @@ def test_storage_controller_heartbeats( assert node["availability"] == "Offline" start = time.time() - wait_until(failure.offline_timeout, 1, nodes_offline) + wait_until(nodes_offline, timeout=failure.offline_timeout) detected_after = time.time() - start log.info(f"Detected node failures after {detected_after}s") @@ -1497,7 +1497,7 @@ def test_storage_controller_heartbeats( assert observed_tenants == set(tenant_ids) - wait_until(10, 1, tenant_migrated) + wait_until(tenant_migrated) # ... 
then we clear the failure failure.clear(env) @@ -1509,7 +1509,7 @@ def test_storage_controller_heartbeats( if node["id"] in online_node_ids: assert node["availability"] == "Active" - wait_until(10, 1, nodes_online) + wait_until(nodes_online) time.sleep(5) @@ -1562,7 +1562,7 @@ def test_storage_controller_re_attach(neon_env_builder: NeonEnvBuilder): # We could pre-empty this by configuring the node to Offline, but it's preferable to test # the realistic path we would take when a node restarts uncleanly. # The delay here will be ~NEON_LOCAL_MAX_UNAVAILABLE_INTERVAL in neon_local - wait_until(30, 1, failed_over) + wait_until(failed_over) reconciles_before_restart = env.storage_controller.get_metric_value( "storage_controller_reconcile_complete_total", filter={"status": "ok"} @@ -1640,12 +1640,12 @@ def test_storage_controller_shard_scheduling_policy(neon_env_builder: NeonEnvBui assert e > n return e - errs = wait_until(10, 1, lambda: assert_errors_gt(0)) + errs = wait_until(lambda: assert_errors_gt(0)) # Try reconciling again, it should fail again with pytest.raises(StorageControllerApiException): env.storage_controller.reconcile_all() - errs = wait_until(10, 1, lambda: assert_errors_gt(errs)) + errs = wait_until(lambda: assert_errors_gt(errs)) # Configure the tenant to disable reconciles env.storage_controller.tenant_policy_update( @@ -1674,7 +1674,7 @@ def test_storage_controller_shard_scheduling_policy(neon_env_builder: NeonEnvBui return o # We should see a successful reconciliation - wait_until(10, 1, lambda: assert_ok_gt(0)) + wait_until(lambda: assert_ok_gt(0)) # And indeed the tenant should be attached assert len(env.pageserver.http_client().tenant_list_locations()["tenant_shards"]) == 1 @@ -1747,8 +1747,8 @@ def test_storcon_cli(neon_env_builder: NeonEnvBuilder): # Describe a tenant tenant_lines = storcon_cli(["tenant-describe", "--tenant-id", str(env.initial_tenant)]) - assert len(tenant_lines) == 3 + shard_count * 2 - assert str(env.initial_tenant) in 
tenant_lines[3] + assert len(tenant_lines) >= 3 + shard_count * 2 + assert str(env.initial_tenant) in tenant_lines[0] # Pause changes on a tenant storcon_cli(["tenant-policy", "--tenant-id", str(env.initial_tenant), "--scheduling", "stop"]) @@ -2073,7 +2073,7 @@ def test_skip_drain_on_secondary_lag(neon_env_builder: NeonEnvBuilder, pg_bin: P raise Exception(f"Secondary lag not big enough: {lag}") log.info(f"Looking for lag to develop on the secondary {secondary}") - wait_until(10, 1, secondary_is_lagging) + wait_until(secondary_is_lagging) log.info(f"Starting drain of primary {primary} with laggy secondary {secondary}") env.storage_controller.retryable_node_operation( @@ -2107,7 +2107,7 @@ def test_skip_drain_on_secondary_lag(neon_env_builder: NeonEnvBuilder, pg_bin: P if lag > 1 * 1024 * 1024: raise Exception(f"Secondary lag not big enough: {lag}") - wait_until(10, 1, lag_is_acceptable) + wait_until(lag_is_acceptable) env.storage_controller.node_configure(primary, {"scheduling": "Active"}) @@ -2227,7 +2227,7 @@ def test_storage_controller_node_deletion( log.info(f"Shards on nodes other than on victim: {elsewhere}") assert elsewhere == tenant_count * shard_count_per_tenant - wait_until(30, 1, assert_shards_migrated) + wait_until(assert_shards_migrated) log.info(f"Deleting pageserver {victim.id}") env.storage_controller.node_delete(victim.id) @@ -2240,7 +2240,7 @@ def test_storage_controller_node_deletion( log.info(f"Shards on node {victim.id}: {count}") assert count == 0 - wait_until(30, 1, assert_victim_evacuated) + wait_until(assert_victim_evacuated) # The node should be gone from the list API assert victim.id not in [n["id"] for n in env.storage_controller.node_list()] @@ -2253,12 +2253,7 @@ def test_storage_controller_node_deletion( assert victim.id not in shard["node_secondary"] # Reconciles running during deletion should all complete - # FIXME: this currently doesn't work because the deletion schedules shards without a proper ScheduleContext, resulting - # in 
states that background_reconcile wants to optimize, but can't proceed with migrations yet because this is a short3 - # test that hasn't uploaded any heatmaps for secondaries. - # In the interim, just do a reconcile_all to enable the consistency check. - # env.storage_controller.reconcile_until_idle() - env.storage_controller.reconcile_all() + env.storage_controller.reconcile_until_idle() # Controller should pass its own consistency checks env.storage_controller.consistency_check() @@ -2267,7 +2262,6 @@ def test_storage_controller_node_deletion( env.storage_controller.stop() env.storage_controller.start() assert victim.id not in [n["id"] for n in env.storage_controller.node_list()] - env.storage_controller.reconcile_all() # FIXME: workaround for optimizations happening on startup, see FIXME above. env.storage_controller.consistency_check() @@ -2569,7 +2563,7 @@ def test_storage_controller_leadership_transfer( == StorageControllerLeadershipStatus.STEPPED_DOWN ) - wait_until(5, 1, previous_stepped_down) + wait_until(previous_stepped_down) storage_controller_proxy.route_to(f"http://127.0.0.1:{storage_controller_2_port}") @@ -2579,7 +2573,7 @@ def test_storage_controller_leadership_transfer( == StorageControllerLeadershipStatus.LEADER ) - wait_until(15, 1, new_becomes_leader) + wait_until(new_becomes_leader) leader = env.storage_controller.get_leader() assert leader["address"] == f"http://127.0.0.1:{storage_controller_2_port}/" @@ -2624,7 +2618,7 @@ def test_storage_controller_ps_restarted_during_drain(neon_env_builder: NeonEnvB env.storage_controller.configure_failpoints(("sleepy-drain-loop", "return(10000)")) env.storage_controller.node_drain(attached.id) - wait_until(10, 0.5, attached_is_draining) + wait_until(attached_is_draining) attached.restart() @@ -2646,7 +2640,7 @@ def test_storage_controller_ps_restarted_during_drain(neon_env_builder: NeonEnvB env.storage_controller.node_configure(attached.id, {"scheduling": "Pause"}) # allow for small delay between actually 
having cancelled and being able reconfigure again - wait_until(4, 0.5, reconfigure_node_again) + wait_until(reconfigure_node_again) def test_storage_controller_timeline_crud_race(neon_env_builder: NeonEnvBuilder): @@ -2691,7 +2685,7 @@ def test_storage_controller_timeline_crud_race(neon_env_builder: NeonEnvBuilder) ps.log_contains(f"at failpoint {failpoint}") is not None for ps in env.pageservers ) - wait_until(10, 1, has_hit_failpoint) + wait_until(has_hit_failpoint) # Migrate the tenant while the timeline creation is in progress: this migration will complete once it # can detach from the old pageserver, which will happen once the failpoint completes. @@ -2775,7 +2769,7 @@ def test_storage_controller_validate_during_migration(neon_env_builder: NeonEnvB def has_hit_compaction_failpoint(): assert origin_pageserver.log_contains(f"at failpoint {compaction_failpoint}") - wait_until(10, 1, has_hit_compaction_failpoint) + wait_until(has_hit_compaction_failpoint) # While the compaction is running, start a live migration which will pause long enough for the compaction to sleep, # after incrementing generation and attaching the new location @@ -2794,7 +2788,7 @@ def test_storage_controller_validate_during_migration(neon_env_builder: NeonEnvB # before it reaches this point. The timeout is because the AttachedStale transition includes # a flush of remote storage, and if the compaction already enqueued an index upload this cannot # make progress. - wait_until(60, 1, has_hit_migration_failpoint) + wait_until(has_hit_migration_failpoint, timeout=60) # Origin pageserver has succeeded with compaction before the migration completed. 
It has done all the writes it wanted to do in its own (stale) generation origin_pageserver.http_client().configure_failpoints((compaction_failpoint, "off")) @@ -2917,7 +2911,7 @@ def test_storage_controller_proxy_during_migration( log.info(expr) assert env.storage_controller.log_contains(expr) - wait_until(10, 1, has_hit_migration_failpoint) + wait_until(has_hit_migration_failpoint) # This request should be routed to whichever pageserver holds the highest generation tenant_info = env.storage_controller.pageserver_api().tenant_status( @@ -2934,7 +2928,7 @@ def test_storage_controller_proxy_during_migration( # We expect request to land on the origin assert tenant_info["generation"] == 1 - wait_until(10, 1, long_migration_metric_published) + wait_until(long_migration_metric_published) # Eventually migration completes env.storage_controller.configure_failpoints((migration_failpoint.value, "off")) @@ -3063,7 +3057,11 @@ def test_shard_preferred_azs(neon_env_builder: NeonEnvBuilder): for shard in shards: attached_to = shard["node_attached"] expected_az = env.get_pageserver(attached_to).az_id - assert shard["preferred_az_id"] == expected_az + + # The scheduling optimization logic is not yet AZ-aware, so doesn't succeed + # in putting the tenant shards in the preferred AZ. 
+ # To be fixed in https://github.com/neondatabase/neon/pull/9916 + # assert shard["preferred_az_id"] == expected_az @run_only_on_default_postgres("Postgres version makes no difference here") @@ -3113,7 +3111,7 @@ def test_timeline_delete_mid_live_migration(neon_env_builder: NeonEnvBuilder, mi log.info(expr) assert env.storage_controller.log_contains(expr) - wait_until(10, 1, has_hit_migration_failpoint) + wait_until(has_hit_migration_failpoint) env.storage_controller.pageserver_api().timeline_delete( tenant_id=tenant_id, timeline_id=timeline_id @@ -3182,7 +3180,7 @@ def test_multi_attached_timeline_creation(neon_env_builder: NeonEnvBuilder, migr log.info(expr) assert env.storage_controller.log_contains(expr) - wait_until(10, 1, has_hit_migration_failpoint) + wait_until(has_hit_migration_failpoint) timeline_id = TimelineId.generate() env.storage_controller.pageserver_api().timeline_create( diff --git a/test_runner/regress/test_storage_scrubber.py b/test_runner/regress/test_storage_scrubber.py index 3991bd7061..b16dc54c24 100644 --- a/test_runner/regress/test_storage_scrubber.py +++ b/test_runner/regress/test_storage_scrubber.py @@ -431,8 +431,6 @@ def test_scrubber_physical_gc_ancestors_split(neon_env_builder: NeonEnvBuilder): # Let the controller reach the failpoint wait_until( - 10, - 1, lambda: env.storage_controller.assert_log_contains( 'failpoint "shard-split-post-remote-sleep": sleeping' ), diff --git a/test_runner/regress/test_subscriber_restart.py b/test_runner/regress/test_subscriber_restart.py index d37eeb1e6e..7d4f66d044 100644 --- a/test_runner/regress/test_subscriber_restart.py +++ b/test_runner/regress/test_subscriber_restart.py @@ -56,4 +56,4 @@ def test_subscriber_restart(neon_simple_env: NeonEnv): pcur.execute(f"INSERT into t values ({n_records}, 0)") n_records += 1 with sub.cursor() as scur: - wait_until(60, 0.5, check_that_changes_propagated) + wait_until(check_that_changes_propagated) diff --git a/test_runner/regress/test_tenant_conf.py 
b/test_runner/regress/test_tenant_conf.py index 1dd46ec3d1..f8f240cfdc 100644 --- a/test_runner/regress/test_tenant_conf.py +++ b/test_runner/regress/test_tenant_conf.py @@ -234,11 +234,7 @@ def test_creating_tenant_conf_after_attach(neon_env_builder: NeonEnvBuilder): assert not config_path.exists(), "detach did not remove config file" env.pageserver.tenant_attach(tenant_id) - wait_until( - number_of_iterations=5, - interval=1, - func=lambda: assert_tenant_state(http_client, tenant_id, "Active"), - ) + wait_until(lambda: assert_tenant_state(http_client, tenant_id, "Active")) env.config_tenant(tenant_id, {"gc_horizon": "1000000"}) contents_first = config_path.read_text() diff --git a/test_runner/regress/test_tenant_delete.py b/test_runner/regress/test_tenant_delete.py index 47df3ead70..48e55c1ab1 100644 --- a/test_runner/regress/test_tenant_delete.py +++ b/test_runner/regress/test_tenant_delete.py @@ -185,21 +185,21 @@ def test_long_timeline_create_cancelled_by_tenant_delete(neon_env_builder: NeonE deletion = None try: - wait_until(10, 1, has_hit_failpoint) + wait_until(has_hit_failpoint) # it should start ok, sync up with the stuck creation, then hang waiting for the timeline # to shut down. 
deletion = Thread(target=start_deletion) deletion.start() - wait_until(10, 1, deletion_has_started_waiting_for_timelines) + wait_until(deletion_has_started_waiting_for_timelines) pageserver_http.configure_failpoints((failpoint, "off")) creation.join() deletion.join() - wait_until(10, 1, tenant_is_deleted) + wait_until(tenant_is_deleted) finally: creation.join() if deletion is not None: @@ -264,7 +264,7 @@ def test_tenant_delete_races_timeline_creation(neon_env_builder: NeonEnvBuilder) def hit_initdb_upload_failpoint(): env.pageserver.assert_log_contains(f"at failpoint {BEFORE_INITDB_UPLOAD_FAILPOINT}") - wait_until(100, 0.1, hit_initdb_upload_failpoint) + wait_until(hit_initdb_upload_failpoint) def creation_connection_timed_out(): env.pageserver.assert_log_contains( @@ -273,7 +273,7 @@ def test_tenant_delete_races_timeline_creation(neon_env_builder: NeonEnvBuilder) # Wait so that we hit the timeout and the connection is dropped # (But timeline creation still continues) - wait_until(100, 0.1, creation_connection_timed_out) + wait_until(creation_connection_timed_out) ps_http.configure_failpoints((DELETE_BEFORE_CLEANUP_FAILPOINT, "pause")) @@ -281,7 +281,7 @@ def test_tenant_delete_races_timeline_creation(neon_env_builder: NeonEnvBuilder) def tenant_delete_inner(): ps_http.tenant_delete(tenant_id) - wait_until(100, 0.5, tenant_delete_inner) + wait_until(tenant_delete_inner) Thread(target=tenant_delete).start() @@ -290,7 +290,7 @@ def test_tenant_delete_races_timeline_creation(neon_env_builder: NeonEnvBuilder) f"cfg failpoint: {DELETE_BEFORE_CLEANUP_FAILPOINT} pause" ) - wait_until(100, 0.1, deletion_arrived) + wait_until(deletion_arrived) ps_http.configure_failpoints((DELETE_BEFORE_CLEANUP_FAILPOINT, "off")) diff --git a/test_runner/regress/test_tenant_detach.py b/test_runner/regress/test_tenant_detach.py index 8d7ca7bc4e..3f21dc895a 100644 --- a/test_runner/regress/test_tenant_detach.py +++ b/test_runner/regress/test_tenant_detach.py @@ -212,7 +212,7 @@ def 
test_tenant_reattach_while_busy( nonlocal updates_started, updates_finished, updates_to_perform # Wait until we have performed some updates - wait_until(20, 0.5, lambda: updates_finished > 500) + wait_until(lambda: updates_finished > 500) log.info("Detaching tenant") pageserver_http.tenant_detach(tenant_id) @@ -512,7 +512,7 @@ def test_metrics_while_ignoring_broken_tenant_and_reloading( ) assert only_int(active) == 0 and only_int(broken) == 1 and only_int(broken_set) == 1 - wait_until(10, 0.5, found_broken) + wait_until(found_broken) client.tenant_detach(env.initial_tenant) @@ -524,7 +524,7 @@ def test_metrics_while_ignoring_broken_tenant_and_reloading( ) assert only_int(broken) == 0 and len(broken_set) == 0 - wait_until(10, 0.5, found_cleaned_up) + wait_until(found_cleaned_up) env.pageserver.tenant_attach(env.initial_tenant) @@ -536,4 +536,4 @@ def test_metrics_while_ignoring_broken_tenant_and_reloading( ) assert only_int(active) == 1 and len(broken_set) == 0 - wait_until(10, 0.5, found_active) + wait_until(found_active) diff --git a/test_runner/regress/test_tenant_relocation.py b/test_runner/regress/test_tenant_relocation.py index bf6120aa0a..df53a98e92 100644 --- a/test_runner/regress/test_tenant_relocation.py +++ b/test_runner/regress/test_tenant_relocation.py @@ -298,11 +298,7 @@ def test_tenant_relocation( destination_ps.tenant_attach(tenant_id) # wait for tenant to finish attaching - wait_until( - number_of_iterations=10, - interval=1, - func=lambda: assert_tenant_state(destination_http, tenant_id, "Active"), - ) + wait_until(lambda: assert_tenant_state(destination_http, tenant_id, "Active")) check_timeline_attached( destination_http, diff --git a/test_runner/regress/test_tenant_size.py b/test_runner/regress/test_tenant_size.py index 8b733da0c6..713f89c60f 100644 --- a/test_runner/regress/test_tenant_size.py +++ b/test_runner/regress/test_tenant_size.py @@ -638,7 +638,7 @@ def test_synthetic_size_while_deleting(neon_env_builder: NeonEnvBuilder): with 
ThreadPoolExecutor(max_workers=1) as exec: completion = exec.submit(client.tenant_size, env.initial_tenant) _, last_offset = wait_until( - 10, 1.0, lambda: env.pageserver.assert_log_contains(f"at failpoint {failpoint}") + lambda: env.pageserver.assert_log_contains(f"at failpoint {failpoint}") ) timeline_delete_wait_completed(client, env.initial_tenant, branch_id) @@ -656,8 +656,6 @@ def test_synthetic_size_while_deleting(neon_env_builder: NeonEnvBuilder): with ThreadPoolExecutor(max_workers=1) as exec: completion = exec.submit(client.tenant_size, env.initial_tenant) wait_until( - 10, - 1.0, lambda: env.pageserver.assert_log_contains( f"at failpoint {failpoint}", offset=last_offset ), diff --git a/test_runner/regress/test_tenant_tasks.py b/test_runner/regress/test_tenant_tasks.py index 72183f5778..4c26b64d22 100644 --- a/test_runner/regress/test_tenant_tasks.py +++ b/test_runner/regress/test_tenant_tasks.py @@ -77,4 +77,4 @@ def test_tenant_tasks(neon_env_builder: NeonEnvBuilder): assert tasks_started == tasks_ended assert tasks_panicked is None or int(tasks_panicked) == 0 - wait_until(10, 0.2, assert_tasks_finish) + wait_until(assert_tasks_finish) diff --git a/test_runner/regress/test_tenants.py b/test_runner/regress/test_tenants.py index 158c3fddb0..d31901b384 100644 --- a/test_runner/regress/test_tenants.py +++ b/test_runner/regress/test_tenants.py @@ -330,7 +330,7 @@ def test_pageserver_with_empty_tenants(neon_env_builder: NeonEnvBuilder): assert len(tenants) == 1 assert all(t["state"]["slug"] != "Attaching" for t in tenants) - wait_until(10, 0.2, not_attaching) + wait_until(not_attaching) tenants = client.tenant_list() diff --git a/test_runner/regress/test_tenants_with_remote_storage.py b/test_runner/regress/test_tenants_with_remote_storage.py index 8d3ddf7e54..6b27c41d1c 100644 --- a/test_runner/regress/test_tenants_with_remote_storage.py +++ b/test_runner/regress/test_tenants_with_remote_storage.py @@ -178,11 +178,7 @@ def 
test_tenants_attached_after_download(neon_env_builder: NeonEnvBuilder): env.pageserver.start() client = env.pageserver.http_client() - wait_until( - number_of_iterations=5, - interval=1, - func=lambda: assert_tenant_state(client, tenant_id, "Active"), - ) + wait_until(lambda: assert_tenant_state(client, tenant_id, "Active")) restored_timelines = client.timeline_list(tenant_id) assert ( @@ -257,11 +253,7 @@ def test_tenant_redownloads_truncated_file_on_startup( env.pageserver.start() client = env.pageserver.http_client() - wait_until( - number_of_iterations=5, - interval=1, - func=lambda: assert_tenant_state(client, tenant_id, "Active"), - ) + wait_until(lambda: assert_tenant_state(client, tenant_id, "Active")) restored_timelines = client.timeline_list(tenant_id) assert ( diff --git a/test_runner/regress/test_timeline_archive.py b/test_runner/regress/test_timeline_archive.py index bc2e048f69..e808dd1396 100644 --- a/test_runner/regress/test_timeline_archive.py +++ b/test_runner/regress/test_timeline_archive.py @@ -227,8 +227,8 @@ def test_timeline_offloading(neon_env_builder: NeonEnvBuilder, manual_offload: b ps_http.timeline_offload(tenant_id=tenant_id, timeline_id=leaf_timeline_id) assert timeline_offloaded_logged(leaf_timeline_id) - wait_until(30, 1, leaf_offloaded) - wait_until(30, 1, parent_offloaded) + wait_until(leaf_offloaded) + wait_until(parent_offloaded) # Offloaded child timelines should still prevent deletion with pytest.raises( @@ -331,7 +331,7 @@ def test_timeline_offload_persist(neon_env_builder: NeonEnvBuilder, delete_timel ps_http.timeline_offload(tenant_id=tenant_id, timeline_id=child_timeline_id) assert timeline_offloaded_api(child_timeline_id) - wait_until(30, 1, child_offloaded) + wait_until(child_offloaded) assert timeline_offloaded_api(child_timeline_id) assert not timeline_offloaded_api(root_timeline_id) @@ -835,3 +835,117 @@ def test_timeline_retain_lsn( with env.endpoints.create_start("test_archived_branch", tenant_id=tenant_id) as 
endpoint: sum = endpoint.safe_psql("SELECT sum(key) from foo where v < 51200") assert sum == pre_branch_sum + + +def test_timeline_offload_generations(neon_env_builder: NeonEnvBuilder): + """ + Test for scrubber deleting old generations of manifests + """ + remote_storage_kind = s3_storage() + neon_env_builder.enable_pageserver_remote_storage(remote_storage_kind) + + env = neon_env_builder.init_start() + ps_http = env.pageserver.http_client() + + # Turn off gc and compaction loops: we want to issue them manually for better reliability + tenant_id, root_timeline_id = env.create_tenant( + conf={ + "gc_period": "0s", + "compaction_period": "0s", + "checkpoint_distance": f"{1024 ** 2}", + } + ) + + # Create a branch and archive it + child_timeline_id = env.create_branch("test_archived_branch_persisted", tenant_id) + + with env.endpoints.create_start( + "test_archived_branch_persisted", tenant_id=tenant_id + ) as endpoint: + endpoint.safe_psql_many( + [ + "CREATE TABLE foo(key serial primary key, t text default 'data_content')", + "INSERT INTO foo SELECT FROM generate_series(1,512)", + ] + ) + sum = endpoint.safe_psql("SELECT sum(key) from foo where key % 3 = 2") + last_flush_lsn_upload(env, endpoint, tenant_id, child_timeline_id) + + assert_prefix_not_empty( + neon_env_builder.pageserver_remote_storage, + prefix=f"tenants/{str(tenant_id)}/", + ) + assert_prefix_empty( + neon_env_builder.pageserver_remote_storage, + prefix=f"tenants/{str(tenant_id)}/tenant-manifest", + ) + + ps_http.timeline_archival_config( + tenant_id, + child_timeline_id, + state=TimelineArchivalState.ARCHIVED, + ) + + def timeline_offloaded_api(timeline_id: TimelineId) -> bool: + # TODO add a proper API to check if a timeline has been offloaded or not + return not any( + timeline["timeline_id"] == str(timeline_id) + for timeline in ps_http.timeline_list(tenant_id=tenant_id) + ) + + def child_offloaded(): + ps_http.timeline_offload(tenant_id=tenant_id, timeline_id=child_timeline_id) + assert 
timeline_offloaded_api(child_timeline_id) + + wait_until(child_offloaded) + + assert timeline_offloaded_api(child_timeline_id) + assert not timeline_offloaded_api(root_timeline_id) + + # Reboot the pageserver a bunch of times, do unoffloads, offloads + for i in range(5): + env.pageserver.stop() + env.pageserver.start() + + assert timeline_offloaded_api(child_timeline_id) + assert not timeline_offloaded_api(root_timeline_id) + + ps_http.timeline_archival_config( + tenant_id, + child_timeline_id, + state=TimelineArchivalState.UNARCHIVED, + ) + + assert not timeline_offloaded_api(child_timeline_id) + + if i % 2 == 0: + with env.endpoints.create_start( + "test_archived_branch_persisted", tenant_id=tenant_id + ) as endpoint: + sum_again = endpoint.safe_psql("SELECT sum(key) from foo where key % 3 = 2") + assert sum == sum_again + + ps_http.timeline_archival_config( + tenant_id, + child_timeline_id, + state=TimelineArchivalState.ARCHIVED, + ) + wait_until(child_offloaded) + + # + # Now ensure that scrubber runs will clean up old generations' manifests. 
+ # + + # Sleep some amount larger than min_age_secs + time.sleep(3) + + # Ensure that min_age_secs has a deletion impeding effect + gc_summary = env.storage_scrubber.pageserver_physical_gc(min_age_secs=3600, mode="full") + assert gc_summary["remote_storage_errors"] == 0 + assert gc_summary["indices_deleted"] == 0 + assert gc_summary["tenant_manifests_deleted"] == 0 + + gc_summary = env.storage_scrubber.pageserver_physical_gc(min_age_secs=1, mode="full") + assert gc_summary["remote_storage_errors"] == 0 + assert gc_summary["indices_deleted"] > 0 + assert gc_summary["tenant_manifests_deleted"] > 0 diff --git a/test_runner/regress/test_timeline_delete.py b/test_runner/regress/test_timeline_delete.py index 155709e106..fbece68367 100644 --- a/test_runner/regress/test_timeline_delete.py +++ b/test_runner/regress/test_timeline_delete.py @@ -21,7 +21,6 @@ from fixtures.pageserver.utils import ( assert_prefix_empty, assert_prefix_not_empty, many_small_layers_tenant_config, - poll_for_remote_storage_iterations, timeline_delete_wait_completed, wait_for_last_record_lsn, wait_for_upload, @@ -94,12 +93,7 @@ def test_timeline_delete(neon_simple_env: NeonEnv): assert timeline_path.exists() # retry deletes when compaction or gc is running in pageserver - # TODO: review whether this wait_until is actually necessary, we do an await() internally - wait_until( - number_of_iterations=3, - interval=0.2, - func=lambda: timeline_delete_wait_completed(ps_http, env.initial_tenant, leaf_timeline_id), - ) + timeline_delete_wait_completed(ps_http, env.initial_tenant, leaf_timeline_id) assert not timeline_path.exists() @@ -111,13 +105,7 @@ def test_timeline_delete(neon_simple_env: NeonEnv): ps_http.timeline_detail(env.initial_tenant, leaf_timeline_id) assert exc.value.status_code == 404 - wait_until( - number_of_iterations=3, - interval=0.2, - func=lambda: timeline_delete_wait_completed( - ps_http, env.initial_tenant, parent_timeline_id - ), - ) + timeline_delete_wait_completed(ps_http, 
env.initial_tenant, parent_timeline_id) # Check that we didn't pick up the timeline again after restart. # See https://github.com/neondatabase/neon/issues/3560 @@ -226,8 +214,6 @@ def test_delete_timeline_exercise_crash_safety_failpoints( ps_http.configure_failpoints((failpoint, "return")) - iterations = poll_for_remote_storage_iterations(remote_storage_kind) - # These failpoints are earlier than background task is spawned. # so they result in api request failure. if failpoint in ( @@ -244,7 +230,7 @@ def test_delete_timeline_exercise_crash_safety_failpoints( tenant_id=env.initial_tenant, timeline_id=timeline_id, expected_state="Broken", - iterations=iterations, + iterations=40, ) reason = timeline_info["state"]["Broken"]["reason"] @@ -257,25 +243,21 @@ def test_delete_timeline_exercise_crash_safety_failpoints( env.pageserver.stop() env.pageserver.start() - wait_until_tenant_active(ps_http, env.initial_tenant, iterations=iterations) + wait_until_tenant_active(ps_http, env.initial_tenant) if failpoint == "timeline-delete-before-index-deleted-at": # We crashed before persisting this to remote storage, need to retry delete request timeline_delete_wait_completed(ps_http, env.initial_tenant, timeline_id) else: # Pageserver should've resumed deletion after restart. 
- wait_timeline_detail_404( - ps_http, env.initial_tenant, timeline_id, iterations=iterations - ) + wait_timeline_detail_404(ps_http, env.initial_tenant, timeline_id) elif check is Check.RETRY_WITHOUT_RESTART: # this should succeed # this also checks that delete can be retried even when timeline is in Broken state ps_http.configure_failpoints((failpoint, "off")) - timeline_delete_wait_completed( - ps_http, env.initial_tenant, timeline_id, iterations=iterations - ) + timeline_delete_wait_completed(ps_http, env.initial_tenant, timeline_id) # Check remote is empty if remote_storage_kind is RemoteStorageKind.MOCK_S3: @@ -378,7 +360,7 @@ def test_timeline_resurrection_on_attach( env.pageserver.tenant_attach(tenant_id=tenant_id) - wait_until_tenant_active(ps_http, tenant_id=tenant_id, iterations=10, period=0.5) + wait_until_tenant_active(ps_http, tenant_id=tenant_id) timelines = ps_http.timeline_list(tenant_id=tenant_id) assert {TimelineId(tl["timeline_id"]) for tl in timelines} == { @@ -439,7 +421,7 @@ def test_timeline_delete_fail_before_local_delete(neon_env_builder: NeonEnvBuild # Wait for tenant to finish loading. wait_until_tenant_active(ps_http, tenant_id=env.initial_tenant, iterations=10, period=1) - wait_timeline_detail_404(ps_http, env.initial_tenant, leaf_timeline_id, iterations=4) + wait_timeline_detail_404(ps_http, env.initial_tenant, leaf_timeline_id) assert ( not leaf_timeline_path.exists() @@ -481,11 +463,10 @@ def test_timeline_delete_fail_before_local_delete(neon_env_builder: NeonEnvBuild ) # for some reason the check above doesnt immediately take effect for the below. - # Assume it is mock server incosistency and check twice. + # Assume it is mock server incosistency and check a few times. 
wait_until( - 2, - 0.5, lambda: assert_prefix_empty(neon_env_builder.pageserver_remote_storage), + timeout=2, ) # We deleted our only tenant, and the scrubber fails if it detects nothing @@ -544,7 +525,7 @@ def test_concurrent_timeline_delete_stuck_on( f".*{child_timeline_id}.*at failpoint {stuck_failpoint}" ) - wait_until(50, 0.1, first_call_hit_failpoint) + wait_until(first_call_hit_failpoint, interval=0.1, status_interval=1.0) # make the second call and assert behavior log.info("second call start") @@ -613,7 +594,7 @@ def test_delete_timeline_client_hangup(neon_env_builder: NeonEnvBuilder): def hit_failpoint(): env.pageserver.assert_log_contains(at_failpoint_log_message) - wait_until(50, 0.1, hit_failpoint) + wait_until(hit_failpoint, interval=0.1) # we log this error if a client hangs up # might as well use it as another indicator that the test works @@ -623,7 +604,7 @@ def test_delete_timeline_client_hangup(neon_env_builder: NeonEnvBuilder): def got_hangup_log_message(): env.pageserver.assert_log_contains(hangup_log_message) - wait_until(50, 0.1, got_hangup_log_message) + wait_until(got_hangup_log_message, interval=0.1) # check that the timeline is still present ps_http.timeline_detail(env.initial_tenant, child_timeline_id) @@ -635,10 +616,10 @@ def test_delete_timeline_client_hangup(neon_env_builder: NeonEnvBuilder): message = f".*DELETE.*{child_timeline_id}.*Cancelled request finished" env.pageserver.assert_log_contains(message) - wait_until(50, 0.1, first_request_finished) + wait_until(first_request_finished, interval=0.1) # check that the timeline is gone - wait_timeline_detail_404(ps_http, env.initial_tenant, child_timeline_id, iterations=10) + wait_timeline_detail_404(ps_http, env.initial_tenant, child_timeline_id) def test_timeline_delete_works_for_remote_smoke( @@ -707,7 +688,7 @@ def test_timeline_delete_works_for_remote_smoke( # for some reason the check above doesnt immediately take effect for the below. 
# Assume it is mock server inconsistency and check twice. - wait_until(2, 0.5, lambda: assert_prefix_empty(neon_env_builder.pageserver_remote_storage)) + wait_until(lambda: assert_prefix_empty(neon_env_builder.pageserver_remote_storage)) # We deleted our only tenant, and the scrubber fails if it detects nothing neon_env_builder.disable_scrub_on_exit() @@ -753,15 +734,13 @@ def test_delete_orphaned_objects( env.pageserver.allowed_errors.append(f".*failpoint: {failpoint}") - iterations = poll_for_remote_storage_iterations(remote_storage_kind) - ps_http.timeline_delete(env.initial_tenant, timeline_id) timeline_info = wait_until_timeline_state( pageserver_http=ps_http, tenant_id=env.initial_tenant, timeline_id=timeline_id, expected_state="Broken", - iterations=iterations, + iterations=40, ) reason = timeline_info["state"]["Broken"]["reason"] @@ -827,8 +806,6 @@ def test_timeline_delete_resumed_on_attach( ) ) - iterations = poll_for_remote_storage_iterations(remote_storage_kind) - ps_http.timeline_delete(tenant_id, timeline_id) timeline_info = wait_until_timeline_state( @@ -836,7 +813,7 @@ def test_timeline_delete_resumed_on_attach( tenant_id=env.initial_tenant, timeline_id=timeline_id, expected_state="Broken", - iterations=iterations, + iterations=40, ) reason = timeline_info["state"]["Broken"]["reason"] @@ -871,7 +848,7 @@ def test_timeline_delete_resumed_on_attach( env.pageserver.tenant_attach(tenant_id=tenant_id) # delete should be resumed - wait_timeline_detail_404(ps_http, env.initial_tenant, timeline_id, iterations=iterations) + wait_timeline_detail_404(ps_http, env.initial_tenant, timeline_id) tenant_path = env.pageserver.timeline_dir(tenant_id, timeline_id) assert not tenant_path.exists() diff --git a/test_runner/regress/test_timeline_detach_ancestor.py b/test_runner/regress/test_timeline_detach_ancestor.py index 9c7e851ba8..2c3ee38bae 100644 --- a/test_runner/regress/test_timeline_detach_ancestor.py +++ b/test_runner/regress/test_timeline_detach_ancestor.py @@ 
-203,7 +203,7 @@ def test_ancestor_detach_branched_from( ) client.timeline_delete(env.initial_tenant, env.initial_timeline) - wait_timeline_detail_404(client, env.initial_tenant, env.initial_timeline, 10, 1.0) + wait_timeline_detail_404(client, env.initial_tenant, env.initial_timeline) # because we do the fullbackup from ancestor at the branch_lsn, the zenith.signal is always different # as there is always "PREV_LSN: invalid" for "before" @@ -336,10 +336,10 @@ def test_ancestor_detach_reparents_earlier(neon_env_builder: NeonEnvBuilder): # delete the timelines to confirm detach actually worked client.timeline_delete(env.initial_tenant, after) - wait_timeline_detail_404(client, env.initial_tenant, after, 10, 1.0) + wait_timeline_detail_404(client, env.initial_tenant, after) client.timeline_delete(env.initial_tenant, env.initial_timeline) - wait_timeline_detail_404(client, env.initial_tenant, env.initial_timeline, 10, 1.0) + wait_timeline_detail_404(client, env.initial_tenant, env.initial_timeline) def test_detached_receives_flushes_while_being_detached(neon_env_builder: NeonEnvBuilder): @@ -973,17 +973,17 @@ def test_timeline_detach_ancestor_interrupted_by_deletion( with ThreadPoolExecutor(max_workers=2) as pool: try: fut = pool.submit(detach_ancestor) - offset = wait_until(10, 1.0, at_failpoint) + offset = wait_until(at_failpoint) delete = pool.submit(start_delete) - offset = wait_until(10, 1.0, lambda: at_waiting_on_gate_close(offset)) + offset = wait_until(lambda: at_waiting_on_gate_close(offset)) victim_http.configure_failpoints((pausepoint, "off")) delete.result() - assert wait_until(10, 1.0, is_deleted), f"unimplemented mode {mode}" + assert wait_until(is_deleted), f"unimplemented mode {mode}" # TODO: match the error with pytest.raises(PageserverApiException) as exc: @@ -1115,11 +1115,11 @@ def test_sharded_tad_interleaved_after_partial_success(neon_env_builder: NeonEnv with ThreadPoolExecutor(max_workers=1) as pool: try: fut = pool.submit(detach_timeline) - 
wait_until(10, 1.0, paused_at_failpoint) + wait_until(paused_at_failpoint) # let stuck complete stuck_http.configure_failpoints((pausepoint, "off")) - wait_until(10, 1.0, first_completed) + wait_until(first_completed) if mode == "delete_reparentable_timeline": assert first_branch is not None @@ -1127,7 +1127,7 @@ def test_sharded_tad_interleaved_after_partial_success(neon_env_builder: NeonEnv env.initial_tenant, first_branch ) victim_http.configure_failpoints((pausepoint, "off")) - wait_until(10, 1.0, first_branch_gone) + wait_until(first_branch_gone) elif mode == "create_reparentable_timeline": first_branch = create_reparentable_timeline() victim_http.configure_failpoints((pausepoint, "off")) @@ -1271,11 +1271,11 @@ def test_retryable_500_hit_through_storcon_during_timeline_detach_ancestor( with ThreadPoolExecutor(max_workers=1) as pool: try: fut = pool.submit(detach_timeline) - wait_until(10, 1.0, paused_at_failpoint) + wait_until(paused_at_failpoint) # let stuck complete stuck_http.configure_failpoints((pausepoint, "off")) - wait_until(10, 1.0, first_completed) + wait_until(first_completed) victim_http.configure_failpoints((pausepoint, "off")) @@ -1456,7 +1456,7 @@ def test_retried_detach_ancestor_after_failed_reparenting(neon_env_builder: Neon # other tests take the "detach? reparent complete", but this only hits # "complete". 
http.timeline_delete(env.initial_tenant, env.initial_timeline) - wait_timeline_detail_404(http, env.initial_tenant, env.initial_timeline, 20) + wait_timeline_detail_404(http, env.initial_tenant, env.initial_timeline) http.configure_failpoints(("timeline-detach-ancestor::complete_before_uploading", "off")) @@ -1518,7 +1518,7 @@ def test_timeline_is_deleted_before_timeline_detach_ancestor_completes( with ThreadPoolExecutor(max_workers=1) as pool: detach = pool.submit(detach_and_get_stuck) - offset = wait_until(10, 1.0, request_processing_noted_in_log) + offset = wait_until(request_processing_noted_in_log) # make this named fn tor more clear failure test output logging def pausepoint_hit_with_gc_paused() -> LogCursor: @@ -1529,11 +1529,11 @@ def test_timeline_is_deleted_before_timeline_detach_ancestor_completes( ) return at - offset = wait_until(10, 1.0, pausepoint_hit_with_gc_paused) + offset = wait_until(pausepoint_hit_with_gc_paused) delete_detached() - wait_timeline_detail_404(http, env.initial_tenant, detached, 10, 1.0) + wait_timeline_detail_404(http, env.initial_tenant, detached) http.configure_failpoints((failpoint, "off")) diff --git a/test_runner/regress/test_timeline_gc_blocking.py b/test_runner/regress/test_timeline_gc_blocking.py index 5a5ca3290a..7605e1f758 100644 --- a/test_runner/regress/test_timeline_gc_blocking.py +++ b/test_runner/regress/test_timeline_gc_blocking.py @@ -61,7 +61,7 @@ def test_gc_blocking_by_timeline(neon_env_builder: NeonEnvBuilder, sharded: bool # deletion unblocks gc http.timeline_delete(env.initial_tenant, foo_branch) - wait_timeline_detail_404(http, env.initial_tenant, foo_branch, 10, 1.0) + wait_timeline_detail_404(http, env.initial_tenant, foo_branch) wait_for_another_gc_round() pss.assert_log_contains(gc_active_line) diff --git a/test_runner/regress/test_timeline_size.py b/test_runner/regress/test_timeline_size.py index 4528bc6180..95bf9106cd 100644 --- a/test_runner/regress/test_timeline_size.py +++ 
b/test_runner/regress/test_timeline_size.py @@ -396,11 +396,7 @@ def test_timeline_physical_size_init(neon_env_builder: NeonEnvBuilder): # Wait for the tenant to be loaded client = env.pageserver.http_client() - wait_until( - number_of_iterations=5, - interval=1, - func=lambda: assert_tenant_state(client, env.initial_tenant, "Active"), - ) + wait_until(lambda: assert_tenant_state(client, env.initial_tenant, "Active")) assert_physical_size_invariants( get_physical_size_values(env, env.initial_tenant, new_timeline_id), @@ -433,7 +429,7 @@ def test_timeline_physical_size_post_checkpoint(neon_env_builder: NeonEnvBuilder get_physical_size_values(env, env.initial_tenant, new_timeline_id), ) - wait_until(10, 1, check) + wait_until(check) def test_timeline_physical_size_post_compaction(neon_env_builder: NeonEnvBuilder): @@ -721,7 +717,7 @@ def wait_for_tenant_startup_completions(client: PageserverHttpClient, count: int def condition(): assert client.get_metric_value("pageserver_tenant_startup_complete_total") == count - wait_until(5, 1.0, condition) + wait_until(condition) def test_ondemand_activation(neon_env_builder: NeonEnvBuilder): @@ -768,7 +764,7 @@ def test_ondemand_activation(neon_env_builder: NeonEnvBuilder): assert "Active" in set(get_tenant_states().values()) # One tenant should activate, then get stuck in their logical size calculation - wait_until(10, 1, at_least_one_active) + wait_until(at_least_one_active) # Wait some walltime to gain confidence that other tenants really are stuck and not proceeding to activate time.sleep(5) @@ -836,13 +832,13 @@ def test_ondemand_activation(neon_env_builder: NeonEnvBuilder): def all_active(): assert all(s == "Active" for s in get_tenant_states().values()) - wait_until(10, 1, all_active) + wait_until(all_active) # Final control check: restarting with no failpoints at all results in all tenants coming active # without being prompted by client I/O env.pageserver.stop() env.pageserver.start() - wait_until(10, 1, all_active) + 
wait_until(all_active) assert ( pageserver_http.get_metric_value("pageserver_tenant_startup_scheduled_total") == n_tenants @@ -856,7 +852,7 @@ def test_ondemand_activation(neon_env_builder: NeonEnvBuilder): extra_env_vars={"FAILPOINTS": "timeline-calculate-logical-size-pause=pause"} ) - wait_until(10, 1, at_least_one_active) + wait_until(at_least_one_active) detach_tenant_id = list( [(tid, s) for (tid, s) in get_tenant_states().items() if s == "Attaching"] @@ -881,7 +877,7 @@ def test_ondemand_activation(neon_env_builder: NeonEnvBuilder): # Check that all the stuck tenants proceed to active (apart from the one that deletes, and the one # we detached) - wait_until(10, 1, all_active) + wait_until(all_active) assert len(get_tenant_states()) == n_tenants - 2 @@ -908,7 +904,7 @@ def delete_lazy_activating( try: # Deletion will get to the point in shutdown where it's waiting for timeline shutdown, then # hang because of our failpoint blocking activation. - wait_until(10, 1, shutting_down) + wait_until(shutting_down) finally: log.info("Clearing failpoint") pageserver_http.configure_failpoints(("timeline-calculate-logical-size-pause", "off")) @@ -1030,13 +1026,13 @@ def test_eager_attach_does_not_queue_up(neon_env_builder: NeonEnvBuilder): log.info(f"{states}") assert len(states["Active"]) == 1 - wait_until(10, 1, one_is_active) + wait_until(one_is_active) def other_is_attaching(): states = get_tenant_states() assert len(states["Attaching"]) == 1 - wait_until(10, 1, other_is_attaching) + wait_until(other_is_attaching) def eager_tenant_is_active(): resp = client.tenant_status(eager_tenant) @@ -1053,7 +1049,7 @@ def test_eager_attach_does_not_queue_up(neon_env_builder: NeonEnvBuilder): }, lazy=False, ) - wait_until(10, 1, eager_tenant_is_active) + wait_until(eager_tenant_is_active) other_is_attaching() @@ -1096,7 +1092,7 @@ def test_lazy_attach_activation(neon_env_builder: NeonEnvBuilder, activation_met resp = client.tenant_status(env.initial_tenant) assert 
resp["state"]["slug"] == "Active" - wait_until(10, 1, initial_tenant_is_active) + wait_until(initial_tenant_is_active) # even though the initial tenant is now active, because it was startup time # attach, it will consume the only permit because logical size calculation @@ -1119,7 +1115,7 @@ def test_lazy_attach_activation(neon_env_builder: NeonEnvBuilder, activation_met assert resp["state"]["slug"] == "Attaching" # paused logical size calculation of env.initial_tenant is keeping it attaching - wait_until(10, 1, lazy_tenant_is_attaching) + wait_until(lazy_tenant_is_attaching) for _ in range(5): lazy_tenant_is_attaching() @@ -1132,10 +1128,10 @@ def test_lazy_attach_activation(neon_env_builder: NeonEnvBuilder, activation_met if activation_method == "endpoint": with env.endpoints.create_start("main", tenant_id=lazy_tenant): # starting up the endpoint should make it jump the queue - wait_until(10, 1, lazy_tenant_is_active) + wait_until(lazy_tenant_is_active) elif activation_method == "branch": env.create_timeline("second_branch", lazy_tenant) - wait_until(10, 1, lazy_tenant_is_active) + wait_until(lazy_tenant_is_active) elif activation_method == "delete": delete_lazy_activating(lazy_tenant, env.pageserver, expect_attaching=True) else: diff --git a/test_runner/regress/test_wal_acceptor.py b/test_runner/regress/test_wal_acceptor.py index 8fa33b81a9..23d4f23cdb 100644 --- a/test_runner/regress/test_wal_acceptor.py +++ b/test_runner/regress/test_wal_acceptor.py @@ -2136,7 +2136,7 @@ def test_pull_timeline_while_evicted(neon_env_builder: NeonEnvBuilder): # Check that on source no segment files are present assert src_sk.list_segments(tenant_id, timeline_id) == [] - wait_until(60, 1, evicted_on_source) + wait_until(evicted_on_source, timeout=60) # Invoke pull_timeline: source should serve snapshot request without promoting anything to local disk, # destination should import the control file only & go into evicted mode immediately @@ -2155,7 +2155,7 @@ def 
test_pull_timeline_while_evicted(neon_env_builder: NeonEnvBuilder): # This should be fast, it is a wait_until because eviction state is updated # in the background wrt pull_timeline. - wait_until(10, 0.1, evicted_on_destination) + wait_until(evicted_on_destination, timeout=1.0, interval=0.1) # Delete the timeline on the source, to prove that deletion works on an # evicted timeline _and_ that the final compute test is really not using @@ -2178,7 +2178,7 @@ def test_pull_timeline_while_evicted(neon_env_builder: NeonEnvBuilder): n_evicted = dst_sk.http_client().get_metric_value("safekeeper_evicted_timelines") assert n_evicted == 0 - wait_until(10, 1, unevicted_on_dest) + wait_until(unevicted_on_dest, interval=0.1, timeout=1.0) # In this test we check for excessive START_REPLICATION and START_WAL_PUSH queries @@ -2606,10 +2606,10 @@ def test_s3_eviction( assert n_evicted # make mypy happy assert int(n_evicted) == n_timelines - wait_until(60, 0.5, all_evicted) + wait_until(all_evicted, timeout=30) # restart should preserve the metric value sk.stop().start() - wait_until(60, 0.5, all_evicted) + wait_until(all_evicted) # and endpoint start should reduce is endpoints[0].start() @@ -2618,7 +2618,7 @@ def test_s3_eviction( assert n_evicted # make mypy happy assert int(n_evicted) < n_timelines - wait_until(60, 0.5, one_unevicted) + wait_until(one_unevicted) # Test resetting uploaded partial segment state. 
@@ -2666,7 +2666,7 @@ def test_backup_partial_reset(neon_env_builder: NeonEnvBuilder): if isinstance(eviction_state, str) and eviction_state == "Present": raise Exception("eviction didn't happen yet") - wait_until(30, 1, evicted) + wait_until(evicted) # it must have uploaded something uploaded_segs = sk.list_uploaded_segments(tenant_id, timeline_id) log.info(f"uploaded segments before reset: {uploaded_segs}") @@ -2763,7 +2763,7 @@ def test_pull_timeline_partial_segment_integrity(neon_env_builder: NeonEnvBuilde raise Exception("Partial segment not uploaded yet") - source_partial_segment = wait_until(15, 1, source_partial_segment_uploaded) + source_partial_segment = wait_until(source_partial_segment_uploaded) log.info( f"Uploaded segments before pull are {src_sk.list_uploaded_segments(tenant_id, timeline_id)}" ) @@ -2787,7 +2787,7 @@ def test_pull_timeline_partial_segment_integrity(neon_env_builder: NeonEnvBuilde if evictions is None or evictions == 0: raise Exception("Eviction did not happen on source safekeeper yet") - wait_until(30, 1, evicted) + wait_until(evicted) endpoint.start(safekeepers=[2, 3]) @@ -2804,7 +2804,7 @@ def test_pull_timeline_partial_segment_integrity(neon_env_builder: NeonEnvBuilde ) endpoint.safe_psql("insert into t select generate_series(1, 1000), 'pear'") - wait_until(15, 1, new_partial_segment_uploaded) + wait_until(new_partial_segment_uploaded) log.info( f"Uploaded segments after post-pull ingest are {src_sk.list_uploaded_segments(tenant_id, timeline_id)}" @@ -2833,4 +2833,4 @@ def test_pull_timeline_partial_segment_integrity(neon_env_builder: NeonEnvBuilde if unevictions is None or unevictions == 0: raise Exception("Uneviction did not happen on source safekeeper yet") - wait_until(10, 1, unevicted) + wait_until(unevicted) diff --git a/test_runner/regress/test_wal_receiver.py b/test_runner/regress/test_wal_receiver.py index 294f86ffa7..d22a900c59 100644 --- a/test_runner/regress/test_wal_receiver.py +++ 
b/test_runner/regress/test_wal_receiver.py @@ -97,7 +97,7 @@ def test_pageserver_lsn_wait_error_safekeeper_stop(neon_env_builder: NeonEnvBuil str(safekeeper.id) in exception_string ), f"Should have safekeeper {safekeeper.id} printed in walreceiver state after WAL wait timeout" - wait_until(60, 0.5, all_sks_in_wareceiver_state) + wait_until(all_sks_in_wareceiver_state, timeout=30) stopped_safekeeper = env.safekeepers[-1] stopped_safekeeper_id = stopped_safekeeper.id @@ -124,7 +124,7 @@ def test_pageserver_lsn_wait_error_safekeeper_stop(neon_env_builder: NeonEnvBuil str(safekeeper.id) in exception_string ), f"Should have safekeeper {safekeeper.id} printed in walreceiver state after 2nd WAL wait timeout" - wait_until(60, 0.5, all_but_stopped_sks_in_wareceiver_state) + wait_until(all_but_stopped_sks_in_wareceiver_state, timeout=30) def insert_test_elements(env: NeonEnv, tenant_id: TenantId, start: int, count: int): diff --git a/vendor/postgres-v14 b/vendor/postgres-v14 index 284ae56be2..373f9decad 160000 --- a/vendor/postgres-v14 +++ b/vendor/postgres-v14 @@ -1 +1 @@ -Subproject commit 284ae56be2397fd3eaf20777fa220b2d0ad968f5 +Subproject commit 373f9decad933d2d46f321231032ae8b0da81acd diff --git a/vendor/postgres-v15 b/vendor/postgres-v15 index aed79ee87b..972e325e62 160000 --- a/vendor/postgres-v15 +++ b/vendor/postgres-v15 @@ -1 +1 @@ -Subproject commit aed79ee87b94779cc52ec13e3b74eba6ada93f05 +Subproject commit 972e325e62b455957adbbdd8580e31275bb5b8c9 diff --git a/vendor/postgres-v16 b/vendor/postgres-v16 index f5cfc6fa89..dff6615a8e 160000 --- a/vendor/postgres-v16 +++ b/vendor/postgres-v16 @@ -1 +1 @@ -Subproject commit f5cfc6fa898544050e821ac688adafece1ac3cff +Subproject commit dff6615a8e48a10bb17a03fa3c00635f1ace7a92 diff --git a/vendor/postgres-v17 b/vendor/postgres-v17 index 3c15b6565f..a10d95be67 160000 --- a/vendor/postgres-v17 +++ b/vendor/postgres-v17 @@ -1 +1 @@ -Subproject commit 3c15b6565f6c8d36d169ed9ea7412cf90cfb2a8f +Subproject commit 
a10d95be67265e0f10a422ba0457f5a7af01de71 diff --git a/vendor/revisions.json b/vendor/revisions.json index 4dae88e73d..8a73e14dcf 100644 --- a/vendor/revisions.json +++ b/vendor/revisions.json @@ -1,18 +1,18 @@ { "v17": [ "17.2", - "3c15b6565f6c8d36d169ed9ea7412cf90cfb2a8f" + "a10d95be67265e0f10a422ba0457f5a7af01de71" ], "v16": [ "16.6", - "f5cfc6fa898544050e821ac688adafece1ac3cff" + "dff6615a8e48a10bb17a03fa3c00635f1ace7a92" ], "v15": [ "15.10", - "aed79ee87b94779cc52ec13e3b74eba6ada93f05" + "972e325e62b455957adbbdd8580e31275bb5b8c9" ], "v14": [ "14.15", - "284ae56be2397fd3eaf20777fa220b2d0ad968f5" + "373f9decad933d2d46f321231032ae8b0da81acd" ] } diff --git a/workspace_hack/Cargo.toml b/workspace_hack/Cargo.toml index a73d9d6352..d19379aefd 100644 --- a/workspace_hack/Cargo.toml +++ b/workspace_hack/Cargo.toml @@ -55,13 +55,16 @@ log = { version = "0.4", default-features = false, features = ["std"] } memchr = { version = "2" } nix = { version = "0.26" } nom = { version = "7" } +num = { version = "0.4" } num-bigint = { version = "0.4" } +num-complex = { version = "0.4", default-features = false, features = ["std"] } num-integer = { version = "0.1", features = ["i128"] } +num-iter = { version = "0.1", default-features = false, features = ["i128", "std"] } +num-rational = { version = "0.4", default-features = false, features = ["num-bigint-std", "std"] } num-traits = { version = "0.2", features = ["i128", "libm"] } once_cell = { version = "1" } parquet = { version = "53", default-features = false, features = ["zstd"] } -postgres-types = { git = "https://github.com/neondatabase/rust-postgres.git", branch = "neon", default-features = false, features = ["with-serde_json-1"] } -prost = { version = "0.13", features = ["prost-derive"] } +prost = { version = "0.13", features = ["no-recursion-limit", "prost-derive"] } rand = { version = "0.8", features = ["small_rng"] } regex = { version = "1" } regex-automata = { version = "0.4", default-features = false, features = 
["dfa-onepass", "hybrid", "meta", "nfa-backtrack", "perf-inline", "perf-literal", "unicode"] } @@ -77,10 +80,10 @@ smallvec = { version = "1", default-features = false, features = ["const_new", " spki = { version = "0.7", default-features = false, features = ["pem", "std"] } subtle = { version = "2" } sync_wrapper = { version = "0.1", default-features = false, features = ["futures"] } -tikv-jemalloc-sys = { version = "0.6", features = ["stats"] } +tikv-jemalloc-ctl = { version = "0.6", features = ["stats", "use_std"] } +tikv-jemalloc-sys = { version = "0.6", features = ["profiling", "stats", "unprefixed_malloc_on_supported_platforms"] } time = { version = "0.3", features = ["macros", "serde-well-known"] } -tokio = { version = "1", features = ["fs", "io-std", "io-util", "macros", "net", "process", "rt-multi-thread", "signal", "test-util"] } -tokio-postgres = { git = "https://github.com/neondatabase/rust-postgres.git", branch = "neon", features = ["with-serde_json-1"] } +tokio = { version = "1", features = ["full", "test-util"] } tokio-rustls = { version = "0.26", default-features = false, features = ["logging", "ring", "tls12"] } tokio-stream = { version = "0.1", features = ["net"] } tokio-util = { version = "0.7", features = ["codec", "compat", "io", "rt"] } @@ -113,14 +116,18 @@ libc = { version = "0.2", features = ["extra_traits", "use_std"] } log = { version = "0.4", default-features = false, features = ["std"] } memchr = { version = "2" } nom = { version = "7" } +num = { version = "0.4" } num-bigint = { version = "0.4" } +num-complex = { version = "0.4", default-features = false, features = ["std"] } num-integer = { version = "0.1", features = ["i128"] } +num-iter = { version = "0.1", default-features = false, features = ["i128", "std"] } +num-rational = { version = "0.4", default-features = false, features = ["num-bigint-std", "std"] } num-traits = { version = "0.2", features = ["i128", "libm"] } once_cell = { version = "1" } parquet = { version = "53", 
default-features = false, features = ["zstd"] } prettyplease = { version = "0.2", default-features = false, features = ["verbatim"] } proc-macro2 = { version = "1" } -prost = { version = "0.13", features = ["prost-derive"] } +prost = { version = "0.13", features = ["no-recursion-limit", "prost-derive"] } quote = { version = "1" } regex = { version = "1" } regex-automata = { version = "0.4", default-features = false, features = ["dfa-onepass", "hybrid", "meta", "nfa-backtrack", "perf-inline", "perf-literal", "unicode"] }