diff --git a/.github/actionlint.yml b/.github/actionlint.yml index aec5b4ee75..ecff0cc70b 100644 --- a/.github/actionlint.yml +++ b/.github/actionlint.yml @@ -4,6 +4,7 @@ self-hosted-runner: - large - large-arm64 - small + - small-metal - small-arm64 - us-east-2 config-variables: diff --git a/.github/actions/neon-project-create/action.yml b/.github/actions/neon-project-create/action.yml index f4a194639f..11f46bce8e 100644 --- a/.github/actions/neon-project-create/action.yml +++ b/.github/actions/neon-project-create/action.yml @@ -17,6 +17,31 @@ inputs: compute_units: description: '[Min, Max] compute units' default: '[1, 1]' + # settings below only needed if you want the project to be sharded from the beginning + shard_split_project: + description: 'by default new projects are not shard-split, specify true to shard-split' + required: false + default: 'false' + admin_api_key: + description: 'Admin API Key needed for shard-splitting. Must be specified if shard_split_project is true' + required: false + shard_count: + description: 'Number of shards to split the project into, only applies if shard_split_project is true' + required: false + default: '8' + stripe_size: + description: 'Stripe size, optional, in 8kiB pages. e.g. set 2048 for 16MB stripes. 
Default is 256 MiB, only applies if shard_split_project is true' + required: false + default: '32768' + psql_path: + description: 'Path to psql binary - it is caller responsibility to provision the psql binary' + required: false + default: '/tmp/neon/pg_install/v16/bin/psql' + libpq_lib_path: + description: 'Path to directory containing libpq library - it is caller responsibility to provision the libpq library' + required: false + default: '/tmp/neon/pg_install/v16/lib' + outputs: dsn: @@ -63,6 +88,23 @@ runs: echo "project_id=${project_id}" >> $GITHUB_OUTPUT echo "Project ${project_id} has been created" + + if [ "${SHARD_SPLIT_PROJECT}" = "true" ]; then + # determine tenant ID + TENANT_ID=`${PSQL} ${dsn} -t -A -c "SHOW neon.tenant_id"` + + echo "Splitting project ${project_id} with tenant_id ${TENANT_ID} into $((SHARD_COUNT)) shards with stripe size $((STRIPE_SIZE))" + + echo "Sending PUT request to https://${API_HOST}/regions/${REGION_ID}/api/v1/admin/storage/proxy/control/v1/tenant/${TENANT_ID}/shard_split" + echo "with body {\"new_shard_count\": $((SHARD_COUNT)), \"new_stripe_size\": $((STRIPE_SIZE))}" + + # we need an ADMIN API KEY to invoke storage controller API for shard splitting (bash -u above checks that the variable is set) + curl -X PUT \ + "https://${API_HOST}/regions/${REGION_ID}/api/v1/admin/storage/proxy/control/v1/tenant/${TENANT_ID}/shard_split" \ + -H "Accept: application/json" -H "Content-Type: application/json" -H "Authorization: Bearer ${ADMIN_API_KEY}" \ + -d "{\"new_shard_count\": $SHARD_COUNT, \"new_stripe_size\": $STRIPE_SIZE}" + fi + env: API_HOST: ${{ inputs.api_host }} API_KEY: ${{ inputs.api_key }} @@ -70,3 +112,9 @@ runs: POSTGRES_VERSION: ${{ inputs.postgres_version }} MIN_CU: ${{ fromJSON(inputs.compute_units)[0] }} MAX_CU: ${{ fromJSON(inputs.compute_units)[1] }} + SHARD_SPLIT_PROJECT: ${{ inputs.shard_split_project }} + ADMIN_API_KEY: ${{ inputs.admin_api_key }} + SHARD_COUNT: ${{ inputs.shard_count }} + STRIPE_SIZE: ${{ 
inputs.stripe_size }} + PSQL: ${{ inputs.psql_path }} + LD_LIBRARY_PATH: ${{ inputs.libpq_lib_path }} diff --git a/.github/workflows/_benchmarking_preparation.yml b/.github/workflows/_benchmarking_preparation.yml index fd328586b3..71aef1430e 100644 --- a/.github/workflows/_benchmarking_preparation.yml +++ b/.github/workflows/_benchmarking_preparation.yml @@ -17,7 +17,7 @@ jobs: strategy: fail-fast: false matrix: - platform: [ aws-rds-postgres, aws-aurora-serverless-v2-postgres, neon ] + platform: [ aws-rds-postgres, aws-aurora-serverless-v2-postgres, neon, neon_pg17 ] database: [ clickbench, tpch, userexample ] env: @@ -41,6 +41,9 @@ jobs: neon) CONNSTR=${{ secrets.BENCHMARK_CAPTEST_CONNSTR }} ;; + neon_pg17) + CONNSTR=${{ secrets.BENCHMARK_CAPTEST_CONNSTR_PG17 }} + ;; aws-rds-postgres) CONNSTR=${{ secrets.BENCHMARK_RDS_POSTGRES_CONNSTR }} ;; diff --git a/.github/workflows/_build-and-test-locally.yml b/.github/workflows/_build-and-test-locally.yml index 4263bacce8..f97402a90b 100644 --- a/.github/workflows/_build-and-test-locally.yml +++ b/.github/workflows/_build-and-test-locally.yml @@ -158,8 +158,6 @@ jobs: - name: Run cargo build run: | - PQ_LIB_DIR=$(pwd)/pg_install/v16/lib - export PQ_LIB_DIR ${cov_prefix} mold -run cargo build $CARGO_FLAGS $CARGO_FEATURES --bins --tests # Do install *before* running rust tests because they might recompile the @@ -217,8 +215,6 @@ jobs: env: NEXTEST_RETRIES: 3 run: | - PQ_LIB_DIR=$(pwd)/pg_install/v16/lib - export PQ_LIB_DIR LD_LIBRARY_PATH=$(pwd)/pg_install/v17/lib export LD_LIBRARY_PATH @@ -229,8 +225,13 @@ jobs: ${cov_prefix} cargo nextest run $CARGO_FLAGS $CARGO_FEATURES -E '!package(pageserver)' # run pageserver tests with different settings - for io_engine in std-fs tokio-epoll-uring ; do - NEON_PAGESERVER_UNIT_TEST_VIRTUAL_FILE_IOENGINE=$io_engine ${cov_prefix} cargo nextest run $CARGO_FLAGS $CARGO_FEATURES -E 'package(pageserver)' + for get_vectored_concurrent_io in sequential sidecar-task; do + for io_engine in std-fs 
tokio-epoll-uring ; do + NEON_PAGESERVER_UNIT_TEST_GET_VECTORED_CONCURRENT_IO=$get_vectored_concurrent_io \ + NEON_PAGESERVER_UNIT_TEST_VIRTUAL_FILE_IOENGINE=$io_engine \ + ${cov_prefix} \ + cargo nextest run $CARGO_FLAGS $CARGO_FEATURES -E 'package(pageserver)' + done done # Run separate tests for real S3 @@ -314,6 +315,7 @@ jobs: CHECK_ONDISK_DATA_COMPATIBILITY: nonempty BUILD_TAG: ${{ inputs.build-tag }} PAGESERVER_VIRTUAL_FILE_IO_ENGINE: tokio-epoll-uring + PAGESERVER_GET_VECTORED_CONCURRENT_IO: sidecar-task USE_LFC: ${{ matrix.lfc_state == 'with-lfc' && 'true' || 'false' }} # Temporary disable this step until we figure out why it's so flaky diff --git a/.github/workflows/_check-codestyle-rust.yml b/.github/workflows/_check-codestyle-rust.yml new file mode 100644 index 0000000000..cbc47c6406 --- /dev/null +++ b/.github/workflows/_check-codestyle-rust.yml @@ -0,0 +1,91 @@ +name: Check Codestyle Rust + +on: + workflow_call: + inputs: + build-tools-image: + description: "build-tools image" + required: true + type: string + archs: + description: "Json array of architectures to run on" + type: string + + +defaults: + run: + shell: bash -euxo pipefail {0} + +jobs: + check-codestyle-rust: + strategy: + matrix: + arch: ${{ fromJson(inputs.archs) }} + runs-on: ${{ fromJson(format('["self-hosted", "{0}"]', matrix.arch == 'arm64' && 'small-arm64' || 'small')) }} + + container: + image: ${{ inputs.build-tools-image }} + credentials: + username: ${{ secrets.NEON_DOCKERHUB_USERNAME }} + password: ${{ secrets.NEON_DOCKERHUB_PASSWORD }} + options: --init + + steps: + - name: Checkout + uses: actions/checkout@v4 + with: + submodules: true + + - name: Cache cargo deps + uses: actions/cache@v4 + with: + path: | + ~/.cargo/registry + !~/.cargo/registry/src + ~/.cargo/git + target + key: v1-${{ runner.os }}-${{ runner.arch }}-cargo-${{ hashFiles('./Cargo.lock') }}-${{ hashFiles('./rust-toolchain.toml') }}-rust + + # Some of our rust modules use FFI and need those to be checked + - 
name: Get postgres headers + run: make postgres-headers -j$(nproc) + + # cargo hack runs the given cargo subcommand (clippy in this case) for all feature combinations. + # This will catch compiler & clippy warnings in all feature combinations. + # TODO: use cargo hack for build and test as well, but, that's quite expensive. + # NB: keep clippy args in sync with ./run_clippy.sh + # + # The only difference between "clippy --debug" and "clippy --release" is that in --release mode, + # #[cfg(debug_assertions)] blocks are not built. It's not worth building everything for second + # time just for that, so skip "clippy --release". + - run: | + CLIPPY_COMMON_ARGS="$( source .neon_clippy_args; echo "$CLIPPY_COMMON_ARGS")" + if [ "$CLIPPY_COMMON_ARGS" = "" ]; then + echo "No clippy args found in .neon_clippy_args" + exit 1 + fi + echo "CLIPPY_COMMON_ARGS=${CLIPPY_COMMON_ARGS}" >> $GITHUB_ENV + - name: Run cargo clippy (debug) + run: cargo hack --features default --ignore-unknown-features --feature-powerset clippy $CLIPPY_COMMON_ARGS + + - name: Check documentation generation + run: cargo doc --workspace --no-deps --document-private-items + env: + RUSTDOCFLAGS: "-Dwarnings -Arustdoc::private_intra_doc_links" + + # Use `${{ !cancelled() }}` to run quick tests after the longer clippy run + - name: Check formatting + if: ${{ !cancelled() }} + run: cargo fmt --all -- --check + + # https://github.com/facebookincubator/cargo-guppy/tree/bec4e0eb29dcd1faac70b1b5360267fc02bf830e/tools/cargo-hakari#2-keep-the-workspace-hack-up-to-date-in-ci + - name: Check rust dependencies + if: ${{ !cancelled() }} + run: | + cargo hakari generate --diff # workspace-hack Cargo.toml is up-to-date + cargo hakari manage-deps --dry-run # all workspace crates depend on workspace-hack + + # https://github.com/EmbarkStudios/cargo-deny + - name: Check rust licenses/bans/advisories/sources + if: ${{ !cancelled() }} + run: cargo deny check --hide-inclusion-graph diff --git a/.github/workflows/benchmarking.yml 
b/.github/workflows/benchmarking.yml index ab0f2a6155..32747d825c 100644 --- a/.github/workflows/benchmarking.yml +++ b/.github/workflows/benchmarking.yml @@ -63,11 +63,15 @@ jobs: fail-fast: false matrix: include: - - DEFAULT_PG_VERSION: 16 + - PG_VERSION: 16 PLATFORM: "neon-staging" region_id: ${{ github.event.inputs.region_id || 'aws-us-east-2' }} RUNNER: [ self-hosted, us-east-2, x64 ] - - DEFAULT_PG_VERSION: 16 + - PG_VERSION: 17 + PLATFORM: "neon-staging" + region_id: ${{ github.event.inputs.region_id || 'aws-us-east-2' }} + RUNNER: [ self-hosted, us-east-2, x64 ] + - PG_VERSION: 16 PLATFORM: "azure-staging" region_id: 'azure-eastus2' RUNNER: [ self-hosted, eastus2, x64 ] @@ -75,7 +79,7 @@ jobs: TEST_PG_BENCH_DURATIONS_MATRIX: "300" TEST_PG_BENCH_SCALES_MATRIX: "10,100" POSTGRES_DISTRIB_DIR: /tmp/neon/pg_install - DEFAULT_PG_VERSION: ${{ matrix.DEFAULT_PG_VERSION }} + PG_VERSION: ${{ matrix.PG_VERSION }} TEST_OUTPUT: /tmp/test_output BUILD_TYPE: remote SAVE_PERF_REPORT: ${{ github.event.inputs.save_perf_report || ( github.ref_name == 'main' ) }} @@ -112,7 +116,7 @@ jobs: uses: ./.github/actions/neon-project-create with: region_id: ${{ matrix.region_id }} - postgres_version: ${{ env.DEFAULT_PG_VERSION }} + postgres_version: ${{ env.PG_VERSION }} api_key: ${{ secrets.NEON_STAGING_API_KEY }} - name: Run benchmark @@ -122,7 +126,7 @@ jobs: test_selection: performance run_in_parallel: false save_perf_report: ${{ env.SAVE_PERF_REPORT }} - pg_version: ${{ env.DEFAULT_PG_VERSION }} + pg_version: ${{ env.PG_VERSION }} aws-oicd-role-arn: ${{ vars.DEV_AWS_OIDC_ROLE_ARN }} # Set --sparse-ordering option of pytest-order plugin # to ensure tests are running in order of appears in the file. 
@@ -313,7 +317,11 @@ jobs: { "pg_version": 16, "region_id": "azure-eastus2", "platform": "neonvm-azure-captest-freetier", "db_size": "3gb" ,"runner": '"$runner_azure"', "image": "neondatabase/build-tools:pinned-bookworm" }, { "pg_version": 16, "region_id": "azure-eastus2", "platform": "neonvm-azure-captest-new", "db_size": "10gb","runner": '"$runner_azure"', "image": "neondatabase/build-tools:pinned-bookworm" }, { "pg_version": 16, "region_id": "azure-eastus2", "platform": "neonvm-azure-captest-new", "db_size": "50gb","runner": '"$runner_azure"', "image": "neondatabase/build-tools:pinned-bookworm" }, - { "pg_version": 16, "region_id": "'"$region_id_default"'", "platform": "neonvm-captest-sharding-reuse", "db_size": "50gb","runner": '"$runner_default"', "image": "'"$image_default"'" }] + { "pg_version": 16, "region_id": "'"$region_id_default"'", "platform": "neonvm-captest-sharding-reuse", "db_size": "50gb","runner": '"$runner_default"', "image": "'"$image_default"'" }, + { "pg_version": 17, "region_id": "'"$region_id_default"'", "platform": "neonvm-captest-freetier", "db_size": "3gb" ,"runner": '"$runner_default"', "image": "'"$image_default"'" }, + { "pg_version": 17, "region_id": "'"$region_id_default"'", "platform": "neonvm-captest-new", "db_size": "10gb","runner": '"$runner_default"', "image": "'"$image_default"'" }, + { "pg_version": 17, "region_id": "'"$region_id_default"'", "platform": "neonvm-captest-new-many-tables","db_size": "10gb","runner": '"$runner_default"', "image": "'"$image_default"'" }, + { "pg_version": 17, "region_id": "'"$region_id_default"'", "platform": "neonvm-captest-new", "db_size": "50gb","runner": '"$runner_default"', "image": "'"$image_default"'" }] }' if [ "$(date +%A)" = "Saturday" ] || [ ${RUN_AWS_RDS_AND_AURORA} = "true" ]; then @@ -329,12 +337,15 @@ jobs: matrix='{ "platform": [ "neonvm-captest-reuse" - ] + ], + "pg_version" : [ + 16,17 + ], }' if [ "$(date +%A)" = "Saturday" ] || [ ${RUN_AWS_RDS_AND_AURORA} = "true" ]; then - 
matrix=$(echo "$matrix" | jq '.include += [{ "platform": "rds-postgres" }, - { "platform": "rds-aurora" }]') + matrix=$(echo "$matrix" | jq '.include += [{ "pg_version": 16, "platform": "rds-postgres" }, + { "pg_version": 16, "platform": "rds-aurora" }]') fi echo "matrix=$(echo "$matrix" | jq --compact-output '.')" >> $GITHUB_OUTPUT @@ -346,14 +357,14 @@ jobs: "platform": [ "neonvm-captest-reuse" ], - "scale": [ - "10" + "pg_version" : [ + 16,17 ] }' if [ "$(date +%A)" = "Saturday" ] || [ ${RUN_AWS_RDS_AND_AURORA} = "true" ]; then - matrix=$(echo "$matrix" | jq '.include += [{ "platform": "rds-postgres", "scale": "10" }, - { "platform": "rds-aurora", "scale": "10" }]') + matrix=$(echo "$matrix" | jq '.include += [{ "pg_version": 16, "platform": "rds-postgres" }, + { "pg_version": 16, "platform": "rds-aurora" }]') fi echo "matrix=$(echo "$matrix" | jq --compact-output '.')" >> $GITHUB_OUTPUT @@ -378,7 +389,7 @@ jobs: TEST_PG_BENCH_DURATIONS_MATRIX: "60m" TEST_PG_BENCH_SCALES_MATRIX: ${{ matrix.db_size }} POSTGRES_DISTRIB_DIR: /tmp/neon/pg_install - DEFAULT_PG_VERSION: ${{ matrix.pg_version }} + PG_VERSION: ${{ matrix.pg_version }} TEST_OUTPUT: /tmp/test_output BUILD_TYPE: remote SAVE_PERF_REPORT: ${{ github.event.inputs.save_perf_report || ( github.ref_name == 'main' ) }} @@ -416,7 +427,7 @@ jobs: uses: ./.github/actions/neon-project-create with: region_id: ${{ matrix.region_id }} - postgres_version: ${{ env.DEFAULT_PG_VERSION }} + postgres_version: ${{ env.PG_VERSION }} api_key: ${{ secrets.NEON_STAGING_API_KEY }} compute_units: ${{ (contains(matrix.platform, 'captest-freetier') && '[0.25, 0.25]') || '[1, 1]' }} @@ -459,7 +470,7 @@ jobs: run_in_parallel: false save_perf_report: ${{ env.SAVE_PERF_REPORT }} extra_params: -m remote_cluster --timeout 21600 -k test_perf_many_relations - pg_version: ${{ env.DEFAULT_PG_VERSION }} + pg_version: ${{ env.PG_VERSION }} aws-oicd-role-arn: ${{ vars.DEV_AWS_OIDC_ROLE_ARN }} env: BENCHMARK_CONNSTR: ${{ 
steps.set-up-connstr.outputs.connstr }} @@ -475,7 +486,7 @@ jobs: run_in_parallel: false save_perf_report: ${{ env.SAVE_PERF_REPORT }} extra_params: -m remote_cluster --timeout 21600 -k test_pgbench_remote_init - pg_version: ${{ env.DEFAULT_PG_VERSION }} + pg_version: ${{ env.PG_VERSION }} aws-oicd-role-arn: ${{ vars.DEV_AWS_OIDC_ROLE_ARN }} env: BENCHMARK_CONNSTR: ${{ steps.set-up-connstr.outputs.connstr }} @@ -490,7 +501,7 @@ jobs: run_in_parallel: false save_perf_report: ${{ env.SAVE_PERF_REPORT }} extra_params: -m remote_cluster --timeout 21600 -k test_pgbench_remote_simple_update - pg_version: ${{ env.DEFAULT_PG_VERSION }} + pg_version: ${{ env.PG_VERSION }} aws-oicd-role-arn: ${{ vars.DEV_AWS_OIDC_ROLE_ARN }} env: BENCHMARK_CONNSTR: ${{ steps.set-up-connstr.outputs.connstr }} @@ -505,7 +516,7 @@ jobs: run_in_parallel: false save_perf_report: ${{ env.SAVE_PERF_REPORT }} extra_params: -m remote_cluster --timeout 21600 -k test_pgbench_remote_select_only - pg_version: ${{ env.DEFAULT_PG_VERSION }} + pg_version: ${{ env.PG_VERSION }} aws-oicd-role-arn: ${{ vars.DEV_AWS_OIDC_ROLE_ARN }} env: BENCHMARK_CONNSTR: ${{ steps.set-up-connstr.outputs.connstr }} @@ -549,14 +560,19 @@ jobs: include: - PLATFORM: "neonvm-captest-pgvector" RUNNER: [ self-hosted, us-east-2, x64 ] + postgres_version: 16 + - PLATFORM: "neonvm-captest-pgvector-pg17" + RUNNER: [ self-hosted, us-east-2, x64 ] + postgres_version: 17 - PLATFORM: "azure-captest-pgvector" RUNNER: [ self-hosted, eastus2, x64 ] + postgres_version: 16 env: TEST_PG_BENCH_DURATIONS_MATRIX: "15m" TEST_PG_BENCH_SCALES_MATRIX: "1" POSTGRES_DISTRIB_DIR: /tmp/neon/pg_install - DEFAULT_PG_VERSION: 16 + PG_VERSION: ${{ matrix.postgres_version }} TEST_OUTPUT: /tmp/test_output BUILD_TYPE: remote @@ -590,9 +606,13 @@ jobs: dpkg -x postgresql-client-16_16.6-1.pgdg120+1_${arch}.deb pg mkdir -p /tmp/neon/pg_install/v16/bin + mkdir -p /tmp/neon/pg_install/v17/bin ln -s /home/nonroot/pg/usr/lib/postgresql/16/bin/pgbench 
/tmp/neon/pg_install/v16/bin/pgbench ln -s /home/nonroot/pg/usr/lib/postgresql/16/bin/psql /tmp/neon/pg_install/v16/bin/psql ln -s /home/nonroot/pg/usr/lib/$(uname -m)-linux-gnu /tmp/neon/pg_install/v16/lib + ln -s /home/nonroot/pg/usr/lib/postgresql/16/bin/pgbench /tmp/neon/pg_install/v17/bin/pgbench + ln -s /home/nonroot/pg/usr/lib/postgresql/16/bin/psql /tmp/neon/pg_install/v17/bin/psql + ln -s /home/nonroot/pg/usr/lib/$(uname -m)-linux-gnu /tmp/neon/pg_install/v17/lib LD_LIBRARY_PATH="/home/nonroot/pg/usr/lib/$(uname -m)-linux-gnu:${LD_LIBRARY_PATH:-}" export LD_LIBRARY_PATH @@ -608,6 +628,9 @@ jobs: neonvm-captest-pgvector) CONNSTR=${{ secrets.BENCHMARK_PGVECTOR_CONNSTR }} ;; + neonvm-captest-pgvector-pg17) + CONNSTR=${{ secrets.BENCHMARK_PGVECTOR_CONNSTR_PG17 }} + ;; azure-captest-pgvector) CONNSTR=${{ secrets.BENCHMARK_PGVECTOR_CONNSTR_AZURE }} ;; @@ -634,7 +657,7 @@ jobs: run_in_parallel: false save_perf_report: ${{ env.SAVE_PERF_REPORT }} extra_params: -m remote_cluster --timeout 21600 -k test_pgvector_indexing - pg_version: ${{ env.DEFAULT_PG_VERSION }} + pg_version: ${{ env.PG_VERSION }} aws-oicd-role-arn: ${{ vars.DEV_AWS_OIDC_ROLE_ARN }} env: VIP_VAP_ACCESS_TOKEN: "${{ secrets.VIP_VAP_ACCESS_TOKEN }}" @@ -649,7 +672,7 @@ jobs: run_in_parallel: false save_perf_report: ${{ env.SAVE_PERF_REPORT }} extra_params: -m remote_cluster --timeout 21600 - pg_version: ${{ env.DEFAULT_PG_VERSION }} + pg_version: ${{ env.PG_VERSION }} aws-oicd-role-arn: ${{ vars.DEV_AWS_OIDC_ROLE_ARN }} env: BENCHMARK_CONNSTR: ${{ steps.set-up-connstr.outputs.connstr }} @@ -696,7 +719,7 @@ jobs: env: POSTGRES_DISTRIB_DIR: /tmp/neon/pg_install - DEFAULT_PG_VERSION: 16 + PG_VERSION: ${{ matrix.pg_version }} TEST_OUTPUT: /tmp/test_output TEST_OLAP_COLLECT_EXPLAIN: ${{ github.event.inputs.collect_olap_explain }} TEST_OLAP_COLLECT_PG_STAT_STATEMENTS: ${{ github.event.inputs.collect_pg_stat_statements }} @@ -739,7 +762,18 @@ jobs: run: | case "${PLATFORM}" in neonvm-captest-reuse) - 
CONNSTR=${{ secrets.BENCHMARK_CAPTEST_CLICKBENCH_10M_CONNSTR }} + case "${PG_VERSION}" in + 16) + CONNSTR=${{ secrets.BENCHMARK_CAPTEST_CLICKBENCH_10M_CONNSTR_V16 }} + ;; + 17) + CONNSTR=${{ secrets.BENCHMARK_CAPTEST_CONNSTR_PG17 }} + ;; + *) + echo >&2 "Unsupported PG_VERSION=${PG_VERSION} for PLATFORM=${PLATFORM}" + exit 1 + ;; + esac ;; rds-aurora) CONNSTR=${{ secrets.BENCHMARK_RDS_AURORA_CLICKBENCH_10M_CONNSTR }} @@ -763,7 +797,7 @@ jobs: run_in_parallel: false save_perf_report: ${{ env.SAVE_PERF_REPORT }} extra_params: -m remote_cluster --timeout 43200 -k test_clickbench - pg_version: ${{ env.DEFAULT_PG_VERSION }} + pg_version: ${{ env.PG_VERSION }} aws-oicd-role-arn: ${{ vars.DEV_AWS_OIDC_ROLE_ARN }} env: VIP_VAP_ACCESS_TOKEN: "${{ secrets.VIP_VAP_ACCESS_TOKEN }}" @@ -812,12 +846,11 @@ jobs: env: POSTGRES_DISTRIB_DIR: /tmp/neon/pg_install - DEFAULT_PG_VERSION: 16 + PG_VERSION: ${{ matrix.pg_version }} TEST_OUTPUT: /tmp/test_output BUILD_TYPE: remote SAVE_PERF_REPORT: ${{ github.event.inputs.save_perf_report || ( github.ref_name == 'main' ) }} PLATFORM: ${{ matrix.platform }} - TEST_OLAP_SCALE: ${{ matrix.scale }} runs-on: [ self-hosted, us-east-2, x64 ] container: @@ -849,21 +882,31 @@ jobs: run: | case "${PLATFORM}" in neonvm-captest-reuse) - ENV_PLATFORM=CAPTEST_TPCH + case "${PG_VERSION}" in + 16) + CONNSTR_SECRET_NAME="BENCHMARK_CAPTEST_TPCH_S10_CONNSTR" + ;; + 17) + CONNSTR_SECRET_NAME="BENCHMARK_CAPTEST_CONNSTR_PG17" + ;; + *) + echo >&2 "Unsupported PG_VERSION=${PG_VERSION} for PLATFORM=${PLATFORM}" + exit 1 + ;; + esac ;; rds-aurora) - ENV_PLATFORM=RDS_AURORA_TPCH + CONNSTR_SECRET_NAME="BENCHMARK_RDS_AURORA_TPCH_S10_CONNSTR" ;; rds-postgres) - ENV_PLATFORM=RDS_POSTGRES_TPCH + CONNSTR_SECRET_NAME="BENCHMARK_RDS_POSTGRES_TPCH_S10_CONNSTR" ;; *) echo >&2 "Unknown PLATFORM=${PLATFORM}. 
Allowed only 'neonvm-captest-reuse', 'rds-aurora', or 'rds-postgres'" exit 1 ;; esac - - CONNSTR_SECRET_NAME="BENCHMARK_${ENV_PLATFORM}_S${TEST_OLAP_SCALE}_CONNSTR" + echo "CONNSTR_SECRET_NAME=${CONNSTR_SECRET_NAME}" >> $GITHUB_ENV - name: Set up Connection String @@ -881,13 +924,13 @@ jobs: run_in_parallel: false save_perf_report: ${{ env.SAVE_PERF_REPORT }} extra_params: -m remote_cluster --timeout 21600 -k test_tpch - pg_version: ${{ env.DEFAULT_PG_VERSION }} + pg_version: ${{ env.PG_VERSION }} aws-oicd-role-arn: ${{ vars.DEV_AWS_OIDC_ROLE_ARN }} env: VIP_VAP_ACCESS_TOKEN: "${{ secrets.VIP_VAP_ACCESS_TOKEN }}" PERF_TEST_RESULT_CONNSTR: "${{ secrets.PERF_TEST_RESULT_CONNSTR }}" BENCHMARK_CONNSTR: ${{ steps.set-up-connstr.outputs.connstr }} - TEST_OLAP_SCALE: ${{ matrix.scale }} + TEST_OLAP_SCALE: 10 - name: Create Allure report id: create-allure-report @@ -922,7 +965,7 @@ jobs: env: POSTGRES_DISTRIB_DIR: /tmp/neon/pg_install - DEFAULT_PG_VERSION: 16 + PG_VERSION: ${{ matrix.pg_version }} TEST_OUTPUT: /tmp/test_output BUILD_TYPE: remote SAVE_PERF_REPORT: ${{ github.event.inputs.save_perf_report || ( github.ref_name == 'main' ) }} @@ -959,7 +1002,18 @@ jobs: run: | case "${PLATFORM}" in neonvm-captest-reuse) - CONNSTR=${{ secrets.BENCHMARK_USER_EXAMPLE_CAPTEST_CONNSTR }} + case "${PG_VERSION}" in + 16) + CONNSTR=${{ secrets.BENCHMARK_USER_EXAMPLE_CAPTEST_CONNSTR }} + ;; + 17) + CONNSTR=${{ secrets.BENCHMARK_CAPTEST_CONNSTR_PG17 }} + ;; + *) + echo >&2 "Unsupported PG_VERSION=${PG_VERSION} for PLATFORM=${PLATFORM}" + exit 1 + ;; + esac ;; rds-aurora) CONNSTR=${{ secrets.BENCHMARK_USER_EXAMPLE_RDS_AURORA_CONNSTR }} @@ -983,7 +1037,7 @@ jobs: run_in_parallel: false save_perf_report: ${{ env.SAVE_PERF_REPORT }} extra_params: -m remote_cluster --timeout 21600 -k test_user_examples - pg_version: ${{ env.DEFAULT_PG_VERSION }} + pg_version: ${{ env.PG_VERSION }} aws-oicd-role-arn: ${{ vars.DEV_AWS_OIDC_ROLE_ARN }} env: VIP_VAP_ACCESS_TOKEN: "${{ 
secrets.VIP_VAP_ACCESS_TOKEN }}" diff --git a/.github/workflows/build-macos.yml b/.github/workflows/build-macos.yml index 01d82a1ed2..347a511e98 100644 --- a/.github/workflows/build-macos.yml +++ b/.github/workflows/build-macos.yml @@ -235,7 +235,7 @@ jobs: echo 'CPPFLAGS=-I/usr/local/opt/openssl@3/include' >> $GITHUB_ENV - name: Run cargo build (only for v17) - run: PQ_LIB_DIR=$(pwd)/pg_install/v17/lib cargo build --all --release -j$(sysctl -n hw.ncpu) + run: cargo build --all --release -j$(sysctl -n hw.ncpu) - name: Check that no warnings are produced (only for v17) run: ./run_clippy.sh diff --git a/.github/workflows/build_and_test.yml b/.github/workflows/build_and_test.yml index 489a93f46d..e588fc5a0e 100644 --- a/.github/workflows/build_and_test.yml +++ b/.github/workflows/build_and_test.yml @@ -164,77 +164,11 @@ jobs: check-codestyle-rust: needs: [ check-permissions, build-build-tools-image ] - strategy: - matrix: - arch: [ x64, arm64 ] - runs-on: ${{ fromJson(format('["self-hosted", "{0}"]', matrix.arch == 'arm64' && 'small-arm64' || 'small')) }} - - container: - image: ${{ needs.build-build-tools-image.outputs.image }}-bookworm - credentials: - username: ${{ secrets.NEON_DOCKERHUB_USERNAME }} - password: ${{ secrets.NEON_DOCKERHUB_PASSWORD }} - options: --init - - steps: - - name: Checkout - uses: actions/checkout@v4 - with: - submodules: true - - - name: Cache cargo deps - uses: actions/cache@v4 - with: - path: | - ~/.cargo/registry - !~/.cargo/registry/src - ~/.cargo/git - target - key: v1-${{ runner.os }}-${{ runner.arch }}-cargo-${{ hashFiles('./Cargo.lock') }}-${{ hashFiles('./rust-toolchain.toml') }}-rust - - # Some of our rust modules use FFI and need those to be checked - - name: Get postgres headers - run: make postgres-headers -j$(nproc) - - # cargo hack runs the given cargo subcommand (clippy in this case) for all feature combinations. - # This will catch compiler & clippy warnings in all feature combinations. 
- # TODO: use cargo hack for build and test as well, but, that's quite expensive. - # NB: keep clippy args in sync with ./run_clippy.sh - # - # The only difference between "clippy --debug" and "clippy --release" is that in --release mode, - # #[cfg(debug_assertions)] blocks are not built. It's not worth building everything for second - # time just for that, so skip "clippy --release". - - run: | - CLIPPY_COMMON_ARGS="$( source .neon_clippy_args; echo "$CLIPPY_COMMON_ARGS")" - if [ "$CLIPPY_COMMON_ARGS" = "" ]; then - echo "No clippy args found in .neon_clippy_args" - exit 1 - fi - echo "CLIPPY_COMMON_ARGS=${CLIPPY_COMMON_ARGS}" >> $GITHUB_ENV - - name: Run cargo clippy (debug) - run: cargo hack --features default --ignore-unknown-features --feature-powerset clippy $CLIPPY_COMMON_ARGS - - - name: Check documentation generation - run: cargo doc --workspace --no-deps --document-private-items - env: - RUSTDOCFLAGS: "-Dwarnings -Arustdoc::private_intra_doc_links" - - # Use `${{ !cancelled() }}` to run quck tests after the longer clippy run - - name: Check formatting - if: ${{ !cancelled() }} - run: cargo fmt --all -- --check - - # https://github.com/facebookincubator/cargo-guppy/tree/bec4e0eb29dcd1faac70b1b5360267fc02bf830e/tools/cargo-hakari#2-keep-the-workspace-hack-up-to-date-in-ci - - name: Check rust dependencies - if: ${{ !cancelled() }} - run: | - cargo hakari generate --diff # workspace-hack Cargo.toml is up-to-date - cargo hakari manage-deps --dry-run # all workspace crates depend on workspace-hack - - # https://github.com/EmbarkStudios/cargo-deny - - name: Check rust licenses/bans/advisories/sources - if: ${{ !cancelled() }} - run: cargo deny check --hide-inclusion-graph + uses: ./.github/workflows/_check-codestyle-rust.yml + with: + build-tools-image: ${{ needs.build-build-tools-image.outputs.image }}-bookworm + archs: '["x64", "arm64"]' + secrets: inherit build-and-test-locally: needs: [ tag, build-build-tools-image ] @@ -308,7 +242,7 @@ jobs: statuses: 
write contents: write pull-requests: write - runs-on: [ self-hosted, small ] + runs-on: [ self-hosted, small-metal ] container: image: ${{ needs.build-build-tools-image.outputs.image }}-bookworm credentials: @@ -852,6 +786,17 @@ jobs: username: ${{ secrets.NEON_DOCKERHUB_USERNAME }} password: ${{ secrets.NEON_DOCKERHUB_PASSWORD }} + - name: Get the last compute release tag + id: get-last-compute-release-tag + env: + GH_TOKEN: ${{ secrets.CI_ACCESS_TOKEN }} + run: | + tag=$(gh api -q '[.[].tag_name | select(startswith("release-compute"))][0]'\ + -H "Accept: application/vnd.github+json" \ + -H "X-GitHub-Api-Version: 2022-11-28" \ + "/repos/${{ github.repository }}/releases") + echo tag=${tag} >> ${GITHUB_OUTPUT} + # `neondatabase/neon` contains multiple binaries, all of them use the same input for the version into the same version formatting library. # Pick pageserver as currently the only binary with extra "version" features printed in the string to verify. # Regular pageserver version string looks like @@ -883,14 +828,28 @@ jobs: TEST_VERSION_ONLY: ${{ matrix.pg_version }} run: ./docker-compose/docker_compose_test.sh + - name: Print logs and clean up docker-compose test + if: always() + run: | + docker compose --profile test-extensions -f ./docker-compose/docker-compose.yml logs || true + docker compose --profile test-extensions -f ./docker-compose/docker-compose.yml down + + - name: Test extension upgrade + timeout-minutes: 20 + if: ${{ needs.tag.outputs.build-tag == github.run_id }} + env: + NEWTAG: ${{ needs.tag.outputs.build-tag }} + OLDTAG: ${{ steps.get-last-compute-release-tag.outputs.tag }} + run: ./docker-compose/test_extensions_upgrade.sh + - name: Print logs and clean up if: always() run: | - docker compose -f ./docker-compose/docker-compose.yml logs || 0 - docker compose -f ./docker-compose/docker-compose.yml down + docker compose --profile test-extensions -f ./docker-compose/docker-compose.yml logs || true + docker compose --profile test-extensions -f 
./docker-compose/docker-compose.yml down promote-images-dev: - needs: [ check-permissions, tag, vm-compute-node-image ] + needs: [ check-permissions, tag, vm-compute-node-image, neon-image ] runs-on: ubuntu-22.04 permissions: @@ -925,7 +884,7 @@ jobs: done promote-images-prod: - needs: [ check-permissions, tag, test-images, vm-compute-node-image ] + needs: [ check-permissions, tag, test-images, promote-images-dev ] runs-on: ubuntu-22.04 if: github.ref_name == 'main' || github.ref_name == 'release' || github.ref_name == 'release-proxy' || github.ref_name == 'release-compute' @@ -1116,6 +1075,7 @@ jobs: retries: 5 script: | const tag = "${{ needs.tag.outputs.build-tag }}"; + const branch = "${{ github.ref_name }}"; try { const existingRef = await github.rest.git.getRef({ @@ -1144,12 +1104,6 @@ jobs: console.log(`Tag ${tag} created successfully.`); } - // TODO: check how GitHub releases looks for proxy/compute releases and enable them if they're ok - if (context.ref !== 'refs/heads/release') { - console.log(`GitHub release skipped for ${context.ref}.`); - return; - } - try { const existingRelease = await github.rest.repos.getReleaseByTag({ owner: context.repo.owner, @@ -1164,11 +1118,48 @@ jobs: } console.log(`Release for tag ${tag} does not exist. Creating it...`); + + // Find the PR number using the commit SHA + const pullRequests = await github.rest.pulls.list({ + owner: context.repo.owner, + repo: context.repo.repo, + state: 'closed', + base: branch, + }); + + const pr = pullRequests.data.find(pr => pr.merge_commit_sha === context.sha); + const prNumber = pr ? 
pr.number : null; + + // Find the previous release on the branch + const releases = await github.rest.repos.listReleases({ + owner: context.repo.owner, + repo: context.repo.repo, + per_page: 100, + }); + + const branchReleases = releases.data + .filter((release) => { + const regex = new RegExp(`^${branch}-\\d+$`); + return regex.test(release.tag_name) && !release.draft && !release.prerelease; + }) + .sort((a, b) => new Date(b.created_at) - new Date(a.created_at)); + + const previousTag = branchReleases.length > 0 ? branchReleases[0].tag_name : null; + + const releaseNotes = [ + prNumber + ? `Release PR https://github.com/${context.repo.owner}/${context.repo.repo}/pull/${prNumber}.` + : 'Release PR not found.', + previousTag + ? `Diff with the previous release https://github.com/${context.repo.owner}/${context.repo.repo}/compare/${previousTag}...${tag}.` + : `No previous release found on branch ${branch}.`, + ].join('\n\n'); + await github.rest.repos.createRelease({ owner: context.repo.owner, repo: context.repo.repo, tag_name: tag, - generate_release_notes: true, + body: releaseNotes, }); console.log(`Release for tag ${tag} created successfully.`); } diff --git a/.github/workflows/ingest_benchmark.yml b/.github/workflows/ingest_benchmark.yml index fc33c0a980..7b303fa37a 100644 --- a/.github/workflows/ingest_benchmark.yml +++ b/.github/workflows/ingest_benchmark.yml @@ -28,7 +28,24 @@ jobs: strategy: fail-fast: false # allow other variants to continue even if one fails matrix: - target_project: [new_empty_project, large_existing_project] + include: + - target_project: new_empty_project_stripe_size_2048 + stripe_size: 2048 # 16 MiB + postgres_version: 16 + - target_project: new_empty_project_stripe_size_32768 + stripe_size: 32768 # 256 MiB # note that this is different from null because using null will shard_split the project only if it reaches the threshold + # while here it is sharded from the beginning with a shard size of 256 MiB + postgres_version: 16 + - 
target_project: new_empty_project + stripe_size: null # run with neon defaults which will shard split only when reaching the threshold + postgres_version: 16 + - target_project: new_empty_project + stripe_size: null # run with neon defaults which will shard split only when reaching the threshold + postgres_version: 17 + - target_project: large_existing_project + stripe_size: null # cannot re-shard or choose different stripe size for existing, already sharded project + postgres_version: 16 + max-parallel: 1 # we want to run each stripe size sequentially to be able to compare the results permissions: contents: write statuses: write @@ -67,17 +84,21 @@ jobs: aws-oicd-role-arn: ${{ vars.DEV_AWS_OIDC_ROLE_ARN }} - name: Create Neon Project - if: ${{ matrix.target_project == 'new_empty_project' }} + if: ${{ startsWith(matrix.target_project, 'new_empty_project') }} id: create-neon-project-ingest-target uses: ./.github/actions/neon-project-create with: region_id: aws-us-east-2 - postgres_version: 16 + postgres_version: ${{ matrix.postgres_version }} compute_units: '[7, 7]' # we want to test large compute here to avoid compute-side bottleneck api_key: ${{ secrets.NEON_STAGING_API_KEY }} + shard_split_project: ${{ matrix.stripe_size != null && 'true' || 'false' }} + admin_api_key: ${{ secrets.NEON_STAGING_ADMIN_API_KEY }} + shard_count: 8 + stripe_size: ${{ matrix.stripe_size }} - name: Initialize Neon project - if: ${{ matrix.target_project == 'new_empty_project' }} + if: ${{ startsWith(matrix.target_project, 'new_empty_project') }} env: BENCHMARK_INGEST_TARGET_CONNSTR: ${{ steps.create-neon-project-ingest-target.outputs.dsn }} NEW_PROJECT_ID: ${{ steps.create-neon-project-ingest-target.outputs.project_id }} @@ -130,7 +151,7 @@ jobs: test_selection: performance/test_perf_ingest_using_pgcopydb.py run_in_parallel: false extra_params: -s -m remote_cluster --timeout 86400 -k test_ingest_performance_using_pgcopydb - pg_version: v16 + pg_version: v${{ matrix.postgres_version }} 
save_perf_report: true aws-oicd-role-arn: ${{ vars.DEV_AWS_OIDC_ROLE_ARN }} env: @@ -146,7 +167,7 @@ jobs: ${PSQL} "${BENCHMARK_INGEST_TARGET_CONNSTR}" -c "\dt+" - name: Delete Neon Project - if: ${{ always() && matrix.target_project == 'new_empty_project' }} + if: ${{ always() && startsWith(matrix.target_project, 'new_empty_project') }} uses: ./.github/actions/neon-project-delete with: project_id: ${{ steps.create-neon-project-ingest-target.outputs.project_id }} diff --git a/.github/workflows/neon_extra_builds.yml b/.github/workflows/neon_extra_builds.yml index 5b5910badf..f077e04d1c 100644 --- a/.github/workflows/neon_extra_builds.yml +++ b/.github/workflows/neon_extra_builds.yml @@ -114,7 +114,7 @@ jobs: run: make walproposer-lib -j$(nproc) - name: Produce the build stats - run: PQ_LIB_DIR=$(pwd)/pg_install/v17/lib cargo build --all --release --timings -j$(nproc) + run: cargo build --all --release --timings -j$(nproc) - name: Configure AWS credentials uses: aws-actions/configure-aws-credentials@v4 diff --git a/.github/workflows/pre-merge-checks.yml b/.github/workflows/pre-merge-checks.yml index b2e00d94f7..e6dfbaeed8 100644 --- a/.github/workflows/pre-merge-checks.yml +++ b/.github/workflows/pre-merge-checks.yml @@ -1,6 +1,12 @@ name: Pre-merge checks on: + pull_request: + paths: + - .github/workflows/_check-codestyle-python.yml + - .github/workflows/_check-codestyle-rust.yml + - .github/workflows/build-build-tools-image.yml + - .github/workflows/pre-merge-checks.yml merge_group: branches: - main @@ -17,8 +23,10 @@ jobs: runs-on: ubuntu-22.04 outputs: python-changed: ${{ steps.python-src.outputs.any_changed }} + rust-changed: ${{ steps.rust-src.outputs.any_changed }} steps: - uses: actions/checkout@v4 + - uses: tj-actions/changed-files@4edd678ac3f81e2dc578756871e4d00c19191daf # v45.0.4 id: python-src with: @@ -30,11 +38,25 @@ jobs: poetry.lock pyproject.toml + - uses: tj-actions/changed-files@4edd678ac3f81e2dc578756871e4d00c19191daf # v45.0.4 + id: rust-src + 
with: + files: | + .github/workflows/_check-codestyle-rust.yml + .github/workflows/build-build-tools-image.yml + .github/workflows/pre-merge-checks.yml + **/**.rs + **/Cargo.toml + Cargo.toml + Cargo.lock + - name: PRINT ALL CHANGED FILES FOR DEBUG PURPOSES env: PYTHON_CHANGED_FILES: ${{ steps.python-src.outputs.all_changed_files }} + RUST_CHANGED_FILES: ${{ steps.rust-src.outputs.all_changed_files }} run: | echo "${PYTHON_CHANGED_FILES}" + echo "${RUST_CHANGED_FILES}" build-build-tools-image: if: needs.get-changed-files.outputs.python-changed == 'true' @@ -55,6 +77,16 @@ jobs: build-tools-image: ${{ needs.build-build-tools-image.outputs.image }}-bookworm-x64 secrets: inherit + check-codestyle-rust: + if: needs.get-changed-files.outputs.rust-changed == 'true' + needs: [ get-changed-files, build-build-tools-image ] + uses: ./.github/workflows/_check-codestyle-rust.yml + with: + # `-bookworm-x64` suffix should match the combination in `build-build-tools-image` + build-tools-image: ${{ needs.build-build-tools-image.outputs.image }}-bookworm-x64 + archs: '["x64"]' + secrets: inherit + # To get items from the merge queue merged into main we need to satisfy "Status checks that are required". 
# Currently we require 2 jobs (checks with exact name): # - conclusion @@ -67,6 +99,7 @@ jobs: needs: - get-changed-files - check-codestyle-python + - check-codestyle-rust runs-on: ubuntu-22.04 steps: - name: Create fake `neon-cloud-e2e` check diff --git a/.github/workflows/release.yml b/.github/workflows/release.yml index 3c1af1d9c6..919846ce44 100644 --- a/.github/workflows/release.yml +++ b/.github/workflows/release.yml @@ -3,8 +3,9 @@ name: Create Release Branch on: schedule: # It should be kept in sync with if-condition in jobs - - cron: '0 6 * * FRI' # Storage release - cron: '0 6 * * THU' # Proxy release + - cron: '0 6 * * FRI' # Storage release + - cron: '0 7 * * FRI' # Compute release workflow_dispatch: inputs: create-storage-release-branch: @@ -55,7 +56,7 @@ jobs: ci-access-token: ${{ secrets.CI_ACCESS_TOKEN }} create-compute-release-branch: - if: inputs.create-compute-release-branch + if: ${{ github.event.schedule == '0 7 * * FRI' || inputs.create-compute-release-branch }} permissions: contents: write diff --git a/Cargo.lock b/Cargo.lock index 3f184ebe0b..9ba90355df 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -179,7 +179,7 @@ dependencies = [ "nom", "num-traits", "rusticata-macros", - "thiserror", + "thiserror 1.0.69", "time", ] @@ -718,14 +718,14 @@ dependencies = [ [[package]] name = "axum" -version = "0.7.9" +version = "0.8.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "edca88bc138befd0323b20752846e6587272d3b03b0343c8ea28a6f819e6e71f" +checksum = "6d6fd624c75e18b3b4c6b9caf42b1afe24437daaee904069137d8bab077be8b8" dependencies = [ - "async-trait", "axum-core", "base64 0.22.1", "bytes", + "form_urlencoded", "futures-util", "http 1.1.0", "http-body 1.0.0", @@ -733,7 +733,7 @@ dependencies = [ "hyper 1.4.1", "hyper-util", "itoa", - "matchit 0.7.0", + "matchit", "memchr", "mime", "percent-encoding", @@ -746,7 +746,7 @@ dependencies = [ "sha1", "sync_wrapper 1.0.1", "tokio", - "tokio-tungstenite 0.24.0", + "tokio-tungstenite 
0.26.1", "tower 0.5.2", "tower-layer", "tower-service", @@ -755,11 +755,10 @@ dependencies = [ [[package]] name = "axum-core" -version = "0.4.5" +version = "0.5.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "09f2bd6146b97ae3359fa0cc6d6b376d9539582c7b4220f041a33ec24c226199" +checksum = "df1362f362fd16024ae199c1970ce98f9661bf5ef94b9808fee734bc3698b733" dependencies = [ - "async-trait", "bytes", "futures-util", "http 1.1.0", @@ -942,6 +941,18 @@ version = "1.6.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "8c3c1a368f70d6cf7302d78f8f7093da241fb8e8807c05cc9e51a125895a6d5b" +[[package]] +name = "bb8" +version = "0.8.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d89aabfae550a5c44b43ab941844ffcd2e993cb6900b342debf59e9ea74acdb8" +dependencies = [ + "async-trait", + "futures-util", + "parking_lot 0.12.1", + "tokio", +] + [[package]] name = "bcder" version = "0.7.4" @@ -1118,7 +1129,7 @@ dependencies = [ "log", "nix 0.25.1", "regex", - "thiserror", + "thiserror 1.0.69", ] [[package]] @@ -1299,9 +1310,9 @@ dependencies = [ "serde_with", "signal-hook", "tar", - "thiserror", + "thiserror 1.0.69", "tokio", - "tokio-postgres", + "tokio-postgres 0.7.9", "tokio-stream", "tokio-util", "tower 0.5.2", @@ -1312,6 +1323,7 @@ dependencies = [ "tracing-utils", "url", "utils", + "uuid", "vm_monitor", "workspace_hack", "zstd", @@ -1407,9 +1419,9 @@ dependencies = [ "serde", "serde_json", "storage_broker", - "thiserror", + "thiserror 1.0.69", "tokio", - "tokio-postgres", + "tokio-postgres 0.7.9", "tokio-util", "toml", "toml_edit", @@ -1785,11 +1797,24 @@ dependencies = [ "chrono", "diesel_derives", "itoa", - "pq-sys", - "r2d2", "serde_json", ] +[[package]] +name = "diesel-async" +version = "0.5.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "51a307ac00f7c23f526a04a77761a0519b9f0eb2838ebf5b905a58580095bdcb" +dependencies = [ + "async-trait", + "bb8", + 
"diesel", + "futures-util", + "scoped-futures", + "tokio", + "tokio-postgres 0.7.12", +] + [[package]] name = "diesel_derives" version = "2.2.1" @@ -2238,7 +2263,7 @@ dependencies = [ "pin-project", "rand 0.8.5", "sha1", - "thiserror", + "thiserror 1.0.69", "tokio", "tokio-util", ] @@ -3364,12 +3389,6 @@ dependencies = [ "regex-automata 0.1.10", ] -[[package]] -name = "matchit" -version = "0.7.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b87248edafb776e59e6ee64a79086f65890d3510f2c656c000bf2a7e8a0aea40" - [[package]] name = "matchit" version = "0.8.4" @@ -3760,7 +3779,7 @@ dependencies = [ "serde_json", "serde_path_to_error", "sha2", - "thiserror", + "thiserror 1.0.69", "url", ] @@ -3810,7 +3829,7 @@ dependencies = [ "futures-sink", "js-sys", "pin-project-lite", - "thiserror", + "thiserror 1.0.69", "tracing", ] @@ -3842,7 +3861,7 @@ dependencies = [ "opentelemetry_sdk", "prost", "reqwest", - "thiserror", + "thiserror 1.0.69", ] [[package]] @@ -3878,7 +3897,7 @@ dependencies = [ "percent-encoding", "rand 0.8.5", "serde_json", - "thiserror", + "thiserror 1.0.69", "tokio", "tokio-stream", "tracing", @@ -3992,7 +4011,7 @@ dependencies = [ "remote_storage", "serde_json", "svg_fmt", - "thiserror", + "thiserror 1.0.69", "tokio", "tokio-util", "utils", @@ -4041,8 +4060,8 @@ dependencies = [ "pageserver_compaction", "pin-project-lite", "postgres", - "postgres-protocol", - "postgres-types", + "postgres-protocol 0.6.6", + "postgres-types 0.2.6", "postgres_backend", "postgres_connection", "postgres_ffi", @@ -4068,12 +4087,12 @@ dependencies = [ "strum_macros", "sysinfo", "tenant_size_model", - "thiserror", + "thiserror 1.0.69", "tikv-jemallocator", "tokio", "tokio-epoll-uring", "tokio-io-timeout", - "tokio-postgres", + "tokio-postgres 0.7.9", "tokio-stream", "tokio-tar", "tokio-util", @@ -4114,7 +4133,7 @@ dependencies = [ "storage_broker", "strum", "strum_macros", - "thiserror", + "thiserror 1.0.69", "utils", ] @@ -4129,9 +4148,9 @@ 
dependencies = [ "postgres", "reqwest", "serde", - "thiserror", + "thiserror 1.0.69", "tokio", - "tokio-postgres", + "tokio-postgres 0.7.9", "tokio-stream", "tokio-util", "utils", @@ -4429,23 +4448,23 @@ dependencies = [ [[package]] name = "postgres" -version = "0.19.4" -source = "git+https://github.com/neondatabase/rust-postgres.git?branch=neon#511f998c00148ab7c847bd7e6cfd3a906d0e7473" +version = "0.19.6" +source = "git+https://github.com/neondatabase/rust-postgres.git?branch=neon#8b44892f7851e705810b2cb54504325699966070" dependencies = [ "bytes", "fallible-iterator", "futures-util", "log", "tokio", - "tokio-postgres", + "tokio-postgres 0.7.9", ] [[package]] name = "postgres-protocol" -version = "0.6.4" -source = "git+https://github.com/neondatabase/rust-postgres.git?branch=neon#511f998c00148ab7c847bd7e6cfd3a906d0e7473" +version = "0.6.6" +source = "git+https://github.com/neondatabase/rust-postgres.git?branch=neon#8b44892f7851e705810b2cb54504325699966070" dependencies = [ - "base64 0.20.0", + "base64 0.21.1", "byteorder", "bytes", "fallible-iterator", @@ -4458,6 +4477,24 @@ dependencies = [ "stringprep", ] +[[package]] +name = "postgres-protocol" +version = "0.6.7" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "acda0ebdebc28befa84bee35e651e4c5f09073d668c7aed4cf7e23c3cda84b23" +dependencies = [ + "base64 0.22.1", + "byteorder", + "bytes", + "fallible-iterator", + "hmac", + "md-5", + "memchr", + "rand 0.8.5", + "sha2", + "stringprep", +] + [[package]] name = "postgres-protocol2" version = "0.1.0" @@ -4476,12 +4513,24 @@ dependencies = [ [[package]] name = "postgres-types" -version = "0.2.4" -source = "git+https://github.com/neondatabase/rust-postgres.git?branch=neon#511f998c00148ab7c847bd7e6cfd3a906d0e7473" +version = "0.2.6" +source = "git+https://github.com/neondatabase/rust-postgres.git?branch=neon#8b44892f7851e705810b2cb54504325699966070" +dependencies = [ + "bytes", + "chrono", + "fallible-iterator", + "postgres-protocol 0.6.6", 
+] + +[[package]] +name = "postgres-types" +version = "0.2.8" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f66ea23a2d0e5734297357705193335e0a957696f34bed2f2faefacb2fec336f" dependencies = [ "bytes", "fallible-iterator", - "postgres-protocol", + "postgres-protocol 0.6.7", ] [[package]] @@ -4504,9 +4553,9 @@ dependencies = [ "rustls 0.23.18", "rustls-pemfile 2.1.1", "serde", - "thiserror", + "thiserror 1.0.69", "tokio", - "tokio-postgres", + "tokio-postgres 0.7.9", "tokio-postgres-rustls", "tokio-rustls 0.26.0", "tokio-util", @@ -4521,7 +4570,7 @@ dependencies = [ "itertools 0.10.5", "once_cell", "postgres", - "tokio-postgres", + "tokio-postgres 0.7.9", "url", ] @@ -4542,7 +4591,7 @@ dependencies = [ "pprof", "regex", "serde", - "thiserror", + "thiserror 1.0.69", "tracing", "utils", ] @@ -4553,7 +4602,7 @@ version = "0.1.0" dependencies = [ "anyhow", "camino", - "thiserror", + "thiserror 1.0.69", "tokio", "workspace_hack", ] @@ -4586,7 +4635,7 @@ dependencies = [ "smallvec", "symbolic-demangle", "tempfile", - "thiserror", + "thiserror 1.0.69", ] [[package]] @@ -4608,15 +4657,6 @@ version = "0.2.17" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "5b40af805b3121feab8a3c29f04d8ad262fa8e0561883e7653e024ae4479e6de" -[[package]] -name = "pq-sys" -version = "0.6.3" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "f6cc05d7ea95200187117196eee9edd0644424911821aeb28a18ce60ea0b8793" -dependencies = [ - "vcpkg", -] - [[package]] name = "pq_proto" version = "0.1.0" @@ -4624,10 +4664,10 @@ dependencies = [ "byteorder", "bytes", "itertools 0.10.5", - "postgres-protocol", + "postgres-protocol 0.6.6", "rand 0.8.5", "serde", - "thiserror", + "thiserror 1.0.69", "tokio", ] @@ -4698,7 +4738,7 @@ dependencies = [ "memchr", "parking_lot 0.12.1", "procfs", - "thiserror", + "thiserror 1.0.69", ] [[package]] @@ -4868,11 +4908,11 @@ dependencies = [ "strum", "strum_macros", "subtle", - "thiserror", + 
"thiserror 1.0.69", "tikv-jemalloc-ctl", "tikv-jemallocator", "tokio", - "tokio-postgres", + "tokio-postgres 0.7.9", "tokio-postgres2", "tokio-rustls 0.26.0", "tokio-tungstenite 0.21.0", @@ -4929,17 +4969,6 @@ dependencies = [ "proc-macro2", ] -[[package]] -name = "r2d2" -version = "0.8.10" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "51de85fb3fb6524929c8a2eb85e6b6d363de4e8c48f9e2c2eac4944abc181c93" -dependencies = [ - "log", - "parking_lot 0.12.1", - "scheduled-thread-pool", -] - [[package]] name = "rand" version = "0.7.3" @@ -5276,7 +5305,7 @@ dependencies = [ "http 1.1.0", "reqwest", "serde", - "thiserror", + "thiserror 1.0.69", "tower-service", ] @@ -5296,7 +5325,7 @@ dependencies = [ "reqwest", "reqwest-middleware", "retry-policies", - "thiserror", + "thiserror 1.0.69", "tokio", "tracing", "wasm-timer", @@ -5312,7 +5341,7 @@ dependencies = [ "async-trait", "getrandom 0.2.11", "http 1.1.0", - "matchit 0.8.4", + "matchit", "opentelemetry", "reqwest", "reqwest-middleware", @@ -5671,7 +5700,7 @@ dependencies = [ "pageserver_api", "parking_lot 0.12.1", "postgres", - "postgres-protocol", + "postgres-protocol 0.6.6", "postgres_backend", "postgres_ffi", "pprof", @@ -5691,11 +5720,11 @@ dependencies = [ "storage_broker", "strum", "strum_macros", - "thiserror", + "thiserror 1.0.69", "tikv-jemallocator", "tokio", "tokio-io-timeout", - "tokio-postgres", + "tokio-postgres 0.7.9", "tokio-stream", "tokio-tar", "tokio-util", @@ -5730,7 +5759,7 @@ dependencies = [ "reqwest", "safekeeper_api", "serde", - "thiserror", + "thiserror 1.0.69", "utils", "workspace_hack", ] @@ -5754,12 +5783,12 @@ dependencies = [ ] [[package]] -name = "scheduled-thread-pool" -version = "0.2.7" +name = "scoped-futures" +version = "0.1.4" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "3cbc66816425a074528352f5789333ecff06ca41b36b0b0efdfbb29edc391a19" +checksum = "1b24aae2d0636530f359e9d5ef0c04669d11c5e756699b27a6a6d845d8329091" 
dependencies = [ - "parking_lot 0.12.1", + "pin-project-lite", ] [[package]] @@ -5939,7 +5968,7 @@ dependencies = [ "rand 0.8.5", "serde", "serde_json", - "thiserror", + "thiserror 1.0.69", "time", "url", "uuid", @@ -6011,7 +6040,7 @@ checksum = "c7715380eec75f029a4ef7de39a9200e0a63823176b759d055b613f5a87df6a6" dependencies = [ "percent-encoding", "serde", - "thiserror", + "thiserror 1.0.69", ] [[package]] @@ -6173,7 +6202,7 @@ checksum = "adc4e5204eb1910f40f9cfa375f6f05b68c3abac4b6fd879c8ff5e7ae8a0a085" dependencies = [ "num-bigint", "num-traits", - "thiserror", + "thiserror 1.0.69", "time", ] @@ -6294,6 +6323,7 @@ dependencies = [ "clap", "control_plane", "diesel", + "diesel-async", "diesel_migrations", "fail", "futures", @@ -6308,16 +6338,16 @@ dependencies = [ "pageserver_api", "pageserver_client", "postgres_connection", - "r2d2", "rand 0.8.5", "reqwest", "routerify", + "scoped-futures", "scopeguard", "serde", "serde_json", "strum", "strum_macros", - "thiserror", + "thiserror 1.0.69", "tokio", "tokio-util", "tracing", @@ -6364,7 +6394,7 @@ dependencies = [ "serde_json", "storage_controller_client", "tokio", - "tokio-postgres", + "tokio-postgres 0.7.9", "tokio-postgres-rustls", "tokio-stream", "tokio-util", @@ -6609,7 +6639,16 @@ version = "1.0.69" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "b6aaf5339b578ea85b50e080feb250a3e8ae8cfcdff9a461c9ec2904bc923f52" dependencies = [ - "thiserror-impl", + "thiserror-impl 1.0.69", +] + +[[package]] +name = "thiserror" +version = "2.0.11" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d452f284b73e6d76dd36758a0c8684b1d5be31f92b89d07fd5822175732206fc" +dependencies = [ + "thiserror-impl 2.0.11", ] [[package]] @@ -6623,6 +6662,17 @@ dependencies = [ "syn 2.0.90", ] +[[package]] +name = "thiserror-impl" +version = "2.0.11" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "26afc1baea8a989337eeb52b6e72a039780ce45c3edfcc9c5b9d112feeb173c2" 
+dependencies = [ + "proc-macro2", + "quote", + "syn 2.0.90", +] + [[package]] name = "thread_local" version = "1.1.7" @@ -6773,13 +6823,13 @@ dependencies = [ [[package]] name = "tokio-epoll-uring" version = "0.1.0" -source = "git+https://github.com/neondatabase/tokio-epoll-uring.git?branch=main#33e00106a268644d02ba0461bbd64476073b0ee1" +source = "git+https://github.com/neondatabase/tokio-epoll-uring.git?branch=main#781989bb540a1408b0b93daa1e9d1fa452195497" dependencies = [ "futures", "nix 0.26.4", "once_cell", "scopeguard", - "thiserror", + "thiserror 1.0.69", "tokio", "tokio-util", "tracing", @@ -6809,8 +6859,8 @@ dependencies = [ [[package]] name = "tokio-postgres" -version = "0.7.7" -source = "git+https://github.com/neondatabase/rust-postgres.git?branch=neon#511f998c00148ab7c847bd7e6cfd3a906d0e7473" +version = "0.7.9" +source = "git+https://github.com/neondatabase/rust-postgres.git?branch=neon#8b44892f7851e705810b2cb54504325699966070" dependencies = [ "async-trait", "byteorder", @@ -6823,11 +6873,39 @@ dependencies = [ "percent-encoding", "phf", "pin-project-lite", - "postgres-protocol", - "postgres-types", + "postgres-protocol 0.6.6", + "postgres-types 0.2.6", + "rand 0.8.5", "socket2", "tokio", "tokio-util", + "whoami", +] + +[[package]] +name = "tokio-postgres" +version = "0.7.12" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "3b5d3742945bc7d7f210693b0c58ae542c6fd47b17adbbda0885f3dcb34a6bdb" +dependencies = [ + "async-trait", + "byteorder", + "bytes", + "fallible-iterator", + "futures-channel", + "futures-util", + "log", + "parking_lot 0.12.1", + "percent-encoding", + "phf", + "pin-project-lite", + "postgres-protocol 0.6.7", + "postgres-types 0.2.8", + "rand 0.8.5", + "socket2", + "tokio", + "tokio-util", + "whoami", ] [[package]] @@ -6839,7 +6917,7 @@ dependencies = [ "ring", "rustls 0.23.18", "tokio", - "tokio-postgres", + "tokio-postgres 0.7.9", "tokio-rustls 0.26.0", "x509-certificate", ] @@ -6860,6 +6938,7 @@ 
dependencies = [ "pin-project-lite", "postgres-protocol2", "postgres-types2", + "serde", "tokio", "tokio-util", ] @@ -6936,14 +7015,14 @@ dependencies = [ [[package]] name = "tokio-tungstenite" -version = "0.24.0" +version = "0.26.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "edc5f74e248dc973e0dbb7b74c7e0d6fcc301c694ff50049504004ef4d0cdcd9" +checksum = "be4bf6fecd69fcdede0ec680aaf474cdab988f9de6bc73d3758f0160e3b7025a" dependencies = [ "futures-util", "log", "tokio", - "tungstenite 0.24.0", + "tungstenite 0.26.1", ] [[package]] @@ -7003,12 +7082,9 @@ version = "0.12.3" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "877c5b330756d856ffcc4553ab34a5684481ade925ecc54bcd1bf02b1d0d4d52" dependencies = [ - "async-stream", "async-trait", - "axum", "base64 0.22.1", "bytes", - "h2 0.4.4", "http 1.1.0", "http-body 1.0.0", "http-body-util", @@ -7020,7 +7096,6 @@ dependencies = [ "prost", "rustls-native-certs 0.8.0", "rustls-pemfile 2.1.1", - "socket2", "tokio", "tokio-rustls 0.26.0", "tokio-stream", @@ -7257,16 +7332,16 @@ dependencies = [ "log", "rand 0.8.5", "sha1", - "thiserror", + "thiserror 1.0.69", "url", "utf-8", ] [[package]] name = "tungstenite" -version = "0.24.0" +version = "0.26.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "18e5b8366ee7a95b16d32197d0b2604b43a0be89dc5fac9f8e96ccafbaedda8a" +checksum = "413083a99c579593656008130e29255e54dcaae495be556cc26888f211648c24" dependencies = [ "byteorder", "bytes", @@ -7276,7 +7351,7 @@ dependencies = [ "log", "rand 0.8.5", "sha1", - "thiserror", + "thiserror 2.0.11", "utf-8", ] @@ -7372,7 +7447,7 @@ dependencies = [ [[package]] name = "uring-common" version = "0.1.0" -source = "git+https://github.com/neondatabase/tokio-epoll-uring.git?branch=main#33e00106a268644d02ba0461bbd64476073b0ee1" +source = "git+https://github.com/neondatabase/tokio-epoll-uring.git?branch=main#781989bb540a1408b0b93daa1e9d1fa452195497" dependencies = [ 
"bytes", "io-uring", @@ -7471,7 +7546,7 @@ dependencies = [ "signal-hook", "strum", "strum_macros", - "thiserror", + "thiserror 1.0.69", "tokio", "tokio-stream", "tokio-tar", @@ -7501,12 +7576,6 @@ version = "0.1.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "830b7e5d4d90034032940e4ace0d9a9a057e7a45cd94e6c007832e39edb82f6d" -[[package]] -name = "vcpkg" -version = "0.2.15" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "accd4ea62f7bb7a82fe23066fb0957d48ef677f6eeb8215f372f52e48bb32426" - [[package]] name = "version_check" version = "0.9.4" @@ -7526,7 +7595,7 @@ dependencies = [ "serde_json", "sysinfo", "tokio", - "tokio-postgres", + "tokio-postgres 0.7.9", "tokio-util", "tracing", "tracing-subscriber", @@ -7577,11 +7646,10 @@ dependencies = [ "remote_storage", "serde", "serde_json", - "thiserror", + "thiserror 1.0.69", "tikv-jemallocator", "tokio", "tokio-util", - "tonic", "tonic-build", "tracing", "utils", @@ -7990,8 +8058,6 @@ version = "0.1.0" dependencies = [ "ahash", "anyhow", - "axum", - "axum-core", "base64 0.13.1", "base64 0.21.1", "base64ct", @@ -8072,7 +8138,6 @@ dependencies = [ "toml_edit", "tonic", "tower 0.4.13", - "tower 0.5.2", "tracing", "tracing-core", "url", @@ -8110,7 +8175,7 @@ dependencies = [ "ring", "signature 2.2.0", "spki 0.7.3", - "thiserror", + "thiserror 1.0.69", "zeroize", ] @@ -8127,7 +8192,7 @@ dependencies = [ "nom", "oid-registry", "rusticata-macros", - "thiserror", + "thiserror 1.0.69", "time", ] diff --git a/Cargo.toml b/Cargo.toml index a4e601bb58..9ccdb45f6d 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -65,7 +65,7 @@ aws-smithy-types = "1.2" aws-credential-types = "1.2.0" aws-sigv4 = { version = "1.2", features = ["sign-http"] } aws-types = "1.3" -axum = { version = "0.7.9", features = ["ws"] } +axum = { version = "0.8.1", features = ["ws"] } base64 = "0.13.0" bincode = "1.3" bindgen = "0.70" @@ -187,7 +187,7 @@ tokio-tar = "0.3" tokio-util = { version = "0.7.10", 
features = ["io", "rt"] } toml = "0.8" toml_edit = "0.22" -tonic = {version = "0.12.3", features = ["tls", "tls-roots"]} +tonic = {version = "0.12.3", default-features = false, features = ["channel", "tls", "tls-roots"]} tower = { version = "0.5.2", default-features = false } tower-http = { version = "0.6.2", features = ["request-id", "trace"] } tower-service = "0.3.3" diff --git a/Dockerfile b/Dockerfile index 2e4f8e5546..7ba54c8ca5 100644 --- a/Dockerfile +++ b/Dockerfile @@ -45,7 +45,7 @@ COPY --chown=nonroot . . ARG ADDITIONAL_RUSTFLAGS RUN set -e \ - && PQ_LIB_DIR=$(pwd)/pg_install/v${STABLE_PG_VERSION}/lib RUSTFLAGS="-Clinker=clang -Clink-arg=-fuse-ld=mold -Clink-arg=-Wl,--no-rosegment -Cforce-frame-pointers=yes ${ADDITIONAL_RUSTFLAGS}" cargo build \ + && RUSTFLAGS="-Clinker=clang -Clink-arg=-fuse-ld=mold -Clink-arg=-Wl,--no-rosegment -Cforce-frame-pointers=yes ${ADDITIONAL_RUSTFLAGS}" cargo build \ --bin pg_sni_router \ --bin pageserver \ --bin pagectl \ @@ -64,6 +64,7 @@ ARG DEFAULT_PG_VERSION WORKDIR /data RUN set -e \ + && echo 'Acquire::Retries "5";' > /etc/apt/apt.conf.d/80-retries \ && apt update \ && apt install -y \ libreadline-dev \ @@ -72,6 +73,7 @@ RUN set -e \ # System postgres for use with client libraries (e.g. 
in storage controller) postgresql-15 \ openssl \ + && rm -f /etc/apt/apt.conf.d/80-retries \ && rm -rf /var/lib/apt/lists/* /tmp/* /var/tmp/* \ && useradd -d /data neon \ && chown -R neon:neon /data diff --git a/Makefile b/Makefile index 22ebfea7d5..d1238caebf 100644 --- a/Makefile +++ b/Makefile @@ -64,8 +64,6 @@ CARGO_BUILD_FLAGS += $(filter -j1,$(MAKEFLAGS)) CARGO_CMD_PREFIX += $(if $(filter n,$(MAKEFLAGS)),,+) # Force cargo not to print progress bar CARGO_CMD_PREFIX += CARGO_TERM_PROGRESS_WHEN=never CI=1 -# Set PQ_LIB_DIR to make sure `storage_controller` get linked with bundled libpq (through diesel) -CARGO_CMD_PREFIX += PQ_LIB_DIR=$(POSTGRES_INSTALL_DIR)/v16/lib CACHEDIR_TAG_CONTENTS := "Signature: 8a477f597d28d172789f06886806bc55" diff --git a/README.md b/README.md index 1417d6b9e7..4453904346 100644 --- a/README.md +++ b/README.md @@ -21,8 +21,10 @@ The Neon storage engine consists of two major components: See developer documentation in [SUMMARY.md](/docs/SUMMARY.md) for more information. -## Running local installation +## Running a local development environment +Neon can be run on a workstation for small experiments and to test code changes, by +following these instructions. #### Installing dependencies on Linux 1. Install build dependencies and other applicable packages @@ -238,7 +240,7 @@ postgres=# select * from t; > cargo neon stop ``` -More advanced usages can be found at [Control Plane and Neon Local](./control_plane/README.md). +More advanced usages can be found at [Local Development Control Plane (`neon_local`)](./control_plane/README.md). 
#### Handling build failures diff --git a/build-tools.Dockerfile b/build-tools.Dockerfile index 7a2ec9c43e..9c13e480c1 100644 --- a/build-tools.Dockerfile +++ b/build-tools.Dockerfile @@ -3,6 +3,10 @@ ARG DEBIAN_VERSION=bookworm FROM debian:bookworm-slim AS pgcopydb_builder ARG DEBIAN_VERSION +RUN echo 'Acquire::Retries "5";' > /etc/apt/apt.conf.d/80-retries && \ + echo -e "retry_connrefused = on\ntimeout=15\ntries=5\n" > /root/.wgetrc && \ + echo -e "--retry-connrefused\n--connect-timeout 15\n--retry 5\n--max-time 300\n" > /root/.curlrc + RUN if [ "${DEBIAN_VERSION}" = "bookworm" ]; then \ set -e && \ apt update && \ @@ -61,6 +65,10 @@ RUN mkdir -p /pgcopydb/bin && \ COPY --from=pgcopydb_builder /usr/lib/postgresql/16/bin/pgcopydb /pgcopydb/bin/pgcopydb COPY --from=pgcopydb_builder /pgcopydb/lib/libpq.so.5 /pgcopydb/lib/libpq.so.5 +RUN echo 'Acquire::Retries "5";' > /etc/apt/apt.conf.d/80-retries && \ + echo -e "retry_connrefused = on\ntimeout=15\ntries=5\n" > /root/.wgetrc && \ + echo -e "--retry-connrefused\n--connect-timeout 15\n--retry 5\n--max-time 300\n" > /root/.curlrc + # System deps # # 'gdb' is included so that we get backtraces of core dumps produced in @@ -218,6 +226,8 @@ RUN wget -O /tmp/libicu-${ICU_VERSION}.tgz https://github.com/unicode-org/icu/re USER nonroot:nonroot WORKDIR /home/nonroot +RUN echo -e "--retry-connrefused\n--connect-timeout 15\n--retry 5\n--max-time 300\n" > /home/nonroot/.curlrc + # Python ENV PYTHON_VERSION=3.11.10 \ PYENV_ROOT=/home/nonroot/.pyenv \ diff --git a/compute/compute-node.Dockerfile b/compute/compute-node.Dockerfile index f56a8358d2..a428c61f34 100644 --- a/compute/compute-node.Dockerfile +++ b/compute/compute-node.Dockerfile @@ -5,6 +5,7 @@ ARG TAG=pinned ARG BUILD_TAG ARG DEBIAN_VERSION=bookworm ARG DEBIAN_FLAVOR=${DEBIAN_VERSION}-slim +ARG ALPINE_CURL_VERSION=8.11.1 ######################################################################################### # @@ -17,6 +18,10 @@ ARG DEBIAN_VERSION # Use strict mode for bash 
to catch errors early SHELL ["/bin/bash", "-euo", "pipefail", "-c"] +RUN echo 'Acquire::Retries "5";' > /etc/apt/apt.conf.d/80-retries && \ + echo -e "retry_connrefused = on\ntimeout=15\ntries=5\n" > /root/.wgetrc && \ + echo -e "--retry-connrefused\n--connect-timeout 15\n--retry 5\n--max-time 300\n" > /root/.curlrc + RUN case $DEBIAN_VERSION in \ # Version-specific installs for Bullseye (PG14-PG16): # The h3_pg extension needs a cmake 3.20+, but Debian bullseye has 3.18. @@ -67,6 +72,9 @@ RUN cd postgres && \ # Enable some of contrib extensions echo 'trusted = true' >> /usr/local/pgsql/share/extension/autoinc.control && \ echo 'trusted = true' >> /usr/local/pgsql/share/extension/dblink.control && \ + echo 'trusted = true' >> /usr/local/pgsql/share/extension/postgres_fdw.control && \ + file=/usr/local/pgsql/share/extension/postgres_fdw--1.0.sql && [ -e $file ] && \ + echo 'GRANT USAGE ON FOREIGN DATA WRAPPER postgres_fdw TO neon_superuser;' >> $file && \ echo 'trusted = true' >> /usr/local/pgsql/share/extension/bloom.control && \ echo 'trusted = true' >> /usr/local/pgsql/share/extension/earthdistance.control && \ echo 'trusted = true' >> /usr/local/pgsql/share/extension/insert_username.control && \ @@ -360,6 +368,8 @@ COPY compute/patches/pgvector.patch /pgvector.patch RUN wget https://github.com/pgvector/pgvector/archive/refs/tags/v0.8.0.tar.gz -O pgvector.tar.gz && \ echo "867a2c328d4928a5a9d6f052cd3bc78c7d60228a9b914ad32aa3db88e9de27b0 pgvector.tar.gz" | sha256sum --check && \ mkdir pgvector-src && cd pgvector-src && tar xzf ../pgvector.tar.gz --strip-components=1 -C . 
&& \ + wget https://github.com/pgvector/pgvector/raw/refs/tags/v0.7.4/sql/vector.sql -O ./sql/vector--0.7.4.sql && \ + echo "10218d05dc02299562252a9484775178b14a1d8edb92a2d1672ef488530f7778 ./sql/vector--0.7.4.sql" | sha256sum --check && \ patch -p1 < /pgvector.patch && \ make -j $(getconf _NPROCESSORS_ONLN) OPTFLAGS="" && \ make -j $(getconf _NPROCESSORS_ONLN) OPTFLAGS="" install && \ @@ -832,6 +842,8 @@ ENV PATH="/home/nonroot/.cargo/bin:$PATH" USER nonroot WORKDIR /home/nonroot +RUN echo -e "--retry-connrefused\n--connect-timeout 15\n--retry 5\n--max-time 300\n" > /home/nonroot/.curlrc + RUN curl -sSO https://static.rust-lang.org/rustup/dist/$(uname -m)-unknown-linux-gnu/rustup-init && \ chmod +x rustup-init && \ ./rustup-init -y --no-modify-path --profile minimal --default-toolchain stable && \ @@ -868,6 +880,8 @@ ENV PATH="/home/nonroot/.cargo/bin:$PATH" USER nonroot WORKDIR /home/nonroot +RUN echo -e "--retry-connrefused\n--connect-timeout 15\n--retry 5\n--max-time 300\n" > /home/nonroot/.curlrc + RUN curl -sSO https://static.rust-lang.org/rustup/dist/$(uname -m)-unknown-linux-gnu/rustup-init && \ chmod +x rustup-init && \ ./rustup-init -y --no-modify-path --profile minimal --default-toolchain stable && \ @@ -995,24 +1009,50 @@ RUN wget https://github.com/kelvich/pg_tiktoken/archive/9118dd4549b7d8c0bbc98e04 ######################################################################################### # # Layer "pg-pgx-ulid-build" -# Compile "pgx_ulid" extension +# Compile "pgx_ulid" extension for v16 and below # ######################################################################################### FROM rust-extensions-build AS pg-pgx-ulid-build ARG PG_VERSION -# doesn't support v17 yet -# https://github.com/pksunkara/pgx_ulid/pull/52 -RUN case "${PG_VERSION}" in "v17") \ - echo "pgx_ulid does not support pg17 as of the latest version (0.1.5)" && exit 0;; \ +RUN case "${PG_VERSION}" in \ + "v14" | "v15" | "v16") \ + ;; \ + *) \ + echo "skipping the version of 
pgx_ulid for $PG_VERSION" && exit 0 \ + ;; \ esac && \ wget https://github.com/pksunkara/pgx_ulid/archive/refs/tags/v0.1.5.tar.gz -O pgx_ulid.tar.gz && \ - echo "9d1659a2da65af0133d5451c454de31b37364e3502087dadf579f790bc8bef17 pgx_ulid.tar.gz" | sha256sum --check && \ + echo "9d1659a2da65af0133d5451c454de31b37364e3502087dadf579f790bc8bef17 pgx_ulid.tar.gz" | sha256sum --check && \ mkdir pgx_ulid-src && cd pgx_ulid-src && tar xzf ../pgx_ulid.tar.gz --strip-components=1 -C . && \ - sed -i 's/pgrx = "^0.11.2"/pgrx = { version = "=0.11.3", features = [ "unsafe-postgres" ] }/g' Cargo.toml && \ + sed -i 's/pgrx = "^0.11.2"/pgrx = { version = "0.11.3", features = [ "unsafe-postgres" ] }/g' Cargo.toml && \ cargo pgrx install --release && \ - echo "trusted = true" >> /usr/local/pgsql/share/extension/ulid.control + echo 'trusted = true' >> /usr/local/pgsql/share/extension/ulid.control + +######################################################################################### +# +# Layer "pg-pgx-ulid-pgrx12-build" +# Compile "pgx_ulid" extension for v17 and up +# +######################################################################################### + +FROM rust-extensions-build-pgrx12 AS pg-pgx-ulid-pgrx12-build +ARG PG_VERSION + +RUN case "${PG_VERSION}" in \ + "v17") \ + ;; \ + *) \ + echo "skipping the version of pgx_ulid for $PG_VERSION" && exit 0 \ + ;; \ + esac && \ + wget https://github.com/pksunkara/pgx_ulid/archive/refs/tags/v0.2.0.tar.gz -O pgx_ulid.tar.gz && \ + echo "cef6a9a2e5e7bd1a10a18989286586ee9e6c1c06005a4055cff190de41bf3e9f pgx_ulid.tar.gz" | sha256sum --check && \ + mkdir pgx_ulid-src && cd pgx_ulid-src && tar xzf ../pgx_ulid.tar.gz --strip-components=1 -C . 
&& \ + sed -i 's/pgrx = "^0.12.7"/pgrx = { version = "0.12.9", features = [ "unsafe-postgres" ] }/g' Cargo.toml && \ + cargo pgrx install --release && \ + echo 'trusted = true' >> /usr/local/pgsql/share/extension/pgx_ulid.control ######################################################################################### # @@ -1157,6 +1197,7 @@ COPY --from=timescaledb-pg-build /usr/local/pgsql/ /usr/local/pgsql/ COPY --from=pg-hint-plan-pg-build /usr/local/pgsql/ /usr/local/pgsql/ COPY --from=pg-cron-pg-build /usr/local/pgsql/ /usr/local/pgsql/ COPY --from=pg-pgx-ulid-build /usr/local/pgsql/ /usr/local/pgsql/ +COPY --from=pg-pgx-ulid-pgrx12-build /usr/local/pgsql/ /usr/local/pgsql/ COPY --from=pg-session-jwt-build /usr/local/pgsql/ /usr/local/pgsql/ COPY --from=rdkit-pg-build /usr/local/pgsql/ /usr/local/pgsql/ COPY --from=pg-uuidv7-pg-build /usr/local/pgsql/ /usr/local/pgsql/ @@ -1210,6 +1251,7 @@ RUN mold -run cargo build --locked --profile release-line-debug-size-lto --bin c FROM debian:$DEBIAN_FLAVOR AS pgbouncer RUN set -e \ + && echo 'Acquire::Retries "5";' > /etc/apt/apt.conf.d/80-retries \ && apt update \ && apt install --no-install-suggests --no-install-recommends -y \ build-essential \ @@ -1234,15 +1276,31 @@ RUN set -e \ ######################################################################################### # -# Layers "postgres-exporter" and "sql-exporter" +# Layer "exporters" # ######################################################################################### - -FROM quay.io/prometheuscommunity/postgres-exporter:v0.16.0 AS postgres-exporter - -# Keep the version the same as in build-tools.Dockerfile and -# test_runner/regress/test_compute_metrics.py. 
-FROM burningalchemist/sql_exporter:0.17.0 AS sql-exporter +FROM alpine/curl:${ALPINE_CURL_VERSION} AS exporters +ARG TARGETARCH +# Keep sql_exporter version same as in build-tools.Dockerfile and +# test_runner/regress/test_compute_metrics.py +RUN if [ "$TARGETARCH" = "amd64" ]; then\ + postgres_exporter_sha256='027e75dda7af621237ff8f5ac66b78a40b0093595f06768612b92b1374bd3105';\ + pgbouncer_exporter_sha256='c9f7cf8dcff44f0472057e9bf52613d93f3ffbc381ad7547a959daa63c5e84ac';\ + sql_exporter_sha256='38e439732bbf6e28ca4a94d7bc3686d3fa1abdb0050773d5617a9efdb9e64d08';\ + else\ + postgres_exporter_sha256='131a376d25778ff9701a4c81f703f179e0b58db5c2c496e66fa43f8179484786';\ + pgbouncer_exporter_sha256='217c4afd7e6492ae904055bc14fe603552cf9bac458c063407e991d68c519da3';\ + sql_exporter_sha256='11918b00be6e2c3a67564adfdb2414fdcbb15a5db76ea17d1d1a944237a893c6';\ + fi\ + && curl -sL https://github.com/prometheus-community/postgres_exporter/releases/download/v0.16.0/postgres_exporter-0.16.0.linux-${TARGETARCH}.tar.gz\ + | tar xzf - --strip-components=1 -C.\ + && curl -sL https://github.com/prometheus-community/pgbouncer_exporter/releases/download/v0.10.2/pgbouncer_exporter-0.10.2.linux-${TARGETARCH}.tar.gz\ + | tar xzf - --strip-components=1 -C.\ + && curl -sL https://github.com/burningalchemist/sql_exporter/releases/download/0.17.0/sql_exporter-0.17.0.linux-${TARGETARCH}.tar.gz\ + | tar xzf - --strip-components=1 -C.\ + && echo "${postgres_exporter_sha256} postgres_exporter" | sha256sum -c -\ + && echo "${pgbouncer_exporter_sha256} pgbouncer_exporter" | sha256sum -c -\ + && echo "${sql_exporter_sha256} sql_exporter" | sha256sum -c - ######################################################################################### # @@ -1297,7 +1355,8 @@ COPY --from=vector-pg-build /pgvector.patch /ext-src/ COPY --from=pgjwt-pg-build /pgjwt.tar.gz /ext-src #COPY --from=pgrag-pg-build /usr/local/pgsql/ /usr/local/pgsql/ #COPY --from=pg-jsonschema-pg-build /home/nonroot/pg_jsonschema.tar.gz 
/ext-src -#COPY --from=pg-graphql-pg-build /home/nonroot/pg_graphql.tar.gz /ext-src +COPY --from=pg-graphql-pg-build /home/nonroot/pg_graphql.tar.gz /ext-src +COPY compute/patches/pg_graphql.patch /ext-src #COPY --from=pg-tiktoken-pg-build /home/nonroot/pg_tiktoken.tar.gz /ext-src COPY --from=hypopg-pg-build /hypopg.tar.gz /ext-src COPY --from=pg-hashids-pg-build /pg_hashids.tar.gz /ext-src @@ -1320,9 +1379,6 @@ COPY --from=pg-roaringbitmap-pg-build /pg_roaringbitmap.tar.gz /ext-src COPY --from=pg-semver-pg-build /pg_semver.tar.gz /ext-src #COPY --from=pg-embedding-pg-build /home/nonroot/pg_embedding-src/ /ext-src #COPY --from=wal2json-pg-build /wal2json_2_5.tar.gz /ext-src -#pg_anon is not supported yet for pg v17 so, don't fail if nothing found -COPY --from=pg-anon-pg-build /pg_anon.tar.g? /ext-src -COPY compute/patches/pg_anon.patch /ext-src COPY --from=pg-ivm-build /pg_ivm.tar.gz /ext-src COPY --from=pg-partman-build /pg_partman.tar.gz /ext-src RUN cd /ext-src/ && for f in *.tar.gz; \ @@ -1333,10 +1389,8 @@ RUN cd /ext-src/rum-src && patch -p1 <../rum.patch RUN cd /ext-src/pgvector-src && patch -p1 <../pgvector.patch RUN cd /ext-src/pg_hint_plan-src && patch -p1 < /ext-src/pg_hint_plan_${PG_VERSION}.patch COPY --chmod=755 docker-compose/run-tests.sh /run-tests.sh -RUN case "${PG_VERSION}" in "v17") \ - echo "postgresql_anonymizer does not yet support PG17" && exit 0;; \ - esac && patch -p1 /etc/apt/apt.conf.d/80-retries && \ + echo -e "retry_connrefused = on\ntimeout=15\ntries=5\n" > /root/.wgetrc RUN apt update && \ case $DEBIAN_VERSION in \ @@ -1454,7 +1511,7 @@ RUN set -ex; \ else \ echo "Unsupported architecture: ${TARGETARCH}"; exit 1; \ fi; \ - curl -L "https://awscli.amazonaws.com/awscli-exe-linux-${TARGETARCH_ALT}-2.17.5.zip" -o /tmp/awscliv2.zip; \ + curl --retry 5 -L "https://awscli.amazonaws.com/awscli-exe-linux-${TARGETARCH_ALT}-2.17.5.zip" -o /tmp/awscliv2.zip; \ echo "${CHECKSUM} /tmp/awscliv2.zip" | sha256sum -c -; \ unzip /tmp/awscliv2.zip -d 
/tmp/awscliv2; \ /tmp/awscliv2/aws/install; \ diff --git a/compute/etc/pgbouncer.ini b/compute/etc/pgbouncer.ini index 604b4e41ea..9d68cbb8d5 100644 --- a/compute/etc/pgbouncer.ini +++ b/compute/etc/pgbouncer.ini @@ -19,6 +19,8 @@ max_prepared_statements=0 admin_users=postgres unix_socket_dir=/tmp/ unix_socket_mode=0777 +; required for pgbouncer_exporter +ignore_startup_parameters=extra_float_digits ;; Disable connection logging. It produces a lot of logs that no one looks at, ;; and we can get similar log entries from the proxy too. We had incidents in diff --git a/compute/patches/pg_graphql.patch b/compute/patches/pg_graphql.patch new file mode 100644 index 0000000000..bf0ac38afa --- /dev/null +++ b/compute/patches/pg_graphql.patch @@ -0,0 +1,19 @@ +commit ec6a491d126882966a696f9ad5d3698935361d55 +Author: Alexey Masterov +Date: Tue Dec 17 10:25:00 2024 +0100 + + Changes required to run tests on Neon + +diff --git a/test/expected/permissions_functions.out b/test/expected/permissions_functions.out +index 1e9fbc2..94cbe25 100644 +--- a/test/expected/permissions_functions.out ++++ b/test/expected/permissions_functions.out +@@ -64,7 +64,7 @@ begin; + select current_user; + current_user + -------------- +- postgres ++ cloud_admin + (1 row) + + -- revoke default access from the public role for new functions diff --git a/compute/patches/pgvector.patch b/compute/patches/pgvector.patch index 3e1ffcaaaf..da41c86140 100644 --- a/compute/patches/pgvector.patch +++ b/compute/patches/pgvector.patch @@ -1,8 +1,24 @@ +diff --git a/Makefile b/Makefile +index 7a4b88c..56678af 100644 +--- a/Makefile ++++ b/Makefile +@@ -3,7 +3,10 @@ EXTVERSION = 0.8.0 + + MODULE_big = vector + DATA = $(wildcard sql/*--*--*.sql) +-DATA_built = sql/$(EXTENSION)--$(EXTVERSION).sql ++# This change is needed to install different per-version SQL files ++# like pgvector--0.8.0.sql and pgvector--0.7.4.sql ++# The corresponding file is downloaded during the Docker image build process ++DATA_built = 
sql/$(EXTENSION)--$(EXTVERSION).sql sql/vector--0.7.4.sql + OBJS = src/bitutils.o src/bitvec.o src/halfutils.o src/halfvec.o src/hnsw.o src/hnswbuild.o src/hnswinsert.o src/hnswscan.o src/hnswutils.o src/hnswvacuum.o src/ivfbuild.o src/ivfflat.o src/ivfinsert.o src/ivfkmeans.o src/ivfscan.o src/ivfutils.o src/ivfvacuum.o src/sparsevec.o src/vector.o + HEADERS = src/halfvec.h src/sparsevec.h src/vector.h + diff --git a/src/hnswbuild.c b/src/hnswbuild.c -index dcfb2bd..d5189ee 100644 +index b667478..fc1897c 100644 --- a/src/hnswbuild.c +++ b/src/hnswbuild.c -@@ -860,9 +860,17 @@ HnswParallelBuildMain(dsm_segment *seg, shm_toc *toc) +@@ -843,9 +843,17 @@ HnswParallelBuildMain(dsm_segment *seg, shm_toc *toc) hnswarea = shm_toc_lookup(toc, PARALLEL_KEY_HNSW_AREA, false); @@ -20,7 +36,7 @@ index dcfb2bd..d5189ee 100644 /* Close relations within worker */ index_close(indexRel, indexLockmode); table_close(heapRel, heapLockmode); -@@ -1117,12 +1125,38 @@ BuildIndex(Relation heap, Relation index, IndexInfo *indexInfo, +@@ -1100,12 +1108,38 @@ BuildIndex(Relation heap, Relation index, IndexInfo *indexInfo, SeedRandom(42); #endif diff --git a/compute/vm-image-spec-bookworm.yaml b/compute/vm-image-spec-bookworm.yaml index ac9f5c6904..005143fff3 100644 --- a/compute/vm-image-spec-bookworm.yaml +++ b/compute/vm-image-spec-bookworm.yaml @@ -27,6 +27,10 @@ commands: user: nobody sysvInitAction: respawn shell: 'DATA_SOURCE_NAME="user=cloud_admin sslmode=disable dbname=postgres application_name=postgres-exporter" /bin/postgres_exporter --config.file=/etc/postgres_exporter.yml' + - name: pgbouncer-exporter + user: postgres + sysvInitAction: respawn + shell: '/bin/pgbouncer_exporter --pgBouncer.connectionString="postgres:///pgbouncer?host=/tmp&port=6432&dbname=pgbouncer&user=pgbouncer"' - name: sql-exporter user: nobody sysvInitAction: respawn diff --git a/compute/vm-image-spec-bullseye.yaml b/compute/vm-image-spec-bullseye.yaml index 0d178e1c24..2fe50c3a45 100644 --- 
a/compute/vm-image-spec-bullseye.yaml +++ b/compute/vm-image-spec-bullseye.yaml @@ -27,6 +27,10 @@ commands: user: nobody sysvInitAction: respawn shell: 'DATA_SOURCE_NAME="user=cloud_admin sslmode=disable dbname=postgres application_name=postgres-exporter" /bin/postgres_exporter --config.file=/etc/postgres_exporter.yml' + - name: pgbouncer-exporter + user: postgres + sysvInitAction: respawn + shell: '/bin/pgbouncer_exporter --pgBouncer.connectionString="postgres:///pgbouncer?host=/tmp&port=6432&dbname=pgbouncer&user=pgbouncer"' - name: sql-exporter user: nobody sysvInitAction: respawn diff --git a/compute_tools/Cargo.toml b/compute_tools/Cargo.toml index 33892813c4..b04f364cbb 100644 --- a/compute_tools/Cargo.toml +++ b/compute_tools/Cargo.toml @@ -51,6 +51,7 @@ tracing-subscriber.workspace = true tracing-utils.workspace = true thiserror.workspace = true url.workspace = true +uuid.workspace = true prometheus.workspace = true postgres_initdb.workspace = true diff --git a/compute_tools/src/bin/fast_import.rs b/compute_tools/src/bin/fast_import.rs index 5b008f8182..c8440afb64 100644 --- a/compute_tools/src/bin/fast_import.rs +++ b/compute_tools/src/bin/fast_import.rs @@ -58,6 +58,8 @@ struct Args { pg_bin_dir: Utf8PathBuf, #[clap(long)] pg_lib_dir: Utf8PathBuf, + #[clap(long)] + pg_port: Option, // port to run postgres on, 5432 is default } #[serde_with::serde_as] @@ -74,6 +76,13 @@ enum EncryptionSecret { KMS { key_id: String }, } +// copied from pageserver_api::config::defaults::DEFAULT_LOCALE to avoid dependency just for a constant +const DEFAULT_LOCALE: &str = if cfg!(target_os = "macos") { + "C" +} else { + "C.UTF-8" +}; + #[tokio::main] pub(crate) async fn main() -> anyhow::Result<()> { utils::logging::init( @@ -97,6 +106,10 @@ pub(crate) async fn main() -> anyhow::Result<()> { let working_directory = args.working_directory; let pg_bin_dir = args.pg_bin_dir; let pg_lib_dir = args.pg_lib_dir; + let pg_port = args.pg_port.unwrap_or_else(|| { + info!("pg_port not 
specified, using default 5432"); + 5432 + }); // Initialize AWS clients only if s3_prefix is specified let (aws_config, kms_client) = if args.s3_prefix.is_some() { @@ -180,7 +193,7 @@ pub(crate) async fn main() -> anyhow::Result<()> { let superuser = "cloud_admin"; // XXX: this shouldn't be hard-coded postgres_initdb::do_run_initdb(postgres_initdb::RunInitdbArgs { superuser, - locale: "en_US.UTF-8", // XXX: this shouldn't be hard-coded, + locale: DEFAULT_LOCALE, // XXX: this shouldn't be hard-coded, pg_version, initdb_bin: pg_bin_dir.join("initdb").as_ref(), library_search_path: &pg_lib_dir, // TODO: is this right? Prob works in compute image, not sure about neon_local. @@ -197,6 +210,7 @@ pub(crate) async fn main() -> anyhow::Result<()> { let mut postgres_proc = tokio::process::Command::new(pgbin) .arg("-D") .arg(&pgdata_dir) + .args(["-p", &format!("{pg_port}")]) .args(["-c", "wal_level=minimal"]) .args(["-c", "shared_buffers=10GB"]) .args(["-c", "max_wal_senders=0"]) @@ -216,6 +230,7 @@ pub(crate) async fn main() -> anyhow::Result<()> { ), ]) .env_clear() + .env("LD_LIBRARY_PATH", &pg_lib_dir) .stdout(std::process::Stdio::piped()) .stderr(std::process::Stdio::piped()) .spawn() @@ -232,7 +247,7 @@ pub(crate) async fn main() -> anyhow::Result<()> { // Create neondb database in the running postgres let restore_pg_connstring = - format!("host=localhost port=5432 user={superuser} dbname=postgres"); + format!("host=localhost port={pg_port} user={superuser} dbname=postgres"); let start_time = std::time::Instant::now(); @@ -314,6 +329,7 @@ pub(crate) async fn main() -> anyhow::Result<()> { .arg(&source_connection_string) // how we run it .env_clear() + .env("LD_LIBRARY_PATH", &pg_lib_dir) .kill_on_drop(true) .stdout(std::process::Stdio::piped()) .stderr(std::process::Stdio::piped()) @@ -347,6 +363,7 @@ pub(crate) async fn main() -> anyhow::Result<()> { .arg(&dumpdir) // how we run it .env_clear() + .env("LD_LIBRARY_PATH", &pg_lib_dir) .kill_on_drop(true) 
.stdout(std::process::Stdio::piped()) .stderr(std::process::Stdio::piped()) diff --git a/compute_tools/src/compute.rs b/compute_tools/src/compute.rs index 1ac97a378b..fd76e404c6 100644 --- a/compute_tools/src/compute.rs +++ b/compute_tools/src/compute.rs @@ -41,14 +41,14 @@ use crate::local_proxy; use crate::pg_helpers::*; use crate::spec::*; use crate::spec_apply::ApplySpecPhase::{ - CreateAndAlterDatabases, CreateAndAlterRoles, CreateAvailabilityCheck, CreateSuperUser, - DropInvalidDatabases, DropRoles, HandleNeonExtension, HandleOtherExtensions, - RenameAndDeleteDatabases, RenameRoles, RunInEachDatabase, + CreateAndAlterDatabases, CreateAndAlterRoles, CreateAvailabilityCheck, CreateSchemaNeon, + CreateSuperUser, DropInvalidDatabases, DropRoles, FinalizeDropLogicalSubscriptions, + HandleNeonExtension, HandleOtherExtensions, RenameAndDeleteDatabases, RenameRoles, + RunInEachDatabase, }; use crate::spec_apply::PerDatabasePhase; use crate::spec_apply::PerDatabasePhase::{ - ChangeSchemaPerms, DeleteDBRoleReferences, DropSubscriptionsForDeletedDatabases, - HandleAnonExtension, + ChangeSchemaPerms, DeleteDBRoleReferences, DropLogicalSubscriptions, HandleAnonExtension, }; use crate::spec_apply::{apply_operations, MutableApplyContext, DB}; use crate::sync_sk::{check_if_synced, ping_safekeeper}; @@ -340,6 +340,15 @@ impl ComputeNode { self.state.lock().unwrap().status } + pub fn get_timeline_id(&self) -> Option { + self.state + .lock() + .unwrap() + .pspec + .as_ref() + .map(|s| s.timeline_id) + } + // Remove `pgdata` directory and create it again with right permissions. fn create_pgdata(&self) -> Result<()> { // Ignore removal error, likely it is a 'No such file or directory (os error 2)'. @@ -929,6 +938,48 @@ impl ComputeNode { .map(|role| (role.name.clone(), role)) .collect::>(); + // Check if we need to drop subscriptions before starting the endpoint. + // + // It is important to do this operation exactly once when endpoint starts on a new branch. 
+ // Otherwise, we may drop not inherited, but newly created subscriptions. + // + // We cannot rely only on spec.drop_subscriptions_before_start flag, + // because if for some reason compute restarts inside VM, + // it will start again with the same spec and flag value. + // + // To handle this, we save the fact of the operation in the database + // in the neon.drop_subscriptions_done table. + // If the table does not exist, we assume that the operation was never performed, so we must do it. + // If table exists, we check if the operation was performed on the current timelilne. + // + let mut drop_subscriptions_done = false; + + if spec.drop_subscriptions_before_start { + let timeline_id = self.get_timeline_id().context("timeline_id must be set")?; + let query = format!("select 1 from neon.drop_subscriptions_done where timeline_id = '{}'", timeline_id); + + info!("Checking if drop subscription operation was already performed for timeline_id: {}", timeline_id); + + drop_subscriptions_done = match + client.simple_query(&query).await { + Ok(result) => { + matches!(&result[0], postgres::SimpleQueryMessage::Row(_)) + }, + Err(e) => + { + match e.code() { + Some(&SqlState::UNDEFINED_TABLE) => false, + _ => { + // We don't expect any other error here, except for the schema/table not existing + error!("Error checking if drop subscription operation was already performed: {}", e); + return Err(e.into()); + } + } + } + } + }; + + let jwks_roles = Arc::new( spec.as_ref() .local_proxy_config @@ -996,7 +1047,7 @@ impl ComputeNode { jwks_roles.clone(), concurrency_token.clone(), db, - [DropSubscriptionsForDeletedDatabases].to_vec(), + [DropLogicalSubscriptions].to_vec(), ); Ok(spawn(fut)) @@ -1024,6 +1075,7 @@ impl ComputeNode { CreateAndAlterRoles, RenameAndDeleteDatabases, CreateAndAlterDatabases, + CreateSchemaNeon, ] { info!("Applying phase {:?}", &phase); apply_operations( @@ -1064,6 +1116,17 @@ impl ComputeNode { } let conf = Arc::new(conf); + let mut phases = vec![ + 
DeleteDBRoleReferences, + ChangeSchemaPerms, + HandleAnonExtension, + ]; + + if spec.drop_subscriptions_before_start && !drop_subscriptions_done { + info!("Adding DropLogicalSubscriptions phase because drop_subscriptions_before_start is set"); + phases.push(DropLogicalSubscriptions); + } + let fut = Self::apply_spec_sql_db( spec.clone(), conf, @@ -1071,12 +1134,7 @@ impl ComputeNode { jwks_roles.clone(), concurrency_token.clone(), db, - [ - DeleteDBRoleReferences, - ChangeSchemaPerms, - HandleAnonExtension, - ] - .to_vec(), + phases, ); Ok(spawn(fut)) @@ -1088,12 +1146,20 @@ impl ComputeNode { handle.await??; } - for phase in vec![ + let mut phases = vec![ HandleOtherExtensions, - HandleNeonExtension, + HandleNeonExtension, // This step depends on CreateSchemaNeon CreateAvailabilityCheck, DropRoles, - ] { + ]; + + // This step depends on CreateSchemaNeon + if spec.drop_subscriptions_before_start && !drop_subscriptions_done { + info!("Adding FinalizeDropLogicalSubscriptions phase because drop_subscriptions_before_start is set"); + phases.push(FinalizeDropLogicalSubscriptions); + } + + for phase in phases { debug!("Applying phase {:?}", &phase); apply_operations( spec.clone(), @@ -1463,6 +1529,14 @@ impl ComputeNode { Ok(()) }, )?; + + let postgresql_conf_path = pgdata_path.join("postgresql.conf"); + if config::line_in_file( + &postgresql_conf_path, + "neon.disable_logical_replication_subscribers=false", + )? 
{ + info!("updated postgresql.conf to set neon.disable_logical_replication_subscribers=false"); + } self.pg_reload_conf()?; } self.post_apply_config()?; diff --git a/compute_tools/src/config.rs b/compute_tools/src/config.rs index b257c8a68f..e1bdfffa54 100644 --- a/compute_tools/src/config.rs +++ b/compute_tools/src/config.rs @@ -129,6 +129,13 @@ pub fn write_postgres_conf( writeln!(file, "neon.extension_server_port={}", extension_server_port)?; + if spec.drop_subscriptions_before_start { + writeln!(file, "neon.disable_logical_replication_subscribers=true")?; + } else { + // be explicit about the default value + writeln!(file, "neon.disable_logical_replication_subscribers=false")?; + } + // This is essential to keep this line at the end of the file, // because it is intended to override any settings above. writeln!(file, "include_if_exists = 'compute_ctl_temp_override.conf'")?; diff --git a/compute_tools/src/extension_server.rs b/compute_tools/src/extension_server.rs index f13b2308e7..64c338f4d7 100644 --- a/compute_tools/src/extension_server.rs +++ b/compute_tools/src/extension_server.rs @@ -85,6 +85,8 @@ use tracing::info; use tracing::log::warn; use zstd::stream::read::Decoder; +use crate::metrics::{REMOTE_EXT_REQUESTS_TOTAL, UNKNOWN_HTTP_STATUS}; + fn get_pg_config(argument: &str, pgbin: &str) -> String { // gives the result of `pg_config [argument]` // where argument is a flag like `--version` or `--sharedir` @@ -258,21 +260,58 @@ async fn download_extension_tar(ext_remote_storage: &str, ext_path: &str) -> Res info!("Download extension {:?} from uri {:?}", ext_path, uri); - let resp = reqwest::get(uri).await?; + match do_extension_server_request(&uri).await { + Ok(resp) => { + info!( + "Successfully downloaded remote extension data {:?}", + ext_path + ); + REMOTE_EXT_REQUESTS_TOTAL + .with_label_values(&[&StatusCode::OK.to_string()]) + .inc(); + Ok(resp) + } + Err((msg, status)) => { + REMOTE_EXT_REQUESTS_TOTAL + .with_label_values(&[&status]) + .inc(); + 
bail!(msg); + } + } +} - match resp.status() { +// Do a single remote extensions server request. +// Return result or (error message + stringified status code) in case of any failures. +async fn do_extension_server_request(uri: &str) -> Result { + let resp = reqwest::get(uri).await.map_err(|e| { + ( + format!("could not perform remote extensions server request: {}", e), + UNKNOWN_HTTP_STATUS.to_string(), + ) + })?; + let status = resp.status(); + + match status { StatusCode::OK => match resp.bytes().await { - Ok(resp) => { - info!("Download extension {:?} completed successfully", ext_path); - Ok(resp) - } - Err(e) => bail!("could not deserialize remote extension response: {}", e), + Ok(resp) => Ok(resp), + Err(e) => Err(( + format!("could not read remote extensions server response: {}", e), + // It's fine to return and report error with status as 200 OK, + // because we still failed to read the response. + status.to_string(), + )), }, - StatusCode::SERVICE_UNAVAILABLE => bail!("remote extension is temporarily unavailable"), - _ => bail!( - "unexpected remote extension response status code: {}", - resp.status() - ), + StatusCode::SERVICE_UNAVAILABLE => Err(( + "remote extensions server is temporarily unavailable".to_string(), + status.to_string(), + )), + _ => Err(( + format!( + "unexpected remote extensions server response status code: {}", + status + ), + status.to_string(), + )), } } diff --git a/compute_tools/src/http/extract/json.rs b/compute_tools/src/http/extract/json.rs index 41f13625ad..104cc25d5f 100644 --- a/compute_tools/src/http/extract/json.rs +++ b/compute_tools/src/http/extract/json.rs @@ -1,9 +1,6 @@ use std::ops::{Deref, DerefMut}; -use axum::{ - async_trait, - extract::{rejection::JsonRejection, FromRequest, Request}, -}; +use axum::extract::{rejection::JsonRejection, FromRequest, Request}; use compute_api::responses::GenericAPIError; use http::StatusCode; @@ -12,7 +9,6 @@ use http::StatusCode; #[derive(Debug, Clone, Copy, Default)] pub(crate) 
struct Json(pub T); -#[async_trait] impl FromRequest for Json where axum::Json: FromRequest, diff --git a/compute_tools/src/http/extract/path.rs b/compute_tools/src/http/extract/path.rs index 95edc657f2..09637a96a4 100644 --- a/compute_tools/src/http/extract/path.rs +++ b/compute_tools/src/http/extract/path.rs @@ -1,9 +1,6 @@ use std::ops::{Deref, DerefMut}; -use axum::{ - async_trait, - extract::{rejection::PathRejection, FromRequestParts}, -}; +use axum::extract::{rejection::PathRejection, FromRequestParts}; use compute_api::responses::GenericAPIError; use http::{request::Parts, StatusCode}; @@ -12,7 +9,6 @@ use http::{request::Parts, StatusCode}; #[derive(Debug, Clone, Copy, Default)] pub(crate) struct Path(pub T); -#[async_trait] impl FromRequestParts for Path where axum::extract::Path: FromRequestParts, diff --git a/compute_tools/src/http/extract/query.rs b/compute_tools/src/http/extract/query.rs index a1f1b0cef0..9dec3642cf 100644 --- a/compute_tools/src/http/extract/query.rs +++ b/compute_tools/src/http/extract/query.rs @@ -1,9 +1,6 @@ use std::ops::{Deref, DerefMut}; -use axum::{ - async_trait, - extract::{rejection::QueryRejection, FromRequestParts}, -}; +use axum::extract::{rejection::QueryRejection, FromRequestParts}; use compute_api::responses::GenericAPIError; use http::{request::Parts, StatusCode}; @@ -12,7 +9,6 @@ use http::{request::Parts, StatusCode}; #[derive(Debug, Clone, Copy, Default)] pub(crate) struct Query(pub T); -#[async_trait] impl FromRequestParts for Query where axum::extract::Query: FromRequestParts, diff --git a/compute_tools/src/http/openapi_spec.yaml b/compute_tools/src/http/openapi_spec.yaml index 50319cdd85..bbdb7d0917 100644 --- a/compute_tools/src/http/openapi_spec.yaml +++ b/compute_tools/src/http/openapi_spec.yaml @@ -68,35 +68,6 @@ paths: schema: $ref: "#/components/schemas/ComputeInsights" - /installed_extensions: - get: - tags: - - Info - summary: Get installed extensions. 
- description: "" - operationId: getInstalledExtensions - responses: - 200: - description: List of installed extensions - content: - application/json: - schema: - $ref: "#/components/schemas/InstalledExtensions" - /info: - get: - tags: - - Info - summary: Get info about the compute pod / VM. - description: "" - operationId: getInfo - responses: - 200: - description: Info - content: - application/json: - schema: - $ref: "#/components/schemas/Info" - /dbs_and_roles: get: tags: diff --git a/compute_tools/src/http/routes/extension_server.rs b/compute_tools/src/http/routes/extension_server.rs index ee5bc675ba..5cc9b6d277 100644 --- a/compute_tools/src/http/routes/extension_server.rs +++ b/compute_tools/src/http/routes/extension_server.rs @@ -17,7 +17,8 @@ use crate::{ #[derive(Debug, Clone, Deserialize)] pub(in crate::http) struct ExtensionServerParams { - is_library: Option, + #[serde(default)] + is_library: bool, } /// Download a remote extension. @@ -51,7 +52,7 @@ pub(in crate::http) async fn download_extension( remote_extensions.get_ext( &filename, - params.is_library.unwrap_or(false), + params.is_library, &compute.build_tag, &compute.pgversion, ) diff --git a/compute_tools/src/http/routes/info.rs b/compute_tools/src/http/routes/info.rs deleted file mode 100644 index 32d6fea74c..0000000000 --- a/compute_tools/src/http/routes/info.rs +++ /dev/null @@ -1,11 +0,0 @@ -use axum::response::Response; -use compute_api::responses::InfoResponse; -use http::StatusCode; - -use crate::http::JsonResponse; - -/// Get information about the physical characteristics about the compute. 
-pub(in crate::http) async fn get_info() -> Response { - let num_cpus = num_cpus::get_physical(); - JsonResponse::success(StatusCode::OK, &InfoResponse { num_cpus }) -} diff --git a/compute_tools/src/http/routes/installed_extensions.rs b/compute_tools/src/http/routes/installed_extensions.rs deleted file mode 100644 index db74a6b195..0000000000 --- a/compute_tools/src/http/routes/installed_extensions.rs +++ /dev/null @@ -1,33 +0,0 @@ -use std::sync::Arc; - -use axum::{extract::State, response::Response}; -use compute_api::responses::ComputeStatus; -use http::StatusCode; -use tokio::task; - -use crate::{compute::ComputeNode, http::JsonResponse, installed_extensions}; - -/// Get a list of installed extensions. -pub(in crate::http) async fn get_installed_extensions( - State(compute): State>, -) -> Response { - let status = compute.get_status(); - if status != ComputeStatus::Running { - return JsonResponse::invalid_status(status); - } - - let conf = compute.get_conn_conf(None); - let res = task::spawn_blocking(move || installed_extensions::get_installed_extensions(conf)) - .await - .unwrap(); - - match res { - Ok(installed_extensions) => { - JsonResponse::success(StatusCode::OK, Some(installed_extensions)) - } - Err(e) => JsonResponse::error( - StatusCode::INTERNAL_SERVER_ERROR, - format!("failed to get list of installed extensions: {e}"), - ), - } -} diff --git a/compute_tools/src/http/routes/metrics.rs b/compute_tools/src/http/routes/metrics.rs index 40d71b5de7..13150a7588 100644 --- a/compute_tools/src/http/routes/metrics.rs +++ b/compute_tools/src/http/routes/metrics.rs @@ -2,17 +2,16 @@ use axum::{body::Body, response::Response}; use http::header::CONTENT_TYPE; use http::StatusCode; use metrics::proto::MetricFamily; -use metrics::Encoder; -use metrics::TextEncoder; +use metrics::{Encoder, TextEncoder}; -use crate::{http::JsonResponse, installed_extensions}; +use crate::{http::JsonResponse, metrics::collect}; /// Expose Prometheus metrics. 
pub(in crate::http) async fn get_metrics() -> Response { // When we call TextEncoder::encode() below, it will immediately return an // error if a metric family has no metrics, so we need to preemptively // filter out metric families with no metrics. - let metrics = installed_extensions::collect() + let metrics = collect() .into_iter() .filter(|m| !m.get_metric().is_empty()) .collect::>(); diff --git a/compute_tools/src/http/routes/mod.rs b/compute_tools/src/http/routes/mod.rs index 3efa1153ad..a67be7fd5a 100644 --- a/compute_tools/src/http/routes/mod.rs +++ b/compute_tools/src/http/routes/mod.rs @@ -10,9 +10,7 @@ pub(in crate::http) mod extension_server; pub(in crate::http) mod extensions; pub(in crate::http) mod failpoints; pub(in crate::http) mod grants; -pub(in crate::http) mod info; pub(in crate::http) mod insights; -pub(in crate::http) mod installed_extensions; pub(in crate::http) mod metrics; pub(in crate::http) mod metrics_json; pub(in crate::http) mod status; diff --git a/compute_tools/src/http/server.rs b/compute_tools/src/http/server.rs index 33d4b489a0..e41ed9df2d 100644 --- a/compute_tools/src/http/server.rs +++ b/compute_tools/src/http/server.rs @@ -1,15 +1,14 @@ use std::{ net::{IpAddr, Ipv6Addr, SocketAddr}, - sync::{ - atomic::{AtomicU64, Ordering}, - Arc, - }, + sync::Arc, thread, time::Duration, }; use anyhow::Result; use axum::{ + extract::Request, + middleware::{self, Next}, response::{IntoResponse, Response}, routing::{get, post}, Router, @@ -17,16 +16,13 @@ use axum::{ use http::StatusCode; use tokio::net::TcpListener; use tower::ServiceBuilder; -use tower_http::{ - request_id::{MakeRequestId, PropagateRequestIdLayer, RequestId, SetRequestIdLayer}, - trace::TraceLayer, -}; +use tower_http::{request_id::PropagateRequestIdLayer, trace::TraceLayer}; use tracing::{debug, error, info, Span}; +use uuid::Uuid; use super::routes::{ check_writability, configure, database_schema, dbs_and_roles, extension_server, extensions, - grants, info as info_route, 
insights, installed_extensions, metrics, metrics_json, status, - terminate, + grants, insights, metrics, metrics_json, status, terminate, }; use crate::compute::ComputeNode; @@ -34,47 +30,36 @@ async fn handle_404() -> Response { StatusCode::NOT_FOUND.into_response() } -#[derive(Clone, Default)] -struct ComputeMakeRequestId(Arc); +const X_REQUEST_ID: &str = "x-request-id"; -impl MakeRequestId for ComputeMakeRequestId { - fn make_request_id( - &mut self, - _request: &http::Request, - ) -> Option { - let request_id = self - .0 - .fetch_add(1, Ordering::SeqCst) - .to_string() - .parse() - .unwrap(); +/// This middleware function allows compute_ctl to generate its own request ID +/// if one isn't supplied. The control plane will always send one as a UUID. The +/// neon Postgres extension on the other hand does not send one. +async fn maybe_add_request_id_header(mut request: Request, next: Next) -> Response { + let headers = request.headers_mut(); - Some(RequestId::new(request_id)) + if headers.get(X_REQUEST_ID).is_none() { + headers.append(X_REQUEST_ID, Uuid::new_v4().to_string().parse().unwrap()); } + + next.run(request).await } /// Run the HTTP server and wait on it forever. 
#[tokio::main] async fn serve(port: u16, compute: Arc) { - const X_REQUEST_ID: &str = "x-request-id"; - let mut app = Router::new() .route("/check_writability", post(check_writability::is_writable)) .route("/configure", post(configure::configure)) .route("/database_schema", get(database_schema::get_schema_dump)) .route("/dbs_and_roles", get(dbs_and_roles::get_catalog_objects)) .route( - "/extension_server/*filename", + "/extension_server/{*filename}", post(extension_server::download_extension), ) .route("/extensions", post(extensions::install_extension)) .route("/grants", post(grants::add_grant)) - .route("/info", get(info_route::get_info)) .route("/insights", get(insights::get_insights)) - .route( - "/installed_extensions", - get(installed_extensions::get_installed_extensions), - ) .route("/metrics", get(metrics::get_metrics)) .route("/metrics.json", get(metrics_json::get_metrics)) .route("/status", get(status::get_status)) @@ -82,9 +67,8 @@ async fn serve(port: u16, compute: Arc) { .fallback(handle_404) .layer( ServiceBuilder::new() - .layer(SetRequestIdLayer::x_request_id( - ComputeMakeRequestId::default(), - )) + // Add this middleware since we assume the request ID exists + .layer(middleware::from_fn(maybe_add_request_id_header)) .layer( TraceLayer::new_for_http() .on_request(|request: &http::Request<_>, _span: &Span| { diff --git a/compute_tools/src/installed_extensions.rs b/compute_tools/src/installed_extensions.rs index 0ab259ddf1..173dbf40b0 100644 --- a/compute_tools/src/installed_extensions.rs +++ b/compute_tools/src/installed_extensions.rs @@ -1,13 +1,10 @@ use compute_api::responses::{InstalledExtension, InstalledExtensions}; -use metrics::proto::MetricFamily; use std::collections::HashMap; use anyhow::Result; use postgres::{Client, NoTls}; -use metrics::core::Collector; -use metrics::{register_uint_gauge_vec, UIntGaugeVec}; -use once_cell::sync::Lazy; +use crate::metrics::INSTALLED_EXTENSIONS; /// We don't reuse get_existing_dbs() just for code 
clarity /// and to make database listing query here more explicit. @@ -102,16 +99,3 @@ pub fn get_installed_extensions(mut conf: postgres::config::Config) -> Result = Lazy::new(|| { - register_uint_gauge_vec!( - "compute_installed_extensions", - "Number of databases where the version of extension is installed", - &["extension_name", "version", "owned_by_superuser"] - ) - .expect("failed to define a metric") -}); - -pub fn collect() -> Vec { - INSTALLED_EXTENSIONS.collect() -} diff --git a/compute_tools/src/lib.rs b/compute_tools/src/lib.rs index 12fea4e61a..b08df22134 100644 --- a/compute_tools/src/lib.rs +++ b/compute_tools/src/lib.rs @@ -16,6 +16,7 @@ pub mod extension_server; pub mod installed_extensions; pub mod local_proxy; pub mod lsn_lease; +pub mod metrics; mod migration; pub mod monitor; pub mod params; diff --git a/compute_tools/src/metrics.rs b/compute_tools/src/metrics.rs new file mode 100644 index 0000000000..870b294d08 --- /dev/null +++ b/compute_tools/src/metrics.rs @@ -0,0 +1,70 @@ +use metrics::core::Collector; +use metrics::proto::MetricFamily; +use metrics::{register_int_counter_vec, register_uint_gauge_vec, IntCounterVec, UIntGaugeVec}; +use once_cell::sync::Lazy; + +pub(crate) static INSTALLED_EXTENSIONS: Lazy = Lazy::new(|| { + register_uint_gauge_vec!( + "compute_installed_extensions", + "Number of databases where the version of extension is installed", + &["extension_name", "version", "owned_by_superuser"] + ) + .expect("failed to define a metric") +}); + +// Normally, any HTTP API request is described by METHOD (e.g. GET, POST, etc.) + PATH, +// but for all our APIs we defined a 'slug'/method/operationId in the OpenAPI spec. +// And it's fair to call it a 'RPC' (Remote Procedure Call). 
+pub enum CPlaneRequestRPC { + GetSpec, +} + +impl CPlaneRequestRPC { + pub fn as_str(&self) -> &str { + match self { + CPlaneRequestRPC::GetSpec => "GetSpec", + } + } +} + +pub const UNKNOWN_HTTP_STATUS: &str = "unknown"; + +pub(crate) static CPLANE_REQUESTS_TOTAL: Lazy = Lazy::new(|| { + register_int_counter_vec!( + "compute_ctl_cplane_requests_total", + "Total number of control plane requests made by compute_ctl by status", + &["rpc", "http_status"] + ) + .expect("failed to define a metric") +}); + +/// Total number of failed database migrations. Per-compute, this is actually a boolean metric, +/// either empty or with a single value (1, migration_id) because we stop at the first failure. +/// Yet, the sum over the fleet will provide the total number of failures. +pub(crate) static DB_MIGRATION_FAILED: Lazy = Lazy::new(|| { + register_int_counter_vec!( + "compute_ctl_db_migration_failed_total", + "Total number of failed database migrations", + &["migration_id"] + ) + .expect("failed to define a metric") +}); + +pub(crate) static REMOTE_EXT_REQUESTS_TOTAL: Lazy = Lazy::new(|| { + register_int_counter_vec!( + "compute_ctl_remote_ext_requests_total", + "Total number of requests made by compute_ctl to download extensions from S3 proxy by status", + // Do not use any labels like extension name yet. + // We can add them later if needed. 
+ &["http_status"] + ) + .expect("failed to define a metric") +}); + +pub fn collect() -> Vec { + let mut metrics = INSTALLED_EXTENSIONS.collect(); + metrics.extend(CPLANE_REQUESTS_TOTAL.collect()); + metrics.extend(REMOTE_EXT_REQUESTS_TOTAL.collect()); + metrics.extend(DB_MIGRATION_FAILED.collect()); + metrics +} diff --git a/compute_tools/src/migration.rs b/compute_tools/src/migration.rs index 45c33172f7..aa3c6b01f0 100644 --- a/compute_tools/src/migration.rs +++ b/compute_tools/src/migration.rs @@ -1,7 +1,9 @@ use anyhow::{Context, Result}; use fail::fail_point; use postgres::{Client, Transaction}; -use tracing::info; +use tracing::{error, info}; + +use crate::metrics::DB_MIGRATION_FAILED; /// Runs a series of migrations on a target database pub(crate) struct MigrationRunner<'m> { @@ -78,24 +80,31 @@ impl<'m> MigrationRunner<'m> { Ok(()) } - /// Run an individual migration - fn run_migration(txn: &mut Transaction, migration_id: i64, migration: &str) -> Result<()> { + /// Run an individual migration in a separate transaction block. + fn run_migration(client: &mut Client, migration_id: i64, migration: &str) -> Result<()> { + let mut txn = client + .transaction() + .with_context(|| format!("begin transaction for migration {migration_id}"))?; + if migration.starts_with("-- SKIP") { info!("Skipping migration id={}", migration_id); // Even though we are skipping the migration, updating the // migration ID should help keep logic easy to understand when // trying to understand the state of a cluster. 
- Self::update_migration_id(txn, migration_id)?; + Self::update_migration_id(&mut txn, migration_id)?; } else { info!("Running migration id={}:\n{}\n", migration_id, migration); txn.simple_query(migration) .with_context(|| format!("apply migration {migration_id}"))?; - Self::update_migration_id(txn, migration_id)?; + Self::update_migration_id(&mut txn, migration_id)?; } + txn.commit() + .with_context(|| format!("commit transaction for migration {migration_id}"))?; + Ok(()) } @@ -109,19 +118,20 @@ impl<'m> MigrationRunner<'m> { // The index lags the migration ID by 1, so the current migration // ID is also the next index let migration_id = (current_migration + 1) as i64; + let migration = self.migrations[current_migration]; - let mut txn = self - .client - .transaction() - .with_context(|| format!("begin transaction for migration {migration_id}"))?; - - Self::run_migration(&mut txn, migration_id, self.migrations[current_migration]) - .with_context(|| format!("running migration {migration_id}"))?; - - txn.commit() - .with_context(|| format!("commit transaction for migration {migration_id}"))?; - - info!("Finished migration id={}", migration_id); + match Self::run_migration(self.client, migration_id, migration) { + Ok(_) => { + info!("Finished migration id={}", migration_id); + } + Err(e) => { + error!("Failed to run migration id={}: {}", migration_id, e); + DB_MIGRATION_FAILED + .with_label_values(&[migration_id.to_string().as_str()]) + .inc(); + return Err(e); + } + } current_migration += 1; } diff --git a/compute_tools/src/spec.rs b/compute_tools/src/spec.rs index c7d2deb090..43a820885b 100644 --- a/compute_tools/src/spec.rs +++ b/compute_tools/src/spec.rs @@ -6,6 +6,7 @@ use std::path::Path; use tracing::{error, info, instrument, warn}; use crate::config; +use crate::metrics::{CPlaneRequestRPC, CPLANE_REQUESTS_TOTAL, UNKNOWN_HTTP_STATUS}; use crate::migration::MigrationRunner; use crate::params::PG_HBA_ALL_MD5; use crate::pg_helpers::*; @@ -19,7 +20,7 @@ use 
compute_api::spec::ComputeSpec; fn do_control_plane_request( uri: &str, jwt: &str, -) -> Result { +) -> Result { let resp = reqwest::blocking::Client::new() .get(uri) .header("Authorization", format!("Bearer {}", jwt)) @@ -28,34 +29,41 @@ fn do_control_plane_request( ( true, format!("could not perform spec request to control plane: {}", e), + UNKNOWN_HTTP_STATUS.to_string(), ) })?; - match resp.status() { + let status = resp.status(); + match status { StatusCode::OK => match resp.json::() { Ok(spec_resp) => Ok(spec_resp), Err(e) => Err(( true, format!("could not deserialize control plane response: {}", e), + status.to_string(), )), }, - StatusCode::SERVICE_UNAVAILABLE => { - Err((true, "control plane is temporarily unavailable".to_string())) - } + StatusCode::SERVICE_UNAVAILABLE => Err(( + true, + "control plane is temporarily unavailable".to_string(), + status.to_string(), + )), StatusCode::BAD_GATEWAY => { // We have a problem with intermittent 502 errors now // https://github.com/neondatabase/cloud/issues/2353 // It's fine to retry GET request in this case. - Err((true, "control plane request failed with 502".to_string())) + Err(( + true, + "control plane request failed with 502".to_string(), + status.to_string(), + )) } // Another code, likely 500 or 404, means that compute is unknown to the control plane // or some internal failure happened. Doesn't make much sense to retry in this case. 
_ => Err(( false, - format!( - "unexpected control plane response status code: {}", - resp.status() - ), + format!("unexpected control plane response status code: {}", status), + status.to_string(), )), } } @@ -83,17 +91,28 @@ pub fn get_spec_from_control_plane( // - got spec -> return Ok(Some(spec)) while attempt < 4 { spec = match do_control_plane_request(&cp_uri, &jwt) { - Ok(spec_resp) => match spec_resp.status { - ControlPlaneComputeStatus::Empty => Ok(None), - ControlPlaneComputeStatus::Attached => { - if let Some(spec) = spec_resp.spec { - Ok(Some(spec)) - } else { - bail!("compute is attached, but spec is empty") + Ok(spec_resp) => { + CPLANE_REQUESTS_TOTAL + .with_label_values(&[ + CPlaneRequestRPC::GetSpec.as_str(), + &StatusCode::OK.to_string(), + ]) + .inc(); + match spec_resp.status { + ControlPlaneComputeStatus::Empty => Ok(None), + ControlPlaneComputeStatus::Attached => { + if let Some(spec) = spec_resp.spec { + Ok(Some(spec)) + } else { + bail!("compute is attached, but spec is empty") + } } } - }, - Err((retry, msg)) => { + } + Err((retry, msg, status)) => { + CPLANE_REQUESTS_TOTAL + .with_label_values(&[CPlaneRequestRPC::GetSpec.as_str(), &status]) + .inc(); if retry { Err(anyhow!(msg)) } else { diff --git a/compute_tools/src/spec_apply.rs b/compute_tools/src/spec_apply.rs index 7401de2e60..5ee9c5fbd8 100644 --- a/compute_tools/src/spec_apply.rs +++ b/compute_tools/src/spec_apply.rs @@ -47,7 +47,7 @@ pub enum PerDatabasePhase { DeleteDBRoleReferences, ChangeSchemaPerms, HandleAnonExtension, - DropSubscriptionsForDeletedDatabases, + DropLogicalSubscriptions, } #[derive(Clone, Debug)] @@ -58,11 +58,13 @@ pub enum ApplySpecPhase { CreateAndAlterRoles, RenameAndDeleteDatabases, CreateAndAlterDatabases, + CreateSchemaNeon, RunInEachDatabase { db: DB, subphase: PerDatabasePhase }, HandleOtherExtensions, HandleNeonExtension, CreateAvailabilityCheck, DropRoles, + FinalizeDropLogicalSubscriptions, } pub struct Operation { @@ -331,7 +333,7 @@ async fn 
get_operations<'a>( // NB: there could be other db states, which prevent us from dropping // the database. For example, if db is used by any active subscription // or replication slot. - // Such cases are handled in the DropSubscriptionsForDeletedDatabases + // Such cases are handled in the DropLogicalSubscriptions // phase. We do all the cleanup before actually dropping the database. let drop_db_query: String = format!( "DROP DATABASE IF EXISTS {} WITH (FORCE)", @@ -442,13 +444,19 @@ async fn get_operations<'a>( Ok(Box::new(operations)) } + ApplySpecPhase::CreateSchemaNeon => Ok(Box::new(once(Operation { + query: String::from("CREATE SCHEMA IF NOT EXISTS neon"), + comment: Some(String::from( + "create schema for neon extension and utils tables", + )), + }))), ApplySpecPhase::RunInEachDatabase { db, subphase } => { match subphase { - PerDatabasePhase::DropSubscriptionsForDeletedDatabases => { + PerDatabasePhase::DropLogicalSubscriptions => { match &db { DB::UserDB(db) => { let drop_subscription_query: String = format!( - include_str!("sql/drop_subscription_for_drop_dbs.sql"), + include_str!("sql/drop_subscriptions.sql"), datname_str = escape_literal(&db.name), ); @@ -666,10 +674,6 @@ async fn get_operations<'a>( } ApplySpecPhase::HandleNeonExtension => { let operations = vec![ - Operation { - query: String::from("CREATE SCHEMA IF NOT EXISTS neon"), - comment: Some(String::from("init: add schema for extension")), - }, Operation { query: String::from("CREATE EXTENSION IF NOT EXISTS neon WITH SCHEMA neon"), comment: Some(String::from( @@ -712,5 +716,9 @@ async fn get_operations<'a>( Ok(Box::new(operations)) } + ApplySpecPhase::FinalizeDropLogicalSubscriptions => Ok(Box::new(once(Operation { + query: String::from(include_str!("sql/finalize_drop_subscriptions.sql")), + comment: None, + }))), } } diff --git a/compute_tools/src/sql/drop_subscription_for_drop_dbs.sql b/compute_tools/src/sql/drop_subscriptions.sql similarity index 100% rename from 
compute_tools/src/sql/drop_subscription_for_drop_dbs.sql rename to compute_tools/src/sql/drop_subscriptions.sql diff --git a/compute_tools/src/sql/finalize_drop_subscriptions.sql b/compute_tools/src/sql/finalize_drop_subscriptions.sql new file mode 100644 index 0000000000..4bb291876f --- /dev/null +++ b/compute_tools/src/sql/finalize_drop_subscriptions.sql @@ -0,0 +1,21 @@ +DO $$ +BEGIN + IF NOT EXISTS( + SELECT 1 + FROM pg_catalog.pg_tables + WHERE tablename = 'drop_subscriptions_done' + AND schemaname = 'neon' + ) + THEN + CREATE TABLE neon.drop_subscriptions_done + (id serial primary key, timeline_id text); + END IF; + + -- preserve the timeline_id of the last drop_subscriptions run + -- to ensure that the cleanup of a timeline is executed only once. + -- use upsert to avoid the table bloat in case of cascade branching (branch of a branch) + INSERT INTO neon.drop_subscriptions_done VALUES (1, current_setting('neon.timeline_id')) + ON CONFLICT (id) DO UPDATE + SET timeline_id = current_setting('neon.timeline_id'); +END +$$ diff --git a/control_plane/README.md b/control_plane/README.md index 827aba5c1f..aa6f935e27 100644 --- a/control_plane/README.md +++ b/control_plane/README.md @@ -1,6 +1,10 @@ -# Control Plane and Neon Local +# Local Development Control Plane (`neon_local`) -This crate contains tools to start a Neon development environment locally. This utility can be used with the `cargo neon` command. +This crate contains tools to start a Neon development environment locally. This utility can be used with the `cargo neon` command. This is a convenience to invoke +the `neon_local` binary. + +**Note**: this is a dev/test tool -- a minimal control plane suitable for testing +code changes locally, but not suitable for running production systems. 
## Example: Start with Postgres 16 diff --git a/control_plane/src/bin/neon_local.rs b/control_plane/src/bin/neon_local.rs index c73debae4c..ba67ffa2dd 100644 --- a/control_plane/src/bin/neon_local.rs +++ b/control_plane/src/bin/neon_local.rs @@ -1357,6 +1357,7 @@ async fn handle_endpoint(subcmd: &EndpointCmd, env: &local_env::LocalEnv) -> Res args.pg_version, mode, !args.update_catalog, + false, )?; } EndpointCmd::Start(args) => { diff --git a/control_plane/src/endpoint.rs b/control_plane/src/endpoint.rs index b8027abf7c..bc86d09103 100644 --- a/control_plane/src/endpoint.rs +++ b/control_plane/src/endpoint.rs @@ -76,6 +76,7 @@ pub struct EndpointConf { http_port: u16, pg_version: u32, skip_pg_catalog_updates: bool, + drop_subscriptions_before_start: bool, features: Vec, } @@ -143,6 +144,7 @@ impl ComputeControlPlane { pg_version: u32, mode: ComputeMode, skip_pg_catalog_updates: bool, + drop_subscriptions_before_start: bool, ) -> Result> { let pg_port = pg_port.unwrap_or_else(|| self.get_port()); let http_port = http_port.unwrap_or_else(|| self.get_port() + 1); @@ -162,6 +164,7 @@ impl ComputeControlPlane { // with this we basically test a case of waking up an idle compute, where // we also skip catalog updates in the cloud. 
skip_pg_catalog_updates, + drop_subscriptions_before_start, features: vec![], }); @@ -177,6 +180,7 @@ impl ComputeControlPlane { pg_port, pg_version, skip_pg_catalog_updates, + drop_subscriptions_before_start, features: vec![], })?, )?; @@ -240,6 +244,7 @@ pub struct Endpoint { // Optimizations skip_pg_catalog_updates: bool, + drop_subscriptions_before_start: bool, // Feature flags features: Vec, } @@ -291,6 +296,7 @@ impl Endpoint { tenant_id: conf.tenant_id, pg_version: conf.pg_version, skip_pg_catalog_updates: conf.skip_pg_catalog_updates, + drop_subscriptions_before_start: conf.drop_subscriptions_before_start, features: conf.features, }) } @@ -625,6 +631,7 @@ impl Endpoint { shard_stripe_size: Some(shard_stripe_size), local_proxy_config: None, reconfigure_concurrency: 1, + drop_subscriptions_before_start: self.drop_subscriptions_before_start, }; let spec_path = self.endpoint_path().join("spec.json"); std::fs::write(spec_path, serde_json::to_string_pretty(&spec)?)?; diff --git a/control_plane/src/pageserver.rs b/control_plane/src/pageserver.rs index ef5b3d6593..383c174684 100644 --- a/control_plane/src/pageserver.rs +++ b/control_plane/src/pageserver.rs @@ -347,11 +347,31 @@ impl PageServerNode { .map(|x| x.parse::()) .transpose() .context("Failed to parse 'compaction_threshold' as an integer")?, + compaction_upper_limit: settings + .remove("compaction_upper_limit") + .map(|x| x.parse::()) + .transpose() + .context("Failed to parse 'compaction_upper_limit' as an integer")?, compaction_algorithm: settings .remove("compaction_algorithm") .map(serde_json::from_str) .transpose() .context("Failed to parse 'compaction_algorithm' json")?, + l0_flush_delay_threshold: settings + .remove("l0_flush_delay_threshold") + .map(|x| x.parse::()) + .transpose() + .context("Failed to parse 'l0_flush_delay_threshold' as an integer")?, + l0_flush_wait_upload: settings + .remove("l0_flush_wait_upload") + .map(|x| x.parse::()) + .transpose() + .context("Failed to parse 
'l0_flush_wait_upload' as a boolean")?, + l0_flush_stall_threshold: settings + .remove("l0_flush_stall_threshold") + .map(|x| x.parse::()) + .transpose() + .context("Failed to parse 'l0_flush_stall_threshold' as an integer")?, gc_horizon: settings .remove("gc_horizon") .map(|x| x.parse::()) @@ -418,6 +438,26 @@ impl PageServerNode { .map(serde_json::from_str) .transpose() .context("parse `wal_receiver_protocol_override` from json")?, + rel_size_v2_enabled: settings + .remove("rel_size_v2_enabled") + .map(|x| x.parse::()) + .transpose() + .context("Failed to parse 'rel_size_v2_enabled' as bool")?, + gc_compaction_enabled: settings + .remove("gc_compaction_enabled") + .map(|x| x.parse::()) + .transpose() + .context("Failed to parse 'gc_compaction_enabled' as bool")?, + gc_compaction_initial_threshold_kb: settings + .remove("gc_compaction_initial_threshold_kb") + .map(|x| x.parse::()) + .transpose() + .context("Failed to parse 'gc_compaction_initial_threshold_kb' as integer")?, + gc_compaction_ratio_percent: settings + .remove("gc_compaction_ratio_percent") + .map(|x| x.parse::()) + .transpose() + .context("Failed to parse 'gc_compaction_ratio_percent' as integer")?, }; if !settings.is_empty() { bail!("Unrecognized tenant settings: {settings:?}") diff --git a/control_plane/storcon_cli/src/main.rs b/control_plane/storcon_cli/src/main.rs index 96bfad4c86..d9b76b9600 100644 --- a/control_plane/storcon_cli/src/main.rs +++ b/control_plane/storcon_cli/src/main.rs @@ -298,14 +298,7 @@ impl FromStr for SkSchedulingPolicyArg { type Err = anyhow::Error; fn from_str(s: &str) -> Result { - match s { - "active" => Ok(Self(SkSchedulingPolicy::Active)), - "disabled" => Ok(Self(SkSchedulingPolicy::Disabled)), - "decomissioned" => Ok(Self(SkSchedulingPolicy::Decomissioned)), - _ => Err(anyhow::anyhow!( - "Unknown scheduling policy '{s}', try active,disabled,decomissioned" - )), - } + SkSchedulingPolicy::from_str(s).map(Self) } } diff --git a/deny.toml b/deny.toml index 
ff8d71cda5..df00a34c60 100644 --- a/deny.toml +++ b/deny.toml @@ -41,8 +41,8 @@ allow = [ "MIT", "MPL-2.0", "OpenSSL", - "Unicode-DFS-2016", "Unicode-3.0", + "Zlib", ] confidence-threshold = 0.8 exceptions = [ diff --git a/docker-compose/compute_wrapper/Dockerfile b/docker-compose/compute_wrapper/Dockerfile index 05a2cf124c..61f44681da 100644 --- a/docker-compose/compute_wrapper/Dockerfile +++ b/docker-compose/compute_wrapper/Dockerfile @@ -7,14 +7,12 @@ FROM $REPOSITORY/${COMPUTE_IMAGE}:$TAG ARG COMPUTE_IMAGE USER root -RUN apt-get update && \ +RUN echo 'Acquire::Retries "5";' > /etc/apt/apt.conf.d/80-retries && \ + apt-get update && \ apt-get install -y curl \ jq \ - python3-pip \ netcat-openbsd -#Faker is required for the pg_anon test -RUN case $COMPUTE_IMAGE in compute-node-v17) OPT="--break-system-packages";; *) OPT= ;; esac && pip3 install $OPT Faker #This is required for the pg_hintplan test -RUN mkdir -p /ext-src/pg_hint_plan-src && chown postgres /ext-src/pg_hint_plan-src +RUN mkdir -p /ext-src/pg_hint_plan-src && chown postgres /ext-src/pg_hint_plan-src -USER postgres \ No newline at end of file +USER postgres diff --git a/docker-compose/compute_wrapper/shell/compute.sh b/docker-compose/compute_wrapper/shell/compute.sh index 33455e458a..b4f8d3d66a 100755 --- a/docker-compose/compute_wrapper/shell/compute.sh +++ b/docker-compose/compute_wrapper/shell/compute.sh @@ -20,30 +20,55 @@ while ! nc -z pageserver 6400; do done echo "Page server is ready." -echo "Create a tenant and timeline" -generate_id tenant_id -PARAMS=( - -X PUT - -H "Content-Type: application/json" - -d "{\"mode\": \"AttachedSingle\", \"generation\": 1, \"tenant_conf\": {}}" - "http://pageserver:9898/v1/tenant/${tenant_id}/location_config" -) -result=$(curl "${PARAMS[@]}") -echo $result | jq . 
+cp ${SPEC_FILE_ORG} ${SPEC_FILE} -generate_id timeline_id -PARAMS=( - -sbf - -X POST - -H "Content-Type: application/json" - -d "{\"new_timeline_id\": \"${timeline_id}\", \"pg_version\": ${PG_VERSION}}" - "http://pageserver:9898/v1/tenant/${tenant_id}/timeline/" -) -result=$(curl "${PARAMS[@]}") -echo $result | jq . + if [ -n "${TENANT_ID:-}" ] && [ -n "${TIMELINE_ID:-}" ]; then + tenant_id=${TENANT_ID} + timeline_id=${TIMELINE_ID} +else + echo "Check if a tenant present" + PARAMS=( + -X GET + -H "Content-Type: application/json" + "http://pageserver:9898/v1/tenant" + ) + tenant_id=$(curl "${PARAMS[@]}" | jq -r .[0].id) + if [ -z "${tenant_id}" ] || [ "${tenant_id}" = null ]; then + echo "Create a tenant" + generate_id tenant_id + PARAMS=( + -X PUT + -H "Content-Type: application/json" + -d "{\"mode\": \"AttachedSingle\", \"generation\": 1, \"tenant_conf\": {}}" + "http://pageserver:9898/v1/tenant/${tenant_id}/location_config" + ) + result=$(curl "${PARAMS[@]}") + echo $result | jq . + fi + + echo "Check if a timeline present" + PARAMS=( + -X GET + -H "Content-Type: application/json" + "http://pageserver:9898/v1/tenant/${tenant_id}/timeline" + ) + timeline_id=$(curl "${PARAMS[@]}" | jq -r .[0].timeline_id) + if [ -z "${timeline_id}" ] || [ "${timeline_id}" = null ]; then + generate_id timeline_id + PARAMS=( + -sbf + -X POST + -H "Content-Type: application/json" + -d "{\"new_timeline_id\": \"${timeline_id}\", \"pg_version\": ${PG_VERSION}}" + "http://pageserver:9898/v1/tenant/${tenant_id}/timeline/" + ) + result=$(curl "${PARAMS[@]}") + echo $result | jq . 
+ fi +fi echo "Overwrite tenant id and timeline id in spec file" -sed "s/TENANT_ID/${tenant_id}/" ${SPEC_FILE_ORG} > ${SPEC_FILE} +sed -i "s/TENANT_ID/${tenant_id}/" ${SPEC_FILE} sed -i "s/TIMELINE_ID/${timeline_id}/" ${SPEC_FILE} cat ${SPEC_FILE} diff --git a/docker-compose/docker-compose.yml b/docker-compose/docker-compose.yml index 6e15fdbe0d..489d60f38c 100644 --- a/docker-compose/docker-compose.yml +++ b/docker-compose/docker-compose.yml @@ -149,11 +149,13 @@ services: args: - REPOSITORY=${REPOSITORY:-neondatabase} - COMPUTE_IMAGE=compute-node-v${PG_VERSION:-16} - - TAG=${TAG:-latest} - - http_proxy=$http_proxy - - https_proxy=$https_proxy + - TAG=${COMPUTE_TAG:-${TAG:-latest}} + - http_proxy=${http_proxy:-} + - https_proxy=${https_proxy:-} environment: - PG_VERSION=${PG_VERSION:-16} + - TENANT_ID=${TENANT_ID:-} + - TIMELINE_ID=${TIMELINE_ID:-} #- RUST_BACKTRACE=1 # Mount the test files directly, for faster editing cycle. volumes: @@ -185,6 +187,8 @@ services: neon-test-extensions: profiles: ["test-extensions"] image: ${REPOSITORY:-neondatabase}/neon-test-extensions-v${PG_TEST_VERSION:-16}:${TAG:-latest} + environment: + - PGPASSWORD=cloud_admin entrypoint: - "/bin/bash" - "-c" diff --git a/docker-compose/docker_compose_test.sh b/docker-compose/docker_compose_test.sh index 063664d0c6..a05d6c043d 100755 --- a/docker-compose/docker_compose_test.sh +++ b/docker-compose/docker_compose_test.sh @@ -18,14 +18,10 @@ cd $(dirname $0) COMPUTE_CONTAINER_NAME=docker-compose-compute-1 TEST_CONTAINER_NAME=docker-compose-neon-test-extensions-1 PSQL_OPTION="-h localhost -U cloud_admin -p 55433 -d postgres" -: ${http_proxy:=} -: ${https_proxy:=} -export http_proxy https_proxy cleanup() { echo "show container information" docker ps - docker compose --profile test-extensions -f $COMPOSE_FILE logs echo "stop containers..." 
docker compose --profile test-extensions -f $COMPOSE_FILE down } @@ -35,13 +31,7 @@ for pg_version in ${TEST_VERSION_ONLY-14 15 16 17}; do echo "clean up containers if exists" cleanup PG_TEST_VERSION=$((pg_version < 16 ? 16 : pg_version)) - # The support of pg_anon not yet added to PG17, so we have to add the corresponding option for other PG versions - if [ "${pg_version}" -ne 17 ]; then - SPEC_PATH="compute_wrapper/var/db/postgres/specs" - mv $SPEC_PATH/spec.json $SPEC_PATH/spec.bak - jq '.cluster.settings += [{"name": "session_preload_libraries","value": "anon","vartype": "string"}]' "${SPEC_PATH}/spec.bak" > "${SPEC_PATH}/spec.json" - fi - PG_VERSION=$pg_version PG_TEST_VERSION=$PG_TEST_VERSION docker compose --profile test-extensions -f $COMPOSE_FILE up --build -d + PG_VERSION=$pg_version PG_TEST_VERSION=$PG_TEST_VERSION docker compose --profile test-extensions -f $COMPOSE_FILE up --quiet-pull --build -d echo "wait until the compute is ready. timeout after 60s. " cnt=0 @@ -50,7 +40,6 @@ for pg_version in ${TEST_VERSION_ONLY-14 15 16 17}; do cnt=`expr $cnt + 3` if [ $cnt -gt 60 ]; then echo "timeout before the compute is ready." 
- cleanup exit 1 fi if docker compose --profile test-extensions -f $COMPOSE_FILE logs "compute_is_ready" | grep -q "accepting connections"; then @@ -62,36 +51,20 @@ for pg_version in ${TEST_VERSION_ONLY-14 15 16 17}; do done if [ $pg_version -ge 16 ]; then - echo Enabling trust connection - docker exec $COMPUTE_CONTAINER_NAME bash -c "sed -i '\$d' /var/db/postgres/compute/pg_hba.conf && echo -e 'host\t all\t all\t all\t trust' >> /var/db/postgres/compute/pg_hba.conf && psql $PSQL_OPTION -c 'select pg_reload_conf()' " - echo Adding postgres role - docker exec $COMPUTE_CONTAINER_NAME psql $PSQL_OPTION -c "CREATE ROLE postgres SUPERUSER LOGIN" + docker cp ext-src $TEST_CONTAINER_NAME:/ # This is required for the pg_hint_plan test, to prevent flaky log message causing the test to fail # It cannot be moved to Dockerfile now because the database directory is created after the start of the container echo Adding dummy config docker exec $COMPUTE_CONTAINER_NAME touch /var/db/postgres/compute/compute_ctl_temp_override.conf - # This block is required for the pg_anon extension test. - # The test assumes that it is running on the same host with the postgres engine. 
- # In our case it's not true, that's why we are copying files to the compute node + # The following block copies the files for the pg_hintplan test to the compute node for the extension test in an isolated docker-compose environment TMPDIR=$(mktemp -d) - # Add support for pg_anon for pg_v16 - if [ $pg_version -ne 17 ]; then - docker cp $TEST_CONTAINER_NAME:/ext-src/pg_anon-src/data $TMPDIR/data - echo -e '1\t too \t many \t tabs' > $TMPDIR/data/bad.csv - docker cp $TMPDIR/data $COMPUTE_CONTAINER_NAME:/tmp/tmp_anon_alternate_data - rm -rf $TMPDIR - fi - TMPDIR=$(mktemp -d) - # The following block does the same for the pg_hintplan test docker cp $TEST_CONTAINER_NAME:/ext-src/pg_hint_plan-src/data $TMPDIR/data docker cp $TMPDIR/data $COMPUTE_CONTAINER_NAME:/ext-src/pg_hint_plan-src/ rm -rf $TMPDIR # We are running tests now - if docker exec -e SKIP=timescaledb-src,rdkit-src,postgis-src,pgx_ulid-src,pgtap-src,pg_tiktoken-src,pg_jsonschema-src,pg_graphql-src,kq_imcx-src,wal2json_2_5-src \ + if ! 
docker exec -e SKIP=timescaledb-src,rdkit-src,postgis-src,pgx_ulid-src,pgtap-src,pg_tiktoken-src,pg_jsonschema-src,kq_imcx-src,wal2json_2_5-src \ $TEST_CONTAINER_NAME /run-tests.sh | tee testout.txt then - cleanup - else FAILED=$(tail -1 testout.txt) for d in $FAILED do @@ -101,13 +74,7 @@ for pg_version in ${TEST_VERSION_ONLY-14 15 16 17}; do cat $d/regression.out $d/regression.diffs || true done rm -rf $FAILED - cleanup exit 1 fi fi - cleanup - # Restore the original spec.json - if [ "$pg_version" -ne 17 ]; then - mv "$SPEC_PATH/spec.bak" "$SPEC_PATH/spec.json" - fi done diff --git a/docker-compose/ext-src/hll-src/test-upgrade.sh b/docker-compose/ext-src/hll-src/test-upgrade.sh new file mode 100755 index 0000000000..f9e9aedcb2 --- /dev/null +++ b/docker-compose/ext-src/hll-src/test-upgrade.sh @@ -0,0 +1,5 @@ +#!/bin/sh +set -ex +cd "$(dirname ${0})" +PG_REGRESS=$(dirname "$(pg_config --pgxs)")/../test/regress/pg_regress +${PG_REGRESS} --use-existing --inputdir=./ --bindir='/usr/local/pgsql/bin' --dbname=contrib_regression add_agg agg_oob auto_sparse card_op cast_shape copy_binary cumulative_add_cardinality_correction cumulative_add_comprehensive_promotion cumulative_add_sparse_edge cumulative_add_sparse_random cumulative_add_sparse_step cumulative_union_comprehensive cumulative_union_explicit_explicit cumulative_union_explicit_promotion cumulative_union_probabilistic_probabilistic cumulative_union_sparse_full_representation cumulative_union_sparse_promotion cumulative_union_sparse_sparse disable_hashagg equal explicit_thresh hash hash_any meta_func murmur_bigint murmur_bytea nosparse notequal scalar_oob storedproc transaction typmod typmod_insert union_op \ No newline at end of file diff --git a/docker-compose/ext-src/hypopg-src/test-upgrade.patch b/docker-compose/ext-src/hypopg-src/test-upgrade.patch new file mode 100644 index 0000000000..71fe26b164 --- /dev/null +++ b/docker-compose/ext-src/hypopg-src/test-upgrade.patch @@ -0,0 +1,27 @@ +diff --git 
a/expected/hypopg.out b/expected/hypopg.out +index 90121d0..859260b 100644 +--- a/expected/hypopg.out ++++ b/expected/hypopg.out +@@ -11,7 +11,8 @@ BEGIN + END; + $_$ + LANGUAGE plpgsql; +-CREATE EXTENSION hypopg; ++CREATE EXTENSION IF NOT EXISTS hypopg; ++NOTICE: extension "hypopg" already exists, skipping + CREATE TABLE hypo (id integer, val text, "Id2" bigint); + INSERT INTO hypo SELECT i, 'line ' || i + FROM generate_series(1,100000) f(i); +diff --git a/test/sql/hypopg.sql b/test/sql/hypopg.sql +index 99722b0..8d6bacb 100644 +--- a/test/sql/hypopg.sql ++++ b/test/sql/hypopg.sql +@@ -12,7 +12,7 @@ END; + $_$ + LANGUAGE plpgsql; + +-CREATE EXTENSION hypopg; ++CREATE EXTENSION IF NOT EXISTS hypopg; + + CREATE TABLE hypo (id integer, val text, "Id2" bigint); + diff --git a/docker-compose/ext-src/hypopg-src/test-upgrade.sh b/docker-compose/ext-src/hypopg-src/test-upgrade.sh new file mode 100755 index 0000000000..066ac3329e --- /dev/null +++ b/docker-compose/ext-src/hypopg-src/test-upgrade.sh @@ -0,0 +1,6 @@ +#!/bin/sh +set -ex +cd "$(dirname ${0})" +patch -p1 /dev/null || break +for d in ${LIST}; do + [ -d "${d}" ] || continue + if ! psql -w -c "select 1" >/dev/null; then + FAILED="${d} ${FAILED}" + break + fi + if [ -f "${d}/neon-test.sh" ]; then + "${d}/neon-test.sh" || FAILED="${d} ${FAILED}" + else USE_PGXS=1 make -C "${d}" installcheck || FAILED="${d} ${FAILED}" + fi done [ -z "${FAILED}" ] && exit 0 echo "${FAILED}" diff --git a/docker-compose/test_extensions_upgrade.sh b/docker-compose/test_extensions_upgrade.sh new file mode 100755 index 0000000000..ff93b98065 --- /dev/null +++ b/docker-compose/test_extensions_upgrade.sh @@ -0,0 +1,93 @@ +#!/bin/bash +set -eux -o pipefail +cd "$(dirname "${0}")" +# Takes a variable name as argument. The result is stored in that variable. 
+generate_id() { + local -n resvar=$1 + printf -v resvar '%08x%08x%08x%08x' $SRANDOM $SRANDOM $SRANDOM $SRANDOM +} +if [ -z ${OLDTAG+x} ] || [ -z ${NEWTAG+x} ] || [ -z "${OLDTAG}" ] || [ -z "${NEWTAG}" ]; then + echo OLDTAG and NEWTAG must be defined + exit 1 +fi +export PG_VERSION=${PG_VERSION:-16} +function wait_for_ready { + TIME=0 + while ! docker compose logs compute_is_ready | grep -q "accepting connections" && [ ${TIME} -le 300 ] ; do + ((TIME += 1 )) + sleep 1 + done + if [ ${TIME} -gt 300 ]; then + echo Time is out. + exit 2 + fi +} +function create_extensions() { + for ext in ${1}; do + docker compose exec neon-test-extensions psql -X -v ON_ERROR_STOP=1 -d contrib_regression -c "CREATE EXTENSION IF NOT EXISTS ${ext}" + done +} +EXTENSIONS='[ +{"extname": "plv8", "extdir": "plv8-src"}, +{"extname": "vector", "extdir": "pgvector-src"}, +{"extname": "unit", "extdir": "postgresql-unit-src"}, +{"extname": "hypopg", "extdir": "hypopg-src"}, +{"extname": "rum", "extdir": "rum-src"}, +{"extname": "ip4r", "extdir": "ip4r-src"}, +{"extname": "prefix", "extdir": "prefix-src"}, +{"extname": "hll", "extdir": "hll-src"}, +{"extname": "pg_cron", "extdir": "pg_cron-src"}, +{"extname": "pg_uuidv7", "extdir": "pg_uuidv7-src"}, +{"extname": "roaringbitmap", "extdir": "pg_roaringbitmap-src"}, +{"extname": "semver", "extdir": "pg_semver-src"}, +{"extname": "pg_ivm", "extdir": "pg_ivm-src"} +]' +EXTNAMES=$(echo ${EXTENSIONS} | jq -r '.[].extname' | paste -sd ' ' -) +TAG=${NEWTAG} docker compose --profile test-extensions up --quiet-pull --build -d +wait_for_ready +docker compose exec neon-test-extensions psql -c "DROP DATABASE IF EXISTS contrib_regression" +docker compose exec neon-test-extensions psql -c "CREATE DATABASE contrib_regression" +create_extensions "${EXTNAMES}" +query="select json_object_agg(extname,extversion) from pg_extension where extname in ('${EXTNAMES// /\',\'}')" +new_vers=$(docker compose exec neon-test-extensions psql -Aqt -d contrib_regression -c 
"$query") +docker compose --profile test-extensions down +TAG=${OLDTAG} docker compose --profile test-extensions up --quiet-pull --build -d --force-recreate +wait_for_ready +docker compose cp ext-src neon-test-extensions:/ +docker compose exec neon-test-extensions psql -c "DROP DATABASE IF EXISTS contrib_regression" +docker compose exec neon-test-extensions psql -c "CREATE DATABASE contrib_regression" +create_extensions "${EXTNAMES}" +query="select pge.extname from pg_extension pge join (select key as extname, value as extversion from json_each_text('${new_vers}')) x on pge.extname=x.extname and pge.extversion <> x.extversion" +exts=$(docker compose exec neon-test-extensions psql -Aqt -d contrib_regression -c "$query") +if [ -z "${exts}" ]; then + echo "No extensions were upgraded" +else + tenant_id=$(docker compose exec neon-test-extensions psql -Aqt -c "SHOW neon.tenant_id") + timeline_id=$(docker compose exec neon-test-extensions psql -Aqt -c "SHOW neon.timeline_id") + for ext in ${exts}; do + echo Testing ${ext}... + EXTDIR=$(echo ${EXTENSIONS} | jq -r '.[] | select(.extname=="'${ext}'") | .extdir') + generate_id new_timeline_id + PARAMS=( + -sbf + -X POST + -H "Content-Type: application/json" + -d "{\"new_timeline_id\": \"${new_timeline_id}\", \"pg_version\": ${PG_VERSION}, \"ancestor_timeline_id\": \"${timeline_id}\"}" + "http://127.0.0.1:9898/v1/tenant/${tenant_id}/timeline/" + ) + result=$(curl "${PARAMS[@]}") + echo $result | jq . 
+ TENANT_ID=${tenant_id} TIMELINE_ID=${new_timeline_id} TAG=${OLDTAG} docker compose down compute compute_is_ready + COMPUTE_TAG=${NEWTAG} TAG=${OLDTAG} TENANT_ID=${tenant_id} TIMELINE_ID=${new_timeline_id} docker compose up --quiet-pull -d --build compute compute_is_ready + wait_for_ready + TID=$(docker compose exec neon-test-extensions psql -Aqt -c "SHOW neon.timeline_id") + if [ ${TID} != ${new_timeline_id} ]; then + echo Timeline mismatch + exit 1 + fi + docker compose exec neon-test-extensions psql -d contrib_regression -c "\dx ${ext}" + docker compose exec neon-test-extensions sh -c /ext-src/${EXTDIR}/test-upgrade.sh + docker compose exec neon-test-extensions psql -d contrib_regression -c "alter extension ${ext} update" + docker compose exec neon-test-extensions psql -d contrib_regression -c "\dx ${ext}" + done +fi diff --git a/docs/rfcs/041-sharded-ingest.md b/docs/rfcs/041-sharded-ingest.md new file mode 100644 index 0000000000..47b314891c --- /dev/null +++ b/docs/rfcs/041-sharded-ingest.md @@ -0,0 +1,255 @@ +# +Created on Aug 2024 +Implemented on Jan 2025 + +## Summary + +Data in large tenants is split up between multiple pageservers according to key hashes, as +introduced in the [sharding RFC](031-sharding-static.md) and [shard splitting RFC](032-shard-splitting.md). + +Whereas currently we send all WAL to all pageserver shards, and each shard filters out the data that it needs, +in this RFC we add a mechanism to filter the WAL on the safekeeper, so that each shard receives +only the data it needs. + +This will place some extra CPU load on the safekeepers, in exchange for reducing the network bandwidth +for ingesting WAL back to scaling as O(1) with shard count, rather than O(N_shards). + +## Motivation + +1. Large databases require higher shard counts. 
Whereas currently we run with up to 8 shards for tenants +with a few TB of storage, the next order of magnitude capacity increase will require tens of shards, such +that sending all WAL to all shards is impractical in terms of bandwidth. +2. For contemporary database sizes (~2TB), the pageserver is the bottleneck for ingest: since each + shard has to decode and process the whole WAL, sharding doesn't fully relieve this bottleneck. To achieve significantly higher ingest speeds, we need to filter the WAL earlier so that each pageserver + only has to process relevant parts. + +## Non Goals (if relevant) + +We do not seek to introduce multiple WALs per timeline, or to share the work of handling a timeline's +WAL across safekeepers (beyond simple 3x replication). This RFC may be thought of as an incremental +move of the ingestion bottleneck up the stack: instead of high write rates bottlenecking on the +pageserver, they will bottleneck on the safekeeper. + +## Impacted components (e.g. pageserver, safekeeper, console, etc) + +Safekeeper, pageserver. + +There will be no control plane or storage controller coordination needed, as pageservers will directly +indicate their sharding parameters to the safekeeper when subscribing for WAL. + +## Proposed implementation + +Terminology: +- "Data pages" refers to postgres relation blocks, and SLRU blocks. +- "Metadata pages" refers to everything else the pageserver stores, such as relation sizes and + directories of relations. + +### Phase 1: Refactor ingest + +Currently, pageserver ingest code is structured approximately as follows: +1. `handle_walreceiver_connection` reads a stream of binary WAL records off a network + socket +2. `WalIngest::ingest_record` to translate the record into a series of page-level modifications +3. `DatadirModification` accumulates page updates from several `ingest_record` calls, and when + its `commit()` method is called, flushes these into a Timeline's open `InMemoryLayer`. 
+ +This process currently assumes access to a pageserver `Timeline` throughout `ingest_record` and +from `DatadirModification`, which is used to do read-modify-write cycles on metadata pages +such as relation sizes and the master DBDIR page. It also assumes that records are ingested +strictly one after the other: they cannot be ingested in parallel because each record assumes +that earlier records' changes have already been applied to `Timeline`. + +This code will be refactored to disentangle the simple, fast decode of relation page writes +from the more complex logic for updating internal metadata. An intermediate representation +called `InterpretedWalRecords` will be introduced. This is similar to the internal state of +a `DatadirModification`, but does not require access to a Timeline. Instead of storing +metadata updates as materialized writes to pages, it will accumulate these as abstract operations, +for example rather than including a write to a relation size key, this structure will include +an operation that indicates "Update relation _foo_'s size to the max of its current value and +_bar_", such that these may be applied later to a real Timeline. + +The `DatadirModification` will be aware of the `EphemeralFile` format, so that as it accumulates +simple page writes of relation blocks, it can write them directly into a buffer in the serialized +format. This will avoid the need to later deserialize/reserialize this data when passing the +structure between safekeeper and pageserver. + +The new pipeline will be: +1. `handle_walreceiver_connection` reads a stream of binary WAL records off a network +2. A `InterpretedWalRecords` is generated from the incoming WAL records. This does not + require a reference to a Timeline. +3. 
The logic that is currently spread between `WalIngest` and `DatadirModification` for updating + metadata will be refactored to consume the metadata operations from the `InterpretedWalRecords` + and turn them into literal writes to metadata pages. This part must be done sequentially. +4. The resulting buffer of metadata page writes is combined with the buffer of relation block + writes, and written into the `InMemoryLayer`. + +Implemented in: +1. https://github.com/neondatabase/neon/pull/9472 +2. https://github.com/neondatabase/neon/pull/9504 +3. https://github.com/neondatabase/neon/pull/9524 + +### Phase 2: Decode & filter on safekeeper + +In the previous phase, the ingest code was modified to be able to do most of its work without access to +a Timeline: this first stage of ingest simply converts a series of binary wal records into +a buffer of relation/SLRU page writes, and a buffer of abstract metadata writes. + +The modified ingest code may be transplanted from pageserver to safekeeper (probably via a +shared crate). The safekeeper->pageserver network protocol is modified to: + - in subscription requests, send the `ShardIdentity` from the pageserver to the safekeeper + - in responses, transmit an `InterpretedWalRecords` instead of a raw `WalRecord`. + - use the `ShardIdentity` to filter the `ProcessedWalIngest` to relevant content for + the subscribing shard before transmitting it. + +The overall behavior of the pageserver->safekeeper interaction remains the same, in terms of +consistent LSN feedback, and connection management. Only the payload of the subscriptions +changes, to express an LSN range of WAL as a filtered `ProcessedWalIngest` instead of the +raw data. + +The ingest code on the pageserver can now skip the part where it does the first phase of +processing, as it will receive pre-processed, compressed data off the wire. + +Note that `InterpretedWalRecords` batches multiple `InterpretedWalRecord(s)` in the same network +message. 
Safekeeper reads WAL in chunks of 16 blocks and then decodes as many Postgres WAL records +as possible. Each Postgres WAL record maps to one `InterpretedWalRecord` for potentially multiple shards. +Hence, the size of the batch is given by the number of Postgres WAL records that fit in 16 blocks. + +The protocol needs to support evolution. Protobuf was chosen here with the view that, in the future, +we may migrate it to gRPC altogether. + +Implemented in: +1. https://github.com/neondatabase/neon/pull/9746 +2. https://github.com/neondatabase/neon/pull/9821 + +### Phase 3: Fan out interpreted WAL + +In the previous phase, the initial processing of WAL was moved to the safekeeper, but it is still +done once for each shard: this will generate O(N_shards) CPU work on the safekeeper (especially +when considering converting to Protobuf format and compression). + +To avoid this, we fan out WAL from one (tenant, timeline, shard) to all other shards subscribed on +the same safekeeper. Under normal operation, the WAL will be read from disk, decoded and interpreted +_only_ once per (safekeeper, timeline). + +When the first shard of a sharded timeline subscribes to a given safekeeper a task is spawned +for the WAL reader (`InterpretedWalReader`). This task reads WAL, decodes, interprets it and sends +it to the sender (`InterpretedWalSender`). The sender is a future that is polled from the connection +task. When further shards subscribe on the safekeeper they will attach themselves to the existing WAL reader. +There are two cases to consider: +1. The shard's requested `start_lsn` is ahead of the current position of the WAL reader. In this case, the shard +will start receiving data when the reader reaches that LSN. The intuition here is that there's little to gain +by letting shards "front-run" since compute backpressure is based on the laggard LSN. +2. The shard's requested `start_lsn` is below the current position of the WAL reader. 
In this case, the WAL reader +gets reset to this requested position (same intuition). Special care is taken such that advanced shards do not receive +interpreted WAL records below their current position. + +The approach above implies that there is at most one WAL reader per (tenant, timeline) on a given safekeeper at any point in time. +If this turns out to be operationally problematic, there's a trick we can deploy: `--max-delta-for-fanout` is an optional safekeeper +argument that controls the max absolute delta between a new shard and the current WAL position of the WAL reader. If the absolute +delta is above that value, a new reader is spawned. Note that there's currently no concurrency control on the number of WAL readers, +so it's recommended to use large values to avoid pushing CPU utilisation too high. + +Unsharded tenants do not spawn a separate task for the interpreted WAL reader since there's no benefit to it. Instead they poll +the reader and sender concurrently from the connection task. + +Shard splits are interesting here because it is the only case when the same shard might have two subscriptions at the same time. +This is handled by giving readers a unique identifier. Both shards will receive the same data while respecting their requested start +position. + +Implemented in: +1. https://github.com/neondatabase/neon/pull/10190 + +## Deployment + +Each phase shall be deployed independently. Special care should be taken around protocol changes. + +## Observability Tips + +* The safekeeper logs the protocol requested by the pageserver +along with the pageserver ID, tenant, timeline and shard: `starting streaming from`. 
+* There are metrics for the number of wal readers: + * `safekeeper_wal_readers{kind="task", target=~"pageserver.*"}` gives the number of wal reader tasks for each SK + * `safekeeper_wal_readers{kind="future", target=~"pageserver.*"}` gives the number of wal readers polled inline by each SK + * `safekeeper_interpreted_wal_reader_tasks` gives the number of wal reader tasks per tenant, timeline +* Interesting log lines for the fan-out reader: + * `Spawning interpreted`: first shard creates the interpreted wal reader + * `Fanning out`: a subsequent shard attaches itself to an interpreted wal reader + * `Aborting interpreted`: all senders have finished and the reader task is being aborted + +## Future Optimizations + +This section describes some improvement areas which may be revisited in the future. + +### Buffering of Interpreted WAL + +The interpreted WAL reader may buffer interpreted WAL records in user space to help with serving +subscriptions that are lagging behind the current position of the reader. + +Counterpoints: +* Safekeepers serve many thousands of timelines and allocating a buffer for each might be wasteful, +especially given that it would go unused on the happy path. +* WAL is buffered in the kernel page cache. Usually we'd only pay the CPU cost of decoding and interpreting. + +### Tweaking the Pageserver Safekeeper Selection Algorithm + +We could make the pageserver aware of which safekeepers already host shards for the timeline along +with their current WAL positions. The pageserver should then prefer safekeepers that are in the same +AZ _and_ already have a shard with a position close to the desired start position. + +We currently run one safekeeper per AZ, so the point is moot until that changes. + +### Pipelining first ingest phase + +The first ingest phase is a stateless transformation of a binary WAL record into a pre-processed +output per shard. To put multiple CPUs to work, we may pipeline this processing up to some defined buffer +depth. 
+ +## Alternatives considered + +### Give safekeepers enough state to fully decode WAL + +In this RFC, we only do the first phase of ingest on the safekeeper, because this is +the phase that is stateless. Subsequent changes then happen on the pageserver, with +access to the `Timeline` state. + +We could do more work on the safekeeper if we transmitted metadata state to the safekeeper +when subscribing to the WAL: for example, by telling the safekeeper all the relation sizes, +so that it could then generate all the metadata writes for relation sizes. + +We avoid doing this for several reasons: +1. Complexity: it's a more invasive protocol change +2. Decoupling: having the safekeeper understand the `ProcessedWalIngest` already somewhat + infects it with knowledge of the pageserver, but this is mainly an abstract structure + that describes postgres writes. However, if we taught the safekeeper about the exact + way that pageserver deals with metadata keys, this would be a much tighter coupling. +3. Load: once the WAL has been processed to the point that it can be split between shards, + it is preferable to share out work on the remaining shards rather than adding extra CPU + load to the safekeeper. + +### Do pre-processing on the compute instead of the safekeeper + +Since our first stage of ingest is stateless, it could be done at any stage in the pipeline, +all the way up to the compute. + +We choose not to do this, because it is useful for the safekeeper to store the raw WAL rather +than just the preprocessed WAL: +- The safekeeper still needs to be able to serve raw WAL back to postgres for e.g. physical replication +- It simplifies our paxos implementation to have the offset in the write log be literally + the same as the LSN +- Raw WAL must have a stable protocol since we might have to re-ingest it at arbitrary points in the future. + Storing raw WAL gives us more flexibility to evolve the pageserver-safekeeper protocol. 
+ +### Do wal pre-processing on shard 0 or a separate service, send it to other shards from there + +If we wanted to keep the safekeepers as entirely pure stores of raw WAL bytes, then +we could do the initial decode and shard-splitting in some other location: +- Shard 0 could subscribe to the full WAL and then send writes to other shards +- A new intermediate service between the safekeeper and pageserver could do the splitting. + +So why not? +- Extra network hop from shard 0 to the final destination shard +- Clearly there is more infrastructure involved here compared with doing it inline on the safekeeper. +- Safekeepers already have very light CPU load: typical cloud instances shapes with appropriate + disks for the safekeepers effectively have "free" CPU resources. +- Doing extra work on shard 0 would complicate scheduling of shards on pageservers, because + shard 0 would have significantly higher CPU load under write workloads than other shards. diff --git a/libs/compute_api/src/responses.rs b/libs/compute_api/src/responses.rs index 9ce605089b..5286e0e61d 100644 --- a/libs/compute_api/src/responses.rs +++ b/libs/compute_api/src/responses.rs @@ -15,11 +15,6 @@ pub struct GenericAPIError { pub error: String, } -#[derive(Debug, Clone, Serialize)] -pub struct InfoResponse { - pub num_cpus: usize, -} - #[derive(Debug, Clone, Serialize)] pub struct ExtensionInstallResponse { pub extension: PgIdent, diff --git a/libs/compute_api/src/spec.rs b/libs/compute_api/src/spec.rs index 54d6a1d38f..b3f18dc6da 100644 --- a/libs/compute_api/src/spec.rs +++ b/libs/compute_api/src/spec.rs @@ -138,6 +138,13 @@ pub struct ComputeSpec { /// enough spare connections for reconfiguration process to succeed. #[serde(default = "default_reconfigure_concurrency")] pub reconfigure_concurrency: usize, + + /// If set to true, the compute_ctl will drop all subscriptions before starting the + /// compute. 
This is needed when we start an endpoint on a branch, so that child + /// would not compete with parent branch subscriptions + /// over the same replication content from publisher. + #[serde(default)] // Default false + pub drop_subscriptions_before_start: bool, } /// Feature flag to signal `compute_ctl` to enable certain experimental functionality. diff --git a/libs/pageserver_api/src/config.rs b/libs/pageserver_api/src/config.rs index 09cfbc55fd..422da0dc95 100644 --- a/libs/pageserver_api/src/config.rs +++ b/libs/pageserver_api/src/config.rs @@ -120,6 +120,7 @@ pub struct ConfigToml { pub no_sync: Option, pub wal_receiver_protocol: PostgresClientProtocol, pub page_service_pipelining: PageServicePipeliningConfig, + pub get_vectored_concurrent_io: GetVectoredConcurrentIo, } #[derive(Debug, Clone, PartialEq, Eq, serde::Serialize, serde::Deserialize)] @@ -158,6 +159,25 @@ pub enum PageServiceProtocolPipelinedExecutionStrategy { Tasks, } +#[derive(Debug, Clone, PartialEq, Eq, serde::Serialize, serde::Deserialize)] +#[serde(tag = "mode", rename_all = "kebab-case")] +#[serde(deny_unknown_fields)] +pub enum GetVectoredConcurrentIo { + /// The read path is fully sequential: layers are visited + /// one after the other and IOs are issued and waited upon + /// from the same task that traverses the layers. + Sequential, + /// The read path still traverses layers sequentially, and + /// index blocks will be read into the PS PageCache from + /// that task, with waiting. + /// But data IOs are dispatched and waited upon from a sidecar + /// task so that the traversing task can continue to traverse + /// layers while the IOs are in flight. + /// If the PS PageCache miss rate is low, this improves + /// throughput dramatically. + SidecarTask, +} + pub mod statvfs { pub mod mock { #[derive(Debug, Clone, PartialEq, Eq, serde::Serialize, serde::Deserialize)] @@ -234,9 +254,26 @@ pub struct TenantConfigToml { // Duration::ZERO means automatic compaction is disabled. 
#[serde(with = "humantime_serde")] pub compaction_period: Duration, - // Level0 delta layer threshold for compaction. + /// Level0 delta layer threshold for compaction. pub compaction_threshold: usize, + /// Controls the amount of L0 included in a single compaction iteration. + /// The unit is `checkpoint_distance`, i.e., a size. + /// We add L0s to the set of layers to compact until their cumulative + /// size exceeds `compaction_upper_limit * checkpoint_distance`. + pub compaction_upper_limit: usize, pub compaction_algorithm: crate::models::CompactionAlgorithmSettings, + /// Level0 delta layer threshold at which to delay layer flushes for compaction backpressure, + /// such that they take 2x as long, and start waiting for layer flushes during ephemeral layer + /// rolls. This helps compaction keep up with WAL ingestion, and avoids read amplification + /// blowing up. Should be >compaction_threshold. 0 to disable. Disabled by default. + pub l0_flush_delay_threshold: Option, + /// Level0 delta layer threshold at which to stall layer flushes. Must be >compaction_threshold + /// to avoid deadlock. 0 to disable. Disabled by default. + pub l0_flush_stall_threshold: Option, + /// If true, Level0 delta layer flushes will wait for S3 upload before flushing the next + /// layer. This is a temporary backpressure mechanism which should be removed once + /// l0_flush_{delay,stall}_threshold is fully enabled. + pub l0_flush_wait_upload: bool, // Determines how much history is retained, to allow // branching and read replicas at an older point in time. // The unit is #of bytes of WAL. @@ -301,6 +338,20 @@ pub struct TenantConfigToml { pub timeline_offloading: bool, pub wal_receiver_protocol_override: Option, + + /// Enable rel_size_v2 for this tenant. Once enabled, the tenant will persist this information into + /// `index_part.json`, and it cannot be reversed. 
+ pub rel_size_v2_enabled: Option, + + // gc-compaction related configs + /// Enable automatic gc-compaction trigger on this tenant. + pub gc_compaction_enabled: bool, + /// The initial threshold for gc-compaction in KB. Once the total size of layers below the gc-horizon is above this threshold, + /// gc-compaction will be triggered. + pub gc_compaction_initial_threshold_kb: u64, + /// The ratio that triggers the auto gc-compaction. If (the total size of layers between L2 LSN and gc-horizon) / (size below the L2 LSN) + /// is above this ratio, gc-compaction will be triggered. + pub gc_compaction_ratio_percent: u64, } pub mod defaults { @@ -450,6 +501,11 @@ impl Default for ConfigToml { execution: PageServiceProtocolPipelinedExecutionStrategy::ConcurrentFutures, }) }, + get_vectored_concurrent_io: if !cfg!(test) { + GetVectoredConcurrentIo::Sequential + } else { + GetVectoredConcurrentIo::SidecarTask + }, } } } @@ -472,9 +528,17 @@ pub mod tenant_conf_defaults { pub const DEFAULT_COMPACTION_PERIOD: &str = "20 s"; pub const DEFAULT_COMPACTION_THRESHOLD: usize = 10; + + // This value needs to be tuned to avoid OOM. We have 3/4 of the total CPU threads to do background works, that's 16*3/4=9 on + // most of our pageservers. Compaction ~50 layers requires about 2GB memory (could be reduced later by optimizing L0 hole + // calculation to avoid loading all keys into the memory). So with this config, we can get a maximum peak compaction usage of 18GB. + pub const DEFAULT_COMPACTION_UPPER_LIMIT: usize = 50; + pub const DEFAULT_COMPACTION_ALGORITHM: crate::models::CompactionAlgorithm = crate::models::CompactionAlgorithm::Legacy; + pub const DEFAULT_L0_FLUSH_WAIT_UPLOAD: bool = true; + pub const DEFAULT_GC_HORIZON: u64 = 64 * 1024 * 1024; // Large DEFAULT_GC_PERIOD is fine as long as PITR_INTERVAL is larger. @@ -494,6 +558,9 @@ pub mod tenant_conf_defaults { // By default ingest enough WAL for two new L0 layers before checking if new image // image layers should be created. 
pub const DEFAULT_IMAGE_LAYER_CREATION_CHECK_THRESHOLD: u8 = 2; + pub const DEFAULT_GC_COMPACTION_ENABLED: bool = false; + pub const DEFAULT_GC_COMPACTION_INITIAL_THRESHOLD_KB: u64 = 10240000; + pub const DEFAULT_GC_COMPACTION_RATIO_PERCENT: u64 = 100; } impl Default for TenantConfigToml { @@ -507,9 +574,13 @@ impl Default for TenantConfigToml { compaction_period: humantime::parse_duration(DEFAULT_COMPACTION_PERIOD) .expect("cannot parse default compaction period"), compaction_threshold: DEFAULT_COMPACTION_THRESHOLD, + compaction_upper_limit: DEFAULT_COMPACTION_UPPER_LIMIT, compaction_algorithm: crate::models::CompactionAlgorithmSettings { kind: DEFAULT_COMPACTION_ALGORITHM, }, + l0_flush_delay_threshold: None, + l0_flush_stall_threshold: None, + l0_flush_wait_upload: DEFAULT_L0_FLUSH_WAIT_UPLOAD, gc_horizon: DEFAULT_GC_HORIZON, gc_period: humantime::parse_duration(DEFAULT_GC_PERIOD) .expect("cannot parse default gc period"), @@ -538,6 +609,10 @@ impl Default for TenantConfigToml { lsn_lease_length_for_ts: LsnLease::DEFAULT_LENGTH_FOR_TS, timeline_offloading: false, wal_receiver_protocol_override: None, + rel_size_v2_enabled: None, + gc_compaction_enabled: DEFAULT_GC_COMPACTION_ENABLED, + gc_compaction_initial_threshold_kb: DEFAULT_GC_COMPACTION_INITIAL_THRESHOLD_KB, + gc_compaction_ratio_percent: DEFAULT_GC_COMPACTION_RATIO_PERCENT, } } } diff --git a/libs/pageserver_api/src/controller_api.rs b/libs/pageserver_api/src/controller_api.rs index 08d1fa55b9..78e080981a 100644 --- a/libs/pageserver_api/src/controller_api.rs +++ b/libs/pageserver_api/src/controller_api.rs @@ -324,7 +324,7 @@ impl From for String { #[derive(Serialize, Deserialize, Clone, Copy, Eq, PartialEq, Debug)] pub enum SkSchedulingPolicy { Active, - Disabled, + Pause, Decomissioned, } @@ -334,9 +334,13 @@ impl FromStr for SkSchedulingPolicy { fn from_str(s: &str) -> Result { Ok(match s { "active" => Self::Active, - "disabled" => Self::Disabled, + "pause" => Self::Pause, "decomissioned" => 
Self::Decomissioned, - _ => return Err(anyhow::anyhow!("Unknown scheduling state '{s}'")), + _ => { + return Err(anyhow::anyhow!( + "Unknown scheduling policy '{s}', try active,pause,decomissioned" + )) + } }) } } @@ -346,7 +350,7 @@ impl From for String { use SkSchedulingPolicy::*; match value { Active => "active", - Disabled => "disabled", + Pause => "pause", Decomissioned => "decomissioned", } .to_string() diff --git a/libs/pageserver_api/src/models.rs b/libs/pageserver_api/src/models.rs index 87e8df2ab6..43447c67bd 100644 --- a/libs/pageserver_api/src/models.rs +++ b/libs/pageserver_api/src/models.rs @@ -33,7 +33,6 @@ use crate::{ reltag::RelTag, shard::{ShardCount, ShardStripeSize, TenantShardId}, }; -use anyhow::bail; use bytes::{Buf, BufMut, Bytes, BytesMut}; /// The state of a tenant in this pageserver. @@ -459,10 +458,18 @@ pub struct TenantConfigPatch { pub compaction_period: FieldPatch, #[serde(skip_serializing_if = "FieldPatch::is_noop")] pub compaction_threshold: FieldPatch, + #[serde(skip_serializing_if = "FieldPatch::is_noop")] + pub compaction_upper_limit: FieldPatch, // defer parsing compaction_algorithm, like eviction_policy #[serde(skip_serializing_if = "FieldPatch::is_noop")] pub compaction_algorithm: FieldPatch, #[serde(skip_serializing_if = "FieldPatch::is_noop")] + pub l0_flush_delay_threshold: FieldPatch, + #[serde(skip_serializing_if = "FieldPatch::is_noop")] + pub l0_flush_stall_threshold: FieldPatch, + #[serde(skip_serializing_if = "FieldPatch::is_noop")] + pub l0_flush_wait_upload: FieldPatch, + #[serde(skip_serializing_if = "FieldPatch::is_noop")] pub gc_horizon: FieldPatch, #[serde(skip_serializing_if = "FieldPatch::is_noop")] pub gc_period: FieldPatch, @@ -498,6 +505,14 @@ pub struct TenantConfigPatch { pub timeline_offloading: FieldPatch, #[serde(skip_serializing_if = "FieldPatch::is_noop")] pub wal_receiver_protocol_override: FieldPatch, + #[serde(skip_serializing_if = "FieldPatch::is_noop")] + pub rel_size_v2_enabled: FieldPatch, + 
#[serde(skip_serializing_if = "FieldPatch::is_noop")] + pub gc_compaction_enabled: FieldPatch, + #[serde(skip_serializing_if = "FieldPatch::is_noop")] + pub gc_compaction_initial_threshold_kb: FieldPatch, + #[serde(skip_serializing_if = "FieldPatch::is_noop")] + pub gc_compaction_ratio_percent: FieldPatch, } /// An alternative representation of `pageserver::tenant::TenantConf` with @@ -509,8 +524,12 @@ pub struct TenantConfig { pub compaction_target_size: Option, pub compaction_period: Option, pub compaction_threshold: Option, + pub compaction_upper_limit: Option, // defer parsing compaction_algorithm, like eviction_policy pub compaction_algorithm: Option, + pub l0_flush_delay_threshold: Option, + pub l0_flush_stall_threshold: Option, + pub l0_flush_wait_upload: Option, pub gc_horizon: Option, pub gc_period: Option, pub image_creation_threshold: Option, @@ -529,6 +548,10 @@ pub struct TenantConfig { pub lsn_lease_length_for_ts: Option, pub timeline_offloading: Option, pub wal_receiver_protocol_override: Option, + pub rel_size_v2_enabled: Option, + pub gc_compaction_enabled: Option, + pub gc_compaction_initial_threshold_kb: Option, + pub gc_compaction_ratio_percent: Option, } impl TenantConfig { @@ -539,7 +562,11 @@ impl TenantConfig { mut compaction_target_size, mut compaction_period, mut compaction_threshold, + mut compaction_upper_limit, mut compaction_algorithm, + mut l0_flush_delay_threshold, + mut l0_flush_stall_threshold, + mut l0_flush_wait_upload, mut gc_horizon, mut gc_period, mut image_creation_threshold, @@ -558,6 +585,10 @@ impl TenantConfig { mut lsn_lease_length_for_ts, mut timeline_offloading, mut wal_receiver_protocol_override, + mut rel_size_v2_enabled, + mut gc_compaction_enabled, + mut gc_compaction_initial_threshold_kb, + mut gc_compaction_ratio_percent, } = self; patch.checkpoint_distance.apply(&mut checkpoint_distance); @@ -567,7 +598,17 @@ impl TenantConfig { .apply(&mut compaction_target_size); patch.compaction_period.apply(&mut 
compaction_period); patch.compaction_threshold.apply(&mut compaction_threshold); + patch + .compaction_upper_limit + .apply(&mut compaction_upper_limit); patch.compaction_algorithm.apply(&mut compaction_algorithm); + patch + .l0_flush_delay_threshold + .apply(&mut l0_flush_delay_threshold); + patch + .l0_flush_stall_threshold + .apply(&mut l0_flush_stall_threshold); + patch.l0_flush_wait_upload.apply(&mut l0_flush_wait_upload); patch.gc_horizon.apply(&mut gc_horizon); patch.gc_period.apply(&mut gc_period); patch @@ -602,6 +643,16 @@ impl TenantConfig { patch .wal_receiver_protocol_override .apply(&mut wal_receiver_protocol_override); + patch.rel_size_v2_enabled.apply(&mut rel_size_v2_enabled); + patch + .gc_compaction_enabled + .apply(&mut gc_compaction_enabled); + patch + .gc_compaction_initial_threshold_kb + .apply(&mut gc_compaction_initial_threshold_kb); + patch + .gc_compaction_ratio_percent + .apply(&mut gc_compaction_ratio_percent); Self { checkpoint_distance, @@ -609,7 +660,11 @@ impl TenantConfig { compaction_target_size, compaction_period, compaction_threshold, + compaction_upper_limit, compaction_algorithm, + l0_flush_delay_threshold, + l0_flush_stall_threshold, + l0_flush_wait_upload, gc_horizon, gc_period, image_creation_threshold, @@ -628,6 +683,10 @@ impl TenantConfig { lsn_lease_length_for_ts, timeline_offloading, wal_receiver_protocol_override, + rel_size_v2_enabled, + gc_compaction_enabled, + gc_compaction_initial_threshold_kb, + gc_compaction_ratio_percent, } } } @@ -970,6 +1029,13 @@ pub struct TenantConfigPatchRequest { pub config: TenantConfigPatch, // as we have a flattened field, we should reject all unknown fields in it } +#[derive(Serialize, Deserialize, Debug)] +pub struct TenantWaitLsnRequest { + #[serde(flatten)] + pub timelines: HashMap, + pub timeout: Duration, +} + /// See [`TenantState::attachment_status`] and the OpenAPI docs for context. 
#[derive(Serialize, Deserialize, Clone)] #[serde(tag = "slug", content = "data", rename_all = "snake_case")] @@ -1400,6 +1466,8 @@ pub enum PagestreamFeMessage { GetPage(PagestreamGetPageRequest), DbSize(PagestreamDbSizeRequest), GetSlruSegment(PagestreamGetSlruSegmentRequest), + #[cfg(feature = "testing")] + Test(PagestreamTestRequest), } // Wrapped in libpq CopyData @@ -1411,6 +1479,22 @@ pub enum PagestreamBeMessage { Error(PagestreamErrorResponse), DbSize(PagestreamDbSizeResponse), GetSlruSegment(PagestreamGetSlruSegmentResponse), + #[cfg(feature = "testing")] + Test(PagestreamTestResponse), +} + +// Keep in sync with `pagestore_client.h` +#[repr(u8)] +enum PagestreamFeMessageTag { + Exists = 0, + Nblocks = 1, + GetPage = 2, + DbSize = 3, + GetSlruSegment = 4, + /* future tags above this line */ + /// For testing purposes, not available in production. + #[cfg(feature = "testing")] + Test = 99, } // Keep in sync with `pagestore_client.h` @@ -1422,7 +1506,28 @@ enum PagestreamBeMessageTag { Error = 103, DbSize = 104, GetSlruSegment = 105, + /* future tags above this line */ + /// For testing purposes, not available in production. 
+ #[cfg(feature = "testing")] + Test = 199, } + +impl TryFrom for PagestreamFeMessageTag { + type Error = u8; + fn try_from(value: u8) -> Result { + match value { + 0 => Ok(PagestreamFeMessageTag::Exists), + 1 => Ok(PagestreamFeMessageTag::Nblocks), + 2 => Ok(PagestreamFeMessageTag::GetPage), + 3 => Ok(PagestreamFeMessageTag::DbSize), + 4 => Ok(PagestreamFeMessageTag::GetSlruSegment), + #[cfg(feature = "testing")] + 99 => Ok(PagestreamFeMessageTag::Test), + _ => Err(value), + } + } +} + impl TryFrom for PagestreamBeMessageTag { type Error = u8; fn try_from(value: u8) -> Result { @@ -1433,6 +1538,8 @@ impl TryFrom for PagestreamBeMessageTag { 103 => Ok(PagestreamBeMessageTag::Error), 104 => Ok(PagestreamBeMessageTag::DbSize), 105 => Ok(PagestreamBeMessageTag::GetSlruSegment), + #[cfg(feature = "testing")] + 199 => Ok(PagestreamBeMessageTag::Test), _ => Err(value), } } @@ -1550,6 +1657,20 @@ pub struct PagestreamDbSizeResponse { pub db_size: i64, } +#[cfg(feature = "testing")] +#[derive(Debug, PartialEq, Eq, Clone)] +pub struct PagestreamTestRequest { + pub hdr: PagestreamRequest, + pub batch_key: u64, + pub message: String, +} + +#[cfg(feature = "testing")] +#[derive(Debug)] +pub struct PagestreamTestResponse { + pub req: PagestreamTestRequest, +} + // This is a cut-down version of TenantHistorySize from the pageserver crate, omitting fields // that require pageserver-internal types. It is sufficient to get the total size. 
#[derive(Serialize, Deserialize, Debug)] @@ -1569,7 +1690,7 @@ impl PagestreamFeMessage { match self { Self::Exists(req) => { - bytes.put_u8(0); + bytes.put_u8(PagestreamFeMessageTag::Exists as u8); bytes.put_u64(req.hdr.reqid); bytes.put_u64(req.hdr.request_lsn.0); bytes.put_u64(req.hdr.not_modified_since.0); @@ -1580,7 +1701,7 @@ impl PagestreamFeMessage { } Self::Nblocks(req) => { - bytes.put_u8(1); + bytes.put_u8(PagestreamFeMessageTag::Nblocks as u8); bytes.put_u64(req.hdr.reqid); bytes.put_u64(req.hdr.request_lsn.0); bytes.put_u64(req.hdr.not_modified_since.0); @@ -1591,7 +1712,7 @@ impl PagestreamFeMessage { } Self::GetPage(req) => { - bytes.put_u8(2); + bytes.put_u8(PagestreamFeMessageTag::GetPage as u8); bytes.put_u64(req.hdr.reqid); bytes.put_u64(req.hdr.request_lsn.0); bytes.put_u64(req.hdr.not_modified_since.0); @@ -1603,7 +1724,7 @@ impl PagestreamFeMessage { } Self::DbSize(req) => { - bytes.put_u8(3); + bytes.put_u8(PagestreamFeMessageTag::DbSize as u8); bytes.put_u64(req.hdr.reqid); bytes.put_u64(req.hdr.request_lsn.0); bytes.put_u64(req.hdr.not_modified_since.0); @@ -1611,13 +1732,24 @@ impl PagestreamFeMessage { } Self::GetSlruSegment(req) => { - bytes.put_u8(4); + bytes.put_u8(PagestreamFeMessageTag::GetSlruSegment as u8); bytes.put_u64(req.hdr.reqid); bytes.put_u64(req.hdr.request_lsn.0); bytes.put_u64(req.hdr.not_modified_since.0); bytes.put_u8(req.kind); bytes.put_u32(req.segno); } + #[cfg(feature = "testing")] + Self::Test(req) => { + bytes.put_u8(PagestreamFeMessageTag::Test as u8); + bytes.put_u64(req.hdr.reqid); + bytes.put_u64(req.hdr.request_lsn.0); + bytes.put_u64(req.hdr.not_modified_since.0); + bytes.put_u64(req.batch_key); + let message = req.message.as_bytes(); + bytes.put_u64(message.len() as u64); + bytes.put_slice(message); + } } bytes.into() @@ -1645,56 +1777,66 @@ impl PagestreamFeMessage { ), }; - match msg_tag { - 0 => Ok(PagestreamFeMessage::Exists(PagestreamExistsRequest { - hdr: PagestreamRequest { - reqid, - request_lsn, - 
not_modified_since, - }, - rel: RelTag { - spcnode: body.read_u32::()?, + match PagestreamFeMessageTag::try_from(msg_tag) + .map_err(|tag: u8| anyhow::anyhow!("invalid tag {tag}"))? + { + PagestreamFeMessageTag::Exists => { + Ok(PagestreamFeMessage::Exists(PagestreamExistsRequest { + hdr: PagestreamRequest { + reqid, + request_lsn, + not_modified_since, + }, + rel: RelTag { + spcnode: body.read_u32::()?, + dbnode: body.read_u32::()?, + relnode: body.read_u32::()?, + forknum: body.read_u8()?, + }, + })) + } + PagestreamFeMessageTag::Nblocks => { + Ok(PagestreamFeMessage::Nblocks(PagestreamNblocksRequest { + hdr: PagestreamRequest { + reqid, + request_lsn, + not_modified_since, + }, + rel: RelTag { + spcnode: body.read_u32::()?, + dbnode: body.read_u32::()?, + relnode: body.read_u32::()?, + forknum: body.read_u8()?, + }, + })) + } + PagestreamFeMessageTag::GetPage => { + Ok(PagestreamFeMessage::GetPage(PagestreamGetPageRequest { + hdr: PagestreamRequest { + reqid, + request_lsn, + not_modified_since, + }, + rel: RelTag { + spcnode: body.read_u32::()?, + dbnode: body.read_u32::()?, + relnode: body.read_u32::()?, + forknum: body.read_u8()?, + }, + blkno: body.read_u32::()?, + })) + } + PagestreamFeMessageTag::DbSize => { + Ok(PagestreamFeMessage::DbSize(PagestreamDbSizeRequest { + hdr: PagestreamRequest { + reqid, + request_lsn, + not_modified_since, + }, dbnode: body.read_u32::()?, - relnode: body.read_u32::()?, - forknum: body.read_u8()?, - }, - })), - 1 => Ok(PagestreamFeMessage::Nblocks(PagestreamNblocksRequest { - hdr: PagestreamRequest { - reqid, - request_lsn, - not_modified_since, - }, - rel: RelTag { - spcnode: body.read_u32::()?, - dbnode: body.read_u32::()?, - relnode: body.read_u32::()?, - forknum: body.read_u8()?, - }, - })), - 2 => Ok(PagestreamFeMessage::GetPage(PagestreamGetPageRequest { - hdr: PagestreamRequest { - reqid, - request_lsn, - not_modified_since, - }, - rel: RelTag { - spcnode: body.read_u32::()?, - dbnode: body.read_u32::()?, - relnode: 
body.read_u32::()?, - forknum: body.read_u8()?, - }, - blkno: body.read_u32::()?, - })), - 3 => Ok(PagestreamFeMessage::DbSize(PagestreamDbSizeRequest { - hdr: PagestreamRequest { - reqid, - request_lsn, - not_modified_since, - }, - dbnode: body.read_u32::()?, - })), - 4 => Ok(PagestreamFeMessage::GetSlruSegment( + })) + } + PagestreamFeMessageTag::GetSlruSegment => Ok(PagestreamFeMessage::GetSlruSegment( PagestreamGetSlruSegmentRequest { hdr: PagestreamRequest { reqid, @@ -1705,7 +1847,21 @@ impl PagestreamFeMessage { segno: body.read_u32::()?, }, )), - _ => bail!("unknown smgr message tag: {:?}", msg_tag), + #[cfg(feature = "testing")] + PagestreamFeMessageTag::Test => Ok(PagestreamFeMessage::Test(PagestreamTestRequest { + hdr: PagestreamRequest { + reqid, + request_lsn, + not_modified_since, + }, + batch_key: body.read_u64::()?, + message: { + let len = body.read_u64::()?; + let mut buf = vec![0; len as usize]; + body.read_exact(&mut buf)?; + String::from_utf8(buf)? + }, + })), } } } @@ -1748,6 +1904,15 @@ impl PagestreamBeMessage { bytes.put_u32((resp.segment.len() / BLCKSZ as usize) as u32); bytes.put(&resp.segment[..]); } + + #[cfg(feature = "testing")] + Self::Test(resp) => { + bytes.put_u8(Tag::Test as u8); + bytes.put_u64(resp.req.batch_key); + let message = resp.req.message.as_bytes(); + bytes.put_u64(message.len() as u64); + bytes.put_slice(message); + } } } PagestreamProtocolVersion::V3 => { @@ -1816,6 +1981,18 @@ impl PagestreamBeMessage { bytes.put_u32((resp.segment.len() / BLCKSZ as usize) as u32); bytes.put(&resp.segment[..]); } + + #[cfg(feature = "testing")] + Self::Test(resp) => { + bytes.put_u8(Tag::Test as u8); + bytes.put_u64(resp.req.hdr.reqid); + bytes.put_u64(resp.req.hdr.request_lsn.0); + bytes.put_u64(resp.req.hdr.not_modified_since.0); + bytes.put_u64(resp.req.batch_key); + let message = resp.req.message.as_bytes(); + bytes.put_u64(message.len() as u64); + bytes.put_slice(message); + } } } } @@ -1958,6 +2135,28 @@ impl 
PagestreamBeMessage { segment: segment.into(), }) } + #[cfg(feature = "testing")] + Tag::Test => { + let reqid = buf.read_u64::()?; + let request_lsn = Lsn(buf.read_u64::()?); + let not_modified_since = Lsn(buf.read_u64::()?); + let batch_key = buf.read_u64::()?; + let len = buf.read_u64::()?; + let mut msg = vec![0; len as usize]; + buf.read_exact(&mut msg)?; + let message = String::from_utf8(msg)?; + Self::Test(PagestreamTestResponse { + req: PagestreamTestRequest { + hdr: PagestreamRequest { + reqid, + request_lsn, + not_modified_since, + }, + batch_key, + message, + }, + }) + } }; let remaining = buf.into_inner(); if !remaining.is_empty() { @@ -1977,6 +2176,8 @@ impl PagestreamBeMessage { Self::Error(_) => "Error", Self::DbSize(_) => "DbSize", Self::GetSlruSegment(_) => "GetSlruSegment", + #[cfg(feature = "testing")] + Self::Test(_) => "Test", } } } diff --git a/libs/pq_proto/src/lib.rs b/libs/pq_proto/src/lib.rs index 50b2c69d24..f99128b76a 100644 --- a/libs/pq_proto/src/lib.rs +++ b/libs/pq_proto/src/lib.rs @@ -182,6 +182,13 @@ pub struct CancelKeyData { pub cancel_key: i32, } +pub fn id_to_cancel_key(id: u64) -> CancelKeyData { + CancelKeyData { + backend_pid: (id >> 32) as i32, + cancel_key: (id & 0xffffffff) as i32, + } +} + impl fmt::Display for CancelKeyData { fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { let hi = (self.backend_pid as u64) << 32; diff --git a/libs/proxy/tokio-postgres2/Cargo.toml b/libs/proxy/tokio-postgres2/Cargo.toml index 56e7c4da47..ade0ffc9f6 100644 --- a/libs/proxy/tokio-postgres2/Cargo.toml +++ b/libs/proxy/tokio-postgres2/Cargo.toml @@ -19,3 +19,4 @@ postgres-protocol2 = { path = "../postgres-protocol2" } postgres-types2 = { path = "../postgres-types2" } tokio = { workspace = true, features = ["io-util", "time", "net"] } tokio-util = { workspace = true, features = ["codec"] } +serde = { workspace = true, features = ["derive"] } \ No newline at end of file diff --git a/libs/proxy/tokio-postgres2/src/cancel_token.rs 
b/libs/proxy/tokio-postgres2/src/cancel_token.rs index a10e8bf5c3..718f903a92 100644 --- a/libs/proxy/tokio-postgres2/src/cancel_token.rs +++ b/libs/proxy/tokio-postgres2/src/cancel_token.rs @@ -3,12 +3,13 @@ use crate::tls::TlsConnect; use crate::{cancel_query, client::SocketConfig, tls::MakeTlsConnect}; use crate::{cancel_query_raw, Error}; +use serde::{Deserialize, Serialize}; use tokio::io::{AsyncRead, AsyncWrite}; use tokio::net::TcpStream; /// The capability to request cancellation of in-progress queries on a /// connection. -#[derive(Clone)] +#[derive(Clone, Serialize, Deserialize)] pub struct CancelToken { pub socket_config: Option, pub ssl_mode: SslMode, diff --git a/libs/proxy/tokio-postgres2/src/client.rs b/libs/proxy/tokio-postgres2/src/client.rs index a7cd53afc3..9bbbd4c260 100644 --- a/libs/proxy/tokio-postgres2/src/client.rs +++ b/libs/proxy/tokio-postgres2/src/client.rs @@ -18,6 +18,7 @@ use fallible_iterator::FallibleIterator; use futures_util::{future, ready, TryStreamExt}; use parking_lot::Mutex; use postgres_protocol2::message::{backend::Message, frontend}; +use serde::{Deserialize, Serialize}; use std::collections::HashMap; use std::fmt; use std::sync::Arc; @@ -137,7 +138,7 @@ impl InnerClient { } } -#[derive(Clone)] +#[derive(Clone, Serialize, Deserialize)] pub struct SocketConfig { pub host: Host, pub port: u16, diff --git a/libs/proxy/tokio-postgres2/src/config.rs b/libs/proxy/tokio-postgres2/src/config.rs index 11a361a81b..47cc45ac80 100644 --- a/libs/proxy/tokio-postgres2/src/config.rs +++ b/libs/proxy/tokio-postgres2/src/config.rs @@ -7,6 +7,7 @@ use crate::tls::MakeTlsConnect; use crate::tls::TlsConnect; use crate::{Client, Connection, Error}; use postgres_protocol2::message::frontend::StartupMessageParams; +use serde::{Deserialize, Serialize}; use std::fmt; use std::str; use std::time::Duration; @@ -16,7 +17,7 @@ pub use postgres_protocol2::authentication::sasl::ScramKeys; use tokio::net::TcpStream; /// TLS configuration. 
-#[derive(Debug, Copy, Clone, PartialEq, Eq)] +#[derive(Debug, Copy, Clone, PartialEq, Eq, Serialize, Deserialize)] #[non_exhaustive] pub enum SslMode { /// Do not use TLS. @@ -50,7 +51,7 @@ pub enum ReplicationMode { } /// A host specification. -#[derive(Debug, Clone, PartialEq, Eq)] +#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)] pub enum Host { /// A TCP hostname. Tcp(String), diff --git a/libs/remote_storage/src/azure_blob.rs b/libs/remote_storage/src/azure_blob.rs index c89f50ef2b..9027a8bf55 100644 --- a/libs/remote_storage/src/azure_blob.rs +++ b/libs/remote_storage/src/azure_blob.rs @@ -377,7 +377,8 @@ impl RemoteStorage for AzureBlobStorage { let next_item = next_item?; - if timeout_try_cnt >= 2 { + // Log a warning if we saw two timeouts in a row before a successful request + if timeout_try_cnt > 2 { tracing::warn!("Azure Blob Storage list timed out and succeeded after {} tries", timeout_try_cnt); } timeout_try_cnt = 1; diff --git a/libs/safekeeper_api/src/models.rs b/libs/safekeeper_api/src/models.rs index b5fa903820..30418b0efd 100644 --- a/libs/safekeeper_api/src/models.rs +++ b/libs/safekeeper_api/src/models.rs @@ -277,3 +277,8 @@ pub struct TimelineTermBumpResponse { pub previous_term: u64, pub current_term: u64, } + +#[derive(Debug, Clone, Deserialize, Serialize)] +pub struct SafekeeperUtilization { + pub timeline_count: u64, +} diff --git a/libs/utils/src/env.rs b/libs/utils/src/env.rs index b3e326bfd0..a1bcec9229 100644 --- a/libs/utils/src/env.rs +++ b/libs/utils/src/env.rs @@ -2,6 +2,7 @@ use std::{fmt::Display, str::FromStr}; +/// For types `V` that implement [`FromStr`]. 
pub fn var(varname: &str) -> Option where V: FromStr, @@ -10,7 +11,9 @@ where match std::env::var(varname) { Ok(s) => Some( s.parse() - .map_err(|e| format!("failed to parse env var {varname}: {e:#}")) + .map_err(|e| { + format!("failed to parse env var {varname} using FromStr::parse: {e:#}") + }) .unwrap(), ), Err(std::env::VarError::NotPresent) => None, @@ -19,3 +22,24 @@ where } } } + +/// For types `V` that implement [`serde::de::DeserializeOwned`]. +pub fn var_serde_json_string(varname: &str) -> Option +where + V: serde::de::DeserializeOwned, +{ + match std::env::var(varname) { + Ok(s) => Some({ + let value = serde_json::Value::String(s); + serde_json::from_value(value) + .map_err(|e| { + format!("failed to parse env var {varname} as a serde_json json string: {e:#}") + }) + .unwrap() + }), + Err(std::env::VarError::NotPresent) => None, + Err(std::env::VarError::NotUnicode(_)) => { + panic!("env var {varname} is not unicode") + } + } +} diff --git a/libs/utils/src/failpoint_support.rs b/libs/utils/src/failpoint_support.rs index 701ba2d42c..272c6ebb26 100644 --- a/libs/utils/src/failpoint_support.rs +++ b/libs/utils/src/failpoint_support.rs @@ -11,31 +11,55 @@ use tracing::*; /// Declare a failpoint that can use to `pause` failpoint action. /// We don't want to block the executor thread, hence, spawn_blocking + await. +/// +/// Optionally pass a cancellation token, and this failpoint will drop out of +/// its pause when the cancellation token fires. This is useful for testing +/// cases where we would like to block something, but test its clean shutdown behavior. +/// The macro evaluates to a Result in that case, where Ok(()) is the case +/// where the failpoint was not paused, and Err() is the case where cancellation +/// token fired while evaluating the failpoint. +/// +/// Remember to unpause the failpoint in the test; until that happens, one of the +/// limited number of spawn_blocking thread pool threads is leaked. #[macro_export] macro_rules! 
pausable_failpoint { - ($name:literal) => { + ($name:literal) => {{ if cfg!(feature = "testing") { - tokio::task::spawn_blocking({ - let current = tracing::Span::current(); + let cancel = ::tokio_util::sync::CancellationToken::new(); + let _ = $crate::pausable_failpoint!($name, &cancel); + } + }}; + ($name:literal, $cancel:expr) => {{ + if cfg!(feature = "testing") { + let failpoint_fut = ::tokio::task::spawn_blocking({ + let current = ::tracing::Span::current(); move || { let _entered = current.entered(); - tracing::info!("at failpoint {}", $name); - fail::fail_point!($name); + ::tracing::info!("at failpoint {}", $name); + ::fail::fail_point!($name); + } + }); + let cancel_fut = async move { + $cancel.cancelled().await; + }; + ::tokio::select! { + res = failpoint_fut => { + res.expect("spawn_blocking"); + // continue with execution + Ok(()) + }, + _ = cancel_fut => { + Err(()) } - }) - .await - .expect("spawn_blocking"); - } - }; - ($name:literal, $cond:expr) => { - if cfg!(feature = "testing") { - if $cond { - pausable_failpoint!($name) } + } else { + Ok(()) } - }; + }}; } +pub use pausable_failpoint; + /// use with fail::cfg("$name", "return(2000)") /// /// The effect is similar to a "sleep(2000)" action, i.e. we sleep for the diff --git a/libs/utils/src/guard_arc_swap.rs b/libs/utils/src/guard_arc_swap.rs new file mode 100644 index 0000000000..cec5202460 --- /dev/null +++ b/libs/utils/src/guard_arc_swap.rs @@ -0,0 +1,54 @@ +//! A wrapper around `ArcSwap` that ensures there is only one writer at a time and writes +//! don't block reads. 
+ +use arc_swap::ArcSwap; +use std::sync::Arc; +use tokio::sync::TryLockError; + +pub struct GuardArcSwap { + inner: ArcSwap, + guard: tokio::sync::Mutex<()>, +} + +pub struct Guard<'a, T> { + _guard: tokio::sync::MutexGuard<'a, ()>, + inner: &'a ArcSwap, +} + +impl GuardArcSwap { + pub fn new(inner: T) -> Self { + Self { + inner: ArcSwap::new(Arc::new(inner)), + guard: tokio::sync::Mutex::new(()), + } + } + + pub fn read(&self) -> Arc { + self.inner.load_full() + } + + pub async fn write_guard(&self) -> Guard<'_, T> { + Guard { + _guard: self.guard.lock().await, + inner: &self.inner, + } + } + + pub fn try_write_guard(&self) -> Result, TryLockError> { + let guard = self.guard.try_lock()?; + Ok(Guard { + _guard: guard, + inner: &self.inner, + }) + } +} + +impl Guard<'_, T> { + pub fn read(&self) -> Arc { + self.inner.load_full() + } + + pub fn write(&mut self, value: T) { + self.inner.store(Arc::new(value)); + } +} diff --git a/libs/utils/src/lib.rs b/libs/utils/src/lib.rs index 2c56dd750f..1fb18e9e9a 100644 --- a/libs/utils/src/lib.rs +++ b/libs/utils/src/lib.rs @@ -98,6 +98,8 @@ pub mod try_rcu; pub mod pprof; +pub mod guard_arc_swap; + // Re-export used in macro. Avoids adding git-version as dep in target crates. #[doc(hidden)] pub use git_version; diff --git a/libs/utils/src/sync/gate.rs b/libs/utils/src/sync/gate.rs index 16ec563fa7..0a1ed81621 100644 --- a/libs/utils/src/sync/gate.rs +++ b/libs/utils/src/sync/gate.rs @@ -64,6 +64,12 @@ pub struct GateGuard { gate: Arc, } +impl GateGuard { + pub fn try_clone(&self) -> Result { + Gate::enter_impl(self.gate.clone()) + } +} + impl Drop for GateGuard { fn drop(&mut self) { if self.gate.closing.load(Ordering::Relaxed) { @@ -107,11 +113,11 @@ impl Gate { /// to avoid blocking close() indefinitely: typically types that contain a Gate will /// also contain a CancellationToken. 
pub fn enter(&self) -> Result { - let permit = self - .inner - .sem - .try_acquire() - .map_err(|_| GateError::GateClosed)?; + Self::enter_impl(self.inner.clone()) + } + + fn enter_impl(gate: Arc) -> Result { + let permit = gate.sem.try_acquire().map_err(|_| GateError::GateClosed)?; // we now have the permit, let's disable the normal raii functionality and leave // "returning" the permit to our GateGuard::drop. @@ -122,7 +128,7 @@ impl Gate { Ok(GateGuard { span_at_enter: tracing::Span::current(), - gate: self.inner.clone(), + gate, }) } @@ -252,4 +258,39 @@ mod tests { // Attempting to enter() is still forbidden gate.enter().expect_err("enter should fail finishing close"); } + + #[tokio::test(start_paused = true)] + async fn clone_gate_guard() { + let gate = Gate::default(); + let forever = Duration::from_secs(24 * 7 * 365); + + let guard1 = gate.enter().expect("gate isn't closed"); + + let guard2 = guard1.try_clone().expect("gate isn't clsoed"); + + let mut close_fut = std::pin::pin!(gate.close()); + + tokio::time::timeout(forever, &mut close_fut) + .await + .unwrap_err(); + + // we polled close_fut once, that should prevent all later enters and clones + gate.enter().unwrap_err(); + guard1.try_clone().unwrap_err(); + guard2.try_clone().unwrap_err(); + + // guard2 keeps gate open even if guard1 is closed + drop(guard1); + tokio::time::timeout(forever, &mut close_fut) + .await + .unwrap_err(); + + drop(guard2); + + // now that the last guard is dropped, closing should complete + close_fut.await; + + // entering is still forbidden + gate.enter().expect_err("enter should stilll fail"); + } } diff --git a/libs/vm_monitor/src/dispatcher.rs b/libs/vm_monitor/src/dispatcher.rs index 6a965ace9b..c81848cb70 100644 --- a/libs/vm_monitor/src/dispatcher.rs +++ b/libs/vm_monitor/src/dispatcher.rs @@ -7,7 +7,7 @@ //! (notifying it of upscale). 
use anyhow::{bail, Context}; -use axum::extract::ws::{Message, WebSocket}; +use axum::extract::ws::{Message, Utf8Bytes, WebSocket}; use futures::{ stream::{SplitSink, SplitStream}, SinkExt, StreamExt, @@ -82,21 +82,21 @@ impl Dispatcher { let highest_shared_version = match monitor_range.highest_shared_version(&agent_range) { Ok(version) => { - sink.send(Message::Text( + sink.send(Message::Text(Utf8Bytes::from( serde_json::to_string(&ProtocolResponse::Version(version)).unwrap(), - )) + ))) .await .context("failed to notify agent of negotiated protocol version")?; version } Err(e) => { - sink.send(Message::Text( + sink.send(Message::Text(Utf8Bytes::from( serde_json::to_string(&ProtocolResponse::Error(format!( "Received protocol version range {} which does not overlap with {}", agent_range, monitor_range ))) .unwrap(), - )) + ))) .await .context("failed to notify agent of no overlap between protocol version ranges")?; Err(e).context("error determining suitable protocol version range")? @@ -126,7 +126,7 @@ impl Dispatcher { let json = serde_json::to_string(&message).context("failed to serialize message")?; self.sink - .send(Message::Text(json)) + .send(Message::Text(Utf8Bytes::from(json))) .await .context("stream error sending message") } diff --git a/libs/wal_decoder/Cargo.toml b/libs/wal_decoder/Cargo.toml index 09c4afb18a..cb0ef4b00d 100644 --- a/libs/wal_decoder/Cargo.toml +++ b/libs/wal_decoder/Cargo.toml @@ -17,7 +17,6 @@ postgres_ffi.workspace = true serde.workspace = true thiserror.workspace = true tokio = { workspace = true, features = ["io-util"] } -tonic.workspace = true tracing.workspace = true utils.workspace = true workspace_hack = { version = "0.1", path = "../../workspace_hack" } diff --git a/libs/wal_decoder/src/models.rs b/libs/wal_decoder/src/models.rs index c2f9125b21..51bf7e44ab 100644 --- a/libs/wal_decoder/src/models.rs +++ b/libs/wal_decoder/src/models.rs @@ -45,7 +45,7 @@ pub mod proto { #![allow(clippy::derive_partial_eq_without_eq)] // The 
generated ValueMeta has a `len` method generate for its `len` field. #![allow(clippy::len_without_is_empty)] - tonic::include_proto!("interpreted_wal"); + include!(concat!(env!("OUT_DIR"), concat!("/interpreted_wal.rs"))); } #[derive(Copy, Clone, Serialize, Deserialize)] diff --git a/pageserver/Cargo.toml b/pageserver/Cargo.toml index 9195951191..6e4eaa0efd 100644 --- a/pageserver/Cargo.toml +++ b/pageserver/Cargo.toml @@ -8,7 +8,7 @@ license.workspace = true default = [] # Enables test-only APIs, incuding failpoints. In particular, enables the `fail_point!` macro, # which adds some runtime cost to run tests on outage conditions -testing = ["fail/failpoints", "pageserver_api/testing", "wal_decoder/testing"] +testing = ["fail/failpoints", "pageserver_api/testing", "wal_decoder/testing", "pageserver_client/testing"] [dependencies] anyhow.workspace = true @@ -36,7 +36,7 @@ itertools.workspace = true md5.workspace = true nix.workspace = true # hack to get the number of worker threads tokio uses -num_cpus = { version = "1.15" } +num_cpus.workspace = true num-traits.workspace = true once_cell.workspace = true pin-project-lite.workspace = true @@ -114,3 +114,7 @@ harness = false [[bench]] name = "upload_queue" harness = false + +[[bin]] +name = "test_helper_slow_client_reads" +required-features = [ "testing" ] diff --git a/pageserver/client/Cargo.toml b/pageserver/client/Cargo.toml index d9b36bf3d4..f582d307a7 100644 --- a/pageserver/client/Cargo.toml +++ b/pageserver/client/Cargo.toml @@ -4,6 +4,9 @@ version = "0.1.0" edition.workspace = true license.workspace = true +[features] +testing = [ "pageserver_api/testing" ] + [dependencies] pageserver_api.workspace = true thiserror.workspace = true diff --git a/pageserver/client/src/mgmt_api.rs b/pageserver/client/src/mgmt_api.rs index 4e9b11879d..0359bfcd0b 100644 --- a/pageserver/client/src/mgmt_api.rs +++ b/pageserver/client/src/mgmt_api.rs @@ -763,4 +763,19 @@ impl Client { .await .map_err(Error::ReceiveBody) } + + pub 
async fn wait_lsn( + &self, + tenant_shard_id: TenantShardId, + request: TenantWaitLsnRequest, + ) -> Result { + let uri = format!( + "{}/v1/tenant/{tenant_shard_id}/wait_lsn", + self.mgmt_api_endpoint, + ); + + self.request_noerror(Method::POST, uri, request) + .await + .map(|resp| resp.status()) + } } diff --git a/pageserver/client/src/page_service.rs b/pageserver/client/src/page_service.rs index 207ec4166c..27280912b4 100644 --- a/pageserver/client/src/page_service.rs +++ b/pageserver/client/src/page_service.rs @@ -1,6 +1,9 @@ -use std::pin::Pin; +use std::sync::{Arc, Mutex}; -use futures::SinkExt; +use futures::{ + stream::{SplitSink, SplitStream}, + SinkExt, StreamExt, +}; use pageserver_api::{ models::{ PagestreamBeMessage, PagestreamFeMessage, PagestreamGetPageRequest, @@ -10,7 +13,6 @@ use pageserver_api::{ }; use tokio::task::JoinHandle; use tokio_postgres::CopyOutStream; -use tokio_stream::StreamExt; use tokio_util::sync::CancellationToken; use utils::{ id::{TenantId, TimelineId}, @@ -62,15 +64,28 @@ impl Client { .client .copy_both_simple(&format!("pagestream_v3 {tenant_id} {timeline_id}")) .await?; + let (sink, stream) = copy_both.split(); // TODO: actually support splitting of the CopyBothDuplex so the lock inside this split adaptor goes away. let Client { cancel_on_client_drop, conn_task, client: _, } = self; + let shared = Arc::new(Mutex::new(PagestreamShared::ConnTaskRunning( + ConnTaskRunning { + cancel_on_client_drop, + conn_task, + }, + ))); Ok(PagestreamClient { - copy_both: Box::pin(copy_both), - conn_task, - cancel_on_client_drop, + sink: PagestreamSender { + shared: shared.clone(), + sink, + }, + stream: PagestreamReceiver { + shared: shared.clone(), + stream, + }, + shared, }) } @@ -97,7 +112,28 @@ impl Client { /// Create using [`Client::pagestream`]. 
pub struct PagestreamClient { - copy_both: Pin>>, + shared: Arc>, + sink: PagestreamSender, + stream: PagestreamReceiver, +} + +pub struct PagestreamSender { + #[allow(dead_code)] + shared: Arc>, + sink: SplitSink, bytes::Bytes>, +} + +pub struct PagestreamReceiver { + #[allow(dead_code)] + shared: Arc>, + stream: SplitStream>, +} + +enum PagestreamShared { + ConnTaskRunning(ConnTaskRunning), + ConnTaskCancelledJoinHandleReturnedOrDropped, +} +struct ConnTaskRunning { cancel_on_client_drop: Option, conn_task: JoinHandle<()>, } @@ -110,11 +146,11 @@ pub struct RelTagBlockNo { impl PagestreamClient { pub async fn shutdown(self) { let Self { - copy_both, - cancel_on_client_drop: cancel_conn_task, - conn_task, - } = self; - // The `copy_both` contains internal channel sender, the receiver of which is polled by `conn_task`. + shared, + sink, + stream, + } = { self }; + // The `copy_both` split into `sink` and `stream` contains internal channel sender, the receiver of which is polled by `conn_task`. // When `conn_task` observes the sender has been dropped, it sends a `FeMessage::CopyFail` into the connection. // (see https://github.com/neondatabase/rust-postgres/blob/2005bf79573b8add5cf205b52a2b208e356cc8b0/tokio-postgres/src/copy_both.rs#L56). // @@ -131,27 +167,77 @@ impl PagestreamClient { // // NB: page_service doesn't have a use case to exit the `pagestream` mode currently. 
// => https://github.com/neondatabase/neon/issues/6390 - let _ = cancel_conn_task.unwrap(); + let ConnTaskRunning { + cancel_on_client_drop, + conn_task, + } = { + let mut guard = shared.lock().unwrap(); + match std::mem::replace( + &mut *guard, + PagestreamShared::ConnTaskCancelledJoinHandleReturnedOrDropped, + ) { + PagestreamShared::ConnTaskRunning(conn_task_running) => conn_task_running, + PagestreamShared::ConnTaskCancelledJoinHandleReturnedOrDropped => unreachable!(), + } + }; + let _ = cancel_on_client_drop.unwrap(); conn_task.await.unwrap(); - drop(copy_both); + + // Now drop the split copy_both. + drop(sink); + drop(stream); + } + + pub fn split(self) -> (PagestreamSender, PagestreamReceiver) { + let Self { + shared: _, + sink, + stream, + } = self; + (sink, stream) } pub async fn getpage( &mut self, req: PagestreamGetPageRequest, ) -> anyhow::Result { - let req = PagestreamFeMessage::GetPage(req); - let req: bytes::Bytes = req.serialize(); - // let mut req = tokio_util::io::ReaderStream::new(&req); - let mut req = tokio_stream::once(Ok(req)); + self.getpage_send(req).await?; + self.getpage_recv().await + } - self.copy_both.send_all(&mut req).await?; + pub async fn getpage_send(&mut self, req: PagestreamGetPageRequest) -> anyhow::Result<()> { + self.sink.getpage_send(req).await + } - let next: Option> = self.copy_both.next().await; + pub async fn getpage_recv(&mut self) -> anyhow::Result { + self.stream.getpage_recv().await + } +} + +impl PagestreamSender { + // TODO: maybe make this impl Sink instead for better composability? 
+ pub async fn send(&mut self, msg: PagestreamFeMessage) -> anyhow::Result<()> { + let msg = msg.serialize(); + self.sink.send_all(&mut tokio_stream::once(Ok(msg))).await?; + Ok(()) + } + + pub async fn getpage_send(&mut self, req: PagestreamGetPageRequest) -> anyhow::Result<()> { + self.send(PagestreamFeMessage::GetPage(req)).await + } +} + +impl PagestreamReceiver { + // TODO: maybe make this impl Stream instead for better composability? + pub async fn recv(&mut self) -> anyhow::Result { + let next: Option> = self.stream.next().await; let next: bytes::Bytes = next.unwrap()?; + PagestreamBeMessage::deserialize(next) + } - let msg = PagestreamBeMessage::deserialize(next)?; - match msg { + pub async fn getpage_recv(&mut self) -> anyhow::Result { + let next: PagestreamBeMessage = self.recv().await?; + match next { PagestreamBeMessage::GetPage(p) => Ok(p), PagestreamBeMessage::Error(e) => anyhow::bail!("Error: {:?}", e), PagestreamBeMessage::Exists(_) @@ -160,7 +246,14 @@ impl PagestreamClient { | PagestreamBeMessage::GetSlruSegment(_) => { anyhow::bail!( "unexpected be message kind in response to getpage request: {}", - msg.kind() + next.kind() + ) + } + #[cfg(feature = "testing")] + PagestreamBeMessage::Test(_) => { + anyhow::bail!( + "unexpected be message kind in response to getpage request: {}", + next.kind() ) } } diff --git a/pageserver/compaction/src/simulator/draw.rs b/pageserver/compaction/src/simulator/draw.rs index 997925067f..4559db09f1 100644 --- a/pageserver/compaction/src/simulator/draw.rs +++ b/pageserver/compaction/src/simulator/draw.rs @@ -160,9 +160,12 @@ pub fn draw_history(history: &[LayerTraceEvent], mut output: // Fill in and thicken rectangle if it's an // image layer so that we can see it. 
- let mut style = Style::default(); - style.fill = Fill::Color(rgb(0x80, 0x80, 0x80)); - style.stroke = Stroke::Color(rgb(0, 0, 0), 0.5); + let mut style = Style { + fill: Fill::Color(rgb(0x80, 0x80, 0x80)), + stroke: Stroke::Color(rgb(0, 0, 0), 0.5), + opacity: 1.0, + stroke_opacity: 1.0, + }; let y_start = lsn_max - lsn_start; let y_end = lsn_max - lsn_end; @@ -214,10 +217,6 @@ pub fn draw_history(history: &[LayerTraceEvent], mut output: files_seen.insert(f); } - let mut record_style = Style::default(); - record_style.fill = Fill::Color(rgb(0x80, 0x80, 0x80)); - record_style.stroke = Stroke::None; - writeln!(svg, "{}", EndSvg)?; let mut layer_events_str = String::new(); diff --git a/pageserver/pagebench/src/cmd/getpage_latest_lsn.rs b/pageserver/pagebench/src/cmd/getpage_latest_lsn.rs index 9f3984f1bd..a60efc7567 100644 --- a/pageserver/pagebench/src/cmd/getpage_latest_lsn.rs +++ b/pageserver/pagebench/src/cmd/getpage_latest_lsn.rs @@ -13,7 +13,7 @@ use rand::prelude::*; use tokio::task::JoinSet; use tracing::info; -use std::collections::HashSet; +use std::collections::{HashSet, VecDeque}; use std::future::Future; use std::num::NonZeroUsize; use std::pin::Pin; @@ -63,6 +63,10 @@ pub(crate) struct Args { #[clap(long)] set_io_mode: Option, + /// Queue depth generated in each client. 
+ #[clap(long, default_value = "1")] + queue_depth: NonZeroUsize, + targets: Option>, } @@ -298,6 +302,7 @@ async fn main_impl( start_work_barrier.wait().await; let client_start = Instant::now(); let mut ticks_processed = 0; + let mut inflight = VecDeque::new(); while !cancel.is_cancelled() { // Detect if a request took longer than the RPS rate if let Some(period) = &rps_period { @@ -311,31 +316,37 @@ async fn main_impl( ticks_processed = periods_passed_until_now; } - let start = Instant::now(); - let req = { - let mut rng = rand::thread_rng(); - let r = &ranges[weights.sample(&mut rng)]; - let key: i128 = rng.gen_range(r.start..r.end); - let key = Key::from_i128(key); - assert!(key.is_rel_block_key()); - let (rel_tag, block_no) = key - .to_rel_block() - .expect("we filter non-rel-block keys out above"); - PagestreamGetPageRequest { - hdr: PagestreamRequest { - reqid: 0, - request_lsn: if rng.gen_bool(args.req_latest_probability) { - Lsn::MAX - } else { - r.timeline_lsn + while inflight.len() < args.queue_depth.get() { + let start = Instant::now(); + let req = { + let mut rng = rand::thread_rng(); + let r = &ranges[weights.sample(&mut rng)]; + let key: i128 = rng.gen_range(r.start..r.end); + let key = Key::from_i128(key); + assert!(key.is_rel_block_key()); + let (rel_tag, block_no) = key + .to_rel_block() + .expect("we filter non-rel-block keys out above"); + PagestreamGetPageRequest { + hdr: PagestreamRequest { + reqid: 0, + request_lsn: if rng.gen_bool(args.req_latest_probability) { + Lsn::MAX + } else { + r.timeline_lsn + }, + not_modified_since: r.timeline_lsn, }, - not_modified_since: r.timeline_lsn, - }, - rel: rel_tag, - blkno: block_no, - } - }; - client.getpage(req).await.unwrap(); + rel: rel_tag, + blkno: block_no, + } + }; + client.getpage_send(req).await.unwrap(); + inflight.push_back(start); + } + + let start = inflight.pop_front().unwrap(); + client.getpage_recv().await.unwrap(); let end = Instant::now(); live_stats.request_done(); ticks_processed += 
1; diff --git a/pageserver/src/basebackup.rs b/pageserver/src/basebackup.rs index e1b5676f46..a6087920fd 100644 --- a/pageserver/src/basebackup.rs +++ b/pageserver/src/basebackup.rs @@ -25,6 +25,7 @@ use tokio_tar::{Builder, EntryType, Header}; use crate::context::RequestContext; use crate::pgdatadir_mapping::Version; +use crate::tenant::storage_layer::IoConcurrency; use crate::tenant::Timeline; use pageserver_api::reltag::{RelTag, SlruKind}; @@ -123,6 +124,13 @@ where full_backup, replica, ctx, + io_concurrency: IoConcurrency::spawn_from_conf( + timeline.conf, + timeline + .gate + .enter() + .map_err(|e| BasebackupError::Server(e.into()))?, + ), }; basebackup .send_tarball() @@ -144,6 +152,7 @@ where full_backup: bool, replica: bool, ctx: &'a RequestContext, + io_concurrency: IoConcurrency, } /// A sink that accepts SLRU blocks ordered by key and forwards @@ -303,7 +312,7 @@ where for part in slru_partitions.parts { let blocks = self .timeline - .get_vectored(part, self.lsn, self.ctx) + .get_vectored(part, self.lsn, self.io_concurrency.clone(), self.ctx) .await .map_err(|e| BasebackupError::Server(e.into()))?; @@ -358,7 +367,7 @@ where let start_time = Instant::now(); let aux_files = self .timeline - .list_aux_files(self.lsn, self.ctx) + .list_aux_files(self.lsn, self.ctx, self.io_concurrency.clone()) .await .map_err(|e| BasebackupError::Server(e.into()))?; let aux_scan_time = start_time.elapsed(); @@ -422,7 +431,7 @@ where } let repl_origins = self .timeline - .get_replorigins(self.lsn, self.ctx) + .get_replorigins(self.lsn, self.ctx, self.io_concurrency.clone()) .await .map_err(|e| BasebackupError::Server(e.into()))?; let n_origins = repl_origins.len(); @@ -489,7 +498,13 @@ where for blknum in startblk..endblk { let img = self .timeline - .get_rel_page_at_lsn(src, blknum, Version::Lsn(self.lsn), self.ctx) + .get_rel_page_at_lsn( + src, + blknum, + Version::Lsn(self.lsn), + self.ctx, + self.io_concurrency.clone(), + ) .await .map_err(|e| 
BasebackupError::Server(e.into()))?; segment_data.extend_from_slice(&img[..]); diff --git a/pageserver/src/bin/pageserver.rs b/pageserver/src/bin/pageserver.rs index 921c6a5092..5764728505 100644 --- a/pageserver/src/bin/pageserver.rs +++ b/pageserver/src/bin/pageserver.rs @@ -135,6 +135,7 @@ fn main() -> anyhow::Result<()> { info!(?conf.virtual_file_io_mode, "starting with virtual_file IO mode"); info!(?conf.wal_receiver_protocol, "starting with WAL receiver protocol"); info!(?conf.page_service_pipelining, "starting with page service pipelining config"); + info!(?conf.get_vectored_concurrent_io, "starting with get_vectored IO concurrency config"); // The tenants directory contains all the pageserver local disk state. // Create if not exists and make sure all the contents are durable before proceeding. diff --git a/pageserver/src/bin/test_helper_slow_client_reads.rs b/pageserver/src/bin/test_helper_slow_client_reads.rs new file mode 100644 index 0000000000..c1ce332b6c --- /dev/null +++ b/pageserver/src/bin/test_helper_slow_client_reads.rs @@ -0,0 +1,65 @@ +use std::{ + io::{stdin, stdout, Read, Write}, + time::Duration, +}; + +use clap::Parser; +use pageserver_api::models::{PagestreamRequest, PagestreamTestRequest}; +use utils::{ + id::{TenantId, TimelineId}, + lsn::Lsn, +}; + +#[derive(clap::Parser)] +struct Args { + connstr: String, + tenant_id: TenantId, + timeline_id: TimelineId, +} + +#[tokio::main] +async fn main() -> anyhow::Result<()> { + let Args { + connstr, + tenant_id, + timeline_id, + } = Args::parse(); + let client = pageserver_client::page_service::Client::new(connstr).await?; + let client = client.pagestream(tenant_id, timeline_id).await?; + let (mut sender, _receiver) = client.split(); + + eprintln!("filling the pipe"); + let mut msg = 0; + loop { + msg += 1; + let fut = sender.send(pageserver_api::models::PagestreamFeMessage::Test( + PagestreamTestRequest { + hdr: PagestreamRequest { + reqid: 0, + request_lsn: Lsn(23), + not_modified_since: 
Lsn(23), + }, + batch_key: 42, + message: format!("message {}", msg), + }, + )); + let Ok(res) = tokio::time::timeout(Duration::from_secs(10), fut).await else { + eprintln!("pipe seems full"); + break; + }; + let _: () = res?; + } + + let n = stdout().write(b"R")?; + assert_eq!(n, 1); + stdout().flush()?; + + eprintln!("waiting for signal to tell us to exit"); + + let mut buf = [0u8; 1]; + stdin().read_exact(&mut buf)?; + + eprintln!("termination signal received, exiting"); + + anyhow::Ok(()) +} diff --git a/pageserver/src/config.rs b/pageserver/src/config.rs index 1651db8500..ce480c70a0 100644 --- a/pageserver/src/config.rs +++ b/pageserver/src/config.rs @@ -191,6 +191,8 @@ pub struct PageServerConf { pub wal_receiver_protocol: PostgresClientProtocol, pub page_service_pipelining: pageserver_api::config::PageServicePipeliningConfig, + + pub get_vectored_concurrent_io: pageserver_api::config::GetVectoredConcurrentIo, } /// Token for authentication to safekeepers @@ -352,6 +354,7 @@ impl PageServerConf { no_sync, wal_receiver_protocol, page_service_pipelining, + get_vectored_concurrent_io, } = config_toml; let mut conf = PageServerConf { @@ -396,6 +399,7 @@ impl PageServerConf { import_pgdata_aws_endpoint_url, wal_receiver_protocol, page_service_pipelining, + get_vectored_concurrent_io, // ------------------------------------------------------------ // fields that require additional validation or custom handling diff --git a/pageserver/src/http/openapi_spec.yml b/pageserver/src/http/openapi_spec.yml index ee43440534..4b976e7f6f 100644 --- a/pageserver/src/http/openapi_spec.yml +++ b/pageserver/src/http/openapi_spec.yml @@ -984,6 +984,8 @@ components: type: string compaction_threshold: type: string + compaction_upper_limit: + type: string image_creation_threshold: type: integer walreceiver_connect_timeout: diff --git a/pageserver/src/http/routes.rs b/pageserver/src/http/routes.rs index 33b2d04588..0f3e9fdab6 100644 --- a/pageserver/src/http/routes.rs +++ 
b/pageserver/src/http/routes.rs @@ -10,6 +10,7 @@ use std::time::Duration; use anyhow::{anyhow, Context, Result}; use enumset::EnumSet; +use futures::future::join_all; use futures::StreamExt; use futures::TryFutureExt; use humantime::format_rfc3339; @@ -40,6 +41,7 @@ use pageserver_api::models::TenantShardSplitRequest; use pageserver_api::models::TenantShardSplitResponse; use pageserver_api::models::TenantSorting; use pageserver_api::models::TenantState; +use pageserver_api::models::TenantWaitLsnRequest; use pageserver_api::models::TimelineArchivalConfigRequest; use pageserver_api::models::TimelineCreateRequestMode; use pageserver_api::models::TimelineCreateRequestModeImportPgdata; @@ -84,6 +86,7 @@ use crate::tenant::remote_timeline_client::list_remote_tenant_shards; use crate::tenant::remote_timeline_client::list_remote_timelines; use crate::tenant::secondary::SecondaryController; use crate::tenant::size::ModelInputs; +use crate::tenant::storage_layer::IoConcurrency; use crate::tenant::storage_layer::LayerAccessStatsReset; use crate::tenant::storage_layer::LayerName; use crate::tenant::timeline::import_pgdata; @@ -94,6 +97,8 @@ use crate::tenant::timeline::CompactOptions; use crate::tenant::timeline::CompactRequest; use crate::tenant::timeline::CompactionError; use crate::tenant::timeline::Timeline; +use crate::tenant::timeline::WaitLsnTimeout; +use crate::tenant::timeline::WaitLsnWaiter; use crate::tenant::GetTimelineError; use crate::tenant::OffloadedTimeline; use crate::tenant::{LogicalSizeCalculationCause, PageReconstructError}; @@ -2789,6 +2794,63 @@ async fn secondary_download_handler( json_response(status, progress) } +async fn wait_lsn_handler( + mut request: Request, + cancel: CancellationToken, +) -> Result, ApiError> { + let tenant_shard_id: TenantShardId = parse_request_param(&request, "tenant_shard_id")?; + let wait_lsn_request: TenantWaitLsnRequest = json_request(&mut request).await?; + + let state = get_state(&request); + let tenant = state + 
.tenant_manager + .get_attached_tenant_shard(tenant_shard_id)?; + + let mut wait_futures = Vec::default(); + for timeline in tenant.list_timelines() { + let Some(lsn) = wait_lsn_request.timelines.get(&timeline.timeline_id) else { + continue; + }; + + let fut = { + let timeline = timeline.clone(); + let ctx = RequestContext::new(TaskKind::MgmtRequest, DownloadBehavior::Error); + async move { + timeline + .wait_lsn( + *lsn, + WaitLsnWaiter::HttpEndpoint, + WaitLsnTimeout::Custom(wait_lsn_request.timeout), + &ctx, + ) + .await + } + }; + wait_futures.push(fut); + } + + if wait_futures.is_empty() { + return json_response(StatusCode::NOT_FOUND, ()); + } + + let all_done = tokio::select! { + results = join_all(wait_futures) => { + results.iter().all(|res| res.is_ok()) + }, + _ = cancel.cancelled() => { + return Err(ApiError::Cancelled); + } + }; + + let status = if all_done { + StatusCode::OK + } else { + StatusCode::ACCEPTED + }; + + json_response(status, ()) +} + async fn secondary_status_handler( request: Request, _cancel: CancellationToken, @@ -2938,8 +3000,15 @@ async fn list_aux_files( active_timeline_of_active_tenant(&state.tenant_manager, tenant_shard_id, timeline_id) .await?; + let io_concurrency = IoConcurrency::spawn_from_conf( + state.conf, + timeline.gate.enter().map_err(|_| ApiError::Cancelled)?, + ); + let ctx = RequestContext::new(TaskKind::MgmtRequest, DownloadBehavior::Download); - let files = timeline.list_aux_files(body.lsn, &ctx).await?; + let files = timeline + .list_aux_files(body.lsn, &ctx, io_concurrency) + .await?; json_response(StatusCode::OK, files) } @@ -3569,6 +3638,9 @@ pub fn make_router( .post("/v1/tenant/:tenant_shard_id/secondary/download", |r| { api_handler(r, secondary_download_handler) }) + .post("/v1/tenant/:tenant_shard_id/wait_lsn", |r| { + api_handler(r, wait_lsn_handler) + }) .put("/v1/tenant/:tenant_shard_id/break", |r| { testing_api_handler("set tenant state to broken", r, handle_tenant_break) }) diff --git 
a/pageserver/src/metrics.rs b/pageserver/src/metrics.rs index 5b1cbbad63..77c0967afc 100644 --- a/pageserver/src/metrics.rs +++ b/pageserver/src/metrics.rs @@ -1,4 +1,13 @@ +use std::collections::HashMap; +use std::num::NonZeroUsize; +use std::pin::Pin; +use std::sync::atomic::AtomicU64; +use std::sync::{Arc, Mutex}; +use std::task::{Context, Poll}; +use std::time::{Duration, Instant}; + use enum_map::EnumMap; +use futures::Future; use metrics::{ register_counter_vec, register_gauge_vec, register_histogram, register_histogram_vec, register_int_counter, register_int_counter_pair_vec, register_int_counter_vec, @@ -11,13 +20,26 @@ use pageserver_api::config::{ PageServicePipeliningConfig, PageServicePipeliningConfigPipelined, PageServiceProtocolPipelinedExecutionStrategy, }; +use pageserver_api::models::InMemoryLayerInfo; use pageserver_api::shard::TenantShardId; +use pin_project_lite::pin_project; use postgres_backend::{is_expected_io_error, QueryError}; use pq_proto::framed::ConnectionError; -use strum::{EnumCount, VariantNames}; + +use strum::{EnumCount, IntoEnumIterator as _, VariantNames}; use strum_macros::{IntoStaticStr, VariantNames}; use utils::id::TimelineId; +use crate::config::PageServerConf; +use crate::context::{PageContentKind, RequestContext}; +use crate::task_mgr::TaskKind; +use crate::tenant::layer_map::LayerMap; +use crate::tenant::mgr::TenantSlot; +use crate::tenant::storage_layer::{InMemoryLayer, PersistentLayerDesc}; +use crate::tenant::tasks::BackgroundLoopKind; +use crate::tenant::throttle::ThrottleResult; +use crate::tenant::Timeline; + /// Prometheus histogram buckets (in seconds) for operations in the critical /// path. In other words, operations that directly affect that latency of user /// queries. 
@@ -38,6 +60,9 @@ pub(crate) enum StorageTimeOperation { #[strum(serialize = "layer flush")] LayerFlush, + #[strum(serialize = "layer flush delay")] + LayerFlushDelay, + #[strum(serialize = "compact")] Compact, @@ -100,71 +125,30 @@ pub(crate) static VEC_READ_NUM_LAYERS_VISITED: Lazy = Lazy::new(|| { .expect("failed to define a metric") }); -// Metrics collected on operations on the storage repository. -#[derive( - Clone, Copy, enum_map::Enum, strum_macros::EnumString, strum_macros::Display, IntoStaticStr, -)] -pub(crate) enum GetKind { - Singular, - Vectored, -} - -pub(crate) struct ReconstructTimeMetrics { - singular: Histogram, - vectored: Histogram, -} - -pub(crate) static RECONSTRUCT_TIME: Lazy = Lazy::new(|| { - let inner = register_histogram_vec!( - "pageserver_getpage_reconstruct_seconds", - "Time spent in reconstruct_value (reconstruct a page from deltas)", - &["get_kind"], - CRITICAL_OP_BUCKETS.into(), +pub(crate) static CONCURRENT_INITDBS: Lazy = Lazy::new(|| { + register_uint_gauge!( + "pageserver_concurrent_initdb", + "Number of initdb processes running" ) - .expect("failed to define a metric"); - - ReconstructTimeMetrics { - singular: inner.with_label_values(&[GetKind::Singular.into()]), - vectored: inner.with_label_values(&[GetKind::Vectored.into()]), - } + .expect("failed to define a metric") }); -impl ReconstructTimeMetrics { - pub(crate) fn for_get_kind(&self, get_kind: GetKind) -> &Histogram { - match get_kind { - GetKind::Singular => &self.singular, - GetKind::Vectored => &self.vectored, - } - } -} - -pub(crate) struct ReconstructDataTimeMetrics { - singular: Histogram, - vectored: Histogram, -} - -impl ReconstructDataTimeMetrics { - pub(crate) fn for_get_kind(&self, get_kind: GetKind) -> &Histogram { - match get_kind { - GetKind::Singular => &self.singular, - GetKind::Vectored => &self.vectored, - } - } -} - -pub(crate) static GET_RECONSTRUCT_DATA_TIME: Lazy = Lazy::new(|| { - let inner = register_histogram_vec!( - 
"pageserver_getpage_get_reconstruct_data_seconds", - "Time spent in get_reconstruct_value_data", - &["get_kind"], - CRITICAL_OP_BUCKETS.into(), +pub(crate) static INITDB_SEMAPHORE_ACQUISITION_TIME: Lazy = Lazy::new(|| { + register_histogram!( + "pageserver_initdb_semaphore_seconds_global", + "Time spent getting a permit from the global initdb semaphore", + STORAGE_OP_BUCKETS.into() ) - .expect("failed to define a metric"); + .expect("failed to define metric") +}); - ReconstructDataTimeMetrics { - singular: inner.with_label_values(&[GetKind::Singular.into()]), - vectored: inner.with_label_values(&[GetKind::Vectored.into()]), - } +pub(crate) static INITDB_RUN_TIME: Lazy = Lazy::new(|| { + register_histogram!( + "pageserver_initdb_seconds_global", + "Time spent performing initdb", + STORAGE_OP_BUCKETS.into() + ) + .expect("failed to define metric") }); pub(crate) struct GetVectoredLatency { @@ -481,18 +465,38 @@ static PITR_HISTORY_SIZE: Lazy = Lazy::new(|| { .expect("failed to define a metric") }); -#[derive(strum_macros::EnumString, strum_macros::Display, strum_macros::IntoStaticStr)] +#[derive( + strum_macros::EnumIter, + strum_macros::EnumString, + strum_macros::Display, + strum_macros::IntoStaticStr, +)] #[strum(serialize_all = "kebab_case")] -pub(crate) enum MetricLayerKind { +pub(crate) enum LayerKind { Delta, Image, } +#[derive( + strum_macros::EnumIter, + strum_macros::EnumString, + strum_macros::Display, + strum_macros::IntoStaticStr, +)] +#[strum(serialize_all = "kebab_case")] +pub(crate) enum LayerLevel { + // We don't track the currently open ephemeral layer, since there's always exactly 1 and its + // size changes. See `TIMELINE_EPHEMERAL_BYTES`. 
+ Frozen, + L0, + L1, +} + static TIMELINE_LAYER_SIZE: Lazy = Lazy::new(|| { register_uint_gauge_vec!( "pageserver_layer_bytes", - "Sum of layer physical sizes in bytes", - &["tenant_id", "shard_id", "timeline_id", "kind"] + "Sum of frozen, L0, and L1 layer physical sizes in bytes (excluding the open ephemeral layer)", + &["tenant_id", "shard_id", "timeline_id", "level", "kind"] ) .expect("failed to define a metric") }); @@ -500,8 +504,8 @@ static TIMELINE_LAYER_SIZE: Lazy = Lazy::new(|| { static TIMELINE_LAYER_COUNT: Lazy = Lazy::new(|| { register_uint_gauge_vec!( "pageserver_layer_count", - "Number of layers that exist", - &["tenant_id", "shard_id", "timeline_id", "kind"] + "Number of frozen, L0, and L1 layers (excluding the open ephemeral layer)", + &["tenant_id", "shard_id", "timeline_id", "level", "kind"] ) .expect("failed to define a metric") }); @@ -1463,6 +1467,8 @@ pub enum SmgrQueryType { GetPageAtLsn, GetDbSize, GetSlruSegment, + #[cfg(feature = "testing")] + Test, } pub(crate) struct SmgrQueryTimePerTimeline { @@ -2522,12 +2528,19 @@ impl StorageTimeMetricsTimer { } } - /// Record the time from creation to now. - pub fn stop_and_record(self) { - let duration = self.start.elapsed().as_secs_f64(); - self.metrics.timeline_sum.inc_by(duration); + /// Returns the elapsed duration of the timer. + pub fn elapsed(&self) -> Duration { + self.start.elapsed() + } + + /// Record the time from creation to now and return it. 
+ pub fn stop_and_record(self) -> Duration { + let duration = self.elapsed(); + let seconds = duration.as_secs_f64(); + self.metrics.timeline_sum.inc_by(seconds); self.metrics.timeline_count.inc(); - self.metrics.global_histogram.observe(duration); + self.metrics.global_histogram.observe(seconds); + duration } /// Turns this timer into a timer, which will always record -- usually this means recording @@ -2547,6 +2560,13 @@ impl Drop for AlwaysRecordingStorageTimeMetricsTimer { } } +impl AlwaysRecordingStorageTimeMetricsTimer { + /// Returns the elapsed duration of the timer. + pub fn elapsed(&self) -> Duration { + self.0.as_ref().expect("not dropped yet").elapsed() + } +} + /// Timing facilities for an globally histogrammed metric, which is supported by per tenant and /// timeline total sum and count. #[derive(Clone, Debug)] @@ -2599,6 +2619,7 @@ pub(crate) struct TimelineMetrics { shard_id: String, timeline_id: String, pub flush_time_histo: StorageTimeMetrics, + pub flush_delay_histo: StorageTimeMetrics, pub flush_wait_upload_time_gauge: Gauge, pub compact_time_histo: StorageTimeMetrics, pub create_images_time_histo: StorageTimeMetrics, @@ -2611,10 +2632,6 @@ pub(crate) struct TimelineMetrics { pub disk_consistent_lsn_gauge: IntGauge, pub pitr_history_size: UIntGauge, pub archival_size: UIntGauge, - pub(crate) layer_size_image: UIntGauge, - pub(crate) layer_count_image: UIntGauge, - pub(crate) layer_size_delta: UIntGauge, - pub(crate) layer_count_delta: UIntGauge, pub standby_horizon_gauge: IntGauge, pub resident_physical_size_gauge: UIntGauge, pub visible_physical_size_gauge: UIntGauge, @@ -2645,6 +2662,12 @@ impl TimelineMetrics { &shard_id, &timeline_id, ); + let flush_delay_histo = StorageTimeMetrics::new( + StorageTimeOperation::LayerFlushDelay, + &tenant_id, + &shard_id, + &timeline_id, + ); let flush_wait_upload_time_gauge = FLUSH_WAIT_UPLOAD_TIME .get_metric_with_label_values(&[&tenant_id, &shard_id, &timeline_id]) .unwrap(); @@ -2706,42 +2729,6 @@ impl 
TimelineMetrics { .get_metric_with_label_values(&[&tenant_id, &shard_id, &timeline_id]) .unwrap(); - let layer_size_image = TIMELINE_LAYER_SIZE - .get_metric_with_label_values(&[ - &tenant_id, - &shard_id, - &timeline_id, - MetricLayerKind::Image.into(), - ]) - .unwrap(); - - let layer_count_image = TIMELINE_LAYER_COUNT - .get_metric_with_label_values(&[ - &tenant_id, - &shard_id, - &timeline_id, - MetricLayerKind::Image.into(), - ]) - .unwrap(); - - let layer_size_delta = TIMELINE_LAYER_SIZE - .get_metric_with_label_values(&[ - &tenant_id, - &shard_id, - &timeline_id, - MetricLayerKind::Delta.into(), - ]) - .unwrap(); - - let layer_count_delta = TIMELINE_LAYER_COUNT - .get_metric_with_label_values(&[ - &tenant_id, - &shard_id, - &timeline_id, - MetricLayerKind::Delta.into(), - ]) - .unwrap(); - let standby_horizon_gauge = STANDBY_HORIZON .get_metric_with_label_values(&[&tenant_id, &shard_id, &timeline_id]) .unwrap(); @@ -2793,6 +2780,7 @@ impl TimelineMetrics { shard_id, timeline_id, flush_time_histo, + flush_delay_histo, flush_wait_upload_time_gauge, compact_time_histo, create_images_time_histo, @@ -2805,10 +2793,6 @@ impl TimelineMetrics { disk_consistent_lsn_gauge, pitr_history_size, archival_size, - layer_size_image, - layer_count_image, - layer_size_delta, - layer_count_delta, standby_horizon_gauge, resident_physical_size_gauge, visible_physical_size_gauge, @@ -2851,6 +2835,92 @@ impl TimelineMetrics { .add(duration); } + /// Generates TIMELINE_LAYER labels for a persistent layer. 
+ fn make_layer_labels(&self, layer_desc: &PersistentLayerDesc) -> [&str; 5] { + let level = match LayerMap::is_l0(&layer_desc.key_range, layer_desc.is_delta()) { + true => LayerLevel::L0, + false => LayerLevel::L1, + }; + let kind = match layer_desc.is_delta() { + true => LayerKind::Delta, + false => LayerKind::Image, + }; + [ + &self.tenant_id, + &self.shard_id, + &self.timeline_id, + level.into(), + kind.into(), + ] + } + + /// Generates TIMELINE_LAYER labels for a frozen ephemeral layer. + fn make_frozen_layer_labels(&self, _layer: &InMemoryLayer) -> [&str; 5] { + [ + &self.tenant_id, + &self.shard_id, + &self.timeline_id, + LayerLevel::Frozen.into(), + LayerKind::Delta.into(), // by definition + ] + } + + /// Removes a frozen ephemeral layer to TIMELINE_LAYER metrics. + pub fn dec_frozen_layer(&self, layer: &InMemoryLayer) { + assert!(matches!(layer.info(), InMemoryLayerInfo::Frozen { .. })); + let labels = self.make_frozen_layer_labels(layer); + let size = layer.try_len().expect("frozen layer should have no writer"); + TIMELINE_LAYER_COUNT + .get_metric_with_label_values(&labels) + .unwrap() + .dec(); + TIMELINE_LAYER_SIZE + .get_metric_with_label_values(&labels) + .unwrap() + .sub(size); + } + + /// Adds a frozen ephemeral layer to TIMELINE_LAYER metrics. + pub fn inc_frozen_layer(&self, layer: &InMemoryLayer) { + assert!(matches!(layer.info(), InMemoryLayerInfo::Frozen { .. })); + let labels = self.make_frozen_layer_labels(layer); + let size = layer.try_len().expect("frozen layer should have no writer"); + TIMELINE_LAYER_COUNT + .get_metric_with_label_values(&labels) + .unwrap() + .inc(); + TIMELINE_LAYER_SIZE + .get_metric_with_label_values(&labels) + .unwrap() + .add(size); + } + + /// Removes a persistent layer from TIMELINE_LAYER metrics. 
+ pub fn dec_layer(&self, layer_desc: &PersistentLayerDesc) { + let labels = self.make_layer_labels(layer_desc); + TIMELINE_LAYER_COUNT + .get_metric_with_label_values(&labels) + .unwrap() + .dec(); + TIMELINE_LAYER_SIZE + .get_metric_with_label_values(&labels) + .unwrap() + .sub(layer_desc.file_size); + } + + /// Adds a persistent layer to TIMELINE_LAYER metrics. + pub fn inc_layer(&self, layer_desc: &PersistentLayerDesc) { + let labels = self.make_layer_labels(layer_desc); + TIMELINE_LAYER_COUNT + .get_metric_with_label_values(&labels) + .unwrap() + .inc(); + TIMELINE_LAYER_SIZE + .get_metric_with_label_values(&labels) + .unwrap() + .add(layer_desc.file_size); + } + pub(crate) fn shutdown(&self) { let was_shutdown = self .shutdown @@ -2883,30 +2953,14 @@ impl TimelineMetrics { let _ = TIMELINE_ARCHIVE_SIZE.remove_label_values(&[tenant_id, shard_id, timeline_id]); let _ = PITR_HISTORY_SIZE.remove_label_values(&[tenant_id, shard_id, timeline_id]); - let _ = TIMELINE_LAYER_SIZE.remove_label_values(&[ - tenant_id, - shard_id, - timeline_id, - MetricLayerKind::Image.into(), - ]); - let _ = TIMELINE_LAYER_COUNT.remove_label_values(&[ - tenant_id, - shard_id, - timeline_id, - MetricLayerKind::Image.into(), - ]); - let _ = TIMELINE_LAYER_SIZE.remove_label_values(&[ - tenant_id, - shard_id, - timeline_id, - MetricLayerKind::Delta.into(), - ]); - let _ = TIMELINE_LAYER_COUNT.remove_label_values(&[ - tenant_id, - shard_id, - timeline_id, - MetricLayerKind::Delta.into(), - ]); + for ref level in LayerLevel::iter() { + for ref kind in LayerKind::iter() { + let labels: [&str; 5] = + [tenant_id, shard_id, timeline_id, level.into(), kind.into()]; + let _ = TIMELINE_LAYER_SIZE.remove_label_values(&labels); + let _ = TIMELINE_LAYER_COUNT.remove_label_values(&labels); + } + } let _ = EVICTIONS.remove_label_values(&[tenant_id, shard_id, timeline_id]); let _ = AUX_FILE_SIZE.remove_label_values(&[tenant_id, shard_id, timeline_id]); @@ -2988,24 +3042,6 @@ pub(crate) fn 
remove_tenant_metrics(tenant_shard_id: &TenantShardId) { // we leave the BROKEN_TENANTS_SET entry if any } -use futures::Future; -use pin_project_lite::pin_project; -use std::collections::HashMap; -use std::num::NonZeroUsize; -use std::pin::Pin; -use std::sync::atomic::AtomicU64; -use std::sync::{Arc, Mutex}; -use std::task::{Context, Poll}; -use std::time::{Duration, Instant}; - -use crate::config::PageServerConf; -use crate::context::{PageContentKind, RequestContext}; -use crate::task_mgr::TaskKind; -use crate::tenant::mgr::TenantSlot; -use crate::tenant::tasks::BackgroundLoopKind; -use crate::tenant::throttle::ThrottleResult; -use crate::tenant::Timeline; - /// Maintain a per timeline gauge in addition to the global gauge. pub(crate) struct PerTimelineRemotePhysicalSizeGauge { last_set: AtomicU64, @@ -3891,7 +3927,6 @@ pub fn preinitialize_metrics(conf: &'static PageServerConf) { }); // Custom - Lazy::force(&RECONSTRUCT_TIME); Lazy::force(&BASEBACKUP_QUERY_TIME); Lazy::force(&COMPUTE_COMMANDS_COUNTERS); Lazy::force(&tokio_epoll_uring::THREAD_LOCAL_METRICS_STORAGE); diff --git a/pageserver/src/page_service.rs b/pageserver/src/page_service.rs index da4180a927..e103338c7c 100644 --- a/pageserver/src/page_service.rs +++ b/pageserver/src/page_service.rs @@ -39,6 +39,7 @@ use tokio::io::{AsyncWriteExt, BufWriter}; use tokio::task::JoinHandle; use tokio_util::sync::CancellationToken; use tracing::*; +use utils::sync::gate::{Gate, GateGuard}; use utils::sync::spsc_fold; use utils::{ auth::{Claims, Scope, SwappableJwtAuth}, @@ -61,6 +62,7 @@ use crate::task_mgr::{self, COMPUTE_REQUEST_RUNTIME}; use crate::tenant::mgr::ShardSelector; use crate::tenant::mgr::TenantManager; use crate::tenant::mgr::{GetActiveTenantError, GetTenantError, ShardResolveResult}; +use crate::tenant::storage_layer::IoConcurrency; use crate::tenant::timeline::{self, WaitLsnError}; use crate::tenant::GetTimelineError; use crate::tenant::PageReconstructError; @@ -90,6 +92,7 @@ pub struct Listener { 
pub struct Connections { cancel: CancellationToken, tasks: tokio::task::JoinSet, + gate: Gate, } pub fn spawn( @@ -110,6 +113,7 @@ pub fn spawn( let task = COMPUTE_REQUEST_RUNTIME.spawn(task_mgr::exit_on_panic_or_error( "libpq listener", libpq_listener_main( + conf, tenant_manager, pg_auth, tcp_listener, @@ -134,11 +138,16 @@ impl Listener { } impl Connections { pub(crate) async fn shutdown(self) { - let Self { cancel, mut tasks } = self; + let Self { + cancel, + mut tasks, + gate, + } = self; cancel.cancel(); while let Some(res) = tasks.join_next().await { Self::handle_connection_completion(res); } + gate.close().await; } fn handle_connection_completion(res: Result, tokio::task::JoinError>) { @@ -158,7 +167,9 @@ impl Connections { /// Returns Ok(()) upon cancellation via `cancel`, returning the set of /// open connections. /// +#[allow(clippy::too_many_arguments)] pub async fn libpq_listener_main( + conf: &'static PageServerConf, tenant_manager: Arc, auth: Option>, listener: tokio::net::TcpListener, @@ -168,9 +179,15 @@ pub async fn libpq_listener_main( listener_cancel: CancellationToken, ) -> Connections { let connections_cancel = CancellationToken::new(); + let connections_gate = Gate::default(); let mut connection_handler_tasks = tokio::task::JoinSet::default(); loop { + let gate_guard = match connections_gate.enter() { + Ok(guard) => guard, + Err(_) => break, + }; + let accepted = tokio::select! 
{ biased; _ = listener_cancel.cancelled() => break, @@ -190,6 +207,7 @@ pub async fn libpq_listener_main( let connection_ctx = listener_ctx .detached_child(TaskKind::PageRequestHandler, DownloadBehavior::Download); connection_handler_tasks.spawn(page_service_conn_main( + conf, tenant_manager.clone(), local_auth, socket, @@ -197,6 +215,7 @@ pub async fn libpq_listener_main( pipelining_config.clone(), connection_ctx, connections_cancel.child_token(), + gate_guard, )); } Err(err) => { @@ -211,13 +230,16 @@ pub async fn libpq_listener_main( Connections { cancel: connections_cancel, tasks: connection_handler_tasks, + gate: connections_gate, } } type ConnectionHandlerResult = anyhow::Result<()>; #[instrument(skip_all, fields(peer_addr))] +#[allow(clippy::too_many_arguments)] async fn page_service_conn_main( + conf: &'static PageServerConf, tenant_manager: Arc, auth: Option>, socket: tokio::net::TcpStream, @@ -225,6 +247,7 @@ async fn page_service_conn_main( pipelining_config: PageServicePipeliningConfig, connection_ctx: RequestContext, cancel: CancellationToken, + gate_guard: GateGuard, ) -> ConnectionHandlerResult { let _guard = LIVE_CONNECTIONS .with_label_values(&["page_service"]) @@ -274,11 +297,13 @@ async fn page_service_conn_main( // But it's in a shared crate, so, we store connection_ctx inside PageServerHandler // and create the per-query context in process_query ourselves. 
let mut conn_handler = PageServerHandler::new( + conf, tenant_manager, auth, pipelining_config, connection_ctx, cancel.clone(), + gate_guard, ); let pgbackend = PostgresBackend::new_from_io(socket, peer_addr, auth_type, None)?; @@ -310,6 +335,7 @@ async fn page_service_conn_main( } struct PageServerHandler { + conf: &'static PageServerConf, auth: Option>, claims: Option, @@ -325,6 +351,8 @@ struct PageServerHandler { timeline_handles: Option, pipelining_config: PageServicePipeliningConfig, + + gate_guard: GateGuard, } struct TimelineHandles { @@ -555,37 +583,52 @@ struct BatchedGetPageRequest { timer: SmgrOpTimer, } +#[cfg(feature = "testing")] +struct BatchedTestRequest { + req: models::PagestreamTestRequest, + timer: SmgrOpTimer, +} + +/// NB: we only hold [`timeline::handle::WeakHandle`] inside this enum, +/// so that we don't keep the [`Timeline::gate`] open while the batch +/// is being built up inside the [`spsc_fold`] (pagestream pipelining). enum BatchedFeMessage { Exists { span: Span, timer: SmgrOpTimer, - shard: timeline::handle::Handle, + shard: timeline::handle::WeakHandle, req: models::PagestreamExistsRequest, }, Nblocks { span: Span, timer: SmgrOpTimer, - shard: timeline::handle::Handle, + shard: timeline::handle::WeakHandle, req: models::PagestreamNblocksRequest, }, GetPage { span: Span, - shard: timeline::handle::Handle, + shard: timeline::handle::WeakHandle, effective_request_lsn: Lsn, pages: smallvec::SmallVec<[BatchedGetPageRequest; 1]>, }, DbSize { span: Span, timer: SmgrOpTimer, - shard: timeline::handle::Handle, + shard: timeline::handle::WeakHandle, req: models::PagestreamDbSizeRequest, }, GetSlruSegment { span: Span, timer: SmgrOpTimer, - shard: timeline::handle::Handle, + shard: timeline::handle::WeakHandle, req: models::PagestreamGetSlruSegmentRequest, }, + #[cfg(feature = "testing")] + Test { + span: Span, + shard: timeline::handle::WeakHandle, + requests: Vec, + }, RespondError { span: Span, error: BatchedPageStreamError, @@ -606,6 
+649,12 @@ impl BatchedFeMessage { page.timer.observe_execution_start(at); } } + #[cfg(feature = "testing")] + BatchedFeMessage::Test { requests, .. } => { + for req in requests { + req.timer.observe_execution_start(at); + } + } BatchedFeMessage::RespondError { .. } => {} } } @@ -613,19 +662,23 @@ impl BatchedFeMessage { impl PageServerHandler { pub fn new( + conf: &'static PageServerConf, tenant_manager: Arc, auth: Option>, pipelining_config: PageServicePipeliningConfig, connection_ctx: RequestContext, cancel: CancellationToken, + gate_guard: GateGuard, ) -> Self { PageServerHandler { + conf, auth, claims: None, connection_ctx, timeline_handles: Some(TimelineHandles::new(tenant_manager)), cancel, pipelining_config, + gate_guard, } } @@ -735,7 +788,7 @@ impl PageServerHandler { BatchedFeMessage::Exists { span, timer, - shard, + shard: shard.downgrade(), req, } } @@ -754,7 +807,7 @@ impl PageServerHandler { BatchedFeMessage::Nblocks { span, timer, - shard, + shard: shard.downgrade(), req, } } @@ -773,7 +826,7 @@ impl PageServerHandler { BatchedFeMessage::DbSize { span, timer, - shard, + shard: shard.downgrade(), req, } } @@ -792,7 +845,7 @@ impl PageServerHandler { BatchedFeMessage::GetSlruSegment { span, timer, - shard, + shard: shard.downgrade(), req, } } @@ -844,6 +897,7 @@ impl PageServerHandler { ) .await?; + // We're holding the Handle let effective_request_lsn = match Self::wait_or_get_last_lsn( &shard, req.hdr.request_lsn, @@ -861,11 +915,27 @@ impl PageServerHandler { }; BatchedFeMessage::GetPage { span, - shard, + shard: shard.downgrade(), effective_request_lsn, pages: smallvec::smallvec![BatchedGetPageRequest { req, timer }], } } + #[cfg(feature = "testing")] + PagestreamFeMessage::Test(req) => { + let span = tracing::info_span!(parent: parent_span, "handle_test_request"); + let shard = timeline_handles + .get(tenant_id, timeline_id, ShardSelector::Zero) + .instrument(span.clone()) // sets `shard_id` field + .await?; + let timer = + 
record_op_start_and_throttle(&shard, metrics::SmgrQueryType::Test, received_at) + .await?; + BatchedFeMessage::Test { + span, + shard: shard.downgrade(), + requests: vec![BatchedTestRequest { req, timer }], + } + } }; Ok(Some(batched_msg)) } @@ -907,9 +977,7 @@ impl PageServerHandler { assert_eq!(accum_pages.len(), max_batch_size.get()); return false; } - if (accum_shard.tenant_shard_id, accum_shard.timeline_id) - != (this_shard.tenant_shard_id, this_shard.timeline_id) - { + if !accum_shard.is_same_handle_as(&this_shard) { trace!(%accum_lsn, %this_lsn, "stopping batching because timeline object mismatch"); // TODO: we _could_ batch & execute each shard seperately (and in parallel). // But the current logic for keeping responses in order does not support that. @@ -928,6 +996,44 @@ impl PageServerHandler { accum_pages.extend(this_pages); Ok(()) } + #[cfg(feature = "testing")] + ( + Ok(BatchedFeMessage::Test { + shard: accum_shard, + requests: accum_requests, + .. + }), + BatchedFeMessage::Test { + shard: this_shard, + requests: this_requests, + .. + }, + ) if (|| { + assert!(this_requests.len() == 1); + if accum_requests.len() >= max_batch_size.get() { + trace!(%max_batch_size, "stopping batching because of batch size"); + assert_eq!(accum_requests.len(), max_batch_size.get()); + return false; + } + if !accum_shard.is_same_handle_as(&this_shard) { + trace!("stopping batching because timeline object mismatch"); + // TODO: we _could_ batch & execute each shard seperately (and in parallel). + // But the current logic for keeping responses in order does not support that. 
+ return false; + } + let this_batch_key = this_requests[0].req.batch_key; + let accum_batch_key = accum_requests[0].req.batch_key; + if this_requests[0].req.batch_key != accum_requests[0].req.batch_key { + trace!(%accum_batch_key, %this_batch_key, "stopping batching because batch key changed"); + return false; + } + true + })() => + { + // ok to batch + accum_requests.extend(this_requests); + Ok(()) + } // something batched already but this message is unbatchable (_, this_msg) => { // by default, don't continue batching @@ -941,6 +1047,7 @@ impl PageServerHandler { &mut self, pgb_writer: &mut PostgresBackend, batch: BatchedFeMessage, + io_concurrency: IoConcurrency, cancel: &CancellationToken, protocol_version: PagestreamProtocolVersion, ctx: &RequestContext, @@ -969,7 +1076,7 @@ impl PageServerHandler { fail::fail_point!("ps::handle-pagerequest-message::exists"); ( vec![self - .handle_get_rel_exists_request(&shard, &req, ctx) + .handle_get_rel_exists_request(&*shard.upgrade()?, &req, ctx) .instrument(span.clone()) .await .map(|msg| (msg, timer)) @@ -986,7 +1093,7 @@ impl PageServerHandler { fail::fail_point!("ps::handle-pagerequest-message::nblocks"); ( vec![self - .handle_get_nblocks_request(&shard, &req, ctx) + .handle_get_nblocks_request(&*shard.upgrade()?, &req, ctx) .instrument(span.clone()) .await .map(|msg| (msg, timer)) @@ -1007,9 +1114,10 @@ impl PageServerHandler { trace!(npages, "handling getpage request"); let res = self .handle_get_page_at_lsn_request_batched( - &shard, + &*shard.upgrade()?, effective_request_lsn, pages, + io_concurrency, ctx, ) .instrument(span.clone()) @@ -1029,7 +1137,7 @@ impl PageServerHandler { fail::fail_point!("ps::handle-pagerequest-message::dbsize"); ( vec![self - .handle_db_size_request(&shard, &req, ctx) + .handle_db_size_request(&*shard.upgrade()?, &req, ctx) .instrument(span.clone()) .await .map(|msg| (msg, timer)) @@ -1046,7 +1154,7 @@ impl PageServerHandler { 
fail::fail_point!("ps::handle-pagerequest-message::slrusegment"); ( vec![self - .handle_get_slru_segment_request(&shard, &req, ctx) + .handle_get_slru_segment_request(&*shard.upgrade()?, &req, ctx) .instrument(span.clone()) .await .map(|msg| (msg, timer)) @@ -1054,6 +1162,27 @@ impl PageServerHandler { span, ) } + #[cfg(feature = "testing")] + BatchedFeMessage::Test { + span, + shard, + requests, + } => { + fail::fail_point!("ps::handle-pagerequest-message::test"); + ( + { + let npages = requests.len(); + trace!(npages, "handling getpage request"); + let res = self + .handle_test_request_batch(&*shard.upgrade()?, requests, ctx) + .instrument(span.clone()) + .await; + assert_eq!(res.len(), npages); + res + }, + span, + ) + } BatchedFeMessage::RespondError { span, error } => { // We've already decided to respond with an error, so we don't need to // call the handler. @@ -1193,6 +1322,17 @@ impl PageServerHandler { } } + let io_concurrency = IoConcurrency::spawn_from_conf( + self.conf, + match self.gate_guard.try_clone() { + Ok(guard) => guard, + Err(_) => { + info!("shutdown request received in page handler"); + return Err(QueryError::Shutdown); + } + }, + ); + let pgb_reader = pgb .split() .context("implementation error: split pgb into reader and writer")?; @@ -1214,6 +1354,7 @@ impl PageServerHandler { request_span, pipelining_config, protocol_version, + io_concurrency, &ctx, ) .await @@ -1227,6 +1368,7 @@ impl PageServerHandler { timeline_handles, request_span, protocol_version, + io_concurrency, &ctx, ) .await @@ -1254,6 +1396,7 @@ impl PageServerHandler { mut timeline_handles: TimelineHandles, request_span: Span, protocol_version: PagestreamProtocolVersion, + io_concurrency: IoConcurrency, ctx: &RequestContext, ) -> ( (PostgresBackendReader, TimelineHandles), @@ -1288,7 +1431,14 @@ impl PageServerHandler { }; let err = self - .pagesteam_handle_batched_message(pgb_writer, msg, &cancel, protocol_version, ctx) + .pagesteam_handle_batched_message( + pgb_writer, + 
msg, + io_concurrency.clone(), + &cancel, + protocol_version, + ctx, + ) .await; match err { Ok(()) => {} @@ -1312,6 +1462,7 @@ impl PageServerHandler { request_span: Span, pipelining_config: PageServicePipeliningConfigPipelined, protocol_version: PagestreamProtocolVersion, + io_concurrency: IoConcurrency, ctx: &RequestContext, ) -> ( (PostgresBackendReader, TimelineHandles), @@ -1455,6 +1606,7 @@ impl PageServerHandler { self.pagesteam_handle_batched_message( pgb_writer, batch, + io_concurrency.clone(), &cancel, protocol_version, &ctx, @@ -1556,6 +1708,7 @@ impl PageServerHandler { .wait_lsn( not_modified_since, crate::tenant::timeline::WaitLsnWaiter::PageService, + timeline::WaitLsnTimeout::Default, ctx, ) .await?; @@ -1711,6 +1864,7 @@ impl PageServerHandler { timeline: &Timeline, effective_lsn: Lsn, requests: smallvec::SmallVec<[BatchedGetPageRequest; 1]>, + io_concurrency: IoConcurrency, ctx: &RequestContext, ) -> Vec> { debug_assert_current_span_has_tenant_and_timeline_id(); @@ -1737,6 +1891,7 @@ impl PageServerHandler { .get_rel_page_at_lsn_batched( requests.iter().map(|p| (&p.req.rel, &p.req.blkno)), effective_lsn, + io_concurrency, ctx, ) .await; @@ -1791,6 +1946,51 @@ impl PageServerHandler { )) } + // NB: this impl mimics what we do for batched getpage requests. 
+ #[cfg(feature = "testing")] + #[instrument(skip_all, fields(shard_id))] + async fn handle_test_request_batch( + &mut self, + timeline: &Timeline, + requests: Vec, + _ctx: &RequestContext, + ) -> Vec> { + // real requests would do something with the timeline + let mut results = Vec::with_capacity(requests.len()); + for _req in requests.iter() { + tokio::task::yield_now().await; + + results.push({ + if timeline.cancel.is_cancelled() { + Err(PageReconstructError::Cancelled) + } else { + Ok(()) + } + }); + } + + // TODO: avoid creating the new Vec here + Vec::from_iter( + requests + .into_iter() + .zip(results.into_iter()) + .map(|(req, res)| { + res.map(|()| { + ( + PagestreamBeMessage::Test(models::PagestreamTestResponse { + req: req.req.clone(), + }), + req.timer, + ) + }) + .map_err(|e| BatchedPageStreamError { + err: PageStreamError::from(e), + req: req.req.hdr, + }) + }), + ) + } + /// Note on "fullbackup": /// Full basebackups should only be used for debugging purposes. /// Originally, it was introduced to enable breaking storage format changes, @@ -1845,6 +2045,7 @@ impl PageServerHandler { .wait_lsn( lsn, crate::tenant::timeline::WaitLsnWaiter::PageService, + crate::tenant::timeline::WaitLsnTimeout::Default, ctx, ) .await?; @@ -2406,6 +2607,14 @@ impl From for QueryError { } } +impl From for QueryError { + fn from(e: crate::tenant::timeline::handle::HandleUpgradeError) -> Self { + match e { + crate::tenant::timeline::handle::HandleUpgradeError::ShutDown => QueryError::Shutdown, + } + } +} + fn set_tracing_field_shard_id(timeline: &Timeline) { debug_assert_current_span_has_tenant_and_timeline_id_no_shard_id(); tracing::Span::current().record( diff --git a/pageserver/src/pgdatadir_mapping.rs b/pageserver/src/pgdatadir_mapping.rs index b65fe6cf7c..40c657524d 100644 --- a/pageserver/src/pgdatadir_mapping.rs +++ b/pageserver/src/pgdatadir_mapping.rs @@ -17,6 +17,7 @@ use crate::span::{ debug_assert_current_span_has_tenant_and_timeline_id, 
debug_assert_current_span_has_tenant_and_timeline_id_no_shard_id, }; +use crate::tenant::storage_layer::IoConcurrency; use crate::tenant::timeline::GetVectoredError; use anyhow::{ensure, Context}; use bytes::{Buf, Bytes, BytesMut}; @@ -200,6 +201,7 @@ impl Timeline { blknum: BlockNumber, version: Version<'_>, ctx: &RequestContext, + io_concurrency: IoConcurrency, ) -> Result { match version { Version::Lsn(effective_lsn) => { @@ -208,6 +210,7 @@ impl Timeline { .get_rel_page_at_lsn_batched( pages.iter().map(|(tag, blknum)| (tag, blknum)), effective_lsn, + io_concurrency.clone(), ctx, ) .await; @@ -246,6 +249,7 @@ impl Timeline { &self, pages: impl ExactSizeIterator, effective_lsn: Lsn, + io_concurrency: IoConcurrency, ctx: &RequestContext, ) -> Vec> { debug_assert_current_span_has_tenant_and_timeline_id(); @@ -309,7 +313,10 @@ impl Timeline { acc.to_keyspace() }; - match self.get_vectored(keyspace, effective_lsn, ctx).await { + match self + .get_vectored(keyspace, effective_lsn, io_concurrency, ctx) + .await + { Ok(results) => { for (key, res) in results { let mut key_slots = keys_slots.remove(&key).unwrap().into_iter(); @@ -889,9 +896,15 @@ impl Timeline { &self, lsn: Lsn, ctx: &RequestContext, + io_concurrency: IoConcurrency, ) -> Result, PageReconstructError> { let kv = self - .scan(KeySpace::single(Key::metadata_aux_key_range()), lsn, ctx) + .scan( + KeySpace::single(Key::metadata_aux_key_range()), + lsn, + ctx, + io_concurrency, + ) .await?; let mut result = HashMap::new(); let mut sz = 0; @@ -914,8 +927,9 @@ impl Timeline { &self, lsn: Lsn, ctx: &RequestContext, + io_concurrency: IoConcurrency, ) -> Result<(), PageReconstructError> { - self.list_aux_files_v2(lsn, ctx).await?; + self.list_aux_files_v2(lsn, ctx, io_concurrency).await?; Ok(()) } @@ -923,17 +937,24 @@ impl Timeline { &self, lsn: Lsn, ctx: &RequestContext, + io_concurrency: IoConcurrency, ) -> Result, PageReconstructError> { - self.list_aux_files_v2(lsn, ctx).await + self.list_aux_files_v2(lsn, 
ctx, io_concurrency).await } pub(crate) async fn get_replorigins( &self, lsn: Lsn, ctx: &RequestContext, + io_concurrency: IoConcurrency, ) -> Result, PageReconstructError> { let kv = self - .scan(KeySpace::single(repl_origin_key_range()), lsn, ctx) + .scan( + KeySpace::single(repl_origin_key_range()), + lsn, + ctx, + io_concurrency, + ) .await?; let mut result = HashMap::new(); for (k, v) in kv { @@ -2432,7 +2453,11 @@ mod tests { ("foo/bar2".to_string(), Bytes::from_static(b"content2")), ]); - let readback = tline.list_aux_files(Lsn(0x1008), &ctx).await?; + let io_concurrency = IoConcurrency::spawn_for_test(); + + let readback = tline + .list_aux_files(Lsn(0x1008), &ctx, io_concurrency.clone()) + .await?; assert_eq!(readback, expect_1008); // Second modification: update one key, remove the other @@ -2444,11 +2469,15 @@ mod tests { let expect_2008 = HashMap::from([("foo/bar1".to_string(), Bytes::from_static(b"content3"))]); - let readback = tline.list_aux_files(Lsn(0x2008), &ctx).await?; + let readback = tline + .list_aux_files(Lsn(0x2008), &ctx, io_concurrency.clone()) + .await?; assert_eq!(readback, expect_2008); // Reading back in time works - let readback = tline.list_aux_files(Lsn(0x1008), &ctx).await?; + let readback = tline + .list_aux_files(Lsn(0x1008), &ctx, io_concurrency.clone()) + .await?; assert_eq!(readback, expect_1008); Ok(()) diff --git a/pageserver/src/tenant.rs b/pageserver/src/tenant.rs index f6d758ad22..085f76c05d 100644 --- a/pageserver/src/tenant.rs +++ b/pageserver/src/tenant.rs @@ -37,6 +37,8 @@ use remote_timeline_client::manifest::{ OffloadedTimelineManifest, TenantManifest, LATEST_TENANT_MANIFEST_VERSION, }; use remote_timeline_client::UploadQueueNotReadyError; +use remote_timeline_client::FAILED_REMOTE_OP_RETRIES; +use remote_timeline_client::FAILED_UPLOAD_WARN_THRESHOLD; use std::collections::BTreeMap; use std::fmt; use std::future::Future; @@ -95,6 +97,9 @@ use crate::deletion_queue::DeletionQueueError; use crate::import_datadir; use 
crate::is_uninit_mark; use crate::l0_flush::L0FlushGlobalState; +use crate::metrics::CONCURRENT_INITDBS; +use crate::metrics::INITDB_RUN_TIME; +use crate::metrics::INITDB_SEMAPHORE_ACQUISITION_TIME; use crate::metrics::TENANT; use crate::metrics::{ remove_tenant_metrics, BROKEN_TENANTS_SET, CIRCUIT_BREAKERS_BROKEN, CIRCUIT_BREAKERS_UNBROKEN, @@ -2555,7 +2560,12 @@ impl Tenant { // sizes etc. and that would get confused if the previous page versions // are not in the repository yet. ancestor_timeline - .wait_lsn(*lsn, timeline::WaitLsnWaiter::Tenant, ctx) + .wait_lsn( + *lsn, + timeline::WaitLsnWaiter::Tenant, + timeline::WaitLsnTimeout::Default, + ctx, + ) .await .map_err(|e| match e { e @ (WaitLsnError::Timeout(_) | WaitLsnError::BadState { .. }) => { @@ -3806,6 +3816,13 @@ impl Tenant { .unwrap_or(self.conf.default_tenant_conf.compaction_threshold) } + pub fn get_compaction_upper_limit(&self) -> usize { + let tenant_conf = self.tenant_conf.load().tenant_conf.clone(); + tenant_conf + .compaction_upper_limit + .unwrap_or(self.conf.default_tenant_conf.compaction_upper_limit) + } + pub fn get_gc_horizon(&self) -> u64 { let tenant_conf = self.tenant_conf.load().tenant_conf.clone(); tenant_conf @@ -5305,27 +5322,37 @@ impl Tenant { return Ok(()); } - upload_tenant_manifest( - &self.remote_storage, - &self.tenant_shard_id, - self.generation, - &manifest, + // Remote storage does no retries internally, so wrap it + match backoff::retry( + || async { + upload_tenant_manifest( + &self.remote_storage, + &self.tenant_shard_id, + self.generation, + &manifest, + &self.cancel, + ) + .await + }, + |_e| self.cancel.is_cancelled(), + FAILED_UPLOAD_WARN_THRESHOLD, + FAILED_REMOTE_OP_RETRIES, + "uploading tenant manifest", &self.cancel, ) .await - .map_err(|e| { - if self.cancel.is_cancelled() { - TenantManifestError::Cancelled - } else { - TenantManifestError::RemoteStorage(e) + { + None => Err(TenantManifestError::Cancelled), + Some(Err(_)) if self.cancel.is_cancelled() => 
Err(TenantManifestError::Cancelled), + Some(Err(e)) => Err(TenantManifestError::RemoteStorage(e)), + Some(Ok(_)) => { + // Store the successfully uploaded manifest, so that future callers can avoid + // re-uploading the same thing. + *guard = Some(manifest); + + Ok(()) } - })?; - - // Store the successfully uploaded manifest, so that future callers can avoid - // re-uploading the same thing. - *guard = Some(manifest); - - Ok(()) + } } } @@ -5347,8 +5374,17 @@ async fn run_initdb( initdb_bin_path, initdb_target_dir, initdb_lib_dir, ); - let _permit = INIT_DB_SEMAPHORE.acquire().await; + let _permit = { + let _timer = INITDB_SEMAPHORE_ACQUISITION_TIME.start_timer(); + INIT_DB_SEMAPHORE.acquire().await + }; + CONCURRENT_INITDBS.inc(); + scopeguard::defer! { + CONCURRENT_INITDBS.dec(); + } + + let _timer = INITDB_RUN_TIME.start_timer(); let res = postgres_initdb::do_run_initdb(postgres_initdb::RunInitdbArgs { superuser: &conf.superuser, locale: &conf.locale, @@ -5440,7 +5476,11 @@ pub(crate) mod harness { compaction_target_size: Some(tenant_conf.compaction_target_size), compaction_period: Some(tenant_conf.compaction_period), compaction_threshold: Some(tenant_conf.compaction_threshold), + compaction_upper_limit: Some(tenant_conf.compaction_upper_limit), compaction_algorithm: Some(tenant_conf.compaction_algorithm), + l0_flush_delay_threshold: tenant_conf.l0_flush_delay_threshold, + l0_flush_stall_threshold: tenant_conf.l0_flush_stall_threshold, + l0_flush_wait_upload: Some(tenant_conf.l0_flush_wait_upload), gc_horizon: Some(tenant_conf.gc_horizon), gc_period: Some(tenant_conf.gc_period), image_creation_threshold: Some(tenant_conf.image_creation_threshold), @@ -5463,6 +5503,12 @@ pub(crate) mod harness { lsn_lease_length_for_ts: Some(tenant_conf.lsn_lease_length_for_ts), timeline_offloading: Some(tenant_conf.timeline_offloading), wal_receiver_protocol_override: tenant_conf.wal_receiver_protocol_override, + rel_size_v2_enabled: tenant_conf.rel_size_v2_enabled, + 
gc_compaction_enabled: Some(tenant_conf.gc_compaction_enabled), + gc_compaction_initial_threshold_kb: Some( + tenant_conf.gc_compaction_initial_threshold_kb, + ), + gc_compaction_ratio_percent: Some(tenant_conf.gc_compaction_ratio_percent), } } } @@ -5696,7 +5742,7 @@ mod tests { use pageserver_api::value::Value; use pageserver_compaction::helpers::overlaps_with; use rand::{thread_rng, Rng}; - use storage_layer::PersistentLayerKey; + use storage_layer::{IoConcurrency, PersistentLayerKey}; use tests::storage_layer::ValuesReconstructState; use tests::timeline::{GetVectoredError, ShutdownMode}; use timeline::{CompactOptions, DeltaLayerTestDesc}; @@ -6477,6 +6523,7 @@ mod tests { async fn test_get_vectored() -> anyhow::Result<()> { let harness = TenantHarness::create("test_get_vectored").await?; let (tenant, ctx) = harness.load().await; + let io_concurrency = IoConcurrency::spawn_for_test(); let tline = tenant .create_test_timeline(TIMELINE_ID, Lsn(0x08), DEFAULT_PG_VERSION, &ctx) .await?; @@ -6541,7 +6588,7 @@ mod tests { .get_vectored_impl( read.clone(), reads_lsn, - &mut ValuesReconstructState::new(), + &mut ValuesReconstructState::new(io_concurrency.clone()), &ctx, ) .await; @@ -6588,6 +6635,7 @@ mod tests { let harness = TenantHarness::create("test_get_vectored_aux_files").await?; let (tenant, ctx) = harness.load().await; + let io_concurrency = IoConcurrency::spawn_for_test(); let tline = tenant .create_empty_timeline(TIMELINE_ID, Lsn(0), DEFAULT_PG_VERSION, &ctx) .await?; @@ -6622,7 +6670,7 @@ mod tests { .get_vectored_impl( aux_keyspace.clone(), read_lsn, - &mut ValuesReconstructState::new(), + &mut ValuesReconstructState::new(io_concurrency.clone()), &ctx, ) .await; @@ -6670,6 +6718,7 @@ mod tests { ) .await?; let (tenant, ctx) = harness.load().await; + let io_concurrency = IoConcurrency::spawn_for_test(); let mut current_key = Key::from_hex("010000000033333333444444445500000000").unwrap(); let gap_at_key = current_key.add(100); @@ -6770,7 +6819,7 @@ mod tests 
{ .get_vectored_impl( read.clone(), current_lsn, - &mut ValuesReconstructState::new(), + &mut ValuesReconstructState::new(io_concurrency.clone()), &ctx, ) .await?; @@ -6813,6 +6862,7 @@ mod tests { async fn test_get_vectored_ancestor_descent() -> anyhow::Result<()> { let harness = TenantHarness::create("test_get_vectored_on_lsn_axis").await?; let (tenant, ctx) = harness.load().await; + let io_concurrency = IoConcurrency::spawn_for_test(); let start_key = Key::from_hex("010000000033333333444444445500000000").unwrap(); let end_key = start_key.add(1000); @@ -6905,7 +6955,7 @@ mod tests { ranges: vec![child_gap_at_key..child_gap_at_key.next()], }, query_lsn, - &mut ValuesReconstructState::new(), + &mut ValuesReconstructState::new(io_concurrency.clone()), &ctx, ) .await; @@ -7351,6 +7401,7 @@ mod tests { async fn test_metadata_scan() -> anyhow::Result<()> { let harness = TenantHarness::create("test_metadata_scan").await?; let (tenant, ctx) = harness.load().await; + let io_concurrency = IoConcurrency::spawn_for_test(); let tline = tenant .create_test_timeline(TIMELINE_ID, Lsn(0x10), DEFAULT_PG_VERSION, &ctx) .await?; @@ -7404,7 +7455,7 @@ mod tests { .get_vectored_impl( keyspace.clone(), lsn, - &mut ValuesReconstructState::default(), + &mut ValuesReconstructState::new(io_concurrency.clone()), &ctx, ) .await? 
@@ -7519,6 +7570,7 @@ mod tests { let harness = TenantHarness::create("test_aux_file_e2e").await.unwrap(); let (tenant, ctx) = harness.load().await; + let io_concurrency = IoConcurrency::spawn_for_test(); let mut lsn = Lsn(0x08); @@ -7538,7 +7590,10 @@ mod tests { } // we can read everything from the storage - let files = tline.list_aux_files(lsn, &ctx).await.unwrap(); + let files = tline + .list_aux_files(lsn, &ctx, io_concurrency.clone()) + .await + .unwrap(); assert_eq!( files.get("pg_logical/mappings/test1"), Some(&bytes::Bytes::from_static(b"first")) @@ -7554,7 +7609,10 @@ mod tests { modification.commit(&ctx).await.unwrap(); } - let files = tline.list_aux_files(lsn, &ctx).await.unwrap(); + let files = tline + .list_aux_files(lsn, &ctx, io_concurrency.clone()) + .await + .unwrap(); assert_eq!( files.get("pg_logical/mappings/test2"), Some(&bytes::Bytes::from_static(b"second")) @@ -7565,7 +7623,10 @@ mod tests { .await .unwrap(); - let files = child.list_aux_files(lsn, &ctx).await.unwrap(); + let files = child + .list_aux_files(lsn, &ctx, io_concurrency.clone()) + .await + .unwrap(); assert_eq!(files.get("pg_logical/mappings/test1"), None); assert_eq!(files.get("pg_logical/mappings/test2"), None); } @@ -7574,6 +7635,7 @@ mod tests { async fn test_metadata_image_creation() -> anyhow::Result<()> { let harness = TenantHarness::create("test_metadata_image_creation").await?; let (tenant, ctx) = harness.load().await; + let io_concurrency = IoConcurrency::spawn_for_test(); let tline = tenant .create_test_timeline(TIMELINE_ID, Lsn(0x10), DEFAULT_PG_VERSION, &ctx) .await?; @@ -7593,8 +7655,9 @@ mod tests { keyspace: &KeySpace, lsn: Lsn, ctx: &RequestContext, + io_concurrency: IoConcurrency, ) -> anyhow::Result<(BTreeMap>, usize)> { - let mut reconstruct_state = ValuesReconstructState::default(); + let mut reconstruct_state = ValuesReconstructState::new(io_concurrency); let res = tline .get_vectored_impl(keyspace.clone(), lsn, &mut reconstruct_state, ctx) .await?; @@ 
-7642,7 +7705,8 @@ mod tests { if iter % 5 == 0 { let (_, before_delta_file_accessed) = - scan_with_statistics(&tline, &keyspace, lsn, &ctx).await?; + scan_with_statistics(&tline, &keyspace, lsn, &ctx, io_concurrency.clone()) + .await?; tline .compact( &cancel, @@ -7656,7 +7720,8 @@ mod tests { ) .await?; let (_, after_delta_file_accessed) = - scan_with_statistics(&tline, &keyspace, lsn, &ctx).await?; + scan_with_statistics(&tline, &keyspace, lsn, &ctx, io_concurrency.clone()) + .await?; assert!(after_delta_file_accessed < before_delta_file_accessed, "after_delta_file_accessed={after_delta_file_accessed}, before_delta_file_accessed={before_delta_file_accessed}"); // Given that we already produced an image layer, there should be no delta layer needed for the scan, but still setting a low threshold there for unforeseen circumstances. assert!( @@ -7745,6 +7810,7 @@ mod tests { async fn test_vectored_missing_metadata_key_reads() -> anyhow::Result<()> { let harness = TenantHarness::create("test_vectored_missing_metadata_key_reads").await?; let (tenant, ctx) = harness.load().await; + let io_concurrency = IoConcurrency::spawn_for_test(); let base_key = Key::from_hex("620000000033333333444444445500000000").unwrap(); let base_key_child = Key::from_hex("620000000033333333444444445500000001").unwrap(); @@ -7883,7 +7949,7 @@ mod tests { ); // test vectored scan on parent timeline - let mut reconstruct_state = ValuesReconstructState::new(); + let mut reconstruct_state = ValuesReconstructState::new(io_concurrency.clone()); let res = tline .get_vectored_impl( KeySpace::single(Key::metadata_key_range()), @@ -7909,7 +7975,7 @@ mod tests { ); // test vectored scan on child timeline - let mut reconstruct_state = ValuesReconstructState::new(); + let mut reconstruct_state = ValuesReconstructState::new(io_concurrency.clone()); let res = child .get_vectored_impl( KeySpace::single(Key::metadata_key_range()), @@ -7947,7 +8013,9 @@ mod tests { lsn: Lsn, ctx: &RequestContext, ) -> Result, 
GetVectoredError> { - let mut reconstruct_state = ValuesReconstructState::new(); + let io_concurrency = + IoConcurrency::spawn_from_conf(tline.conf, tline.gate.enter().unwrap()); + let mut reconstruct_state = ValuesReconstructState::new(io_concurrency); let mut res = tline .get_vectored_impl( KeySpace::single(key..key.next()), @@ -8048,6 +8116,7 @@ mod tests { .await .unwrap(); let (tenant, ctx) = harness.load().await; + let io_concurrency = IoConcurrency::spawn_for_test(); let key0 = Key::from_hex("620000000033333333444444445500000000").unwrap(); let key1 = Key::from_hex("620000000033333333444444445500000001").unwrap(); @@ -8107,7 +8176,7 @@ mod tests { // Image layers are created at last_record_lsn let images = tline - .inspect_image_layers(Lsn(0x40), &ctx) + .inspect_image_layers(Lsn(0x40), &ctx, io_concurrency.clone()) .await .unwrap() .into_iter() @@ -8122,6 +8191,7 @@ mod tests { .await .unwrap(); let (tenant, ctx) = harness.load().await; + let io_concurrency = IoConcurrency::spawn_for_test(); let key1 = Key::from_hex("620000000033333333444444445500000001").unwrap(); let key2 = Key::from_hex("620000000033333333444444445500000002").unwrap(); @@ -8172,7 +8242,7 @@ mod tests { // Image layers are created at last_record_lsn let images = tline - .inspect_image_layers(Lsn(0x30), &ctx) + .inspect_image_layers(Lsn(0x30), &ctx, io_concurrency.clone()) .await .unwrap() .into_iter() @@ -8185,6 +8255,7 @@ mod tests { async fn test_simple_bottom_most_compaction_images() -> anyhow::Result<()> { let harness = TenantHarness::create("test_simple_bottom_most_compaction_images").await?; let (tenant, ctx) = harness.load().await; + let io_concurrency = IoConcurrency::spawn_for_test(); fn get_key(id: u32) -> Key { // using aux key here b/c they are guaranteed to be inside `collect_keyspace`. 
@@ -8326,7 +8397,7 @@ mod tests { // Check if the image layer at the GC horizon contains exactly what we want let image_at_gc_horizon = tline - .inspect_image_layers(Lsn(0x30), &ctx) + .inspect_image_layers(Lsn(0x30), &ctx, io_concurrency.clone()) .await .unwrap() .into_iter() @@ -10039,7 +10110,12 @@ mod tests { let keyspace = KeySpace::single(get_key(0)..get_key(10)); let results = tline - .get_vectored(keyspace, delta_layer_end_lsn, &ctx) + .get_vectored( + keyspace, + delta_layer_end_lsn, + IoConcurrency::sequential(), + &ctx, + ) .await .expect("No vectored errors"); for (key, res) in results { diff --git a/pageserver/src/tenant/config.rs b/pageserver/src/tenant/config.rs index edf2e6a3aa..139ed27bd2 100644 --- a/pageserver/src/tenant/config.rs +++ b/pageserver/src/tenant/config.rs @@ -277,10 +277,26 @@ pub struct TenantConfOpt { #[serde(default)] pub compaction_threshold: Option, + #[serde(skip_serializing_if = "Option::is_none")] + #[serde(default)] + pub compaction_upper_limit: Option, + #[serde(skip_serializing_if = "Option::is_none")] #[serde(default)] pub compaction_algorithm: Option, + #[serde(skip_serializing_if = "Option::is_none")] + #[serde(default)] + pub l0_flush_delay_threshold: Option, + + #[serde(skip_serializing_if = "Option::is_none")] + #[serde(default)] + pub l0_flush_stall_threshold: Option, + + #[serde(skip_serializing_if = "Option::is_none")] + #[serde(default)] + pub l0_flush_wait_upload: Option, + #[serde(skip_serializing_if = "Option::is_none")] #[serde(default)] pub gc_horizon: Option, @@ -357,6 +373,18 @@ pub struct TenantConfOpt { #[serde(skip_serializing_if = "Option::is_none")] pub wal_receiver_protocol_override: Option, + + #[serde(skip_serializing_if = "Option::is_none")] + pub rel_size_v2_enabled: Option, + + #[serde(skip_serializing_if = "Option::is_none")] + pub gc_compaction_enabled: Option, + + #[serde(skip_serializing_if = "Option::is_none")] + pub gc_compaction_initial_threshold_kb: Option, + + 
#[serde(skip_serializing_if = "Option::is_none")] + pub gc_compaction_ratio_percent: Option, } impl TenantConfOpt { @@ -377,11 +405,23 @@ impl TenantConfOpt { compaction_threshold: self .compaction_threshold .unwrap_or(global_conf.compaction_threshold), + compaction_upper_limit: self + .compaction_upper_limit + .unwrap_or(global_conf.compaction_upper_limit), compaction_algorithm: self .compaction_algorithm .as_ref() .unwrap_or(&global_conf.compaction_algorithm) .clone(), + l0_flush_delay_threshold: self + .l0_flush_delay_threshold + .or(global_conf.l0_flush_delay_threshold), + l0_flush_stall_threshold: self + .l0_flush_stall_threshold + .or(global_conf.l0_flush_stall_threshold), + l0_flush_wait_upload: self + .l0_flush_wait_upload + .unwrap_or(global_conf.l0_flush_wait_upload), gc_horizon: self.gc_horizon.unwrap_or(global_conf.gc_horizon), gc_period: self.gc_period.unwrap_or(global_conf.gc_period), image_creation_threshold: self @@ -425,6 +465,16 @@ impl TenantConfOpt { wal_receiver_protocol_override: self .wal_receiver_protocol_override .or(global_conf.wal_receiver_protocol_override), + rel_size_v2_enabled: self.rel_size_v2_enabled.or(global_conf.rel_size_v2_enabled), + gc_compaction_enabled: self + .gc_compaction_enabled + .unwrap_or(global_conf.gc_compaction_enabled), + gc_compaction_initial_threshold_kb: self + .gc_compaction_initial_threshold_kb + .unwrap_or(global_conf.gc_compaction_initial_threshold_kb), + gc_compaction_ratio_percent: self + .gc_compaction_ratio_percent + .unwrap_or(global_conf.gc_compaction_ratio_percent), } } @@ -435,7 +485,11 @@ impl TenantConfOpt { mut compaction_target_size, mut compaction_period, mut compaction_threshold, + mut compaction_upper_limit, mut compaction_algorithm, + mut l0_flush_delay_threshold, + mut l0_flush_stall_threshold, + mut l0_flush_wait_upload, mut gc_horizon, mut gc_period, mut image_creation_threshold, @@ -454,6 +508,10 @@ impl TenantConfOpt { mut lsn_lease_length_for_ts, mut timeline_offloading, mut 
wal_receiver_protocol_override, + mut rel_size_v2_enabled, + mut gc_compaction_enabled, + mut gc_compaction_initial_threshold_kb, + mut gc_compaction_ratio_percent, } = self; patch.checkpoint_distance.apply(&mut checkpoint_distance); @@ -469,7 +527,17 @@ impl TenantConfOpt { .map(|v| humantime::parse_duration(&v))? .apply(&mut compaction_period); patch.compaction_threshold.apply(&mut compaction_threshold); + patch + .compaction_upper_limit + .apply(&mut compaction_upper_limit); patch.compaction_algorithm.apply(&mut compaction_algorithm); + patch + .l0_flush_delay_threshold + .apply(&mut l0_flush_delay_threshold); + patch + .l0_flush_stall_threshold + .apply(&mut l0_flush_stall_threshold); + patch.l0_flush_wait_upload.apply(&mut l0_flush_wait_upload); patch.gc_horizon.apply(&mut gc_horizon); patch .gc_period @@ -522,6 +590,16 @@ impl TenantConfOpt { patch .wal_receiver_protocol_override .apply(&mut wal_receiver_protocol_override); + patch.rel_size_v2_enabled.apply(&mut rel_size_v2_enabled); + patch + .gc_compaction_enabled + .apply(&mut gc_compaction_enabled); + patch + .gc_compaction_initial_threshold_kb + .apply(&mut gc_compaction_initial_threshold_kb); + patch + .gc_compaction_ratio_percent + .apply(&mut gc_compaction_ratio_percent); Ok(Self { checkpoint_distance, @@ -529,7 +607,11 @@ impl TenantConfOpt { compaction_target_size, compaction_period, compaction_threshold, + compaction_upper_limit, compaction_algorithm, + l0_flush_delay_threshold, + l0_flush_stall_threshold, + l0_flush_wait_upload, gc_horizon, gc_period, image_creation_threshold, @@ -548,6 +630,10 @@ impl TenantConfOpt { lsn_lease_length_for_ts, timeline_offloading, wal_receiver_protocol_override, + rel_size_v2_enabled, + gc_compaction_enabled, + gc_compaction_initial_threshold_kb, + gc_compaction_ratio_percent, }) } } @@ -583,6 +669,10 @@ impl From for models::TenantConfig { compaction_target_size: value.compaction_target_size, compaction_period: value.compaction_period.map(humantime), 
compaction_threshold: value.compaction_threshold, + compaction_upper_limit: value.compaction_upper_limit, + l0_flush_delay_threshold: value.l0_flush_delay_threshold, + l0_flush_stall_threshold: value.l0_flush_stall_threshold, + l0_flush_wait_upload: value.l0_flush_wait_upload, gc_horizon: value.gc_horizon, gc_period: value.gc_period.map(humantime), image_creation_threshold: value.image_creation_threshold, @@ -603,6 +693,10 @@ impl From for models::TenantConfig { lsn_lease_length_for_ts: value.lsn_lease_length_for_ts.map(humantime), timeline_offloading: value.timeline_offloading, wal_receiver_protocol_override: value.wal_receiver_protocol_override, + rel_size_v2_enabled: value.rel_size_v2_enabled, + gc_compaction_enabled: value.gc_compaction_enabled, + gc_compaction_initial_threshold_kb: value.gc_compaction_initial_threshold_kb, + gc_compaction_ratio_percent: value.gc_compaction_ratio_percent, } } } diff --git a/pageserver/src/tenant/layer_map.rs b/pageserver/src/tenant/layer_map.rs index 1b6924425c..a69cce932e 100644 --- a/pageserver/src/tenant/layer_map.rs +++ b/pageserver/src/tenant/layer_map.rs @@ -57,6 +57,7 @@ use std::collections::{HashMap, VecDeque}; use std::iter::Peekable; use std::ops::Range; use std::sync::Arc; +use tokio::sync::watch; use utils::lsn::Lsn; use historic_layer_coverage::BufferedHistoricLayerCoverage; @@ -67,7 +68,6 @@ use super::storage_layer::{LayerVisibilityHint, PersistentLayerDesc}; /// /// LayerMap tracks what layers exist on a timeline. /// -#[derive(Default)] pub struct LayerMap { // // 'open_layer' holds the current InMemoryLayer that is accepting new @@ -93,7 +93,25 @@ pub struct LayerMap { /// L0 layers have key range Key::MIN..Key::MAX, and locating them using R-Tree search is very inefficient. /// So L0 layers are held in l0_delta_layers vector, in addition to the R-tree. + /// + /// NB: make sure to notify `watch_l0_deltas` on changes. 
l0_delta_layers: Vec>, + + /// Notifies about L0 delta layer changes, sending the current number of L0 layers. + watch_l0_deltas: watch::Sender, +} + +impl Default for LayerMap { + fn default() -> Self { + Self { + open_layer: Default::default(), + next_open_layer_at: Default::default(), + frozen_layers: Default::default(), + historic: Default::default(), + l0_delta_layers: Default::default(), + watch_l0_deltas: watch::channel(0).0, + } + } } /// The primary update API for the layer map. @@ -466,6 +484,8 @@ impl LayerMap { if Self::is_l0(&layer_desc.key_range, layer_desc.is_delta) { self.l0_delta_layers.push(layer_desc.clone().into()); + self.watch_l0_deltas + .send_replace(self.l0_delta_layers.len()); } self.historic.insert( @@ -488,6 +508,8 @@ impl LayerMap { let mut l0_delta_layers = std::mem::take(&mut self.l0_delta_layers); l0_delta_layers.retain(|other| other.key() != layer_key); self.l0_delta_layers = l0_delta_layers; + self.watch_l0_deltas + .send_replace(self.l0_delta_layers.len()); // this assertion is related to use of Arc::ptr_eq in Self::compare_arced_layers, // there's a chance that the comparison fails at runtime due to it comparing (pointer, // vtable) pairs. @@ -850,6 +872,11 @@ impl LayerMap { &self.l0_delta_layers } + /// Subscribes to L0 delta layer changes, sending the current number of L0 delta layers. 
+ pub fn watch_level0_deltas(&self) -> watch::Receiver { + self.watch_l0_deltas.subscribe() + } + /// debugging function to print out the contents of the layer map #[allow(unused)] pub async fn dump(&self, verbose: bool, ctx: &RequestContext) -> Result<()> { diff --git a/pageserver/src/tenant/mgr.rs b/pageserver/src/tenant/mgr.rs index e8b0d1d4dd..dfa89a765c 100644 --- a/pageserver/src/tenant/mgr.rs +++ b/pageserver/src/tenant/mgr.rs @@ -1643,6 +1643,7 @@ impl TenantManager { .wait_lsn( *target_lsn, crate::tenant::timeline::WaitLsnWaiter::Tenant, + crate::tenant::timeline::WaitLsnTimeout::Default, ctx, ) .await diff --git a/pageserver/src/tenant/remote_timeline_client.rs b/pageserver/src/tenant/remote_timeline_client.rs index 47c4a8637d..bcba6d1f62 100644 --- a/pageserver/src/tenant/remote_timeline_client.rs +++ b/pageserver/src/tenant/remote_timeline_client.rs @@ -382,6 +382,12 @@ pub(crate) struct RemoteTimelineClient { cancel: CancellationToken, } +impl Drop for RemoteTimelineClient { + fn drop(&mut self) { + debug!("dropping RemoteTimelineClient"); + } +} + impl RemoteTimelineClient { /// /// Create a remote storage client for given timeline @@ -797,6 +803,12 @@ impl RemoteTimelineClient { upload_queue.dirty.metadata.apply(update); + // Defense in depth: if we somehow generated invalid metadata, do not persist it. + upload_queue + .dirty + .validate() + .map_err(|e| anyhow::anyhow!(e))?; + self.schedule_index_upload(upload_queue); Ok(()) diff --git a/pageserver/src/tenant/remote_timeline_client/index.rs b/pageserver/src/tenant/remote_timeline_client/index.rs index 244be5bbb7..b8b18005fd 100644 --- a/pageserver/src/tenant/remote_timeline_client/index.rs +++ b/pageserver/src/tenant/remote_timeline_client/index.rs @@ -77,8 +77,32 @@ pub struct IndexPart { /// /// None means no aux files have been written to the storage before the point /// when this flag is introduced. 
+ /// + /// This flag is not used any more as all tenants have been transitioned to the new aux file policy. #[serde(skip_serializing_if = "Option::is_none", default)] pub(crate) last_aux_file_policy: Option, + + #[serde(skip_serializing_if = "Option::is_none", default)] + pub(crate) rel_size_migration: Option, + + /// The LSN of gc-compaction horizon. Once gc-compaction is finished for all layer files below an LSN, this LSN will be updated. + #[serde(skip_serializing_if = "Option::is_none", default)] + pub(crate) l2_lsn: Option, +} + +#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)] +#[serde(rename_all = "camelCase")] +pub enum RelSizeMigration { + /// The tenant is using the old rel_size format. + /// Note that this enum is persisted as `Option` in the index part, so + /// `None` is the same as `Some(RelSizeMigration::Legacy)`. + Legacy, + /// The tenant is migrating to the new rel_size format. Both old and new rel_size format are + /// persisted in the index part. The read path will read both formats and merge them. + Migrating, + /// The tenant has migrated to the new rel_size format. Only the new rel_size format is persisted + /// in the index part, and the read path will not read the old format. + Migrated, } impl IndexPart { @@ -97,10 +121,12 @@ impl IndexPart { /// - 8: added `archived_at` /// - 9: +gc_blocking /// - 10: +import_pgdata - const LATEST_VERSION: usize = 10; + /// - 11: +rel_size_migration + /// - 12: +l2_lsn + const LATEST_VERSION: usize = 12; // Versions we may see when reading from a bucket. 
- pub const KNOWN_VERSIONS: &'static [usize] = &[1, 2, 3, 4, 5, 6, 7, 8, 9, 10]; + pub const KNOWN_VERSIONS: &'static [usize] = &[1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12]; pub const FILE_NAME: &'static str = "index_part.json"; @@ -116,6 +142,8 @@ impl IndexPart { gc_blocking: None, last_aux_file_policy: None, import_pgdata: None, + rel_size_migration: None, + l2_lsn: None, } } @@ -152,6 +180,21 @@ impl IndexPart { }; is_same_remote_layer_path(name, metadata, name, index_metadata) } + + /// Check for invariants in the index: this is useful when uploading an index to ensure that if + /// we encounter a bug, we do not persist buggy metadata. + pub(crate) fn validate(&self) -> Result<(), String> { + if self.import_pgdata.is_none() + && self.metadata.ancestor_timeline().is_none() + && self.layer_metadata.is_empty() + { + // Unless we're in the middle of a raw pgdata import, or this is a child timeline, the index must + // always have at least one layer. + return Err("Index has no ancestor and no layers".to_string()); + } + + Ok(()) + } } /// Metadata gathered for each of the layer files. @@ -179,6 +222,10 @@ impl LayerFileMetadata { shard, } } + /// Helper to get both generation and file size in a tuple + pub fn generation_file_size(&self) -> (Generation, u64) { + (self.generation, self.file_size) + } } /// Limited history of earlier ancestors. 
@@ -401,6 +448,8 @@ mod tests { gc_blocking: None, last_aux_file_policy: None, import_pgdata: None, + rel_size_migration: None, + l2_lsn: None, }; let part = IndexPart::from_json_bytes(example.as_bytes()).unwrap(); @@ -446,6 +495,8 @@ mod tests { gc_blocking: None, last_aux_file_policy: None, import_pgdata: None, + rel_size_migration: None, + l2_lsn: None, }; let part = IndexPart::from_json_bytes(example.as_bytes()).unwrap(); @@ -492,6 +543,8 @@ mod tests { gc_blocking: None, last_aux_file_policy: None, import_pgdata: None, + rel_size_migration: None, + l2_lsn: None, }; let part = IndexPart::from_json_bytes(example.as_bytes()).unwrap(); @@ -541,6 +594,8 @@ mod tests { gc_blocking: None, last_aux_file_policy: None, import_pgdata: None, + rel_size_migration: None, + l2_lsn: None, }; let empty_layers_parsed = IndexPart::from_json_bytes(empty_layers_json.as_bytes()).unwrap(); @@ -585,6 +640,8 @@ mod tests { gc_blocking: None, last_aux_file_policy: None, import_pgdata: None, + rel_size_migration: None, + l2_lsn: None, }; let part = IndexPart::from_json_bytes(example.as_bytes()).unwrap(); @@ -632,6 +689,8 @@ mod tests { gc_blocking: None, last_aux_file_policy: None, import_pgdata: None, + rel_size_migration: None, + l2_lsn: None, }; let part = IndexPart::from_json_bytes(example.as_bytes()).unwrap(); @@ -684,6 +743,8 @@ mod tests { gc_blocking: None, last_aux_file_policy: Some(AuxFilePolicy::V2), import_pgdata: None, + rel_size_migration: None, + l2_lsn: None, }; let part = IndexPart::from_json_bytes(example.as_bytes()).unwrap(); @@ -741,6 +802,8 @@ mod tests { gc_blocking: None, last_aux_file_policy: Default::default(), import_pgdata: None, + rel_size_migration: None, + l2_lsn: None, }; let part = IndexPart::from_json_bytes(example.as_bytes()).unwrap(); @@ -799,6 +862,8 @@ mod tests { gc_blocking: None, last_aux_file_policy: Default::default(), import_pgdata: None, + rel_size_migration: None, + l2_lsn: None, }; let part = 
IndexPart::from_json_bytes(example.as_bytes()).unwrap(); @@ -862,6 +927,8 @@ mod tests { last_aux_file_policy: Default::default(), archived_at: None, import_pgdata: None, + rel_size_migration: None, + l2_lsn: None, }; let part = IndexPart::from_json_bytes(example.as_bytes()).unwrap(); @@ -937,7 +1004,168 @@ mod tests { started_at: parse_naive_datetime("2024-11-13T09:23:42.123000000"), finished_at: parse_naive_datetime("2024-11-13T09:42:23.123000000"), idempotency_key: import_pgdata::index_part_format::IdempotencyKey::new("specified-by-client-218a5213-5044-4562-a28d-d024c5f057f5".to_string()), - }))) + }))), + rel_size_migration: None, + l2_lsn: None, + }; + + let part = IndexPart::from_json_bytes(example.as_bytes()).unwrap(); + assert_eq!(part, expected); + } + + #[test] + fn v11_rel_size_migration_is_parsed() { + let example = r#"{ + "version": 11, + "layer_metadata":{ + "000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__0000000001696070-00000000016960E9": { "file_size": 25600000 }, + "000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__00000000016B59D8-00000000016B5A51": { "file_size": 9007199254741001 } + }, + "disk_consistent_lsn":"0/16960E8", + "metadata": { + "disk_consistent_lsn": "0/16960E8", + "prev_record_lsn": "0/1696070", + "ancestor_timeline": "e45a7f37d3ee2ff17dc14bf4f4e3f52e", + "ancestor_lsn": "0/0", + "latest_gc_cutoff_lsn": "0/1696070", + "initdb_lsn": "0/1696070", + "pg_version": 14 + }, + "gc_blocking": { + "started_at": "2024-07-19T09:00:00.123", + "reasons": ["DetachAncestor"] + }, + "import_pgdata": { + "V1": { + "Done": { + "idempotency_key": "specified-by-client-218a5213-5044-4562-a28d-d024c5f057f5", + "started_at": "2024-11-13T09:23:42.123", + "finished_at": "2024-11-13T09:42:23.123" + } + } + }, + "rel_size_migration": "legacy" + }"#; + + let expected = IndexPart { + version: 11, + layer_metadata: HashMap::from([ + 
("000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__0000000001696070-00000000016960E9".parse().unwrap(), LayerFileMetadata { + file_size: 25600000, + generation: Generation::none(), + shard: ShardIndex::unsharded() + }), + ("000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__00000000016B59D8-00000000016B5A51".parse().unwrap(), LayerFileMetadata { + file_size: 9007199254741001, + generation: Generation::none(), + shard: ShardIndex::unsharded() + }) + ]), + disk_consistent_lsn: "0/16960E8".parse::().unwrap(), + metadata: TimelineMetadata::new( + Lsn::from_str("0/16960E8").unwrap(), + Some(Lsn::from_str("0/1696070").unwrap()), + Some(TimelineId::from_str("e45a7f37d3ee2ff17dc14bf4f4e3f52e").unwrap()), + Lsn::INVALID, + Lsn::from_str("0/1696070").unwrap(), + Lsn::from_str("0/1696070").unwrap(), + 14, + ).with_recalculated_checksum().unwrap(), + deleted_at: None, + lineage: Default::default(), + gc_blocking: Some(GcBlocking { + started_at: parse_naive_datetime("2024-07-19T09:00:00.123000000"), + reasons: enumset::EnumSet::from_iter([GcBlockingReason::DetachAncestor]), + }), + last_aux_file_policy: Default::default(), + archived_at: None, + import_pgdata: Some(import_pgdata::index_part_format::Root::V1(import_pgdata::index_part_format::V1::Done(import_pgdata::index_part_format::Done{ + started_at: parse_naive_datetime("2024-11-13T09:23:42.123000000"), + finished_at: parse_naive_datetime("2024-11-13T09:42:23.123000000"), + idempotency_key: import_pgdata::index_part_format::IdempotencyKey::new("specified-by-client-218a5213-5044-4562-a28d-d024c5f057f5".to_string()), + }))), + rel_size_migration: Some(RelSizeMigration::Legacy), + l2_lsn: None, + }; + + let part = IndexPart::from_json_bytes(example.as_bytes()).unwrap(); + assert_eq!(part, expected); + } + + #[test] + fn v12_l2_lsn_is_parsed() { + let example = r#"{ + "version": 12, + "layer_metadata":{ + 
"000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__0000000001696070-00000000016960E9": { "file_size": 25600000 }, + "000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__00000000016B59D8-00000000016B5A51": { "file_size": 9007199254741001 } + }, + "disk_consistent_lsn":"0/16960E8", + "metadata": { + "disk_consistent_lsn": "0/16960E8", + "prev_record_lsn": "0/1696070", + "ancestor_timeline": "e45a7f37d3ee2ff17dc14bf4f4e3f52e", + "ancestor_lsn": "0/0", + "latest_gc_cutoff_lsn": "0/1696070", + "initdb_lsn": "0/1696070", + "pg_version": 14 + }, + "gc_blocking": { + "started_at": "2024-07-19T09:00:00.123", + "reasons": ["DetachAncestor"] + }, + "import_pgdata": { + "V1": { + "Done": { + "idempotency_key": "specified-by-client-218a5213-5044-4562-a28d-d024c5f057f5", + "started_at": "2024-11-13T09:23:42.123", + "finished_at": "2024-11-13T09:42:23.123" + } + } + }, + "rel_size_migration": "legacy", + "l2_lsn": "0/16960E8" + }"#; + + let expected = IndexPart { + version: 12, + layer_metadata: HashMap::from([ + ("000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__0000000001696070-00000000016960E9".parse().unwrap(), LayerFileMetadata { + file_size: 25600000, + generation: Generation::none(), + shard: ShardIndex::unsharded() + }), + ("000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__00000000016B59D8-00000000016B5A51".parse().unwrap(), LayerFileMetadata { + file_size: 9007199254741001, + generation: Generation::none(), + shard: ShardIndex::unsharded() + }) + ]), + disk_consistent_lsn: "0/16960E8".parse::().unwrap(), + metadata: TimelineMetadata::new( + Lsn::from_str("0/16960E8").unwrap(), + Some(Lsn::from_str("0/1696070").unwrap()), + Some(TimelineId::from_str("e45a7f37d3ee2ff17dc14bf4f4e3f52e").unwrap()), + Lsn::INVALID, + Lsn::from_str("0/1696070").unwrap(), + Lsn::from_str("0/1696070").unwrap(), + 14, + ).with_recalculated_checksum().unwrap(), + deleted_at: None, + lineage: 
Default::default(), + gc_blocking: Some(GcBlocking { + started_at: parse_naive_datetime("2024-07-19T09:00:00.123000000"), + reasons: enumset::EnumSet::from_iter([GcBlockingReason::DetachAncestor]), + }), + last_aux_file_policy: Default::default(), + archived_at: None, + import_pgdata: Some(import_pgdata::index_part_format::Root::V1(import_pgdata::index_part_format::V1::Done(import_pgdata::index_part_format::Done{ + started_at: parse_naive_datetime("2024-11-13T09:23:42.123000000"), + finished_at: parse_naive_datetime("2024-11-13T09:42:23.123000000"), + idempotency_key: import_pgdata::index_part_format::IdempotencyKey::new("specified-by-client-218a5213-5044-4562-a28d-d024c5f057f5".to_string()), + }))), + rel_size_migration: Some(RelSizeMigration::Legacy), + l2_lsn: Some("0/16960E8".parse::().unwrap()), }; let part = IndexPart::from_json_bytes(example.as_bytes()).unwrap(); diff --git a/pageserver/src/tenant/remote_timeline_client/upload.rs b/pageserver/src/tenant/remote_timeline_client/upload.rs index e434d24e5f..af4dbbbfb6 100644 --- a/pageserver/src/tenant/remote_timeline_client/upload.rs +++ b/pageserver/src/tenant/remote_timeline_client/upload.rs @@ -40,6 +40,10 @@ pub(crate) async fn upload_index_part( }); pausable_failpoint!("before-upload-index-pausable"); + // Safety: refuse to persist invalid index metadata, to mitigate the impact of any bug that produces this + // (this should never happen) + index_part.validate().map_err(|e| anyhow::anyhow!(e))?; + // FIXME: this error comes too late let serialized = index_part.to_json_bytes()?; let serialized = Bytes::from(serialized); diff --git a/pageserver/src/tenant/secondary/downloader.rs b/pageserver/src/tenant/secondary/downloader.rs index 395e34e404..cf524fcb25 100644 --- a/pageserver/src/tenant/secondary/downloader.rs +++ b/pageserver/src/tenant/secondary/downloader.rs @@ -559,6 +559,13 @@ impl JobGenerator { @@ -1008,69 +1015,17 @@ impl<'a> TenantDownloader<'a> { return (Err(UpdateError::Restart), touched); } - 
// Existing on-disk layers: just update their access time. - if let Some(on_disk) = timeline_state.on_disk_layers.get(&layer.name) { - tracing::debug!("Layer {} is already on disk", layer.name); - - if cfg!(debug_assertions) { - // Debug for https://github.com/neondatabase/neon/issues/6966: check that the files we think - // are already present on disk are really there. - match tokio::fs::metadata(&on_disk.local_path).await { - Ok(meta) => { - tracing::debug!( - "Layer {} present at {}, size {}", - layer.name, - on_disk.local_path, - meta.len(), - ); - } - Err(e) => { - tracing::warn!( - "Layer {} not found at {} ({})", - layer.name, - on_disk.local_path, - e - ); - debug_assert!(false); - } - } - } - - if on_disk.metadata != layer.metadata || on_disk.access_time != layer.access_time { - // We already have this layer on disk. Update its access time. - tracing::debug!( - "Access time updated for layer {}: {} -> {}", - layer.name, - strftime(&on_disk.access_time), - strftime(&layer.access_time) - ); - touched.push(layer); - } - continue; - } else { - tracing::debug!("Layer {} not present on disk yet", layer.name); - } - - // Eviction: if we evicted a layer, then do not re-download it unless it was accessed more - // recently than it was evicted. 
- if let Some(evicted_at) = timeline_state.evicted_at.get(&layer.name) { - if &layer.access_time > evicted_at { - tracing::info!( - "Re-downloading evicted layer {}, accessed at {}, evicted at {}", - layer.name, - strftime(&layer.access_time), - strftime(evicted_at) - ); - } else { - tracing::trace!( - "Not re-downloading evicted layer {}, accessed at {}, evicted at {}", - layer.name, - strftime(&layer.access_time), - strftime(evicted_at) - ); + match self.layer_action(&timeline_state, &layer).await { + LayerAction::Download => (), + LayerAction::NoAction => continue, + LayerAction::Skip => { self.skip_layer(layer); continue; } + LayerAction::Touch => { + touched.push(layer); + continue; + } } match self @@ -1091,6 +1046,91 @@ impl<'a> TenantDownloader<'a> { (Ok(()), touched) } + /// Decides what to do with one heatmap layer, given the local state recorded for its timeline: + /// `Download` it, `Touch` it (its metadata or access time changed), `Skip` it (it was evicted and + /// has not been accessed since), or take `NoAction` (already on disk and unchanged). + async fn layer_action( + &self, + timeline_state: &SecondaryDetailTimeline, + layer: &HeatMapLayer, + ) -> LayerAction { + // Existing on-disk layers: just update their access time. + if let Some(on_disk) = timeline_state.on_disk_layers.get(&layer.name) { + tracing::debug!("Layer {} is already on disk", layer.name); + + if cfg!(debug_assertions) { + // Debug for https://github.com/neondatabase/neon/issues/6966: check that the files we think + // are already present on disk are really there. 
+ match tokio::fs::metadata(&on_disk.local_path).await { + Ok(meta) => { + tracing::debug!( + "Layer {} present at {}, size {}", + layer.name, + on_disk.local_path, + meta.len(), + ); + } + Err(e) => { + tracing::warn!( + "Layer {} not found at {} ({})", + layer.name, + on_disk.local_path, + e + ); + debug_assert!(false); + } + } + } + + // Compare the on-disk layer against the heatmap entry (not against itself): a changed + // generation or file size means the layer has to be downloaded again. + if on_disk.metadata.generation_file_size() != layer.metadata.generation_file_size() { + tracing::info!( + "Re-downloading layer {} with changed size or generation: {:?}->{:?}", + layer.name, + on_disk.metadata.generation_file_size(), + layer.metadata.generation_file_size() + ); + return LayerAction::Download; + } + if on_disk.metadata != layer.metadata || on_disk.access_time != layer.access_time { + // We already have this layer on disk. Update its access time. + tracing::debug!( + "Access time updated for layer {}: {} -> {}", + layer.name, + strftime(&on_disk.access_time), + strftime(&layer.access_time) + ); + return LayerAction::Touch; + } + return LayerAction::NoAction; + } else { + tracing::debug!("Layer {} not present on disk yet", layer.name); + } + + // Eviction: if we evicted a layer, then do not re-download it unless it was accessed more + // recently than it was evicted. 
+ if let Some(evicted_at) = timeline_state.evicted_at.get(&layer.name) { + if &layer.access_time > evicted_at { + tracing::info!( + "Re-downloading evicted layer {}, accessed at {}, evicted at {}", + layer.name, + strftime(&layer.access_time), + strftime(evicted_at) + ); + } else { + tracing::trace!( + "Not re-downloading evicted layer {}, accessed at {}, evicted at {}", + layer.name, + strftime(&layer.access_time), + strftime(evicted_at) + ); + return LayerAction::Skip; + } + } + LayerAction::Download + } + async fn download_timeline( &self, timeline: HeatMapTimeline, diff --git a/pageserver/src/tenant/storage_layer.rs b/pageserver/src/tenant/storage_layer.rs index 3913637ca0..c1fe67c87c 100644 --- a/pageserver/src/tenant/storage_layer.rs +++ b/pageserver/src/tenant/storage_layer.rs @@ -10,21 +10,30 @@ mod layer_desc; mod layer_name; pub mod merge_iterator; +use crate::config::PageServerConf; use crate::context::{AccessStatsBehavior, RequestContext}; use bytes::Bytes; +use futures::stream::FuturesUnordered; +use futures::StreamExt; use pageserver_api::key::Key; use pageserver_api::keyspace::{KeySpace, KeySpaceRandomAccum}; use pageserver_api::record::NeonWalRecord; use pageserver_api::value::Value; -use std::cmp::{Ordering, Reverse}; +use std::cmp::Ordering; use std::collections::hash_map::Entry; use std::collections::{BinaryHeap, HashMap}; +use std::future::Future; use std::ops::Range; +use std::pin::Pin; +use std::sync::atomic::AtomicUsize; use std::sync::Arc; use std::time::{Duration, SystemTime, UNIX_EPOCH}; +use tracing::{trace, Instrument}; +use utils::sync::gate::GateGuard; use utils::lsn::Lsn; +pub use batch_split_writer::{BatchLayerWriter, SplitDeltaLayerWriter, SplitImageLayerWriter}; pub use delta_layer::{DeltaLayer, DeltaLayerWriter, ValueRef}; pub use image_layer::{ImageLayer, ImageLayerWriter}; pub use inmemory_layer::InMemoryLayer; @@ -78,30 +87,151 @@ pub(crate) enum ValueReconstructSituation { Continue, } -/// Reconstruct data accumulated for a 
single key during a vectored get -#[derive(Debug, Default, Clone)] -pub(crate) struct VectoredValueReconstructState { - pub(crate) records: Vec<(Lsn, NeonWalRecord)>, - pub(crate) img: Option<(Lsn, Bytes)>, - - situation: ValueReconstructSituation, +/// On disk representation of a value loaded in a buffer +#[derive(Debug)] +pub(crate) enum OnDiskValue { + /// Unencoded [`Value::Image`] + RawImage(Bytes), + /// Encoded [`Value`]. Can deserialize into an image or a WAL record + WalRecordOrImage(Bytes), } -impl VectoredValueReconstructState { - fn get_cached_lsn(&self) -> Option { - self.img.as_ref().map(|img| img.0) +/// Reconstruct data accumulated for a single key during a vectored get +#[derive(Debug, Default)] +pub(crate) struct VectoredValueReconstructState { + pub(crate) on_disk_values: Vec<(Lsn, OnDiskValueIoWaiter)>, + + pub(crate) situation: ValueReconstructSituation, +} + +#[derive(Debug)] +pub(crate) struct OnDiskValueIoWaiter { + rx: tokio::sync::oneshot::Receiver, +} + +#[derive(Debug)] +#[must_use] +pub(crate) enum OnDiskValueIo { + /// Traversal identified this IO as required to complete the vectored get. + Required { + num_active_ios: Arc, + tx: tokio::sync::oneshot::Sender, + }, + /// Sparse keyspace reads always read all the values for a given key, + /// even though only the first value is needed. + /// + /// This variant represents the unnecessary IOs for those values at lower LSNs + /// that aren't needed, but are currently still being done. + /// + /// The execution of unnecessary IOs was a pre-existing behavior before concurrent IO. + /// We added this explicit representation here so that we can drop + /// unnecessary IO results immediately, instead of buffering them in + /// `oneshot` channels inside [`VectoredValueReconstructState`] until + /// [`VectoredValueReconstructState::collect_pending_ios`] gets called. 
+ Unnecessary, +} + +type OnDiskValueIoResult = Result; + +impl OnDiskValueIo { + pub(crate) fn complete(self, res: OnDiskValueIoResult) { + match self { + OnDiskValueIo::Required { num_active_ios, tx } => { + num_active_ios.fetch_sub(1, std::sync::atomic::Ordering::Release); + let _ = tx.send(res); + } + OnDiskValueIo::Unnecessary => { + // Nobody cared, see variant doc comment. + } + } + } +} + -impl From for ValueReconstructState { - fn from(mut state: VectoredValueReconstructState) -> Self { - // walredo expects the records to be descending in terms of Lsn - state.records.sort_by_key(|(lsn, _)| Reverse(*lsn)); +#[derive(Debug, thiserror::Error)] +pub(crate) enum WaitCompletionError { + #[error("OnDiskValueIo was dropped without completing, likely the sidecar task panicked")] + IoDropped, +} - ValueReconstructState { - records: state.records, - img: state.img, +impl OnDiskValueIoWaiter { + pub(crate) async fn wait_completion(self) -> Result { + // NB: for Unnecessary IOs, this method never gets called because we don't add them to `on_disk_values`. + self.rx.await.map_err(|_| WaitCompletionError::IoDropped) + } +} + +impl VectoredValueReconstructState { + /// # Cancel-Safety + /// + /// Technically fine to stop polling this future, but, the IOs will still + /// be executed to completion by the sidecar task and hold on to / consume resources. + /// Better not do it to make reasoning about the system easier. + pub(crate) async fn collect_pending_ios( + self, + ) -> Result { + use utils::bin_ser::BeSer; + + let mut res = Ok(ValueReconstructState::default()); + + // We should try hard not to bail early, so that by the time we return from this + // function, all IO for this value is done. It's not required -- we could totally + // stop polling the IO futures in the sidecar task, they need to support that, + // but just stopping to poll doesn't reduce the IO load on the disk. 
It's easier + // to reason about the system if we just wait for all IO to complete, even if + // we're no longer interested in the result. + // + // Revisit this when IO futures are replaced with a more sophisticated IO system + // and an IO scheduler, where we know which IOs were submitted and which ones + // just queued. Cf the comment on IoConcurrency::spawn_io. + for (lsn, waiter) in self.on_disk_values { + let value_recv_res = waiter + .wait_completion() + // we rely on the caller to poll us to completion, so this is not a bail point + .await; + // Force not bailing early by wrapping the code into a closure. + #[allow(clippy::redundant_closure_call)] + let _: () = (|| { + match (&mut res, value_recv_res) { + (Err(_), _) => { + // We've already failed, no need to process more. + } + (Ok(_), Err(wait_err)) => { + // This shouldn't happen - likely the sidecar task panicked. + res = Err(PageReconstructError::Other(wait_err.into())); + } + (Ok(_), Ok(Err(err))) => { + let err: std::io::Error = err; + // TODO: returning IO error here will fail a compute query. + // Probably not what we want, we're not doing `maybe_fatal_err` + // in the IO futures. + // But it's been like that for a long time, not changing it + // as part of concurrent IO. 
+ // => https://github.com/neondatabase/neon/issues/10454 + res = Err(PageReconstructError::Other(err.into())); + } + (Ok(ok), Ok(Ok(OnDiskValue::RawImage(img)))) => { + assert!(ok.img.is_none()); + ok.img = Some((lsn, img)); + } + (Ok(ok), Ok(Ok(OnDiskValue::WalRecordOrImage(buf)))) => { + match Value::des(&buf) { + Ok(Value::WalRecord(rec)) => { + ok.records.push((lsn, rec)); + } + Ok(Value::Image(img)) => { + assert!(ok.img.is_none()); + ok.img = Some((lsn, img)); + } + Err(err) => { + res = Err(PageReconstructError::Other(err.into())); + } + } + } + } + })(); } + + res } } @@ -109,7 +239,7 @@ impl From for ValueReconstructState { pub(crate) struct ValuesReconstructState { /// The keys will be removed after `get_vectored` completes. The caller outside `Timeline` /// should not expect to get anything from this hashmap. - pub(crate) keys: HashMap>, + pub(crate) keys: HashMap, /// The keys which are already retrieved keys_done: KeySpaceRandomAccum, @@ -119,27 +249,365 @@ pub(crate) struct ValuesReconstructState { // Statistics that are still accessible as a caller of `get_vectored_impl`. layers_visited: u32, delta_layers_visited: u32, + + pub(crate) io_concurrency: IoConcurrency, + num_active_ios: Arc, +} + +/// The level of IO concurrency to be used on the read path +/// +/// The desired end state is that we always do parallel IO. +/// This struct and the dispatching in the impl will be removed once +/// we've built enough confidence. +pub(crate) enum IoConcurrency { + Sequential, + SidecarTask { + task_id: usize, + ios_tx: tokio::sync::mpsc::UnboundedSender, + }, +} + +type IoFuture = Pin>>; + +pub(crate) enum SelectedIoConcurrency { + Sequential, + SidecarTask(GateGuard), +} + +impl std::fmt::Debug for IoConcurrency { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + match self { + IoConcurrency::Sequential => write!(f, "Sequential"), + IoConcurrency::SidecarTask { .. 
} => write!(f, "SidecarTask"), + } + } +} + +impl std::fmt::Debug for SelectedIoConcurrency { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + match self { + SelectedIoConcurrency::Sequential => write!(f, "Sequential"), + SelectedIoConcurrency::SidecarTask(_) => write!(f, "SidecarTask"), + } + } +} + +impl IoConcurrency { + /// Force sequential IO. This is a temporary workaround until we have + /// moved plumbing-through-the-call-stack + /// of IoConcurrency into `RequestContextq. + /// + /// DO NOT USE for new code. + /// + /// Tracking issue: . + pub(crate) fn sequential() -> Self { + Self::spawn(SelectedIoConcurrency::Sequential) + } + + pub(crate) fn spawn_from_conf( + conf: &'static PageServerConf, + gate_guard: GateGuard, + ) -> IoConcurrency { + use pageserver_api::config::GetVectoredConcurrentIo; + let selected = match conf.get_vectored_concurrent_io { + GetVectoredConcurrentIo::Sequential => SelectedIoConcurrency::Sequential, + GetVectoredConcurrentIo::SidecarTask => SelectedIoConcurrency::SidecarTask(gate_guard), + }; + Self::spawn(selected) + } + + pub(crate) fn spawn(io_concurrency: SelectedIoConcurrency) -> Self { + match io_concurrency { + SelectedIoConcurrency::Sequential => IoConcurrency::Sequential, + SelectedIoConcurrency::SidecarTask(gate_guard) => { + let (ios_tx, ios_rx) = tokio::sync::mpsc::unbounded_channel(); + static TASK_ID: AtomicUsize = AtomicUsize::new(0); + let task_id = TASK_ID.fetch_add(1, std::sync::atomic::Ordering::Relaxed); + // TODO: enrich the span with more context (tenant,shard,timeline) + (basebackup|pagestream|...) 
+ let span = + tracing::info_span!(parent: None, "IoConcurrency_sidecar", task_id = task_id); + trace!(task_id, "spawning sidecar task"); + tokio::spawn(async move { + trace!("start"); + scopeguard::defer!{ trace!("end") }; + type IosRx = tokio::sync::mpsc::UnboundedReceiver; + enum State { + Waiting { + // invariant: is_empty(), but we recycle the allocation + empty_futures: FuturesUnordered, + ios_rx: IosRx, + }, + Executing { + futures: FuturesUnordered, + ios_rx: IosRx, + }, + ShuttingDown { + futures: FuturesUnordered, + }, + } + let mut state = State::Waiting { + empty_futures: FuturesUnordered::new(), + ios_rx, + }; + loop { + match state { + State::Waiting { + empty_futures, + mut ios_rx, + } => { + assert!(empty_futures.is_empty()); + tokio::select! { + fut = ios_rx.recv() => { + if let Some(fut) = fut { + trace!("received new io future"); + empty_futures.push(fut); + state = State::Executing { futures: empty_futures, ios_rx }; + } else { + state = State::ShuttingDown { futures: empty_futures } + } + } + } + } + State::Executing { + mut futures, + mut ios_rx, + } => { + tokio::select! 
{ + res = futures.next() => { + trace!("io future completed"); + assert!(res.is_some()); + if futures.is_empty() { + state = State::Waiting { empty_futures: futures, ios_rx}; + } else { + state = State::Executing { futures, ios_rx }; + } + } + fut = ios_rx.recv() => { + if let Some(fut) = fut { + trace!("received new io future"); + futures.push(fut); + state = State::Executing { futures, ios_rx}; + } else { + state = State::ShuttingDown { futures }; + } + } + } + } + State::ShuttingDown { + mut futures, + } => { + trace!("shutting down"); + while let Some(()) = futures.next().await { + trace!("io future completed (shutdown)"); + // drain + } + trace!("shutdown complete"); + break; + } + } + } + drop(gate_guard); // drop it right before we exit + }.instrument(span)); + IoConcurrency::SidecarTask { task_id, ios_tx } + } + } + } + + pub(crate) fn clone(&self) -> Self { + match self { + IoConcurrency::Sequential => IoConcurrency::Sequential, + IoConcurrency::SidecarTask { task_id, ios_tx } => IoConcurrency::SidecarTask { + task_id: *task_id, + ios_tx: ios_tx.clone(), + }, + } + } + + /// Submit an IO to be executed in the background. DEADLOCK RISK, read the full doc string. + /// + /// The IO is represented as an opaque future. + /// IO completion must be handled inside the future, e.g., through a oneshot channel. + /// + /// The API seems simple but there are multiple **pitfalls** involving + /// DEADLOCK RISK. + /// + /// First, there are no guarantees about the exexecution of the IO. + /// It may be `await`ed in-place before this function returns. + /// It may be polled partially by this task and handed off to another task to be finished. + /// It may be polled and then dropped before returning ready. + /// + /// This means that submitted IOs must not be interedependent. 
+ /// Interdependence may be through shared limited resources, e.g., + /// - VirtualFile file descriptor cache slot acquisition + /// - tokio-epoll-uring slot + /// + /// # Why current usage is safe from deadlocks + /// + /// Textbook condition for a deadlock is that _all_ of the following be given + /// - Mutual exclusion + /// - Hold and wait + /// - No preemption + /// - Circular wait + /// + /// The current usage is safe because: + /// - Mutual exclusion: IO futures definitely use mutexes, no way around that for now + /// - Hold and wait: IO futures currently hold two kinds of locks/resources while waiting + /// for acquisition of other resources: + /// - VirtualFile file descriptor cache slot tokio mutex + /// - tokio-epoll-uring slot (uses tokio notify => wait queue, much like mutex) + /// - No preemption: there's no taking-away of acquired locks/resources => given + /// - Circular wait: this is the part of the condition that isn't met: all IO futures + /// first acquire VirtualFile mutex, then tokio-epoll-uring slot. + /// There is no IO future that acquires slot before VirtualFile. + /// Hence there can be no circular waiting. + /// Hence there cannot be a deadlock. + /// + /// This is a very fragile situation and must be revisited whenver any code called from + /// inside the IO futures is changed. + /// + /// We will move away from opaque IO futures towards well-defined IOs at some point in + /// the future when we have shipped this first version of concurrent IO to production + /// and are ready to retire the Sequential mode which runs the futures in place. + /// Right now, while brittle, the opaque IO approach allows us to ship the feature + /// with minimal changes to the code and minimal changes to existing behavior in Sequential mode. + /// + /// Also read the comment in `collect_pending_ios`. 
+ pub(crate) async fn spawn_io(&mut self, fut: F) + where + F: std::future::Future + Send + 'static, + { + match self { + IoConcurrency::Sequential => fut.await, + IoConcurrency::SidecarTask { ios_tx, .. } => { + let fut = Box::pin(fut); + // NB: experiments showed that doing an opportunistic poll of `fut` here was bad for throughput + // while insignificant for latency. + // It would make sense to revisit the tokio-epoll-uring API in the future such that we can try + // a submission here, but never poll the future. That way, io_uring can make proccess while + // the future sits in the ios_tx queue. + match ios_tx.send(fut) { + Ok(()) => {} + Err(_) => { + unreachable!("the io task must have exited, likely it panicked") + } + } + } + } + } + + #[cfg(test)] + pub(crate) fn spawn_for_test() -> impl std::ops::DerefMut { + use std::ops::{Deref, DerefMut}; + use tracing::info; + use utils::sync::gate::Gate; + + // Spawn needs a Gate, give it one. + struct Wrapper { + inner: IoConcurrency, + #[allow(dead_code)] + gate: Box, + } + impl Deref for Wrapper { + type Target = IoConcurrency; + + fn deref(&self) -> &Self::Target { + &self.inner + } + } + impl DerefMut for Wrapper { + fn deref_mut(&mut self) -> &mut Self::Target { + &mut self.inner + } + } + let gate = Box::new(Gate::default()); + + // The default behavior when running Rust unit tests without any further + // flags is to use the new behavior. + // The CI uses the following environment variable to unit test both old + // and new behavior. + // NB: the Python regression & perf tests take the `else` branch + // below and have their own defaults management. + let selected = { + // The pageserver_api::config type is unsuitable because it's internally tagged. 
+ #[derive(serde::Deserialize)] + #[serde(rename_all = "kebab-case")] + enum TestOverride { + Sequential, + SidecarTask, + } + use once_cell::sync::Lazy; + static TEST_OVERRIDE: Lazy = Lazy::new(|| { + utils::env::var_serde_json_string( + "NEON_PAGESERVER_UNIT_TEST_GET_VECTORED_CONCURRENT_IO", + ) + .unwrap_or(TestOverride::SidecarTask) + }); + + match *TEST_OVERRIDE { + TestOverride::Sequential => SelectedIoConcurrency::Sequential, + TestOverride::SidecarTask => { + SelectedIoConcurrency::SidecarTask(gate.enter().expect("just created it")) + } + } + }; + + info!(?selected, "get_vectored_concurrent_io test"); + + Wrapper { + inner: Self::spawn(selected), + gate, + } + } +} + +/// Make noise in case the [`ValuesReconstructState`] gets dropped while +/// there are still IOs in flight. +/// Refer to `collect_pending_ios` for why we prefer not to do that. +// +/// We log from here instead of from the sidecar task because the [`ValuesReconstructState`] +/// gets dropped in a tracing span with more context. +/// We repeat the sidecar tasks's `task_id` so we can correlate what we emit here with +/// the logs / panic handler logs from the sidecar task, which also logs the `task_id`. +impl Drop for ValuesReconstructState { + fn drop(&mut self) { + let num_active_ios = self + .num_active_ios + .load(std::sync::atomic::Ordering::Acquire); + if num_active_ios == 0 { + return; + } + let sidecar_task_id = match &self.io_concurrency { + IoConcurrency::Sequential => None, + IoConcurrency::SidecarTask { task_id, .. 
} => Some(*task_id), + }; + tracing::warn!( + num_active_ios, + ?sidecar_task_id, + backtrace=%std::backtrace::Backtrace::force_capture(), + "dropping ValuesReconstructState while some IOs have not been completed", + ); + } } impl ValuesReconstructState { - pub(crate) fn new() -> Self { + pub(crate) fn new(io_concurrency: IoConcurrency) -> Self { Self { keys: HashMap::new(), keys_done: KeySpaceRandomAccum::new(), keys_with_image_coverage: None, layers_visited: 0, delta_layers_visited: 0, + io_concurrency, + num_active_ios: Arc::new(AtomicUsize::new(0)), } } - /// Associate a key with the error which it encountered and mark it as done - pub(crate) fn on_key_error(&mut self, key: Key, err: PageReconstructError) { - let previous = self.keys.insert(key, Err(err)); - if let Some(Ok(state)) = previous { - if state.situation == ValueReconstructSituation::Continue { - self.keys_done.add_key(key); - } - } + /// Absolutely read [`IoConcurrency::spawn_io`] to learn about assumptions & pitfalls. + pub(crate) async fn spawn_io(&mut self, fut: F) + where + F: std::future::Future + Send + 'static, + { + self.io_concurrency.spawn_io(fut).await; } pub(crate) fn on_layer_visited(&mut self, layer: &ReadableLayer) { @@ -159,29 +627,6 @@ impl ValuesReconstructState { self.layers_visited } - /// This function is called after reading a keyspace from a layer. - /// It checks if the read path has now moved past the cached Lsn for any keys. - /// - /// Implementation note: We intentionally iterate over the keys for which we've - /// already collected some reconstruct data. This avoids scaling complexity with - /// the size of the search space. 
- pub(crate) fn on_lsn_advanced(&mut self, keyspace: &KeySpace, advanced_to: Lsn) { - for (key, value) in self.keys.iter_mut() { - if !keyspace.contains(key) { - continue; - } - - if let Ok(state) = value { - if state.situation != ValueReconstructSituation::Complete - && state.get_cached_lsn() >= Some(advanced_to) - { - state.situation = ValueReconstructSituation::Complete; - self.keys_done.add_key(*key); - } - } - } - } - /// On hitting image layer, we can mark all keys in this range as done, because /// if the image layer does not contain a key, it is deleted/never added. pub(crate) fn on_image_layer_visited(&mut self, key_range: &Range) { @@ -199,70 +644,42 @@ impl ValuesReconstructState { /// /// If the key is in the sparse keyspace (i.e., aux files), we do not track them in /// `key_done`. - pub(crate) fn update_key( - &mut self, - key: &Key, - lsn: Lsn, - value: Value, - ) -> ValueReconstructSituation { - let state = self - .keys - .entry(*key) - .or_insert(Ok(VectoredValueReconstructState::default())); + // TODO: rename this method & update description. + pub(crate) fn update_key(&mut self, key: &Key, lsn: Lsn, completes: bool) -> OnDiskValueIo { + let state = self.keys.entry(*key).or_default(); + let is_sparse_key = key.is_sparse(); - if let Ok(state) = state { - let key_done = match state.situation { - ValueReconstructSituation::Complete => { - if is_sparse_key { - // Sparse keyspace might be visited multiple times because - // we don't track unmapped keyspaces. 
- return ValueReconstructSituation::Complete; - } else { - unreachable!() - } - } - ValueReconstructSituation::Continue => match value { - Value::Image(img) => { - state.img = Some((lsn, img)); - true - } - Value::WalRecord(rec) => { - debug_assert!( - Some(lsn) > state.get_cached_lsn(), - "Attempt to collect a record below cached LSN for walredo: {} < {}", - lsn, - state - .get_cached_lsn() - .expect("Assertion can only fire if a cached lsn is present") - ); - let will_init = rec.will_init(); - state.records.push((lsn, rec)); - will_init - } - }, - }; - - if key_done && state.situation == ValueReconstructSituation::Continue { - state.situation = ValueReconstructSituation::Complete; - if !is_sparse_key { - self.keys_done.add_key(*key); + let required_io = match state.situation { + ValueReconstructSituation::Complete => { + if is_sparse_key { + // Sparse keyspace might be visited multiple times because + // we don't track unmapped keyspaces. + return OnDiskValueIo::Unnecessary; + } else { + unreachable!() } } + ValueReconstructSituation::Continue => { + self.num_active_ios + .fetch_add(1, std::sync::atomic::Ordering::Release); + let (tx, rx) = tokio::sync::oneshot::channel(); + state.on_disk_values.push((lsn, OnDiskValueIoWaiter { rx })); + OnDiskValueIo::Required { + tx, + num_active_ios: Arc::clone(&self.num_active_ios), + } + } + }; - state.situation - } else { - ValueReconstructSituation::Complete + if completes && state.situation == ValueReconstructSituation::Continue { + state.situation = ValueReconstructSituation::Complete; + if !is_sparse_key { + self.keys_done.add_key(*key); + } } - } - /// Returns the Lsn at which this key is cached if one exists. - /// The read path should go no further than this Lsn for the given key. 
- pub(crate) fn get_cached_lsn(&self, key: &Key) -> Option { - self.keys - .get(key) - .and_then(|k| k.as_ref().ok()) - .and_then(|state| state.get_cached_lsn()) + required_io } /// Returns the key space describing the keys that have @@ -276,12 +693,6 @@ impl ValuesReconstructState { } } -impl Default for ValuesReconstructState { - fn default() -> Self { - Self::new() - } -} - /// A key that uniquely identifies a layer in a timeline #[derive(Debug, PartialEq, Eq, Clone, Hash)] pub(crate) enum LayerId { @@ -720,3 +1131,78 @@ impl std::fmt::Debug for RangeDisplayDebug<'_, T> { write!(f, "{}..{}", self.0.start, self.0.end) } } + +#[cfg(test)] +mod tests2 { + use pageserver_api::key::DBDIR_KEY; + use tracing::info; + + use super::*; + use crate::tenant::storage_layer::IoConcurrency; + + /// TODO: currently this test relies on manual visual inspection of the --no-capture output. + /// Should look like so: + /// ```text + /// RUST_LOG=trace cargo nextest run --features testing --no-capture test_io_concurrency_noise + /// running 1 test + /// 2025-01-21T17:42:01.335679Z INFO get_vectored_concurrent_io test selected=SidecarTask + /// 2025-01-21T17:42:01.335680Z TRACE spawning sidecar task task_id=0 + /// 2025-01-21T17:42:01.335937Z TRACE IoConcurrency_sidecar{task_id=0}: start + /// 2025-01-21T17:42:01.335972Z TRACE IoConcurrency_sidecar{task_id=0}: received new io future + /// 2025-01-21T17:42:01.335999Z INFO IoConcurrency_sidecar{task_id=0}: waiting for signal to complete IO + /// 2025-01-21T17:42:01.336229Z WARN dropping ValuesReconstructState while some IOs have not been completed num_active_ios=1 sidecar_task_id=Some(0) backtrace= 0: ::drop + /// at ./src/tenant/storage_layer.rs:553:24 + /// 1: core::ptr::drop_in_place + /// at /home/christian/.rustup/toolchains/1.84.0-x86_64-unknown-linux-gnu/lib/rustlib/src/rust/library/core/src/ptr/mod.rs:521:1 + /// 2: core::mem::drop + /// at 
/home/christian/.rustup/toolchains/1.84.0-x86_64-unknown-linux-gnu/lib/rustlib/src/rust/library/core/src/mem/mod.rs:942:24 + /// 3: pageserver::tenant::storage_layer::tests2::test_io_concurrency_noise::{{closure}} + /// at ./src/tenant/storage_layer.rs:1159:9 + /// ... + /// 49: + /// 2025-01-21T17:42:01.452293Z INFO IoConcurrency_sidecar{task_id=0}: completing IO + /// 2025-01-21T17:42:01.452357Z TRACE IoConcurrency_sidecar{task_id=0}: io future completed + /// 2025-01-21T17:42:01.452473Z TRACE IoConcurrency_sidecar{task_id=0}: end + /// test tenant::storage_layer::tests2::test_io_concurrency_noise ... ok + /// + /// ``` + #[tokio::test] + async fn test_io_concurrency_noise() { + crate::tenant::harness::setup_logging(); + + let io_concurrency = IoConcurrency::spawn_for_test(); + match *io_concurrency { + IoConcurrency::Sequential => { + // This test asserts behavior in sidecar mode, doesn't make sense in sequential mode. + return; + } + IoConcurrency::SidecarTask { .. } => {} + } + let mut reconstruct_state = ValuesReconstructState::new(io_concurrency.clone()); + + let (io_fut_is_waiting_tx, io_fut_is_waiting) = tokio::sync::oneshot::channel(); + let (do_complete_io, should_complete_io) = tokio::sync::oneshot::channel(); + let (io_fut_exiting_tx, io_fut_exiting) = tokio::sync::oneshot::channel(); + + let io = reconstruct_state.update_key(&DBDIR_KEY, Lsn(8), true); + reconstruct_state + .spawn_io(async move { + info!("waiting for signal to complete IO"); + io_fut_is_waiting_tx.send(()).unwrap(); + should_complete_io.await.unwrap(); + info!("completing IO"); + io.complete(Ok(OnDiskValue::RawImage(Bytes::new()))); + io_fut_exiting_tx.send(()).unwrap(); + }) + .await; + + io_fut_is_waiting.await.unwrap(); + + // this is what makes the noise + drop(reconstruct_state); + + do_complete_io.send(()).unwrap(); + + io_fut_exiting.await.unwrap(); + } +} diff --git a/pageserver/src/tenant/storage_layer/batch_split_writer.rs 
b/pageserver/src/tenant/storage_layer/batch_split_writer.rs index 8a397ceb7a..22d8b81bcc 100644 --- a/pageserver/src/tenant/storage_layer/batch_split_writer.rs +++ b/pageserver/src/tenant/storage_layer/batch_split_writer.rs @@ -87,6 +87,23 @@ impl BatchLayerWriter { )); } + pub(crate) async fn finish( + self, + tline: &Arc, + ctx: &RequestContext, + ) -> anyhow::Result> { + let res = self + .finish_with_discard_fn(tline, ctx, |_| async { false }) + .await?; + let mut output = Vec::new(); + for r in res { + if let BatchWriterResult::Produced(layer) = r { + output.push(layer); + } + } + Ok(output) + } + pub(crate) async fn finish_with_discard_fn( self, tline: &Arc, diff --git a/pageserver/src/tenant/storage_layer/delta_layer.rs b/pageserver/src/tenant/storage_layer/delta_layer.rs index ade1b794c6..885c50425f 100644 --- a/pageserver/src/tenant/storage_layer/delta_layer.rs +++ b/pageserver/src/tenant/storage_layer/delta_layer.rs @@ -41,13 +41,12 @@ use crate::tenant::vectored_blob_io::{ BlobFlag, BufView, StreamingVectoredReadPlanner, VectoredBlobReader, VectoredRead, VectoredReadPlanner, }; -use crate::tenant::PageReconstructError; use crate::virtual_file::owned_buffers_io::io_buf_ext::{FullSlice, IoBufExt}; use crate::virtual_file::IoBufferMut; use crate::virtual_file::{self, MaybeFatalIo, VirtualFile}; use crate::TEMP_FILE_SUFFIX; use crate::{DELTA_FILE_MAGIC, STORAGE_FORMAT_VERSION}; -use anyhow::{anyhow, bail, ensure, Context, Result}; +use anyhow::{bail, ensure, Context, Result}; use camino::{Utf8Path, Utf8PathBuf}; use futures::StreamExt; use itertools::Itertools; @@ -60,7 +59,7 @@ use pageserver_api::shard::TenantShardId; use pageserver_api::value::Value; use rand::{distributions::Alphanumeric, Rng}; use serde::{Deserialize, Serialize}; -use std::collections::VecDeque; +use std::collections::{HashMap, VecDeque}; use std::fs::File; use std::io::SeekFrom; use std::ops::Range; @@ -77,7 +76,10 @@ use utils::{ lsn::Lsn, }; -use super::{AsLayerDesc, LayerName, 
PersistentLayerDesc, ValuesReconstructState}; +use super::{ + AsLayerDesc, LayerName, OnDiskValue, OnDiskValueIo, PersistentLayerDesc, ResidentLayer, + ValuesReconstructState, +}; /// /// Header stored in the beginning of the file @@ -226,7 +228,7 @@ pub struct DeltaLayerInner { index_start_blk: u32, index_root_blk: u32, - file: VirtualFile, + file: Arc, file_id: FileId, layer_key_range: Range, @@ -795,9 +797,11 @@ impl DeltaLayerInner { max_vectored_read_bytes: Option, ctx: &RequestContext, ) -> anyhow::Result { - let file = VirtualFile::open_v2(path, ctx) - .await - .context("open layer file")?; + let file = Arc::new( + VirtualFile::open_v2(path, ctx) + .await + .context("open layer file")?, + ); let file_id = page_cache::next_file_id(); @@ -842,12 +846,11 @@ impl DeltaLayerInner { // Look up the keys in the provided keyspace and update // the reconstruct state with whatever is found. // - // If the key is cached, go no further than the cached Lsn. - // // Currently, the index is visited for each range, but this // can be further optimised to visit the index only once. 
pub(super) async fn get_values_reconstruct_data( &self, + this: ResidentLayer, keyspace: KeySpace, lsn_range: Range, reconstruct_state: &mut ValuesReconstructState, @@ -875,17 +878,14 @@ impl DeltaLayerInner { data_end_offset, index_reader, planner, - reconstruct_state, ctx, ) .await .map_err(GetVectoredError::Other)?; - self.do_reads_and_update_state(reads, reconstruct_state, ctx) + self.do_reads_and_update_state(this, reads, reconstruct_state, ctx) .await; - reconstruct_state.on_lsn_advanced(&keyspace, lsn_range.start); - Ok(()) } @@ -895,7 +895,6 @@ impl DeltaLayerInner { data_end_offset: u64, index_reader: DiskBtreeReader, mut planner: VectoredReadPlanner, - reconstruct_state: &mut ValuesReconstructState, ctx: &RequestContext, ) -> anyhow::Result> where @@ -922,10 +921,9 @@ impl DeltaLayerInner { assert!(key >= range.start); let outside_lsn_range = !lsn_range.contains(&lsn); - let below_cached_lsn = reconstruct_state.get_cached_lsn(&key) >= Some(lsn); let flag = { - if outside_lsn_range || below_cached_lsn { + if outside_lsn_range { BlobFlag::Ignore } else if blob_ref.will_init() { BlobFlag::ReplaceAll @@ -994,98 +992,78 @@ impl DeltaLayerInner { async fn do_reads_and_update_state( &self, + this: ResidentLayer, reads: Vec, reconstruct_state: &mut ValuesReconstructState, ctx: &RequestContext, ) { - let vectored_blob_reader = VectoredBlobReader::new(&self.file); - let mut ignore_key_with_err = None; - let max_vectored_read_bytes = self .max_vectored_read_bytes .expect("Layer is loaded with max vectored bytes config") .0 .into(); let buf_size = Self::get_min_read_buffer_size(&reads, max_vectored_read_bytes); - let mut buf = Some(IoBufferMut::with_capacity(buf_size)); // Note that reads are processed in reverse order (from highest key+lsn). // This is the order that `ReconstructState` requires such that it can // track when a key is done. 
for read in reads.into_iter().rev() { - let res = vectored_blob_reader - .read_blobs(&read, buf.take().expect("Should have a buffer"), ctx) - .await; - - let blobs_buf = match res { - Ok(blobs_buf) => blobs_buf, - Err(err) => { - let kind = err.kind(); - for (_, blob_meta) in read.blobs_at.as_slice() { - reconstruct_state.on_key_error( - blob_meta.key, - PageReconstructError::Other(anyhow!( - "Failed to read blobs from virtual file {}: {}", - self.file.path(), - kind - )), - ); - } - - // We have "lost" the buffer since the lower level IO api - // doesn't return the buffer on error. Allocate a new one. - buf = Some(IoBufferMut::with_capacity(buf_size)); - - continue; - } - }; - let view = BufView::new_slice(&blobs_buf.buf); - for meta in blobs_buf.blobs.iter().rev() { - if Some(meta.meta.key) == ignore_key_with_err { - continue; - } - let blob_read = meta.read(&view).await; - let blob_read = match blob_read { - Ok(buf) => buf, - Err(e) => { - reconstruct_state.on_key_error( - meta.meta.key, - PageReconstructError::Other(anyhow!(e).context(format!( - "Failed to decompress blob from virtual file {}", - self.file.path(), - ))), - ); - - ignore_key_with_err = Some(meta.meta.key); - continue; - } - }; - - let value = Value::des(&blob_read); - - let value = match value { - Ok(v) => v, - Err(e) => { - reconstruct_state.on_key_error( - meta.meta.key, - PageReconstructError::Other(anyhow!(e).context(format!( - "Failed to deserialize blob from virtual file {}", - self.file.path(), - ))), - ); - - ignore_key_with_err = Some(meta.meta.key); - continue; - } - }; - - // Invariant: once a key reaches [`ValueReconstructSituation::Complete`] - // state, no further updates shall be made to it. The call below will - // panic if the invariant is violated. 
- reconstruct_state.update_key(&meta.meta.key, meta.meta.lsn, value); + let mut ios: HashMap<(Key, Lsn), OnDiskValueIo> = Default::default(); + for (_, blob_meta) in read.blobs_at.as_slice().iter().rev() { + let io = reconstruct_state.update_key( + &blob_meta.key, + blob_meta.lsn, + blob_meta.will_init, + ); + ios.insert((blob_meta.key, blob_meta.lsn), io); } - buf = Some(blobs_buf.buf); + let read_extend_residency = this.clone(); + let read_from = self.file.clone(); + let read_ctx = ctx.attached_child(); + reconstruct_state + .spawn_io(async move { + let vectored_blob_reader = VectoredBlobReader::new(&read_from); + let buf = IoBufferMut::with_capacity(buf_size); + + let res = vectored_blob_reader.read_blobs(&read, buf, &read_ctx).await; + match res { + Ok(blobs_buf) => { + let view = BufView::new_slice(&blobs_buf.buf); + for meta in blobs_buf.blobs.iter().rev() { + let io = ios.remove(&(meta.meta.key, meta.meta.lsn)).unwrap(); + + let blob_read = meta.read(&view).await; + let blob_read = match blob_read { + Ok(buf) => buf, + Err(e) => { + io.complete(Err(e)); + continue; + } + }; + + io.complete(Ok(OnDiskValue::WalRecordOrImage( + blob_read.into_bytes(), + ))); + } + + assert!(ios.is_empty()); + } + Err(err) => { + for (_, sender) in ios { + sender.complete(Err(std::io::Error::new( + err.kind(), + "vec read failed", + ))); + } + } + } + + // keep layer resident until this IO is done; this spawned IO future generally outlives the + // call to `self` / the `Arc` / the `ResidentLayer` that guarantees residency + drop(read_extend_residency); + }) + .await; } } @@ -1224,7 +1202,14 @@ impl DeltaLayerInner { let actionable = if let Some((key, lsn, start_offset)) = prev.take() { let end_offset = offset; - Some((BlobMeta { key, lsn }, start_offset..end_offset)) + Some(( + BlobMeta { + key, + lsn, + will_init: false, + }, + start_offset..end_offset, + )) } else { None }; @@ -1560,7 +1545,9 @@ impl DeltaLayerIterator<'_> { let lsn = DeltaKey::extract_lsn_from_buf(&raw_key); 
let blob_ref = BlobRef(value); let offset = blob_ref.pos(); - if let Some(batch_plan) = self.planner.handle(key, lsn, offset) { + if let Some(batch_plan) = + self.planner.handle(key, lsn, offset, blob_ref.will_init()) + { break batch_plan; } } else { @@ -1673,7 +1660,6 @@ pub(crate) mod test { .expect("In memory disk finish should never fail"); let reader = DiskBtreeReader::<_, DELTA_KEY_SIZE>::new(0, root_offset, disk); let planner = VectoredReadPlanner::new(100); - let mut reconstruct_state = ValuesReconstructState::new(); let ctx = RequestContext::new(TaskKind::UnitTest, DownloadBehavior::Error); let keyspace = KeySpace { @@ -1691,7 +1677,6 @@ pub(crate) mod test { disk_offset, reader, planner, - &mut reconstruct_state, &ctx, ) .await @@ -1935,7 +1920,6 @@ pub(crate) mod test { ); let planner = VectoredReadPlanner::new(constants::MAX_VECTORED_READ_BYTES); - let mut reconstruct_state = ValuesReconstructState::new(); let keyspace = pick_random_keyspace(rng, &entries_meta.key_range); let data_end_offset = inner.index_start_blk as u64 * PAGE_SZ as u64; @@ -1945,7 +1929,6 @@ pub(crate) mod test { data_end_offset, index_reader, planner, - &mut reconstruct_state, &ctx, ) .await?; diff --git a/pageserver/src/tenant/storage_layer/image_layer.rs b/pageserver/src/tenant/storage_layer/image_layer.rs index 0d3c9d5a44..c49281dc45 100644 --- a/pageserver/src/tenant/storage_layer/image_layer.rs +++ b/pageserver/src/tenant/storage_layer/image_layer.rs @@ -38,12 +38,11 @@ use crate::tenant::vectored_blob_io::{ BlobFlag, BufView, StreamingVectoredReadPlanner, VectoredBlobReader, VectoredRead, VectoredReadPlanner, }; -use crate::tenant::PageReconstructError; use crate::virtual_file::owned_buffers_io::io_buf_ext::IoBufExt; use crate::virtual_file::IoBufferMut; use crate::virtual_file::{self, MaybeFatalIo, VirtualFile}; use crate::{IMAGE_FILE_MAGIC, STORAGE_FORMAT_VERSION, TEMP_FILE_SUFFIX}; -use anyhow::{anyhow, bail, ensure, Context, Result}; +use anyhow::{bail, ensure, Context, 
Result}; use bytes::Bytes; use camino::{Utf8Path, Utf8PathBuf}; use hex; @@ -56,12 +55,13 @@ use pageserver_api::shard::{ShardIdentity, TenantShardId}; use pageserver_api::value::Value; use rand::{distributions::Alphanumeric, Rng}; use serde::{Deserialize, Serialize}; -use std::collections::VecDeque; +use std::collections::{HashMap, VecDeque}; use std::fs::File; use std::io::SeekFrom; use std::ops::Range; use std::os::unix::prelude::FileExt; use std::str::FromStr; +use std::sync::Arc; use tokio::sync::OnceCell; use tokio_stream::StreamExt; use tracing::*; @@ -73,7 +73,10 @@ use utils::{ }; use super::layer_name::ImageLayerName; -use super::{AsLayerDesc, LayerName, PersistentLayerDesc, ValuesReconstructState}; +use super::{ + AsLayerDesc, LayerName, OnDiskValue, OnDiskValueIo, PersistentLayerDesc, ResidentLayer, + ValuesReconstructState, +}; /// /// Header stored in the beginning of the file @@ -164,7 +167,7 @@ pub struct ImageLayerInner { key_range: Range, lsn: Lsn, - file: VirtualFile, + file: Arc, file_id: FileId, max_vectored_read_bytes: Option, @@ -391,9 +394,11 @@ impl ImageLayerInner { max_vectored_read_bytes: Option, ctx: &RequestContext, ) -> anyhow::Result { - let file = VirtualFile::open_v2(path, ctx) - .await - .context("open layer file")?; + let file = Arc::new( + VirtualFile::open_v2(path, ctx) + .await + .context("open layer file")?, + ); let file_id = page_cache::next_file_id(); let block_reader = FileBlockReader::new(&file, file_id); let summary_blk = block_reader @@ -439,6 +444,7 @@ impl ImageLayerInner { // the reconstruct state with whatever is found. 
pub(super) async fn get_values_reconstruct_data( &self, + this: ResidentLayer, keyspace: KeySpace, reconstruct_state: &mut ValuesReconstructState, ctx: &RequestContext, @@ -448,7 +454,7 @@ impl ImageLayerInner { .await .map_err(GetVectoredError::Other)?; - self.do_reads_and_update_state(reads, reconstruct_state, ctx) + self.do_reads_and_update_state(this, reads, reconstruct_state, ctx) .await; reconstruct_state.on_image_layer_visited(&self.key_range); @@ -570,6 +576,7 @@ impl ImageLayerInner { async fn do_reads_and_update_state( &self, + this: ResidentLayer, reads: Vec, reconstruct_state: &mut ValuesReconstructState, ctx: &RequestContext, @@ -580,8 +587,13 @@ impl ImageLayerInner { .0 .into(); - let vectored_blob_reader = VectoredBlobReader::new(&self.file); for read in reads.into_iter() { + let mut ios: HashMap<(Key, Lsn), OnDiskValueIo> = Default::default(); + for (_, blob_meta) in read.blobs_at.as_slice() { + let io = reconstruct_state.update_key(&blob_meta.key, blob_meta.lsn, true); + ios.insert((blob_meta.key, blob_meta.lsn), io); + } + let buf_size = read.size(); if buf_size > max_vectored_read_bytes { @@ -611,50 +623,51 @@ impl ImageLayerInner { } } - let buf = IoBufferMut::with_capacity(buf_size); - let res = vectored_blob_reader.read_blobs(&read, buf, ctx).await; + let read_extend_residency = this.clone(); + let read_from = self.file.clone(); + let read_ctx = ctx.attached_child(); + reconstruct_state + .spawn_io(async move { + let buf = IoBufferMut::with_capacity(buf_size); + let vectored_blob_reader = VectoredBlobReader::new(&read_from); + let res = vectored_blob_reader.read_blobs(&read, buf, &read_ctx).await; - match res { - Ok(blobs_buf) => { - let view = BufView::new_slice(&blobs_buf.buf); - for meta in blobs_buf.blobs.iter() { - let img_buf = meta.read(&view).await; + match res { + Ok(blobs_buf) => { + let view = BufView::new_slice(&blobs_buf.buf); + for meta in blobs_buf.blobs.iter() { + let io: OnDiskValueIo = + ios.remove(&(meta.meta.key, 
meta.meta.lsn)).unwrap(); + let img_buf = meta.read(&view).await; - let img_buf = match img_buf { - Ok(img_buf) => img_buf, - Err(e) => { - reconstruct_state.on_key_error( - meta.meta.key, - PageReconstructError::Other(anyhow!(e).context(format!( - "Failed to decompress blob from virtual file {}", - self.file.path(), - ))), - ); + let img_buf = match img_buf { + Ok(img_buf) => img_buf, + Err(e) => { + io.complete(Err(e)); + continue; + } + }; - continue; + io.complete(Ok(OnDiskValue::RawImage(img_buf.into_bytes()))); } - }; - reconstruct_state.update_key( - &meta.meta.key, - self.lsn, - Value::Image(img_buf.into_bytes()), - ); + + assert!(ios.is_empty()); + } + Err(err) => { + for (_, io) in ios { + io.complete(Err(std::io::Error::new( + err.kind(), + "vec read failed", + ))); + } + } } - } - Err(err) => { - let kind = err.kind(); - for (_, blob_meta) in read.blobs_at.as_slice() { - reconstruct_state.on_key_error( - blob_meta.key, - PageReconstructError::from(anyhow!( - "Failed to read blobs from virtual file {}: {}", - self.file.path(), - kind - )), - ); - } - } - }; + + // keep layer resident until this IO is done; this spawned IO future generally outlives the + // call to `self` / the `Arc` / the `ResidentLayer` that guarantees residency + drop(read_extend_residency); + }) + .await; } } @@ -1069,6 +1082,7 @@ impl ImageLayerIterator<'_> { Key::from_slice(&raw_key[..KEY_SIZE]), self.image_layer.lsn, offset, + true, ) { break batch_plan; } diff --git a/pageserver/src/tenant/storage_layer/inmemory_layer.rs b/pageserver/src/tenant/storage_layer/inmemory_layer.rs index 2b67f55a17..61a0fdea8c 100644 --- a/pageserver/src/tenant/storage_layer/inmemory_layer.rs +++ b/pageserver/src/tenant/storage_layer/inmemory_layer.rs @@ -8,23 +8,22 @@ use crate::assert_u64_eq_usize::{u64_to_usize, U64IsUsize, UsizeIsU64}; use crate::config::PageServerConf; use crate::context::{PageContentKind, RequestContext, RequestContextBuilder}; use crate::tenant::ephemeral_file::EphemeralFile; 
+use crate::tenant::storage_layer::{OnDiskValue, OnDiskValueIo}; use crate::tenant::timeline::GetVectoredError; -use crate::tenant::PageReconstructError; use crate::virtual_file::owned_buffers_io::io_buf_ext::IoBufExt; use crate::{l0_flush, page_cache}; -use anyhow::{anyhow, Result}; +use anyhow::Result; use camino::Utf8PathBuf; use pageserver_api::key::CompactKey; use pageserver_api::key::Key; use pageserver_api::keyspace::KeySpace; use pageserver_api::models::InMemoryLayerInfo; use pageserver_api::shard::TenantShardId; -use pageserver_api::value::Value; use std::collections::{BTreeMap, HashMap}; use std::sync::{Arc, OnceLock}; use std::time::Instant; use tracing::*; -use utils::{bin_ser::BeSer, id::TimelineId, lsn::Lsn, vec_map::VecMap}; +use utils::{id::TimelineId, lsn::Lsn, vec_map::VecMap}; use wal_decoder::serialized_batch::{SerializedValueBatch, SerializedValueMeta, ValueMeta}; // avoid binding to Write (conflicts with std::io::Write) // while being able to use std::fmt::Write's methods @@ -36,9 +35,7 @@ use std::sync::atomic::Ordering as AtomicOrdering; use std::sync::atomic::{AtomicU64, AtomicUsize}; use tokio::sync::RwLock; -use super::{ - DeltaLayerWriter, PersistentLayerDesc, ValueReconstructSituation, ValuesReconstructState, -}; +use super::{DeltaLayerWriter, PersistentLayerDesc, ValuesReconstructState}; pub(crate) mod vectored_dio_read; @@ -415,10 +412,8 @@ impl InMemoryLayer { // Look up the keys in the provided keyspace and update // the reconstruct state with whatever is found. - // - // If the key is cached, go no further than the cached Lsn. 
pub(crate) async fn get_values_reconstruct_data( - &self, + self: &Arc, keyspace: KeySpace, end_lsn: Lsn, reconstruct_state: &mut ValuesReconstructState, @@ -435,6 +430,9 @@ impl InMemoryLayer { read: vectored_dio_read::LogicalRead>, } let mut reads: HashMap> = HashMap::new(); + let mut ios: HashMap<(Key, Lsn), OnDiskValueIo> = Default::default(); + + let lsn_range = self.start_lsn..end_lsn; for range in keyspace.ranges.iter() { for (key, vec_map) in inner @@ -442,12 +440,7 @@ impl InMemoryLayer { .range(range.start.to_compact()..range.end.to_compact()) { let key = Key::from_compact(*key); - let lsn_range = match reconstruct_state.get_cached_lsn(&key) { - Some(cached_lsn) => (cached_lsn + 1)..end_lsn, - None => self.start_lsn..end_lsn, - }; - - let slice = vec_map.slice_range(lsn_range); + let slice = vec_map.slice_range(lsn_range.clone()); for (entry_lsn, index_entry) in slice.iter().rev() { let IndexEntryUnpacked { @@ -463,55 +456,59 @@ impl InMemoryLayer { Vec::with_capacity(len as usize), ), }); + + let io = reconstruct_state.update_key(&key, *entry_lsn, will_init); + ios.insert((key, *entry_lsn), io); + if will_init { break; } } } } + drop(inner); // release the lock before we spawn the IO; if it's serial-mode IO we will deadlock on the read().await below + let read_from = Arc::clone(self); + let read_ctx = ctx.attached_child(); + reconstruct_state + .spawn_io(async move { + let inner = read_from.inner.read().await; + let f = vectored_dio_read::execute( + &inner.file, + reads + .iter() + .flat_map(|(_, value_reads)| value_reads.iter().map(|v| &v.read)), + &read_ctx, + ); + send_future::SendFuture::send(f) // https://github.com/rust-lang/rust/issues/96865 + .await; - // Execute the reads. 
- - let f = vectored_dio_read::execute( - &inner.file, - reads - .iter() - .flat_map(|(_, value_reads)| value_reads.iter().map(|v| &v.read)), - &ctx, - ); - send_future::SendFuture::send(f) // https://github.com/rust-lang/rust/issues/96865 - .await; - - // Process results into the reconstruct state - 'next_key: for (key, value_reads) in reads { - for ValueRead { entry_lsn, read } in value_reads { - match read.into_result().expect("we run execute() above") { - Err(e) => { - reconstruct_state.on_key_error(key, PageReconstructError::from(anyhow!(e))); - continue 'next_key; - } - Ok(value_buf) => { - let value = Value::des(&value_buf); - if let Err(e) = value { - reconstruct_state - .on_key_error(key, PageReconstructError::from(anyhow!(e))); - continue 'next_key; + for (key, value_reads) in reads { + for ValueRead { entry_lsn, read } in value_reads { + let io = ios.remove(&(key, entry_lsn)).expect("sender must exist"); + match read.into_result().expect("we run execute() above") { + Err(e) => { + io.complete(Err(std::io::Error::new( + e.kind(), + "dio vec read failed", + ))); + } + Ok(value_buf) => { + io.complete(Ok(OnDiskValue::WalRecordOrImage(value_buf.into()))); + } } - - let key_situation = - reconstruct_state.update_key(&key, entry_lsn, value.unwrap()); - if key_situation == ValueReconstructSituation::Complete { - // TODO: metric to see if we fetched more values than necessary - continue 'next_key; - } - - // process the next value in the next iteration of the loop } } - } - } - reconstruct_state.on_lsn_advanced(&keyspace, self.start_lsn); + assert!(ios.is_empty()); + + // Keep layer existent until this IO is done; + // This is kinda forced for InMemoryLayer because we need to inner.read() anyway, + // but it's less obvious for DeltaLayer and ImageLayer. So, keep this explicit + // drop for consistency among all three layer types. 
+ drop(inner); + drop(read_from); + }) + .await; Ok(()) } @@ -606,6 +603,7 @@ impl InMemoryLayer { // Write the batch to the file inner.file.write_raw(&raw, ctx).await?; let new_size = inner.file.len(); + let expected_new_len = base_offset .checked_add(raw.len().into_u64()) // write_raw would error if we were to overflow u64. diff --git a/pageserver/src/tenant/storage_layer/layer.rs b/pageserver/src/tenant/storage_layer/layer.rs index 2b06c88e8b..99e0ff1aa5 100644 --- a/pageserver/src/tenant/storage_layer/layer.rs +++ b/pageserver/src/tenant/storage_layer/layer.rs @@ -308,7 +308,7 @@ impl Layer { reconstruct_data: &mut ValuesReconstructState, ctx: &RequestContext, ) -> Result<(), GetVectoredError> { - let layer = self + let downloaded = self .0 .get_or_maybe_download(true, Some(ctx)) .await @@ -318,11 +318,15 @@ impl Layer { } other => GetVectoredError::Other(anyhow::anyhow!(other)), })?; + let this = ResidentLayer { + downloaded: downloaded.clone(), + owner: self.clone(), + }; self.record_access(ctx); - layer - .get_values_reconstruct_data(keyspace, lsn_range, reconstruct_data, &self.0, ctx) + downloaded + .get_values_reconstruct_data(this, keyspace, lsn_range, reconstruct_data, ctx) .instrument(tracing::debug_span!("get_values_reconstruct_data", layer=%self)) .await .map_err(|err| match err { @@ -697,13 +701,7 @@ impl Drop for LayerInner { if let Some(timeline) = timeline.as_ref() { // Only need to decrement metrics if the timeline still exists: otherwise // it will have already de-registered these metrics via TimelineMetrics::shutdown - if self.desc.is_delta() { - timeline.metrics.layer_count_delta.dec(); - timeline.metrics.layer_size_delta.sub(self.desc.file_size); - } else { - timeline.metrics.layer_count_image.dec(); - timeline.metrics.layer_size_image.sub(self.desc.file_size); - } + timeline.metrics.dec_layer(&self.desc); if matches!(self.access_stats.visibility(), LayerVisibilityHint::Visible) { debug_assert!( @@ -813,13 +811,7 @@ impl LayerInner { }; // 
This object acts as a RAII guard on these metrics: increment on construction - if desc.is_delta() { - timeline.metrics.layer_count_delta.inc(); - timeline.metrics.layer_size_delta.add(desc.file_size); - } else { - timeline.metrics.layer_count_image.inc(); - timeline.metrics.layer_size_image.add(desc.file_size); - } + timeline.metrics.inc_layer(&desc); // New layers are visible by default. This metric is later updated on drop or in set_visibility timeline @@ -1768,25 +1760,25 @@ impl DownloadedLayer { async fn get_values_reconstruct_data( &self, + this: ResidentLayer, keyspace: KeySpace, lsn_range: Range, reconstruct_data: &mut ValuesReconstructState, - owner: &Arc, ctx: &RequestContext, ) -> Result<(), GetVectoredError> { use LayerKind::*; match self - .get(owner, ctx) + .get(&this.owner.0, ctx) .await .map_err(GetVectoredError::Other)? { Delta(d) => { - d.get_values_reconstruct_data(keyspace, lsn_range, reconstruct_data, ctx) + d.get_values_reconstruct_data(this, keyspace, lsn_range, reconstruct_data, ctx) .await } Image(i) => { - i.get_values_reconstruct_data(keyspace, reconstruct_data, ctx) + i.get_values_reconstruct_data(this, keyspace, reconstruct_data, ctx) .await } } diff --git a/pageserver/src/tenant/storage_layer/layer/tests.rs b/pageserver/src/tenant/storage_layer/layer/tests.rs index 36dcc8d805..d93c378ffc 100644 --- a/pageserver/src/tenant/storage_layer/layer/tests.rs +++ b/pageserver/src/tenant/storage_layer/layer/tests.rs @@ -1,6 +1,6 @@ use std::time::UNIX_EPOCH; -use pageserver_api::key::CONTROLFILE_KEY; +use pageserver_api::key::{Key, CONTROLFILE_KEY}; use tokio::task::JoinSet; use utils::{ completion::{self, Completion}, @@ -9,7 +9,13 @@ use utils::{ use super::failpoints::{Failpoint, FailpointKind}; use super::*; -use crate::{context::DownloadBehavior, tenant::storage_layer::LayerVisibilityHint}; +use crate::{ + context::DownloadBehavior, + tenant::{ + harness::test_img, + storage_layer::{IoConcurrency, LayerVisibilityHint}, + }, +}; use 
crate::{task_mgr::TaskKind, tenant::harness::TenantHarness}; /// Used in tests to advance a future to wanted await point, and not futher. @@ -28,23 +34,55 @@ async fn smoke_test() { let span = h.span(); let download_span = span.in_scope(|| tracing::info_span!("downloading", timeline_id = 1)); let (tenant, _) = h.load().await; + let io_concurrency = IoConcurrency::spawn_for_test(); let ctx = RequestContext::new(TaskKind::UnitTest, DownloadBehavior::Download); + let image_layers = vec![( + Lsn(0x40), + vec![( + Key::from_hex("620000000033333333444444445500000000").unwrap(), + test_img("foo"), + )], + )]; + + // Create a test timeline with one real layer, and one synthetic test layer. The synthetic + // one is only there so that we can GC the real one without leaving the timeline's metadata + // empty, which is an illegal state (see [`IndexPart::validate`]). let timeline = tenant - .create_test_timeline(TimelineId::generate(), Lsn(0x10), 14, &ctx) + .create_test_timeline_with_layers( + TimelineId::generate(), + Lsn(0x10), + 14, + &ctx, + Default::default(), + image_layers, + Lsn(0x100), + ) .await .unwrap(); - let layer = { + // Grab one of the timeline's layers to exercise in the test, and the other layer that is just + // there to avoid the timeline being illegally empty + let (layer, dummy_layer) = { let mut layers = { let layers = timeline.layers.read().await; layers.likely_resident_layers().cloned().collect::>() }; - assert_eq!(layers.len(), 1); + assert_eq!(layers.len(), 2); - layers.swap_remove(0) + layers.sort_by_key(|l| l.layer_desc().get_key_range().start); + let synthetic_layer = layers.pop().unwrap(); + let real_layer = layers.pop().unwrap(); + tracing::info!( + "real_layer={:?} ({}), synthetic_layer={:?} ({})", + real_layer, + real_layer.layer_desc().file_size, + synthetic_layer, + synthetic_layer.layer_desc().file_size + ); + (real_layer, synthetic_layer) }; // all layers created at pageserver are like `layer`, initialized with strong @@ -55,7 +93,7 @@ 
async fn smoke_test() { }; let img_before = { - let mut data = ValuesReconstructState::default(); + let mut data = ValuesReconstructState::new(io_concurrency.clone()); layer .get_values_reconstruct_data( controlfile_keyspace.clone(), @@ -65,10 +103,13 @@ async fn smoke_test() { ) .await .unwrap(); + data.keys .remove(&CONTROLFILE_KEY) .expect("must be present") - .expect("should not error") + .collect_pending_ios() + .await + .expect("must not error") .img .take() .expect("tenant harness writes the control file") @@ -87,7 +128,7 @@ async fn smoke_test() { // on accesses when the layer is evicted, it will automatically be downloaded. let img_after = { - let mut data = ValuesReconstructState::default(); + let mut data = ValuesReconstructState::new(io_concurrency.clone()); layer .get_values_reconstruct_data( controlfile_keyspace.clone(), @@ -101,7 +142,9 @@ async fn smoke_test() { data.keys .remove(&CONTROLFILE_KEY) .expect("must be present") - .expect("should not error") + .collect_pending_ios() + .await + .expect("must not error") .img .take() .expect("tenant harness writes the control file") @@ -173,10 +216,13 @@ async fn smoke_test() { let rtc = &timeline.remote_client; + // Simulate GC removing our test layer. 
{ - let layers = &[layer]; let mut g = timeline.layers.write().await; + + let layers = &[layer]; g.open_mut().unwrap().finish_gc_timeline(layers); + // this just updates the remote_physical_size for demonstration purposes rtc.schedule_gc_update(layers).unwrap(); } @@ -191,7 +237,10 @@ async fn smoke_test() { rtc.wait_completion().await.unwrap(); - assert_eq!(rtc.get_remote_physical_size(), 0); + assert_eq!( + rtc.get_remote_physical_size(), + dummy_layer.metadata().file_size + ); assert_eq!(0, LAYER_IMPL_METRICS.inits_cancelled.get()) } diff --git a/pageserver/src/tenant/tasks.rs b/pageserver/src/tenant/tasks.rs index 0118a5ce5f..3725e2f7fc 100644 --- a/pageserver/src/tenant/tasks.rs +++ b/pageserver/src/tenant/tasks.rs @@ -67,10 +67,9 @@ pub(crate) async fn concurrent_background_tasks_rate_limit_permit( ) -> tokio::sync::SemaphorePermit<'static> { let _guard = crate::metrics::BACKGROUND_LOOP_SEMAPHORE.measure_acquisition(loop_kind); - pausable_failpoint!( - "initial-size-calculation-permit-pause", - loop_kind == BackgroundLoopKind::InitialLogicalSizeCalculation - ); + if loop_kind == BackgroundLoopKind::InitialLogicalSizeCalculation { + pausable_failpoint!("initial-size-calculation-permit-pause"); + } // TODO: assert that we run on BACKGROUND_RUNTIME; requires tokio_unstable Handle::id(); match CONCURRENT_BACKGROUND_TASKS.acquire().await { diff --git a/pageserver/src/tenant/timeline.rs b/pageserver/src/tenant/timeline.rs index d6ae11e67d..b4b30fcd23 100644 --- a/pageserver/src/tenant/timeline.rs +++ b/pageserver/src/tenant/timeline.rs @@ -20,12 +20,13 @@ use camino::Utf8Path; use chrono::{DateTime, Utc}; use enumset::EnumSet; use fail::fail_point; +use futures::{stream::FuturesUnordered, StreamExt}; use handle::ShardTimelineId; +use layer_manager::Shutdown; use offload::OffloadError; use once_cell::sync::Lazy; use pageserver_api::models::PageTraceEvent; use pageserver_api::{ - config::tenant_conf_defaults::DEFAULT_COMPACTION_THRESHOLD, key::{ KEY_SIZE, 
METADATA_KEY_BEGIN_PREFIX, METADATA_KEY_END_PREFIX, NON_INHERITED_RANGE, SPARSE_RANGE, @@ -51,34 +52,36 @@ use tokio::{ use tokio_util::sync::CancellationToken; use tracing::*; use utils::{ - fs_ext, pausable_failpoint, + fs_ext, + guard_arc_swap::GuardArcSwap, + pausable_failpoint, postgres_client::PostgresClientProtocol, sync::gate::{Gate, GateGuard}, }; use wal_decoder::serialized_batch::{SerializedValueBatch, ValueMeta}; -use std::sync::atomic::Ordering as AtomicOrdering; -use std::sync::{Arc, Mutex, RwLock, Weak}; +use std::array; +use std::cmp::{max, min}; +use std::collections::btree_map::Entry; +use std::collections::{BTreeMap, HashMap, HashSet}; +use std::ops::{ControlFlow, Deref, Range}; +use std::sync::atomic::{AtomicBool, AtomicU64, Ordering as AtomicOrdering}; +use std::sync::{Arc, Mutex, OnceLock, RwLock, Weak}; use std::time::{Duration, Instant, SystemTime}; -use std::{ - array, - collections::{BTreeMap, HashMap, HashSet}, - sync::atomic::AtomicU64, -}; -use std::{cmp::min, ops::ControlFlow}; -use std::{ - collections::btree_map::Entry, - ops::{Deref, Range}, -}; -use std::{pin::pin, sync::OnceLock}; +use crate::l0_flush::{self, L0FlushGlobalState}; +use crate::tenant::storage_layer::ImageLayerName; use crate::{ aux_file::AuxFileSizeEstimator, + page_service::TenantManagerTypes, tenant::{ config::AttachmentMode, layer_map::{LayerMap, SearchResult}, metadata::TimelineMetadata, - storage_layer::{inmemory_layer::IndexEntry, PersistentLayerDesc}, + storage_layer::{ + inmemory_layer::IndexEntry, BatchLayerWriter, IoConcurrency, PersistentLayerDesc, + ValueReconstructSituation, + }, }, walingest::WalLagCooldown, walredo, @@ -99,10 +102,6 @@ use crate::{ use crate::{ disk_usage_eviction_task::EvictionCandidate, tenant::storage_layer::delta_layer::DeltaEntry, }; -use crate::{ - l0_flush::{self, L0FlushGlobalState}, - metrics::GetKind, -}; use crate::{ metrics::ScanLatencyOngoingRecording, tenant::timeline::logical_size::CurrentLogicalSize, }; @@ -353,8 
+352,8 @@ pub struct Timeline { // though let's keep them both for better error visibility. pub initdb_lsn: Lsn, - /// When did we last calculate the partitioning? Make it pub to test cases. - pub(super) partitioning: tokio::sync::Mutex<((KeyPartitioning, SparseKeyPartitioning), Lsn)>, + /// The repartitioning result. Allows a single writer and multiple readers. + pub(crate) partitioning: GuardArcSwap<((KeyPartitioning, SparseKeyPartitioning), Lsn)>, /// Configuration: how often should the partitioning be recalculated. repartition_threshold: u64, @@ -404,6 +403,9 @@ pub struct Timeline { /// Timeline deletion will acquire both compaction and gc locks in whatever order. compaction_lock: tokio::sync::Mutex<()>, + /// If true, the last compaction failed. + compaction_failed: AtomicBool, + /// Make sure we only have one running gc at a time. /// /// Must only be taken in two places: @@ -429,7 +431,7 @@ pub struct Timeline { pub(crate) l0_flush_global_state: L0FlushGlobalState, - pub(crate) handles: handle::PerTimelineState, + pub(crate) handles: handle::PerTimelineState, pub(crate) attach_wal_lag_cooldown: Arc>, @@ -900,10 +902,17 @@ impl From for PageReconstructError { } } +pub(crate) enum WaitLsnTimeout { + Custom(Duration), + // Use the [`PageServerConf::wait_lsn_timeout`] default + Default, +} + pub(crate) enum WaitLsnWaiter<'a> { Timeline(&'a Timeline), Tenant, PageService, + HttpEndpoint, } /// Argument to [`Timeline::shutdown`]. @@ -925,7 +934,7 @@ pub(crate) enum ShutdownMode { } struct ImageLayerCreationOutcome { - image: Option, + unfinished_image_layer: Option, next_start_key: Key, } @@ -1002,9 +1011,7 @@ impl Timeline { ranges: vec![key..key.next()], }; - // Initialise the reconstruct state for the key with the cache - // entry returned above. 
- let mut reconstruct_state = ValuesReconstructState::new(); + let mut reconstruct_state = ValuesReconstructState::new(IoConcurrency::sequential()); let vectored_res = self .get_vectored_impl(keyspace.clone(), lsn, &mut reconstruct_state, ctx) @@ -1047,6 +1054,7 @@ impl Timeline { &self, keyspace: KeySpace, lsn: Lsn, + io_concurrency: super::storage_layer::IoConcurrency, ctx: &RequestContext, ) -> Result>, GetVectoredError> { if !lsn.is_valid() { @@ -1081,7 +1089,7 @@ impl Timeline { .get_vectored_impl( keyspace.clone(), lsn, - &mut ValuesReconstructState::new(), + &mut ValuesReconstructState::new(io_concurrency), ctx, ) .await; @@ -1106,6 +1114,7 @@ impl Timeline { keyspace: KeySpace, lsn: Lsn, ctx: &RequestContext, + io_concurrency: super::storage_layer::IoConcurrency, ) -> Result>, GetVectoredError> { if !lsn.is_valid() { return Err(GetVectoredError::InvalidLsn(lsn)); @@ -1137,7 +1146,7 @@ impl Timeline { .get_vectored_impl( keyspace.clone(), lsn, - &mut ValuesReconstructState::default(), + &mut ValuesReconstructState::new(io_concurrency), ctx, ) .await; @@ -1156,39 +1165,56 @@ impl Timeline { reconstruct_state: &mut ValuesReconstructState, ctx: &RequestContext, ) -> Result>, GetVectoredError> { - let get_kind = if keyspace.total_raw_size() == 1 { - GetKind::Singular - } else { - GetKind::Vectored + let traversal_res: Result<(), _> = self + .get_vectored_reconstruct_data(keyspace.clone(), lsn, reconstruct_state, ctx) + .await; + if let Err(err) = traversal_res { + // Wait for all the spawned IOs to complete. + // See comments on `spawn_io` inside `storage_layer` for more details. 
+ let mut collect_futs = std::mem::take(&mut reconstruct_state.keys) + .into_values() + .map(|state| state.collect_pending_ios()) + .collect::>(); + while collect_futs.next().await.is_some() {} + return Err(err); }; - let get_data_timer = crate::metrics::GET_RECONSTRUCT_DATA_TIME - .for_get_kind(get_kind) - .start_timer(); - self.get_vectored_reconstruct_data(keyspace.clone(), lsn, reconstruct_state, ctx) - .await?; - get_data_timer.stop_and_record(); - - let reconstruct_timer = crate::metrics::RECONSTRUCT_TIME - .for_get_kind(get_kind) - .start_timer(); - let mut results: BTreeMap> = BTreeMap::new(); let layers_visited = reconstruct_state.get_layers_visited(); - for (key, res) in std::mem::take(&mut reconstruct_state.keys) { - match res { - Err(err) => { - results.insert(key, Err(err)); - } - Ok(state) => { - let state = ValueReconstructState::from(state); + let futs = FuturesUnordered::new(); + for (key, state) in std::mem::take(&mut reconstruct_state.keys) { + futs.push({ + let walredo_self = self.myself.upgrade().expect("&self method holds the arc"); + async move { + assert_eq!(state.situation, ValueReconstructSituation::Complete); - let reconstruct_res = self.reconstruct_value(key, lsn, state).await; - results.insert(key, reconstruct_res); + let converted = match state.collect_pending_ios().await { + Ok(ok) => ok, + Err(err) => { + return (key, Err(err)); + } + }; + + // The walredo module expects the records to be descending in terms of Lsn. + // And we submit the IOs in that order, so, there shuold be no need to sort here. + debug_assert!( + converted + .records + .is_sorted_by_key(|(lsn, _)| std::cmp::Reverse(*lsn)), + "{converted:?}" + ); + + ( + key, + walredo_self.reconstruct_value(key, lsn, converted).await, + ) } - } + }); } - reconstruct_timer.stop_and_record(); + + let results = futs + .collect::>>() + .await; // For aux file keys (v1 or v2) the vectored read path does not return an error // when they're missing. 
Instead they are omitted from the resulting btree @@ -1283,6 +1309,7 @@ impl Timeline { &self, lsn: Lsn, who_is_waiting: WaitLsnWaiter<'_>, + timeout: WaitLsnTimeout, ctx: &RequestContext, /* Prepare for use by cancellation */ ) -> Result<(), WaitLsnError> { let state = self.current_state(); @@ -1299,7 +1326,7 @@ impl Timeline { | TaskKind::WalReceiverConnectionPoller => { let is_myself = match who_is_waiting { WaitLsnWaiter::Timeline(waiter) => Weak::ptr_eq(&waiter.myself, &self.myself), - WaitLsnWaiter::Tenant | WaitLsnWaiter::PageService => unreachable!("tenant or page_service context are not expected to have task kind {:?}", ctx.task_kind()), + WaitLsnWaiter::Tenant | WaitLsnWaiter::PageService | WaitLsnWaiter::HttpEndpoint => unreachable!("tenant or page_service context are not expected to have task kind {:?}", ctx.task_kind()), }; if is_myself { if let Err(current) = self.last_record_lsn.would_wait_for(lsn) { @@ -1315,13 +1342,14 @@ impl Timeline { } } + let timeout = match timeout { + WaitLsnTimeout::Custom(t) => t, + WaitLsnTimeout::Default => self.conf.wait_lsn_timeout, + }; + let _timer = crate::metrics::WAIT_LSN_TIME.start_timer(); - match self - .last_record_lsn - .wait_for_timeout(lsn, self.conf.wait_lsn_timeout) - .await - { + match self.last_record_lsn.wait_for_timeout(lsn, timeout).await { Ok(()) => Ok(()), Err(e) => { use utils::seqwait::SeqWaitError::*; @@ -1681,13 +1709,27 @@ impl Timeline { return Ok(false); } - match self.get_compaction_algorithm_settings().kind { + let result = match self.get_compaction_algorithm_settings().kind { CompactionAlgorithm::Tiered => { self.compact_tiered(cancel, ctx).await?; Ok(false) } CompactionAlgorithm::Legacy => self.compact_legacy(cancel, options, ctx).await, - } + }; + + // Signal compaction failure to avoid L0 flush stalls when it's broken. 
+ match result { + Ok(_) => self.compaction_failed.store(false, AtomicOrdering::Relaxed), + Err(CompactionError::Other(_)) => { + self.compaction_failed.store(true, AtomicOrdering::Relaxed) + } + // Don't change the current value on offload failure or shutdown. We don't want to + // abruptly stall nor resume L0 flushes in these cases. + Err(CompactionError::Offload(_)) => {} + Err(CompactionError::ShuttingDown) => {} + }; + + result } /// Mutate the timeline with a [`TimelineWriter`]. @@ -2116,6 +2158,13 @@ impl Timeline { .unwrap_or(self.conf.default_tenant_conf.checkpoint_timeout) } + fn get_compaction_period(&self) -> Duration { + let tenant_conf = self.tenant_conf.load().tenant_conf.clone(); + tenant_conf + .compaction_period + .unwrap_or(self.conf.default_tenant_conf.compaction_period) + } + fn get_compaction_target_size(&self) -> u64 { let tenant_conf = self.tenant_conf.load(); tenant_conf @@ -2132,6 +2181,101 @@ impl Timeline { .unwrap_or(self.conf.default_tenant_conf.compaction_threshold) } + fn get_compaction_upper_limit(&self) -> usize { + let tenant_conf = self.tenant_conf.load(); + tenant_conf + .tenant_conf + .compaction_upper_limit + .unwrap_or(self.conf.default_tenant_conf.compaction_upper_limit) + } + + fn get_l0_flush_delay_threshold(&self) -> Option { + // Disable L0 flushes by default. This and compaction needs further tuning. + const DEFAULT_L0_FLUSH_DELAY_FACTOR: usize = 0; // TODO: default to e.g. 3 + + // If compaction is disabled, don't delay. + if self.get_compaction_period() == Duration::ZERO { + return None; + } + + let compaction_threshold = self.get_compaction_threshold(); + let tenant_conf = self.tenant_conf.load(); + let l0_flush_delay_threshold = tenant_conf + .tenant_conf + .l0_flush_delay_threshold + .or(self.conf.default_tenant_conf.l0_flush_delay_threshold) + .unwrap_or(DEFAULT_L0_FLUSH_DELAY_FACTOR * compaction_threshold); + + // 0 disables backpressure. 
+ if l0_flush_delay_threshold == 0 { + return None; + } + + // Clamp the flush delay threshold to the compaction threshold; it doesn't make sense to + // backpressure flushes below this. + // TODO: the tenant config should have validation to prevent this instead. + debug_assert!(l0_flush_delay_threshold >= compaction_threshold); + Some(max(l0_flush_delay_threshold, compaction_threshold)) + } + + fn get_l0_flush_stall_threshold(&self) -> Option { + // Disable L0 stalls by default. In ingest benchmarks, we see image compaction take >10 + // minutes, blocking L0 compaction, and we can't stall L0 flushes for that long. + const DEFAULT_L0_FLUSH_STALL_FACTOR: usize = 0; // TODO: default to e.g. 5 + + // If compaction is disabled, don't stall. + if self.get_compaction_period() == Duration::ZERO { + return None; + } + + // If compaction is failing, don't stall and try to keep the tenant alive. This may not be a + // good idea: read amp can grow unbounded, leading to terrible performance, and we may take + // on unbounded compaction debt that can take a long time to fix once compaction comes back + // online. At least we'll delay flushes, slowing down the growth and buying some time. + if self.compaction_failed.load(AtomicOrdering::Relaxed) { + return None; + } + + let compaction_threshold = self.get_compaction_threshold(); + let tenant_conf = self.tenant_conf.load(); + let l0_flush_stall_threshold = tenant_conf + .tenant_conf + .l0_flush_stall_threshold + .or(self.conf.default_tenant_conf.l0_flush_stall_threshold); + + // Tests sometimes set compaction_threshold=1 to generate lots of layer files, and don't + // handle the 20-second compaction delay. Some (e.g. `test_backward_compatibility`) can't + // easily adjust the L0 backpressure settings, so just disable stalls in this case. 
+ if cfg!(feature = "testing") + && compaction_threshold == 1 + && l0_flush_stall_threshold.is_none() + { + return None; + } + + let l0_flush_stall_threshold = l0_flush_stall_threshold + .unwrap_or(DEFAULT_L0_FLUSH_STALL_FACTOR * compaction_threshold); + + // 0 disables backpressure. + if l0_flush_stall_threshold == 0 { + return None; + } + + // Clamp the flush stall threshold to the compaction threshold; it doesn't make sense to + // backpressure flushes below this. + // TODO: the tenant config should have validation to prevent this instead. + debug_assert!(l0_flush_stall_threshold >= compaction_threshold); + Some(max(l0_flush_stall_threshold, compaction_threshold)) + } + + fn get_l0_flush_wait_upload(&self) -> bool { + let tenant_conf = self.tenant_conf.load(); + tenant_conf + .tenant_conf + .l0_flush_wait_upload + .unwrap_or(self.conf.default_tenant_conf.l0_flush_wait_upload) + } + fn get_image_creation_threshold(&self) -> usize { let tenant_conf = self.tenant_conf.load(); tenant_conf @@ -2340,7 +2484,8 @@ impl Timeline { // initial logical size is 0. LogicalSize::empty_initial() }, - partitioning: tokio::sync::Mutex::new(( + + partitioning: GuardArcSwap::new(( (KeyPartitioning::new(), KeyPartitioning::new().into_sparse()), Lsn(0), )), @@ -2367,6 +2512,7 @@ impl Timeline { gate: Gate::default(), compaction_lock: tokio::sync::Mutex::default(), + compaction_failed: AtomicBool::default(), gc_lock: tokio::sync::Mutex::default(), standby_horizon: AtomicLsn::new(0), @@ -2800,12 +2946,10 @@ impl Timeline { "initial size calculation", // NB: don't log errors here, task_mgr will do that. async move { - let cancel = task_mgr::shutdown_token(); self_clone .initial_logical_size_calculation_task( initial_part_end, cancel_wait_for_background_loop_concurrency_limit_semaphore, - cancel, background_ctx, ) .await; @@ -2815,11 +2959,21 @@ impl Timeline { ); } + /// # Cancellation + /// + /// This method is sensitive to `Timeline::cancel`. 
+ /// + /// It is _not_ sensitive to task_mgr::shutdown_token(). + /// + /// # Cancel-Safety + /// + /// It does Timeline IO, hence this should be polled to completion because + /// we could be leaving in-flight IOs behind, which is safe, but annoying + /// to reason about. async fn initial_logical_size_calculation_task( self: Arc, initial_part_end: Lsn, skip_concurrency_limiter: CancellationToken, - cancel: CancellationToken, background_ctx: RequestContext, ) { scopeguard::defer! { @@ -2832,7 +2986,6 @@ impl Timeline { let self_ref = &self; let skip_concurrency_limiter = &skip_concurrency_limiter; async move { - let cancel = task_mgr::shutdown_token(); let wait_for_permit = super::tasks::concurrent_background_tasks_rate_limit_permit( BackgroundLoopKind::InitialLogicalSizeCalculation, background_ctx, @@ -2846,9 +2999,6 @@ impl Timeline { _ = self_ref.cancel.cancelled() => { return Err(CalculateLogicalSizeError::Cancelled); } - _ = cancel.cancelled() => { - return Err(CalculateLogicalSizeError::Cancelled); - }, () = skip_concurrency_limiter.cancelled() => { // Some action that is part of a end user interaction requested logical size // => break out of the rate limit @@ -2865,6 +3015,14 @@ impl Timeline { crate::metrics::initial_logical_size::START_CALCULATION.retry(circumstances) }; + let io_concurrency = IoConcurrency::spawn_from_conf( + self_ref.conf, + self_ref + .gate + .enter() + .map_err(|_| CalculateLogicalSizeError::Cancelled)?, + ); + let calculated_size = self_ref .logical_size_calculation_task( initial_part_end, @@ -2874,7 +3032,11 @@ impl Timeline { .await?; self_ref - .trigger_aux_file_size_computation(initial_part_end, background_ctx) + .trigger_aux_file_size_computation( + initial_part_end, + background_ctx, + io_concurrency, + ) .await?; // TODO: add aux file size to logical size @@ -2907,22 +3069,18 @@ impl Timeline { ) .expect("10min < 1hour"), ); - tokio::time::sleep(sleep_duration).await; + tokio::select! 
{ + _ = tokio::time::sleep(sleep_duration) => {} + _ = self.cancel.cancelled() => return ControlFlow::Break(()), + } } } } }; - let (calculated_size, metrics_guard) = tokio::select! { - res = retrying => { - match res { - ControlFlow::Continue(calculated_size) => calculated_size, - ControlFlow::Break(()) => return, - } - } - _ = cancel.cancelled() => { - return; - } + let (calculated_size, metrics_guard) = match retrying.await { + ControlFlow::Continue(calculated_size) => calculated_size, + ControlFlow::Break(()) => return, }; // we cannot query current_logical_size.current_size() to know the current @@ -2978,9 +3136,6 @@ impl Timeline { receiver } - /// # Cancel-Safety - /// - /// This method is cancellation-safe. #[instrument(skip_all)] async fn logical_size_calculation_task( self: &Arc, @@ -2998,32 +3153,13 @@ impl Timeline { .enter() .map_err(|_| CalculateLogicalSizeError::Cancelled)?; - let self_calculation = Arc::clone(self); - - let mut calculation = pin!(async { - let ctx = ctx.attached_child(); - self_calculation - .calculate_logical_size(lsn, cause, &guard, &ctx) - .await - }); - - tokio::select! { - res = &mut calculation => { res } - _ = self.cancel.cancelled() => { - debug!("cancelling logical size calculation for timeline shutdown"); - calculation.await - } - } + self.calculate_logical_size(lsn, cause, &guard, ctx).await } /// Calculate the logical size of the database at the latest LSN. /// /// NOTE: counted incrementally, includes ancestors. This can be a slow operation, /// especially if we need to download remote layers. - /// - /// # Cancel-Safety - /// - /// This method is cancellation-safe. 
async fn calculate_logical_size( &self, up_to_lsn: Lsn, @@ -3036,7 +3172,10 @@ impl Timeline { self.timeline_id, up_to_lsn ); - pausable_failpoint!("timeline-calculate-logical-size-pause"); + if let Err(()) = pausable_failpoint!("timeline-calculate-logical-size-pause", &self.cancel) + { + return Err(CalculateLogicalSizeError::Cancelled); + } // See if we've already done the work for initial size calculation. // This is a short-cut for timelines that are mostly unused. @@ -3329,6 +3468,13 @@ impl Timeline { let mut completed_keyspace = KeySpace::default(); let mut image_covered_keyspace = KeySpaceRandomAccum::new(); + // Prevent GC from progressing while visiting the current timeline. + // If we are GC-ing because a new image layer was added while traversing + // the timeline, then it will remove layers that are required for fulfilling + // the current get request (read-path cannot "look back" and notice the new + // image layer). + let _gc_cutoff_holder = timeline.get_latest_gc_cutoff_lsn(); + loop { if cancel.is_cancelled() { return Err(GetVectoredError::Cancelled); @@ -3469,7 +3615,12 @@ impl Timeline { } } ancestor - .wait_lsn(self.ancestor_lsn, WaitLsnWaiter::Timeline(self), ctx) + .wait_lsn( + self.ancestor_lsn, + WaitLsnWaiter::Timeline(self), + WaitLsnTimeout::Default, + ctx, + ) .await .map_err(|e| match e { e @ WaitLsnError::Timeout(_) => GetReadyAncestorError::AncestorLsnTimeout(e), @@ -3552,7 +3703,7 @@ impl Timeline { let mut guard = self.layers.write().await; guard .open_mut()? - .try_freeze_in_memory_layer(at, &self.last_freeze_at, write_lock) + .try_freeze_in_memory_layer(at, &self.last_freeze_at, write_lock, &self.metrics) .await }; @@ -3589,6 +3740,12 @@ impl Timeline { mut layer_flush_start_rx: tokio::sync::watch::Receiver<(u64, Lsn)>, ctx: &RequestContext, ) { + // Subscribe to L0 delta layer updates, for compaction backpressure. 
+ let mut watch_l0 = match self.layers.read().await.layer_map() { + Ok(lm) => lm.watch_level0_deltas(), + Err(Shutdown) => return, + }; + info!("started flush loop"); loop { tokio::select! { @@ -3613,43 +3770,68 @@ impl Timeline { return; } - let timer = self.metrics.flush_time_histo.start_timer(); + // Break to notify potential waiters as soon as we've flushed the requested LSN. If + // more requests have arrived in the meanwhile, we'll resume flushing afterwards. + if flushed_to_lsn >= frozen_to_lsn { + break Ok(()); + } - let num_frozen_layers; - let frozen_layer_total_size; - let layer_to_flush = { - let guard = self.layers.read().await; - let Ok(lm) = guard.layer_map() else { + // Fetch the next layer to flush, if any. + let (layer, l0_count, frozen_count, frozen_size) = { + let layers = self.layers.read().await; + let Ok(lm) = layers.layer_map() else { info!("dropping out of flush loop for timeline shutdown"); return; }; - num_frozen_layers = lm.frozen_layers.len(); - frozen_layer_total_size = lm + let l0_count = lm.level0_deltas().len(); + let frozen_count = lm.frozen_layers.len(); + let frozen_size: u64 = lm .frozen_layers .iter() .map(|l| l.estimated_in_mem_size()) - .sum::(); - lm.frozen_layers.front().cloned() - // drop 'layers' lock to allow concurrent reads and writes + .sum(); + let layer = lm.frozen_layers.front().cloned(); + (layer, l0_count, frozen_count, frozen_size) + // drop 'layers' lock }; - let Some(layer_to_flush) = layer_to_flush else { + let Some(layer) = layer else { break Ok(()); }; - if num_frozen_layers - > std::cmp::max( - self.get_compaction_threshold(), - DEFAULT_COMPACTION_THRESHOLD, - ) - && frozen_layer_total_size >= /* 128 MB */ 128000000 - { - tracing::warn!( - "too many frozen layers: {num_frozen_layers} layers with estimated in-mem size of {frozen_layer_total_size} bytes", - ); - } - match self.flush_frozen_layer(layer_to_flush, ctx).await { - Ok(this_layer_to_lsn) => { - flushed_to_lsn = std::cmp::max(flushed_to_lsn, 
this_layer_to_lsn); + + // Stall flushes to backpressure if compaction can't keep up. This is propagated up + // to WAL ingestion by having ephemeral layer rolls wait for flushes. + // + // NB: the compaction loop only checks `compaction_threshold` every 20 seconds, so + // we can end up stalling before compaction even starts. Consider making it more + // responsive (e.g. via `watch_level0_deltas`). + if let Some(stall_threshold) = self.get_l0_flush_stall_threshold() { + if l0_count >= stall_threshold { + warn!( + "stalling layer flushes for compaction backpressure at {l0_count} \ + L0 layers ({frozen_count} frozen layers with {frozen_size} bytes)" + ); + let stall_timer = self + .metrics + .flush_delay_histo + .start_timer() + .record_on_drop(); + tokio::select! { + result = watch_l0.wait_for(|l0| *l0 < stall_threshold) => { + if let Ok(l0) = result.as_deref() { + let delay = stall_timer.elapsed().as_secs_f64(); + info!("resuming layer flushes at {l0} L0 layers after {delay:.3}s"); + } + }, + _ = self.cancel.cancelled() => {}, + } + continue; // check again } + } + + // Flush the layer. + let flush_timer = self.metrics.flush_time_histo.start_timer(); + match self.flush_frozen_layer(layer, ctx).await { + Ok(layer_lsn) => flushed_to_lsn = max(flushed_to_lsn, layer_lsn), Err(FlushLayerError::Cancelled) => { info!("dropping out of flush loop for timeline shutdown"); return; @@ -3663,7 +3845,30 @@ impl Timeline { break err.map(|_| ()); } } - timer.stop_and_record(); + let flush_duration = flush_timer.stop_and_record(); + + // Delay the next flush to backpressure if compaction can't keep up. We delay by the + // flush duration such that the flush takes 2x as long. This is propagated up to WAL + // ingestion by having ephemeral layer rolls wait for flushes. 
+ if let Some(delay_threshold) = self.get_l0_flush_delay_threshold() { + if l0_count >= delay_threshold { + let delay = flush_duration.as_secs_f64(); + info!( + "delaying layer flush by {delay:.3}s for compaction backpressure at \ + {l0_count} L0 layers ({frozen_count} frozen layers with {frozen_size} bytes)" + ); + let _delay_timer = self + .metrics + .flush_delay_histo + .start_timer() + .record_on_drop(); + tokio::select! { + _ = tokio::time::sleep(flush_duration) => {}, + _ = watch_l0.wait_for(|l0| *l0 < delay_threshold) => {}, + _ = self.cancel.cancelled() => {}, + } + } + } }; // Unsharded tenants should never advance their LSN beyond the end of the @@ -3867,21 +4072,24 @@ impl Timeline { // Backpressure mechanism: wait with continuation of the flush loop until we have uploaded all layer files. // This makes us refuse ingest until the new layers have been persisted to the remote - let start = Instant::now(); - self.remote_client - .wait_completion() - .await - .map_err(|e| match e { - WaitCompletionError::UploadQueueShutDownOrStopped - | WaitCompletionError::NotInitialized( - NotInitialized::ShuttingDown | NotInitialized::Stopped, - ) => FlushLayerError::Cancelled, - WaitCompletionError::NotInitialized(NotInitialized::Uninitialized) => { - FlushLayerError::Other(anyhow!(e).into()) - } - })?; - let duration = start.elapsed().as_secs_f64(); - self.metrics.flush_wait_upload_time_gauge_add(duration); + // TODO: remove this, and rely on l0_flush_{delay,stall}_threshold instead. 
+ if self.get_l0_flush_wait_upload() { + let start = Instant::now(); + self.remote_client + .wait_completion() + .await + .map_err(|e| match e { + WaitCompletionError::UploadQueueShutDownOrStopped + | WaitCompletionError::NotInitialized( + NotInitialized::ShuttingDown | NotInitialized::Stopped, + ) => FlushLayerError::Cancelled, + WaitCompletionError::NotInitialized(NotInitialized::Uninitialized) => { + FlushLayerError::Other(anyhow!(e).into()) + } + })?; + let duration = start.elapsed().as_secs_f64(); + self.metrics.flush_wait_upload_time_gauge_add(duration); + } // FIXME: between create_delta_layer and the scheduling of the upload in `update_metadata_file`, // a compaction can delete the file and then it won't be available for uploads any more. @@ -4028,18 +4236,15 @@ impl Timeline { flags: EnumSet, ctx: &RequestContext, ) -> Result<((KeyPartitioning, SparseKeyPartitioning), Lsn), CompactionError> { - let Ok(mut partitioning_guard) = self.partitioning.try_lock() else { + let Ok(mut guard) = self.partitioning.try_write_guard() else { // NB: there are two callers, one is the compaction task, of which there is only one per struct Tenant and hence Timeline. // The other is the initdb optimization in flush_frozen_layer, used by `boostrap_timeline`, which runs before `.activate()` // and hence before the compaction task starts. - // Note that there are a third "caller" that will take the `partitioning` lock. It is `gc_compaction_split_jobs` for - // gc-compaction where it uses the repartition data to determine the split jobs. In the future, it might use its own - // heuristics, but for now, we should allow concurrent access to it and let the caller retry compaction. 
return Err(CompactionError::Other(anyhow!( - "repartition() called concurrently, this is rare and a retry should be fine" + "repartition() called concurrently" ))); }; - let ((dense_partition, sparse_partition), partition_lsn) = &*partitioning_guard; + let ((dense_partition, sparse_partition), partition_lsn) = &*guard.read(); if lsn < *partition_lsn { return Err(CompactionError::Other(anyhow!( "repartition() called with LSN going backwards, this should not happen" @@ -4067,9 +4272,9 @@ impl Timeline { let sparse_partitioning = SparseKeyPartitioning { parts: vec![sparse_ks], }; // no partitioning for metadata keys for now - *partitioning_guard = ((dense_partitioning, sparse_partitioning), lsn); - - Ok((partitioning_guard.0.clone(), partitioning_guard.1)) + let result = ((dense_partitioning, sparse_partitioning), lsn); + guard.write(result.clone()); + Ok(result) } // Is it time to create a new image layer for the given partition? @@ -4127,6 +4332,7 @@ impl Timeline { /// Create image layers for Postgres data. Assumes the caller passes a partition that is not too large, /// so that at most one image layer will be produced from this function. + #[allow(clippy::too_many_arguments)] async fn create_image_layer_for_rel_blocks( self: &Arc, partition: &KeySpace, @@ -4135,6 +4341,7 @@ impl Timeline { ctx: &RequestContext, img_range: Range, start: Key, + io_concurrency: IoConcurrency, ) -> Result { let mut wrote_keys = false; @@ -4163,7 +4370,12 @@ impl Timeline { || (last_key_in_range && key_request_accum.raw_size() > 0) { let results = self - .get_vectored(key_request_accum.consume_keyspace(), lsn, ctx) + .get_vectored( + key_request_accum.consume_keyspace(), + lsn, + io_concurrency.clone(), + ctx, + ) .await?; if self.cancel.is_cancelled() { @@ -4209,11 +4421,15 @@ impl Timeline { if wrote_keys { // Normal path: we have written some data into the new image layer for this // partition, so flush it to disk. 
- let (desc, path) = image_layer_writer.finish(ctx).await?; - let image_layer = Layer::finish_creating(self.conf, self, desc, &path)?; - info!("created image layer for rel {}", image_layer.local_path()); + info!( + "produced image layer for rel {}", + ImageLayerName { + key_range: img_range.clone(), + lsn + }, + ); Ok(ImageLayerCreationOutcome { - image: Some(image_layer), + unfinished_image_layer: Some(image_layer_writer), next_start_key: img_range.end, }) } else { @@ -4223,7 +4439,7 @@ impl Timeline { // layer we write will cover the key range that we just scanned. tracing::debug!("no data in range {}-{}", img_range.start, img_range.end); Ok(ImageLayerCreationOutcome { - image: None, + unfinished_image_layer: None, next_start_key: start, }) } @@ -4242,9 +4458,10 @@ impl Timeline { img_range: Range, mode: ImageLayerCreationMode, start: Key, + io_concurrency: IoConcurrency, ) -> Result { // Metadata keys image layer creation. - let mut reconstruct_state = ValuesReconstructState::default(); + let mut reconstruct_state = ValuesReconstructState::new(io_concurrency); let begin = Instant::now(); let data = self .get_vectored_impl(partition.clone(), lsn, &mut reconstruct_state, ctx) @@ -4271,7 +4488,7 @@ impl Timeline { if !trigger_generation && mode == ImageLayerCreationMode::Try { return Ok(ImageLayerCreationOutcome { - image: None, + unfinished_image_layer: None, next_start_key: img_range.end, }); } @@ -4297,14 +4514,15 @@ impl Timeline { if wrote_any_image { // Normal path: we have written some data into the new image layer for this // partition, so flush it to disk. 
- let (desc, path) = image_layer_writer.finish(ctx).await?; - let image_layer = Layer::finish_creating(self.conf, self, desc, &path)?; info!( "created image layer for metadata {}", - image_layer.local_path() + ImageLayerName { + key_range: img_range.clone(), + lsn + } ); Ok(ImageLayerCreationOutcome { - image: Some(image_layer), + unfinished_image_layer: Some(image_layer_writer), next_start_key: img_range.end, }) } else { @@ -4314,7 +4532,7 @@ impl Timeline { // layer we write will cover the key range that we just scanned. tracing::debug!("no data in range {}-{}", img_range.start, img_range.end); Ok(ImageLayerCreationOutcome { - image: None, + unfinished_image_layer: None, next_start_key: start, }) } @@ -4381,7 +4599,6 @@ impl Timeline { ctx: &RequestContext, ) -> Result, CreateImageLayersError> { let timer = self.metrics.create_images_time_histo.start_timer(); - let mut image_layers = Vec::new(); // We need to avoid holes between generated image layers. // Otherwise LayerMap::image_layer_exists will return false if key range of some layer is covered by more than one @@ -4396,6 +4613,8 @@ impl Timeline { let check_for_image_layers = self.should_check_if_image_layers_required(lsn); + let mut batch_image_writer = BatchLayerWriter::new(self.conf).await?; + for partition in partitioning.parts.iter() { if self.cancel.is_cancelled() { return Err(CreateImageLayersError::Cancelled); @@ -4461,43 +4680,52 @@ impl Timeline { ))) }); - if !compact_metadata { - let ImageLayerCreationOutcome { - image, - next_start_key, - } = self - .create_image_layer_for_rel_blocks( - partition, - image_layer_writer, - lsn, - ctx, - img_range, - start, - ) - .await?; + let io_concurrency = IoConcurrency::spawn_from_conf( + self.conf, + self.gate + .enter() + .map_err(|_| CreateImageLayersError::Cancelled)?, + ); - start = next_start_key; - image_layers.extend(image); + let ImageLayerCreationOutcome { + unfinished_image_layer, + next_start_key, + } = if !compact_metadata { + 
self.create_image_layer_for_rel_blocks( + partition, + image_layer_writer, + lsn, + ctx, + img_range.clone(), + start, + io_concurrency, + ) + .await? } else { - let ImageLayerCreationOutcome { - image, - next_start_key, - } = self - .create_image_layer_for_metadata_keys( - partition, - image_layer_writer, - lsn, - ctx, - img_range, - mode, - start, - ) - .await?; - start = next_start_key; - image_layers.extend(image); + self.create_image_layer_for_metadata_keys( + partition, + image_layer_writer, + lsn, + ctx, + img_range.clone(), + mode, + start, + io_concurrency, + ) + .await? + }; + start = next_start_key; + if let Some(unfinished_image_layer) = unfinished_image_layer { + batch_image_writer.add_unfinished_image_writer( + unfinished_image_layer, + img_range, + lsn, + ); } } + let image_layers = batch_image_writer.finish(self, ctx).await?; + let mut guard = self.layers.write().await; // FIXME: we could add the images to be uploaded *before* returning from here, but right @@ -4625,6 +4853,10 @@ impl Drop for Timeline { } } } + info!( + "Timeline {} for tenant {} is being dropped", + self.timeline_id, self.tenant_shard_id.tenant_id + ); } } @@ -5673,9 +5905,17 @@ impl Timeline { info!("force created image layer {}", image_layer.local_path()); { let mut guard = self.layers.write().await; - guard.open_mut().unwrap().force_insert_layer(image_layer); + guard + .open_mut() + .unwrap() + .force_insert_layer(image_layer.clone()); } + // Update remote_timeline_client state to reflect existence of this layer + self.remote_client + .schedule_layer_file_upload(image_layer) + .unwrap(); + Ok(()) } @@ -5726,9 +5966,17 @@ impl Timeline { info!("force created delta layer {}", delta_layer.local_path()); { let mut guard = self.layers.write().await; - guard.open_mut().unwrap().force_insert_layer(delta_layer); + guard + .open_mut() + .unwrap() + .force_insert_layer(delta_layer.clone()); } + // Update remote_timeline_client state to reflect existence of this layer + self.remote_client 
+ .schedule_layer_file_upload(delta_layer) + .unwrap(); + Ok(()) } @@ -5738,13 +5986,14 @@ impl Timeline { self: &Arc, lsn: Lsn, ctx: &RequestContext, + io_concurrency: IoConcurrency, ) -> anyhow::Result> { let mut all_data = Vec::new(); let guard = self.layers.read().await; for layer in guard.layer_map()?.iter_historic_layers() { if !layer.is_delta() && layer.image_layer_lsn() == lsn { let layer = guard.get_from_desc(&layer); - let mut reconstruct_data = ValuesReconstructState::default(); + let mut reconstruct_data = ValuesReconstructState::new(io_concurrency.clone()); layer .get_values_reconstruct_data( KeySpace::single(Key::MIN..Key::MAX), @@ -5753,8 +6002,9 @@ impl Timeline { ctx, ) .await?; - for (k, v) in reconstruct_data.keys { - all_data.push((k, v?.img.unwrap().1)); + for (k, v) in std::mem::take(&mut reconstruct_data.keys) { + let v = v.collect_pending_ios().await?; + all_data.push((k, v.img.unwrap().1)); } } } @@ -5875,13 +6125,37 @@ impl TimelineWriter<'_> { async fn roll_layer(&mut self, freeze_at: Lsn) -> Result<(), FlushLayerError> { let current_size = self.write_guard.as_ref().unwrap().current_size; + // If layer flushes are backpressured due to compaction not keeping up, wait for the flush + // to propagate the backpressure up into WAL ingestion. + let l0_count = self + .tl + .layers + .read() + .await + .layer_map()? 
+ .level0_deltas() + .len(); + let wait_thresholds = [ + self.get_l0_flush_delay_threshold(), + self.get_l0_flush_stall_threshold(), + ]; + let wait_threshold = wait_thresholds.into_iter().flatten().min(); + // self.write_guard will be taken by the freezing - self.tl + let flush_id = self + .tl .freeze_inmem_layer_at(freeze_at, &mut self.write_guard) .await?; assert!(self.write_guard.is_none()); + if let Some(wait_threshold) = wait_threshold { + if l0_count >= wait_threshold { + info!("layer roll waiting for flush due to compaction backpressure at {l0_count} L0 layers"); + self.tl.wait_flush_completion(flush_id).await?; + } + } + if current_size >= self.get_checkpoint_distance() * 2 { warn!("Flushed oversized open layer with size {}", current_size) } diff --git a/pageserver/src/tenant/timeline/compaction.rs b/pageserver/src/tenant/timeline/compaction.rs index 2042a18e96..5f7b5f1af5 100644 --- a/pageserver/src/tenant/timeline/compaction.rs +++ b/pageserver/src/tenant/timeline/compaction.rs @@ -42,14 +42,12 @@ use crate::tenant::storage_layer::merge_iterator::MergeIterator; use crate::tenant::storage_layer::{ AsLayerDesc, PersistentLayerDesc, PersistentLayerKey, ValueReconstructState, }; -use crate::tenant::timeline::ImageLayerCreationOutcome; use crate::tenant::timeline::{drop_rlock, DeltaLayerWriter, ImageLayerWriter}; +use crate::tenant::timeline::{ImageLayerCreationOutcome, IoConcurrency}; use crate::tenant::timeline::{Layer, ResidentLayer}; use crate::tenant::{gc_block, DeltaLayer, MaybeOffloaded}; use crate::virtual_file::{MaybeFatalIo, VirtualFile}; -use pageserver_api::config::tenant_conf_defaults::{ - DEFAULT_CHECKPOINT_DISTANCE, DEFAULT_COMPACTION_THRESHOLD, -}; +use pageserver_api::config::tenant_conf_defaults::DEFAULT_CHECKPOINT_DISTANCE; use pageserver_api::key::Key; use pageserver_api::keyspace::KeySpace; @@ -436,12 +434,14 @@ impl KeyHistoryRetention { if dry_run { return true; } - let guard = tline.layers.read().await; - if !guard.contains_key(key) { 
- return false; + let layer_generation; + { + let guard = tline.layers.read().await; + if !guard.contains_key(key) { + return false; + } + layer_generation = guard.get_from_key(key).metadata().generation; } - let layer_generation = guard.get_from_key(key).metadata().generation; - drop(guard); if layer_generation == tline.generation { info!( key=%key, @@ -624,7 +624,13 @@ impl Timeline { // High level strategy for compaction / image creation: // - // 1. First, calculate the desired "partitioning" of the + // 1. First, do a L0 compaction to ensure we move the L0 + // layers into the historic layer map get flat levels of + // layers. If we did not compact all L0 layers, we will + // prioritize compacting the timeline again and not do + // any of the compactions below. + // + // 2. Then, calculate the desired "partitioning" of the // currently in-use key space. The goal is to partition the // key space into roughly fixed-size chunks, but also take into // account any existing image layers, and try to align the @@ -638,7 +644,7 @@ impl Timeline { // identify a relation. This is just an optimization, // though. // - // 2. Once we know the partitioning, for each partition, + // 3. Once we know the partitioning, for each partition, // decide if it's time to create a new image layer. The // criteria is: there has been too much "churn" since the last // image layer? The "churn" is fuzzy concept, it's a @@ -646,15 +652,8 @@ impl Timeline { // total in the delta file. Or perhaps: if creating an image // file would allow to delete some older files. // - // 3. After that, we compact all level0 delta files if there - // are too many of them. While compacting, we also garbage - // collect any page versions that are no longer needed because - // of the new image layers we created in step 2. - // - // TODO: This high level strategy hasn't been implemented yet. 
- // Below are functions compact_level0() and create_image_layers() - // but they are a bit ad hoc and don't quite work like it's explained - // above. Rewrite it. + // 4. In the end, if the tenant gets auto-sharded, we will run + // a shard-ancestor compaction. // Is the timeline being deleted? if self.is_stopping() { @@ -666,10 +665,32 @@ impl Timeline { // Define partitioning schema if needed - // FIXME: the match should only cover repartitioning, not the next steps - let (partition_count, has_pending_tasks) = match self + // 1. L0 Compact + let fully_compacted = { + let timer = self.metrics.compact_time_histo.start_timer(); + let fully_compacted = self + .compact_level0( + target_file_size, + options.flags.contains(CompactFlags::ForceL0Compaction), + ctx, + ) + .await?; + timer.stop_and_record(); + fully_compacted + }; + + if !fully_compacted { + // Yield and do not do any other kind of compaction. True means + // that we have pending L0 compaction tasks and the compaction scheduler + // will prioritize compacting this tenant/timeline again. + info!("skipping image layer generation and shard ancestor compaction due to L0 compaction did not include all layers."); + return Ok(true); + } + + // 2. Repartition and create image layers if necessary + let partition_count = match self .repartition( - self.get_last_record_lsn(), + self.get_last_record_lsn(), // TODO: use L0-L1 boundary self.get_compaction_target_size(), options.flags, ctx, @@ -682,46 +703,30 @@ impl Timeline { .access_stats_behavior(AccessStatsBehavior::Skip) .build(); - // 2. Compact - let timer = self.metrics.compact_time_histo.start_timer(); - let fully_compacted = self - .compact_level0( - target_file_size, - options.flags.contains(CompactFlags::ForceL0Compaction), - ctx, - ) - .await?; - timer.stop_and_record(); - let mut partitioning = dense_partitioning; partitioning .parts .extend(sparse_partitioning.into_dense().parts); - // 3. 
Create new image layers for partitions that have been modified - // "enough". Skip image layer creation if L0 compaction cannot keep up. - if fully_compacted { - let image_layers = self - .create_image_layers( - &partitioning, - lsn, - if options - .flags - .contains(CompactFlags::ForceImageLayerCreation) - { - ImageLayerCreationMode::Force - } else { - ImageLayerCreationMode::Try - }, - &image_ctx, - ) - .await?; + // 3. Create new image layers for partitions that have been modified "enough". + let image_layers = self + .create_image_layers( + &partitioning, + lsn, + if options + .flags + .contains(CompactFlags::ForceImageLayerCreation) + { + ImageLayerCreationMode::Force + } else { + ImageLayerCreationMode::Try + }, + &image_ctx, + ) + .await?; - self.upload_new_image_layers(image_layers)?; - } else { - info!("skipping image layer generation due to L0 compaction did not include all layers."); - } - (partitioning.parts.len(), !fully_compacted) + self.upload_new_image_layers(image_layers)?; + partitioning.parts.len() } Err(err) => { // no partitioning? This is normal, if the timeline was just created @@ -733,10 +738,12 @@ impl Timeline { if !self.cancel.is_cancelled() && !err.is_cancelled() { tracing::error!("could not compact, repartitioning keyspace failed: {err:?}"); } - (1, false) + 1 } }; + // 4. 
Shard ancestor compaction + if self.shard_identity.count >= ShardCount::new(2) { // Limit the number of layer rewrites to the number of partitions: this means its // runtime should be comparable to a full round of image layer creations, rather than @@ -746,7 +753,7 @@ impl Timeline { self.compact_shard_ancestors(rewrite_max, ctx).await?; } - Ok(has_pending_tasks) + Ok(false) } /// Check for layers that are elegible to be rewritten: @@ -1112,16 +1119,15 @@ impl Timeline { // Accumulate the size of layers in `deltas_to_compact` let mut deltas_to_compact_bytes = 0; - // Under normal circumstances, we will accumulate up to compaction_interval L0s of size + // Under normal circumstances, we will accumulate up to compaction_upper_limit L0s of size // checkpoint_distance each. To avoid edge cases using extra system resources, bound our // work in this function to only operate on this much delta data at once. // - // Take the max of the configured value & the default, so that tests that configure tiny values - // can still use a sensible amount of memory, but if a deployed system configures bigger values we - // still let them compact a full stack of L0s in one go. + // In general, compaction_threshold should be <= compaction_upper_limit, but in case that + // the constraint is not respected, we use the larger of the two. 
let delta_size_limit = std::cmp::max( + self.get_compaction_upper_limit(), self.get_compaction_threshold(), - DEFAULT_COMPACTION_THRESHOLD, ) as u64 * std::cmp::max(self.get_checkpoint_distance(), DEFAULT_CHECKPOINT_DISTANCE); @@ -2138,6 +2144,11 @@ impl Timeline { self.get_gc_compaction_watermark() }; + if compact_below_lsn == Lsn::INVALID { + tracing::warn!("no layers to compact with gc: gc_cutoff not generated yet, skipping gc bottom-most compaction"); + return Ok(vec![]); + } + // Split compaction job to about 4GB each const GC_COMPACT_MAX_SIZE_MB: u64 = 4 * 1024; let sub_compaction_max_job_size_mb = @@ -2146,12 +2157,7 @@ impl Timeline { let mut compact_jobs = Vec::new(); // For now, we simply use the key partitioning information; we should do a more fine-grained partitioning // by estimating the amount of files read for a compaction job. We should also partition on LSN. - let ((dense_ks, sparse_ks), _) = { - let Ok(partition) = self.partitioning.try_lock() else { - bail!("failed to acquire partition lock during gc-compaction"); - }; - partition.clone() - }; + let ((dense_ks, sparse_ks), _) = self.partitioning.read().as_ref().clone(); // Truncate the key range to be within user specified compaction range. fn truncate_to( source_start: &Key, @@ -2217,6 +2223,12 @@ impl Timeline { } else { end }; + let end = if ranges_num == idx + 1 { + // extend the compaction range to the end of the key range if it's the last partition + end.max(job.compact_key_range.end) + } else { + end + }; info!( "splitting compaction job: {}..{}, estimated_size={}", start, end, total_size @@ -2337,6 +2349,11 @@ impl Timeline { // each of the retain_lsn. Therefore, if the user-provided `compact_lsn_range.end` is larger than the real gc cutoff, we will use // the real cutoff. let mut gc_cutoff = if compact_lsn_range.end == Lsn::MAX { + if real_gc_cutoff == Lsn::INVALID { + // If the gc_cutoff is not generated yet, we should not compact anything. 
+ tracing::warn!("no layers to compact with gc: gc_cutoff not generated yet, skipping gc bottom-most compaction"); + return Ok(()); + } real_gc_cutoff } else { compact_lsn_range.end @@ -2868,7 +2885,7 @@ impl Timeline { "produced {} delta layers and {} image layers, {} layers are kept", produced_delta_layers_len, produced_image_layers_len, - layer_selection.len() + keep_layers.len() ); // Step 3: Place back to the layer map. @@ -2914,8 +2931,28 @@ impl Timeline { // be batched into `schedule_compaction_update`. let disk_consistent_lsn = self.disk_consistent_lsn.load(); self.schedule_uploads(disk_consistent_lsn, None)?; + // If a layer gets rewritten throughout gc-compaction, we need to keep that layer only in `compact_to` instead + // of `compact_from`. + let compact_from = { + let mut compact_from = Vec::new(); + let mut compact_to_set = HashMap::new(); + for layer in &compact_to { + compact_to_set.insert(layer.layer_desc().key(), layer); + } + for layer in &layer_selection { + if let Some(to) = compact_to_set.get(&layer.layer_desc().key()) { + tracing::info!( + "skipping delete {} because found same layer key at different generation {}", + layer, to + ); + } else { + compact_from.push(layer.clone()); + } + } + compact_from + }; self.remote_client - .schedule_compaction_update(&layer_selection, &compact_to)?; + .schedule_compaction_update(&compact_from, &compact_to)?; drop(gc_lock); @@ -3164,7 +3201,7 @@ impl TimelineAdaptor { // TODO set proper (stateful) start. 
The create_image_layer_for_rel_blocks function mostly let start = Key::MIN; let ImageLayerCreationOutcome { - image, + unfinished_image_layer, next_start_key: _, } = self .timeline @@ -3175,10 +3212,14 @@ impl TimelineAdaptor { ctx, key_range.clone(), start, + IoConcurrency::sequential(), ) .await?; - if let Some(image_layer) = image { + if let Some(image_layer_writer) = unfinished_image_layer { + let (desc, path) = image_layer_writer.finish(ctx).await?; + let image_layer = + Layer::finish_creating(self.timeline.conf, &self.timeline, desc, &path)?; self.new_images.push(image_layer); } diff --git a/pageserver/src/tenant/timeline/delete.rs b/pageserver/src/tenant/timeline/delete.rs index bdc315d985..3c828c8a9e 100644 --- a/pageserver/src/tenant/timeline/delete.rs +++ b/pageserver/src/tenant/timeline/delete.rs @@ -112,7 +112,7 @@ pub(super) async fn delete_local_timeline_directory( } /// It is important that this gets called when DeletionGuard is being held. -/// For more context see comments in [`DeleteTimelineFlow::prepare`] +/// For more context see comments in [`make_timeline_delete_guard`] async fn remove_maybe_offloaded_timeline_from_tenant( tenant: &Tenant, timeline: &TimelineOrOffloaded, @@ -193,10 +193,8 @@ impl DeleteTimelineFlow { ) -> Result<(), DeleteTimelineError> { super::debug_assert_current_span_has_tenant_and_timeline_id(); - let allow_offloaded_children = false; - let set_stopping = true; let (timeline, mut guard) = - Self::prepare(tenant, timeline_id, allow_offloaded_children, set_stopping)?; + make_timeline_delete_guard(tenant, timeline_id, TimelineDeleteGuardKind::Delete)?; guard.mark_in_progress()?; @@ -333,75 +331,6 @@ impl DeleteTimelineFlow { Ok(()) } - pub(super) fn prepare( - tenant: &Tenant, - timeline_id: TimelineId, - allow_offloaded_children: bool, - set_stopping: bool, - ) -> Result<(TimelineOrOffloaded, DeletionGuard), DeleteTimelineError> { - // Note the interaction between this guard and deletion guard. 
- // Here we attempt to lock deletion guard when we're holding a lock on timelines. - // This is important because when you take into account `remove_timeline_from_tenant` - // we remove timeline from memory when we still hold the deletion guard. - // So here when timeline deletion is finished timeline wont be present in timelines map at all - // which makes the following sequence impossible: - // T1: get preempted right before the try_lock on `Timeline::delete_progress` - // T2: do a full deletion, acquire and drop `Timeline::delete_progress` - // T1: acquire deletion lock, do another `DeleteTimelineFlow::run` - // For more context see this discussion: `https://github.com/neondatabase/neon/pull/4552#discussion_r1253437346` - let timelines = tenant.timelines.lock().unwrap(); - let timelines_offloaded = tenant.timelines_offloaded.lock().unwrap(); - - let timeline = match timelines.get(&timeline_id) { - Some(t) => TimelineOrOffloaded::Timeline(Arc::clone(t)), - None => match timelines_offloaded.get(&timeline_id) { - Some(t) => TimelineOrOffloaded::Offloaded(Arc::clone(t)), - None => return Err(DeleteTimelineError::NotFound), - }, - }; - - // Ensure that there are no child timelines, because we are about to remove files, - // which will break child branches - let mut children = Vec::new(); - if !allow_offloaded_children { - children.extend(timelines_offloaded.iter().filter_map(|(id, entry)| { - (entry.ancestor_timeline_id == Some(timeline_id)).then_some(*id) - })); - } - children.extend(timelines.iter().filter_map(|(id, entry)| { - (entry.get_ancestor_timeline_id() == Some(timeline_id)).then_some(*id) - })); - - if !children.is_empty() { - return Err(DeleteTimelineError::HasChildren(children)); - } - - // Note that using try_lock here is important to avoid a deadlock. - // Here we take lock on timelines and then the deletion guard. - // At the end of the operation we're holding the guard and need to lock timelines map - // to remove the timeline from it. 
- // Always if you have two locks that are taken in different order this can result in a deadlock. - - let delete_progress = Arc::clone(timeline.delete_progress()); - let delete_lock_guard = match delete_progress.try_lock_owned() { - Ok(guard) => DeletionGuard(guard), - Err(_) => { - // Unfortunately if lock fails arc is consumed. - return Err(DeleteTimelineError::AlreadyInProgress(Arc::clone( - timeline.delete_progress(), - ))); - } - }; - - if set_stopping { - if let TimelineOrOffloaded::Timeline(timeline) = &timeline { - timeline.set_state(TimelineState::Stopping); - } - } - - Ok((timeline, delete_lock_guard)) - } - fn schedule_background( guard: DeletionGuard, conf: &'static PageServerConf, @@ -483,6 +412,80 @@ impl DeleteTimelineFlow { } } +#[derive(Copy, Clone, PartialEq, Eq)] +pub(super) enum TimelineDeleteGuardKind { + Offload, + Delete, +} + +pub(super) fn make_timeline_delete_guard( + tenant: &Tenant, + timeline_id: TimelineId, + guard_kind: TimelineDeleteGuardKind, +) -> Result<(TimelineOrOffloaded, DeletionGuard), DeleteTimelineError> { + // Note the interaction between this guard and deletion guard. + // Here we attempt to lock deletion guard when we're holding a lock on timelines. + // This is important because when you take into account `remove_timeline_from_tenant` + // we remove timeline from memory when we still hold the deletion guard. 
+ // So here when timeline deletion is finished timeline wont be present in timelines map at all + // which makes the following sequence impossible: + // T1: get preempted right before the try_lock on `Timeline::delete_progress` + // T2: do a full deletion, acquire and drop `Timeline::delete_progress` + // T1: acquire deletion lock, do another `DeleteTimelineFlow::run` + // For more context see this discussion: `https://github.com/neondatabase/neon/pull/4552#discussion_r1253437346` + let timelines = tenant.timelines.lock().unwrap(); + let timelines_offloaded = tenant.timelines_offloaded.lock().unwrap(); + + let timeline = match timelines.get(&timeline_id) { + Some(t) => TimelineOrOffloaded::Timeline(Arc::clone(t)), + None => match timelines_offloaded.get(&timeline_id) { + Some(t) => TimelineOrOffloaded::Offloaded(Arc::clone(t)), + None => return Err(DeleteTimelineError::NotFound), + }, + }; + + // Ensure that there are no child timelines, because we are about to remove files, + // which will break child branches + let mut children = Vec::new(); + if guard_kind == TimelineDeleteGuardKind::Delete { + children.extend(timelines_offloaded.iter().filter_map(|(id, entry)| { + (entry.ancestor_timeline_id == Some(timeline_id)).then_some(*id) + })); + } + children.extend(timelines.iter().filter_map(|(id, entry)| { + (entry.get_ancestor_timeline_id() == Some(timeline_id)).then_some(*id) + })); + + if !children.is_empty() { + return Err(DeleteTimelineError::HasChildren(children)); + } + + // Note that using try_lock here is important to avoid a deadlock. + // Here we take lock on timelines and then the deletion guard. + // At the end of the operation we're holding the guard and need to lock timelines map + // to remove the timeline from it. + // Always if you have two locks that are taken in different order this can result in a deadlock. 
+ + let delete_progress = Arc::clone(timeline.delete_progress()); + let delete_lock_guard = match delete_progress.try_lock_owned() { + Ok(guard) => DeletionGuard(guard), + Err(_) => { + // Unfortunately if lock fails arc is consumed. + return Err(DeleteTimelineError::AlreadyInProgress(Arc::clone( + timeline.delete_progress(), + ))); + } + }; + + if guard_kind == TimelineDeleteGuardKind::Delete { + if let TimelineOrOffloaded::Timeline(timeline) = &timeline { + timeline.set_state(TimelineState::Stopping); + } + } + + Ok((timeline, delete_lock_guard)) +} + pub(super) struct DeletionGuard(OwnedMutexGuard); impl Deref for DeletionGuard { diff --git a/pageserver/src/tenant/timeline/handle.rs b/pageserver/src/tenant/timeline/handle.rs index e82559b8b3..5b39daaaf8 100644 --- a/pageserver/src/tenant/timeline/handle.rs +++ b/pageserver/src/tenant/timeline/handle.rs @@ -32,54 +32,151 @@ //! //! # Design //! +//! ## Data Structures +//! //! There are three user-facing data structures: //! - `PerTimelineState`: a struct embedded into each Timeline struct. Lifetime == Timeline lifetime. //! - `Cache`: a struct private to each connection handler; Lifetime == connection lifetime. //! - `Handle`: a smart pointer that holds the Timeline gate open and derefs to `&Timeline`. -//! Lifetime: for a single request dispatch on the Timeline (i.e., one getpage request) +//! - `WeakHandle`: downgrade of a `Handle` that does not keep the gate open, but allows +//! trying to ugprade back to a `Handle`, guaranteeing it's the same `Timeline` *object*. //! -//! The `Handle` is just a wrapper around an `Arc`. +//! Internally, there is 0 or 1 `HandleInner` per `(Cache,Timeline)`. +//! Since Cache:Connection is 1:1, there is 0 or 1 `HandleInner` per `(Connection,Timeline)`. //! -//! There is one long-lived `Arc`, which is stored in the `PerTimelineState`. -//! The `Cache` stores a `Weak` for each cached Timeline. +//! The `HandleInner` is allocated as a `Arc>` and +//! 
referenced weakly and strongly from various places which we are now illustrating. +//! For brevity, we will omit the `Arc>` part in the following and instead +//! use `strong ref` and `weak ref` when referring to the `Arc>` +//! or `Weak>`, respectively. +//! +//! - The `Handle` is a strong ref. +//! - The `WeakHandle` is a weak ref. +//! - The `PerTimelineState` contains a `HashMap`. +//! - The `Cache` is a `HashMap`. +//! +//! Lifetimes: +//! - `WeakHandle` and `Handle`: single pagestream request. +//! - `Cache`: single page service connection. +//! - `PerTimelineState`: lifetime of the Timeline object (i.e., i.e., till `Timeline::shutdown`). +//! +//! ## Request Handling Flow (= filling and using the `Cache``) //! //! To dispatch a request, the page service connection calls `Cache::get`. //! //! A cache miss means we consult the tenant manager for shard routing, -//! resulting in an `Arc`. We enter its gate _once_ and construct an -//! `Arc`. We store a `Weak` in the cache -//! and the `Arc` in the `PerTimelineState`. +//! resulting in an `Arc`. We enter its gate _once_ and store it in the the +//! `Arc>>`. A weak ref is stored in the `Cache` +//! and a strong ref in the `PerTimelineState`. +//! A strong ref is returned wrapped in a `Handle`. //! //! For subsequent requests, `Cache::get` will perform a "fast path" shard routing -//! and find the `Weak` in the cache. -//! We upgrade the `Weak` to an `Arc` and wrap it in the user-facing `Handle` type. +//! and find the weak ref in the cache. +//! We upgrade the weak ref to a strong ref and return it wrapped in a `Handle`. //! -//! The request handler dispatches the request to the right `>::$request_method`. +//! The pagestream processing is pipelined and involves a batching step. +//! While a request is batching, the `Handle` is downgraded to a `WeakHandle`. +//! When the batch is ready to be executed, the `WeakHandle` is upgraded back to a `Handle` +//! 
and the request handler dispatches the request to the right `>::$request_method`. //! It then drops the `Handle`, which drops the `Arc`. //! -//! # Memory Management / How The Reference Cycle Is Broken +//! # Performance //! -//! The attentive reader may have noticed the strong reference cycle -//! from `Arc` to `PerTimelineState` to `Arc`. +//! Remember from the introductory section: //! -//! This cycle is intentional: while it exists, the `Cache` can upgrade its -//! `Weak` to an `Arc` in a single atomic operation. +//! > However, we want to avoid the overhead of entering the gate for every +//! > method invocation. +//! +//! Why do we want to avoid that? +//! Because the gate is a shared location in memory and entering it involves +//! bumping refcounts, which leads to cache contention if done frequently +//! from multiple cores in parallel. +//! +//! So, we only acquire the `GateGuard` once on `Cache` miss, and wrap it in an `Arc`. +//! That `Arc` is private to the `HandleInner` and hence to the connection. +//! (Review the "Data Structures" section if that is unclear to you.) +//! +//! A `WeakHandle` is a weak ref to the `HandleInner`. +//! When upgrading a `WeakHandle`, we upgrade to a strong ref to the `HandleInner` and +//! further acquire an additional strong ref to the `Arc` inside it. +//! Again, this manipulation of ref counts is is cheap because `Arc` is private to the connection. +//! +//! When downgrading a `Handle` to a `WeakHandle`, we drop the `Arc`. +//! Again, this is cheap because the `Arc` is private to the connection. +//! +//! In addition to the GateGuard, we need to provide `Deref` impl. +//! For this, both `Handle` need infallible access to an `Arc`. +//! We could clone the `Arc` when upgrading a `WeakHandle`, but that would cause contention +//! on the shared memory location that trakcs the refcount of the `Arc`. +//! Instead, we wrap the `Arc` into another `Arc`. +//! so that we can clone it cheaply when upgrading a `WeakHandle`. +//! 
+//! # Shutdown +//! +//! The attentive reader may have noticed the following reference cycle around the `Arc`: +//! +//! ```text +//! Timeline --owns--> PerTimelineState --strong--> HandleInner --strong--> Timeline +//! ``` +//! +//! Further, there is this cycle: +//! +//! ```text +//! Timeline --owns--> PerTimelineState --strong--> HandleInner --strong--> GateGuard --keepalive--> Timeline +//! ``` +//! +//! The former cycle is a memory leak if not broken. +//! The latter cycle further prevents the Timeline from shutting down +//! because we certainly won't drop the Timeline while the GateGuard is alive. +//! Preventing shutdown is the whole point of this handle/cache system, +//! but when the Timeline needs to shut down, we need to break the cycle. //! //! The cycle is broken by either -//! - `PerTimelineState::shutdown` or -//! - dropping the `Cache`. +//! - Timeline shutdown (=> `PerTimelineState::shutdown`) +//! - Connection shutdown (=> dropping the `Cache`). //! -//! Concurrently existing `Handle`s will extend the existence of the cycle. +//! Both transition the `HandleInner` from [`HandleInner::KeepingTimelineGateOpen`] to +//! [`HandleInner::ShutDown`], which drops the only long-lived strong ref to the +//! `Arc`. +//! +//! `PerTimelineState::shutdown` drops all the `HandleInners` it contains, +//! thereby breaking the cycle. +//! It also initiates draining of already existing `Handle`s by +//! poisoning things so that no new `HandleInner`'s can be added +//! to the `PerTimelineState`, which will make subsequent `Cache::get` fail. +//! +//! Concurrently existing / already upgraded `Handle`s will extend the +//! lifetime of the `Arc>` and hence cycles. //! However, since `Handle`s are short-lived and new `Handle`s are not -//! handed out after either `PerTimelineState::shutdown` or `Cache` drop, -//! that extension of the cycle is bounded. +//! handed out from `Cache::get` or `WeakHandle::upgrade` after +//! 
`PerTimelineState::shutdown`, that extension of the cycle is bounded. +//! +//! Concurrently existing `WeakHandle`s will fail to `upgrade()`: +//! while they will succeed in upgrading `Weak>`, +//! they will find the inner in state `HandleInner::ShutDown` state where the +//! `Arc` and Timeline has already been dropped. +//! +//! Dropping the `Cache` undoes the registration of this `Cache`'s +//! `HandleInner`s from all the `PerTimelineState`s, i.e., it +//! removes the strong ref to each of its `HandleInner`s +//! from all the `PerTimelineState`. +//! +//! # Locking Rules +//! +//! To prevent deadlocks we: +//! +//! 1. Only ever hold one of the locks at a time. +//! 2. Don't add more than one Drop impl that locks on the +//! cycles above. +//! +//! As per (2), that impl is in `Drop for Cache`. //! //! # Fast Path for Shard Routing //! //! The `Cache` has a fast path for shard routing to avoid calling into //! the tenant manager for every request. //! -//! The `Cache` maintains a hash map of `ShardTimelineId` to `Weak`. +//! The `Cache` maintains a hash map of `ShardTimelineId` to `WeakHandle`s. //! //! The current implementation uses the first entry in the hash map //! to determine the `ShardParameters` and derive the correct @@ -87,18 +184,18 @@ //! //! It then looks up the hash map for that `ShardTimelineId := {ShardIndex,TimelineId}`. //! -//! If the lookup is successful and the `Weak` can be upgraded, +//! If the lookup is successful and the `WeakHandle` can be upgraded, //! it's a hit. //! //! ## Cache invalidation //! -//! The insight is that cache invalidation is sufficient and most efficiently done lazily. +//! The insight is that cache invalidation is sufficient and most efficiently if done lazily. //! The only reasons why an entry in the cache can become stale are: //! 1. The `PerTimelineState` / Timeline is shutting down e.g. because the shard is //! being detached, timeline or shard deleted, or pageserver is shutting down. //! 2. 
We're doing a shard split and new traffic should be routed to the child shards. //! -//! Regarding (1), we will eventually fail to upgrade the `Weak` once the +//! Regarding (1), we will eventually fail to upgrade the `WeakHandle` once the //! timeline has shut down, and when that happens, we remove the entry from the cache. //! //! Regarding (2), the insight is that it is toally fine to keep dispatching requests @@ -107,8 +204,6 @@ use std::collections::hash_map; use std::collections::HashMap; -use std::sync::atomic::AtomicBool; -use std::sync::atomic::Ordering; use std::sync::Arc; use std::sync::Mutex; use std::sync::Weak; @@ -152,7 +247,7 @@ pub(crate) struct Cache { map: Map, } -type Map = HashMap>>; +type Map = HashMap>; impl Default for Cache { fn default() -> Self { @@ -170,12 +265,22 @@ pub(crate) struct ShardTimelineId { } /// See module-level comment. -pub(crate) struct Handle(Arc>); -struct HandleInner { - shut_down: AtomicBool, - timeline: T::Timeline, - // The timeline's gate held open. - _gate_guard: utils::sync::gate::GateGuard, +pub(crate) struct Handle { + timeline: Arc, + #[allow(dead_code)] // the field exists to keep the gate open + gate_guard: Arc, + inner: Arc>>, +} +pub(crate) struct WeakHandle { + inner: Weak>>, +} +enum HandleInner { + KeepingTimelineGateOpen { + #[allow(dead_code)] + gate_guard: Arc, + timeline: Arc, + }, + ShutDown, } /// Embedded in each [`Types::Timeline`] as the anchor for the only long-lived strong ref to `HandleInner`. @@ -183,7 +288,8 @@ struct HandleInner { /// See module-level comment for details. 
pub struct PerTimelineState { // None = shutting down - handles: Mutex>>>>, + #[allow(clippy::type_complexity)] + handles: Mutex>>>>>, } impl Default for PerTimelineState { @@ -243,49 +349,24 @@ impl Cache { shard_selector: ShardSelector, tenant_manager: &T::TenantManager, ) -> Result, GetError> { - // terminates because each iteration removes an element from the map - loop { - let handle = self - .get_impl(timeline_id, shard_selector, tenant_manager) - .await?; - if handle.0.shut_down.load(Ordering::Relaxed) { - let removed = self - .map - .remove(&handle.0.timeline.shard_timeline_id()) - .expect("invariant of get_impl is that the returned handle is in the map"); - assert!( - Weak::ptr_eq(&removed, &Arc::downgrade(&handle.0)), - "shard_timeline_id() incorrect?" - ); - } else { - return Ok(handle); - } - } - } - - #[instrument(level = "trace", skip_all)] - async fn get_impl( - &mut self, - timeline_id: TimelineId, - shard_selector: ShardSelector, - tenant_manager: &T::TenantManager, - ) -> Result, GetError> { - let miss: ShardSelector = { + // terminates because when every iteration we remove an element from the map + let miss: ShardSelector = loop { let routing_state = self.shard_routing(timeline_id, shard_selector); match routing_state { RoutingResult::FastPath(handle) => return Ok(handle), RoutingResult::SlowPath(key) => match self.map.get(&key) { Some(cached) => match cached.upgrade() { - Some(upgraded) => return Ok(Handle(upgraded)), - None => { + Ok(upgraded) => return Ok(upgraded), + Err(HandleUpgradeError::ShutDown) => { + // TODO: dedup with shard_routing() trace!("handle cache stale"); self.map.remove(&key).unwrap(); - ShardSelector::Known(key.shard_index) + continue; } }, - None => ShardSelector::Known(key.shard_index), + None => break ShardSelector::Known(key.shard_index), }, - RoutingResult::NeedConsultTenantManager => shard_selector, + RoutingResult::NeedConsultTenantManager => break shard_selector, } }; self.get_miss(timeline_id, miss, 
tenant_manager).await @@ -302,7 +383,7 @@ impl Cache { let Some((first_key, first_handle)) = self.map.iter().next() else { return RoutingResult::NeedConsultTenantManager; }; - let Some(first_handle) = first_handle.upgrade() else { + let Ok(first_handle) = first_handle.upgrade() else { // TODO: dedup with get() trace!("handle cache stale"); let first_key_owned = *first_key; @@ -310,7 +391,7 @@ impl Cache { continue; }; - let first_handle_shard_identity = first_handle.timeline.get_shard_identity(); + let first_handle_shard_identity = first_handle.get_shard_identity(); let make_shard_index = |shard_num: ShardNumber| ShardIndex { shard_number: shard_num, shard_count: first_handle_shard_identity.count, @@ -329,11 +410,11 @@ impl Cache { }; let first_handle_shard_timeline_id = ShardTimelineId { shard_index: first_handle_shard_identity.shard_index(), - timeline_id: first_handle.timeline.shard_timeline_id().timeline_id, + timeline_id: first_handle.shard_timeline_id().timeline_id, }; if need_shard_timeline_id == first_handle_shard_timeline_id { - return RoutingResult::FastPath(Handle(first_handle)); + return RoutingResult::FastPath(first_handle); } else { return RoutingResult::SlowPath(need_shard_timeline_id); } @@ -357,23 +438,30 @@ impl Cache { ShardSelector::Known(idx) => assert_eq!(idx, &key.shard_index), } - let gate_guard = match timeline.gate().enter() { - Ok(guard) => guard, - Err(_) => { - return Err(GetError::TimelineGateClosed); - } - }; trace!("creating new HandleInner"); - let handle = Arc::new( - // TODO: global metric that keeps track of the number of live HandlerTimeline instances - // so we can identify reference cycle bugs. 
- HandleInner { - shut_down: AtomicBool::new(false), - _gate_guard: gate_guard, - timeline: timeline.clone(), - }, - ); - let handle = { + let handle_inner_arc = Arc::new(Mutex::new(HandleInner::KeepingTimelineGateOpen { + gate_guard: Arc::new( + // this enter() is expensive in production code because + // it hits the global Arc::gate refcounts + match timeline.gate().enter() { + Ok(guard) => guard, + Err(_) => { + return Err(GetError::TimelineGateClosed); + } + }, + ), + // this clone is expensive in production code because + // it hits the global Arc::clone refcounts + timeline: Arc::new(timeline.clone()), + })); + let handle_weak = WeakHandle { + inner: Arc::downgrade(&handle_inner_arc), + }; + let handle = handle_weak + .upgrade() + .ok() + .expect("we just created it and it's not linked anywhere yet"); + { let mut lock_guard = timeline .per_timeline_state() .handles @@ -381,7 +469,8 @@ impl Cache { .expect("mutex poisoned"); match &mut *lock_guard { Some(per_timeline_state) => { - let replaced = per_timeline_state.insert(self.id, Arc::clone(&handle)); + let replaced = + per_timeline_state.insert(self.id, Arc::clone(&handle_inner_arc)); assert!(replaced.is_none(), "some earlier code left a stale handle"); match self.map.entry(key) { hash_map::Entry::Occupied(_o) => { @@ -392,8 +481,7 @@ impl Cache { unreachable!() } hash_map::Entry::Vacant(v) => { - v.insert(Arc::downgrade(&handle)); - handle + v.insert(handle_weak); } } } @@ -401,14 +489,62 @@ impl Cache { return Err(GetError::PerTimelineStateShutDown); } } - }; - Ok(Handle(handle)) + } + Ok(handle) } Err(e) => Err(GetError::TenantManager(e)), } } } +pub(crate) enum HandleUpgradeError { + ShutDown, +} + +impl WeakHandle { + pub(crate) fn upgrade(&self) -> Result, HandleUpgradeError> { + let Some(inner) = Weak::upgrade(&self.inner) else { + return Err(HandleUpgradeError::ShutDown); + }; + let lock_guard = inner.lock().expect("poisoned"); + match &*lock_guard { + HandleInner::KeepingTimelineGateOpen { + 
timeline, + gate_guard, + } => { + let gate_guard = Arc::clone(gate_guard); + let timeline = Arc::clone(timeline); + drop(lock_guard); + Ok(Handle { + timeline, + gate_guard, + inner, + }) + } + HandleInner::ShutDown => Err(HandleUpgradeError::ShutDown), + } + } + + pub(crate) fn is_same_handle_as(&self, other: &WeakHandle) -> bool { + Weak::ptr_eq(&self.inner, &other.inner) + } +} + +impl std::ops::Deref for Handle { + type Target = T::Timeline; + fn deref(&self) -> &Self::Target { + &self.timeline + } +} + +impl Handle { + pub(crate) fn downgrade(&self) -> WeakHandle { + WeakHandle { + inner: Arc::downgrade(&self.inner), + } + } +} + impl PerTimelineState { /// After this method returns, [`Cache::get`] will never again return a [`Handle`] /// to the [`Types::Timeline`] that embeds this per-timeline state. @@ -430,43 +566,62 @@ impl PerTimelineState { trace!("already shut down"); return; }; - for handle in handles.values() { + for handle_inner_arc in handles.values() { // Make hits fail. - handle.shut_down.store(true, Ordering::Relaxed); + let mut lock_guard = handle_inner_arc.lock().expect("poisoned"); + lock_guard.shutdown(); } drop(handles); } } -impl std::ops::Deref for Handle { - type Target = T::Timeline; - fn deref(&self) -> &Self::Target { - &self.0.timeline - } -} - -#[cfg(test)] -impl Drop for HandleInner { - fn drop(&mut self) { - trace!("HandleInner dropped"); - } -} - // When dropping a [`Cache`], prune its handles in the [`PerTimelineState`] to break the reference cycle. 
impl Drop for Cache { fn drop(&mut self) { - for (_, weak) in self.map.drain() { - if let Some(strong) = weak.upgrade() { - // handle is still being kept alive in PerTimelineState - let timeline = strong.timeline.per_timeline_state(); - let mut handles = timeline.handles.lock().expect("mutex poisoned"); - if let Some(handles) = &mut *handles { - let Some(removed) = handles.remove(&self.id) else { - // There could have been a shutdown inbetween us upgrading the weak and locking the mutex. - continue; - }; - assert!(Arc::ptr_eq(&removed, &strong)); - } + for ( + _, + WeakHandle { + inner: handle_inner_weak, + }, + ) in self.map.drain() + { + let Some(handle_inner_arc) = handle_inner_weak.upgrade() else { + continue; + }; + let Some(handle_timeline) = handle_inner_arc + // locking rules: drop lock before acquiring other lock below + .lock() + .expect("poisoned") + .shutdown() + else { + // Concurrent PerTimelineState::shutdown. + continue; + }; + // Clean up per_timeline_state so the HandleInner allocation can be dropped. + let per_timeline_state = handle_timeline.per_timeline_state(); + let mut handles_lock_guard = per_timeline_state.handles.lock().expect("mutex poisoned"); + let Some(handles) = &mut *handles_lock_guard else { + continue; + }; + let Some(removed_handle_inner_arc) = handles.remove(&self.id) else { + // Concurrent PerTimelineState::shutdown. + continue; + }; + drop(handles_lock_guard); // locking rules! + assert!(Arc::ptr_eq(&removed_handle_inner_arc, &handle_inner_arc)); + } + } +} + +impl HandleInner { + fn shutdown(&mut self) -> Option> { + match std::mem::replace(self, HandleInner::ShutDown) { + HandleInner::KeepingTimelineGateOpen { timeline, .. } => Some(timeline), + HandleInner::ShutDown => { + // Duplicate shutdowns are possible because both Cache::drop and PerTimelineState::shutdown + // may do it concurrently, but locking rules disallow holding per-timeline-state lock and + // the handle lock at the same time. 
+ None } } } @@ -474,6 +629,8 @@ impl Drop for Cache { #[cfg(test)] mod tests { + use std::sync::Weak; + use pageserver_api::{ key::{rel_block_to_key, Key, DBDIR_KEY}, models::ShardParameters, @@ -583,39 +740,13 @@ mod tests { // // fill the cache // - assert_eq!( - (Arc::strong_count(&shard0), Arc::weak_count(&shard0)), - (2, 1), - "strong: shard0, mgr; weak: myself" - ); - let handle: Handle<_> = cache .get(timeline_id, ShardSelector::Page(key), &mgr) .await .expect("we have the timeline"); - let handle_inner_weak = Arc::downgrade(&handle.0); assert!(Weak::ptr_eq(&handle.myself, &shard0.myself)); - assert_eq!( - ( - Weak::strong_count(&handle_inner_weak), - Weak::weak_count(&handle_inner_weak) - ), - (2, 2), - "strong: handle, per_timeline_state, weak: handle_inner_weak, cache" - ); assert_eq!(cache.map.len(), 1); - - assert_eq!( - (Arc::strong_count(&shard0), Arc::weak_count(&shard0)), - (3, 1), - "strong: handleinner(per_timeline_state), shard0, mgr; weak: myself" - ); drop(handle); - assert_eq!( - (Arc::strong_count(&shard0), Arc::weak_count(&shard0)), - (3, 1), - "strong: handleinner(per_timeline_state), shard0, mgr; weak: myself" - ); // // demonstrate that Handle holds up gate closure @@ -640,21 +771,11 @@ mod tests { // SHUTDOWN shard0.per_timeline_state.shutdown(); // keeping handle alive across shutdown - assert_eq!( - 1, - Weak::strong_count(&handle_inner_weak), - "through local var handle" - ); assert_eq!( cache.map.len(), 1, "this is an implementation detail but worth pointing out: we can't clear the cache from shutdown(), it's cleared on first access after" ); - assert_eq!( - (Arc::strong_count(&shard0), Arc::weak_count(&shard0)), - (3, 1), - "strong: handleinner(via handle), shard0, mgr; weak: myself" - ); // this handle is perfectly usable handle.getpage(); @@ -678,16 +799,6 @@ mod tests { } drop(handle); - assert_eq!( - 0, - Weak::strong_count(&handle_inner_weak), - "the HandleInner destructor already ran" - ); - assert_eq!( - 
(Arc::strong_count(&shard0), Arc::weak_count(&shard0)), - (2, 1), - "strong: shard0, mgr; weak: myself" - ); // closing gate succeeds after dropping handle tokio::select! { @@ -706,10 +817,8 @@ mod tests { assert_eq!(cache.map.len(), 0); // ensure all refs to shard0 are gone and we're not leaking anything - let myself = Weak::clone(&shard0.myself); drop(shard0); drop(mgr); - assert_eq!(Weak::strong_count(&myself), 0); } #[tokio::test] @@ -948,15 +1057,11 @@ mod tests { handle }; handle.getpage(); - used_handles.push(Arc::downgrade(&handle.0)); + used_handles.push(Arc::downgrade(&handle.timeline)); } - // No handles exist, thus gates are closed and don't require shutdown - assert!(used_handles - .iter() - .all(|weak| Weak::strong_count(weak) == 0)); - - // ... thus the gate should close immediately, even without shutdown + // No handles exist, thus gates are closed and don't require shutdown. + // Thus the gate should close immediately, even without shutdown. tokio::select! { _ = shard0.gate.close() => { } _ = tokio::time::sleep(FOREVER) => { @@ -964,4 +1069,172 @@ mod tests { } } } + + #[tokio::test(start_paused = true)] + async fn test_weak_handles() { + crate::tenant::harness::setup_logging(); + let timeline_id = TimelineId::generate(); + let shard0 = Arc::new_cyclic(|myself| StubTimeline { + gate: Default::default(), + id: timeline_id, + shard: ShardIdentity::unsharded(), + per_timeline_state: PerTimelineState::default(), + myself: myself.clone(), + }); + let mgr = StubManager { + shards: vec![shard0.clone()], + }; + + let refcount_start = Arc::strong_count(&shard0); + + let key = DBDIR_KEY; + + let mut cache = Cache::::default(); + + let handle = cache + .get(timeline_id, ShardSelector::Page(key), &mgr) + .await + .expect("we have the timeline"); + assert!(Weak::ptr_eq(&handle.myself, &shard0.myself)); + + let weak_handle = handle.downgrade(); + + drop(handle); + + let upgraded_handle = weak_handle.upgrade().ok().expect("we can upgrade it"); + + // Start 
shutdown + shard0.per_timeline_state.shutdown(); + + // Upgrades during shutdown don't work, even if upgraded_handle exists. + weak_handle + .upgrade() + .err() + .expect("can't upgrade weak handle as soon as shutdown started"); + + // But upgraded_handle is still alive, so the gate won't close. + tokio::select! { + _ = shard0.gate.close() => { + panic!("handle is keeping gate open"); + } + _ = tokio::time::sleep(FOREVER) => { } + } + + // Drop the last handle. + drop(upgraded_handle); + + // The gate should close now, despite there still being a weak_handle. + tokio::select! { + _ = shard0.gate.close() => { } + _ = tokio::time::sleep(FOREVER) => { + panic!("only strong handle is dropped and we shut down per-timeline-state") + } + } + + // The weak handle still can't be upgraded. + weak_handle + .upgrade() + .err() + .expect("still shouldn't be able to upgrade the weak handle"); + + // There should be no strong references to the timeline object except the one on "stack". + assert_eq!(Arc::strong_count(&shard0), refcount_start); + } + + #[tokio::test(start_paused = true)] + async fn test_reference_cycle_broken_when_cache_is_dropped() { + crate::tenant::harness::setup_logging(); + let timeline_id = TimelineId::generate(); + let shard0 = Arc::new_cyclic(|myself| StubTimeline { + gate: Default::default(), + id: timeline_id, + shard: ShardIdentity::unsharded(), + per_timeline_state: PerTimelineState::default(), + myself: myself.clone(), + }); + let mgr = StubManager { + shards: vec![shard0.clone()], + }; + let key = DBDIR_KEY; + + let mut cache = Cache::::default(); + + // helper to check if a handle is referenced by per_timeline_state + let per_timeline_state_refs_handle = |handle_weak: &Weak>>| { + let per_timeline_state = shard0.per_timeline_state.handles.lock().unwrap(); + let per_timeline_state = per_timeline_state.as_ref().unwrap(); + per_timeline_state + .values() + .any(|v| Weak::ptr_eq(&Arc::downgrade(v), handle_weak)) + }; + + // Fill the cache. 
+ let handle = cache + .get(timeline_id, ShardSelector::Page(key), &mgr) + .await + .expect("we have the timeline"); + assert!(Weak::ptr_eq(&handle.myself, &shard0.myself)); + let handle_inner_weak = Arc::downgrade(&handle.inner); + assert!( + per_timeline_state_refs_handle(&handle_inner_weak), + "we still hold `handle` _and_ haven't dropped `cache` yet" + ); + + // Drop the cache. + drop(cache); + + assert!( + !(per_timeline_state_refs_handle(&handle_inner_weak)), + "nothing should reference the handle allocation anymore" + ); + assert!( + Weak::upgrade(&handle_inner_weak).is_some(), + "the local `handle` still keeps the allocation alive" + ); + // but obviously the cache is gone so no new allocations can be handed out. + + // Drop handle. + drop(handle); + assert!( + Weak::upgrade(&handle_inner_weak).is_none(), + "the local `handle` is dropped, so the allocation should be dropped by now" + ); + } + + #[tokio::test(start_paused = true)] + async fn test_reference_cycle_broken_when_per_timeline_state_shutdown() { + crate::tenant::harness::setup_logging(); + let timeline_id = TimelineId::generate(); + let shard0 = Arc::new_cyclic(|myself| StubTimeline { + gate: Default::default(), + id: timeline_id, + shard: ShardIdentity::unsharded(), + per_timeline_state: PerTimelineState::default(), + myself: myself.clone(), + }); + let mgr = StubManager { + shards: vec![shard0.clone()], + }; + let key = DBDIR_KEY; + + let mut cache = Cache::::default(); + let handle = cache + .get(timeline_id, ShardSelector::Page(key), &mgr) + .await + .expect("we have the timeline"); + // grab a weak reference to the inner so can later try to Weak::upgrade it and assert that fails + let handle_inner_weak = Arc::downgrade(&handle.inner); + + // drop the handle, obviously the lifetime of `inner` is at least as long as each strong reference to it + drop(handle); + assert!(Weak::upgrade(&handle_inner_weak).is_some(), "can still"); + + // Shutdown the per_timeline_state. 
+ shard0.per_timeline_state.shutdown(); + assert!(Weak::upgrade(&handle_inner_weak).is_none(), "can no longer"); + + // cache only contains Weak's, so, it can outlive the per_timeline_state without + // Drop explicitly solely to make this point. + drop(cache); + } } diff --git a/pageserver/src/tenant/timeline/import_pgdata.rs b/pageserver/src/tenant/timeline/import_pgdata.rs index de56468580..6940179ae9 100644 --- a/pageserver/src/tenant/timeline/import_pgdata.rs +++ b/pageserver/src/tenant/timeline/import_pgdata.rs @@ -113,7 +113,7 @@ pub async fn doit( match res { Ok(_) => break, Err(err) => { - info!(?err, "indefintely waiting for pgdata to finish"); + info!(?err, "indefinitely waiting for pgdata to finish"); if tokio::time::timeout(std::time::Duration::from_secs(10), cancel.cancelled()) .await .is_ok() diff --git a/pageserver/src/tenant/timeline/import_pgdata/importbucket_client.rs b/pageserver/src/tenant/timeline/import_pgdata/importbucket_client.rs index bc4d148a29..68937e535d 100644 --- a/pageserver/src/tenant/timeline/import_pgdata/importbucket_client.rs +++ b/pageserver/src/tenant/timeline/import_pgdata/importbucket_client.rs @@ -308,7 +308,7 @@ impl ControlFile { 202107181 => 14, 202209061 => 15, 202307071 => 16, - /* XXX pg17 */ + 202406281 => 17, catversion => { anyhow::bail!("unrecognized catalog version {catversion}") } diff --git a/pageserver/src/tenant/timeline/layer_manager.rs b/pageserver/src/tenant/timeline/layer_manager.rs index 3888e7f86a..cb7783d779 100644 --- a/pageserver/src/tenant/timeline/layer_manager.rs +++ b/pageserver/src/tenant/timeline/layer_manager.rs @@ -91,6 +91,7 @@ impl LayerManager { layer_map, layer_fmgr: LayerFileManager(hashmap), }) => { + // NB: no need to decrement layer metrics; metrics are removed on timeline shutdown. 
let open = layer_map.open_layer.take(); let frozen = layer_map.frozen_layers.len(); let taken_writer_state = writer_state.take(); @@ -234,6 +235,7 @@ impl OpenLayerManager { lsn: Lsn, last_freeze_at: &AtomicLsn, write_lock: &mut tokio::sync::MutexGuard<'_, Option>, + metrics: &TimelineMetrics, ) -> bool { let Lsn(last_record_lsn) = lsn; let end_lsn = Lsn(last_record_lsn + 1); @@ -242,6 +244,11 @@ impl OpenLayerManager { let open_layer_rc = Arc::clone(open_layer); open_layer.freeze(end_lsn).await; + // Increment the frozen layer metrics. This is decremented in `finish_flush_l0_layer()`. + // TODO: It would be nicer to do this via `InMemoryLayer::drop()`, but it requires a + // reference to the timeline metrics. Other methods use a metrics borrow as well. + metrics.inc_frozen_layer(open_layer); + // The layer is no longer open, update the layer map to reflect this. // We will replace it with on-disk historics below. self.layer_map.frozen_layers.push_back(open_layer_rc); @@ -298,6 +305,7 @@ impl OpenLayerManager { .frozen_layers .pop_front() .expect("there must be a inmem layer to flush"); + metrics.dec_frozen_layer(&inmem); // Only one task may call this function at a time (for this // timeline). If two tasks tried to flush the same frozen @@ -337,16 +345,45 @@ impl OpenLayerManager { compact_to: &[ResidentLayer], metrics: &TimelineMetrics, ) { - // We can simply reuse compact l0 logic. Use a different function name to indicate a different type of layer map modification. - self.finish_compact_l0(compact_from, compact_to, metrics) + // gc-compaction could contain layer rewrites. We need to delete the old layers and insert the new ones. 
+ + // Match the old layers with the new layers + let mut add_layers = HashMap::new(); + let mut rewrite_layers = HashMap::new(); + let mut drop_layers = HashMap::new(); + for layer in compact_from { + drop_layers.insert(layer.layer_desc().key(), layer.clone()); + } + for layer in compact_to { + if let Some(old_layer) = drop_layers.remove(&layer.layer_desc().key()) { + rewrite_layers.insert(layer.layer_desc().key(), (old_layer.clone(), layer.clone())); + } else { + add_layers.insert(layer.layer_desc().key(), layer.clone()); + } + } + let add_layers = add_layers.values().cloned().collect::>(); + let drop_layers = drop_layers.values().cloned().collect::>(); + let rewrite_layers = rewrite_layers.values().cloned().collect::>(); + + self.rewrite_layers_inner(&rewrite_layers, &drop_layers, &add_layers, metrics); } /// Called post-compaction when some previous generation image layers were trimmed. - pub(crate) fn rewrite_layers( + pub fn rewrite_layers( &mut self, rewrite_layers: &[(Layer, ResidentLayer)], drop_layers: &[Layer], metrics: &TimelineMetrics, + ) { + self.rewrite_layers_inner(rewrite_layers, drop_layers, &[], metrics); + } + + fn rewrite_layers_inner( + &mut self, + rewrite_layers: &[(Layer, ResidentLayer)], + drop_layers: &[Layer], + add_layers: &[ResidentLayer], + metrics: &TimelineMetrics, ) { let mut updates = self.layer_map.batch_update(); for (old_layer, new_layer) in rewrite_layers { @@ -382,6 +419,10 @@ impl OpenLayerManager { for l in drop_layers { Self::delete_historic_layer(l, &mut updates, &mut self.layer_fmgr); } + for l in add_layers { + Self::insert_historic_layer(l.as_ref().clone(), &mut updates, &mut self.layer_fmgr); + metrics.record_new_file_metrics(l.layer_desc().file_size); + } updates.flush(); } diff --git a/pageserver/src/tenant/timeline/offload.rs b/pageserver/src/tenant/timeline/offload.rs index 6c6b19e8b1..3b5bf8290c 100644 --- a/pageserver/src/tenant/timeline/offload.rs +++ b/pageserver/src/tenant/timeline/offload.rs @@ -2,10 +2,11 
@@ use std::sync::Arc; use pageserver_api::models::{TenantState, TimelineState}; -use super::delete::{delete_local_timeline_directory, DeleteTimelineFlow, DeletionGuard}; +use super::delete::{delete_local_timeline_directory, DeletionGuard}; use super::Timeline; use crate::span::debug_assert_current_span_has_tenant_and_timeline_id; use crate::tenant::remote_timeline_client::ShutdownIfArchivedError; +use crate::tenant::timeline::delete::{make_timeline_delete_guard, TimelineDeleteGuardKind}; use crate::tenant::{OffloadedTimeline, Tenant, TenantManifestError, TimelineOrOffloaded}; #[derive(thiserror::Error, Debug)] @@ -36,13 +37,10 @@ pub(crate) async fn offload_timeline( debug_assert_current_span_has_tenant_and_timeline_id(); tracing::info!("offloading archived timeline"); - let allow_offloaded_children = true; - let set_stopping = false; - let (timeline, guard) = DeleteTimelineFlow::prepare( + let (timeline, guard) = make_timeline_delete_guard( tenant, timeline.timeline_id, - allow_offloaded_children, - set_stopping, + TimelineDeleteGuardKind::Offload, ) .map_err(|e| OffloadError::Other(anyhow::anyhow!(e)))?; @@ -106,7 +104,7 @@ pub(crate) async fn offload_timeline( } /// It is important that this gets called when DeletionGuard is being held. 
-/// For more context see comments in [`DeleteTimelineFlow::prepare`] +/// For more context see comments in [`make_timeline_delete_guard`] /// /// Returns the strong count of the timeline `Arc` fn remove_timeline_from_tenant( diff --git a/pageserver/src/tenant/timeline/walreceiver/connection_manager.rs b/pageserver/src/tenant/timeline/walreceiver/connection_manager.rs index 583d6309ab..65f9d39078 100644 --- a/pageserver/src/tenant/timeline/walreceiver/connection_manager.rs +++ b/pageserver/src/tenant/timeline/walreceiver/connection_manager.rs @@ -164,9 +164,10 @@ pub(super) async fn connection_manager_loop_step( Ok(Some(broker_update)) => connection_manager_state.register_timeline_update(broker_update), Err(status) => { match status.code() { - Code::Unknown if status.message().contains("stream closed because of a broken pipe") || status.message().contains("connection reset") => { + Code::Unknown if status.message().contains("stream closed because of a broken pipe") || status.message().contains("connection reset") || status.message().contains("error reading a body from connection") => { // tonic's error handling doesn't provide a clear code for disconnections: we get // "h2 protocol error: error reading a body from connection: stream closed because of a broken pipe" + // => https://github.com/neondatabase/neon/issues/9562 info!("broker disconnected: {status}"); }, _ => { @@ -273,7 +274,7 @@ pub(super) async fn connection_manager_loop_step( }; last_discovery_ts = Some(std::time::Instant::now()); - debug!("No active connection and no candidates, sending discovery request to the broker"); + info!("No active connection and no candidates, sending discovery request to the broker"); // Cancellation safety: we want to send a message to the broker, but publish_one() // function can get cancelled by the other select! arm. 
This is absolutely fine, because diff --git a/pageserver/src/tenant/timeline/walreceiver/walreceiver_connection.rs b/pageserver/src/tenant/timeline/walreceiver/walreceiver_connection.rs index 129b987e57..d69e7dbd32 100644 --- a/pageserver/src/tenant/timeline/walreceiver/walreceiver_connection.rs +++ b/pageserver/src/tenant/timeline/walreceiver/walreceiver_connection.rs @@ -118,7 +118,7 @@ pub(super) async fn handle_walreceiver_connection( cancellation: CancellationToken, connect_timeout: Duration, ctx: RequestContext, - node: NodeId, + safekeeper_node: NodeId, ingest_batch_size: u64, ) -> Result<(), WalReceiverError> { debug_assert_current_span_has_tenant_and_timeline_id(); @@ -140,7 +140,7 @@ pub(super) async fn handle_walreceiver_connection( let (replication_client, connection) = { let mut config = wal_source_connconf.to_tokio_postgres_config(); - config.application_name("pageserver"); + config.application_name(format!("pageserver-{}", timeline.conf.id.0).as_str()); config.replication_mode(tokio_postgres::config::ReplicationMode::Physical); match time::timeout(connect_timeout, config.connect(postgres::NoTls)).await { Ok(client_and_conn) => client_and_conn?, @@ -162,7 +162,7 @@ pub(super) async fn handle_walreceiver_connection( latest_wal_update: Utc::now().naive_utc(), streaming_lsn: None, commit_lsn: None, - node, + node: safekeeper_node, }; if let Err(e) = events_sender.send(TaskStateUpdate::Progress(connection_status)) { warn!("Wal connection event listener dropped right after connection init, aborting the connection: {e}"); diff --git a/pageserver/src/tenant/vectored_blob_io.rs b/pageserver/src/tenant/vectored_blob_io.rs index dfe2352310..47fb4a276b 100644 --- a/pageserver/src/tenant/vectored_blob_io.rs +++ b/pageserver/src/tenant/vectored_blob_io.rs @@ -35,6 +35,7 @@ use crate::virtual_file::{self, VirtualFile}; pub struct BlobMeta { pub key: Key, pub lsn: Lsn, + pub will_init: bool, } /// A view into the vectored blobs read buffer. 
@@ -310,7 +311,15 @@ pub enum BlobFlag { /// * Iterate over the collected blobs and coalesce them into reads at the end pub struct VectoredReadPlanner { // Track all the blob offsets. Start offsets must be ordered. - blobs: BTreeMap>, + // Values in the value tuples are: + // ( + // lsn of the blob, + // start offset of the blob in the underlying file, + // end offset of the blob in the underlying file, + // whether the blob initializes the page image or not + // see [`pageserver_api::record::NeonWalRecord::will_init`] + // ) + blobs: BTreeMap>, // Arguments for previous blob passed into [`VectoredReadPlanner::handle`] prev: Option<(Key, Lsn, u64, BlobFlag)>, @@ -371,12 +380,12 @@ impl VectoredReadPlanner { match flag { BlobFlag::None => { let blobs_for_key = self.blobs.entry(key).or_default(); - blobs_for_key.push((lsn, start_offset, end_offset)); + blobs_for_key.push((lsn, start_offset, end_offset, false)); } BlobFlag::ReplaceAll => { let blobs_for_key = self.blobs.entry(key).or_default(); blobs_for_key.clear(); - blobs_for_key.push((lsn, start_offset, end_offset)); + blobs_for_key.push((lsn, start_offset, end_offset, true)); } BlobFlag::Ignore => {} } @@ -387,11 +396,17 @@ impl VectoredReadPlanner { let mut reads = Vec::new(); for (key, blobs_for_key) in self.blobs { - for (lsn, start_offset, end_offset) in blobs_for_key { + for (lsn, start_offset, end_offset, will_init) in blobs_for_key { let extended = match &mut current_read_builder { - Some(read_builder) => { - read_builder.extend(start_offset, end_offset, BlobMeta { key, lsn }) - } + Some(read_builder) => read_builder.extend( + start_offset, + end_offset, + BlobMeta { + key, + lsn, + will_init, + }, + ), None => VectoredReadExtended::No, }; @@ -399,7 +414,11 @@ impl VectoredReadPlanner { let next_read_builder = ChunkedVectoredReadBuilder::new( start_offset, end_offset, - BlobMeta { key, lsn }, + BlobMeta { + key, + lsn, + will_init, + }, self.max_read_size, ); @@ -527,7 +546,7 @@ impl<'a> 
VectoredBlobReader<'a> { pub struct StreamingVectoredReadPlanner { read_builder: Option, // Arguments for previous blob passed into [`StreamingVectoredReadPlanner::handle`] - prev: Option<(Key, Lsn, u64)>, + prev: Option<(Key, Lsn, u64, bool)>, /// Max read size per batch. This is not a strict limit. If there are [0, 100) and [100, 200), while the `max_read_size` is 150, /// we will produce a single batch instead of split them. max_read_size: u64, @@ -550,27 +569,47 @@ impl StreamingVectoredReadPlanner { } } - pub fn handle(&mut self, key: Key, lsn: Lsn, offset: u64) -> Option { + pub fn handle( + &mut self, + key: Key, + lsn: Lsn, + offset: u64, + will_init: bool, + ) -> Option { // Implementation note: internally lag behind by one blob such that // we have a start and end offset when initialising [`VectoredRead`] - let (prev_key, prev_lsn, prev_offset) = match self.prev { + let (prev_key, prev_lsn, prev_offset, prev_will_init) = match self.prev { None => { - self.prev = Some((key, lsn, offset)); + self.prev = Some((key, lsn, offset, will_init)); return None; } Some(prev) => prev, }; - let res = self.add_blob(prev_key, prev_lsn, prev_offset, offset, false); + let res = self.add_blob( + prev_key, + prev_lsn, + prev_offset, + offset, + false, + prev_will_init, + ); - self.prev = Some((key, lsn, offset)); + self.prev = Some((key, lsn, offset, will_init)); res } pub fn handle_range_end(&mut self, offset: u64) -> Option { - let res = if let Some((prev_key, prev_lsn, prev_offset)) = self.prev { - self.add_blob(prev_key, prev_lsn, prev_offset, offset, true) + let res = if let Some((prev_key, prev_lsn, prev_offset, prev_will_init)) = self.prev { + self.add_blob( + prev_key, + prev_lsn, + prev_offset, + offset, + true, + prev_will_init, + ) } else { None }; @@ -587,10 +626,19 @@ impl StreamingVectoredReadPlanner { start_offset: u64, end_offset: u64, is_last_blob_in_read: bool, + will_init: bool, ) -> Option { match &mut self.read_builder { Some(read_builder) => { - let 
extended = read_builder.extend(start_offset, end_offset, BlobMeta { key, lsn }); + let extended = read_builder.extend( + start_offset, + end_offset, + BlobMeta { + key, + lsn, + will_init, + }, + ); assert_eq!(extended, VectoredReadExtended::Yes); } None => { @@ -598,7 +646,11 @@ impl StreamingVectoredReadPlanner { Some(ChunkedVectoredReadBuilder::new_streaming( start_offset, end_offset, - BlobMeta { key, lsn }, + BlobMeta { + key, + lsn, + will_init, + }, )) }; } @@ -812,7 +864,7 @@ mod tests { let mut planner = StreamingVectoredReadPlanner::new(max_read_size, 1000); let mut reads = Vec::new(); for (key, lsn, offset, _) in blob_descriptions.clone() { - reads.extend(planner.handle(key, lsn, offset)); + reads.extend(planner.handle(key, lsn, offset, false)); } reads.extend(planner.handle_range_end(652 * 1024)); @@ -850,7 +902,7 @@ mod tests { let mut planner = StreamingVectoredReadPlanner::new(max_read_size, 2); let mut reads = Vec::new(); for (key, lsn, offset, _) in blob_descriptions.clone() { - reads.extend(planner.handle(key, lsn, offset)); + reads.extend(planner.handle(key, lsn, offset, false)); } reads.extend(planner.handle_range_end(652 * 1024)); @@ -875,7 +927,7 @@ mod tests { { let mut planner = StreamingVectoredReadPlanner::new(max_read_size, 1); let mut reads = Vec::new(); - reads.extend(planner.handle(key, lsn, 0)); + reads.extend(planner.handle(key, lsn, 0, false)); reads.extend(planner.handle_range_end(652 * 1024)); assert_eq!(reads.len(), 1); validate_read(&reads[0], &[(key, lsn, 0, BlobFlag::None)]); @@ -883,8 +935,8 @@ mod tests { { let mut planner = StreamingVectoredReadPlanner::new(max_read_size, 1); let mut reads = Vec::new(); - reads.extend(planner.handle(key, lsn, 0)); - reads.extend(planner.handle(key, lsn, 128 * 1024)); + reads.extend(planner.handle(key, lsn, 0, false)); + reads.extend(planner.handle(key, lsn, 128 * 1024, false)); reads.extend(planner.handle_range_end(652 * 1024)); assert_eq!(reads.len(), 2); validate_read(&reads[0], &[(key, 
lsn, 0, BlobFlag::None)]); @@ -893,8 +945,8 @@ mod tests { { let mut planner = StreamingVectoredReadPlanner::new(max_read_size, 2); let mut reads = Vec::new(); - reads.extend(planner.handle(key, lsn, 0)); - reads.extend(planner.handle(key, lsn, 128 * 1024)); + reads.extend(planner.handle(key, lsn, 0, false)); + reads.extend(planner.handle(key, lsn, 128 * 1024, false)); reads.extend(planner.handle_range_end(652 * 1024)); assert_eq!(reads.len(), 1); validate_read( @@ -923,6 +975,7 @@ mod tests { let meta = BlobMeta { key: Key::MIN, lsn: Lsn(0), + will_init: false, }; for (idx, (blob, offset)) in blobs.iter().zip(offsets.iter()).enumerate() { diff --git a/pageserver/src/utilization.rs b/pageserver/src/utilization.rs index a0223f3bce..093a944777 100644 --- a/pageserver/src/utilization.rs +++ b/pageserver/src/utilization.rs @@ -49,7 +49,7 @@ pub(crate) fn regenerate( }; // Express a static value for how many shards we may schedule on one node - const MAX_SHARDS: u32 = 20000; + const MAX_SHARDS: u32 = 5000; let mut doc = PageserverUtilization { disk_usage_bytes: used, diff --git a/pageserver/src/walingest.rs b/pageserver/src/walingest.rs index ad7bcc0714..e0283d99e0 100644 --- a/pageserver/src/walingest.rs +++ b/pageserver/src/walingest.rs @@ -499,7 +499,13 @@ impl WalIngest { let content = modification .tline - .get_rel_page_at_lsn(src_rel, blknum, Version::Modified(modification), ctx) + .get_rel_page_at_lsn( + src_rel, + blknum, + Version::Modified(modification), + ctx, + crate::tenant::storage_layer::IoConcurrency::sequential(), + ) .await?; modification.put_rel_page_image(dst_rel, blknum, content)?; num_blocks_copied += 1; @@ -1489,6 +1495,7 @@ mod tests { use super::*; use crate::tenant::harness::*; use crate::tenant::remote_timeline_client::{remote_initdb_archive_path, INITDB_PATH}; + use crate::tenant::storage_layer::IoConcurrency; use postgres_ffi::RELSEG_SIZE; use crate::DEFAULT_PG_VERSION; @@ -1532,6 +1539,7 @@ mod tests { #[tokio::test] async fn test_relsize() 
-> Result<()> { let (tenant, ctx) = TenantHarness::create("test_relsize").await?.load().await; + let io_concurrency = IoConcurrency::spawn_for_test(); let tline = tenant .create_test_timeline(TIMELINE_ID, Lsn(8), DEFAULT_PG_VERSION, &ctx) .await?; @@ -1599,7 +1607,13 @@ mod tests { // Check page contents at each LSN assert_eq!( tline - .get_rel_page_at_lsn(TESTREL_A, 0, Version::Lsn(Lsn(0x20)), &ctx) + .get_rel_page_at_lsn( + TESTREL_A, + 0, + Version::Lsn(Lsn(0x20)), + &ctx, + io_concurrency.clone() + ) .instrument(test_span.clone()) .await?, test_img("foo blk 0 at 2") @@ -1607,7 +1621,13 @@ mod tests { assert_eq!( tline - .get_rel_page_at_lsn(TESTREL_A, 0, Version::Lsn(Lsn(0x30)), &ctx) + .get_rel_page_at_lsn( + TESTREL_A, + 0, + Version::Lsn(Lsn(0x30)), + &ctx, + io_concurrency.clone() + ) .instrument(test_span.clone()) .await?, test_img("foo blk 0 at 3") @@ -1615,14 +1635,26 @@ mod tests { assert_eq!( tline - .get_rel_page_at_lsn(TESTREL_A, 0, Version::Lsn(Lsn(0x40)), &ctx) + .get_rel_page_at_lsn( + TESTREL_A, + 0, + Version::Lsn(Lsn(0x40)), + &ctx, + io_concurrency.clone() + ) .instrument(test_span.clone()) .await?, test_img("foo blk 0 at 3") ); assert_eq!( tline - .get_rel_page_at_lsn(TESTREL_A, 1, Version::Lsn(Lsn(0x40)), &ctx) + .get_rel_page_at_lsn( + TESTREL_A, + 1, + Version::Lsn(Lsn(0x40)), + &ctx, + io_concurrency.clone() + ) .instrument(test_span.clone()) .await?, test_img("foo blk 1 at 4") @@ -1630,21 +1662,39 @@ mod tests { assert_eq!( tline - .get_rel_page_at_lsn(TESTREL_A, 0, Version::Lsn(Lsn(0x50)), &ctx) + .get_rel_page_at_lsn( + TESTREL_A, + 0, + Version::Lsn(Lsn(0x50)), + &ctx, + io_concurrency.clone() + ) .instrument(test_span.clone()) .await?, test_img("foo blk 0 at 3") ); assert_eq!( tline - .get_rel_page_at_lsn(TESTREL_A, 1, Version::Lsn(Lsn(0x50)), &ctx) + .get_rel_page_at_lsn( + TESTREL_A, + 1, + Version::Lsn(Lsn(0x50)), + &ctx, + io_concurrency.clone() + ) .instrument(test_span.clone()) .await?, test_img("foo blk 1 at 4") ); assert_eq!( 
tline - .get_rel_page_at_lsn(TESTREL_A, 2, Version::Lsn(Lsn(0x50)), &ctx) + .get_rel_page_at_lsn( + TESTREL_A, + 2, + Version::Lsn(Lsn(0x50)), + &ctx, + io_concurrency.clone() + ) .instrument(test_span.clone()) .await?, test_img("foo blk 2 at 5") @@ -1667,14 +1717,26 @@ mod tests { ); assert_eq!( tline - .get_rel_page_at_lsn(TESTREL_A, 0, Version::Lsn(Lsn(0x60)), &ctx) + .get_rel_page_at_lsn( + TESTREL_A, + 0, + Version::Lsn(Lsn(0x60)), + &ctx, + io_concurrency.clone() + ) .instrument(test_span.clone()) .await?, test_img("foo blk 0 at 3") ); assert_eq!( tline - .get_rel_page_at_lsn(TESTREL_A, 1, Version::Lsn(Lsn(0x60)), &ctx) + .get_rel_page_at_lsn( + TESTREL_A, + 1, + Version::Lsn(Lsn(0x60)), + &ctx, + io_concurrency.clone() + ) .instrument(test_span.clone()) .await?, test_img("foo blk 1 at 4") @@ -1689,7 +1751,13 @@ mod tests { ); assert_eq!( tline - .get_rel_page_at_lsn(TESTREL_A, 2, Version::Lsn(Lsn(0x50)), &ctx) + .get_rel_page_at_lsn( + TESTREL_A, + 2, + Version::Lsn(Lsn(0x50)), + &ctx, + io_concurrency.clone() + ) .instrument(test_span.clone()) .await?, test_img("foo blk 2 at 5") @@ -1722,14 +1790,26 @@ mod tests { ); assert_eq!( tline - .get_rel_page_at_lsn(TESTREL_A, 0, Version::Lsn(Lsn(0x70)), &ctx) + .get_rel_page_at_lsn( + TESTREL_A, + 0, + Version::Lsn(Lsn(0x70)), + &ctx, + io_concurrency.clone() + ) .instrument(test_span.clone()) .await?, ZERO_PAGE ); assert_eq!( tline - .get_rel_page_at_lsn(TESTREL_A, 1, Version::Lsn(Lsn(0x70)), &ctx) + .get_rel_page_at_lsn( + TESTREL_A, + 1, + Version::Lsn(Lsn(0x70)), + &ctx, + io_concurrency.clone() + ) .instrument(test_span.clone()) .await?, test_img("foo blk 1") @@ -1750,7 +1830,13 @@ mod tests { for blk in 2..1500 { assert_eq!( tline - .get_rel_page_at_lsn(TESTREL_A, blk, Version::Lsn(Lsn(0x80)), &ctx) + .get_rel_page_at_lsn( + TESTREL_A, + blk, + Version::Lsn(Lsn(0x80)), + &ctx, + io_concurrency.clone() + ) .instrument(test_span.clone()) .await?, ZERO_PAGE @@ -1758,7 +1844,13 @@ mod tests { } assert_eq!( tline 
- .get_rel_page_at_lsn(TESTREL_A, 1500, Version::Lsn(Lsn(0x80)), &ctx) + .get_rel_page_at_lsn( + TESTREL_A, + 1500, + Version::Lsn(Lsn(0x80)), + &ctx, + io_concurrency.clone() + ) .instrument(test_span.clone()) .await?, test_img("foo blk 1500") @@ -1851,6 +1943,7 @@ mod tests { .await? .load() .await; + let io_concurrency = IoConcurrency::spawn_for_test(); let tline = tenant .create_test_timeline(TIMELINE_ID, Lsn(8), DEFAULT_PG_VERSION, &ctx) .await?; @@ -1903,7 +1996,13 @@ mod tests { let data = format!("foo blk {} at {}", blkno, lsn); assert_eq!( tline - .get_rel_page_at_lsn(TESTREL_A, blkno, Version::Lsn(lsn), &ctx) + .get_rel_page_at_lsn( + TESTREL_A, + blkno, + Version::Lsn(lsn), + &ctx, + io_concurrency.clone() + ) .instrument(test_span.clone()) .await?, test_img(&data) @@ -1931,7 +2030,13 @@ mod tests { let data = format!("foo blk {} at {}", blkno, lsn); assert_eq!( tline - .get_rel_page_at_lsn(TESTREL_A, blkno, Version::Lsn(Lsn(0x60)), &ctx) + .get_rel_page_at_lsn( + TESTREL_A, + blkno, + Version::Lsn(Lsn(0x60)), + &ctx, + io_concurrency.clone() + ) .instrument(test_span.clone()) .await?, test_img(&data) @@ -1950,7 +2055,13 @@ mod tests { let data = format!("foo blk {} at {}", blkno, lsn); assert_eq!( tline - .get_rel_page_at_lsn(TESTREL_A, blkno, Version::Lsn(Lsn(0x50)), &ctx) + .get_rel_page_at_lsn( + TESTREL_A, + blkno, + Version::Lsn(Lsn(0x50)), + &ctx, + io_concurrency.clone() + ) .instrument(test_span.clone()) .await?, test_img(&data) @@ -1987,7 +2098,13 @@ mod tests { let data = format!("foo blk {} at {}", blkno, lsn); assert_eq!( tline - .get_rel_page_at_lsn(TESTREL_A, blkno, Version::Lsn(Lsn(0x80)), &ctx) + .get_rel_page_at_lsn( + TESTREL_A, + blkno, + Version::Lsn(Lsn(0x80)), + &ctx, + io_concurrency.clone() + ) .instrument(test_span.clone()) .await?, test_img(&data) diff --git a/pgxn/neon/file_cache.c b/pgxn/neon/file_cache.c index ad5667cbab..08b7652175 100644 --- a/pgxn/neon/file_cache.c +++ b/pgxn/neon/file_cache.c @@ -480,7 +480,7 @@ 
lfc_cache_contains(NRelFileInfo rinfo, ForkNumber forkNum, BlockNumber blkno) if (LFC_ENABLED()) { entry = hash_search_with_hash_value(lfc_hash, &tag, hash, HASH_FIND, NULL); - found = entry != NULL && (entry->bitmap[chunk_offs >> 5] & (1 << (chunk_offs & 31))) != 0; + found = entry != NULL && (entry->bitmap[chunk_offs >> 5] & ((uint32)1 << (chunk_offs & 31))) != 0; } LWLockRelease(lfc_lock); return found; @@ -527,7 +527,7 @@ lfc_cache_containsv(NRelFileInfo rinfo, ForkNumber forkNum, BlockNumber blkno, for (; chunk_offs < BLOCKS_PER_CHUNK && i < nblocks; chunk_offs++, i++) { if ((entry->bitmap[chunk_offs >> 5] & - (1 << (chunk_offs & 31))) != 0) + ((uint32)1 << (chunk_offs & 31))) != 0) { BITMAP_SET(bitmap, i); found++; @@ -620,7 +620,7 @@ lfc_evict(NRelFileInfo rinfo, ForkNumber forkNum, BlockNumber blkno) } /* remove the page from the cache */ - entry->bitmap[chunk_offs >> 5] &= ~(1 << (chunk_offs & (32 - 1))); + entry->bitmap[chunk_offs >> 5] &= ~((uint32)1 << (chunk_offs & (32 - 1))); if (entry->access_count == 0) { @@ -774,7 +774,7 @@ lfc_readv_select(NRelFileInfo rinfo, ForkNumber forkNum, BlockNumber blkno, * If the page is valid, we consider it "read". * All other pages will be fetched separately by the next cache */ - if (entry->bitmap[(chunk_offs + i) / 32] & (1 << ((chunk_offs + i) % 32))) + if (entry->bitmap[(chunk_offs + i) / 32] & ((uint32)1 << ((chunk_offs + i) % 32))) { BITMAP_SET(mask, buf_offset + i); iteration_hits++; @@ -911,57 +911,85 @@ lfc_writev(NRelFileInfo rinfo, ForkNumber forkNum, BlockNumber blkno, if (entry->access_count++ == 0) dlist_delete(&entry->list_node); } - else + /*----------- + * If the chunk wasn't already in the LFC then we have these + * options, in order of preference: + * + * Unless there is no space available, we can: + * 1. Use an entry from the `holes` list, and + * 2. Create a new entry. + * We can always, regardless of space in the LFC: + * 3. evict an entry from LRU, and + * 4. 
ignore the write operation (the least favorite option) + */ + else if (lfc_ctl->used < lfc_ctl->limit) { - /* - * We have two choices if all cache pages are pinned (i.e. used in IO - * operations): - * - * 1) Wait until some of this operation is completed and pages is - * unpinned. - * - * 2) Allocate one more chunk, so that specified cache size is more - * recommendation than hard limit. - * - * As far as probability of such event (that all pages are pinned) is - * considered to be very very small: there are should be very large - * number of concurrent IO operations and them are limited by - * max_connections, we prefer not to complicate code and use second - * approach. - */ - if (lfc_ctl->used >= lfc_ctl->limit && !dlist_is_empty(&lfc_ctl->lru)) - { - /* Cache overflow: evict least recently used chunk */ - FileCacheEntry *victim = dlist_container(FileCacheEntry, list_node, dlist_pop_head_node(&lfc_ctl->lru)); - - for (int i = 0; i < BLOCKS_PER_CHUNK; i++) - { - lfc_ctl->used_pages -= (victim->bitmap[i >> 5] >> (i & 31)) & 1; - } - CriticalAssert(victim->access_count == 0); - entry->offset = victim->offset; /* grab victim's chunk */ - hash_search_with_hash_value(lfc_hash, &victim->key, victim->hash, HASH_REMOVE, NULL); - neon_log(DEBUG2, "Swap file cache page"); - } - else if (!dlist_is_empty(&lfc_ctl->holes)) + if (!dlist_is_empty(&lfc_ctl->holes)) { /* We can reuse a hole that was left behind when the LFC was shrunk previously */ - FileCacheEntry *hole = dlist_container(FileCacheEntry, list_node, dlist_pop_head_node(&lfc_ctl->holes)); - uint32 offset = hole->offset; - bool hole_found; - - hash_search_with_hash_value(lfc_hash, &hole->key, hole->hash, HASH_REMOVE, &hole_found); + FileCacheEntry *hole = dlist_container(FileCacheEntry, list_node, + dlist_pop_head_node(&lfc_ctl->holes)); + uint32 offset = hole->offset; + bool hole_found; + + hash_search_with_hash_value(lfc_hash, &hole->key, + hole->hash, HASH_REMOVE, &hole_found); CriticalAssert(hole_found); - + 
lfc_ctl->used += 1; - entry->offset = offset; /* reuse the hole */ + entry->offset = offset; /* reuse the hole */ } else { lfc_ctl->used += 1; - entry->offset = lfc_ctl->size++; /* allocate new chunk at end - * of file */ + entry->offset = lfc_ctl->size++;/* allocate new chunk at end + * of file */ } + } + /* + * We've already used up all allocated LFC entries. + * + * If we can clear an entry from the LRU, do that. + * If we can't (e.g. because all other slots are being accessed) + * then we will remove this entry from the hash and continue + * on to the next chunk, as we may not exceed the limit. + */ + else if (!dlist_is_empty(&lfc_ctl->lru)) + { + /* Cache overflow: evict least recently used chunk */ + FileCacheEntry *victim = dlist_container(FileCacheEntry, list_node, + dlist_pop_head_node(&lfc_ctl->lru)); + + for (int i = 0; i < BLOCKS_PER_CHUNK; i++) + { + lfc_ctl->used_pages -= (victim->bitmap[i >> 5] >> (i & 31)) & 1; + } + + CriticalAssert(victim->access_count == 0); + entry->offset = victim->offset; /* grab victim's chunk */ + hash_search_with_hash_value(lfc_hash, &victim->key, + victim->hash, HASH_REMOVE, NULL); + neon_log(DEBUG2, "Swap file cache page"); + } + else + { + /* Can't add this chunk - we don't have the space for it */ + hash_search_with_hash_value(lfc_hash, &entry->key, hash, + HASH_REMOVE, NULL); + + /* + * We can't process this chunk due to lack of space in LFC, + * so skip to the next one + */ + LWLockRelease(lfc_lock); + blkno += blocks_in_chunk; + buf_offset += blocks_in_chunk; + nblocks -= blocks_in_chunk; + continue; + } + + if (!found) + { entry->access_count = 1; entry->hash = hash; memset(entry->bitmap, 0, sizeof entry->bitmap); @@ -1006,7 +1034,7 @@ lfc_writev(NRelFileInfo rinfo, ForkNumber forkNum, BlockNumber blkno, { lfc_ctl->used_pages += 1 - ((entry->bitmap[(chunk_offs + i) >> 5] >> ((chunk_offs + i) & 31)) & 1); entry->bitmap[(chunk_offs + i) >> 5] |= - (1 << ((chunk_offs + i) & 31)); + ((uint32)1 << ((chunk_offs + i) & 
31)); } } @@ -1254,7 +1282,7 @@ local_cache_pages(PG_FUNCTION_ARGS) { for (int i = 0; i < BLOCKS_PER_CHUNK; i++) { - if (entry->bitmap[i >> 5] & (1 << (i & 31))) + if (entry->bitmap[i >> 5] & ((uint32)1 << (i & 31))) { fctx->record[n].pageoffs = entry->offset * BLOCKS_PER_CHUNK + i; fctx->record[n].relfilenode = NInfoGetRelNumber(BufTagGetNRelFileInfo(entry->key)); diff --git a/pgxn/neon/neon.c b/pgxn/neon/neon.c index ff08f9164d..ce2938cfd5 100644 --- a/pgxn/neon/neon.c +++ b/pgxn/neon/neon.c @@ -19,6 +19,7 @@ #include "access/xlogrecovery.h" #endif #include "replication/logical.h" +#include "replication/logicallauncher.h" #include "replication/slot.h" #include "replication/walsender.h" #include "storage/proc.h" @@ -434,6 +435,15 @@ _PG_init(void) restore_running_xacts_callback = RestoreRunningXactsFromClog; + DefineCustomBoolVariable( + "neon.disable_logical_replication_subscribers", + "Disables incomming logical replication", + NULL, + &disable_logical_replication_subscribers, + false, + PGC_SIGHUP, + 0, + NULL, NULL, NULL); DefineCustomBoolVariable( "neon.allow_replica_misconfig", diff --git a/pgxn/neon/pagestore_client.h b/pgxn/neon/pagestore_client.h index b751235595..7b748d7252 100644 --- a/pgxn/neon/pagestore_client.h +++ b/pgxn/neon/pagestore_client.h @@ -34,6 +34,8 @@ typedef enum T_NeonGetPageRequest, T_NeonDbSizeRequest, T_NeonGetSlruSegmentRequest, + /* future tags above this line */ + T_NeonTestRequest = 99, /* only in cfg(feature = "testing") */ /* pagestore -> pagestore_client */ T_NeonExistsResponse = 100, @@ -42,6 +44,8 @@ typedef enum T_NeonErrorResponse, T_NeonDbSizeResponse, T_NeonGetSlruSegmentResponse, + /* future tags above this line */ + T_NeonTestResponse = 199, /* only in cfg(feature = "testing") */ } NeonMessageTag; typedef uint64 NeonRequestId; diff --git a/pgxn/neon/walproposer.c b/pgxn/neon/walproposer.c index e89ffdb628..7472fd6afc 100644 --- a/pgxn/neon/walproposer.c +++ b/pgxn/neon/walproposer.c @@ -1024,7 +1024,8 @@ 
DetermineEpochStartLsn(WalProposer *wp) dth = &wp->safekeeper[wp->donor].voteResponse.termHistory; wp->propTermHistory.n_entries = dth->n_entries + 1; wp->propTermHistory.entries = palloc(sizeof(TermSwitchEntry) * wp->propTermHistory.n_entries); - memcpy(wp->propTermHistory.entries, dth->entries, sizeof(TermSwitchEntry) * dth->n_entries); + if (dth->n_entries > 0) + memcpy(wp->propTermHistory.entries, dth->entries, sizeof(TermSwitchEntry) * dth->n_entries); wp->propTermHistory.entries[wp->propTermHistory.n_entries - 1].term = wp->propTerm; wp->propTermHistory.entries[wp->propTermHistory.n_entries - 1].lsn = wp->propEpochStartLsn; diff --git a/poetry.lock b/poetry.lock index 2cd2bc6383..c471d3e69c 100644 --- a/poetry.lock +++ b/poetry.lock @@ -1,4 +1,4 @@ -# This file is automatically @generated by Poetry 1.8.5 and should not be changed by hand. +# This file is automatically @generated by Poetry 2.0.1 and should not be changed by hand. [[package]] name = "aiohappyeyeballs" @@ -6,6 +6,7 @@ version = "2.3.5" description = "Happy Eyeballs for asyncio" optional = false python-versions = ">=3.8" +groups = ["main"] files = [ {file = "aiohappyeyeballs-2.3.5-py3-none-any.whl", hash = "sha256:4d6dea59215537dbc746e93e779caea8178c866856a721c9c660d7a5a7b8be03"}, {file = "aiohappyeyeballs-2.3.5.tar.gz", hash = "sha256:6fa48b9f1317254f122a07a131a86b71ca6946ca989ce6326fff54a99a920105"}, @@ -17,6 +18,7 @@ version = "3.10.11" description = "Async http client/server framework (asyncio)" optional = false python-versions = ">=3.8" +groups = ["main"] files = [ {file = "aiohttp-3.10.11-cp310-cp310-macosx_10_9_universal2.whl", hash = "sha256:5077b1a5f40ffa3ba1f40d537d3bec4383988ee51fbba6b74aa8fb1bc466599e"}, {file = "aiohttp-3.10.11-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:8d6a14a4d93b5b3c2891fca94fa9d41b2322a68194422bef0dd5ec1e57d7d298"}, @@ -128,6 +130,7 @@ version = "1.4.0" description = "Postgres integration with asyncio." 
optional = false python-versions = ">=3.7" +groups = ["main"] files = [ {file = "aiopg-1.4.0-py3-none-any.whl", hash = "sha256:aea46e8aff30b039cfa818e6db4752c97656e893fc75e5a5dc57355a9e9dedbd"}, {file = "aiopg-1.4.0.tar.gz", hash = "sha256:116253bef86b4d954116716d181e9a0294037f266718b2e1c9766af995639d71"}, @@ -146,6 +149,7 @@ version = "1.3.1" description = "aiosignal: a list of registered asynchronous callbacks" optional = false python-versions = ">=3.7" +groups = ["main"] files = [ {file = "aiosignal-1.3.1-py3-none-any.whl", hash = "sha256:f8376fb07dd1e86a584e4fcdec80b36b7f81aac666ebc724e2c090300dd83b17"}, {file = "aiosignal-1.3.1.tar.gz", hash = "sha256:54cd96e15e1649b75d6c87526a6ff0b6c1b0dd3459f43d9ca11d48c339b68cfc"}, @@ -160,6 +164,7 @@ version = "2.13.2" description = "Allure pytest integration" optional = false python-versions = "*" +groups = ["main"] files = [ {file = "allure-pytest-2.13.2.tar.gz", hash = "sha256:22243159e8ec81ce2b5254b4013802198821b1b42f118f69d4a289396607c7b3"}, {file = "allure_pytest-2.13.2-py3-none-any.whl", hash = "sha256:17de9dbee7f61c8e66a5b5e818b00e419dbcea44cb55c24319401ba813220690"}, @@ -175,6 +180,7 @@ version = "2.13.2" description = "Common module for integrate allure with python-based frameworks" optional = false python-versions = ">=3.6" +groups = ["main"] files = [ {file = "allure-python-commons-2.13.2.tar.gz", hash = "sha256:8a03681330231b1deadd86b97ff68841c6591320114ae638570f1ed60d7a2033"}, {file = "allure_python_commons-2.13.2-py3-none-any.whl", hash = "sha256:2bb3646ec3fbf5b36d178a5e735002bc130ae9f9ba80f080af97d368ba375051"}, @@ -190,6 +196,7 @@ version = "0.6.0" description = "Reusable constraint types to use with typing.Annotated" optional = false python-versions = ">=3.8" +groups = ["main"] files = [ {file = "annotated_types-0.6.0-py3-none-any.whl", hash = "sha256:0641064de18ba7a25dee8f96403ebc39113d0cb953a01429249d5c7564666a43"}, {file = "annotated_types-0.6.0.tar.gz", hash = 
"sha256:563339e807e53ffd9c267e99fc6d9ea23eb8443c08f112651963e24e22f84a5d"}, @@ -201,6 +208,7 @@ version = "4.13.1" description = "ANTLR 4.13.1 runtime for Python 3" optional = false python-versions = "*" +groups = ["main"] files = [ {file = "antlr4-python3-runtime-4.13.1.tar.gz", hash = "sha256:3cd282f5ea7cfb841537fe01f143350fdb1c0b1ce7981443a2fa8513fddb6d1a"}, {file = "antlr4_python3_runtime-4.13.1-py3-none-any.whl", hash = "sha256:78ec57aad12c97ac039ca27403ad61cb98aaec8a3f9bb8144f889aa0fa28b943"}, @@ -212,6 +220,7 @@ version = "4.3.0" description = "High level compatibility layer for multiple asynchronous event loop implementations" optional = false python-versions = ">=3.8" +groups = ["main"] files = [ {file = "anyio-4.3.0-py3-none-any.whl", hash = "sha256:048e05d0f6caeed70d731f3db756d35dcc1f35747c8c403364a8332c630441b8"}, {file = "anyio-4.3.0.tar.gz", hash = "sha256:f75253795a87df48568485fd18cdd2a3fa5c4f7c5be8e5e36637733fce06fed6"}, @@ -232,6 +241,7 @@ version = "4.0.3" description = "Timeout context manager for asyncio programs" optional = false python-versions = ">=3.7" +groups = ["main"] files = [ {file = "async-timeout-4.0.3.tar.gz", hash = "sha256:4640d96be84d82d02ed59ea2b7105a0f7b33abe8703703cd0ab0bf87c427522f"}, {file = "async_timeout-4.0.3-py3-none-any.whl", hash = "sha256:7405140ff1230c310e51dc27b3145b9092d659ce68ff733fb0cefe3ee42be028"}, @@ -243,6 +253,7 @@ version = "0.30.0" description = "An asyncio PostgreSQL driver" optional = false python-versions = ">=3.8.0" +groups = ["main"] files = [ {file = "asyncpg-0.30.0-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:bfb4dd5ae0699bad2b233672c8fc5ccbd9ad24b89afded02341786887e37927e"}, {file = "asyncpg-0.30.0-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:dc1f62c792752a49f88b7e6f774c26077091b44caceb1983509edc18a2222ec0"}, @@ -306,6 +317,7 @@ version = "21.4.0" description = "Classes Without Boilerplate" optional = false python-versions = ">=2.7, !=3.0.*, !=3.1.*, !=3.2.*, !=3.3.*, !=3.4.*" +groups = 
["main"] files = [ {file = "attrs-21.4.0-py2.py3-none-any.whl", hash = "sha256:2d27e3784d7a565d36ab851fe94887c5eccd6a463168875832a1be79c82828b4"}, {file = "attrs-21.4.0.tar.gz", hash = "sha256:626ba8234211db98e869df76230a137c4c40a12d72445c45d5f5b716f076e2fd"}, @@ -323,6 +335,7 @@ version = "1.88.0" description = "AWS SAM Translator is a library that transform SAM templates into AWS CloudFormation templates" optional = false python-versions = "!=4.0,<=4.0,>=3.8" +groups = ["main"] files = [ {file = "aws_sam_translator-1.88.0-py3-none-any.whl", hash = "sha256:aa93d498d8de3fb3d485c316155b1628144b823bbc176099a20de06df666fcac"}, {file = "aws_sam_translator-1.88.0.tar.gz", hash = "sha256:e77c65f3488566122277accd44a0f1ec018e37403e0d5fe25120d96e537e91a7"}, @@ -343,6 +356,7 @@ version = "2.10.0" description = "The AWS X-Ray SDK for Python (the SDK) enables Python developers to record and emit information from within their applications to the AWS X-Ray service." optional = false python-versions = "*" +groups = ["main"] files = [ {file = "aws-xray-sdk-2.10.0.tar.gz", hash = "sha256:9b14924fd0628cf92936055864655354003f0b1acc3e1c3ffde6403d0799dd7a"}, {file = "aws_xray_sdk-2.10.0-py2.py3-none-any.whl", hash = "sha256:7551e81a796e1a5471ebe84844c40e8edf7c218db33506d046fec61f7495eda4"}, @@ -358,6 +372,7 @@ version = "2.2.1" description = "Function decoration for backoff and retry" optional = false python-versions = ">=3.7,<4.0" +groups = ["main"] files = [ {file = "backoff-2.2.1-py3-none-any.whl", hash = "sha256:63579f9a0628e06278f7e47b7d7d5b6ce20dc65c5e96a6f3ca99a6adca0396e8"}, {file = "backoff-2.2.1.tar.gz", hash = "sha256:03f829f5bb1923180821643f8753b0502c3b682293992485b0eef2807afa5cba"}, @@ -369,6 +384,7 @@ version = "1.34.11" description = "The AWS SDK for Python" optional = false python-versions = ">= 3.8" +groups = ["main"] files = [ {file = "boto3-1.34.11-py3-none-any.whl", hash = "sha256:1af021e0c6e3040e8de66d403e963566476235bb70f9a8e3f6784813ac2d8026"}, {file = 
"boto3-1.34.11.tar.gz", hash = "sha256:31c130a40ec0631059b77d7e87f67ad03ff1685a5b37638ac0c4687026a3259d"}, @@ -388,6 +404,7 @@ version = "1.26.16" description = "Type annotations for boto3 1.26.16 generated with mypy-boto3-builder 7.11.11" optional = false python-versions = ">=3.7" +groups = ["main"] files = [ {file = "boto3-stubs-1.26.16.tar.gz", hash = "sha256:618253ae19f1480785759bcaee8c8b10ed3fc037027247c26a3461a50f58406d"}, {file = "boto3_stubs-1.26.16-py3-none-any.whl", hash = "sha256:8cf2925bc3e1349c93eb0f49c1061affc5ca314d69eeb335349037969d0787ed"}, @@ -732,6 +749,7 @@ version = "1.34.11" description = "Low-level, data-driven core of boto 3." optional = false python-versions = ">= 3.8" +groups = ["main"] files = [ {file = "botocore-1.34.11-py3-none-any.whl", hash = "sha256:1ff1398b6ea670e1c01ac67a33af3da854f8e700d3528289c04f319c330d8250"}, {file = "botocore-1.34.11.tar.gz", hash = "sha256:51905c3d623c60df5dc5794387de7caf886d350180a01a3dfa762e903edb45a9"}, @@ -751,6 +769,7 @@ version = "1.27.38" description = "Type annotations for botocore 1.27.38 generated with mypy-boto3-builder 7.10.1" optional = false python-versions = ">=3.7" +groups = ["main"] files = [ {file = "botocore-stubs-1.27.38.tar.gz", hash = "sha256:408e8b86b5d171b58f81c74ca9d3b5317a5a8e2d3bc2073aa841ac13b8939e56"}, {file = "botocore_stubs-1.27.38-py3-none-any.whl", hash = "sha256:7add7641e9a479a9c8366893bb522fd9ca3d58714201e43662a200a148a1bc38"}, @@ -765,6 +784,7 @@ version = "2024.7.4" description = "Python package for providing Mozilla's CA Bundle." optional = false python-versions = ">=3.6" +groups = ["main"] files = [ {file = "certifi-2024.7.4-py3-none-any.whl", hash = "sha256:c198e21b1289c2ab85ee4e67bb4b4ef3ead0892059901a8d5b622f24a1101e90"}, {file = "certifi-2024.7.4.tar.gz", hash = "sha256:5a1e7645bc0ec61a09e26c36f6106dd4cf40c6db3a1fb6352b0244e7fb057c7b"}, @@ -776,6 +796,7 @@ version = "1.17.1" description = "Foreign Function Interface for Python calling C code." 
optional = false python-versions = ">=3.8" +groups = ["main"] files = [ {file = "cffi-1.17.1-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:df8b1c11f177bc2313ec4b2d46baec87a5f3e71fc8b45dab2ee7cae86d9aba14"}, {file = "cffi-1.17.1-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:8f2cdc858323644ab277e9bb925ad72ae0e67f69e804f4898c070998d50b1a67"}, @@ -855,6 +876,7 @@ version = "0.87.1" description = "Checks CloudFormation templates for practices and behaviour that could potentially be improved" optional = false python-versions = "!=4.0,<=4.0,>=3.8" +groups = ["main"] files = [ {file = "cfn_lint-0.87.1-py3-none-any.whl", hash = "sha256:d450f450635fc223b6f66880ccac52a5fd1a52966fa1705f1ba52b88dfed3071"}, {file = "cfn_lint-0.87.1.tar.gz", hash = "sha256:b3ce9d3e5e0eadcea5d584c8ccaa00bf2a990a36a64d7ffd8683bc60b7e4f06f"}, @@ -878,6 +900,7 @@ version = "2.1.0" description = "The Real First Universal Charset Detector. Open, modern and actively maintained alternative to Chardet." optional = false python-versions = ">=3.6.0" +groups = ["main"] files = [ {file = "charset-normalizer-2.1.0.tar.gz", hash = "sha256:575e708016ff3a5e3681541cb9d79312c416835686d054a23accb873b254f413"}, {file = "charset_normalizer-2.1.0-py3-none-any.whl", hash = "sha256:5189b6f22b01957427f35b6a08d9a0bc45b46d3788ef5a92e978433c7a35f8a5"}, @@ -892,6 +915,7 @@ version = "8.1.3" description = "Composable command line interface toolkit" optional = false python-versions = ">=3.7" +groups = ["main"] files = [ {file = "click-8.1.3-py3-none-any.whl", hash = "sha256:bb4d8133cb15a609f44e8213d9b391b0809795062913b383c62be0ee95b1db48"}, {file = "click-8.1.3.tar.gz", hash = "sha256:7682dc8afb30297001674575ea00d1814d808d6a36af415a82bd481d37ba7b8e"}, @@ -906,6 +930,7 @@ version = "0.7.17" description = "ClickHouse Database Core Driver for Python, Pandas, and Superset" optional = false python-versions = "~=3.8" +groups = ["main"] files = [ {file = "clickhouse-connect-0.7.17.tar.gz", hash = 
"sha256:854f1f9f3e024e7f89ae5d57cd3289d7a4c3dc91a9f24c4d233014f0ea19cb2d"}, {file = "clickhouse_connect-0.7.17-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:aca36f5f28be1ada2981fce87724bbf451f267c918015baec59e527de3c9c882"}, @@ -996,6 +1021,8 @@ version = "0.4.5" description = "Cross-platform colored terminal text." optional = false python-versions = ">=2.7, !=3.0.*, !=3.1.*, !=3.2.*, !=3.3.*, !=3.4.*" +groups = ["main"] +markers = "sys_platform == \"win32\" or platform_system == \"Windows\"" files = [ {file = "colorama-0.4.5-py2.py3-none-any.whl", hash = "sha256:854bf444933e37f5824ae7bfc1e98d5bce2ebe4160d46b5edf346a89358e99da"}, {file = "colorama-0.4.5.tar.gz", hash = "sha256:e6c6b4334fc50988a639d9b98aa429a0b57da6e17b9a44f0451f930b6967b7a4"}, @@ -1007,6 +1034,7 @@ version = "43.0.1" description = "cryptography is a package which provides cryptographic recipes and primitives to Python developers." optional = false python-versions = ">=3.7" +groups = ["main"] files = [ {file = "cryptography-43.0.1-cp37-abi3-macosx_10_9_universal2.whl", hash = "sha256:8385d98f6a3bf8bb2d65a73e17ed87a3ba84f6991c155691c51112075f9ffc5d"}, {file = "cryptography-43.0.1-cp37-abi3-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:27e613d7077ac613e399270253259d9d53872aaf657471473ebfc9a52935c062"}, @@ -1056,6 +1084,7 @@ version = "7.1.0" description = "A Python library for the Docker Engine API." 
optional = false python-versions = ">=3.8" +groups = ["main"] files = [ {file = "docker-7.1.0-py3-none-any.whl", hash = "sha256:c96b93b7f0a746f9e77d325bcfb87422a3d8bd4f03136ae8a85b37f1898d5fc0"}, {file = "docker-7.1.0.tar.gz", hash = "sha256:ad8c70e6e3f8926cb8a92619b832b4ea5299e2831c14284663184e200546fa6c"}, @@ -1078,6 +1107,7 @@ version = "1.9.0" description = "execnet: rapid multi-Python deployment" optional = false python-versions = ">=2.7, !=3.0.*, !=3.1.*, !=3.2.*, !=3.3.*, !=3.4.*" +groups = ["main"] files = [ {file = "execnet-1.9.0-py2.py3-none-any.whl", hash = "sha256:a295f7cc774947aac58dde7fdc85f4aa00c42adf5d8f5468fc630c1acf30a142"}, {file = "execnet-1.9.0.tar.gz", hash = "sha256:8f694f3ba9cc92cab508b152dcfe322153975c29bda272e2fd7f3f00f36e47c5"}, @@ -1092,6 +1122,7 @@ version = "2.2.5" description = "A simple framework for building complex web applications." optional = false python-versions = ">=3.7" +groups = ["main"] files = [ {file = "Flask-2.2.5-py3-none-any.whl", hash = "sha256:58107ed83443e86067e41eff4631b058178191a355886f8e479e347fa1285fdf"}, {file = "Flask-2.2.5.tar.gz", hash = "sha256:edee9b0a7ff26621bd5a8c10ff484ae28737a2410d99b0bb9a6850c7fb977aa0"}, @@ -1113,6 +1144,7 @@ version = "5.0.0" description = "A Flask extension adding a decorator for CORS support" optional = false python-versions = "*" +groups = ["main"] files = [ {file = "Flask_Cors-5.0.0-py2.py3-none-any.whl", hash = "sha256:b9e307d082a9261c100d8fb0ba909eec6a228ed1b60a8315fd85f783d61910bc"}, {file = "flask_cors-5.0.0.tar.gz", hash = "sha256:5aadb4b950c4e93745034594d9f3ea6591f734bb3662e16e255ffbf5e89c88ef"}, @@ -1127,6 +1159,7 @@ version = "1.5.0" description = "A list-like structure which implements collections.abc.MutableSequence" optional = false python-versions = ">=3.8" +groups = ["main"] files = [ {file = "frozenlist-1.5.0-cp310-cp310-macosx_10_9_universal2.whl", hash = "sha256:5b6a66c18b5b9dd261ca98dffcb826a525334b2f29e7caa54e182255c5f6a65a"}, {file = 
"frozenlist-1.5.0-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:d1b3eb7b05ea246510b43a7e53ed1653e55c2121019a97e60cad7efb881a97bb"}, @@ -1228,6 +1261,7 @@ version = "3.2.1" description = "GraphQL implementation for Python, a port of GraphQL.js, the JavaScript reference implementation for GraphQL." optional = false python-versions = ">=3.6,<4" +groups = ["main"] files = [ {file = "graphql-core-3.2.1.tar.gz", hash = "sha256:9d1bf141427b7d54be944587c8349df791ce60ade2e3cccaf9c56368c133c201"}, {file = "graphql_core-3.2.1-py3-none-any.whl", hash = "sha256:f83c658e4968998eed1923a2e3e3eddd347e005ac0315fbb7ca4d70ea9156323"}, @@ -1239,6 +1273,7 @@ version = "0.14.0" description = "A pure-Python, bring-your-own-I/O implementation of HTTP/1.1" optional = false python-versions = ">=3.7" +groups = ["main"] files = [ {file = "h11-0.14.0-py3-none-any.whl", hash = "sha256:e3fe4ac4b851c468cc8363d500db52c2ead036020723024a109d37346efaa761"}, {file = "h11-0.14.0.tar.gz", hash = "sha256:8f19fbbe99e72420ff35c00b27a34cb9937e902a8b810e2c88300c6f0a3b699d"}, @@ -1247,27 +1282,33 @@ files = [ [[package]] name = "h2" version = "4.1.0" -description = "HTTP/2 State-Machine based protocol implementation" +description = "Pure-Python HTTP/2 protocol implementation" optional = false -python-versions = ">=3.6.1" -files = [ - {file = "h2-4.1.0-py3-none-any.whl", hash = "sha256:03a46bcf682256c95b5fd9e9a99c1323584c3eec6440d379b9903d709476bc6d"}, - {file = "h2-4.1.0.tar.gz", hash = "sha256:a83aca08fbe7aacb79fec788c9c0bac936343560ed9ec18b82a13a12c28d2abb"}, -] +python-versions = ">=3.9" +groups = ["main"] +files = [] +develop = false [package.dependencies] -hpack = ">=4.0,<5" -hyperframe = ">=6.0,<7" +hpack = ">=4.1,<5" +hyperframe = ">=6.1,<7" + +[package.source] +type = "git" +url = "https://github.com/python-hyper/h2" +reference = "HEAD" +resolved_reference = "0b98b244b5fd1fe96100ac14905417a3b70a4286" [[package]] name = "hpack" -version = "4.0.0" -description = "Pure-Python HPACK header 
compression" +version = "4.1.0" +description = "Pure-Python HPACK header encoding" optional = false -python-versions = ">=3.6.1" +python-versions = ">=3.9" +groups = ["main"] files = [ - {file = "hpack-4.0.0-py3-none-any.whl", hash = "sha256:84a076fad3dc9a9f8063ccb8041ef100867b1878b25ef0ee63847a5d53818a6c"}, - {file = "hpack-4.0.0.tar.gz", hash = "sha256:fc41de0c63e687ebffde81187a948221294896f6bdc0ae2312708df339430095"}, + {file = "hpack-4.1.0-py3-none-any.whl", hash = "sha256:157ac792668d995c657d93111f46b4535ed114f0c9c8d672271bbec7eae1b496"}, + {file = "hpack-4.1.0.tar.gz", hash = "sha256:ec5eca154f7056aa06f196a557655c5b009b382873ac8d1e66e79e87535f1dca"}, ] [[package]] @@ -1276,6 +1317,7 @@ version = "1.0.3" description = "A minimal low-level HTTP client." optional = false python-versions = ">=3.8" +groups = ["main"] files = [ {file = "httpcore-1.0.3-py3-none-any.whl", hash = "sha256:9a6a501c3099307d9fd76ac244e08503427679b1e81ceb1d922485e2f2462ad2"}, {file = "httpcore-1.0.3.tar.gz", hash = "sha256:5c0f9546ad17dac4d0772b0808856eb616eb8b48ce94f49ed819fd6982a8a544"}, @@ -1297,6 +1339,7 @@ version = "0.26.0" description = "The next generation HTTP client." 
optional = false python-versions = ">=3.8" +groups = ["main"] files = [ {file = "httpx-0.26.0-py3-none-any.whl", hash = "sha256:8915f5a3627c4d47b73e8202457cb28f1266982d1159bd5779d86a80c0eab1cd"}, {file = "httpx-0.26.0.tar.gz", hash = "sha256:451b55c30d5185ea6b23c2c793abf9bb237d2a7dfb901ced6ff69ad37ec1dfaf"}, @@ -1318,13 +1361,14 @@ socks = ["socksio (==1.*)"] [[package]] name = "hyperframe" -version = "6.0.1" -description = "HTTP/2 framing layer for Python" +version = "6.1.0" +description = "Pure-Python HTTP/2 framing" optional = false -python-versions = ">=3.6.1" +python-versions = ">=3.9" +groups = ["main"] files = [ - {file = "hyperframe-6.0.1-py3-none-any.whl", hash = "sha256:0ec6bafd80d8ad2195c4f03aacba3a8265e57bc4cff261e802bf39970ed02a15"}, - {file = "hyperframe-6.0.1.tar.gz", hash = "sha256:ae510046231dc8e9ecb1a6586f63d2347bf4c8905914aa84ba585ae85f28a914"}, + {file = "hyperframe-6.1.0-py3-none-any.whl", hash = "sha256:b03380493a519fce58ea5af42e4a42317bf9bd425596f7a0835ffce80f1a42e5"}, + {file = "hyperframe-6.1.0.tar.gz", hash = "sha256:f630908a00854a7adeabd6382b43923a4c4cd4b821fcb527e6ab9e15382a3b08"}, ] [[package]] @@ -1333,6 +1377,7 @@ version = "3.7" description = "Internationalized Domain Names in Applications (IDNA)" optional = false python-versions = ">=3.5" +groups = ["main"] files = [ {file = "idna-3.7-py3-none-any.whl", hash = "sha256:82fee1fc78add43492d3a1898bfa6d8a904cc97d8427f683ed8e798d07761aa0"}, {file = "idna-3.7.tar.gz", hash = "sha256:028ff3aadf0609c1fd278d8ea3089299412a7a8b9bd005dd08b9f8285bcb5cfc"}, @@ -1344,6 +1389,7 @@ version = "1.1.1" description = "iniconfig: brain-dead simple config-ini parsing" optional = false python-versions = "*" +groups = ["main"] files = [ {file = "iniconfig-1.1.1-py2.py3-none-any.whl", hash = "sha256:011e24c64b7f47f6ebd835bb12a743f2fbe9a26d4cecaa7f53bc4f35ee9da8b3"}, {file = "iniconfig-1.1.1.tar.gz", hash = "sha256:bc3af051d7d14b2ee5ef9969666def0cd1a000e121eaea580d4a313df4b37f32"}, @@ -1355,6 +1401,7 @@ 
version = "2.1.2" description = "Safely pass data to untrusted environments and back." optional = false python-versions = ">=3.7" +groups = ["main"] files = [ {file = "itsdangerous-2.1.2-py3-none-any.whl", hash = "sha256:2c2349112351b88699d8d4b6b075022c0808887cb7ad10069318a8b0bc88db44"}, {file = "itsdangerous-2.1.2.tar.gz", hash = "sha256:5dbbc68b317e5e42f327f9021763545dc3fc3bfe22e6deb96aaf1fc38874156a"}, @@ -1366,6 +1413,7 @@ version = "3.1.5" description = "A very fast and expressive template engine." optional = false python-versions = ">=3.7" +groups = ["main"] files = [ {file = "jinja2-3.1.5-py3-none-any.whl", hash = "sha256:aba0f4dc9ed8013c424088f68a5c226f7d6097ed89b246d7749c2ec4175c6adb"}, {file = "jinja2-3.1.5.tar.gz", hash = "sha256:8fefff8dc3034e27bb80d67c671eb8a9bc424c0ef4c0826edbff304cceff43bb"}, @@ -1383,6 +1431,7 @@ version = "1.0.1" description = "JSON Matching Expressions" optional = false python-versions = ">=3.7" +groups = ["main"] files = [ {file = "jmespath-1.0.1-py3-none-any.whl", hash = "sha256:02e2e4cc71b5bcab88332eebf907519190dd9e6e82107fa7f83b1003a6252980"}, {file = "jmespath-1.0.1.tar.gz", hash = "sha256:90261b206d6defd58fdd5e85f478bf633a2901798906be2ad389150c5c60edbe"}, @@ -1394,6 +1443,7 @@ version = "0.9.0" description = "The ultimate Python library for JOSE RFCs, including JWS, JWE, JWK, JWA, JWT" optional = false python-versions = ">=3.8" +groups = ["main"] files = [ {file = "joserfc-0.9.0-py3-none-any.whl", hash = "sha256:4026bdbe2c196cd40574e916fa1e28874d99649412edaab0e373dec3077153fb"}, {file = "joserfc-0.9.0.tar.gz", hash = "sha256:eebca7f587b1761ce43a98ffd5327f2b600b9aa5bb0a77b947687f503ad43bc0"}, @@ -1411,6 +1461,7 @@ version = "1.2.3" description = "Generate source code for Python classes from a JSON schema." 
optional = false python-versions = ">= 2.7" +groups = ["main"] files = [ {file = "jschema_to_python-1.2.3-py3-none-any.whl", hash = "sha256:8a703ca7604d42d74b2815eecf99a33359a8dccbb80806cce386d5e2dd992b05"}, {file = "jschema_to_python-1.2.3.tar.gz", hash = "sha256:76ff14fe5d304708ccad1284e4b11f96a658949a31ee7faed9e0995279549b91"}, @@ -1427,6 +1478,7 @@ version = "2.0.0" description = "Diff JSON and JSON-like structures in Python" optional = false python-versions = "*" +groups = ["main"] files = [ {file = "jsondiff-2.0.0-py3-none-any.whl", hash = "sha256:689841d66273fc88fc79f7d33f4c074774f4f214b6466e3aff0e5adaf889d1e0"}, {file = "jsondiff-2.0.0.tar.gz", hash = "sha256:2795844ef075ec8a2b8d385c4d59f5ea48b08e7180fce3cb2787be0db00b1fb4"}, @@ -1438,6 +1490,8 @@ version = "0.20.0" description = "Python bindings for Jsonnet - The data templating language" optional = false python-versions = "*" +groups = ["main"] +markers = "python_version < \"3.13\"" files = [ {file = "jsonnet-0.20.0.tar.gz", hash = "sha256:7e770c7bf3a366b97b650a39430450f77612e74406731eb75c5bd59f3f104d4f"}, ] @@ -1448,6 +1502,7 @@ version = "1.32" description = "Apply JSON-Patches (RFC 6902)" optional = false python-versions = ">=2.7, !=3.0.*, !=3.1.*, !=3.2.*, !=3.3.*, !=3.4.*" +groups = ["main"] files = [ {file = "jsonpatch-1.32-py2.py3-none-any.whl", hash = "sha256:26ac385719ac9f54df8a2f0827bb8253aa3ea8ab7b3368457bcdb8c14595a397"}, {file = "jsonpatch-1.32.tar.gz", hash = "sha256:b6ddfe6c3db30d81a96aaeceb6baf916094ffa23d7dd5fa2c13e13f8b6e600c2"}, @@ -1462,6 +1517,7 @@ version = "1.6.1" description = "A final implementation of JSONPath for Python that aims to be standard compliant, including arithmetic and binary comparison operators and providing clear AST for metaprogramming." 
optional = false python-versions = "*" +groups = ["main"] files = [ {file = "jsonpath-ng-1.6.1.tar.gz", hash = "sha256:086c37ba4917304850bd837aeab806670224d3f038fe2833ff593a672ef0a5fa"}, {file = "jsonpath_ng-1.6.1-py3-none-any.whl", hash = "sha256:8f22cd8273d7772eea9aaa84d922e0841aa36fdb8a2c6b7f6c3791a16a9bc0be"}, @@ -1476,6 +1532,7 @@ version = "2.2.0" description = "Python library for serializing any arbitrary object graph into JSON" optional = false python-versions = ">=2.7" +groups = ["main"] files = [ {file = "jsonpickle-2.2.0-py2.py3-none-any.whl", hash = "sha256:de7f2613818aa4f234138ca11243d6359ff83ae528b2185efdd474f62bcf9ae1"}, {file = "jsonpickle-2.2.0.tar.gz", hash = "sha256:7b272918b0554182e53dc340ddd62d9b7f902fec7e7b05620c04f3ccef479a0e"}, @@ -1492,6 +1549,7 @@ version = "2.3" description = "Identify specific nodes in a JSON document (RFC 6901)" optional = false python-versions = ">=2.7, !=3.0.*, !=3.1.*, !=3.2.*, !=3.3.*" +groups = ["main"] files = [ {file = "jsonpointer-2.3-py2.py3-none-any.whl", hash = "sha256:51801e558539b4e9cd268638c078c6c5746c9ac96bc38152d443400e4f3793e9"}, {file = "jsonpointer-2.3.tar.gz", hash = "sha256:97cba51526c829282218feb99dab1b1e6bdf8efd1c43dc9d57be093c0d69c99a"}, @@ -1503,6 +1561,7 @@ version = "4.17.3" description = "An implementation of JSON Schema validation for Python" optional = false python-versions = ">=3.7" +groups = ["main"] files = [ {file = "jsonschema-4.17.3-py3-none-any.whl", hash = "sha256:a870ad254da1a8ca84b6a2905cac29d265f805acc57af304784962a2aa6508f6"}, {file = "jsonschema-4.17.3.tar.gz", hash = "sha256:0f864437ab8b6076ba6707453ef8f98a6a0d512a80e93f8abdb676f737ecb60d"}, @@ -1522,6 +1581,7 @@ version = "0.1.6" description = "JSONSchema Spec with object-oriented paths" optional = false python-versions = ">=3.7.0,<4.0.0" +groups = ["main"] files = [ {file = "jsonschema_spec-0.1.6-py3-none-any.whl", hash = "sha256:f2206d18c89d1824c1f775ba14ed039743b41a9167bd2c5bdb774b66b3ca0bbf"}, {file = 
"jsonschema_spec-0.1.6.tar.gz", hash = "sha256:90215863b56e212086641956b20127ccbf6d8a3a38343dad01d6a74d19482f76"}, @@ -1539,6 +1599,7 @@ version = "1.9" description = "Creates JUnit XML test result documents that can be read by tools such as Jenkins" optional = false python-versions = "*" +groups = ["main"] files = [ {file = "junit-xml-1.9.tar.gz", hash = "sha256:de16a051990d4e25a3982b2dd9e89d671067548718866416faec14d9de56db9f"}, {file = "junit_xml-1.9-py2.py3-none-any.whl", hash = "sha256:ec5ca1a55aefdd76d28fcc0b135251d156c7106fa979686a4b48d62b761b4732"}, @@ -1553,6 +1614,7 @@ version = "1.5.6" description = "Implementation of JOSE Web standards" optional = false python-versions = ">= 3.8" +groups = ["main"] files = [ {file = "jwcrypto-1.5.6-py3-none-any.whl", hash = "sha256:150d2b0ebbdb8f40b77f543fb44ffd2baeff48788be71f67f03566692fd55789"}, {file = "jwcrypto-1.5.6.tar.gz", hash = "sha256:771a87762a0c081ae6166958a954f80848820b2ab066937dc8b8379d65b1b039"}, @@ -1568,6 +1630,7 @@ version = "2.0.2" description = "Pure Python client for Apache Kafka" optional = false python-versions = "*" +groups = ["main"] files = [ {file = "kafka-python-2.0.2.tar.gz", hash = "sha256:04dfe7fea2b63726cd6f3e79a2d86e709d608d74406638c5da33a01d45a9d7e3"}, {file = "kafka_python-2.0.2-py2.py3-none-any.whl", hash = "sha256:2d92418c7cb1c298fa6c7f0fb3519b520d0d7526ac6cb7ae2a4fc65a51a94b6e"}, @@ -1582,6 +1645,7 @@ version = "1.10.0" description = "A fast and thorough lazy object proxy." 
optional = false python-versions = ">=3.8" +groups = ["main"] files = [ {file = "lazy-object-proxy-1.10.0.tar.gz", hash = "sha256:78247b6d45f43a52ef35c25b5581459e85117225408a4128a3daf8bf9648ac69"}, {file = "lazy_object_proxy-1.10.0-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:855e068b0358ab916454464a884779c7ffa312b8925c6f7401e952dcf3b89977"}, @@ -1628,6 +1692,7 @@ version = "4.3.3" description = "LZ4 Bindings for Python" optional = false python-versions = ">=3.8" +groups = ["main"] files = [ {file = "lz4-4.3.3-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:b891880c187e96339474af2a3b2bfb11a8e4732ff5034be919aa9029484cd201"}, {file = "lz4-4.3.3-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:222a7e35137d7539c9c33bb53fcbb26510c5748779364014235afc62b0ec797f"}, @@ -1678,6 +1743,7 @@ version = "2.1.1" description = "Safely add untrusted strings to HTML/XML markup." optional = false python-versions = ">=3.7" +groups = ["main"] files = [ {file = "MarkupSafe-2.1.1-cp310-cp310-macosx_10_9_universal2.whl", hash = "sha256:86b1f75c4e7c2ac2ccdaec2b9022845dbb81880ca318bb7a0a01fbf7813e3812"}, {file = "MarkupSafe-2.1.1-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:f121a1420d4e173a5d96e47e9a0c0dcff965afdf1626d28de1460815f7c4ee7a"}, @@ -1727,6 +1793,7 @@ version = "5.0.6" description = "" optional = false python-versions = ">=3.8" +groups = ["main"] files = [ {file = "moto-5.0.6-py2.py3-none-any.whl", hash = "sha256:ca1e22831a741733b581ff2ef4d6ae2e1c6db1eab97af1b78b86ca2c6e88c609"}, {file = "moto-5.0.6.tar.gz", hash = "sha256:ad8b23f2b555ad694da8b2432a42b6d96beaaf67a4e7d932196a72193a2eee2c"}, @@ -1786,6 +1853,7 @@ version = "1.3.0" description = "Python library for arbitrary-precision floating-point arithmetic" optional = false python-versions = "*" +groups = ["main"] files = [ {file = "mpmath-1.3.0-py3-none-any.whl", hash = "sha256:a0b2b9fe80bbcd81a6647ff13108738cfb482d481d826cc0e02f5b35e5c88d2c"}, {file = "mpmath-1.3.0.tar.gz", hash = 
"sha256:7a28eb2a9774d00c7bc92411c19a89209d5da7c4c9a9e227be8330a23a25b91f"}, @@ -1803,6 +1871,7 @@ version = "6.0.5" description = "multidict implementation" optional = false python-versions = ">=3.7" +groups = ["main"] files = [ {file = "multidict-6.0.5-cp310-cp310-macosx_10_9_universal2.whl", hash = "sha256:228b644ae063c10e7f324ab1ab6b548bdf6f8b47f3ec234fef1093bc2735e5f9"}, {file = "multidict-6.0.5-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:896ebdcf62683551312c30e20614305f53125750803b614e9e6ce74a96232604"}, @@ -1902,6 +1971,7 @@ version = "1.13.0" description = "Optional static typing for Python" optional = false python-versions = ">=3.8" +groups = ["dev"] files = [ {file = "mypy-1.13.0-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:6607e0f1dd1fb7f0aca14d936d13fd19eba5e17e1cd2a14f808fa5f8f6d8f60a"}, {file = "mypy-1.13.0-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:8a21be69bd26fa81b1f80a61ee7ab05b076c674d9b18fb56239d72e21d9f4c80"}, @@ -1954,6 +2024,7 @@ version = "1.26.0.post1" description = "Type annotations for boto3.S3 1.26.0 service generated with mypy-boto3-builder 7.11.10" optional = false python-versions = ">=3.7" +groups = ["main"] files = [ {file = "mypy-boto3-s3-1.26.0.post1.tar.gz", hash = "sha256:6d7079f8c739dc993cbedad0736299c413b297814b73795a3855a79169ecc938"}, {file = "mypy_boto3_s3-1.26.0.post1-py3-none-any.whl", hash = "sha256:7de2792ff0cc541b84cd46ff3a6aa2b6e5f267217f2203f27f6e4016bddc644d"}, @@ -1968,6 +2039,7 @@ version = "1.0.0" description = "Type system extensions for programs checked with the mypy type checker." 
optional = false python-versions = ">=3.5" +groups = ["dev"] files = [ {file = "mypy_extensions-1.0.0-py3-none-any.whl", hash = "sha256:4392f6c0eb8a5668a69e23d168ffa70f0be9ccfd32b5cc2d26a34ae5b844552d"}, {file = "mypy_extensions-1.0.0.tar.gz", hash = "sha256:75dbf8955dc00442a438fc4d0666508a9a97b6bd41aa2f0ffe9d2f2725af0782"}, @@ -1979,6 +2051,7 @@ version = "2.8.5" description = "Python package for creating and manipulating graphs and networks" optional = false python-versions = ">=3.8" +groups = ["main"] files = [ {file = "networkx-2.8.5-py3-none-any.whl", hash = "sha256:a762f4b385692d9c3a6f2912d058d76d29a827deaedf9e63ed14d397b8030687"}, {file = "networkx-2.8.5.tar.gz", hash = "sha256:15a7b81a360791c458c55a417418ea136c13378cfdc06a2dcdc12bd2f9cf09c1"}, @@ -1997,6 +2070,7 @@ version = "0.4.4" description = "OpenAPI schema validation for Python" optional = false python-versions = ">=3.7.0,<4.0.0" +groups = ["main"] files = [ {file = "openapi_schema_validator-0.4.4-py3-none-any.whl", hash = "sha256:79f37f38ef9fd5206b924ed7a6f382cea7b649b3b56383c47f1906082b7b9015"}, {file = "openapi_schema_validator-0.4.4.tar.gz", hash = "sha256:c573e2be2c783abae56c5a1486ab716ca96e09d1c3eab56020d1dc680aa57bf8"}, @@ -2015,6 +2089,7 @@ version = "0.5.7" description = "OpenAPI 2.0 (aka Swagger) and OpenAPI 3 spec validator" optional = false python-versions = ">=3.7.0,<4.0.0" +groups = ["main"] files = [ {file = "openapi_spec_validator-0.5.7-py3-none-any.whl", hash = "sha256:8712d2879db7692974ef89c47a3ebfc79436442921ec3a826ac0ce80cde8c549"}, {file = "openapi_spec_validator-0.5.7.tar.gz", hash = "sha256:6c2d42180045a80fd6314de848b94310bdb0fa4949f4b099578b69f79d9fa5ac"}, @@ -2032,6 +2107,7 @@ version = "24.2" description = "Core utilities for Python packages" optional = false python-versions = ">=3.8" +groups = ["main"] files = [ {file = "packaging-24.2-py3-none-any.whl", hash = "sha256:09abb1bccd265c01f4a3aa3f7a7db064b36514d2cba19a2f694fe6150451a759"}, {file = "packaging-24.2.tar.gz", hash = 
"sha256:c228a6dc5e932d346bc5739379109d49e8853dd8223571c7c5b55260edc0b97f"}, @@ -2043,6 +2119,7 @@ version = "0.4.3" description = "Object-oriented paths" optional = false python-versions = ">=3.7.0,<4.0.0" +groups = ["main"] files = [ {file = "pathable-0.4.3-py3-none-any.whl", hash = "sha256:cdd7b1f9d7d5c8b8d3315dbf5a86b2596053ae845f056f57d97c0eefff84da14"}, {file = "pathable-0.4.3.tar.gz", hash = "sha256:5c869d315be50776cc8a993f3af43e0c60dc01506b399643f919034ebf4cdcab"}, @@ -2054,6 +2131,7 @@ version = "5.9.0" description = "Python Build Reasonableness" optional = false python-versions = ">=2.6" +groups = ["main"] files = [ {file = "pbr-5.9.0-py2.py3-none-any.whl", hash = "sha256:e547125940bcc052856ded43be8e101f63828c2d94239ffbe2b327ba3d5ccf0a"}, {file = "pbr-5.9.0.tar.gz", hash = "sha256:e8dca2f4b43560edef58813969f52a56cef023146cbb8931626db80e6c1c4308"}, @@ -2065,6 +2143,7 @@ version = "1.0.0" description = "plugin and hook calling mechanisms for python" optional = false python-versions = ">=3.6" +groups = ["main"] files = [ {file = "pluggy-1.0.0-py2.py3-none-any.whl", hash = "sha256:74134bbf457f031a36d68416e1509f34bd5ccc019f0bcc952c7b909d06b37bd3"}, {file = "pluggy-1.0.0.tar.gz", hash = "sha256:4224373bacce55f955a878bf9cfa763c1e360858e330072059e10bad68531159"}, @@ -2080,6 +2159,7 @@ version = "3.11" description = "Python Lex & Yacc" optional = false python-versions = "*" +groups = ["main"] files = [ {file = "ply-3.11-py2.py3-none-any.whl", hash = "sha256:096f9b8350b65ebd2fd1346b12452efe5b9607f7482813ffca50c22722a807ce"}, {file = "ply-3.11.tar.gz", hash = "sha256:00c7c1aaa88358b9c765b6d3000c6eec0ba42abca5351b095321aef446081da3"}, @@ -2091,6 +2171,7 @@ version = "0.14.1" description = "Python client for the Prometheus monitoring system." 
optional = false python-versions = ">=3.6" +groups = ["main"] files = [ {file = "prometheus_client-0.14.1-py3-none-any.whl", hash = "sha256:522fded625282822a89e2773452f42df14b5a8e84a86433e3f8a189c1d54dc01"}, {file = "prometheus_client-0.14.1.tar.gz", hash = "sha256:5459c427624961076277fdc6dc50540e2bacb98eebde99886e59ec55ed92093a"}, @@ -2105,6 +2186,7 @@ version = "0.2.0" description = "Accelerated property cache" optional = false python-versions = ">=3.8" +groups = ["main"] files = [ {file = "propcache-0.2.0-cp310-cp310-macosx_10_9_universal2.whl", hash = "sha256:c5869b8fd70b81835a6f187c5fdbe67917a04d7e52b6e7cc4e5fe39d55c39d58"}, {file = "propcache-0.2.0-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:952e0d9d07609d9c5be361f33b0d6d650cd2bae393aabb11d9b719364521984b"}, @@ -2212,6 +2294,7 @@ version = "5.9.4" description = "Cross-platform lib for process and system monitoring in Python." optional = false python-versions = ">=2.7, !=3.0.*, !=3.1.*, !=3.2.*, !=3.3.*" +groups = ["main"] files = [ {file = "psutil-5.9.4-cp27-cp27m-macosx_10_9_x86_64.whl", hash = "sha256:c1ca331af862803a42677c120aff8a814a804e09832f166f226bfd22b56feee8"}, {file = "psutil-5.9.4-cp27-cp27m-manylinux2010_i686.whl", hash = "sha256:68908971daf802203f3d37e78d3f8831b6d1014864d7a85937941bb35f09aefe"}, @@ -2238,6 +2321,7 @@ version = "2.9.10" description = "psycopg2 - Python-PostgreSQL Database Adapter" optional = false python-versions = ">=3.8" +groups = ["main"] files = [ {file = "psycopg2-binary-2.9.10.tar.gz", hash = "sha256:4b3df0e6990aa98acda57d983942eff13d824135fe2250e6522edaa782a06de2"}, {file = "psycopg2_binary-2.9.10-cp310-cp310-macosx_12_0_x86_64.whl", hash = "sha256:0ea8e3d0ae83564f2fc554955d327fa081d065c8ca5cc6d2abb643e2c9c1200f"}, @@ -2286,6 +2370,7 @@ files = [ {file = "psycopg2_binary-2.9.10-cp313-cp313-musllinux_1_2_i686.whl", hash = "sha256:bb89f0a835bcfc1d42ccd5f41f04870c1b936d8507c6df12b7737febc40f0909"}, {file = "psycopg2_binary-2.9.10-cp313-cp313-musllinux_1_2_ppc64le.whl", 
hash = "sha256:f0c2d907a1e102526dd2986df638343388b94c33860ff3bbe1384130828714b1"}, {file = "psycopg2_binary-2.9.10-cp313-cp313-musllinux_1_2_x86_64.whl", hash = "sha256:f8157bed2f51db683f31306aa497311b560f2265998122abe1dce6428bd86567"}, + {file = "psycopg2_binary-2.9.10-cp313-cp313-win_amd64.whl", hash = "sha256:27422aa5f11fbcd9b18da48373eb67081243662f9b46e6fd07c3eb46e4535142"}, {file = "psycopg2_binary-2.9.10-cp38-cp38-macosx_12_0_x86_64.whl", hash = "sha256:eb09aa7f9cecb45027683bb55aebaaf45a0df8bf6de68801a6afdc7947bb09d4"}, {file = "psycopg2_binary-2.9.10-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:b73d6d7f0ccdad7bc43e6d34273f70d587ef62f824d7261c4ae9b8b1b6af90e8"}, {file = "psycopg2_binary-2.9.10-cp38-cp38-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:ce5ab4bf46a211a8e924d307c1b1fcda82368586a19d0a24f8ae166f5c784864"}, @@ -2314,6 +2399,7 @@ version = "0.5.4" description = "Pure Python PartiQL Parser" optional = false python-versions = "*" +groups = ["main"] files = [ {file = "py_partiql_parser-0.5.4-py2.py3-none-any.whl", hash = "sha256:3dc4295a47da9587681a96b35c6e151886fdbd0a4acbe0d97c4c68e5f689d315"}, {file = "py_partiql_parser-0.5.4.tar.gz", hash = "sha256:72e043919538fa63edae72fb59afc7e3fd93adbde656718a7d2b4666f23dd114"}, @@ -2328,6 +2414,7 @@ version = "2.21" description = "C parser in Python" optional = false python-versions = ">=2.7, !=3.0.*, !=3.1.*, !=3.2.*, !=3.3.*" +groups = ["main"] files = [ {file = "pycparser-2.21-py2.py3-none-any.whl", hash = "sha256:8ee45429555515e1f6b185e78100aea234072576aa43ab53aefcae078162fca9"}, {file = "pycparser-2.21.tar.gz", hash = "sha256:e644fdec12f7872f86c58ff790da456218b10f863970249516d60a5eaca77206"}, @@ -2339,6 +2426,7 @@ version = "2.10.4" description = "Data validation using Python type hints" optional = false python-versions = ">=3.8" +groups = ["main"] files = [ {file = "pydantic-2.10.4-py3-none-any.whl", hash = 
"sha256:597e135ea68be3a37552fb524bc7d0d66dcf93d395acd93a00682f1efcb8ee3d"}, {file = "pydantic-2.10.4.tar.gz", hash = "sha256:82f12e9723da6de4fe2ba888b5971157b3be7ad914267dea8f05f82b28254f06"}, @@ -2359,6 +2447,7 @@ version = "2.27.2" description = "Core functionality for Pydantic validation and serialization" optional = false python-versions = ">=3.8" +groups = ["main"] files = [ {file = "pydantic_core-2.27.2-cp310-cp310-macosx_10_12_x86_64.whl", hash = "sha256:2d367ca20b2f14095a8f4fa1210f5a7b78b8a20009ecced6b12818f455b1e9fa"}, {file = "pydantic_core-2.27.2-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:491a2b73db93fab69731eaee494f320faa4e093dbed776be1a829c2eb222c34c"}, @@ -2471,6 +2560,7 @@ version = "2.4.0" description = "JSON Web Token implementation in Python" optional = false python-versions = ">=3.6" +groups = ["main"] files = [ {file = "PyJWT-2.4.0-py3-none-any.whl", hash = "sha256:72d1d253f32dbd4f5c88eaf1fdc62f3a19f676ccbadb9dbc5d07e951b2b26daf"}, {file = "PyJWT-2.4.0.tar.gz", hash = "sha256:d42908208c699b3b973cbeb01a969ba6a96c821eefb1c5bfe4c390c01d67abba"}, @@ -2491,6 +2581,7 @@ version = "3.0.9" description = "pyparsing module - Classes and methods to define and execute parsing grammars" optional = false python-versions = ">=3.6.8" +groups = ["main"] files = [ {file = "pyparsing-3.0.9-py3-none-any.whl", hash = "sha256:5026bae9a10eeaefb61dab2f09052b9f4307d44aee4eda64b309723d8d206bbc"}, {file = "pyparsing-3.0.9.tar.gz", hash = "sha256:2b020ecf7d21b687f219b71ecad3631f644a47f01403fa1d1036b0c6416d70fb"}, @@ -2505,6 +2596,7 @@ version = "0.18.1" description = "Persistent/Functional/Immutable data structures" optional = false python-versions = ">=3.7" +groups = ["main"] files = [ {file = "pyrsistent-0.18.1-cp310-cp310-macosx_10_9_universal2.whl", hash = "sha256:df46c854f490f81210870e509818b729db4488e1f30f2a1ce1698b2295a878d1"}, {file = "pyrsistent-0.18.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = 
"sha256:5d45866ececf4a5fff8742c25722da6d4c9e180daa7b405dc0a2a2790d668c26"}, @@ -2535,6 +2627,7 @@ version = "7.4.4" description = "pytest: simple powerful testing with Python" optional = false python-versions = ">=3.7" +groups = ["main"] files = [ {file = "pytest-7.4.4-py3-none-any.whl", hash = "sha256:b090cdf5ed60bf4c45261be03239c2c1c22df034fbffe691abe93cd80cea01d8"}, {file = "pytest-7.4.4.tar.gz", hash = "sha256:2cf0005922c6ace4a3e2ec8b4080eb0d9753fdc93107415332f50ce9e7994280"}, @@ -2555,6 +2648,7 @@ version = "0.21.0" description = "Pytest support for asyncio" optional = false python-versions = ">=3.7" +groups = ["main"] files = [ {file = "pytest-asyncio-0.21.0.tar.gz", hash = "sha256:2b38a496aef56f56b0e87557ec313e11e1ab9276fc3863f6a7be0f1d0e415e1b"}, {file = "pytest_asyncio-0.21.0-py3-none-any.whl", hash = "sha256:f2b3366b7cd501a4056858bd39349d5af19742aed2d81660b7998b6341c7eb9c"}, @@ -2573,6 +2667,7 @@ version = "1.0.8" description = "pytest-httpserver is a httpserver for pytest" optional = false python-versions = ">=3.8,<4.0" +groups = ["main"] files = [ {file = "pytest_httpserver-1.0.8-py3-none-any.whl", hash = "sha256:24cd3d9f6a0b927c7bfc400d0b3fda7442721b8267ce29942bf307b190f0bb09"}, {file = "pytest_httpserver-1.0.8.tar.gz", hash = "sha256:e052f69bc8a9073db02484681e8e47004dd1fb3763b0ae833bd899e5895c559a"}, @@ -2587,6 +2682,7 @@ version = "0.6.3" description = "It helps to use fixtures in pytest.mark.parametrize" optional = false python-versions = "*" +groups = ["main"] files = [ {file = "pytest-lazy-fixture-0.6.3.tar.gz", hash = "sha256:0e7d0c7f74ba33e6e80905e9bfd81f9d15ef9a790de97993e34213deb5ad10ac"}, {file = "pytest_lazy_fixture-0.6.3-py3-none-any.whl", hash = "sha256:e0b379f38299ff27a653f03eaa69b08a6fd4484e46fd1c9907d984b9f9daeda6"}, @@ -2601,6 +2697,7 @@ version = "1.1.0" description = "pytest plugin to run your tests in a specific order" optional = false python-versions = ">=3.6" +groups = ["main"] files = [ {file = "pytest-order-1.1.0.tar.gz", hash = 
"sha256:139d25b30826b78eebb42722f747eab14c44b88059d7a71d4f79d14a057269a5"}, {file = "pytest_order-1.1.0-py3-none-any.whl", hash = "sha256:3b3730969c97900fa5cd31ecff80847680ed56b2490954565c14949ba60d9371"}, @@ -2615,6 +2712,7 @@ version = "0.9.3" description = "pytest plugin for repeating tests" optional = false python-versions = ">=3.7" +groups = ["main"] files = [ {file = "pytest_repeat-0.9.3-py3-none-any.whl", hash = "sha256:26ab2df18226af9d5ce441c858f273121e92ff55f5bb311d25755b8d7abdd8ed"}, {file = "pytest_repeat-0.9.3.tar.gz", hash = "sha256:ffd3836dfcd67bb270bec648b330e20be37d2966448c4148c4092d1e8aba8185"}, @@ -2629,6 +2727,7 @@ version = "15.0" description = "pytest plugin to re-run tests to eliminate flaky failures" optional = false python-versions = ">=3.9" +groups = ["main"] files = [ {file = "pytest-rerunfailures-15.0.tar.gz", hash = "sha256:2d9ac7baf59f4c13ac730b47f6fa80e755d1ba0581da45ce30b72fb3542b4474"}, {file = "pytest_rerunfailures-15.0-py3-none-any.whl", hash = "sha256:dd150c4795c229ef44320adc9a0c0532c51b78bb7a6843a8c53556b9a611df1a"}, @@ -2644,6 +2743,7 @@ version = "0.8.1" description = "Pytest plugin which splits the test suite to equally sized sub suites based on test execution time." 
optional = false python-versions = ">=3.7.1,<4.0" +groups = ["main"] files = [ {file = "pytest_split-0.8.1-py3-none-any.whl", hash = "sha256:74b110ea091bd147cc1c5f9665a59506e5cedfa66f96a89fb03e4ab447c2c168"}, {file = "pytest_split-0.8.1.tar.gz", hash = "sha256:2d88bd3dc528689a7a3f58fc12ea165c3aa62e90795e420dfad920afe5612d6d"}, @@ -2658,6 +2758,7 @@ version = "2.1.0" description = "pytest plugin to abort hanging tests" optional = false python-versions = ">=3.6" +groups = ["main"] files = [ {file = "pytest-timeout-2.1.0.tar.gz", hash = "sha256:c07ca07404c612f8abbe22294b23c368e2e5104b521c1790195561f37e1ac3d9"}, {file = "pytest_timeout-2.1.0-py3-none-any.whl", hash = "sha256:f6f50101443ce70ad325ceb4473c4255e9d74e3c7cd0ef827309dfa4c0d975c6"}, @@ -2672,6 +2773,7 @@ version = "3.3.1" description = "pytest xdist plugin for distributed testing, most importantly across multiple CPUs" optional = false python-versions = ">=3.7" +groups = ["main"] files = [ {file = "pytest-xdist-3.3.1.tar.gz", hash = "sha256:d5ee0520eb1b7bcca50a60a518ab7a7707992812c578198f8b44fdfac78e8c93"}, {file = "pytest_xdist-3.3.1-py3-none-any.whl", hash = "sha256:ff9daa7793569e6a68544850fd3927cd257cc03a7ef76c95e86915355e82b5f2"}, @@ -2692,6 +2794,7 @@ version = "2.8.2" description = "Extensions to the standard Python datetime module" optional = false python-versions = "!=3.0.*,!=3.1.*,!=3.2.*,>=2.7" +groups = ["main"] files = [ {file = "python-dateutil-2.8.2.tar.gz", hash = "sha256:0123cacc1627ae19ddf3c27a5de5bd67ee4586fbdd6440d9748f8abb483d3e86"}, {file = "python_dateutil-2.8.2-py2.py3-none-any.whl", hash = "sha256:961d03dc3453ebbc59dbdea9e4e11c5651520a876d0f4db161e8674aae935da9"}, @@ -2706,6 +2809,7 @@ version = "1.0.1" description = "Read key-value pairs from a .env file and set them as environment variables" optional = false python-versions = ">=3.8" +groups = ["main"] files = [ {file = "python-dotenv-1.0.1.tar.gz", hash = "sha256:e324ee90a023d808f1959c46bcbc04446a10ced277783dc6ee09987c37ec10ca"}, 
{file = "python_dotenv-1.0.1-py3-none-any.whl", hash = "sha256:f7b63ef50f1b690dddf550d03497b66d609393b40b564ed0d674909a68ebf16a"}, @@ -2720,6 +2824,7 @@ version = "2024.1" description = "World timezone definitions, modern and historical" optional = false python-versions = "*" +groups = ["main"] files = [ {file = "pytz-2024.1-py2.py3-none-any.whl", hash = "sha256:328171f4e3623139da4983451950b28e95ac706e13f3f2630a879749e7a8b319"}, {file = "pytz-2024.1.tar.gz", hash = "sha256:2a29735ea9c18baf14b448846bde5a48030ed267578472d8955cd0e7443a9812"}, @@ -2731,6 +2836,8 @@ version = "308" description = "Python for Window Extensions" optional = false python-versions = "*" +groups = ["main"] +markers = "sys_platform == \"win32\"" files = [ {file = "pywin32-308-cp310-cp310-win32.whl", hash = "sha256:796ff4426437896550d2981b9c2ac0ffd75238ad9ea2d3bfa67a1abd546d262e"}, {file = "pywin32-308-cp310-cp310-win_amd64.whl", hash = "sha256:4fc888c59b3c0bef905ce7eb7e2106a07712015ea1c8234b703a088d46110e8e"}, @@ -2758,6 +2865,7 @@ version = "6.0.2" description = "YAML parser and emitter for Python" optional = false python-versions = ">=3.8" +groups = ["main"] files = [ {file = "PyYAML-6.0.2-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:0a9a2848a5b7feac301353437eb7d5957887edbf81d56e903999a75a3d743086"}, {file = "PyYAML-6.0.2-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:29717114e51c84ddfba879543fb232a6ed60086602313ca38cce623c1d62cfbf"}, @@ -2820,6 +2928,7 @@ version = "2024.4.28" description = "Alternative regular expression module, to replace re." 
optional = false python-versions = ">=3.8" +groups = ["main"] files = [ {file = "regex-2024.4.28-cp310-cp310-macosx_10_9_universal2.whl", hash = "sha256:cd196d056b40af073d95a2879678585f0b74ad35190fac04ca67954c582c6b61"}, {file = "regex-2024.4.28-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:8bb381f777351bd534462f63e1c6afb10a7caa9fa2a421ae22c26e796fe31b1f"}, @@ -2908,6 +3017,7 @@ version = "2.32.3" description = "Python HTTP for Humans." optional = false python-versions = ">=3.8" +groups = ["main"] files = [ {file = "requests-2.32.3-py3-none-any.whl", hash = "sha256:70761cfe03c773ceb22aa2f671b4757976145175cdfca038c02654d061d6dcc6"}, {file = "requests-2.32.3.tar.gz", hash = "sha256:55365417734eb18255590a9ff9eb97e9e1da868d4ccd6402399eaf68af20a760"}, @@ -2929,6 +3039,7 @@ version = "0.25.3" description = "A utility library for mocking out the `requests` Python library." optional = false python-versions = ">=3.8" +groups = ["main"] files = [ {file = "responses-0.25.3-py3-none-any.whl", hash = "sha256:521efcbc82081ab8daa588e08f7e8a64ce79b91c39f6e62199b19159bea7dbcb"}, {file = "responses-0.25.3.tar.gz", hash = "sha256:617b9247abd9ae28313d57a75880422d55ec63c29d33d629697590a034358dba"}, @@ -2948,6 +3059,7 @@ version = "0.1.4" description = "A pure python RFC3339 validator" optional = false python-versions = ">=2.7, !=3.0.*, !=3.1.*, !=3.2.*, !=3.3.*, !=3.4.*" +groups = ["main"] files = [ {file = "rfc3339_validator-0.1.4-py2.py3-none-any.whl", hash = "sha256:24f6ec1eda14ef823da9e36ec7113124b39c04d50a4d3d3a3c2859577e7791fa"}, {file = "rfc3339_validator-0.1.4.tar.gz", hash = "sha256:138a2abdf93304ad60530167e51d2dfb9549521a836871b88d7f4695d0022f6b"}, @@ -2962,6 +3074,7 @@ version = "0.7.0" description = "An extremely fast Python linter and code formatter, written in Rust." 
optional = false python-versions = ">=3.7" +groups = ["dev"] files = [ {file = "ruff-0.7.0-py3-none-linux_armv6l.whl", hash = "sha256:0cdf20c2b6ff98e37df47b2b0bd3a34aaa155f59a11182c1303cce79be715628"}, {file = "ruff-0.7.0-py3-none-macosx_10_12_x86_64.whl", hash = "sha256:496494d350c7fdeb36ca4ef1c9f21d80d182423718782222c29b3e72b3512737"}, @@ -2989,6 +3102,7 @@ version = "0.10.0" description = "An Amazon S3 Transfer Manager" optional = false python-versions = ">= 3.8" +groups = ["main"] files = [ {file = "s3transfer-0.10.0-py3-none-any.whl", hash = "sha256:3cdb40f5cfa6966e812209d0994f2a4709b561c88e90cf00c2696d2df4e56b2e"}, {file = "s3transfer-0.10.0.tar.gz", hash = "sha256:d0c8bbf672d5eebbe4e57945e23b972d963f07d82f661cabf678a5c88831595b"}, @@ -3006,6 +3120,7 @@ version = "1.0.4" description = "Classes implementing the SARIF 2.1.0 object model." optional = false python-versions = ">= 2.7" +groups = ["main"] files = [ {file = "sarif_om-1.0.4-py3-none-any.whl", hash = "sha256:539ef47a662329b1c8502388ad92457425e95dc0aaaf995fe46f4984c4771911"}, {file = "sarif_om-1.0.4.tar.gz", hash = "sha256:cd5f416b3083e00d402a92e449a7ff67af46f11241073eea0461802a3b5aef98"}, @@ -3021,6 +3136,7 @@ version = "70.0.0" description = "Easily download, build, install, upgrade, and uninstall Python packages" optional = false python-versions = ">=3.8" +groups = ["main"] files = [ {file = "setuptools-70.0.0-py3-none-any.whl", hash = "sha256:54faa7f2e8d2d11bcd2c07bed282eef1046b5c080d1c32add737d7b5817b1ad4"}, {file = "setuptools-70.0.0.tar.gz", hash = "sha256:f211a66637b8fa059bb28183da127d4e86396c991a942b028c6650d4319c3fd0"}, @@ -3036,6 +3152,7 @@ version = "1.16.0" description = "Python 2 and 3 compatibility utilities" optional = false python-versions = ">=2.7, !=3.0.*, !=3.1.*, !=3.2.*" +groups = ["main"] files = [ {file = "six-1.16.0-py2.py3-none-any.whl", hash = "sha256:8abb2f1d86890a2dfb989f9a77cfcfd3e47c2a354b01111771326f8aa26e0254"}, {file = "six-1.16.0.tar.gz", hash = 
"sha256:1e61c37477a1626458e36f7b1d82aa5c9b094fa4802892072e49de9c60c4c926"}, @@ -3047,6 +3164,7 @@ version = "1.3.0" description = "Sniff out which async library your code is running under" optional = false python-versions = ">=3.7" +groups = ["main"] files = [ {file = "sniffio-1.3.0-py3-none-any.whl", hash = "sha256:eecefdce1e5bbfb7ad2eeaabf7c1eeb404d7757c379bd1f7e5cce9d8bf425384"}, {file = "sniffio-1.3.0.tar.gz", hash = "sha256:e60305c5e5d314f5389259b7f22aaa33d8f7dee49763119234af3755c55b9101"}, @@ -3058,6 +3176,7 @@ version = "1.12" description = "Computer algebra system (CAS) in Python" optional = false python-versions = ">=3.8" +groups = ["main"] files = [ {file = "sympy-1.12-py3-none-any.whl", hash = "sha256:c3588cd4295d0c0f603d0f2ae780587e64e2efeedb3521e46b9bb1d08d184fa5"}, {file = "sympy-1.12.tar.gz", hash = "sha256:ebf595c8dac3e0fdc4152c51878b498396ec7f30e7a914d6071e674d49420fb8"}, @@ -3072,6 +3191,7 @@ version = "4.9.0" description = "Python library for throwaway instances of anything that can run in a Docker container" optional = false python-versions = "<4.0,>=3.9" +groups = ["main"] files = [ {file = "testcontainers-4.9.0-py3-none-any.whl", hash = "sha256:c6fee929990972c40bf6b91b7072c94064ff3649b405a14fde0274c8b2479d32"}, {file = "testcontainers-4.9.0.tar.gz", hash = "sha256:2cd6af070109ff68c1ab5389dc89c86c2dc3ab30a21ca734b2cb8f0f80ad479e"}, @@ -3125,6 +3245,7 @@ version = "0.10.2" description = "Python Library for Tom's Obvious, Minimal Language" optional = false python-versions = ">=2.6, !=3.0.*, !=3.1.*, !=3.2.*" +groups = ["main"] files = [ {file = "toml-0.10.2-py2.py3-none-any.whl", hash = "sha256:806143ae5bfb6a3c6e736a764057db0e6a0e05e338b5630894a5f779cabb4f9b"}, {file = "toml-0.10.2.tar.gz", hash = "sha256:b3bda1d108d5dd99f4a20d24d9c348e91c4db7ab1b749200bded2f839ccbe68f"}, @@ -3136,6 +3257,7 @@ version = "1.5.0.20240925" description = "Typing stubs for jwcrypto" optional = false python-versions = ">=3.8" +groups = ["main"] files = [ {file = 
"types-jwcrypto-1.5.0.20240925.tar.gz", hash = "sha256:50e17b790378c96239344476c7bd13b52d0c7eeb6d16c2d53723e48cc6bbf4fe"}, {file = "types_jwcrypto-1.5.0.20240925-py3-none-any.whl", hash = "sha256:2d12a2d528240d326075e896aafec7056b9136bf3207fa6ccf3fcb8fbf9e11a1"}, @@ -3150,6 +3272,7 @@ version = "5.9.5.12" description = "Typing stubs for psutil" optional = false python-versions = "*" +groups = ["main"] files = [ {file = "types-psutil-5.9.5.12.tar.gz", hash = "sha256:61a91679d3fe737250013b624dca09375e7cc3ad77dcc734553746c429c02aca"}, {file = "types_psutil-5.9.5.12-py3-none-any.whl", hash = "sha256:e9a147b8561235c6afcce5aa1adb973fad9ab2c50cf89820697687f53510358f"}, @@ -3161,6 +3284,7 @@ version = "2.9.21.20241019" description = "Typing stubs for psycopg2" optional = false python-versions = ">=3.8" +groups = ["main"] files = [ {file = "types-psycopg2-2.9.21.20241019.tar.gz", hash = "sha256:bca89b988d2ebd19bcd08b177d22a877ea8b841decb10ed130afcf39404612fa"}, {file = "types_psycopg2-2.9.21.20241019-py3-none-any.whl", hash = "sha256:44d091e67732d16a941baae48cd7b53bf91911bc36888652447cf1ef0c1fb3f6"}, @@ -3172,6 +3296,7 @@ version = "0.6.3.3" description = "Typing stubs for pytest-lazy-fixture" optional = false python-versions = "*" +groups = ["main"] files = [ {file = "types-pytest-lazy-fixture-0.6.3.3.tar.gz", hash = "sha256:2ef79d66bcde0e50acdac8dc55074b9ae0d4cfaeabdd638f5522f4cac7c8a2c7"}, {file = "types_pytest_lazy_fixture-0.6.3.3-py3-none-any.whl", hash = "sha256:a56a55649147ff960ff79d4b2c781a4f769351abc1876873f3116d0bd0c96353"}, @@ -3183,6 +3308,7 @@ version = "6.0.12.20240917" description = "Typing stubs for PyYAML" optional = false python-versions = ">=3.8" +groups = ["main"] files = [ {file = "types-PyYAML-6.0.12.20240917.tar.gz", hash = "sha256:d1405a86f9576682234ef83bcb4e6fff7c9305c8b1fbad5e0bcd4f7dbdc9c587"}, {file = "types_PyYAML-6.0.12.20240917-py3-none-any.whl", hash = "sha256:392b267f1c0fe6022952462bf5d6523f31e37f6cea49b14cee7ad634b6301570"}, @@ -3194,6 
+3320,7 @@ version = "2.31.0.0" description = "Typing stubs for requests" optional = false python-versions = "*" +groups = ["main"] files = [ {file = "types-requests-2.31.0.0.tar.gz", hash = "sha256:c1c29d20ab8d84dff468d7febfe8e0cb0b4664543221b386605e14672b44ea25"}, {file = "types_requests-2.31.0.0-py3-none-any.whl", hash = "sha256:7c5cea7940f8e92ec560bbc468f65bf684aa3dcf0554a6f8c4710f5f708dc598"}, @@ -3208,6 +3335,7 @@ version = "0.6.0.post3" description = "Type annotations and code completion for s3transfer" optional = false python-versions = ">=3.7,<4.0" +groups = ["main"] files = [ {file = "types-s3transfer-0.6.0.post3.tar.gz", hash = "sha256:92c3704e5d041202bfb5ddb79d083fd1a02de2c5dfec6a91576823e6b5c93993"}, {file = "types_s3transfer-0.6.0.post3-py3-none-any.whl", hash = "sha256:eedc5117275565b3c83662c0ccc81662a34da5dda8bd502b89d296b6d5cb091d"}, @@ -3219,6 +3347,7 @@ version = "0.10.8.6" description = "Typing stubs for toml" optional = false python-versions = "*" +groups = ["main"] files = [ {file = "types-toml-0.10.8.6.tar.gz", hash = "sha256:6d3ac79e36c9ee593c5d4fb33a50cca0e3adceb6ef5cff8b8e5aef67b4c4aaf2"}, {file = "types_toml-0.10.8.6-py3-none-any.whl", hash = "sha256:de7b2bb1831d6f7a4b554671ffe5875e729753496961b3e9b202745e4955dafa"}, @@ -3230,6 +3359,7 @@ version = "1.26.17" description = "Typing stubs for urllib3" optional = false python-versions = "*" +groups = ["main"] files = [ {file = "types-urllib3-1.26.17.tar.gz", hash = "sha256:73fd274524c3fc7cd8cd9ceb0cb67ed99b45f9cb2831013e46d50c1451044800"}, {file = "types_urllib3-1.26.17-py3-none-any.whl", hash = "sha256:0d027fcd27dbb3cb532453b4d977e05bc1e13aefd70519866af211b3003d895d"}, @@ -3241,6 +3371,7 @@ version = "4.12.2" description = "Backported and Experimental Type Hints for Python 3.8+" optional = false python-versions = ">=3.8" +groups = ["main", "dev"] files = [ {file = "typing_extensions-4.12.2-py3-none-any.whl", hash = "sha256:04e5ca0351e0f3f85c6853954072df659d0d13fac324d0072316b67d7794700d"}, 
{file = "typing_extensions-4.12.2.tar.gz", hash = "sha256:1a7ead55c7e559dd4dee8856e3a88b41225abfe1ce8df57b7c13915fe121ffb8"}, @@ -3252,6 +3383,7 @@ version = "1.26.19" description = "HTTP library with thread-safe connection pooling, file post, and more." optional = false python-versions = "!=3.0.*,!=3.1.*,!=3.2.*,!=3.3.*,!=3.4.*,!=3.5.*,>=2.7" +groups = ["main"] files = [ {file = "urllib3-1.26.19-py2.py3-none-any.whl", hash = "sha256:37a0344459b199fce0e80b0d3569837ec6b6937435c5244e7fd73fa6006830f3"}, {file = "urllib3-1.26.19.tar.gz", hash = "sha256:3e3d753a8618b86d7de333b4223005f68720bcd6a7d2bcb9fbd2229ec7c1e429"}, @@ -3268,6 +3400,7 @@ version = "12.0" description = "An implementation of the WebSocket Protocol (RFC 6455 & 7692)" optional = false python-versions = ">=3.8" +groups = ["main"] files = [ {file = "websockets-12.0-cp310-cp310-macosx_10_9_universal2.whl", hash = "sha256:d554236b2a2006e0ce16315c16eaa0d628dab009c33b63ea03f41c6107958374"}, {file = "websockets-12.0-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:2d225bb6886591b1746b17c0573e29804619c8f755b5598d875bb4235ea639be"}, @@ -3349,6 +3482,7 @@ version = "3.0.6" description = "The comprehensive WSGI web application library." optional = false python-versions = ">=3.8" +groups = ["main"] files = [ {file = "werkzeug-3.0.6-py3-none-any.whl", hash = "sha256:1bc0c2310d2fbb07b1dd1105eba2f7af72f322e1e455f2f93c993bee8c8a5f17"}, {file = "werkzeug-3.0.6.tar.gz", hash = "sha256:a8dd59d4de28ca70471a34cba79bed5f7ef2e036a76b3ab0835474246eb41f8d"}, @@ -3366,6 +3500,7 @@ version = "1.14.1" description = "Module for decorators, wrappers and monkey patching." 
optional = false python-versions = "!=3.0.*,!=3.1.*,!=3.2.*,!=3.3.*,!=3.4.*,>=2.7" +groups = ["main"] files = [ {file = "wrapt-1.14.1-cp27-cp27m-macosx_10_9_x86_64.whl", hash = "sha256:1b376b3f4896e7930f1f772ac4b064ac12598d1c38d04907e696cc4d794b43d3"}, {file = "wrapt-1.14.1-cp27-cp27m-manylinux1_i686.whl", hash = "sha256:903500616422a40a98a5a3c4ff4ed9d0066f3b4c951fa286018ecdf0750194ef"}, @@ -3386,6 +3521,16 @@ files = [ {file = "wrapt-1.14.1-cp310-cp310-musllinux_1_1_x86_64.whl", hash = "sha256:8ad85f7f4e20964db4daadcab70b47ab05c7c1cf2a7c1e51087bfaa83831854c"}, {file = "wrapt-1.14.1-cp310-cp310-win32.whl", hash = "sha256:a9a52172be0b5aae932bef82a79ec0a0ce87288c7d132946d645eba03f0ad8a8"}, {file = "wrapt-1.14.1-cp310-cp310-win_amd64.whl", hash = "sha256:6d323e1554b3d22cfc03cd3243b5bb815a51f5249fdcbb86fda4bf62bab9e164"}, + {file = "wrapt-1.14.1-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:ecee4132c6cd2ce5308e21672015ddfed1ff975ad0ac8d27168ea82e71413f55"}, + {file = "wrapt-1.14.1-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:2020f391008ef874c6d9e208b24f28e31bcb85ccff4f335f15a3251d222b92d9"}, + {file = "wrapt-1.14.1-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:2feecf86e1f7a86517cab34ae6c2f081fd2d0dac860cb0c0ded96d799d20b335"}, + {file = "wrapt-1.14.1-cp311-cp311-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:240b1686f38ae665d1b15475966fe0472f78e71b1b4903c143a842659c8e4cb9"}, + {file = "wrapt-1.14.1-cp311-cp311-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:a9008dad07d71f68487c91e96579c8567c98ca4c3881b9b113bc7b33e9fd78b8"}, + {file = "wrapt-1.14.1-cp311-cp311-musllinux_1_1_aarch64.whl", hash = "sha256:6447e9f3ba72f8e2b985a1da758767698efa72723d5b59accefd716e9e8272bf"}, + {file = "wrapt-1.14.1-cp311-cp311-musllinux_1_1_i686.whl", hash = "sha256:acae32e13a4153809db37405f5eba5bac5fbe2e2ba61ab227926a22901051c0a"}, + {file = 
"wrapt-1.14.1-cp311-cp311-musllinux_1_1_x86_64.whl", hash = "sha256:49ef582b7a1152ae2766557f0550a9fcbf7bbd76f43fbdc94dd3bf07cc7168be"}, + {file = "wrapt-1.14.1-cp311-cp311-win32.whl", hash = "sha256:358fe87cc899c6bb0ddc185bf3dbfa4ba646f05b1b0b9b5a27c2cb92c2cea204"}, + {file = "wrapt-1.14.1-cp311-cp311-win_amd64.whl", hash = "sha256:26046cd03936ae745a502abf44dac702a5e6880b2b01c29aea8ddf3353b68224"}, {file = "wrapt-1.14.1-cp35-cp35m-manylinux1_i686.whl", hash = "sha256:43ca3bbbe97af00f49efb06e352eae40434ca9d915906f77def219b88e85d907"}, {file = "wrapt-1.14.1-cp35-cp35m-manylinux1_x86_64.whl", hash = "sha256:6b1a564e6cb69922c7fe3a678b9f9a3c54e72b469875aa8018f18b4d1dd1adf3"}, {file = "wrapt-1.14.1-cp35-cp35m-manylinux2010_i686.whl", hash = "sha256:00b6d4ea20a906c0ca56d84f93065b398ab74b927a7a3dbd470f6fc503f95dc3"}, @@ -3439,6 +3584,7 @@ version = "0.13.0" description = "Makes working with XML feel like you are working with JSON" optional = false python-versions = ">=3.4" +groups = ["main"] files = [ {file = "xmltodict-0.13.0-py2.py3-none-any.whl", hash = "sha256:aa89e8fd76320154a40d19a0df04a4695fb9dc5ba977cbb68ab3e4eb225e7852"}, {file = "xmltodict-0.13.0.tar.gz", hash = "sha256:341595a488e3e01a85a9d8911d8912fd922ede5fecc4dce437eb4b6c8d037e56"}, @@ -3450,6 +3596,7 @@ version = "1.17.2" description = "Yet another URL library" optional = false python-versions = ">=3.9" +groups = ["main"] files = [ {file = "yarl-1.17.2-cp310-cp310-macosx_10_9_universal2.whl", hash = "sha256:93771146ef048b34201bfa382c2bf74c524980870bb278e6df515efaf93699ff"}, {file = "yarl-1.17.2-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:8281db240a1616af2f9c5f71d355057e73a1409c4648c8949901396dc0a3c151"}, @@ -3546,6 +3693,7 @@ version = "0.23.0" description = "Zstandard bindings for Python" optional = false python-versions = ">=3.8" +groups = ["main"] files = [ {file = "zstandard-0.23.0-cp310-cp310-macosx_10_9_x86_64.whl", hash = 
"sha256:bf0a05b6059c0528477fba9054d09179beb63744355cab9f38059548fedd46a9"}, {file = "zstandard-0.23.0-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:fc9ca1c9718cb3b06634c7c8dec57d24e9438b2aa9a0f02b8bb36bf478538880"}, @@ -3653,6 +3801,6 @@ cffi = {version = ">=1.11", markers = "platform_python_implementation == \"PyPy\ cffi = ["cffi (>=1.11)"] [metadata] -lock-version = "2.0" +lock-version = "2.1" python-versions = "^3.11" -content-hash = "e6904aca09abc6c805604b21a5702a97e0056406f9ec7469b091d35ee10a6b16" +content-hash = "4dc3165fe22c0e0f7a030ea0f8a680ae2ff74561d8658c393abbe9112caaf5d7" diff --git a/proxy/src/auth/backend/mod.rs b/proxy/src/auth/backend/mod.rs index de48be2952..d17d91a56d 100644 --- a/proxy/src/auth/backend/mod.rs +++ b/proxy/src/auth/backend/mod.rs @@ -12,6 +12,7 @@ pub(crate) use console_redirect::ConsoleRedirectError; use ipnet::{Ipv4Net, Ipv6Net}; use local::LocalBackend; use postgres_client::config::AuthKeys; +use serde::{Deserialize, Serialize}; use tokio::io::{AsyncRead, AsyncWrite}; use tracing::{debug, info, warn}; @@ -133,7 +134,7 @@ pub(crate) struct ComputeUserInfoNoEndpoint { pub(crate) options: NeonOptions, } -#[derive(Debug, Clone, Default)] +#[derive(Debug, Clone, Default, Serialize, Deserialize)] pub(crate) struct ComputeUserInfo { pub(crate) endpoint: EndpointId, pub(crate) user: RoleName, diff --git a/proxy/src/bin/local_proxy.rs b/proxy/src/bin/local_proxy.rs index 644f670f88..ee8b3d4ef5 100644 --- a/proxy/src/bin/local_proxy.rs +++ b/proxy/src/bin/local_proxy.rs @@ -7,12 +7,11 @@ use std::time::Duration; use anyhow::{bail, ensure, Context}; use camino::{Utf8Path, Utf8PathBuf}; use compute_api::spec::LocalProxySpec; -use dashmap::DashMap; use futures::future::Either; use proxy::auth::backend::jwt::JwkCache; use proxy::auth::backend::local::{LocalBackend, JWKS_ROLE_MAP}; use proxy::auth::{self}; -use proxy::cancellation::CancellationHandlerMain; +use proxy::cancellation::CancellationHandler; use proxy::config::{ self, 
AuthenticationConfig, ComputeConfig, HttpConfig, ProxyConfig, RetryConfig, }; @@ -211,12 +210,7 @@ async fn main() -> anyhow::Result<()> { auth_backend, http_listener, shutdown.clone(), - Arc::new(CancellationHandlerMain::new( - &config.connect_to_compute, - Arc::new(DashMap::new()), - None, - proxy::metrics::CancellationSource::Local, - )), + Arc::new(CancellationHandler::new(&config.connect_to_compute, None)), endpoint_rate_limiter, ); diff --git a/proxy/src/bin/proxy.rs b/proxy/src/bin/proxy.rs index 70b50436bf..e1affe8391 100644 --- a/proxy/src/bin/proxy.rs +++ b/proxy/src/bin/proxy.rs @@ -7,7 +7,7 @@ use anyhow::bail; use futures::future::Either; use proxy::auth::backend::jwt::JwkCache; use proxy::auth::backend::{AuthRateLimiter, ConsoleRedirectBackend, MaybeOwned}; -use proxy::cancellation::{CancelMap, CancellationHandler}; +use proxy::cancellation::{handle_cancel_messages, CancellationHandler}; use proxy::config::{ self, remote_storage_from_toml, AuthenticationConfig, CacheOptions, ComputeConfig, HttpConfig, ProjectInfoCacheOptions, ProxyConfig, ProxyProtocolV2, @@ -18,8 +18,8 @@ use proxy::metrics::Metrics; use proxy::rate_limiter::{ EndpointRateLimiter, LeakyBucketConfig, RateBucketInfo, WakeComputeRateLimiter, }; -use proxy::redis::cancellation_publisher::RedisPublisherClient; use proxy::redis::connection_with_credentials_provider::ConnectionWithCredentialsProvider; +use proxy::redis::kv_ops::RedisKVClient; use proxy::redis::{elasticache, notifications}; use proxy::scram::threadpool::ThreadPool; use proxy::serverless::cancel_set::CancelSet; @@ -28,7 +28,6 @@ use proxy::tls::client_config::compute_client_config_with_root_certs; use proxy::{auth, control_plane, http, serverless, usage_metrics}; use remote_storage::RemoteStorageConfig; use tokio::net::TcpListener; -use tokio::sync::Mutex; use tokio::task::JoinSet; use tokio_util::sync::CancellationToken; use tracing::{info, warn, Instrument}; @@ -158,8 +157,11 @@ struct ProxyCliArgs { #[clap(long, 
default_value_t = 64)] auth_rate_limit_ip_subnet: u8, /// Redis rate limiter max number of requests per second. - #[clap(long, default_values_t = RateBucketInfo::DEFAULT_SET)] + #[clap(long, default_values_t = RateBucketInfo::DEFAULT_REDIS_SET)] redis_rps_limit: Vec, + /// Cancellation channel size (max queue size for redis kv client) + #[clap(long, default_value = "1024")] + cancellation_ch_size: usize, /// cache for `allowed_ips` (use `size=0` to disable) #[clap(long, default_value = config::CacheOptions::CACHE_DEFAULT_OPTIONS)] allowed_ips_cache: String, @@ -382,27 +384,19 @@ async fn main() -> anyhow::Result<()> { let cancellation_token = CancellationToken::new(); - let cancel_map = CancelMap::default(); - let redis_rps_limit = Vec::leak(args.redis_rps_limit.clone()); RateBucketInfo::validate(redis_rps_limit)?; - let redis_publisher = match ®ional_redis_client { - Some(redis_publisher) => Some(Arc::new(Mutex::new(RedisPublisherClient::new( - redis_publisher.clone(), - args.region.clone(), - redis_rps_limit, - )?))), - None => None, - }; + let redis_kv_client = regional_redis_client + .as_ref() + .map(|redis_publisher| RedisKVClient::new(redis_publisher.clone(), redis_rps_limit)); - let cancellation_handler = Arc::new(CancellationHandler::< - Option>>, - >::new( + // channel size should be higher than redis client limit to avoid blocking + let cancel_ch_size = args.cancellation_ch_size; + let (tx_cancel, rx_cancel) = tokio::sync::mpsc::channel(cancel_ch_size); + let cancellation_handler = Arc::new(CancellationHandler::new( &config.connect_to_compute, - cancel_map.clone(), - redis_publisher, - proxy::metrics::CancellationSource::FromClient, + Some(tx_cancel), )); // bit of a hack - find the min rps and max rps supported and turn it into @@ -495,25 +489,29 @@ async fn main() -> anyhow::Result<()> { let cache = api.caches.project_info.clone(); if let Some(client) = client1 { maintenance_tasks.spawn(notifications::task_main( - config, client, cache.clone(), - 
cancel_map.clone(), args.region.clone(), )); } if let Some(client) = client2 { maintenance_tasks.spawn(notifications::task_main( - config, client, cache.clone(), - cancel_map.clone(), args.region.clone(), )); } maintenance_tasks.spawn(async move { cache.clone().gc_worker().await }); } } + + if let Some(mut redis_kv_client) = redis_kv_client { + maintenance_tasks.spawn(async move { + redis_kv_client.try_connect().await?; + handle_cancel_messages(&mut redis_kv_client, rx_cancel).await + }); + } + if let Some(regional_redis_client) = regional_redis_client { let cache = api.caches.endpoints_cache.clone(); let con = regional_redis_client; diff --git a/proxy/src/cancellation.rs b/proxy/src/cancellation.rs index a96c43f2ce..34f708a36b 100644 --- a/proxy/src/cancellation.rs +++ b/proxy/src/cancellation.rs @@ -1,48 +1,124 @@ use std::net::{IpAddr, SocketAddr}; use std::sync::Arc; -use dashmap::DashMap; use ipnet::{IpNet, Ipv4Net, Ipv6Net}; use postgres_client::tls::MakeTlsConnect; use postgres_client::CancelToken; use pq_proto::CancelKeyData; +use serde::{Deserialize, Serialize}; use thiserror::Error; use tokio::net::TcpStream; -use tokio::sync::Mutex; +use tokio::sync::mpsc; use tracing::{debug, info}; -use uuid::Uuid; use crate::auth::backend::{BackendIpAllowlist, ComputeUserInfo}; -use crate::auth::{check_peer_addr_is_in_list, AuthError, IpPattern}; +use crate::auth::{check_peer_addr_is_in_list, AuthError}; use crate::config::ComputeConfig; use crate::context::RequestContext; use crate::error::ReportableError; use crate::ext::LockExt; -use crate::metrics::{CancellationRequest, CancellationSource, Metrics}; +use crate::metrics::CancelChannelSizeGuard; +use crate::metrics::{CancellationRequest, Metrics, RedisMsgKind}; use crate::rate_limiter::LeakyBucketRateLimiter; -use crate::redis::cancellation_publisher::{ - CancellationPublisher, CancellationPublisherMut, RedisPublisherClient, -}; +use crate::redis::keys::KeyPrefix; +use crate::redis::kv_ops::RedisKVClient; use 
crate::tls::postgres_rustls::MakeRustlsConnect; - -pub type CancelMap = Arc>>; -pub type CancellationHandlerMain = CancellationHandler>>>; -pub(crate) type CancellationHandlerMainInternal = Option>>; +use std::convert::Infallible; +use tokio::sync::oneshot; type IpSubnetKey = IpNet; +const CANCEL_KEY_TTL: i64 = 1_209_600; // 2 weeks cancellation key expire time +const REDIS_SEND_TIMEOUT: std::time::Duration = std::time::Duration::from_millis(10); + +// Message types for sending through mpsc channel +pub enum CancelKeyOp { + StoreCancelKey { + key: String, + field: String, + value: String, + resp_tx: Option>>, + _guard: CancelChannelSizeGuard<'static>, + expire: i64, // TTL for key + }, + GetCancelData { + key: String, + resp_tx: oneshot::Sender>>, + _guard: CancelChannelSizeGuard<'static>, + }, + RemoveCancelKey { + key: String, + field: String, + resp_tx: Option>>, + _guard: CancelChannelSizeGuard<'static>, + }, +} + +// Running as a separate task to accept messages through the rx channel +// In case of problems with RTT: switch to recv_many() + redis pipeline +pub async fn handle_cancel_messages( + client: &mut RedisKVClient, + mut rx: mpsc::Receiver, +) -> anyhow::Result { + loop { + if let Some(msg) = rx.recv().await { + match msg { + CancelKeyOp::StoreCancelKey { + key, + field, + value, + resp_tx, + _guard, + expire: _, + } => { + if let Some(resp_tx) = resp_tx { + resp_tx + .send(client.hset(key, field, value).await) + .inspect_err(|e| { + tracing::debug!("failed to send StoreCancelKey response: {:?}", e); + }) + .ok(); + } else { + drop(client.hset(key, field, value).await); + } + } + CancelKeyOp::GetCancelData { + key, + resp_tx, + _guard, + } => { + drop(resp_tx.send(client.hget_all(key).await)); + } + CancelKeyOp::RemoveCancelKey { + key, + field, + resp_tx, + _guard, + } => { + if let Some(resp_tx) = resp_tx { + resp_tx + .send(client.hdel(key, field).await) + .inspect_err(|e| { + tracing::debug!("failed to send StoreCancelKey response: {:?}", e); + }) 
+ .ok(); + } else { + drop(client.hdel(key, field).await); + } + } + } + } + } +} + /// Enables serving `CancelRequest`s. /// /// If `CancellationPublisher` is available, cancel request will be used to publish the cancellation key to other proxy instances. -pub struct CancellationHandler

{ +pub struct CancellationHandler { compute_config: &'static ComputeConfig, - map: CancelMap, - client: P, - /// This field used for the monitoring purposes. - /// Represents the source of the cancellation request. - from: CancellationSource, // rate limiter of cancellation requests limiter: Arc>>, + tx: Option>, // send messages to the redis KV client task } #[derive(Debug, Error)] @@ -61,6 +137,12 @@ pub(crate) enum CancelError { #[error("Authentication backend error")] AuthError(#[from] AuthError), + + #[error("key not found")] + NotFound, + + #[error("proxy service error")] + InternalError, } impl ReportableError for CancelError { @@ -73,274 +155,191 @@ impl ReportableError for CancelError { CancelError::Postgres(_) => crate::error::ErrorKind::Compute, CancelError::RateLimit => crate::error::ErrorKind::RateLimit, CancelError::IpNotAllowed => crate::error::ErrorKind::User, + CancelError::NotFound => crate::error::ErrorKind::User, CancelError::AuthError(_) => crate::error::ErrorKind::ControlPlane, + CancelError::InternalError => crate::error::ErrorKind::Service, } } } -impl CancellationHandler

{ - /// Run async action within an ephemeral session identified by [`CancelKeyData`]. - pub(crate) fn get_session(self: Arc) -> Session

{ +impl CancellationHandler { + pub fn new( + compute_config: &'static ComputeConfig, + tx: Option>, + ) -> Self { + Self { + compute_config, + tx, + limiter: Arc::new(std::sync::Mutex::new( + LeakyBucketRateLimiter::::new_with_shards( + LeakyBucketRateLimiter::::DEFAULT, + 64, + ), + )), + } + } + + pub(crate) fn get_key(self: &Arc) -> Session { // we intentionally generate a random "backend pid" and "secret key" here. // we use the corresponding u64 as an identifier for the // actual endpoint+pid+secret for postgres/pgbouncer. // // if we forwarded the backend_pid from postgres to the client, there would be a lot // of overlap between our computes as most pids are small (~100). - let key = loop { - let key = rand::random(); - // Random key collisions are unlikely to happen here, but they're still possible, - // which is why we have to take care not to rewrite an existing key. - match self.map.entry(key) { - dashmap::mapref::entry::Entry::Occupied(_) => continue, - dashmap::mapref::entry::Entry::Vacant(e) => { - e.insert(None); - } - } - break key; - }; + let key: CancelKeyData = rand::random(); + + let prefix_key: KeyPrefix = KeyPrefix::Cancel(key); + let redis_key = prefix_key.build_redis_key(); debug!("registered new query cancellation key {key}"); Session { key, - cancellation_handler: self, + redis_key, + cancellation_handler: Arc::clone(self), } } - /// Cancelling only in notification, will be removed - pub(crate) async fn cancel_session( + async fn get_cancel_key( &self, key: CancelKeyData, - session_id: Uuid, - peer_addr: IpAddr, - check_allowed: bool, - ) -> Result<(), CancelError> { - // TODO: check for unspecified address is only for backward compatibility, should be removed - if !peer_addr.is_unspecified() { - let subnet_key = match peer_addr { - IpAddr::V4(ip) => IpNet::V4(Ipv4Net::new_assert(ip, 24).trunc()), // use defaut mask here - IpAddr::V6(ip) => IpNet::V6(Ipv6Net::new_assert(ip, 64).trunc()), - }; - if 
!self.limiter.lock_propagate_poison().check(subnet_key, 1) { - // log only the subnet part of the IP address to know which subnet is rate limited - tracing::warn!("Rate limit exceeded. Skipping cancellation message, {subnet_key}"); - Metrics::get() - .proxy - .cancellation_requests_total - .inc(CancellationRequest { - source: self.from, - kind: crate::metrics::CancellationOutcome::RateLimitExceeded, - }); - return Err(CancelError::RateLimit); - } - } + ) -> Result, CancelError> { + let prefix_key: KeyPrefix = KeyPrefix::Cancel(key); + let redis_key = prefix_key.build_redis_key(); - // NB: we should immediately release the lock after cloning the token. - let cancel_state = self.map.get(&key).and_then(|x| x.clone()); - let Some(cancel_closure) = cancel_state else { - tracing::warn!("query cancellation key not found: {key}"); - Metrics::get() + let (resp_tx, resp_rx) = tokio::sync::oneshot::channel(); + let op = CancelKeyOp::GetCancelData { + key: redis_key, + resp_tx, + _guard: Metrics::get() .proxy - .cancellation_requests_total - .inc(CancellationRequest { - source: self.from, - kind: crate::metrics::CancellationOutcome::NotFound, - }); - - if session_id == Uuid::nil() { - // was already published, do not publish it again - return Ok(()); - } - - match self.client.try_publish(key, session_id, peer_addr).await { - Ok(()) => {} // do nothing - Err(e) => { - // log it here since cancel_session could be spawned in a task - tracing::error!("failed to publish cancellation key: {key}, error: {e}"); - return Err(CancelError::IO(std::io::Error::new( - std::io::ErrorKind::Other, - e.to_string(), - ))); - } - } - return Ok(()); + .cancel_channel_size + .guard(RedisMsgKind::HGetAll), }; - if check_allowed - && !check_peer_addr_is_in_list(&peer_addr, cancel_closure.ip_allowlist.as_slice()) - { - // log it here since cancel_session could be spawned in a task - tracing::warn!("IP is not allowed to cancel the query: {key}"); - return Err(CancelError::IpNotAllowed); - } + let 
Some(tx) = &self.tx else { + tracing::warn!("cancellation handler is not available"); + return Err(CancelError::InternalError); + }; - Metrics::get() - .proxy - .cancellation_requests_total - .inc(CancellationRequest { - source: self.from, - kind: crate::metrics::CancellationOutcome::Found, - }); - info!( - "cancelling query per user's request using key {key}, hostname {}, address: {}", - cancel_closure.hostname, cancel_closure.socket_addr - ); - cancel_closure.try_cancel_query(self.compute_config).await + tx.send_timeout(op, REDIS_SEND_TIMEOUT) + .await + .map_err(|e| { + tracing::warn!("failed to send GetCancelData for {key}: {e}"); + }) + .map_err(|()| CancelError::InternalError)?; + + let result = resp_rx.await.map_err(|e| { + tracing::warn!("failed to receive GetCancelData response: {e}"); + CancelError::InternalError + })?; + + let cancel_state_str: Option = match result { + Ok(mut state) => { + if state.len() == 1 { + Some(state.remove(0).1) + } else { + tracing::warn!("unexpected number of entries in cancel state: {state:?}"); + return Err(CancelError::InternalError); + } + } + Err(e) => { + tracing::warn!("failed to receive cancel state from redis: {e}"); + return Err(CancelError::InternalError); + } + }; + + let cancel_state: Option = match cancel_state_str { + Some(state) => { + let cancel_closure: CancelClosure = serde_json::from_str(&state).map_err(|e| { + tracing::warn!("failed to deserialize cancel state: {e}"); + CancelError::InternalError + })?; + Some(cancel_closure) + } + None => None, + }; + Ok(cancel_state) } - /// Try to cancel a running query for the corresponding connection. /// If the cancellation key is not found, it will be published to Redis. /// check_allowed - if true, check if the IP is allowed to cancel the query. /// Will fetch IP allowlist internally. 
/// /// return Result primarily for tests - pub(crate) async fn cancel_session_auth( + pub(crate) async fn cancel_session( &self, key: CancelKeyData, ctx: RequestContext, check_allowed: bool, auth_backend: &T, ) -> Result<(), CancelError> { - // TODO: check for unspecified address is only for backward compatibility, should be removed - if !ctx.peer_addr().is_unspecified() { - let subnet_key = match ctx.peer_addr() { - IpAddr::V4(ip) => IpNet::V4(Ipv4Net::new_assert(ip, 24).trunc()), // use defaut mask here - IpAddr::V6(ip) => IpNet::V6(Ipv6Net::new_assert(ip, 64).trunc()), - }; - if !self.limiter.lock_propagate_poison().check(subnet_key, 1) { - // log only the subnet part of the IP address to know which subnet is rate limited - tracing::warn!("Rate limit exceeded. Skipping cancellation message, {subnet_key}"); - Metrics::get() - .proxy - .cancellation_requests_total - .inc(CancellationRequest { - source: self.from, - kind: crate::metrics::CancellationOutcome::RateLimitExceeded, - }); - return Err(CancelError::RateLimit); - } + let subnet_key = match ctx.peer_addr() { + IpAddr::V4(ip) => IpNet::V4(Ipv4Net::new_assert(ip, 24).trunc()), // use defaut mask here + IpAddr::V6(ip) => IpNet::V6(Ipv6Net::new_assert(ip, 64).trunc()), + }; + if !self.limiter.lock_propagate_poison().check(subnet_key, 1) { + // log only the subnet part of the IP address to know which subnet is rate limited + tracing::warn!("Rate limit exceeded. Skipping cancellation message, {subnet_key}"); + Metrics::get() + .proxy + .cancellation_requests_total + .inc(CancellationRequest { + kind: crate::metrics::CancellationOutcome::RateLimitExceeded, + }); + return Err(CancelError::RateLimit); } - // NB: we should immediately release the lock after cloning the token. 
- let cancel_state = self.map.get(&key).and_then(|x| x.clone()); + let cancel_state = self.get_cancel_key(key).await.map_err(|e| { + tracing::warn!("failed to receive RedisOp response: {e}"); + CancelError::InternalError + })?; + let Some(cancel_closure) = cancel_state else { tracing::warn!("query cancellation key not found: {key}"); Metrics::get() .proxy .cancellation_requests_total .inc(CancellationRequest { - source: self.from, kind: crate::metrics::CancellationOutcome::NotFound, }); - - if ctx.session_id() == Uuid::nil() { - // was already published, do not publish it again - return Ok(()); - } - - match self - .client - .try_publish(key, ctx.session_id(), ctx.peer_addr()) - .await - { - Ok(()) => {} // do nothing - Err(e) => { - // log it here since cancel_session could be spawned in a task - tracing::error!("failed to publish cancellation key: {key}, error: {e}"); - return Err(CancelError::IO(std::io::Error::new( - std::io::ErrorKind::Other, - e.to_string(), - ))); - } - } - return Ok(()); + return Err(CancelError::NotFound); }; - let ip_allowlist = auth_backend - .get_allowed_ips(&ctx, &cancel_closure.user_info) - .await - .map_err(CancelError::AuthError)?; + if check_allowed { + let ip_allowlist = auth_backend + .get_allowed_ips(&ctx, &cancel_closure.user_info) + .await + .map_err(CancelError::AuthError)?; - if check_allowed && !check_peer_addr_is_in_list(&ctx.peer_addr(), &ip_allowlist) { - // log it here since cancel_session could be spawned in a task - tracing::warn!("IP is not allowed to cancel the query: {key}"); - return Err(CancelError::IpNotAllowed); + if !check_peer_addr_is_in_list(&ctx.peer_addr(), &ip_allowlist) { + // log it here since cancel_session could be spawned in a task + tracing::warn!( + "IP is not allowed to cancel the query: {key}, address: {}", + ctx.peer_addr() + ); + return Err(CancelError::IpNotAllowed); + } } Metrics::get() .proxy .cancellation_requests_total .inc(CancellationRequest { - source: self.from, kind: 
crate::metrics::CancellationOutcome::Found, }); info!("cancelling query per user's request using key {key}"); cancel_closure.try_cancel_query(self.compute_config).await } - - #[cfg(test)] - fn contains(&self, session: &Session

) -> bool { - self.map.contains_key(&session.key) - } - - #[cfg(test)] - fn is_empty(&self) -> bool { - self.map.is_empty() - } -} - -impl CancellationHandler<()> { - pub fn new( - compute_config: &'static ComputeConfig, - map: CancelMap, - from: CancellationSource, - ) -> Self { - Self { - compute_config, - map, - client: (), - from, - limiter: Arc::new(std::sync::Mutex::new( - LeakyBucketRateLimiter::::new_with_shards( - LeakyBucketRateLimiter::::DEFAULT, - 64, - ), - )), - } - } -} - -impl CancellationHandler>>> { - pub fn new( - compute_config: &'static ComputeConfig, - map: CancelMap, - client: Option>>, - from: CancellationSource, - ) -> Self { - Self { - compute_config, - map, - client, - from, - limiter: Arc::new(std::sync::Mutex::new( - LeakyBucketRateLimiter::::new_with_shards( - LeakyBucketRateLimiter::::DEFAULT, - 64, - ), - )), - } - } } /// This should've been a [`std::future::Future`], but /// it's impossible to name a type of an unboxed future /// (we'd need something like `#![feature(type_alias_impl_trait)]`). -#[derive(Clone)] +#[derive(Clone, Serialize, Deserialize)] pub struct CancelClosure { socket_addr: SocketAddr, cancel_token: CancelToken, - ip_allowlist: Vec, hostname: String, // for pg_sni router user_info: ComputeUserInfo, } @@ -349,14 +348,12 @@ impl CancelClosure { pub(crate) fn new( socket_addr: SocketAddr, cancel_token: CancelToken, - ip_allowlist: Vec, hostname: String, user_info: ComputeUserInfo, ) -> Self { Self { socket_addr, cancel_token, - ip_allowlist, hostname, user_info, } @@ -385,99 +382,75 @@ impl CancelClosure { debug!("query was cancelled"); Ok(()) } - - /// Obsolete (will be removed after moving CancelMap to Redis), only for notifications - pub(crate) fn set_ip_allowlist(&mut self, ip_allowlist: Vec) { - self.ip_allowlist = ip_allowlist; - } } /// Helper for registering query cancellation tokens. -pub(crate) struct Session

{ +pub(crate) struct Session { /// The user-facing key identifying this session. key: CancelKeyData, - /// The [`CancelMap`] this session belongs to. - cancellation_handler: Arc>, + redis_key: String, + cancellation_handler: Arc, } -impl

Session

{ - /// Store the cancel token for the given session. - /// This enables query cancellation in `crate::proxy::prepare_client_connection`. - pub(crate) fn enable_query_cancellation(&self, cancel_closure: CancelClosure) -> CancelKeyData { - debug!("enabling query cancellation for this session"); - self.cancellation_handler - .map - .insert(self.key, Some(cancel_closure)); - - self.key +impl Session { + pub(crate) fn key(&self) -> &CancelKeyData { + &self.key } -} -impl

Drop for Session

{ - fn drop(&mut self) { - self.cancellation_handler.map.remove(&self.key); - debug!("dropped query cancellation key {}", &self.key); - } -} - -#[cfg(test)] -#[expect(clippy::unwrap_used)] -mod tests { - use std::time::Duration; - - use super::*; - use crate::config::RetryConfig; - use crate::tls::client_config::compute_client_config_with_certs; - - fn config() -> ComputeConfig { - let retry = RetryConfig { - base_delay: Duration::from_secs(1), - max_retries: 5, - backoff_factor: 2.0, + // Send the store key op to the cancellation handler + pub(crate) async fn write_cancel_key( + &self, + cancel_closure: CancelClosure, + ) -> Result<(), CancelError> { + let Some(tx) = &self.cancellation_handler.tx else { + tracing::warn!("cancellation handler is not available"); + return Err(CancelError::InternalError); }; - ComputeConfig { - retry, - tls: Arc::new(compute_client_config_with_certs(std::iter::empty())), - timeout: Duration::from_secs(2), - } - } + let closure_json = serde_json::to_string(&cancel_closure).map_err(|e| { + tracing::warn!("failed to serialize cancel closure: {e}"); + CancelError::InternalError + })?; - #[tokio::test] - async fn check_session_drop() -> anyhow::Result<()> { - let cancellation_handler = Arc::new(CancellationHandler::<()>::new( - Box::leak(Box::new(config())), - CancelMap::default(), - CancellationSource::FromRedis, - )); - - let session = cancellation_handler.clone().get_session(); - assert!(cancellation_handler.contains(&session)); - drop(session); - // Check that the session has been dropped. 
- assert!(cancellation_handler.is_empty()); + let op = CancelKeyOp::StoreCancelKey { + key: self.redis_key.clone(), + field: "data".to_string(), + value: closure_json, + resp_tx: None, + _guard: Metrics::get() + .proxy + .cancel_channel_size + .guard(RedisMsgKind::HSet), + expire: CANCEL_KEY_TTL, + }; + let _ = tx.send_timeout(op, REDIS_SEND_TIMEOUT).await.map_err(|e| { + let key = self.key; + tracing::warn!("failed to send StoreCancelKey for {key}: {e}"); + }); Ok(()) } - #[tokio::test] - async fn cancel_session_noop_regression() { - let handler = CancellationHandler::<()>::new( - Box::leak(Box::new(config())), - CancelMap::default(), - CancellationSource::Local, - ); - handler - .cancel_session( - CancelKeyData { - backend_pid: 0, - cancel_key: 0, - }, - Uuid::new_v4(), - "127.0.0.1".parse().unwrap(), - true, - ) - .await - .unwrap(); + pub(crate) async fn remove_cancel_key(&self) -> Result<(), CancelError> { + let Some(tx) = &self.cancellation_handler.tx else { + tracing::warn!("cancellation handler is not available"); + return Err(CancelError::InternalError); + }; + + let op = CancelKeyOp::RemoveCancelKey { + key: self.redis_key.clone(), + field: "data".to_string(), + resp_tx: None, + _guard: Metrics::get() + .proxy + .cancel_channel_size + .guard(RedisMsgKind::HSet), + }; + + let _ = tx.send_timeout(op, REDIS_SEND_TIMEOUT).await.map_err(|e| { + let key = self.key; + tracing::warn!("failed to send RemoveCancelKey for {key}: {e}"); + }); + Ok(()) } } diff --git a/proxy/src/compute.rs b/proxy/src/compute.rs index aff796bbab..d71465765f 100644 --- a/proxy/src/compute.rs +++ b/proxy/src/compute.rs @@ -296,7 +296,6 @@ impl ConnCfg { process_id, secret_key, }, - vec![], // TODO: deprecated, will be removed host.to_string(), user_info, ); diff --git a/proxy/src/console_redirect_proxy.rs b/proxy/src/console_redirect_proxy.rs index 0c6755063f..78bfb6deac 100644 --- a/proxy/src/console_redirect_proxy.rs +++ b/proxy/src/console_redirect_proxy.rs @@ -6,7 +6,7 @@ use 
tokio_util::sync::CancellationToken; use tracing::{debug, error, info, Instrument}; use crate::auth::backend::ConsoleRedirectBackend; -use crate::cancellation::{CancellationHandlerMain, CancellationHandlerMainInternal}; +use crate::cancellation::CancellationHandler; use crate::config::{ProxyConfig, ProxyProtocolV2}; use crate::context::RequestContext; use crate::error::ReportableError; @@ -24,7 +24,7 @@ pub async fn task_main( backend: &'static ConsoleRedirectBackend, listener: tokio::net::TcpListener, cancellation_token: CancellationToken, - cancellation_handler: Arc, + cancellation_handler: Arc, ) -> anyhow::Result<()> { scopeguard::defer! { info!("proxy has shut down"); @@ -140,15 +140,16 @@ pub async fn task_main( Ok(()) } +#[allow(clippy::too_many_arguments)] pub(crate) async fn handle_client( config: &'static ProxyConfig, backend: &'static ConsoleRedirectBackend, ctx: &RequestContext, - cancellation_handler: Arc, + cancellation_handler: Arc, stream: S, conn_gauge: NumClientConnectionsGuard<'static>, cancellations: tokio_util::task::task_tracker::TaskTracker, -) -> Result>, ClientRequestError> { +) -> Result>, ClientRequestError> { debug!( protocol = %ctx.protocol(), "handling interactive connection from client" @@ -171,13 +172,13 @@ pub(crate) async fn handle_client( HandshakeData::Cancel(cancel_key_data) => { // spawn a task to cancel the session, but don't wait for it cancellations.spawn({ - let cancellation_handler_clone = Arc::clone(&cancellation_handler); + let cancellation_handler_clone = Arc::clone(&cancellation_handler); let ctx = ctx.clone(); let cancel_span = tracing::span!(parent: None, tracing::Level::INFO, "cancel_session", session_id = ?ctx.session_id()); cancel_span.follows_from(tracing::Span::current()); async move { cancellation_handler_clone - .cancel_session_auth( + .cancel_session( cancel_key_data, ctx, config.authentication_config.ip_allowlist_check_enabled, @@ -195,7 +196,7 @@ pub(crate) async fn handle_client( 
ctx.set_db_options(params.clone()); - let (node_info, user_info, ip_allowlist) = match backend + let (node_info, user_info, _ip_allowlist) = match backend .authenticate(ctx, &config.authentication_config, &mut stream) .await { @@ -220,10 +221,14 @@ pub(crate) async fn handle_client( .or_else(|e| stream.throw_error(e)) .await?; - node.cancel_closure - .set_ip_allowlist(ip_allowlist.unwrap_or_default()); - let session = cancellation_handler.get_session(); - prepare_client_connection(&node, &session, &mut stream).await?; + let cancellation_handler_clone = Arc::clone(&cancellation_handler); + let session = cancellation_handler_clone.get_key(); + + session + .write_cancel_key(node.cancel_closure.clone()) + .await?; + + prepare_client_connection(&node, *session.key(), &mut stream).await?; // Before proxy passing, forward to compute whatever data is left in the // PqStream input buffer. Normally there is none, but our serverless npm @@ -237,8 +242,8 @@ pub(crate) async fn handle_client( aux: node.aux.clone(), compute: node, session_id: ctx.session_id(), + cancel: session, _req: request_gauge, _conn: conn_gauge, - _cancel: session, })) } diff --git a/proxy/src/context/parquet.rs b/proxy/src/context/parquet.rs index d7ffff0483..4f1dd39d92 100644 --- a/proxy/src/context/parquet.rs +++ b/proxy/src/context/parquet.rs @@ -423,11 +423,11 @@ async fn upload_parquet( .await .ok_or_else(|| anyhow::Error::new(TimeoutOrCancel::Cancel)) .and_then(|x| x) - .context("request_data_upload") + .with_context(|| format!("request_data_upload: path={path}")) .err(); if let Some(err) = maybe_err { - tracing::error!(%id, error = ?err, "failed to upload request data"); + tracing::error!(%id, %path, error = ?err, "failed to upload request data"); } Ok(buffer.writer()) diff --git a/proxy/src/metrics.rs b/proxy/src/metrics.rs index 659c57c865..f3d281a26b 100644 --- a/proxy/src/metrics.rs +++ b/proxy/src/metrics.rs @@ -56,6 +56,8 @@ pub struct ProxyMetrics { pub connection_requests: CounterPairVec, 
#[metric(flatten)] pub http_endpoint_pools: HttpEndpointPools, + #[metric(flatten)] + pub cancel_channel_size: CounterPairVec, /// Time it took for proxy to establish a connection to the compute endpoint. // largest bucket = 2^16 * 0.5ms = 32s @@ -294,6 +296,16 @@ impl CounterPairAssoc for NumConnectionRequestsGauge { pub type NumConnectionRequestsGuard<'a> = metrics::MeasuredCounterPairGuard<'a, NumConnectionRequestsGauge>; +pub struct CancelChannelSizeGauge; +impl CounterPairAssoc for CancelChannelSizeGauge { + const INC_NAME: &'static MetricName = MetricName::from_str("opened_msgs_cancel_channel_total"); + const DEC_NAME: &'static MetricName = MetricName::from_str("closed_msgs_cancel_channel_total"); + const INC_HELP: &'static str = "Number of processing messages in the cancellation channel."; + const DEC_HELP: &'static str = "Number of closed messages in the cancellation channel."; + type LabelGroupSet = StaticLabelSet; +} +pub type CancelChannelSizeGuard<'a> = metrics::MeasuredCounterPairGuard<'a, CancelChannelSizeGauge>; + #[derive(LabelGroup)] #[label(set = ComputeConnectionLatencySet)] pub struct ComputeConnectionLatencyGroup { @@ -340,13 +352,6 @@ pub struct RedisErrors<'a> { pub channel: &'a str, } -#[derive(FixedCardinalityLabel, Copy, Clone)] -pub enum CancellationSource { - FromClient, - FromRedis, - Local, -} - #[derive(FixedCardinalityLabel, Copy, Clone)] pub enum CancellationOutcome { NotFound, @@ -357,7 +362,6 @@ pub enum CancellationOutcome { #[derive(LabelGroup)] #[label(set = CancellationRequestSet)] pub struct CancellationRequest { - pub source: CancellationSource, pub kind: CancellationOutcome, } @@ -369,6 +373,16 @@ pub enum Waiting { RetryTimeout, } +#[derive(FixedCardinalityLabel, Copy, Clone)] +#[label(singleton = "kind")] +pub enum RedisMsgKind { + HSet, + HSetMultiple, + HGet, + HGetAll, + HDel, +} + #[derive(Default)] struct Accumulated { cplane: time::Duration, diff --git a/proxy/src/proxy/mod.rs b/proxy/src/proxy/mod.rs index 
63f93f0a91..ab173bd0d0 100644 --- a/proxy/src/proxy/mod.rs +++ b/proxy/src/proxy/mod.rs @@ -13,8 +13,9 @@ pub use copy_bidirectional::{copy_bidirectional_client_compute, ErrorSource}; use futures::{FutureExt, TryFutureExt}; use itertools::Itertools; use once_cell::sync::OnceCell; -use pq_proto::{BeMessage as Be, StartupMessageParams}; +use pq_proto::{BeMessage as Be, CancelKeyData, StartupMessageParams}; use regex::Regex; +use serde::{Deserialize, Serialize}; use smol_str::{format_smolstr, SmolStr}; use thiserror::Error; use tokio::io::{AsyncRead, AsyncWrite, AsyncWriteExt}; @@ -23,7 +24,7 @@ use tracing::{debug, error, info, warn, Instrument}; use self::connect_compute::{connect_to_compute, TcpMechanism}; use self::passthrough::ProxyPassthrough; -use crate::cancellation::{self, CancellationHandlerMain, CancellationHandlerMainInternal}; +use crate::cancellation::{self, CancellationHandler}; use crate::config::{ProxyConfig, ProxyProtocolV2, TlsConfig}; use crate::context::RequestContext; use crate::error::ReportableError; @@ -57,7 +58,7 @@ pub async fn task_main( auth_backend: &'static auth::Backend<'static, ()>, listener: tokio::net::TcpListener, cancellation_token: CancellationToken, - cancellation_handler: Arc, + cancellation_handler: Arc, endpoint_rate_limiter: Arc, ) -> anyhow::Result<()> { scopeguard::defer! 
{ @@ -243,13 +244,13 @@ pub(crate) async fn handle_client( config: &'static ProxyConfig, auth_backend: &'static auth::Backend<'static, ()>, ctx: &RequestContext, - cancellation_handler: Arc, + cancellation_handler: Arc, stream: S, mode: ClientMode, endpoint_rate_limiter: Arc, conn_gauge: NumClientConnectionsGuard<'static>, cancellations: tokio_util::task::task_tracker::TaskTracker, -) -> Result>, ClientRequestError> { +) -> Result>, ClientRequestError> { debug!( protocol = %ctx.protocol(), "handling interactive connection from client" @@ -278,7 +279,7 @@ pub(crate) async fn handle_client( cancel_span.follows_from(tracing::Span::current()); async move { cancellation_handler_clone - .cancel_session_auth( + .cancel_session( cancel_key_data, ctx, config.authentication_config.ip_allowlist_check_enabled, @@ -312,7 +313,7 @@ pub(crate) async fn handle_client( }; let user = user_info.get_user().to_owned(); - let (user_info, ip_allowlist) = match user_info + let (user_info, _ip_allowlist) = match user_info .authenticate( ctx, &mut stream, @@ -356,10 +357,14 @@ pub(crate) async fn handle_client( .or_else(|e| stream.throw_error(e)) .await?; - node.cancel_closure - .set_ip_allowlist(ip_allowlist.unwrap_or_default()); - let session = cancellation_handler.get_session(); - prepare_client_connection(&node, &session, &mut stream).await?; + let cancellation_handler_clone = Arc::clone(&cancellation_handler); + let session = cancellation_handler_clone.get_key(); + + session + .write_cancel_key(node.cancel_closure.clone()) + .await?; + + prepare_client_connection(&node, *session.key(), &mut stream).await?; // Before proxy passing, forward to compute whatever data is left in the // PqStream input buffer. 
Normally there is none, but our serverless npm @@ -373,23 +378,19 @@ pub(crate) async fn handle_client( aux: node.aux.clone(), compute: node, session_id: ctx.session_id(), + cancel: session, _req: request_gauge, _conn: conn_gauge, - _cancel: session, })) } /// Finish client connection initialization: confirm auth success, send params, etc. #[tracing::instrument(skip_all)] -pub(crate) async fn prepare_client_connection

( +pub(crate) async fn prepare_client_connection( node: &compute::PostgresConnection, - session: &cancellation::Session

, + cancel_key_data: CancelKeyData, stream: &mut PqStream, ) -> Result<(), std::io::Error> { - // Register compute's query cancellation token and produce a new, unique one. - // The new token (cancel_key_data) will be sent to the client. - let cancel_key_data = session.enable_query_cancellation(node.cancel_closure.clone()); - // Forward all deferred notices to the client. for notice in &node.delayed_notice { stream.write_message_noflush(&Be::Raw(b'N', notice.as_bytes()))?; @@ -411,7 +412,7 @@ pub(crate) async fn prepare_client_connection

( Ok(()) } -#[derive(Debug, Clone, PartialEq, Eq, Default)] +#[derive(Debug, Clone, PartialEq, Eq, Default, Serialize, Deserialize)] pub(crate) struct NeonOptions(Vec<(SmolStr, SmolStr)>); impl NeonOptions { diff --git a/proxy/src/proxy/passthrough.rs b/proxy/src/proxy/passthrough.rs index a42f9aad39..08871380d6 100644 --- a/proxy/src/proxy/passthrough.rs +++ b/proxy/src/proxy/passthrough.rs @@ -56,18 +56,18 @@ pub(crate) async fn proxy_pass( Ok(()) } -pub(crate) struct ProxyPassthrough { +pub(crate) struct ProxyPassthrough { pub(crate) client: Stream, pub(crate) compute: PostgresConnection, pub(crate) aux: MetricsAuxInfo, pub(crate) session_id: uuid::Uuid, + pub(crate) cancel: cancellation::Session, pub(crate) _req: NumConnectionRequestsGuard<'static>, pub(crate) _conn: NumClientConnectionsGuard<'static>, - pub(crate) _cancel: cancellation::Session

, } -impl ProxyPassthrough { +impl ProxyPassthrough { pub(crate) async fn proxy_pass( self, compute_config: &ComputeConfig, @@ -81,6 +81,9 @@ impl ProxyPassthrough { { tracing::warn!(session_id = ?self.session_id, ?err, "could not cancel the query in the database"); } + + drop(self.cancel.remove_cancel_key().await); // we don't need a result. If the queue is full, we just log the error + res } } diff --git a/proxy/src/rate_limiter/limiter.rs b/proxy/src/rate_limiter/limiter.rs index 6f6a8c9d47..ec080f270b 100644 --- a/proxy/src/rate_limiter/limiter.rs +++ b/proxy/src/rate_limiter/limiter.rs @@ -138,6 +138,12 @@ impl RateBucketInfo { Self::new(200, Duration::from_secs(600)), ]; + // For all the sessions will be cancel key. So this limit is essentially global proxy limit. + pub const DEFAULT_REDIS_SET: [Self; 2] = [ + Self::new(100_000, Duration::from_secs(1)), + Self::new(50_000, Duration::from_secs(10)), + ]; + /// All of these are per endpoint-maskedip pair. /// Context: 4096 rounds of pbkdf2 take about 1ms of cpu time to execute (1 milli-cpu-second or 1mcpus). 
/// diff --git a/proxy/src/redis/cancellation_publisher.rs b/proxy/src/redis/cancellation_publisher.rs index 228dbb7f64..30d8b83e60 100644 --- a/proxy/src/redis/cancellation_publisher.rs +++ b/proxy/src/redis/cancellation_publisher.rs @@ -2,12 +2,10 @@ use core::net::IpAddr; use std::sync::Arc; use pq_proto::CancelKeyData; -use redis::AsyncCommands; use tokio::sync::Mutex; use uuid::Uuid; use super::connection_with_credentials_provider::ConnectionWithCredentialsProvider; -use super::notifications::{CancelSession, Notification, PROXY_CHANNEL_NAME}; use crate::rate_limiter::{GlobalRateLimiter, RateBucketInfo}; pub trait CancellationPublisherMut: Send + Sync + 'static { @@ -83,9 +81,10 @@ impl CancellationPublisher for Arc> { } pub struct RedisPublisherClient { + #[allow(dead_code)] client: ConnectionWithCredentialsProvider, - region_id: String, - limiter: GlobalRateLimiter, + _region_id: String, + _limiter: GlobalRateLimiter, } impl RedisPublisherClient { @@ -96,26 +95,12 @@ impl RedisPublisherClient { ) -> anyhow::Result { Ok(Self { client, - region_id, - limiter: GlobalRateLimiter::new(info.into()), + _region_id: region_id, + _limiter: GlobalRateLimiter::new(info.into()), }) } - async fn publish( - &mut self, - cancel_key_data: CancelKeyData, - session_id: Uuid, - peer_addr: IpAddr, - ) -> anyhow::Result<()> { - let payload = serde_json::to_string(&Notification::Cancel(CancelSession { - region_id: Some(self.region_id.clone()), - cancel_key_data, - session_id, - peer_addr: Some(peer_addr), - }))?; - let _: () = self.client.publish(PROXY_CHANNEL_NAME, payload).await?; - Ok(()) - } + #[allow(dead_code)] pub(crate) async fn try_connect(&mut self) -> anyhow::Result<()> { match self.client.connect().await { Ok(()) => {} @@ -126,49 +111,4 @@ impl RedisPublisherClient { } Ok(()) } - async fn try_publish_internal( - &mut self, - cancel_key_data: CancelKeyData, - session_id: Uuid, - peer_addr: IpAddr, - ) -> anyhow::Result<()> { - // TODO: review redundant error duplication 
logs. - if !self.limiter.check() { - tracing::info!("Rate limit exceeded. Skipping cancellation message"); - return Err(anyhow::anyhow!("Rate limit exceeded")); - } - match self.publish(cancel_key_data, session_id, peer_addr).await { - Ok(()) => return Ok(()), - Err(e) => { - tracing::error!("failed to publish a message: {e}"); - } - } - tracing::info!("Publisher is disconnected. Reconnectiong..."); - self.try_connect().await?; - self.publish(cancel_key_data, session_id, peer_addr).await - } -} - -impl CancellationPublisherMut for RedisPublisherClient { - async fn try_publish( - &mut self, - cancel_key_data: CancelKeyData, - session_id: Uuid, - peer_addr: IpAddr, - ) -> anyhow::Result<()> { - tracing::info!("publishing cancellation key to Redis"); - match self - .try_publish_internal(cancel_key_data, session_id, peer_addr) - .await - { - Ok(()) => { - tracing::debug!("cancellation key successfuly published to Redis"); - Ok(()) - } - Err(e) => { - tracing::error!("failed to publish a message: {e}"); - Err(e) - } - } - } } diff --git a/proxy/src/redis/connection_with_credentials_provider.rs b/proxy/src/redis/connection_with_credentials_provider.rs index 0f6e765b02..b5c3d13216 100644 --- a/proxy/src/redis/connection_with_credentials_provider.rs +++ b/proxy/src/redis/connection_with_credentials_provider.rs @@ -29,6 +29,7 @@ impl Clone for Credentials { /// Provides PubSub connection without credentials refresh. 
pub struct ConnectionWithCredentialsProvider { credentials: Credentials, + // TODO: with more load on the connection, we should consider using a connection pool con: Option, refresh_token_task: Option>, mutex: tokio::sync::Mutex<()>, diff --git a/proxy/src/redis/keys.rs b/proxy/src/redis/keys.rs new file mode 100644 index 0000000000..dddc7e2054 --- /dev/null +++ b/proxy/src/redis/keys.rs @@ -0,0 +1,88 @@ +use anyhow::Ok; +use pq_proto::{id_to_cancel_key, CancelKeyData}; +use serde::{Deserialize, Serialize}; +use std::io::ErrorKind; + +pub mod keyspace { + pub const CANCEL_PREFIX: &str = "cancel"; +} + +#[derive(Clone, Debug, Serialize, Deserialize, Eq, PartialEq)] +pub(crate) enum KeyPrefix { + #[serde(untagged)] + Cancel(CancelKeyData), +} + +impl KeyPrefix { + pub(crate) fn build_redis_key(&self) -> String { + match self { + KeyPrefix::Cancel(key) => { + let hi = (key.backend_pid as u64) << 32; + let lo = (key.cancel_key as u64) & 0xffff_ffff; + let id = hi | lo; + let keyspace = keyspace::CANCEL_PREFIX; + format!("{keyspace}:{id:x}") + } + } + } + + #[allow(dead_code)] + pub(crate) fn as_str(&self) -> &'static str { + match self { + KeyPrefix::Cancel(_) => keyspace::CANCEL_PREFIX, + } + } +} + +#[allow(dead_code)] +pub(crate) fn parse_redis_key(key: &str) -> anyhow::Result { + let (prefix, key_str) = key.split_once(':').ok_or_else(|| { + anyhow::anyhow!(std::io::Error::new( + ErrorKind::InvalidData, + "missing prefix" + )) + })?; + + match prefix { + keyspace::CANCEL_PREFIX => { + let id = u64::from_str_radix(key_str, 16)?; + + Ok(KeyPrefix::Cancel(id_to_cancel_key(id))) + } + _ => Err(anyhow::anyhow!(std::io::Error::new( + ErrorKind::InvalidData, + "unknown prefix" + ))), + } +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn test_build_redis_key() { + let cancel_key: KeyPrefix = KeyPrefix::Cancel(CancelKeyData { + backend_pid: 12345, + cancel_key: 54321, + }); + + let redis_key = cancel_key.build_redis_key(); + assert_eq!(redis_key, 
"cancel:30390000d431"); + } + + #[test] + fn test_parse_redis_key() { + let redis_key = "cancel:30390000d431"; + let key: KeyPrefix = parse_redis_key(redis_key).expect("Failed to parse key"); + + let ref_key = CancelKeyData { + backend_pid: 12345, + cancel_key: 54321, + }; + + assert_eq!(key.as_str(), KeyPrefix::Cancel(ref_key).as_str()); + let KeyPrefix::Cancel(cancel_key) = key; + assert_eq!(ref_key, cancel_key); + } +} diff --git a/proxy/src/redis/kv_ops.rs b/proxy/src/redis/kv_ops.rs new file mode 100644 index 0000000000..dcc6aac51b --- /dev/null +++ b/proxy/src/redis/kv_ops.rs @@ -0,0 +1,185 @@ +use redis::{AsyncCommands, ToRedisArgs}; + +use super::connection_with_credentials_provider::ConnectionWithCredentialsProvider; + +use crate::rate_limiter::{GlobalRateLimiter, RateBucketInfo}; + +pub struct RedisKVClient { + client: ConnectionWithCredentialsProvider, + limiter: GlobalRateLimiter, +} + +impl RedisKVClient { + pub fn new(client: ConnectionWithCredentialsProvider, info: &'static [RateBucketInfo]) -> Self { + Self { + client, + limiter: GlobalRateLimiter::new(info.into()), + } + } + + pub async fn try_connect(&mut self) -> anyhow::Result<()> { + match self.client.connect().await { + Ok(()) => {} + Err(e) => { + tracing::error!("failed to connect to redis: {e}"); + return Err(e); + } + } + Ok(()) + } + + pub(crate) async fn hset(&mut self, key: K, field: F, value: V) -> anyhow::Result<()> + where + K: ToRedisArgs + Send + Sync, + F: ToRedisArgs + Send + Sync, + V: ToRedisArgs + Send + Sync, + { + if !self.limiter.check() { + tracing::info!("Rate limit exceeded. Skipping hset"); + return Err(anyhow::anyhow!("Rate limit exceeded")); + } + + match self.client.hset(&key, &field, &value).await { + Ok(()) => return Ok(()), + Err(e) => { + tracing::error!("failed to set a key-value pair: {e}"); + } + } + + tracing::info!("Redis client is disconnected. 
Reconnectiong..."); + self.try_connect().await?; + self.client + .hset(key, field, value) + .await + .map_err(anyhow::Error::new) + } + + #[allow(dead_code)] + pub(crate) async fn hset_multiple( + &mut self, + key: &str, + items: &[(K, V)], + ) -> anyhow::Result<()> + where + K: ToRedisArgs + Send + Sync, + V: ToRedisArgs + Send + Sync, + { + if !self.limiter.check() { + tracing::info!("Rate limit exceeded. Skipping hset_multiple"); + return Err(anyhow::anyhow!("Rate limit exceeded")); + } + + match self.client.hset_multiple(key, items).await { + Ok(()) => return Ok(()), + Err(e) => { + tracing::error!("failed to set a key-value pair: {e}"); + } + } + + tracing::info!("Redis client is disconnected. Reconnectiong..."); + self.try_connect().await?; + self.client + .hset_multiple(key, items) + .await + .map_err(anyhow::Error::new) + } + + #[allow(dead_code)] + pub(crate) async fn expire(&mut self, key: K, seconds: i64) -> anyhow::Result<()> + where + K: ToRedisArgs + Send + Sync, + { + if !self.limiter.check() { + tracing::info!("Rate limit exceeded. Skipping expire"); + return Err(anyhow::anyhow!("Rate limit exceeded")); + } + + match self.client.expire(&key, seconds).await { + Ok(()) => return Ok(()), + Err(e) => { + tracing::error!("failed to set a key-value pair: {e}"); + } + } + + tracing::info!("Redis client is disconnected. Reconnectiong..."); + self.try_connect().await?; + self.client + .expire(key, seconds) + .await + .map_err(anyhow::Error::new) + } + + #[allow(dead_code)] + pub(crate) async fn hget(&mut self, key: K, field: F) -> anyhow::Result + where + K: ToRedisArgs + Send + Sync, + F: ToRedisArgs + Send + Sync, + V: redis::FromRedisValue, + { + if !self.limiter.check() { + tracing::info!("Rate limit exceeded. 
Skipping hget"); + return Err(anyhow::anyhow!("Rate limit exceeded")); + } + + match self.client.hget(&key, &field).await { + Ok(value) => return Ok(value), + Err(e) => { + tracing::error!("failed to get a value: {e}"); + } + } + + tracing::info!("Redis client is disconnected. Reconnectiong..."); + self.try_connect().await?; + self.client + .hget(key, field) + .await + .map_err(anyhow::Error::new) + } + + pub(crate) async fn hget_all(&mut self, key: K) -> anyhow::Result + where + K: ToRedisArgs + Send + Sync, + V: redis::FromRedisValue, + { + if !self.limiter.check() { + tracing::info!("Rate limit exceeded. Skipping hgetall"); + return Err(anyhow::anyhow!("Rate limit exceeded")); + } + + match self.client.hgetall(&key).await { + Ok(value) => return Ok(value), + Err(e) => { + tracing::error!("failed to get a value: {e}"); + } + } + + tracing::info!("Redis client is disconnected. Reconnectiong..."); + self.try_connect().await?; + self.client.hgetall(key).await.map_err(anyhow::Error::new) + } + + pub(crate) async fn hdel(&mut self, key: K, field: F) -> anyhow::Result<()> + where + K: ToRedisArgs + Send + Sync, + F: ToRedisArgs + Send + Sync, + { + if !self.limiter.check() { + tracing::info!("Rate limit exceeded. Skipping hdel"); + return Err(anyhow::anyhow!("Rate limit exceeded")); + } + + match self.client.hdel(&key, &field).await { + Ok(()) => return Ok(()), + Err(e) => { + tracing::error!("failed to delete a key-value pair: {e}"); + } + } + + tracing::info!("Redis client is disconnected. 
Reconnectiong..."); + self.try_connect().await?; + self.client + .hdel(key, field) + .await + .map_err(anyhow::Error::new) + } +} diff --git a/proxy/src/redis/mod.rs b/proxy/src/redis/mod.rs index a322f0368c..8b46a8e6ca 100644 --- a/proxy/src/redis/mod.rs +++ b/proxy/src/redis/mod.rs @@ -1,4 +1,6 @@ pub mod cancellation_publisher; pub mod connection_with_credentials_provider; pub mod elasticache; +pub mod keys; +pub mod kv_ops; pub mod notifications; diff --git a/proxy/src/redis/notifications.rs b/proxy/src/redis/notifications.rs index 63cdf6176c..19fdd3280d 100644 --- a/proxy/src/redis/notifications.rs +++ b/proxy/src/redis/notifications.rs @@ -6,18 +6,14 @@ use pq_proto::CancelKeyData; use redis::aio::PubSub; use serde::{Deserialize, Serialize}; use tokio_util::sync::CancellationToken; -use tracing::Instrument; use uuid::Uuid; use super::connection_with_credentials_provider::ConnectionWithCredentialsProvider; use crate::cache::project_info::ProjectInfoCache; -use crate::cancellation::{CancelMap, CancellationHandler}; -use crate::config::ProxyConfig; use crate::intern::{ProjectIdInt, RoleNameInt}; use crate::metrics::{Metrics, RedisErrors, RedisEventsCount}; const CPLANE_CHANNEL_NAME: &str = "neondb-proxy-ws-updates"; -pub(crate) const PROXY_CHANNEL_NAME: &str = "neondb-proxy-to-proxy-updates"; const RECONNECT_TIMEOUT: std::time::Duration = std::time::Duration::from_secs(20); const INVALIDATION_LAG: std::time::Duration = std::time::Duration::from_secs(20); @@ -25,8 +21,6 @@ async fn try_connect(client: &ConnectionWithCredentialsProvider) -> anyhow::Resu let mut conn = client.get_async_pubsub().await?; tracing::info!("subscribing to a channel `{CPLANE_CHANNEL_NAME}`"); conn.subscribe(CPLANE_CHANNEL_NAME).await?; - tracing::info!("subscribing to a channel `{PROXY_CHANNEL_NAME}`"); - conn.subscribe(PROXY_CHANNEL_NAME).await?; Ok(conn) } @@ -71,8 +65,6 @@ pub(crate) enum Notification { deserialize_with = "deserialize_json_string" )] PasswordUpdate { password_update: 
PasswordUpdate }, - #[serde(rename = "/cancel_session")] - Cancel(CancelSession), #[serde( other, @@ -138,7 +130,6 @@ where struct MessageHandler { cache: Arc, - cancellation_handler: Arc>, region_id: String, } @@ -146,23 +137,14 @@ impl Clone for MessageHandler { fn clone(&self) -> Self { Self { cache: self.cache.clone(), - cancellation_handler: self.cancellation_handler.clone(), region_id: self.region_id.clone(), } } } impl MessageHandler { - pub(crate) fn new( - cache: Arc, - cancellation_handler: Arc>, - region_id: String, - ) -> Self { - Self { - cache, - cancellation_handler, - region_id, - } + pub(crate) fn new(cache: Arc, region_id: String) -> Self { + Self { cache, region_id } } pub(crate) async fn increment_active_listeners(&self) { @@ -207,46 +189,6 @@ impl MessageHandler { tracing::debug!(?msg, "received a message"); match msg { - Notification::Cancel(cancel_session) => { - tracing::Span::current().record( - "session_id", - tracing::field::display(cancel_session.session_id), - ); - Metrics::get() - .proxy - .redis_events_count - .inc(RedisEventsCount::CancelSession); - if let Some(cancel_region) = cancel_session.region_id { - // If the message is not for this region, ignore it. - if cancel_region != self.region_id { - return Ok(()); - } - } - - // TODO: Remove unspecified peer_addr after the complete migration to the new format - let peer_addr = cancel_session - .peer_addr - .unwrap_or(std::net::IpAddr::V4(std::net::Ipv4Addr::UNSPECIFIED)); - let cancel_span = tracing::span!(parent: None, tracing::Level::INFO, "cancel_session", session_id = ?cancel_session.session_id); - cancel_span.follows_from(tracing::Span::current()); - // This instance of cancellation_handler doesn't have a RedisPublisherClient so it can't publish the message. 
- match self - .cancellation_handler - .cancel_session( - cancel_session.cancel_key_data, - uuid::Uuid::nil(), - peer_addr, - cancel_session.peer_addr.is_some(), - ) - .instrument(cancel_span) - .await - { - Ok(()) => {} - Err(e) => { - tracing::warn!("failed to cancel session: {e}"); - } - } - } Notification::AllowedIpsUpdate { .. } | Notification::PasswordUpdate { .. } | Notification::BlockPublicOrVpcAccessUpdated { .. } @@ -293,7 +235,6 @@ fn invalidate_cache(cache: Arc, msg: Notification) { password_update.project_id, password_update.role_name, ), - Notification::Cancel(_) => unreachable!("cancel message should be handled separately"), Notification::BlockPublicOrVpcAccessUpdated { .. } => { // https://github.com/neondatabase/neon/pull/10073 } @@ -323,8 +264,8 @@ async fn handle_messages( } Err(e) => { tracing::error!( - "failed to connect to redis: {e}, will try to reconnect in {RECONNECT_TIMEOUT:#?}" - ); + "failed to connect to redis: {e}, will try to reconnect in {RECONNECT_TIMEOUT:#?}" + ); tokio::time::sleep(RECONNECT_TIMEOUT).await; continue; } @@ -350,21 +291,14 @@ async fn handle_messages( /// Handle console's invalidation messages. #[tracing::instrument(name = "redis_notifications", skip_all)] pub async fn task_main( - config: &'static ProxyConfig, redis: ConnectionWithCredentialsProvider, cache: Arc, - cancel_map: CancelMap, region_id: String, ) -> anyhow::Result where C: ProjectInfoCache + Send + Sync + 'static, { - let cancellation_handler = Arc::new(CancellationHandler::<()>::new( - &config.connect_to_compute, - cancel_map, - crate::metrics::CancellationSource::FromRedis, - )); - let handler = MessageHandler::new(cache, cancellation_handler, region_id); + let handler = MessageHandler::new(cache, region_id); // 6h - 1m. // There will be 1 minute overlap between two tasks. But at least we can be sure that no message is lost. 
let mut interval = tokio::time::interval(std::time::Duration::from_secs(6 * 60 * 60 - 60)); @@ -442,35 +376,6 @@ mod tests { Ok(()) } - #[test] - fn parse_cancel_session() -> anyhow::Result<()> { - let cancel_key_data = CancelKeyData { - backend_pid: 42, - cancel_key: 41, - }; - let uuid = uuid::Uuid::new_v4(); - let msg = Notification::Cancel(CancelSession { - cancel_key_data, - region_id: None, - session_id: uuid, - peer_addr: None, - }); - let text = serde_json::to_string(&msg)?; - let result: Notification = serde_json::from_str(&text)?; - assert_eq!(msg, result); - - let msg = Notification::Cancel(CancelSession { - cancel_key_data, - region_id: Some("region".to_string()), - session_id: uuid, - peer_addr: None, - }); - let text = serde_json::to_string(&msg)?; - let result: Notification = serde_json::from_str(&text)?; - assert_eq!(msg, result,); - - Ok(()) - } #[test] fn parse_unknown_topic() -> anyhow::Result<()> { diff --git a/proxy/src/serverless/mod.rs b/proxy/src/serverless/mod.rs index c2623e0eca..6888772362 100644 --- a/proxy/src/serverless/mod.rs +++ b/proxy/src/serverless/mod.rs @@ -43,7 +43,7 @@ use tokio_util::task::TaskTracker; use tracing::{info, warn, Instrument}; use utils::http::error::ApiError; -use crate::cancellation::CancellationHandlerMain; +use crate::cancellation::CancellationHandler; use crate::config::{ProxyConfig, ProxyProtocolV2}; use crate::context::RequestContext; use crate::ext::TaskExt; @@ -61,7 +61,7 @@ pub async fn task_main( auth_backend: &'static crate::auth::Backend<'static, ()>, ws_listener: TcpListener, cancellation_token: CancellationToken, - cancellation_handler: Arc, + cancellation_handler: Arc, endpoint_rate_limiter: Arc, ) -> anyhow::Result<()> { scopeguard::defer! 
{ @@ -318,7 +318,7 @@ async fn connection_handler( backend: Arc, connections: TaskTracker, cancellations: TaskTracker, - cancellation_handler: Arc, + cancellation_handler: Arc, endpoint_rate_limiter: Arc, cancellation_token: CancellationToken, conn: AsyncRW, @@ -412,7 +412,7 @@ async fn request_handler( config: &'static ProxyConfig, backend: Arc, ws_connections: TaskTracker, - cancellation_handler: Arc, + cancellation_handler: Arc, session_id: uuid::Uuid, conn_info: ConnectionInfo, // used to cancel in-flight HTTP requests. not used to cancel websockets diff --git a/proxy/src/serverless/websocket.rs b/proxy/src/serverless/websocket.rs index 47326c1181..585a7d63b2 100644 --- a/proxy/src/serverless/websocket.rs +++ b/proxy/src/serverless/websocket.rs @@ -12,7 +12,7 @@ use pin_project_lite::pin_project; use tokio::io::{self, AsyncBufRead, AsyncRead, AsyncWrite, ReadBuf}; use tracing::warn; -use crate::cancellation::CancellationHandlerMain; +use crate::cancellation::CancellationHandler; use crate::config::ProxyConfig; use crate::context::RequestContext; use crate::error::{io_error, ReportableError}; @@ -129,7 +129,7 @@ pub(crate) async fn serve_websocket( auth_backend: &'static crate::auth::Backend<'static, ()>, ctx: RequestContext, websocket: OnUpgrade, - cancellation_handler: Arc, + cancellation_handler: Arc, endpoint_rate_limiter: Arc, hostname: Option, cancellations: tokio_util::task::task_tracker::TaskTracker, diff --git a/proxy/src/usage_metrics.rs b/proxy/src/usage_metrics.rs index 487504d709..e1cc7e87b4 100644 --- a/proxy/src/usage_metrics.rs +++ b/proxy/src/usage_metrics.rs @@ -396,13 +396,13 @@ async fn upload_backup_events( TimeoutOrCancel::caused_by_cancel, FAILED_UPLOAD_WARN_THRESHOLD, FAILED_UPLOAD_MAX_RETRIES, - "request_data_upload", + "usage_metrics_upload", cancel, ) .await .ok_or_else(|| anyhow::Error::new(TimeoutOrCancel::Cancel)) .and_then(|x| x) - .context("request_data_upload")?; + .with_context(|| format!("usage_metrics_upload: 
path={remote_path}"))?; Ok(()) } diff --git a/pyproject.toml b/pyproject.toml index 735d12d756..e299c421e9 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -43,7 +43,7 @@ websockets = "^12.0" clickhouse-connect = "^0.7.16" kafka-python = "^2.0.2" jwcrypto = "^1.5.6" -h2 = "^4.1.0" +h2 = {git = "https://github.com/python-hyper/h2"} types-jwcrypto = "^1.5.0.20240925" pyyaml = "^6.0.2" types-pyyaml = "^6.0.12.20240917" @@ -94,6 +94,7 @@ target-version = "py311" extend-exclude = [ "vendor/", "target/", + "test_runner/stubs/", # Autogenerated by mypy's stubgen ] line-length = 100 # this setting is rather guidance, it won't fail if it can't make the shorter diff --git a/safekeeper/client/src/mgmt_api.rs b/safekeeper/client/src/mgmt_api.rs index f78745043a..f65bfaa6d5 100644 --- a/safekeeper/client/src/mgmt_api.rs +++ b/safekeeper/client/src/mgmt_api.rs @@ -4,7 +4,7 @@ //! united. use reqwest::{IntoUrl, Method, StatusCode}; -use safekeeper_api::models::TimelineStatus; +use safekeeper_api::models::{TimelineCreateRequest, TimelineStatus}; use std::error::Error as _; use utils::{ http::error::HttpErrorBody, @@ -76,6 +76,28 @@ impl Client { } } + pub async fn create_timeline(&self, req: &TimelineCreateRequest) -> Result { + let uri = format!( + "{}/v1/tenant/{}/timeline/{}", + self.mgmt_api_endpoint, req.tenant_id, req.timeline_id + ); + let resp = self.post(&uri, req).await?; + resp.json().await.map_err(Error::ReceiveBody) + } + + pub async fn delete_timeline( + &self, + tenant_id: TenantId, + timeline_id: TimelineId, + ) -> Result { + let uri = format!( + "{}/v1/tenant/{}/timeline/{}", + self.mgmt_api_endpoint, tenant_id, timeline_id + ); + let resp = self.request(Method::DELETE, &uri, ()).await?; + resp.json().await.map_err(Error::ReceiveBody) + } + pub async fn timeline_status( &self, tenant_id: TenantId, @@ -102,6 +124,19 @@ impl Client { self.get(&uri).await } + pub async fn utilization(&self) -> Result { + let uri = format!("{}/v1/utilization/", 
self.mgmt_api_endpoint); + self.get(&uri).await + } + + async fn post( + &self, + uri: U, + body: B, + ) -> Result { + self.request(Method::POST, uri, body).await + } + async fn get(&self, uri: U) -> Result { self.request(Method::GET, uri, ()).await } diff --git a/safekeeper/src/handler.rs b/safekeeper/src/handler.rs index bb639bfb32..e77eeb4130 100644 --- a/safekeeper/src/handler.rs +++ b/safekeeper/src/handler.rs @@ -52,16 +52,70 @@ pub struct SafekeeperPostgresHandler { /// Parsed Postgres command. enum SafekeeperPostgresCommand { - StartWalPush, - StartReplication { start_lsn: Lsn, term: Option }, + StartWalPush { + proto_version: u32, + // Eventually timelines will be always created explicitly by storcon. + // This option allows legacy behaviour for compute to do that until we + // fully migrate. + allow_timeline_creation: bool, + }, + StartReplication { + start_lsn: Lsn, + term: Option, + }, IdentifySystem, TimelineStatus, - JSONCtrl { cmd: AppendLogicalMessage }, + JSONCtrl { + cmd: AppendLogicalMessage, + }, } fn parse_cmd(cmd: &str) -> anyhow::Result { if cmd.starts_with("START_WAL_PUSH") { - Ok(SafekeeperPostgresCommand::StartWalPush) + // Allow additional options in postgres START_REPLICATION style like + // START_WAL_PUSH (proto_version '3', allow_timeline_creation 'false'). + // Parsing here is very naive and breaks in case of commas or + // whitespaces in values, but enough for our purposes. 
+ let re = Regex::new(r"START_WAL_PUSH(\s+?\((.*)\))?").unwrap(); + let caps = re + .captures(cmd) + .context(format!("failed to parse START_WAL_PUSH command {}", cmd))?; + // capture () content + let options = caps.get(2).map(|m| m.as_str()).unwrap_or(""); + // default values + let mut proto_version = 2; + let mut allow_timeline_creation = true; + for kvstr in options.split(",") { + if kvstr.is_empty() { + continue; + } + let mut kvit = kvstr.split_whitespace(); + let key = kvit.next().context(format!( + "failed to parse key in kv {} in command {}", + kvstr, cmd + ))?; + let value = kvit.next().context(format!( + "failed to parse value in kv {} in command {}", + kvstr, cmd + ))?; + let value_trimmed = value.trim_matches('\''); + if key == "proto_version" { + proto_version = value_trimmed.parse::().context(format!( + "failed to parse proto_version value {} in command {}", + value, cmd + ))?; + } + if key == "allow_timeline_creation" { + allow_timeline_creation = value_trimmed.parse::().context(format!( + "failed to parse allow_timeline_creation value {} in command {}", + value, cmd + ))?; + } + } + Ok(SafekeeperPostgresCommand::StartWalPush { + proto_version, + allow_timeline_creation, + }) } else if cmd.starts_with("START_REPLICATION") { let re = Regex::new( // We follow postgres START_REPLICATION LOGICAL options to pass term. @@ -95,7 +149,7 @@ fn parse_cmd(cmd: &str) -> anyhow::Result { fn cmd_to_string(cmd: &SafekeeperPostgresCommand) -> &str { match cmd { - SafekeeperPostgresCommand::StartWalPush => "START_WAL_PUSH", + SafekeeperPostgresCommand::StartWalPush { .. } => "START_WAL_PUSH", SafekeeperPostgresCommand::StartReplication { .. 
} => "START_REPLICATION", SafekeeperPostgresCommand::TimelineStatus => "TIMELINE_STATUS", SafekeeperPostgresCommand::IdentifySystem => "IDENTIFY_SYSTEM", @@ -293,8 +347,11 @@ impl postgres_backend::Handler self.ttid = TenantTimelineId::new(tenant_id, timeline_id); match cmd { - SafekeeperPostgresCommand::StartWalPush => { - self.handle_start_wal_push(pgb) + SafekeeperPostgresCommand::StartWalPush { + proto_version, + allow_timeline_creation, + } => { + self.handle_start_wal_push(pgb, proto_version, allow_timeline_creation) .instrument(info_span!("WAL receiver")) .await } @@ -467,3 +524,39 @@ impl SafekeeperPostgresHandler { } } } + +#[cfg(test)] +mod tests { + use super::SafekeeperPostgresCommand; + + /// Test parsing of START_WAL_PUSH command + #[test] + fn test_start_wal_push_parse() { + let cmd = "START_WAL_PUSH"; + let parsed = super::parse_cmd(cmd).expect("failed to parse"); + match parsed { + SafekeeperPostgresCommand::StartWalPush { + proto_version, + allow_timeline_creation, + } => { + assert_eq!(proto_version, 2); + assert!(allow_timeline_creation); + } + _ => panic!("unexpected command"), + } + + let cmd = + "START_WAL_PUSH (proto_version '3', allow_timeline_creation 'false', unknown 'hoho')"; + let parsed = super::parse_cmd(cmd).expect("failed to parse"); + match parsed { + SafekeeperPostgresCommand::StartWalPush { + proto_version, + allow_timeline_creation, + } => { + assert_eq!(proto_version, 3); + assert!(!allow_timeline_creation); + } + _ => panic!("unexpected command"), + } + } +} diff --git a/safekeeper/src/http/routes.rs b/safekeeper/src/http/routes.rs index 4b9fb9eb67..7ec08ecf9a 100644 --- a/safekeeper/src/http/routes.rs +++ b/safekeeper/src/http/routes.rs @@ -127,6 +127,13 @@ async fn timeline_create_handler(mut request: Request) -> Result) -> Result, ApiError> { + check_permission(&request, None)?; + let global_timelines = get_global_timelines(&request); + let utilization = global_timelines.get_timeline_counts(); + 
json_response(StatusCode::OK, utilization) +} + /// List all (not deleted) timelines. /// Note: it is possible to do the same with debug_dump. async fn timeline_list_handler(request: Request) -> Result, ApiError> { @@ -620,6 +627,7 @@ pub fn make_router( failpoints_handler(r, cancel).await }) }) + .get("/v1/uzilization", |r| request_span(r, utilization_handler)) .delete("/v1/tenant/:tenant_id", |r| { request_span(r, tenant_delete_handler) }) diff --git a/safekeeper/src/receive_wal.rs b/safekeeper/src/receive_wal.rs index daaa8a253d..cb42f6f414 100644 --- a/safekeeper/src/receive_wal.rs +++ b/safekeeper/src/receive_wal.rs @@ -200,9 +200,14 @@ impl SafekeeperPostgresHandler { pub async fn handle_start_wal_push( &mut self, pgb: &mut PostgresBackend, + proto_version: u32, + allow_timeline_creation: bool, ) -> Result<(), QueryError> { let mut tli: Option = None; - if let Err(end) = self.handle_start_wal_push_guts(pgb, &mut tli).await { + if let Err(end) = self + .handle_start_wal_push_guts(pgb, &mut tli, proto_version, allow_timeline_creation) + .await + { // Log the result and probably send it to the client, closing the stream. let handle_end_fut = pgb.handle_copy_stream_end(end); // If we managed to create the timeline, augment logging with current LSNs etc. @@ -222,6 +227,8 @@ impl SafekeeperPostgresHandler { &mut self, pgb: &mut PostgresBackend, tli: &mut Option, + proto_version: u32, + allow_timeline_creation: bool, ) -> Result<(), CopyStreamHandlerEnd> { // The `tli` parameter is only used for passing _out_ a timeline, one should // not have been passed in. @@ -250,12 +257,17 @@ impl SafekeeperPostgresHandler { conn_id: self.conn_id, pgb_reader: &mut pgb_reader, peer_addr, + proto_version, acceptor_handle: &mut acceptor_handle, global_timelines: self.global_timelines.clone(), }; - // Read first message and create timeline if needed. - let res = network_reader.read_first_message().await; + // Read first message and create timeline if needed and allowed. 
This + // won't be when timelines will be always created by storcon and + // allow_timeline_creation becomes false. + let res = network_reader + .read_first_message(allow_timeline_creation) + .await; let network_res = if let Ok((timeline, next_msg)) = res { let pageserver_feedback_rx: tokio::sync::broadcast::Receiver = @@ -313,6 +325,7 @@ struct NetworkReader<'a, IO> { conn_id: ConnectionId, pgb_reader: &'a mut PostgresBackendReader, peer_addr: SocketAddr, + proto_version: u32, // WalAcceptor is spawned when we learn server info from walproposer and // create timeline; handle is put here. acceptor_handle: &'a mut Option>>, @@ -322,9 +335,10 @@ struct NetworkReader<'a, IO> { impl NetworkReader<'_, IO> { async fn read_first_message( &mut self, + allow_timeline_creation: bool, ) -> Result<(WalResidentTimeline, ProposerAcceptorMessage), CopyStreamHandlerEnd> { // Receive information about server to create timeline, if not yet. - let next_msg = read_message(self.pgb_reader).await?; + let next_msg = read_message(self.pgb_reader, self.proto_version).await?; let tli = match next_msg { ProposerAcceptorMessage::Greeting(ref greeting) => { info!( @@ -336,17 +350,22 @@ impl NetworkReader<'_, IO> { system_id: greeting.system_id, wal_seg_size: greeting.wal_seg_size, }; - let tli = self - .global_timelines - .create( - self.ttid, - Configuration::empty(), - server_info, - Lsn::INVALID, - Lsn::INVALID, - ) - .await - .context("create timeline")?; + let tli = if allow_timeline_creation { + self.global_timelines + .create( + self.ttid, + Configuration::empty(), + server_info, + Lsn::INVALID, + Lsn::INVALID, + ) + .await + .context("create timeline")? + } else { + self.global_timelines + .get(self.ttid) + .context("get timeline")? + }; tli.wal_residence_guard().await? 
} _ => { @@ -375,7 +394,7 @@ impl NetworkReader<'_, IO> { )); // Forward all messages to WalAcceptor - read_network_loop(self.pgb_reader, msg_tx, next_msg).await + read_network_loop(self.pgb_reader, msg_tx, next_msg, self.proto_version).await } } @@ -383,9 +402,10 @@ impl NetworkReader<'_, IO> { /// TODO: Return Ok(None) on graceful termination. async fn read_message( pgb_reader: &mut PostgresBackendReader, + proto_version: u32, ) -> Result { let copy_data = pgb_reader.read_copy_message().await?; - let msg = ProposerAcceptorMessage::parse(copy_data)?; + let msg = ProposerAcceptorMessage::parse(copy_data, proto_version)?; Ok(msg) } @@ -393,6 +413,7 @@ async fn read_network_loop( pgb_reader: &mut PostgresBackendReader, msg_tx: Sender, mut next_msg: ProposerAcceptorMessage, + proto_version: u32, ) -> Result<(), CopyStreamHandlerEnd> { /// Threshold for logging slow WalAcceptor sends. const SLOW_THRESHOLD: Duration = Duration::from_secs(5); @@ -425,7 +446,7 @@ async fn read_network_loop( WAL_RECEIVER_QUEUE_DEPTH_TOTAL.inc(); WAL_RECEIVER_QUEUE_SIZE_TOTAL.add(size as i64); - next_msg = read_message(pgb_reader).await?; + next_msg = read_message(pgb_reader, proto_version).await?; } } diff --git a/safekeeper/src/safekeeper.rs b/safekeeper/src/safekeeper.rs index 06403228e9..45e19c31b6 100644 --- a/safekeeper/src/safekeeper.rs +++ b/safekeeper/src/safekeeper.rs @@ -29,7 +29,7 @@ use utils::{ lsn::Lsn, }; -const SK_PROTOCOL_VERSION: u32 = 2; +pub const SK_PROTOCOL_VERSION: u32 = 2; pub const UNKNOWN_SERVER_VERSION: u32 = 0; #[derive(Debug, Clone, Copy, Serialize, Deserialize, PartialEq, Eq, PartialOrd, Ord)] @@ -317,7 +317,14 @@ pub enum ProposerAcceptorMessage { impl ProposerAcceptorMessage { /// Parse proposer message. 
- pub fn parse(msg_bytes: Bytes) -> Result { + pub fn parse(msg_bytes: Bytes, proto_version: u32) -> Result { + if proto_version != SK_PROTOCOL_VERSION { + bail!( + "incompatible protocol version {}, expected {}", + proto_version, + SK_PROTOCOL_VERSION + ); + } // xxx using Reader is inefficient but easy to work with bincode let mut stream = msg_bytes.reader(); // u64 is here to avoid padding; it will be removed once we stop packing C structs into the wire as is diff --git a/safekeeper/src/timelines_global_map.rs b/safekeeper/src/timelines_global_map.rs index a701534f65..01c6aff6c3 100644 --- a/safekeeper/src/timelines_global_map.rs +++ b/safekeeper/src/timelines_global_map.rs @@ -13,6 +13,7 @@ use anyhow::{bail, Context, Result}; use camino::Utf8PathBuf; use camino_tempfile::Utf8TempDir; use safekeeper_api::membership::Configuration; +use safekeeper_api::models::SafekeeperUtilization; use safekeeper_api::ServerInfo; use serde::Serialize; use std::collections::HashMap; @@ -416,6 +417,20 @@ impl GlobalTimelines { .collect() } + /// Returns statistics about timeline counts + pub fn get_timeline_counts(&self) -> SafekeeperUtilization { + let global_lock = self.state.lock().unwrap(); + let timeline_count = global_lock + .timelines + .values() + .filter(|t| match t { + GlobalMapTimeline::CreationInProgress => false, + GlobalMapTimeline::Timeline(t) => !t.is_cancelled(), + }) + .count() as u64; + SafekeeperUtilization { timeline_count } + } + /// Returns all timelines belonging to a given tenant. Used for deleting all timelines of a tenant, /// and that's why it can return cancelled timelines, to retry deleting them. 
fn get_all_for_tenant(&self, tenant_id: TenantId) -> Vec> { diff --git a/safekeeper/tests/walproposer_sim/safekeeper.rs b/safekeeper/tests/walproposer_sim/safekeeper.rs index e0d593851e..0023a4d22a 100644 --- a/safekeeper/tests/walproposer_sim/safekeeper.rs +++ b/safekeeper/tests/walproposer_sim/safekeeper.rs @@ -15,7 +15,9 @@ use desim::{ }; use http::Uri; use safekeeper::{ - safekeeper::{ProposerAcceptorMessage, SafeKeeper, UNKNOWN_SERVER_VERSION}, + safekeeper::{ + ProposerAcceptorMessage, SafeKeeper, SK_PROTOCOL_VERSION, UNKNOWN_SERVER_VERSION, + }, state::{TimelinePersistentState, TimelineState}, timeline::TimelineError, wal_storage::Storage, @@ -285,7 +287,7 @@ impl ConnState { bail!("finished processing START_REPLICATION") } - let msg = ProposerAcceptorMessage::parse(copy_data)?; + let msg = ProposerAcceptorMessage::parse(copy_data, SK_PROTOCOL_VERSION)?; debug!("got msg: {:?}", msg); self.process(msg, global) } else { diff --git a/scripts/comment-test-report.js b/scripts/comment-test-report.js index e8e0b3c23a..96a0ea3267 100755 --- a/scripts/comment-test-report.js +++ b/scripts/comment-test-report.js @@ -84,6 +84,12 @@ const parseReportJson = async ({ reportJsonUrl, fetch }) => { } else { arch = "unknown" } + let lfcState = "" + if (test.parameters.includes("'with-lfc'")) { + lfcState = "with-lfc" + } else { + lfcState = "without-lfc" + } // Removing build type and PostgreSQL version from the test name to make it shorter const testName = test.name.replace(new RegExp(`${buildType}-pg${pgVersion}-?`), "").replace("[]", "") @@ -91,6 +97,7 @@ const parseReportJson = async ({ reportJsonUrl, fetch }) => { test.pgVersion = pgVersion test.buildType = buildType test.arch = arch + test.lfcState = lfcState if (test.status === "passed") { passedTests[pgVersion][testName].push(test) @@ -157,7 +164,7 @@ const reportSummary = async (params) => { const links = [] for (const test of tests) { const allureLink = `${reportUrl}#suites/${test.parentUid}/${test.uid}` - 
links.push(`[${test.buildType}-${test.arch}](${allureLink})`) + links.push(`[${test.buildType}-${test.arch}-${test.lfcState}](${allureLink})`) } summary += `- \`${testName}\`: ${links.join(", ")}\n` } @@ -188,7 +195,7 @@ const reportSummary = async (params) => { const links = [] for (const test of tests) { const allureLink = `${reportUrl}#suites/${test.parentUid}/${test.uid}/retries` - links.push(`[${test.buildType}-${test.arch}](${allureLink})`) + links.push(`[${test.buildType}-${test.arch}-${test.lfcState}](${allureLink})`) } summary += `- \`${testName}\`: ${links.join(", ")}\n` } diff --git a/scripts/ingest_regress_test_result-new-format.py b/scripts/ingest_regress_test_result-new-format.py index 064c516718..ad2baf56bb 100644 --- a/scripts/ingest_regress_test_result-new-format.py +++ b/scripts/ingest_regress_test_result-new-format.py @@ -134,7 +134,7 @@ def ingest_test_result( if p["name"].startswith("__") } arch = parameters.get("arch", "UNKNOWN").strip("'") - lfc = parameters.get("lfc", "False") == "True" + lfc = parameters.get("lfc", "without-lfc").strip("'") == "with-lfc" build_type, pg_version, unparametrized_name = parse_test_name(test["name"]) labels = {label["name"]: label["value"] for label in test["labels"]} diff --git a/storage_broker/src/bin/storage_broker.rs b/storage_broker/src/bin/storage_broker.rs index 1fbb651656..9d4c22484c 100644 --- a/storage_broker/src/bin/storage_broker.rs +++ b/storage_broker/src/bin/storage_broker.rs @@ -32,7 +32,6 @@ use tokio::sync::broadcast::error::RecvError; use tokio::time; use tonic::body::{self, empty_body, BoxBody}; use tonic::codegen::Service; -use tonic::transport::server::Connected; use tonic::Code; use tonic::{Request, Response, Status}; use tracing::*; @@ -459,9 +458,10 @@ impl BrokerService for Broker { &self, request: Request>, ) -> Result, Status> { - let remote_addr = request - .remote_addr() - .expect("TCPConnectInfo inserted by handler"); + let &RemoteAddr(remote_addr) = request + .extensions() + 
.get() + .expect("RemoteAddr inserted by handler"); let mut publisher = self.registry.register_publisher(remote_addr); let mut stream = request.into_inner(); @@ -484,9 +484,10 @@ impl BrokerService for Broker { &self, request: Request, ) -> Result, Status> { - let remote_addr = request - .remote_addr() - .expect("TCPConnectInfo inserted by handler"); + let &RemoteAddr(remote_addr) = request + .extensions() + .get() + .expect("RemoteAddr inserted by handler"); let proto_key = request .into_inner() .subscription_key @@ -537,9 +538,10 @@ impl BrokerService for Broker { &self, request: Request, ) -> std::result::Result, Status> { - let remote_addr = request - .remote_addr() - .expect("TCPConnectInfo inserted by handler"); + let &RemoteAddr(remote_addr) = request + .extensions() + .get() + .expect("RemoteAddr inserted by handler"); let proto_filter = request.into_inner(); let ttid_filter = proto_filter.tenant_timeline_id.as_ref(); @@ -628,6 +630,9 @@ async fn http1_handler( Ok(resp) } +#[derive(Clone, Copy)] +struct RemoteAddr(SocketAddr); + #[tokio::main] async fn main() -> Result<(), Box> { let args = Args::parse(); @@ -687,13 +692,13 @@ async fn main() -> Result<(), Box> { .max_concurrent_streams(None); let storage_broker_server_cloned = storage_broker_server.clone(); - let connect_info = stream.connect_info(); + let remote_addr = RemoteAddr(addr); let service_fn_ = async move { service_fn(move |mut req| { // That's what tonic's MakeSvc.call does to pass conninfo to // the request handler (and where its request.remote_addr() // expects it to find). - req.extensions_mut().insert(connect_info.clone()); + req.extensions_mut().insert(remote_addr); // Technically this second clone is not needed, but consume // by async block is apparently unavoidable. 
BTW, error diff --git a/storage_controller/Cargo.toml b/storage_controller/Cargo.toml index caaa22d0a5..9860bd5d0e 100644 --- a/storage_controller/Cargo.toml +++ b/storage_controller/Cargo.toml @@ -45,12 +45,11 @@ strum_macros.workspace = true diesel = { version = "2.2.6", features = [ "serde_json", - "postgres", - "r2d2", "chrono", ] } +diesel-async = { version = "0.5.2", features = ["postgres", "bb8", "async-connection-wrapper"] } diesel_migrations = { version = "2.2.0" } -r2d2 = { version = "0.8.10" } +scoped-futures = "0.1.4" utils = { path = "../libs/utils/" } metrics = { path = "../libs/metrics/" } diff --git a/storage_controller/migrations/2025-01-15-181207_safekeepers_disabled_to_pause/down.sql b/storage_controller/migrations/2025-01-15-181207_safekeepers_disabled_to_pause/down.sql new file mode 100644 index 0000000000..3c7126e343 --- /dev/null +++ b/storage_controller/migrations/2025-01-15-181207_safekeepers_disabled_to_pause/down.sql @@ -0,0 +1,2 @@ +ALTER TABLE safekeepers ALTER COLUMN scheduling_policy SET DEFAULT 'disabled'; +UPDATE safekeepers SET scheduling_policy = 'disabled' WHERE scheduling_policy = 'pause'; diff --git a/storage_controller/migrations/2025-01-15-181207_safekeepers_disabled_to_pause/up.sql b/storage_controller/migrations/2025-01-15-181207_safekeepers_disabled_to_pause/up.sql new file mode 100644 index 0000000000..9ff75444f3 --- /dev/null +++ b/storage_controller/migrations/2025-01-15-181207_safekeepers_disabled_to_pause/up.sql @@ -0,0 +1,2 @@ +ALTER TABLE safekeepers ALTER COLUMN scheduling_policy SET DEFAULT 'pause'; +UPDATE safekeepers SET scheduling_policy = 'pause' WHERE scheduling_policy = 'disabled'; diff --git a/storage_controller/src/main.rs b/storage_controller/src/main.rs index 801409d612..659c088d51 100644 --- a/storage_controller/src/main.rs +++ b/storage_controller/src/main.rs @@ -308,7 +308,7 @@ async fn async_main() -> anyhow::Result<()> { // Validate that we can connect to the database 
Persistence::await_connection(&secrets.database_url, args.db_connect_timeout.into()).await?; - let persistence = Arc::new(Persistence::new(secrets.database_url)); + let persistence = Arc::new(Persistence::new(secrets.database_url).await); let service = Service::spawn(config, persistence.clone()).await?; diff --git a/storage_controller/src/pageserver_client.rs b/storage_controller/src/pageserver_client.rs index b19cbc4fa3..141ff6f720 100644 --- a/storage_controller/src/pageserver_client.rs +++ b/storage_controller/src/pageserver_client.rs @@ -2,8 +2,9 @@ use pageserver_api::{ models::{ detach_ancestor::AncestorDetached, LocationConfig, LocationConfigListResponse, PageserverUtilization, SecondaryProgress, TenantScanRemoteStorageResponse, - TenantShardSplitRequest, TenantShardSplitResponse, TimelineArchivalConfigRequest, - TimelineCreateRequest, TimelineInfo, TopTenantShardsRequest, TopTenantShardsResponse, + TenantShardSplitRequest, TenantShardSplitResponse, TenantWaitLsnRequest, + TimelineArchivalConfigRequest, TimelineCreateRequest, TimelineInfo, TopTenantShardsRequest, + TopTenantShardsResponse, }, shard::TenantShardId, }; @@ -299,4 +300,17 @@ impl PageserverClient { self.inner.top_tenant_shards(request).await ) } + + pub(crate) async fn wait_lsn( + &self, + tenant_shard_id: TenantShardId, + request: TenantWaitLsnRequest, + ) -> Result { + measured_request!( + "wait_lsn", + crate::metrics::Method::Post, + &self.node_id_label, + self.inner.wait_lsn(tenant_shard_id, request).await + ) + } } diff --git a/storage_controller/src/persistence.rs b/storage_controller/src/persistence.rs index 37bfaf1139..35eb15b297 100644 --- a/storage_controller/src/persistence.rs +++ b/storage_controller/src/persistence.rs @@ -5,9 +5,12 @@ use std::time::Duration; use std::time::Instant; use self::split_state::SplitState; -use diesel::pg::PgConnection; use diesel::prelude::*; -use diesel::Connection; +use diesel_async::async_connection_wrapper::AsyncConnectionWrapper; +use 
diesel_async::pooled_connection::bb8::Pool; +use diesel_async::pooled_connection::AsyncDieselConnectionManager; +use diesel_async::RunQueryDsl; +use diesel_async::{AsyncConnection, AsyncPgConnection}; use itertools::Itertools; use pageserver_api::controller_api::AvailabilityZone; use pageserver_api::controller_api::MetadataHealthRecord; @@ -20,6 +23,7 @@ use pageserver_api::shard::ShardConfigError; use pageserver_api::shard::ShardIdentity; use pageserver_api::shard::ShardStripeSize; use pageserver_api::shard::{ShardCount, ShardNumber, TenantShardId}; +use scoped_futures::ScopedBoxFuture; use serde::{Deserialize, Serialize}; use utils::generation::Generation; use utils::id::{NodeId, TenantId}; @@ -60,7 +64,7 @@ const MIGRATIONS: EmbeddedMigrations = embed_migrations!("./migrations"); /// updated, and reads of nodes are always from memory, not the database. We only require that /// we can UPDATE a node's scheduling mode reasonably quickly to mark a bad node offline. pub struct Persistence { - connection_pool: diesel::r2d2::Pool>, + connection_pool: Pool, } /// Legacy format, for use in JSON compat objects in test environment @@ -76,7 +80,7 @@ pub(crate) enum DatabaseError { #[error(transparent)] Connection(#[from] diesel::result::ConnectionError), #[error(transparent)] - ConnectionPool(#[from] r2d2::Error), + ConnectionPool(#[from] diesel_async::pooled_connection::bb8::RunError), #[error("Logical error: {0}")] Logical(String), #[error("Migration error: {0}")] @@ -124,6 +128,7 @@ pub(crate) enum AbortShardSplitStatus { pub(crate) type DatabaseResult = Result; /// Some methods can operate on either a whole tenant or a single shard +#[derive(Clone)] pub(crate) enum TenantFilter { Tenant(TenantId), Shard(TenantShardId), @@ -136,6 +141,11 @@ pub(crate) struct ShardGenerationState { pub(crate) generation_pageserver: Option, } +// A generous allowance for how many times we may retry serializable transactions +// before giving up. 
This is not expected to be hit: it is a defensive measure in case we +// somehow engineer a situation where duelling transactions might otherwise live-lock. +const MAX_RETRIES: usize = 128; + impl Persistence { // The default postgres connection limit is 100. We use up to 99, to leave one free for a human admin under // normal circumstances. This assumes we have exclusive use of the database cluster to which we connect. @@ -145,12 +155,12 @@ impl Persistence { const IDLE_CONNECTION_TIMEOUT: Duration = Duration::from_secs(10); const MAX_CONNECTION_LIFETIME: Duration = Duration::from_secs(60); - pub fn new(database_url: String) -> Self { - let manager = diesel::r2d2::ConnectionManager::::new(database_url); + pub async fn new(database_url: String) -> Self { + let manager = AsyncDieselConnectionManager::::new(database_url); // We will use a connection pool: this is primarily to _limit_ our connection count, rather than to optimize time // to execute queries (database queries are not generally on latency-sensitive paths). 
- let connection_pool = diesel::r2d2::Pool::builder() + let connection_pool = Pool::builder() .max_size(Self::MAX_CONNECTIONS) .max_lifetime(Some(Self::MAX_CONNECTION_LIFETIME)) .idle_timeout(Some(Self::IDLE_CONNECTION_TIMEOUT)) @@ -158,6 +168,7 @@ impl Persistence { .min_idle(Some(1)) .test_on_check_out(true) .build(manager) + .await .expect("Could not build connection pool"); Self { connection_pool } @@ -171,7 +182,7 @@ impl Persistence { ) -> Result<(), diesel::ConnectionError> { let started_at = Instant::now(); loop { - match PgConnection::establish(database_url) { + match AsyncPgConnection::establish(database_url).await { Ok(_) => { tracing::info!("Connected to database."); return Ok(()); @@ -192,57 +203,22 @@ impl Persistence { pub(crate) async fn migration_run(&self) -> DatabaseResult<()> { use diesel_migrations::{HarnessWithOutput, MigrationHarness}; - self.with_conn(move |conn| -> DatabaseResult<()> { - HarnessWithOutput::write_to_stdout(conn) - .run_pending_migrations(MIGRATIONS) - .map(|_| ()) - .map_err(|e| DatabaseError::Migration(e.to_string())) - }) - .await - } - - /// Wraps `with_conn` in order to collect latency and error metrics - async fn with_measured_conn(&self, op: DatabaseOperation, func: F) -> DatabaseResult - where - F: Fn(&mut PgConnection) -> DatabaseResult + Send + 'static, - R: Send + 'static, - { - let latency = &METRICS_REGISTRY - .metrics_group - .storage_controller_database_query_latency; - let _timer = latency.start_timer(DatabaseQueryLatencyLabelGroup { operation: op }); - - let res = self.with_conn(func).await; - - if let Err(err) = &res { - let error_counter = &METRICS_REGISTRY - .metrics_group - .storage_controller_database_query_error; - error_counter.inc(DatabaseQueryErrorLabelGroup { - error_type: err.error_label(), - operation: op, - }) - } - - res - } - - /// Call the provided function in a tokio blocking thread, with a Diesel database connection. 
- async fn with_conn(&self, func: F) -> DatabaseResult - where - F: Fn(&mut PgConnection) -> DatabaseResult + Send + 'static, - R: Send + 'static, - { - // A generous allowance for how many times we may retry serializable transactions - // before giving up. This is not expected to be hit: it is a defensive measure in case we - // somehow engineer a situation where duelling transactions might otherwise live-lock. - const MAX_RETRIES: usize = 128; - - let mut conn = self.connection_pool.get()?; - tokio::task::spawn_blocking(move || -> DatabaseResult { + // Can't use self.with_conn here as we do spawn_blocking which requires static. + let conn = self + .connection_pool + .dedicated_connection() + .await + .map_err(|e| DatabaseError::Migration(e.to_string()))?; + let mut async_wrapper: AsyncConnectionWrapper = + AsyncConnectionWrapper::from(conn); + tokio::task::spawn_blocking(move || { let mut retry_count = 0; loop { - match conn.build_transaction().serializable().run(|c| func(c)) { + let result = HarnessWithOutput::write_to_stdout(&mut async_wrapper) + .run_pending_migrations(MIGRATIONS) + .map(|_| ()) + .map_err(|e| DatabaseError::Migration(e.to_string())); + match result { Ok(r) => break Ok(r), Err( err @ DatabaseError::Query(diesel::result::Error::DatabaseError( @@ -271,33 +247,112 @@ impl Persistence { } }) .await - .expect("Task panic") + .map_err(|e| DatabaseError::Migration(e.to_string()))??; + Ok(()) + } + + /// Wraps `with_conn` in order to collect latency and error metrics + async fn with_measured_conn<'a, 'b, F, R>( + &self, + op: DatabaseOperation, + func: F, + ) -> DatabaseResult + where + F: for<'r> Fn(&'r mut AsyncPgConnection) -> ScopedBoxFuture<'b, 'r, DatabaseResult> + + Send + + std::marker::Sync + + 'a, + R: Send + 'b, + { + let latency = &METRICS_REGISTRY + .metrics_group + .storage_controller_database_query_latency; + let _timer = latency.start_timer(DatabaseQueryLatencyLabelGroup { operation: op }); + + let res = self.with_conn(func).await; + + 
if let Err(err) = &res { + let error_counter = &METRICS_REGISTRY + .metrics_group + .storage_controller_database_query_error; + error_counter.inc(DatabaseQueryErrorLabelGroup { + error_type: err.error_label(), + operation: op, + }) + } + + res + } + + /// Call the provided function with a Diesel database connection in a retry loop + async fn with_conn<'a, 'b, F, R>(&self, func: F) -> DatabaseResult + where + F: for<'r> Fn(&'r mut AsyncPgConnection) -> ScopedBoxFuture<'b, 'r, DatabaseResult> + + Send + + std::marker::Sync + + 'a, + R: Send + 'b, + { + let mut retry_count = 0; + loop { + let mut conn = self.connection_pool.get().await?; + match conn + .build_transaction() + .serializable() + .run(|c| func(c)) + .await + { + Ok(r) => break Ok(r), + Err( + err @ DatabaseError::Query(diesel::result::Error::DatabaseError( + diesel::result::DatabaseErrorKind::SerializationFailure, + _, + )), + ) => { + retry_count += 1; + if retry_count > MAX_RETRIES { + tracing::error!( + "Exceeded max retries on SerializationFailure errors: {err:?}" + ); + break Err(err); + } else { + // Retry on serialization errors: these are expected, because even though our + // transactions don't fight for the same rows, they will occasionally collide + // on index pages (e.g. 
increment_generation for unrelated shards can collide) + tracing::debug!("Retrying transaction on serialization failure {err:?}"); + continue; + } + } + Err(e) => break Err(e), + } + } } /// When a node is first registered, persist it before using it for anything pub(crate) async fn insert_node(&self, node: &Node) -> DatabaseResult<()> { - let np = node.to_persistent(); - self.with_measured_conn( - DatabaseOperation::InsertNode, - move |conn| -> DatabaseResult<()> { + let np = &node.to_persistent(); + self.with_measured_conn(DatabaseOperation::InsertNode, move |conn| { + Box::pin(async move { diesel::insert_into(crate::schema::nodes::table) - .values(&np) - .execute(conn)?; + .values(np) + .execute(conn) + .await?; Ok(()) - }, - ) + }) + }) .await } /// At startup, populate the list of nodes which our shards may be placed on pub(crate) async fn list_nodes(&self) -> DatabaseResult> { let nodes: Vec = self - .with_measured_conn( - DatabaseOperation::ListNodes, - move |conn| -> DatabaseResult<_> { - Ok(crate::schema::nodes::table.load::(conn)?) - }, - ) + .with_measured_conn(DatabaseOperation::ListNodes, move |conn| { + Box::pin(async move { + Ok(crate::schema::nodes::table + .load::(conn) + .await?) 
+ }) + }) .await?; tracing::info!("list_nodes: loaded {} nodes", nodes.len()); @@ -313,11 +368,14 @@ impl Persistence { use crate::schema::nodes::dsl::*; let updated = self .with_measured_conn(DatabaseOperation::UpdateNode, move |conn| { - let updated = diesel::update(nodes) - .filter(node_id.eq(input_node_id.0 as i64)) - .set((scheduling_policy.eq(String::from(input_scheduling)),)) - .execute(conn)?; - Ok(updated) + Box::pin(async move { + let updated = diesel::update(nodes) + .filter(node_id.eq(input_node_id.0 as i64)) + .set((scheduling_policy.eq(String::from(input_scheduling)),)) + .execute(conn) + .await?; + Ok(updated) + }) }) .await?; @@ -339,17 +397,16 @@ impl Persistence { &self, ) -> DatabaseResult> { use crate::schema::tenant_shards::dsl::*; - self.with_measured_conn( - DatabaseOperation::ListTenantShards, - move |conn| -> DatabaseResult<_> { + self.with_measured_conn(DatabaseOperation::ListTenantShards, move |conn| { + Box::pin(async move { let query = tenant_shards.filter( placement_policy.ne(serde_json::to_string(&PlacementPolicy::Detached).unwrap()), ); - let result = query.load::(conn)?; + let result = query.load::(conn).await?; Ok(result) - }, - ) + }) + }) .await } @@ -359,15 +416,14 @@ impl Persistence { filter_tenant_id: TenantId, ) -> DatabaseResult> { use crate::schema::tenant_shards::dsl::*; - self.with_measured_conn( - DatabaseOperation::LoadTenant, - move |conn| -> DatabaseResult<_> { + self.with_measured_conn(DatabaseOperation::LoadTenant, move |conn| { + Box::pin(async move { let query = tenant_shards.filter(tenant_id.eq(filter_tenant_id.to_string())); - let result = query.load::(conn)?; + let result = query.load::(conn).await?; Ok(result) - }, - ) + }) + }) .await } @@ -393,19 +449,22 @@ impl Persistence { }) .collect::>(); - self.with_measured_conn( - DatabaseOperation::InsertTenantShards, - move |conn| -> DatabaseResult<()> { + let shards = &shards; + let metadata_health_records = &metadata_health_records; + 
self.with_measured_conn(DatabaseOperation::InsertTenantShards, move |conn| { + Box::pin(async move { diesel::insert_into(tenant_shards::table) - .values(&shards) - .execute(conn)?; + .values(shards) + .execute(conn) + .await?; diesel::insert_into(metadata_health::table) - .values(&metadata_health_records) - .execute(conn)?; + .values(metadata_health_records) + .execute(conn) + .await?; Ok(()) - }, - ) + }) + }) .await } @@ -413,31 +472,31 @@ impl Persistence { /// the tenant from memory on this server. pub(crate) async fn delete_tenant(&self, del_tenant_id: TenantId) -> DatabaseResult<()> { use crate::schema::tenant_shards::dsl::*; - self.with_measured_conn( - DatabaseOperation::DeleteTenant, - move |conn| -> DatabaseResult<()> { + self.with_measured_conn(DatabaseOperation::DeleteTenant, move |conn| { + Box::pin(async move { // `metadata_health` status (if exists) is also deleted based on the cascade behavior. diesel::delete(tenant_shards) .filter(tenant_id.eq(del_tenant_id.to_string())) - .execute(conn)?; + .execute(conn) + .await?; Ok(()) - }, - ) + }) + }) .await } pub(crate) async fn delete_node(&self, del_node_id: NodeId) -> DatabaseResult<()> { use crate::schema::nodes::dsl::*; - self.with_measured_conn( - DatabaseOperation::DeleteNode, - move |conn| -> DatabaseResult<()> { + self.with_measured_conn(DatabaseOperation::DeleteNode, move |conn| { + Box::pin(async move { diesel::delete(nodes) .filter(node_id.eq(del_node_id.0 as i64)) - .execute(conn)?; + .execute(conn) + .await?; Ok(()) - }, - ) + }) + }) .await } @@ -454,34 +513,41 @@ impl Persistence { use crate::schema::tenant_shards::dsl::*; let updated = self .with_measured_conn(DatabaseOperation::ReAttach, move |conn| { - let rows_updated = diesel::update(tenant_shards) - .filter(generation_pageserver.eq(input_node_id.0 as i64)) - .set(generation.eq(generation + 1)) - .execute(conn)?; + Box::pin(async move { + let rows_updated = diesel::update(tenant_shards) + 
.filter(generation_pageserver.eq(input_node_id.0 as i64)) + .set(generation.eq(generation + 1)) + .execute(conn) + .await?; - tracing::info!("Incremented {} tenants' generations", rows_updated); + tracing::info!("Incremented {} tenants' generations", rows_updated); - // TODO: UPDATE+SELECT in one query + // TODO: UPDATE+SELECT in one query - let updated = tenant_shards - .filter(generation_pageserver.eq(input_node_id.0 as i64)) - .select(TenantShardPersistence::as_select()) - .load(conn)?; + let updated = tenant_shards + .filter(generation_pageserver.eq(input_node_id.0 as i64)) + .select(TenantShardPersistence::as_select()) + .load(conn) + .await?; - // If the node went through a drain and restart phase before re-attaching, - // then reset it's node scheduling policy to active. - diesel::update(nodes) - .filter(node_id.eq(input_node_id.0 as i64)) - .filter( - scheduling_policy - .eq(String::from(NodeSchedulingPolicy::PauseForRestart)) - .or(scheduling_policy.eq(String::from(NodeSchedulingPolicy::Draining))) - .or(scheduling_policy.eq(String::from(NodeSchedulingPolicy::Filling))), - ) - .set(scheduling_policy.eq(String::from(NodeSchedulingPolicy::Active))) - .execute(conn)?; + // If the node went through a drain and restart phase before re-attaching, + // then reset it's node scheduling policy to active. 
+ diesel::update(nodes) + .filter(node_id.eq(input_node_id.0 as i64)) + .filter( + scheduling_policy + .eq(String::from(NodeSchedulingPolicy::PauseForRestart)) + .or(scheduling_policy + .eq(String::from(NodeSchedulingPolicy::Draining))) + .or(scheduling_policy + .eq(String::from(NodeSchedulingPolicy::Filling))), + ) + .set(scheduling_policy.eq(String::from(NodeSchedulingPolicy::Active))) + .execute(conn) + .await?; - Ok(updated) + Ok(updated) + }) }) .await?; @@ -518,19 +584,22 @@ impl Persistence { use crate::schema::tenant_shards::dsl::*; let updated = self .with_measured_conn(DatabaseOperation::IncrementGeneration, move |conn| { - let updated = diesel::update(tenant_shards) - .filter(tenant_id.eq(tenant_shard_id.tenant_id.to_string())) - .filter(shard_number.eq(tenant_shard_id.shard_number.0 as i32)) - .filter(shard_count.eq(tenant_shard_id.shard_count.literal() as i32)) - .set(( - generation.eq(generation + 1), - generation_pageserver.eq(node_id.0 as i64), - )) - // TODO: only returning() the generation column - .returning(TenantShardPersistence::as_returning()) - .get_result(conn)?; + Box::pin(async move { + let updated = diesel::update(tenant_shards) + .filter(tenant_id.eq(tenant_shard_id.tenant_id.to_string())) + .filter(shard_number.eq(tenant_shard_id.shard_number.0 as i32)) + .filter(shard_count.eq(tenant_shard_id.shard_count.literal() as i32)) + .set(( + generation.eq(generation + 1), + generation_pageserver.eq(node_id.0 as i64), + )) + // TODO: only returning() the generation column + .returning(TenantShardPersistence::as_returning()) + .get_result(conn) + .await?; - Ok(updated) + Ok(updated) + }) }) .await?; @@ -562,12 +631,15 @@ impl Persistence { use crate::schema::tenant_shards::dsl::*; let rows = self .with_measured_conn(DatabaseOperation::TenantGenerations, move |conn| { - let result = tenant_shards - .filter(tenant_id.eq(filter_tenant_id.to_string())) - .select(TenantShardPersistence::as_select()) - .order(shard_number) - .load(conn)?; - 
Ok(result) + Box::pin(async move { + let result = tenant_shards + .filter(tenant_id.eq(filter_tenant_id.to_string())) + .select(TenantShardPersistence::as_select()) + .order(shard_number) + .load(conn) + .await?; + Ok(result) + }) }) .await?; @@ -615,15 +687,18 @@ impl Persistence { break; } + let in_clause = &in_clause; let chunk_rows = self .with_measured_conn(DatabaseOperation::ShardGenerations, move |conn| { - // diesel doesn't support multi-column IN queries, so we compose raw SQL. No escaping is required because - // the inputs are strongly typed and cannot carry any user-supplied raw string content. - let result : Vec = diesel::sql_query( - format!("SELECT * from tenant_shards where (tenant_id, shard_number, shard_count) in ({in_clause});").as_str() - ).load(conn)?; + Box::pin(async move { + // diesel doesn't support multi-column IN queries, so we compose raw SQL. No escaping is required because + // the inputs are strongly typed and cannot carry any user-supplied raw string content. 
+ let result : Vec = diesel::sql_query( + format!("SELECT * from tenant_shards where (tenant_id, shard_number, shard_count) in ({in_clause});").as_str() + ).load(conn).await?; - Ok(result) + Ok(result) + }) }) .await?; rows.extend(chunk_rows.into_iter()) @@ -657,51 +732,58 @@ impl Persistence { ) -> DatabaseResult<()> { use crate::schema::tenant_shards::dsl::*; + let tenant = &tenant; + let input_placement_policy = &input_placement_policy; + let input_config = &input_config; + let input_generation = &input_generation; + let input_scheduling_policy = &input_scheduling_policy; self.with_measured_conn(DatabaseOperation::UpdateTenantShard, move |conn| { - let query = match tenant { - TenantFilter::Shard(tenant_shard_id) => diesel::update(tenant_shards) - .filter(tenant_id.eq(tenant_shard_id.tenant_id.to_string())) - .filter(shard_number.eq(tenant_shard_id.shard_number.0 as i32)) - .filter(shard_count.eq(tenant_shard_id.shard_count.literal() as i32)) - .into_boxed(), - TenantFilter::Tenant(input_tenant_id) => diesel::update(tenant_shards) - .filter(tenant_id.eq(input_tenant_id.to_string())) - .into_boxed(), - }; + Box::pin(async move { + let query = match tenant { + TenantFilter::Shard(tenant_shard_id) => diesel::update(tenant_shards) + .filter(tenant_id.eq(tenant_shard_id.tenant_id.to_string())) + .filter(shard_number.eq(tenant_shard_id.shard_number.0 as i32)) + .filter(shard_count.eq(tenant_shard_id.shard_count.literal() as i32)) + .into_boxed(), + TenantFilter::Tenant(input_tenant_id) => diesel::update(tenant_shards) + .filter(tenant_id.eq(input_tenant_id.to_string())) + .into_boxed(), + }; - // Clear generation_pageserver if we are moving into a state where we won't have - // any attached pageservers. 
- let input_generation_pageserver = match input_placement_policy { - None | Some(PlacementPolicy::Attached(_)) => None, - Some(PlacementPolicy::Detached | PlacementPolicy::Secondary) => Some(None), - }; + // Clear generation_pageserver if we are moving into a state where we won't have + // any attached pageservers. + let input_generation_pageserver = match input_placement_policy { + None | Some(PlacementPolicy::Attached(_)) => None, + Some(PlacementPolicy::Detached | PlacementPolicy::Secondary) => Some(None), + }; - #[derive(AsChangeset)] - #[diesel(table_name = crate::schema::tenant_shards)] - struct ShardUpdate { - generation: Option, - placement_policy: Option, - config: Option, - scheduling_policy: Option, - generation_pageserver: Option>, - } + #[derive(AsChangeset)] + #[diesel(table_name = crate::schema::tenant_shards)] + struct ShardUpdate { + generation: Option, + placement_policy: Option, + config: Option, + scheduling_policy: Option, + generation_pageserver: Option>, + } - let update = ShardUpdate { - generation: input_generation.map(|g| g.into().unwrap() as i32), - placement_policy: input_placement_policy - .as_ref() - .map(|p| serde_json::to_string(&p).unwrap()), - config: input_config - .as_ref() - .map(|c| serde_json::to_string(&c).unwrap()), - scheduling_policy: input_scheduling_policy - .map(|p| serde_json::to_string(&p).unwrap()), - generation_pageserver: input_generation_pageserver, - }; + let update = ShardUpdate { + generation: input_generation.map(|g| g.into().unwrap() as i32), + placement_policy: input_placement_policy + .as_ref() + .map(|p| serde_json::to_string(&p).unwrap()), + config: input_config + .as_ref() + .map(|c| serde_json::to_string(&c).unwrap()), + scheduling_policy: input_scheduling_policy + .map(|p| serde_json::to_string(&p).unwrap()), + generation_pageserver: input_generation_pageserver, + }; - query.set(update).execute(conn)?; + query.set(update).execute(conn).await?; - Ok(()) + Ok(()) + }) }) .await?; @@ -715,23 +797,27 @@ 
impl Persistence { ) -> DatabaseResult)>> { use crate::schema::tenant_shards::dsl::*; + let preferred_azs = preferred_azs.as_slice(); self.with_measured_conn(DatabaseOperation::SetPreferredAzs, move |conn| { - let mut shards_updated = Vec::default(); + Box::pin(async move { + let mut shards_updated = Vec::default(); - for (tenant_shard_id, preferred_az) in preferred_azs.iter() { - let updated = diesel::update(tenant_shards) - .filter(tenant_id.eq(tenant_shard_id.tenant_id.to_string())) - .filter(shard_number.eq(tenant_shard_id.shard_number.0 as i32)) - .filter(shard_count.eq(tenant_shard_id.shard_count.literal() as i32)) - .set(preferred_az_id.eq(preferred_az.as_ref().map(|az| az.0.clone()))) - .execute(conn)?; + for (tenant_shard_id, preferred_az) in preferred_azs.iter() { + let updated = diesel::update(tenant_shards) + .filter(tenant_id.eq(tenant_shard_id.tenant_id.to_string())) + .filter(shard_number.eq(tenant_shard_id.shard_number.0 as i32)) + .filter(shard_count.eq(tenant_shard_id.shard_count.literal() as i32)) + .set(preferred_az_id.eq(preferred_az.as_ref().map(|az| az.0.clone()))) + .execute(conn) + .await?; - if updated == 1 { - shards_updated.push((*tenant_shard_id, preferred_az.clone())); + if updated == 1 { + shards_updated.push((*tenant_shard_id, preferred_az.clone())); + } } - } - Ok(shards_updated) + Ok(shards_updated) + }) }) .await } @@ -739,17 +825,21 @@ impl Persistence { pub(crate) async fn detach(&self, tenant_shard_id: TenantShardId) -> anyhow::Result<()> { use crate::schema::tenant_shards::dsl::*; self.with_measured_conn(DatabaseOperation::Detach, move |conn| { - let updated = diesel::update(tenant_shards) - .filter(tenant_id.eq(tenant_shard_id.tenant_id.to_string())) - .filter(shard_number.eq(tenant_shard_id.shard_number.0 as i32)) - .filter(shard_count.eq(tenant_shard_id.shard_count.literal() as i32)) - .set(( - generation_pageserver.eq(Option::::None), - placement_policy.eq(serde_json::to_string(&PlacementPolicy::Detached).unwrap()), - )) - 
.execute(conn)?; + Box::pin(async move { + let updated = diesel::update(tenant_shards) + .filter(tenant_id.eq(tenant_shard_id.tenant_id.to_string())) + .filter(shard_number.eq(tenant_shard_id.shard_number.0 as i32)) + .filter(shard_count.eq(tenant_shard_id.shard_count.literal() as i32)) + .set(( + generation_pageserver.eq(Option::::None), + placement_policy + .eq(serde_json::to_string(&PlacementPolicy::Detached).unwrap()), + )) + .execute(conn) + .await?; - Ok(updated) + Ok(updated) + }) }) .await?; @@ -768,14 +858,16 @@ impl Persistence { parent_to_children: Vec<(TenantShardId, Vec)>, ) -> DatabaseResult<()> { use crate::schema::tenant_shards::dsl::*; - self.with_measured_conn(DatabaseOperation::BeginShardSplit, move |conn| -> DatabaseResult<()> { + let parent_to_children = parent_to_children.as_slice(); + self.with_measured_conn(DatabaseOperation::BeginShardSplit, move |conn| { + Box::pin(async move { // Mark parent shards as splitting let updated = diesel::update(tenant_shards) .filter(tenant_id.eq(split_tenant_id.to_string())) .filter(shard_count.eq(old_shard_count.literal() as i32)) .set((splitting.eq(1),)) - .execute(conn)?; + .execute(conn).await?; if u8::try_from(updated) .map_err(|_| DatabaseError::Logical( format!("Overflow existing shard count {} while splitting", updated)) @@ -788,7 +880,7 @@ impl Persistence { } // FIXME: spurious clone to sidestep closure move rules - let parent_to_children = parent_to_children.clone(); + let parent_to_children = parent_to_children.to_vec(); // Insert child shards for (parent_shard_id, children) in parent_to_children { @@ -796,7 +888,7 @@ impl Persistence { .filter(tenant_id.eq(parent_shard_id.tenant_id.to_string())) .filter(shard_number.eq(parent_shard_id.shard_number.0 as i32)) .filter(shard_count.eq(parent_shard_id.shard_count.literal() as i32)) - .load::(conn)?; + .load::(conn).await?; let parent = if parent.len() != 1 { return Err(DatabaseError::Logical(format!( "Parent shard {parent_shard_id} not found" @@ 
-811,12 +903,13 @@ impl Persistence { debug_assert!(shard.splitting == SplitState::Splitting); diesel::insert_into(tenant_shards) .values(shard) - .execute(conn)?; + .execute(conn).await?; } } Ok(()) }) + }) .await } @@ -828,25 +921,26 @@ impl Persistence { old_shard_count: ShardCount, ) -> DatabaseResult<()> { use crate::schema::tenant_shards::dsl::*; - self.with_measured_conn( - DatabaseOperation::CompleteShardSplit, - move |conn| -> DatabaseResult<()> { + self.with_measured_conn(DatabaseOperation::CompleteShardSplit, move |conn| { + Box::pin(async move { // Drop parent shards diesel::delete(tenant_shards) .filter(tenant_id.eq(split_tenant_id.to_string())) .filter(shard_count.eq(old_shard_count.literal() as i32)) - .execute(conn)?; + .execute(conn) + .await?; // Clear sharding flag let updated = diesel::update(tenant_shards) .filter(tenant_id.eq(split_tenant_id.to_string())) .set((splitting.eq(0),)) - .execute(conn)?; + .execute(conn) + .await?; debug_assert!(updated > 0); Ok(()) - }, - ) + }) + }) .await } @@ -858,15 +952,15 @@ impl Persistence { new_shard_count: ShardCount, ) -> DatabaseResult { use crate::schema::tenant_shards::dsl::*; - self.with_measured_conn( - DatabaseOperation::AbortShardSplit, - move |conn| -> DatabaseResult { + self.with_measured_conn(DatabaseOperation::AbortShardSplit, move |conn| { + Box::pin(async move { // Clear the splitting state on parent shards let updated = diesel::update(tenant_shards) .filter(tenant_id.eq(split_tenant_id.to_string())) .filter(shard_count.ne(new_shard_count.literal() as i32)) .set((splitting.eq(0),)) - .execute(conn)?; + .execute(conn) + .await?; // Parent shards are already gone: we cannot abort. 
if updated == 0 { @@ -886,11 +980,12 @@ impl Persistence { diesel::delete(tenant_shards) .filter(tenant_id.eq(split_tenant_id.to_string())) .filter(shard_count.eq(new_shard_count.literal() as i32)) - .execute(conn)?; + .execute(conn) + .await?; Ok(AbortShardSplitStatus::Aborted) - }, - ) + }) + }) .await } @@ -906,25 +1001,28 @@ impl Persistence { ) -> DatabaseResult<()> { use crate::schema::metadata_health::dsl::*; - self.with_measured_conn( - DatabaseOperation::UpdateMetadataHealth, - move |conn| -> DatabaseResult<_> { + let healthy_records = healthy_records.as_slice(); + let unhealthy_records = unhealthy_records.as_slice(); + self.with_measured_conn(DatabaseOperation::UpdateMetadataHealth, move |conn| { + Box::pin(async move { diesel::insert_into(metadata_health) - .values(&healthy_records) + .values(healthy_records) .on_conflict((tenant_id, shard_number, shard_count)) .do_update() .set((healthy.eq(true), last_scrubbed_at.eq(now))) - .execute(conn)?; + .execute(conn) + .await?; diesel::insert_into(metadata_health) - .values(&unhealthy_records) + .values(unhealthy_records) .on_conflict((tenant_id, shard_number, shard_count)) .do_update() .set((healthy.eq(false), last_scrubbed_at.eq(now))) - .execute(conn)?; + .execute(conn) + .await?; Ok(()) - }, - ) + }) + }) .await } @@ -933,15 +1031,13 @@ impl Persistence { pub(crate) async fn list_metadata_health_records( &self, ) -> DatabaseResult> { - self.with_measured_conn( - DatabaseOperation::ListMetadataHealth, - move |conn| -> DatabaseResult<_> { - Ok( - crate::schema::metadata_health::table - .load::(conn)?, - ) - }, - ) + self.with_measured_conn(DatabaseOperation::ListMetadataHealth, move |conn| { + Box::pin(async { + Ok(crate::schema::metadata_health::table + .load::(conn) + .await?) 
+ }) + }) .await } @@ -953,10 +1049,15 @@ impl Persistence { use crate::schema::metadata_health::dsl::*; self.with_measured_conn( DatabaseOperation::ListMetadataHealthUnhealthy, - move |conn| -> DatabaseResult<_> { - Ok(crate::schema::metadata_health::table - .filter(healthy.eq(false)) - .load::(conn)?) + move |conn| { + Box::pin(async { + DatabaseResult::Ok( + crate::schema::metadata_health::table + .filter(healthy.eq(false)) + .load::(conn) + .await?, + ) + }) }, ) .await @@ -970,15 +1071,14 @@ impl Persistence { ) -> DatabaseResult> { use crate::schema::metadata_health::dsl::*; - self.with_measured_conn( - DatabaseOperation::ListMetadataHealthOutdated, - move |conn| -> DatabaseResult<_> { + self.with_measured_conn(DatabaseOperation::ListMetadataHealthOutdated, move |conn| { + Box::pin(async move { let query = metadata_health.filter(last_scrubbed_at.lt(earlier)); - let res = query.load::(conn)?; + let res = query.load::(conn).await?; Ok(res) - }, - ) + }) + }) .await } @@ -986,12 +1086,13 @@ impl Persistence { /// It is an error for the table to contain more than one entry. pub(crate) async fn get_leader(&self) -> DatabaseResult> { let mut leader: Vec = self - .with_measured_conn( - DatabaseOperation::GetLeader, - move |conn| -> DatabaseResult<_> { - Ok(crate::schema::controllers::table.load::(conn)?) - }, - ) + .with_measured_conn(DatabaseOperation::GetLeader, move |conn| { + Box::pin(async move { + Ok(crate::schema::controllers::table + .load::(conn) + .await?) 
+ }) + }) .await?; if leader.len() > 1 { @@ -1014,26 +1115,33 @@ impl Persistence { use crate::schema::controllers::dsl::*; let updated = self - .with_measured_conn( - DatabaseOperation::UpdateLeader, - move |conn| -> DatabaseResult { + .with_measured_conn(DatabaseOperation::UpdateLeader, move |conn| { + let prev = prev.clone(); + let new = new.clone(); + Box::pin(async move { let updated = match &prev { - Some(prev) => diesel::update(controllers) - .filter(address.eq(prev.address.clone())) - .filter(started_at.eq(prev.started_at)) - .set(( - address.eq(new.address.clone()), - started_at.eq(new.started_at), - )) - .execute(conn)?, - None => diesel::insert_into(controllers) - .values(new.clone()) - .execute(conn)?, + Some(prev) => { + diesel::update(controllers) + .filter(address.eq(prev.address.clone())) + .filter(started_at.eq(prev.started_at)) + .set(( + address.eq(new.address.clone()), + started_at.eq(new.started_at), + )) + .execute(conn) + .await? + } + None => { + diesel::insert_into(controllers) + .values(new.clone()) + .execute(conn) + .await? + } }; Ok(updated) - }, - ) + }) + }) .await?; if updated == 0 { @@ -1048,12 +1156,13 @@ impl Persistence { /// At startup, populate the list of nodes which our shards may be placed on pub(crate) async fn list_safekeepers(&self) -> DatabaseResult> { let safekeepers: Vec = self - .with_measured_conn( - DatabaseOperation::ListNodes, - move |conn| -> DatabaseResult<_> { - Ok(crate::schema::safekeepers::table.load::(conn)?) - }, - ) + .with_measured_conn(DatabaseOperation::ListNodes, move |conn| { + Box::pin(async move { + Ok(crate::schema::safekeepers::table + .load::(conn) + .await?) 
+ }) + }) .await?; tracing::info!("list_safekeepers: loaded {} nodes", safekeepers.len()); @@ -1066,11 +1175,14 @@ impl Persistence { id: i64, ) -> Result { use crate::schema::safekeepers::dsl::{id as id_column, safekeepers}; - self.with_conn(move |conn| -> DatabaseResult { - Ok(safekeepers - .filter(id_column.eq(&id)) - .select(SafekeeperPersistence::as_select()) - .get_result(conn)?) + self.with_conn(move |conn| { + Box::pin(async move { + Ok(safekeepers + .filter(id_column.eq(&id)) + .select(SafekeeperPersistence::as_select()) + .get_result(conn) + .await?) + }) }) .await } @@ -1081,26 +1193,30 @@ impl Persistence { ) -> Result<(), DatabaseError> { use crate::schema::safekeepers::dsl::*; - self.with_conn(move |conn| -> DatabaseResult<()> { - let bind = record - .as_insert_or_update() - .map_err(|e| DatabaseError::Logical(format!("{e}")))?; + self.with_conn(move |conn| { + let record = record.clone(); + Box::pin(async move { + let bind = record + .as_insert_or_update() + .map_err(|e| DatabaseError::Logical(format!("{e}")))?; - let inserted_updated = diesel::insert_into(safekeepers) - .values(&bind) - .on_conflict(id) - .do_update() - .set(&bind) - .execute(conn)?; + let inserted_updated = diesel::insert_into(safekeepers) + .values(&bind) + .on_conflict(id) + .do_update() + .set(&bind) + .execute(conn) + .await?; - if inserted_updated != 1 { - return Err(DatabaseError::Logical(format!( - "unexpected number of rows ({})", - inserted_updated - ))); - } + if inserted_updated != 1 { + return Err(DatabaseError::Logical(format!( + "unexpected number of rows ({})", + inserted_updated + ))); + } - Ok(()) + Ok(()) + }) }) .await } @@ -1112,26 +1228,29 @@ impl Persistence { ) -> Result<(), DatabaseError> { use crate::schema::safekeepers::dsl::*; - self.with_conn(move |conn| -> DatabaseResult<()> { - #[derive(Insertable, AsChangeset)] - #[diesel(table_name = crate::schema::safekeepers)] - struct UpdateSkSchedulingPolicy<'a> { - id: i64, - scheduling_policy: &'a str, - } - 
let scheduling_policy_ = String::from(scheduling_policy_); + self.with_conn(move |conn| { + Box::pin(async move { + #[derive(Insertable, AsChangeset)] + #[diesel(table_name = crate::schema::safekeepers)] + struct UpdateSkSchedulingPolicy<'a> { + id: i64, + scheduling_policy: &'a str, + } + let scheduling_policy_ = String::from(scheduling_policy_); - let rows_affected = diesel::update(safekeepers.filter(id.eq(id_))) - .set(scheduling_policy.eq(scheduling_policy_)) - .execute(conn)?; + let rows_affected = diesel::update(safekeepers.filter(id.eq(id_))) + .set(scheduling_policy.eq(scheduling_policy_)) + .execute(conn) + .await?; - if rows_affected != 1 { - return Err(DatabaseError::Logical(format!( - "unexpected number of rows ({rows_affected})", - ))); - } + if rows_affected != 1 { + return Err(DatabaseError::Logical(format!( + "unexpected number of rows ({rows_affected})", + ))); + } - Ok(()) + Ok(()) + }) }) .await } diff --git a/storage_controller/src/reconciler.rs b/storage_controller/src/reconciler.rs index adced3b77d..03db947263 100644 --- a/storage_controller/src/reconciler.rs +++ b/storage_controller/src/reconciler.rs @@ -3,7 +3,7 @@ use crate::persistence::Persistence; use crate::{compute_hook, service}; use pageserver_api::controller_api::{AvailabilityZone, PlacementPolicy}; use pageserver_api::models::{ - LocationConfig, LocationConfigMode, LocationConfigSecondary, TenantConfig, + LocationConfig, LocationConfigMode, LocationConfigSecondary, TenantConfig, TenantWaitLsnRequest, }; use pageserver_api::shard::{ShardIdentity, TenantShardId}; use pageserver_client::mgmt_api; @@ -348,6 +348,32 @@ impl Reconciler { Ok(()) } + async fn wait_lsn( + &self, + node: &Node, + tenant_shard_id: TenantShardId, + timelines: HashMap, + ) -> Result { + const TIMEOUT: Duration = Duration::from_secs(10); + + let client = PageserverClient::new( + node.get_id(), + node.base_url(), + self.service_config.jwt_token.as_deref(), + ); + + client + .wait_lsn( + tenant_shard_id, + 
TenantWaitLsnRequest { + timelines, + timeout: TIMEOUT, + }, + ) + .await + .map_err(|e| e.into()) + } + async fn get_lsns( &self, tenant_shard_id: TenantShardId, @@ -461,6 +487,39 @@ impl Reconciler { node: &Node, baseline: HashMap, ) -> anyhow::Result<()> { + // Signal to the pageserver that it should ingest up to the baseline LSNs. + loop { + match self.wait_lsn(node, tenant_shard_id, baseline.clone()).await { + Ok(StatusCode::OK) => { + // Everything is caught up + return Ok(()); + } + Ok(StatusCode::ACCEPTED) => { + // Some timelines are not caught up yet. + // They'll be polled below. + break; + } + Ok(StatusCode::NOT_FOUND) => { + // None of the timelines are present on the pageserver. + // This is correct if they've all been deleted, but + // let let the polling loop below cross check. + break; + } + Ok(status_code) => { + tracing::warn!( + "Unexpected status code ({status_code}) returned by wait_lsn endpoint" + ); + break; + } + Err(e) => { + tracing::info!("🕑 Can't trigger LSN wait on {node} yet, waiting ({e})",); + tokio::time::sleep(Duration::from_millis(500)).await; + continue; + } + } + } + + // Poll the LSNs until they catch up loop { let latest = match self.get_lsns(tenant_shard_id, node).await { Ok(l) => l, diff --git a/storage_controller/src/service.rs b/storage_controller/src/service.rs index 1d85839881..9ac9ee17ca 100644 --- a/storage_controller/src/service.rs +++ b/storage_controller/src/service.rs @@ -5411,6 +5411,15 @@ impl Service { expect_shards.sort_by_key(|tsp| (tsp.tenant_id.clone(), tsp.shard_number, tsp.shard_count)); + // Because JSON contents of persistent tenants might disagree with the fields in current `TenantConfig` + // definition, we will do an encode/decode cycle to ensure any legacy fields are dropped and any new + // fields are added, before doing a comparison. 
+ for tsp in &mut persistent_shards { + let config: TenantConfig = serde_json::from_str(&tsp.config) + .map_err(|e| ApiError::InternalServerError(e.into()))?; + tsp.config = serde_json::to_string(&config).expect("Encoding config is infallible"); + } + if persistent_shards != expect_shards { tracing::error!("Consistency check failed on shards."); @@ -7270,19 +7279,14 @@ impl Service { Ok(()) } - /// Create a node fill plan (pick secondaries to promote) that meets the following requirements: - /// 1. The node should be filled until it reaches the expected cluster average of - /// attached shards. If there are not enough secondaries on the node, the plan stops early. - /// 2. Select tenant shards to promote such that the number of attached shards is balanced - /// throughout the cluster. We achieve this by picking tenant shards from each node, - /// starting from the ones with the largest number of attached shards, until the node - /// reaches the expected cluster average. - /// 3. Avoid promoting more shards of the same tenant than required. The upper bound - /// for the number of tenants from the same shard promoted to the node being filled is: - /// shard count for the tenant divided by the number of nodes in the cluster. + /// Create a node fill plan (pick secondaries to promote), based on: + /// 1. Shards which have a secondary on this node, and this node is in their home AZ, and are currently attached to a node + /// outside their home AZ, should be migrated back here. + /// 2. If after step 1 we have not migrated enough shards for this node to have its fair share of + /// attached shards, we will promote more shards from the nodes with the most attached shards, unless + /// those shards have a home AZ that doesn't match the node we're filling. 
fn fill_node_plan(&self, node_id: NodeId) -> Vec { let mut locked = self.inner.write().unwrap(); - let fill_requirement = locked.scheduler.compute_fill_requirement(node_id); let (nodes, tenants, _scheduler) = locked.parts_mut(); let node_az = nodes @@ -7291,53 +7295,79 @@ impl Service { .get_availability_zone_id() .clone(); - let mut tids_by_node = tenants - .iter_mut() - .filter_map(|(tid, tenant_shard)| { - if !matches!( - tenant_shard.get_scheduling_policy(), - ShardSchedulingPolicy::Active - ) { - // Only include tenants in fills if they have a normal (Active) scheduling policy. We - // even exclude Essential, because moving to fill a node is not essential to keeping this - // tenant available. - return None; - } + // The tenant shard IDs that we plan to promote from secondary to attached on this node + let mut plan = Vec::new(); - // AZ check: when filling nodes after a restart, our intent is to move _back_ the - // shards which belong on this node, not to promote shards whose scheduling preference - // would be on their currently attached node. So will avoid promoting shards whose - // home AZ doesn't match the AZ of the node we're filling. - match tenant_shard.preferred_az() { - None => { - // Shard doesn't have an AZ preference: it is elegible to be moved. - } - Some(az) if az == &node_az => { - // This shard's home AZ is equal to the node we're filling: it is - // elegible to be moved: fall through; - } - Some(_) => { - // This shard's home AZ is somewhere other than the node we're filling: - // do not include it in the fill plan. - return None; - } - } + // Collect shards which do not have a preferred AZ & are elegible for moving in stage 2 + let mut free_tids_by_node: HashMap> = HashMap::new(); - if tenant_shard.intent.get_secondary().contains(&node_id) { + // Don't respect AZ preferences if there is only one AZ. This comes up in tests, but it could + // conceivably come up in real life if deploying a single-AZ region intentionally. 
+ let respect_azs = nodes + .values() + .map(|n| n.get_availability_zone_id()) + .unique() + .count() + > 1; + + // Step 1: collect all shards that we are required to migrate back to this node because their AZ preference + // requires it. + for (tsid, tenant_shard) in tenants { + if !tenant_shard.intent.get_secondary().contains(&node_id) { + // Shard doesn't have a secondary on this node, ignore it. + continue; + } + + // AZ check: when filling nodes after a restart, our intent is to move _back_ the + // shards which belong on this node, not to promote shards whose scheduling preference + // would be on their currently attached node. So will avoid promoting shards whose + // home AZ doesn't match the AZ of the node we're filling. + match tenant_shard.preferred_az() { + _ if !respect_azs => { if let Some(primary) = tenant_shard.intent.get_attached() { - return Some((*primary, *tid)); + free_tids_by_node.entry(*primary).or_default().push(*tsid); } } + None => { + // Shard doesn't have an AZ preference: it is elegible to be moved, but we + // will only do so if our target shard count requires it. + if let Some(primary) = tenant_shard.intent.get_attached() { + free_tids_by_node.entry(*primary).or_default().push(*tsid); + } + } + Some(az) if az == &node_az => { + // This shard's home AZ is equal to the node we're filling: it should + // be moved back to this node as part of filling, unless its currently + // attached location is also in its home AZ. + if let Some(primary) = tenant_shard.intent.get_attached() { + if nodes + .get(primary) + .expect("referenced node must exist") + .get_availability_zone_id() + != tenant_shard + .preferred_az() + .expect("tenant must have an AZ preference") + { + plan.push(*tsid) + } + } else { + plan.push(*tsid) + } + } + Some(_) => { + // This shard's home AZ is somewhere other than the node we're filling, + // it may not be moved back to this node as part of filling. 
Ignore it + } + } + } - None - }) - .into_group_map(); + // Step 2: also promote any AZ-agnostic shards as required to achieve the target number of attachments + let fill_requirement = locked.scheduler.compute_fill_requirement(node_id); let expected_attached = locked.scheduler.expected_attached_shard_count(); let nodes_by_load = locked.scheduler.nodes_by_attached_shard_count(); let mut promoted_per_tenant: HashMap = HashMap::new(); - let mut plan = Vec::new(); for (node_id, attached) in nodes_by_load { let available = locked.nodes.get(&node_id).is_some_and(|n| n.is_available()); @@ -7346,7 +7376,7 @@ impl Service { } if plan.len() >= fill_requirement - || tids_by_node.is_empty() + || free_tids_by_node.is_empty() || attached <= expected_attached { break; @@ -7358,7 +7388,7 @@ impl Service { let mut remove_node = false; while take > 0 { - match tids_by_node.get_mut(&node_id) { + match free_tids_by_node.get_mut(&node_id) { Some(tids) => match tids.pop() { Some(tid) => { let max_promote_for_tenant = std::cmp::max( @@ -7384,7 +7414,7 @@ impl Service { } if remove_node { - tids_by_node.remove(&node_id); + free_tids_by_node.remove(&node_id); } } diff --git a/storage_controller/src/service/chaos_injector.rs b/storage_controller/src/service/chaos_injector.rs index 0e551beaa7..98034421d6 100644 --- a/storage_controller/src/service/chaos_injector.rs +++ b/storage_controller/src/service/chaos_injector.rs @@ -1,11 +1,17 @@ -use std::{sync::Arc, time::Duration}; +use std::{ + collections::{BTreeMap, HashMap}, + sync::Arc, + time::Duration, +}; use pageserver_api::controller_api::ShardSchedulingPolicy; use rand::seq::SliceRandom; use rand::thread_rng; use tokio_util::sync::CancellationToken; +use utils::id::NodeId; +use utils::shard::TenantShardId; -use super::Service; +use super::{Node, Scheduler, Service, TenantShard}; pub struct ChaosInjector { service: Arc, @@ -35,50 +41,86 @@ impl ChaosInjector { } } + /// If a shard has a secondary and attached location, then re-assign the 
secondary to be + /// attached and the attached to be secondary. + /// + /// Only modifies tenants if they're in Active scheduling policy. + fn maybe_migrate_to_secondary( + &self, + tenant_shard_id: TenantShardId, + nodes: &Arc>, + tenants: &mut BTreeMap, + scheduler: &mut Scheduler, + ) { + let shard = tenants + .get_mut(&tenant_shard_id) + .expect("Held lock between choosing ID and this get"); + + if !matches!(shard.get_scheduling_policy(), ShardSchedulingPolicy::Active) { + // Skip non-active scheduling policies, so that a shard with a policy like Pause can + // be pinned without being disrupted by us. + tracing::info!( + "Skipping shard {tenant_shard_id}: scheduling policy is {:?}", + shard.get_scheduling_policy() + ); + return; + } + + // Pick a secondary to promote + let Some(new_location) = shard + .intent + .get_secondary() + .choose(&mut thread_rng()) + .cloned() + else { + tracing::info!( + "Skipping shard {tenant_shard_id}: no secondary location, can't migrate" + ); + return; + }; + + let Some(old_location) = *shard.intent.get_attached() else { + tracing::info!("Skipping shard {tenant_shard_id}: currently has no attached location"); + return; + }; + + tracing::info!("Injecting chaos: migrate {tenant_shard_id} {old_location}->{new_location}"); + + shard.intent.demote_attached(scheduler, old_location); + shard.intent.promote_attached(scheduler, new_location); + self.service.maybe_reconcile_shard(shard, nodes); + } + async fn inject_chaos(&mut self) { // Pick some shards to interfere with let batch_size = 128; let mut inner = self.service.inner.write().unwrap(); let (nodes, tenants, scheduler) = inner.parts_mut(); let tenant_ids = tenants.keys().cloned().collect::>(); - let victims = tenant_ids.choose_multiple(&mut thread_rng(), batch_size); - for victim in victims { - let shard = tenants - .get_mut(victim) - .expect("Held lock between choosing ID and this get"); - - if !matches!(shard.get_scheduling_policy(), ShardSchedulingPolicy::Active) { - // Skip 
non-active scheduling policies, so that a shard with a policy like Pause can - // be pinned without being disrupted by us. - tracing::info!( - "Skipping shard {victim}: scheduling policy is {:?}", - shard.get_scheduling_policy() - ); - continue; + // Prefer to migrate tenants that are currently outside their home AZ. This avoids the chaos injector + // continuously pushing tenants outside their home AZ: instead, we'll tend to cycle between picking some + // random tenants to move, and then on next chaos iteration moving them back, then picking some new + // random tenants on the next iteration. + let mut victims = Vec::with_capacity(batch_size); + for shard in tenants.values() { + if shard.is_attached_outside_preferred_az(nodes) { + victims.push(shard.tenant_shard_id); } - // Pick a secondary to promote - let Some(new_location) = shard - .intent - .get_secondary() - .choose(&mut thread_rng()) - .cloned() - else { - tracing::info!("Skipping shard {victim}: no secondary location, can't migrate"); - continue; - }; + if victims.len() >= batch_size { + break; + } + } - let Some(old_location) = *shard.intent.get_attached() else { - tracing::info!("Skipping shard {victim}: currently has no attached location"); - continue; - }; + let choose_random = batch_size.saturating_sub(victims.len()); + tracing::info!("Injecting chaos: found {} shards to migrate back to home AZ, picking {choose_random} random shards to migrate", victims.len()); - tracing::info!("Injecting chaos: migrate {victim} {old_location}->{new_location}"); + let random_victims = tenant_ids.choose_multiple(&mut thread_rng(), choose_random); + victims.extend(random_victims); - shard.intent.demote_attached(scheduler, old_location); - shard.intent.promote_attached(scheduler, new_location); - self.service.maybe_reconcile_shard(shard, nodes); + for victim in victims { + self.maybe_migrate_to_secondary(victim, nodes, tenants, scheduler); } } } diff --git a/storage_controller/src/tenant_shard.rs 
b/storage_controller/src/tenant_shard.rs index 79ed628c25..cbc2696b26 100644 --- a/storage_controller/src/tenant_shard.rs +++ b/storage_controller/src/tenant_shard.rs @@ -1793,6 +1793,23 @@ impl TenantShard { } } } + + /// Returns true if the tenant shard is attached to a node that is outside the preferred AZ. + /// + /// If the shard does not have a preferred AZ, returns false. + pub(crate) fn is_attached_outside_preferred_az(&self, nodes: &HashMap) -> bool { + self.intent + .get_attached() + .map(|node_id| { + Some( + nodes + .get(&node_id) + .expect("referenced node exists") + .get_availability_zone_id(), + ) == self.intent.preferred_az_id.as_ref() + }) + .unwrap_or(false) + } } impl Drop for TenantShard { diff --git a/storage_scrubber/src/pageserver_physical_gc.rs b/storage_scrubber/src/pageserver_physical_gc.rs index a997373375..063c6bcfb9 100644 --- a/storage_scrubber/src/pageserver_physical_gc.rs +++ b/storage_scrubber/src/pageserver_physical_gc.rs @@ -8,6 +8,8 @@ use crate::checks::{ }; use crate::metadata_stream::{stream_tenant_timelines, stream_tenants}; use crate::{init_remote, BucketConfig, NodeKind, RootTarget, TenantShardTimelineId, MAX_RETRIES}; +use async_stream::try_stream; +use futures::future::Either; use futures_util::{StreamExt, TryStreamExt}; use pageserver::tenant::remote_timeline_client::index::LayerFileMetadata; use pageserver::tenant::remote_timeline_client::manifest::OffloadedTimelineManifest; @@ -578,7 +580,7 @@ async fn gc_timeline( target: &RootTarget, mode: GcMode, ttid: TenantShardTimelineId, - accumulator: &Arc>, + accumulator: &std::sync::Mutex, tenant_manifest_info: Arc>, ) -> anyhow::Result { let mut summary = GcSummary::default(); @@ -721,9 +723,9 @@ pub async fn pageserver_physical_gc( let remote_client = Arc::new(remote_client); let tenants = if tenant_shard_ids.is_empty() { - futures::future::Either::Left(stream_tenants(&remote_client, &target)) + Either::Left(stream_tenants(&remote_client, &target)) } else { - 
futures::future::Either::Right(futures::stream::iter(tenant_shard_ids.into_iter().map(Ok))) + Either::Right(futures::stream::iter(tenant_shard_ids.into_iter().map(Ok))) }; // How many tenants to process in parallel. We need to be mindful of pageservers @@ -731,16 +733,16 @@ pub async fn pageserver_physical_gc( const CONCURRENCY: usize = 32; // Accumulate information about each tenant for cross-shard GC step we'll do at the end - let accumulator = Arc::new(std::sync::Mutex::new(TenantRefAccumulator::default())); + let accumulator = std::sync::Mutex::new(TenantRefAccumulator::default()); + + // Accumulate information about how many manifests we have GCd + let manifest_gc_summary = std::sync::Mutex::new(GcSummary::default()); // Generate a stream of TenantTimelineId - enum GcSummaryOrContent { - Content(T), - GcSummary(GcSummary), - } let timelines = tenants.map_ok(|tenant_shard_id| { let target_ref = ⌖ let remote_client_ref = &remote_client; + let manifest_gc_summary_ref = &manifest_gc_summary; async move { let gc_manifest_result = gc_tenant_manifests( remote_client_ref, @@ -757,55 +759,48 @@ pub async fn pageserver_physical_gc( (GcSummary::default(), None) } }; + manifest_gc_summary_ref + .lock() + .unwrap() + .merge(summary_from_manifest); let tenant_manifest_arc = Arc::new(tenant_manifest_opt); - let summary_from_manifest = Ok(GcSummaryOrContent::<(_, _)>::GcSummary( - summary_from_manifest, - )); - stream_tenant_timelines(remote_client_ref, target_ref, tenant_shard_id) - .await - .map(|stream| { - stream - .zip(futures::stream::iter(std::iter::repeat( - tenant_manifest_arc, - ))) - .map(|(ttid_res, tenant_manifest_arc)| { - ttid_res.map(move |ttid| { - GcSummaryOrContent::Content((ttid, tenant_manifest_arc)) - }) - }) - .chain(futures::stream::iter([summary_from_manifest].into_iter())) - }) + let mut timelines = Box::pin( + stream_tenant_timelines(remote_client_ref, target_ref, tenant_shard_id).await?, + ); + Ok(try_stream! 
{ + while let Some(ttid_res) = timelines.next().await { + let ttid = ttid_res?; + yield (ttid, tenant_manifest_arc.clone()); + } + }) } }); - let timelines = std::pin::pin!(timelines.try_buffered(CONCURRENCY)); - let timelines = timelines.try_flatten(); let mut summary = GcSummary::default(); - - // Drain futures for per-shard GC, populating accumulator as a side effect { - let timelines = timelines.map_ok(|summary_or_ttid| match summary_or_ttid { - GcSummaryOrContent::Content((ttid, tenant_manifest_arc)) => { - futures::future::Either::Left(gc_timeline( - &remote_client, - &min_age, - &target, - mode, - ttid, - &accumulator, - tenant_manifest_arc, - )) - } - GcSummaryOrContent::GcSummary(gc_summary) => { - futures::future::Either::Right(futures::future::ok(gc_summary)) - } + let timelines = std::pin::pin!(timelines.try_buffered(CONCURRENCY)); + let timelines = timelines.try_flatten(); + + let timelines = timelines.map_ok(|(ttid, tenant_manifest_arc)| { + gc_timeline( + &remote_client, + &min_age, + &target, + mode, + ttid, + &accumulator, + tenant_manifest_arc, + ) }); let mut timelines = std::pin::pin!(timelines.try_buffered(CONCURRENCY)); + // Drain futures for per-shard GC, populating accumulator as a side effect while let Some(i) = timelines.next().await { summary.merge(i?); } } + // Streams are lazily evaluated, so only now do we have access to the inner object + summary.merge(manifest_gc_summary.into_inner().unwrap()); // Execute cross-shard GC, using the accumulator's full view of all the shards built in the per-shard GC let Some(client) = controller_client else { @@ -813,8 +808,7 @@ pub async fn pageserver_physical_gc( return Ok(summary); }; - let (ancestor_shards, ancestor_refs) = Arc::into_inner(accumulator) - .unwrap() + let (ancestor_shards, ancestor_refs) = accumulator .into_inner() .unwrap() .into_gc_ancestors(client, &mut summary) diff --git a/test_runner/conftest.py b/test_runner/conftest.py index 9e32469d69..4b591d3316 100644 --- 
a/test_runner/conftest.py +++ b/test_runner/conftest.py @@ -15,4 +15,5 @@ pytest_plugins = ( "fixtures.compare_fixtures", "fixtures.slow", "fixtures.reruns", + "fixtures.fast_import", ) diff --git a/test_runner/fixtures/common_types.py b/test_runner/fixtures/common_types.py index 6c22b31e00..c82c7578d1 100644 --- a/test_runner/fixtures/common_types.py +++ b/test_runner/fixtures/common_types.py @@ -208,6 +208,10 @@ class ShardIndex: shard_count=int(input[2:4], 16), ) + @property + def is_sharded(self) -> bool: + return self.shard_count != 0 + class TenantShardId: def __init__(self, tenant_id: TenantId, shard_number: int, shard_count: int): diff --git a/test_runner/fixtures/endpoint/http.py b/test_runner/fixtures/endpoint/http.py index aa0d95fe80..6e8210e978 100644 --- a/test_runner/fixtures/endpoint/http.py +++ b/test_runner/fixtures/endpoint/http.py @@ -28,11 +28,6 @@ class EndpointHttpClient(requests.Session): res.raise_for_status() return res.text - def installed_extensions(self): - res = self.get(f"http://localhost:{self.port}/installed_extensions") - res.raise_for_status() - return res.json() - def extensions(self, extension: str, version: str, database: str): body = { "extension": extension, diff --git a/test_runner/fixtures/fast_import.py b/test_runner/fixtures/fast_import.py new file mode 100644 index 0000000000..33248132ab --- /dev/null +++ b/test_runner/fixtures/fast_import.py @@ -0,0 +1,104 @@ +import os +import shutil +import subprocess +import tempfile +from collections.abc import Iterator +from pathlib import Path + +import pytest + +from fixtures.log_helper import log +from fixtures.neon_cli import AbstractNeonCli +from fixtures.pg_version import PgVersion + + +class FastImport(AbstractNeonCli): + COMMAND = "fast_import" + cmd: subprocess.CompletedProcess[str] | None = None + + def __init__( + self, + extra_env: dict[str, str] | None, + binpath: Path, + pg_distrib_dir: Path, + pg_version: PgVersion, + workdir: Path, + ): + if extra_env is None: + 
env_vars = {} + else: + env_vars = extra_env.copy() + + if not (binpath / self.COMMAND).exists(): + raise Exception(f"{self.COMMAND} binary not found at '{binpath}'") + super().__init__(env_vars, binpath) + + pg_dir = pg_distrib_dir / pg_version.v_prefixed + self.pg_distrib_dir = pg_distrib_dir + self.pg_version = pg_version + self.pg_bin = pg_dir / "bin" + if not (self.pg_bin / "postgres").exists(): + raise Exception(f"postgres binary was not found at '{self.pg_bin}'") + self.pg_lib = pg_dir / "lib" + if env_vars.get("LD_LIBRARY_PATH") is not None: + self.pg_lib = Path(env_vars["LD_LIBRARY_PATH"]) + elif os.getenv("LD_LIBRARY_PATH") is not None: + self.pg_lib = Path(str(os.getenv("LD_LIBRARY_PATH"))) + if not workdir.exists(): + raise Exception(f"Working directory '{workdir}' does not exist") + self.workdir = workdir + + def run( + self, + pg_port: int, + source_connection_string: str | None = None, + s3prefix: str | None = None, + interactive: bool = False, + ) -> subprocess.CompletedProcess[str]: + if self.cmd is not None: + raise Exception("Command already executed") + args = [ + f"--pg-bin-dir={self.pg_bin}", + f"--pg-lib-dir={self.pg_lib}", + f"--pg-port={pg_port}", + f"--working-directory={self.workdir}", + ] + if source_connection_string is not None: + args.append(f"--source-connection-string={source_connection_string}") + if s3prefix is not None: + args.append(f"--s3-prefix={s3prefix}") + if interactive: + args.append("--interactive") + + self.cmd = self.raw_cli(args) + return self.cmd + + def __enter__(self): + return self + + def __exit__(self, *args): + if self.workdir.exists(): + shutil.rmtree(self.workdir) + + +@pytest.fixture(scope="function") +def fast_import( + pg_version: PgVersion, + test_output_dir: Path, + neon_binpath: Path, + pg_distrib_dir: Path, +) -> Iterator[FastImport]: + workdir = Path(tempfile.mkdtemp()) + with FastImport(None, neon_binpath, pg_distrib_dir, pg_version, workdir) as fi: + yield fi + + if fi.cmd is None: + return + + # 
dump stdout & stderr into test log dir + with open(test_output_dir / "fast_import.stdout", "w") as f: + f.write(fi.cmd.stdout) + with open(test_output_dir / "fast_import.stderr", "w") as f: + f.write(fi.cmd.stderr) + + log.info("Written logs to %s", test_output_dir) diff --git a/test_runner/fixtures/metrics.py b/test_runner/fixtures/metrics.py index fa541bad17..fd7e193778 100644 --- a/test_runner/fixtures/metrics.py +++ b/test_runner/fixtures/metrics.py @@ -126,12 +126,8 @@ PAGESERVER_GLOBAL_METRICS: tuple[str, ...] = ( "pageserver_page_cache_read_accesses_total", "pageserver_page_cache_size_current_bytes", "pageserver_page_cache_size_max_bytes", - "pageserver_getpage_reconstruct_seconds_bucket", - "pageserver_getpage_reconstruct_seconds_count", - "pageserver_getpage_reconstruct_seconds_sum", *[f"pageserver_basebackup_query_seconds_{x}" for x in ["bucket", "count", "sum"]], *histogram("pageserver_smgr_query_seconds_global"), - *histogram("pageserver_getpage_get_reconstruct_data_seconds"), *histogram("pageserver_wait_lsn_seconds"), *histogram("pageserver_remote_operation_seconds"), *histogram("pageserver_io_operations_seconds"), diff --git a/test_runner/fixtures/neon_cli.py b/test_runner/fixtures/neon_cli.py index adbd6414a7..33d422c590 100644 --- a/test_runner/fixtures/neon_cli.py +++ b/test_runner/fixtures/neon_cli.py @@ -523,6 +523,7 @@ class NeonLocalCli(AbstractNeonCli): remote_ext_config: str | None = None, pageserver_id: int | None = None, allow_multiple: bool = False, + create_test_user: bool = False, basebackup_request_tries: int | None = None, env: dict[str, str] | None = None, ) -> subprocess.CompletedProcess[str]: @@ -544,6 +545,8 @@ class NeonLocalCli(AbstractNeonCli): args.extend(["--pageserver-id", str(pageserver_id)]) if allow_multiple: args.extend(["--allow-multiple"]) + if create_test_user: + args.extend(["--create-test-user"]) res = self.raw_cli(args, extra_env_vars) res.check_returncode() diff --git a/test_runner/fixtures/neon_fixtures.py 
b/test_runner/fixtures/neon_fixtures.py index c3950e9bf7..7e3cc19829 100644 --- a/test_runner/fixtures/neon_fixtures.py +++ b/test_runner/fixtures/neon_fixtures.py @@ -313,6 +313,10 @@ class PgProtocol: """ return self.safe_psql(query, log_query=log_query)[0][0] + def show_timeline_id(self) -> TimelineId: + """SHOW neon.timeline_id""" + return TimelineId(cast("str", self.safe_psql("show neon.timeline_id")[0][0])) + class PageserverWalReceiverProtocol(StrEnum): VANILLA = "vanilla" @@ -370,6 +374,7 @@ class NeonEnvBuilder: pageserver_config_override: str | Callable[[dict[str, Any]], None] | None = None, num_safekeepers: int = 1, num_pageservers: int = 1, + num_azs: int = 1, # Use non-standard SK ids to check for various parsing bugs safekeepers_id_start: int = 0, # fsync is disabled by default to make the tests go faster @@ -386,6 +391,7 @@ class NeonEnvBuilder: storage_controller_port_override: int | None = None, pageserver_virtual_file_io_mode: str | None = None, pageserver_wal_receiver_protocol: PageserverWalReceiverProtocol | None = None, + pageserver_get_vectored_concurrent_io: str | None = None, ): self.repo_dir = repo_dir self.rust_log_override = rust_log_override @@ -401,6 +407,7 @@ class NeonEnvBuilder: self.pageserver_config_override = pageserver_config_override self.num_safekeepers = num_safekeepers self.num_pageservers = num_pageservers + self.num_azs = num_azs self.safekeepers_id_start = safekeepers_id_start self.safekeepers_enable_fsync = safekeepers_enable_fsync self.auth_enabled = auth_enabled @@ -424,6 +431,9 @@ class NeonEnvBuilder: self.storage_controller_config: dict[Any, Any] | None = None self.pageserver_virtual_file_io_engine: str | None = pageserver_virtual_file_io_engine + self.pageserver_get_vectored_concurrent_io: str | None = ( + pageserver_get_vectored_concurrent_io + ) self.pageserver_default_tenant_config_compaction_algorithm: dict[str, Any] | None = ( pageserver_default_tenant_config_compaction_algorithm @@ -450,6 +460,7 @@ class 
NeonEnvBuilder: self.test_name = test_name self.compatibility_neon_binpath = compatibility_neon_binpath self.compatibility_pg_distrib_dir = compatibility_pg_distrib_dir + self.test_may_use_compatibility_snapshot_binaries = False self.version_combination = combination self.mixdir = self.test_output_dir / "mixdir_neon" if self.version_combination is not None: @@ -461,6 +472,7 @@ class NeonEnvBuilder: ), "the environment variable COMPATIBILITY_POSTGRES_DISTRIB_DIR is required when using mixed versions" self.mixdir.mkdir(mode=0o755, exist_ok=True) self._mix_versions() + self.test_may_use_compatibility_snapshot_binaries = True def init_configs(self, default_remote_storage_if_missing: bool = True) -> NeonEnv: # Cannot create more than one environment from one builder @@ -990,6 +1002,7 @@ class NeonEnv: self.endpoints = EndpointFactory(self) self.safekeepers: list[Safekeeper] = [] self.pageservers: list[NeonPageserver] = [] + self.num_azs = config.num_azs self.broker = NeonBroker(self) self.pageserver_remote_storage = config.pageserver_remote_storage self.safekeepers_remote_storage = config.safekeepers_remote_storage @@ -1059,6 +1072,7 @@ class NeonEnv: self.pageserver_virtual_file_io_engine = config.pageserver_virtual_file_io_engine self.pageserver_virtual_file_io_mode = config.pageserver_virtual_file_io_mode self.pageserver_wal_receiver_protocol = config.pageserver_wal_receiver_protocol + self.pageserver_get_vectored_concurrent_io = config.pageserver_get_vectored_concurrent_io # Create the neon_local's `NeonLocalInitConf` cfg: dict[str, Any] = { @@ -1090,14 +1104,21 @@ class NeonEnv: http=self.port_distributor.get_port(), ) + # Availabilty zones may also be configured manually with `NeonEnvBuilder.pageserver_config_override` + if self.num_azs > 1: + # Round-robin assignment of AZ names like us-east-2a, us-east-2b, etc. 
+ az_prefix = DEFAULT_AZ_ID[:-1] + availability_zone = f"{az_prefix}{chr(ord('a') + (ps_id - 1) % self.num_azs)}" + else: + availability_zone = DEFAULT_AZ_ID + ps_cfg: dict[str, Any] = { "id": ps_id, "listen_pg_addr": f"localhost:{pageserver_port.pg}", "listen_http_addr": f"localhost:{pageserver_port.http}", "pg_auth_type": pg_auth_type, "http_auth_type": http_auth_type, - # Default which can be overriden with `NeonEnvBuilder.pageserver_config_override` - "availability_zone": DEFAULT_AZ_ID, + "availability_zone": availability_zone, # Disable pageserver disk syncs in tests: when running tests concurrently, this avoids # the pageserver taking a long time to start up due to syncfs flushing other tests' data "no_sync": True, @@ -1105,12 +1126,24 @@ class NeonEnv: # Batching (https://github.com/neondatabase/neon/issues/9377): # enable batching by default in tests and benchmarks. + ps_cfg["page_service_pipelining"] = { + "mode": "pipelined", + "execution": "concurrent-futures", + "max_batch_size": 32, + } + + # Concurrent IO (https://github.com/neondatabase/neon/issues/9378): + # enable concurrent IO by default in tests and benchmarks. # Compat tests are exempt because old versions fail to parse the new config. 
- if not config.compatibility_neon_binpath: - ps_cfg["page_service_pipelining"] = { - "mode": "pipelined", - "execution": "concurrent-futures", - "max_batch_size": 32, + get_vectored_concurrent_io = self.pageserver_get_vectored_concurrent_io + if config.test_may_use_compatibility_snapshot_binaries: + log.info( + "Forcing use of binary-built-in default to avoid forward-compatibility related test failures" + ) + get_vectored_concurrent_io = None + if get_vectored_concurrent_io is not None: + ps_cfg["get_vectored_concurrent_io"] = { + "mode": self.pageserver_get_vectored_concurrent_io, } if self.pageserver_virtual_file_io_engine is not None: @@ -1447,6 +1480,7 @@ def neon_simple_env( pageserver_virtual_file_io_engine: str, pageserver_default_tenant_config_compaction_algorithm: dict[str, Any] | None, pageserver_virtual_file_io_mode: str | None, + pageserver_get_vectored_concurrent_io: str | None, ) -> Iterator[NeonEnv]: """ Simple Neon environment, with 1 safekeeper and 1 pageserver. No authentication, no fsync. @@ -1479,6 +1513,7 @@ def neon_simple_env( pageserver_virtual_file_io_engine=pageserver_virtual_file_io_engine, pageserver_default_tenant_config_compaction_algorithm=pageserver_default_tenant_config_compaction_algorithm, pageserver_virtual_file_io_mode=pageserver_virtual_file_io_mode, + pageserver_get_vectored_concurrent_io=pageserver_get_vectored_concurrent_io, combination=combination, ) as builder: env = builder.init_start() @@ -1505,6 +1540,7 @@ def neon_env_builder( pageserver_default_tenant_config_compaction_algorithm: dict[str, Any] | None, record_property: Callable[[str, object], None], pageserver_virtual_file_io_mode: str | None, + pageserver_get_vectored_concurrent_io: str | None, ) -> Iterator[NeonEnvBuilder]: """ Fixture to create a Neon environment for test. 
@@ -1547,6 +1583,7 @@ def neon_env_builder( test_overlay_dir=test_overlay_dir, pageserver_default_tenant_config_compaction_algorithm=pageserver_default_tenant_config_compaction_algorithm, pageserver_virtual_file_io_mode=pageserver_virtual_file_io_mode, + pageserver_get_vectored_concurrent_io=pageserver_get_vectored_concurrent_io, ) as builder: yield builder # Propogate `preserve_database_files` to make it possible to use in other fixtures, @@ -3881,6 +3918,7 @@ class Endpoint(PgProtocol, LogUtils): pageserver_id: int | None = None, safekeepers: list[int] | None = None, allow_multiple: bool = False, + create_test_user: bool = False, basebackup_request_tries: int | None = None, env: dict[str, str] | None = None, ) -> Self: @@ -3902,6 +3940,7 @@ class Endpoint(PgProtocol, LogUtils): remote_ext_config=remote_ext_config, pageserver_id=pageserver_id, allow_multiple=allow_multiple, + create_test_user=create_test_user, basebackup_request_tries=basebackup_request_tries, env=env, ) @@ -4351,6 +4390,7 @@ class Safekeeper(LogUtils): "1s", "--eviction-min-resident", "10s", + "--wal-reader-fanout", ] self.extra_opts = extra_opts diff --git a/test_runner/fixtures/pageserver/allowed_errors.py b/test_runner/fixtures/pageserver/allowed_errors.py index 5059039678..748ac0d569 100755 --- a/test_runner/fixtures/pageserver/allowed_errors.py +++ b/test_runner/fixtures/pageserver/allowed_errors.py @@ -99,8 +99,11 @@ DEFAULT_PAGESERVER_ALLOWED_ERRORS = ( ".*WARN.*path=/v1/utilization .*request was dropped before completing", # Can happen during shutdown ".*scheduling deletion on drop failed: queue is in state Stopped.*", - # Too many frozen layers error is normal during intensive benchmarks - ".*too many frozen layers.*", + # L0 flush backpressure delays are expected under heavy ingest load. We want to exercise + # this backpressure in tests. 
+ ".*delaying layer flush by \\S+ for compaction backpressure.*", + ".*stalling layer flushes for compaction backpressure.*", + ".*layer roll waiting for flush due to compaction backpressure.*", ) diff --git a/test_runner/fixtures/parametrize.py b/test_runner/fixtures/parametrize.py index f57c0f801f..1acb1af23b 100644 --- a/test_runner/fixtures/parametrize.py +++ b/test_runner/fixtures/parametrize.py @@ -44,6 +44,11 @@ def pageserver_virtual_file_io_mode() -> str | None: return os.getenv("PAGESERVER_VIRTUAL_FILE_IO_MODE") +@pytest.fixture(scope="function", autouse=True) +def pageserver_get_vectored_concurrent_io() -> str | None: + return os.getenv("PAGESERVER_GET_VECTORED_CONCURRENT_IO") + + def get_pageserver_default_tenant_config_compaction_algorithm() -> dict[str, Any] | None: toml_table = os.getenv("PAGESERVER_DEFAULT_TENANT_CONFIG_COMPACTION_ALGORITHM") if toml_table is None: @@ -116,6 +121,8 @@ def pytest_runtest_makereport(*args, **kwargs): }.get(os.uname().machine, "UNKNOWN") arch = os.getenv("RUNNER_ARCH", uname_m) allure.dynamic.parameter("__arch", arch) - allure.dynamic.parameter("__lfc", os.getenv("USE_LFC") != "false") + allure.dynamic.parameter( + "__lfc", "with-lfc" if os.getenv("USE_LFC") != "false" else "without-lfc" + ) yield diff --git a/test_runner/performance/test_compaction.py b/test_runner/performance/test_compaction.py index 0cd1080fa7..eaa89ae754 100644 --- a/test_runner/performance/test_compaction.py +++ b/test_runner/performance/test_compaction.py @@ -75,6 +75,7 @@ def test_compaction_l0_memory(neon_compare: NeonCompare): # Initially disable compaction so that we will build up a stack of L0s "compaction_period": "0s", "gc_period": "0s", + "compaction_upper_limit": 12, } ) neon_compare.tenant = tenant_id @@ -91,6 +92,7 @@ def test_compaction_l0_memory(neon_compare: NeonCompare): tenant_conf = pageserver_http.tenant_config(tenant_id) assert tenant_conf.effective_config["checkpoint_distance"] == 256 * 1024 * 1024 assert 
tenant_conf.effective_config["compaction_threshold"] == 10 + assert tenant_conf.effective_config["compaction_upper_limit"] == 12 # Aim to write about 20 L0s, so that we will hit the limit on how many # to compact at once diff --git a/test_runner/performance/test_layer_map.py b/test_runner/performance/test_layer_map.py index 8a4ad2d399..efc7fa59db 100644 --- a/test_runner/performance/test_layer_map.py +++ b/test_runner/performance/test_layer_map.py @@ -23,12 +23,15 @@ def test_layer_map(neon_env_builder: NeonEnvBuilder, zenbenchmark): "checkpoint_distance": "16384", "compaction_period": "1 s", "compaction_threshold": "1", + "l0_flush_delay_threshold": "0", + "l0_flush_stall_threshold": "0", "compaction_target_size": "16384", } ) endpoint = env.endpoints.create_start("main", tenant_id=tenant) cur = endpoint.connect().cursor() + cur.execute("set log_statement = 'all'") cur.execute("create table t(x integer)") for _ in range(n_iters): cur.execute(f"insert into t values (generate_series(1,{n_records}))") diff --git a/test_runner/regress/test_attach_tenant_config.py b/test_runner/regress/test_attach_tenant_config.py index 45112fd67e..e88d245c8f 100644 --- a/test_runner/regress/test_attach_tenant_config.py +++ b/test_runner/regress/test_attach_tenant_config.py @@ -139,6 +139,10 @@ def test_fully_custom_config(positive_env: NeonEnv): fully_custom_config = { "compaction_period": "1h", "compaction_threshold": 13, + "compaction_upper_limit": 100, + "l0_flush_delay_threshold": 25, + "l0_flush_stall_threshold": 42, + "l0_flush_wait_upload": True, "compaction_target_size": 1048576, "checkpoint_distance": 10000, "checkpoint_timeout": "13m", @@ -176,6 +180,10 @@ def test_fully_custom_config(positive_env: NeonEnv): "type": "interpreted", "args": {"format": "bincode", "compression": {"zstd": {"level": 1}}}, }, + "rel_size_v2_enabled": True, + "gc_compaction_enabled": True, + "gc_compaction_initial_threshold_kb": 1024000, + "gc_compaction_ratio_percent": 200, } vps_http = 
env.storage_controller.pageserver_api() diff --git a/test_runner/regress/test_branch_and_gc.py b/test_runner/regress/test_branch_and_gc.py index fccfbc7f09..0e28231a86 100644 --- a/test_runner/regress/test_branch_and_gc.py +++ b/test_runner/regress/test_branch_and_gc.py @@ -64,6 +64,8 @@ def test_branch_and_gc(neon_simple_env: NeonEnv): # tweak the default settings to allow quickly create image layers and L1 layers "compaction_period": "1 s", "compaction_threshold": "2", + "l0_flush_delay_threshold": "20", + "l0_flush_stall_threshold": "40", "image_creation_threshold": "1", # Disable PITR, this test will set an explicit space-based GC limit "pitr_interval": "0 s", diff --git a/test_runner/regress/test_compaction.py b/test_runner/regress/test_compaction.py index d0a2349ccf..2edfc884ad 100644 --- a/test_runner/regress/test_compaction.py +++ b/test_runner/regress/test_compaction.py @@ -1,6 +1,8 @@ from __future__ import annotations import json +import math +import random import time from enum import StrEnum @@ -128,11 +130,6 @@ def test_pageserver_gc_compaction_smoke(neon_env_builder: NeonEnvBuilder, with_b } env = neon_env_builder.init_start(initial_tenant_conf=SMOKE_CONF) - env.pageserver.allowed_errors.append( - r".*failed to acquire partition lock during gc-compaction.*" - ) - env.pageserver.allowed_errors.append(r".*repartition() called concurrently.*") - tenant_id = env.initial_tenant timeline_id = env.initial_timeline @@ -147,11 +144,14 @@ def test_pageserver_gc_compaction_smoke(neon_env_builder: NeonEnvBuilder, with_b log.info("Writing initial data ...") workload.write_rows(row_count, env.pageserver.id) + ps_http.timeline_gc( + tenant_id, timeline_id, None + ) # Force refresh gc info to have gc_cutoff generated + child_workloads: list[Workload] = [] for i in range(1, churn_rounds + 1): - if i % 10 == 0: - log.info(f"Running churn round {i}/{churn_rounds} ...") + log.info(f"Running churn round {i}/{churn_rounds} ...") if i % 10 == 5 and with_branches == 
"with_branches": branch_name = f"child-{i}" branch_timeline_id = env.create_branch(branch_name) @@ -172,8 +172,10 @@ def test_pageserver_gc_compaction_smoke(neon_env_builder: NeonEnvBuilder, with_b "sub_compaction_max_job_size_mb": 16, }, ) - - workload.churn_rows(row_count, env.pageserver.id) + # do not wait for upload so that we can see if gc_compaction works well with data being ingested + workload.churn_rows(row_count, env.pageserver.id, upload=False) + time.sleep(1) + workload.validate(env.pageserver.id) def compaction_finished(): queue_depth = len(ps_http.timeline_compact_info(tenant_id, timeline_id)) @@ -197,6 +199,230 @@ def test_pageserver_gc_compaction_smoke(neon_env_builder: NeonEnvBuilder, with_b ps_http.timeline_gc(tenant_id, timeline_id, None) +@pytest.mark.parametrize( + "compaction_mode", + ["before_restart", "after_restart"], +) +def test_pageserver_gc_compaction_idempotent( + neon_env_builder: NeonEnvBuilder, compaction_mode: str +): + """ + Do gc-compaction twice without writing any new data and see if anything breaks. + We run this test in two modes: + - before_restart: run two gc-compactions before pageserver restart + - after_restart: run one gc-compaction before and one after pageserver restart + """ + SMOKE_CONF = { + # Run both gc and gc-compaction. + "gc_period": "5s", + "compaction_period": "5s", + # No PiTR interval and small GC horizon + "pitr_interval": "0s", + "gc_horizon": 1024, + "lsn_lease_length": "0s", + } + + env = neon_env_builder.init_start(initial_tenant_conf=SMOKE_CONF) + tenant_id = env.initial_tenant + timeline_id = env.initial_timeline + + # Only in testing mode: the warning is expected because we rewrite a layer file of different generations. + # We could potentially patch the sanity-check code to not emit the warning in the future. 
+ env.pageserver.allowed_errors.append(".*was unlinked but was not dangling.*") + + row_count = 10000 + + ps_http = env.pageserver.http_client() + + workload = Workload(env, tenant_id, timeline_id) + workload.init(env.pageserver.id) + + workload.write_rows(row_count, env.pageserver.id) + + child_workloads: list[Workload] = [] + + def compaction_finished(): + queue_depth = len(ps_http.timeline_compact_info(tenant_id, timeline_id)) + assert queue_depth == 0 + + workload.churn_rows(row_count, env.pageserver.id) + env.create_branch("child_branch") # so that we have a retain_lsn + workload.churn_rows(row_count, env.pageserver.id) + # compact 3 times if mode is before_restart + n_compactions = 3 if compaction_mode == "before_restart" else 1 + for _ in range(n_compactions): + # Force refresh gc info to have gc_cutoff generated + ps_http.timeline_gc(tenant_id, timeline_id, None) + ps_http.timeline_compact( + tenant_id, + timeline_id, + enhanced_gc_bottom_most_compaction=True, + body={ + "scheduled": True, + "sub_compaction": True, + "compact_key_range": { + "start": "000000000000000000000000000000000000", + "end": "030000000000000000000000000000000000", + }, + "sub_compaction_max_job_size_mb": 16, + }, + ) + wait_until(compaction_finished, timeout=60) + if compaction_mode == "after_restart": + env.pageserver.restart(True) + ps_http.timeline_gc( + tenant_id, timeline_id, None + ) # Force refresh gc info to have gc_cutoff generated + for _ in range(3): + ps_http.timeline_compact( + tenant_id, + timeline_id, + enhanced_gc_bottom_most_compaction=True, + body={ + "scheduled": True, + "sub_compaction": True, + "compact_key_range": { + "start": "000000000000000000000000000000000000", + "end": "030000000000000000000000000000000000", + }, + "sub_compaction_max_job_size_mb": 16, + }, + ) + wait_until(compaction_finished, timeout=60) + + # ensure gc_compaction is scheduled and it's actually running (instead of skipping due to no layers picked) + env.pageserver.assert_log_contains( + 
"scheduled_compact_timeline.*picked .* layers for compaction" + ) + + # ensure we hit the duplicated layer key warning at least once: we did two compactions consecutively, + # and the second one should have hit the duplicated layer key warning. + if compaction_mode == "before_restart": + env.pageserver.assert_log_contains("duplicated layer key in the same generation") + else: + env.pageserver.assert_log_contains("same layer key at different generation") + + log.info("Validating at workload end ...") + workload.validate(env.pageserver.id) + for child_workload in child_workloads: + log.info(f"Validating at branch {child_workload.branch_name}") + child_workload.validate(env.pageserver.id) + + # Run a legacy compaction+gc to ensure gc-compaction can coexist with legacy compaction. + ps_http.timeline_checkpoint(tenant_id, timeline_id, wait_until_uploaded=True) + ps_http.timeline_gc(tenant_id, timeline_id, None) + + +@skip_in_debug_build("only run with release build") +def test_pageserver_gc_compaction_interrupt(neon_env_builder: NeonEnvBuilder): + """ + Force interrupt a gc-compaction and see if anything breaks. + """ + SMOKE_CONF = { + # Run both gc and gc-compaction. + "gc_period": "5s", + "compaction_period": "5s", + # No PiTR interval and small GC horizon + "pitr_interval": "0s", + "gc_horizon": "1024", + "lsn_lease_length": "0s", + } + + env = neon_env_builder.init_start(initial_tenant_conf=SMOKE_CONF) + tenant_id = env.initial_tenant + timeline_id = env.initial_timeline + + # Only in testing mode: the warning is expected because we rewrite a layer file of different generations. + # We could potentially patch the sanity-check code to not emit the warning in the future. 
+ env.pageserver.allowed_errors.append(".*was unlinked but was not dangling.*") + + row_count = 10000 + churn_rounds = 20 + + ps_http = env.pageserver.http_client() + + workload = Workload(env, tenant_id, timeline_id) + workload.init(env.pageserver.id) + + log.info("Writing initial data ...") + workload.write_rows(row_count, env.pageserver.id) + + def compaction_finished(): + queue_depth = len(ps_http.timeline_compact_info(tenant_id, timeline_id)) + assert queue_depth == 0 + + expected_compaction_time_seconds = 5.0 + ps_http.timeline_gc( + tenant_id, timeline_id, None + ) # Force refresh gc info to have gc_cutoff generated + for i in range(1, churn_rounds + 1): + log.info(f"Running churn round {i}/{churn_rounds} ...") + workload.churn_rows(row_count, env.pageserver.id) + ps_http.timeline_compact( + tenant_id, + timeline_id, + enhanced_gc_bottom_most_compaction=True, + body={ + "scheduled": True, + "sub_compaction": True, + "compact_key_range": { + "start": "000000000000000000000000000000000000", + "end": "030000000000000000000000000000000000", + }, + "sub_compaction_max_job_size_mb": 16, + }, + ) + # sleep random seconds between 0 and max(compaction_time); if the result is 0, wait until the compaction is complete + # This would hopefully trigger the restart at different periods of the compaction: + # - while we are doing the compaction + # - while we finished the compaction but not yet uploaded the metadata + # - after we uploaded the metadata + time_to_sleep = random.randint(0, max(5, math.ceil(expected_compaction_time_seconds))) + if time_to_sleep == 0 or i == 1: + start = time.time() + wait_until(compaction_finished, timeout=60) + end = time.time() + expected_compaction_time_seconds = end - start + log.info( + f"expected_compaction_time_seconds updated to {expected_compaction_time_seconds} seconds" + ) + else: + time.sleep(time_to_sleep) + env.pageserver.restart(True) + ps_http.timeline_gc( + tenant_id, timeline_id, None + ) # Force refresh gc info to have 
gc_cutoff generated + ps_http.timeline_compact( + tenant_id, + timeline_id, + enhanced_gc_bottom_most_compaction=True, + body={ + "scheduled": True, + "sub_compaction": True, + "compact_key_range": { + "start": "000000000000000000000000000000000000", + "end": "030000000000000000000000000000000000", + }, + "sub_compaction_max_job_size_mb": 16, + }, + ) + workload.validate(env.pageserver.id) + + wait_until(compaction_finished, timeout=60) + + # ensure gc_compaction is scheduled and it's actually running (instead of skipping due to no layers picked) + env.pageserver.assert_log_contains( + "scheduled_compact_timeline.*picked .* layers for compaction" + ) + + log.info("Validating at workload end ...") + workload.validate(env.pageserver.id) + + # Run a legacy compaction+gc to ensure gc-compaction can coexist with legacy compaction. + ps_http.timeline_checkpoint(tenant_id, timeline_id, wait_until_uploaded=True) + ps_http.timeline_gc(tenant_id, timeline_id, None) + + # Stripe sizes in number of pages. TINY_STRIPES = 16 LARGE_STRIPES = 32768 @@ -237,7 +463,9 @@ def test_sharding_compaction( "pitr_interval": "0s", # disable background compaction and GC. We invoke it manually when we want it to happen. "gc_period": "0s", + "gc_horizon": f"{128 * 1024}", "compaction_period": "0s", + "lsn_lease_length": "0s", # create image layers eagerly: we want to exercise image layer creation in this test. 
"image_creation_threshold": "1", "image_layer_creation_check_threshold": 0, @@ -312,6 +540,8 @@ def test_sharding_compaction( for shard in env.storage_controller.locate(tenant_id): pageserver = env.get_pageserver(shard["node_id"]) tenant_shard_id = shard["shard_id"] + # Force refresh gc info to have gc_cutoff generated + pageserver.http_client().timeline_gc(tenant_shard_id, timeline_id, None) pageserver.http_client().timeline_compact( tenant_shard_id, timeline_id, diff --git a/test_runner/regress/test_compatibility.py b/test_runner/regress/test_compatibility.py index a6eaaf6c4c..cdc6c0053d 100644 --- a/test_runner/regress/test_compatibility.py +++ b/test_runner/regress/test_compatibility.py @@ -143,7 +143,7 @@ def test_create_snapshot( env = neon_env_builder.init_start( initial_tenant_conf={ - # Miniature layers to enable generating non-trivial layer map without writing lots of data + # Miniature layers to enable generating non-trivial layer map without writing lots of data. "checkpoint_distance": f"{128 * 1024}", "compaction_threshold": "1", "compaction_target_size": f"{128 * 1024}", @@ -251,6 +251,8 @@ def test_forward_compatibility( os.environ.get("ALLOW_FORWARD_COMPATIBILITY_BREAKAGE", "false").lower() == "true" ) + neon_env_builder.test_may_use_compatibility_snapshot_binaries = True + try: neon_env_builder.num_safekeepers = 3 diff --git a/test_runner/regress/test_compute_metrics.py b/test_runner/regress/test_compute_metrics.py index 5dcc93acff..99d41e410a 100644 --- a/test_runner/regress/test_compute_metrics.py +++ b/test_runner/regress/test_compute_metrics.py @@ -5,16 +5,22 @@ import os import shutil import sys from enum import StrEnum +from logging import debug from pathlib import Path from typing import TYPE_CHECKING, cast import pytest import requests import yaml +from fixtures.endpoint.http import EndpointHttpClient from fixtures.log_helper import log +from fixtures.metrics import parse_metrics from fixtures.paths import BASE_DIR, COMPUTE_CONFIG_DIR +from 
fixtures.utils import wait_until +from prometheus_client.samples import Sample if TYPE_CHECKING: + from collections.abc import Callable from types import TracebackType from typing import Self, TypedDict @@ -467,3 +473,88 @@ def test_perf_counters(neon_simple_env: NeonEnv): cur.execute("CREATE EXTENSION neon VERSION '1.5'") cur.execute("SELECT * FROM neon_perf_counters") cur.execute("SELECT * FROM neon_backend_perf_counters") + + +def collect_metric( + client: EndpointHttpClient, + name: str, + filter: dict[str, str], + predicate: Callable[[list[Sample]], bool], +) -> Callable[[], list[Sample]]: + """ + Call this function as the first argument to wait_until(). + """ + + def __collect_metric() -> list[Sample]: + resp = client.metrics() + debug("Metrics: %s", resp) + m = parse_metrics(resp) + samples = m.query_all(name, filter) + debug("Samples: %s", samples) + assert predicate(samples), "predicate failed" + return samples + + return __collect_metric + + +def test_compute_installed_extensions_metric(neon_simple_env: NeonEnv): + """ + Test that the compute_installed_extensions properly reports accurate + results. Important to note that currently this metric is only gathered on + compute start. + """ + env = neon_simple_env + + endpoint = env.endpoints.create_start("main") + + client = endpoint.http_client() + + def __has_plpgsql(samples: list[Sample]) -> bool: + """ + Check that plpgsql is installed in the template1 and postgres databases + """ + return len(samples) == 1 and samples[0].value == 2 + + wait_until( + collect_metric( + client, + "compute_installed_extensions", + {"extension_name": "plpgsql", "version": "1.0", "owned_by_superuser": "1"}, + __has_plpgsql, + ), + name="compute_installed_extensions", + ) + + # Install the neon extension, so we can check for it on the restart + endpoint.safe_psql("CREATE EXTENSION neon VERSION '1.0'") + + # The metric is only gathered on compute start, so restart to check if the + # neon extension will now be there. 
+ endpoint.stop() + endpoint.start() + + client = endpoint.http_client() + + def __has_neon(samples: list[Sample]) -> bool: + return len(samples) == 1 and samples[0].value == 1 + + wait_until( + collect_metric( + client, + "compute_installed_extensions", + {"extension_name": "neon", "version": "1.0", "owned_by_superuser": "1"}, + __has_neon, + ), + name="compute_installed_extensions", + ) + + # Double check that we also still have plpgsql + wait_until( + collect_metric( + client, + "compute_installed_extensions", + {"extension_name": "plpgsql", "version": "1.0", "owned_by_superuser": "1"}, + __has_plpgsql, + ), + name="compute_installed_extensions", + ) diff --git a/test_runner/regress/test_compute_migrations.py b/test_runner/regress/test_compute_migrations.py index 803702a6f8..0dbb187c39 100644 --- a/test_runner/regress/test_compute_migrations.py +++ b/test_runner/regress/test_compute_migrations.py @@ -5,6 +5,8 @@ from typing import TYPE_CHECKING, cast import pytest from fixtures.compute_migrations import COMPUTE_MIGRATIONS, NUM_COMPUTE_MIGRATIONS +from fixtures.metrics import parse_metrics +from fixtures.utils import wait_until if TYPE_CHECKING: from fixtures.neon_fixtures import NeonEnv @@ -23,7 +25,26 @@ def test_compute_migrations_retry(neon_simple_env: NeonEnv, compute_migrations_d for i in range(1, NUM_COMPUTE_MIGRATIONS + 1): endpoint.start(env={"FAILPOINTS": f"compute-migration=return({i})"}) - # Make sure that the migrations ran + # Check that migration failure is properly recorded in the metrics + # + # N.B. wait_for_migrations() only waits till the last successful + # migration is applied. It doesn't wait till the migration failure due + # to the failpoint. This opens a race for checking the metrics. To avoid + # this, we first wait until the migration failure metric is seen. 
+ def check_migration_failure_metrics(): + client = endpoint.http_client() + raw_metrics = client.metrics() + metrics = parse_metrics(raw_metrics) + failed_migration = metrics.query_all( + "compute_ctl_db_migration_failed_total", + ) + assert len(failed_migration) == 1 + for sample in failed_migration: + assert sample.value == 1 + + wait_until(check_migration_failure_metrics) + + # Make sure that all migrations before the failed one are applied endpoint.wait_for_migrations(wait_for=i - 1) # Confirm that we correctly recorded that in the diff --git a/test_runner/regress/test_download_extensions.py b/test_runner/regress/test_download_extensions.py index f18f4e78bd..d7e6e9de56 100644 --- a/test_runner/regress/test_download_extensions.py +++ b/test_runner/regress/test_download_extensions.py @@ -8,6 +8,7 @@ from typing import TYPE_CHECKING import pytest from fixtures.log_helper import log +from fixtures.metrics import parse_metrics from fixtures.neon_fixtures import ( NeonEnvBuilder, ) @@ -128,6 +129,17 @@ def test_remote_extensions( httpserver.check() + # Check that we properly recorded downloads in the metrics + client = endpoint.http_client() + raw_metrics = client.metrics() + metrics = parse_metrics(raw_metrics) + remote_ext_requests = metrics.query_all( + "compute_ctl_remote_ext_requests_total", + ) + assert len(remote_ext_requests) == 1 + for sample in remote_ext_requests: + assert sample.value == 1 + # TODO # 1. Test downloading remote library. @@ -137,7 +149,7 @@ def test_remote_extensions( # # 3.Test that extension is downloaded after endpoint restart, # when the library is used in the query. -# Run the test with mutliple simultaneous connections to an endpoint. +# Run the test with multiple simultaneous connections to an endpoint. # to ensure that the extension is downloaded only once. # # 4. Test that private extensions are only downloaded when they are present in the spec. 
diff --git a/test_runner/regress/test_import_pgdata.py b/test_runner/regress/test_import_pgdata.py index 6ea2393a9d..182f715b0e 100644 --- a/test_runner/regress/test_import_pgdata.py +++ b/test_runner/regress/test_import_pgdata.py @@ -7,15 +7,15 @@ import psycopg2 import psycopg2.errors import pytest from fixtures.common_types import Lsn, TenantId, TenantShardId, TimelineId +from fixtures.fast_import import FastImport from fixtures.log_helper import log -from fixtures.neon_fixtures import NeonEnvBuilder, VanillaPostgres +from fixtures.neon_fixtures import NeonEnvBuilder, PgBin, PgProtocol, VanillaPostgres from fixtures.pageserver.http import ( ImportPgdataIdemptencyKey, PageserverApiException, ) -from fixtures.pg_version import PgVersion +from fixtures.port_distributor import PortDistributor from fixtures.remote_storage import RemoteStorageKind -from fixtures.utils import run_only_on_postgres from pytest_httpserver import HTTPServer from werkzeug.wrappers.request import Request from werkzeug.wrappers.response import Response @@ -37,10 +37,6 @@ smoke_params = [ ] -@run_only_on_postgres( - [PgVersion.V14, PgVersion.V15, PgVersion.V16], - "newer control file catalog version and struct format isn't supported", -) @pytest.mark.parametrize("shard_count,stripe_size,rel_block_size", smoke_params) def test_pgdata_import_smoke( vanilla_pg: VanillaPostgres, @@ -115,13 +111,15 @@ def test_pgdata_import_smoke( # TODO: would be nicer to just compare pgdump # Enable IO concurrency for batching on large sequential scan, to avoid making - # this test unnecessarily onerous on CPU + # this test unnecessarily onerous on CPU. Especially on debug mode, it's still + # pretty onerous though, so increase statement_timeout to avoid timeouts. 
assert ep.safe_psql_many( [ "set effective_io_concurrency=32;", + "SET statement_timeout='300s';", "select count(*), sum(data::bigint)::bigint from t", ] - ) == [[], [(expect_nrows, expect_sum)]] + ) == [[], [], [(expect_nrows, expect_sum)]] validate_vanilla_equivalence(vanilla_pg) @@ -313,3 +311,37 @@ def test_pgdata_import_smoke( validate_vanilla_equivalence(br_initdb_endpoint) with pytest.raises(psycopg2.errors.UndefinedTable): br_initdb_endpoint.safe_psql("select * from othertable") + + +def test_fast_import_binary( + test_output_dir, + vanilla_pg: VanillaPostgres, + port_distributor: PortDistributor, + fast_import: FastImport, +): + vanilla_pg.start() + vanilla_pg.safe_psql("CREATE TABLE foo (a int); INSERT INTO foo SELECT generate_series(1, 10);") + + pg_port = port_distributor.get_port() + fast_import.run(pg_port, vanilla_pg.connstr()) + vanilla_pg.stop() + + pgbin = PgBin(test_output_dir, fast_import.pg_distrib_dir, fast_import.pg_version) + with VanillaPostgres( + fast_import.workdir / "pgdata", pgbin, pg_port, False + ) as new_pgdata_vanilla_pg: + new_pgdata_vanilla_pg.start() + + # database name and user are hardcoded in fast_import binary, and they are different from normal vanilla postgres + conn = PgProtocol(dsn=f"postgresql://cloud_admin@localhost:{pg_port}/neondb") + res = conn.safe_psql("SELECT count(*) FROM foo;") + log.info(f"Result: {res}") + assert res[0][0] == 10 + + +# TODO: Maybe test with pageserver? +# 1. run whole neon env +# 2. create timeline with some s3 path??? +# 3. run fast_import with s3 prefix +# 4. ??? mock http where pageserver will report progress +# 5. 
run compute on this timeline and check if data is there diff --git a/test_runner/regress/test_installed_extensions.py b/test_runner/regress/test_installed_extensions.py deleted file mode 100644 index 4e51e7e10c..0000000000 --- a/test_runner/regress/test_installed_extensions.py +++ /dev/null @@ -1,154 +0,0 @@ -from __future__ import annotations - -import time -from logging import info -from typing import TYPE_CHECKING - -from fixtures.log_helper import log -from fixtures.metrics import parse_metrics - -if TYPE_CHECKING: - from fixtures.neon_fixtures import NeonEnv - - -def test_installed_extensions(neon_simple_env: NeonEnv): - """basic test for the endpoint that returns the list of installed extensions""" - - env = neon_simple_env - - env.create_branch("test_installed_extensions") - - endpoint = env.endpoints.create_start("test_installed_extensions") - - endpoint.safe_psql("CREATE DATABASE test_installed_extensions") - endpoint.safe_psql("CREATE DATABASE test_installed_extensions_2") - - client = endpoint.http_client() - res = client.installed_extensions() - - info("Extensions list: %s", res) - info("Extensions: %s", res["extensions"]) - # 'plpgsql' is a default extension that is always installed. 
- assert any( - ext["extname"] == "plpgsql" and ext["version"] == "1.0" for ext in res["extensions"] - ), "The 'plpgsql' extension is missing" - - # check that the neon_test_utils extension is not installed - assert not any( - ext["extname"] == "neon_test_utils" for ext in res["extensions"] - ), "The 'neon_test_utils' extension is installed" - - pg_conn = endpoint.connect(dbname="test_installed_extensions") - with pg_conn.cursor() as cur: - cur.execute("CREATE EXTENSION neon_test_utils") - cur.execute( - "SELECT default_version FROM pg_available_extensions WHERE name = 'neon_test_utils'" - ) - res = cur.fetchone() - neon_test_utils_version = res[0] - - with pg_conn.cursor() as cur: - cur.execute("CREATE EXTENSION neon version '1.1'") - - pg_conn_2 = endpoint.connect(dbname="test_installed_extensions_2") - with pg_conn_2.cursor() as cur: - cur.execute("CREATE EXTENSION neon version '1.2'") - - res = client.installed_extensions() - - info("Extensions list: %s", res) - info("Extensions: %s", res["extensions"]) - - # check that the neon_test_utils extension is installed only in 1 database - # and has the expected version - assert any( - ext["extname"] == "neon_test_utils" - and ext["version"] == neon_test_utils_version - and ext["n_databases"] == 1 - for ext in res["extensions"] - ) - - # check that the plpgsql extension is installed in all databases - # this is a default extension that is always installed - assert any(ext["extname"] == "plpgsql" and ext["n_databases"] == 4 for ext in res["extensions"]) - - # check that the neon extension is installed and has expected versions - for ext in res["extensions"]: - if ext["extname"] == "neon": - assert ext["version"] in ["1.1", "1.2"] - assert ext["n_databases"] == 1 - - with pg_conn.cursor() as cur: - cur.execute("ALTER EXTENSION neon UPDATE TO '1.3'") - - res = client.installed_extensions() - - info("Extensions list: %s", res) - info("Extensions: %s", res["extensions"]) - - # check that the neon_test_utils extension is 
updated - for ext in res["extensions"]: - if ext["extname"] == "neon": - assert ext["version"] in ["1.2", "1.3"] - assert ext["n_databases"] == 1 - - # check that /metrics endpoint is available - # ensure that we see the metric before and after restart - res = client.metrics() - info("Metrics: %s", res) - m = parse_metrics(res) - neon_m = m.query_all( - "compute_installed_extensions", - {"extension_name": "neon", "version": "1.2", "owned_by_superuser": "1"}, - ) - assert len(neon_m) == 1 - for sample in neon_m: - assert sample.value == 1 - neon_m = m.query_all( - "compute_installed_extensions", - {"extension_name": "neon", "version": "1.3", "owned_by_superuser": "1"}, - ) - assert len(neon_m) == 1 - for sample in neon_m: - assert sample.value == 1 - - endpoint.stop() - endpoint.start() - - timeout = 10 - while timeout > 0: - try: - res = client.metrics() - timeout = -1 - if len(parse_metrics(res).query_all("compute_installed_extensions")) < 4: - # Assume that not all metrics that are collected yet - time.sleep(1) - timeout -= 1 - continue - except Exception: - log.exception("failed to get metrics, assume they are not collected yet") - time.sleep(1) - timeout -= 1 - continue - - assert ( - len(parse_metrics(res).query_all("compute_installed_extensions")) >= 4 - ), "Not all metrics are collected" - - info("After restart metrics: %s", res) - m = parse_metrics(res) - neon_m = m.query_all( - "compute_installed_extensions", - {"extension_name": "neon", "version": "1.2", "owned_by_superuser": "1"}, - ) - assert len(neon_m) == 1 - for sample in neon_m: - assert sample.value == 1 - - neon_m = m.query_all( - "compute_installed_extensions", - {"extension_name": "neon", "version": "1.3", "owned_by_superuser": "1"}, - ) - assert len(neon_m) == 1 - for sample in neon_m: - assert sample.value == 1 diff --git a/test_runner/regress/test_local_file_cache.py b/test_runner/regress/test_local_file_cache.py index 21c9e97a42..52ee2f32a2 100644 --- 
a/test_runner/regress/test_local_file_cache.py +++ b/test_runner/regress/test_local_file_cache.py @@ -7,9 +7,78 @@ import threading import time import pytest -from fixtures.neon_fixtures import NeonEnvBuilder +from fixtures.neon_fixtures import NeonEnv, NeonEnvBuilder from fixtures.utils import USE_LFC, query_scalar +""" +Test whether LFC doesn't error out when the LRU is empty, but the LFC is +already at its maximum size. + +If we don't handle this safely, we might allocate more hash entries than +otherwise considered safe, thus causing ERRORs in hash_search(HASH_ENTER) once +we hit lfc->used >= lfc->limit. +""" + + +@pytest.mark.skipif(not USE_LFC, reason="LFC is disabled, skipping") +def test_local_file_cache_all_pinned(neon_simple_env: NeonEnv): + env = neon_simple_env + endpoint = env.endpoints.create_start( + "main", + config_lines=[ + "neon.max_file_cache_size='1MB'", + "neon.file_cache_size_limit='1MB'", + ], + ) + top_cur = endpoint.connect().cursor() + + stop = threading.Event() + n_rows = 10000 + n_threads = 5 + n_updates_per_connection = 1000 + + top_cur.execute("CREATE TABLE lfctest (id int4 PRIMARY KEY, n int) WITH (fillfactor=10)") + top_cur.execute(f"INSERT INTO lfctest SELECT g, 1 FROM generate_series(1, {n_rows}) g") + + # Start threads that will perform random UPDATEs. Each UPDATE + # increments the counter on the row, so that we can check at the + # end that the sum of all the counters match the number of updates + # performed (plus the initial 1 on each row). + # + # Furthermore, each thread will reconnect between every 1000 updates. 
+ def run_updates(n_updates_performed_q: queue.Queue[int]): + n_updates_performed = 0 + conn = endpoint.connect() + cur = conn.cursor() + while not stop.is_set(): + id = random.randint(1, n_rows) + cur.execute(f"UPDATE lfctest SET n = n + 1 WHERE id = {id}") + n_updates_performed += 1 + if n_updates_performed % n_updates_per_connection == 0: + cur.close() + conn.close() + conn = endpoint.connect() + cur = conn.cursor() + n_updates_performed_q.put(n_updates_performed) + + n_updates_performed_q: queue.Queue[int] = queue.Queue() + threads: list[threading.Thread] = [] + for _i in range(n_threads): + thread = threading.Thread(target=run_updates, args=(n_updates_performed_q,), daemon=True) + thread.start() + threads.append(thread) + + time.sleep(15) + + stop.set() + + n_updates_performed = 0 + for thread in threads: + thread.join() + n_updates_performed += n_updates_performed_q.get() + + assert query_scalar(top_cur, "SELECT SUM(n) FROM lfctest") == n_rows + n_updates_performed + @pytest.mark.skipif(not USE_LFC, reason="LFC is disabled, skipping") def test_local_file_cache_unlink(neon_env_builder: NeonEnvBuilder): diff --git a/test_runner/regress/test_nbtree_pagesplit_cycleid.py b/test_runner/regress/test_nbtree_pagesplit_cycleid.py index 558557aeba..32ec6fcb92 100644 --- a/test_runner/regress/test_nbtree_pagesplit_cycleid.py +++ b/test_runner/regress/test_nbtree_pagesplit_cycleid.py @@ -4,9 +4,19 @@ import time from fixtures.neon_fixtures import NeonEnv BTREE_NUM_CYCLEID_PAGES = """ - WITH raw_pages AS ( - SELECT blkno, get_raw_page_at_lsn('t_uidx', 'main', blkno, NULL, NULL) page - FROM generate_series(1, pg_relation_size('t_uidx'::regclass) / 8192) blkno + WITH lsns AS ( + /* + * pg_switch_wal() ensures we have an LSN that + * 1. is after any previous modifications, but also, + * 2. (critically) is flushed, preventing any issues with waiting for + * unflushed WAL in PageServer. 
+ */ + SELECT pg_switch_wal() as lsn + ), + raw_pages AS ( + SELECT blkno, get_raw_page_at_lsn('t_uidx', 'main', blkno, lsn, lsn) page + FROM generate_series(1, pg_relation_size('t_uidx'::regclass) / 8192) AS blkno, + lsns l(lsn) ), parsed_pages AS ( /* cycle ID is the last 2 bytes of the btree page */ @@ -36,7 +46,6 @@ def test_nbtree_pagesplit_cycleid(neon_simple_env: NeonEnv): ses1.execute("CREATE UNIQUE INDEX t_uidx ON t(id);") ses1.execute("INSERT INTO t (txt) SELECT i::text FROM generate_series(1, 2035) i;") - ses1.execute("SELECT neon_xlogflush();") ses1.execute(BTREE_NUM_CYCLEID_PAGES) pages = ses1.fetchall() assert ( @@ -57,7 +66,6 @@ def test_nbtree_pagesplit_cycleid(neon_simple_env: NeonEnv): ses1.execute("DELETE FROM t WHERE id <= 610;") # Flush wal, for checking purposes - ses1.execute("SELECT neon_xlogflush();") ses1.execute(BTREE_NUM_CYCLEID_PAGES) pages = ses1.fetchall() assert len(pages) == 0, f"No back splits with cycle ID expected, got batches of {pages} instead" @@ -108,8 +116,6 @@ def test_nbtree_pagesplit_cycleid(neon_simple_env: NeonEnv): # unpin the btree page, allowing s3's vacuum to complete ses2.execute("FETCH ALL FROM foo;") ses2.execute("ROLLBACK;") - # flush WAL to make sure PS is up-to-date - ses1.execute("SELECT neon_xlogflush();") # check that our expectations are correct ses1.execute(BTREE_NUM_CYCLEID_PAGES) pages = ses1.fetchall() diff --git a/test_runner/regress/test_page_service_batching_regressions.py b/test_runner/regress/test_page_service_batching_regressions.py new file mode 100644 index 0000000000..fa85e1210b --- /dev/null +++ b/test_runner/regress/test_page_service_batching_regressions.py @@ -0,0 +1,60 @@ +# NB: there are benchmarks that double-serve as tests inside the `performance` directory. 
+ +import subprocess +from pathlib import Path + +import pytest +from fixtures.log_helper import log +from fixtures.neon_fixtures import NeonEnvBuilder + + +@pytest.mark.timeout(30) # test takes <20s if pageserver impl is correct +@pytest.mark.parametrize("kind", ["pageserver-stop", "tenant-detach"]) +def test_slow_flush(neon_env_builder: NeonEnvBuilder, neon_binpath: Path, kind: str): + def patch_pageserver_toml(config): + config["page_service_pipelining"] = { + "mode": "pipelined", + "max_batch_size": 32, + "execution": "concurrent-futures", + } + + neon_env_builder.pageserver_config_override = patch_pageserver_toml + env = neon_env_builder.init_start() + + log.info("make flush appear slow") + + log.info("sending requests until pageserver accepts no more") + # TODO: extract this into a helper, like subprocess_capture, + # so that we capture the stderr from the helper somewhere. + child = subprocess.Popen( + [ + neon_binpath / "test_helper_slow_client_reads", + env.pageserver.connstr(), + str(env.initial_tenant), + str(env.initial_timeline), + ], + bufsize=0, # unbuffered + stdin=subprocess.PIPE, + stdout=subprocess.PIPE, + ) + assert child.stdout is not None + buf = child.stdout.read(1) + if len(buf) != 1: + raise Exception("unexpected EOF") + if buf != b"R": + raise Exception(f"unexpected data: {buf!r}") + log.info("helper reports pageserver accepts no more requests") + log.info( + "assuming pageserver connection handle is in a state where TCP has backpressured pageserver=>client response flush() into userspace" + ) + + if kind == "pageserver-stop": + log.info("try to shut down the pageserver cleanly") + env.pageserver.stop() + elif kind == "tenant-detach": + log.info("try to shut down the tenant") + env.pageserver.tenant_detach(env.initial_tenant) + else: + raise ValueError(f"unexpected kind: {kind}") + + log.info("shutdown did not time out, test passed") diff --git a/test_runner/regress/test_recovery.py b/test_runner/regress/test_recovery.py index 
b43a443149..dab01fcd1a 100644 --- a/test_runner/regress/test_recovery.py +++ b/test_runner/regress/test_recovery.py @@ -11,10 +11,13 @@ from fixtures.neon_fixtures import NeonEnvBuilder # Test pageserver recovery after crash # def test_pageserver_recovery(neon_env_builder: NeonEnvBuilder): - # Override default checkpointer settings to run it more often + # Override default checkpointer settings to run it more often. + # This also creates a bunch more L0 layers, so disable backpressure. env = neon_env_builder.init_start( initial_tenant_conf={ "checkpoint_distance": "1048576", + "l0_flush_delay_threshold": "0", + "l0_flush_stall_threshold": "0", } ) env.pageserver.is_testing_enabled_or_skip() diff --git a/test_runner/regress/test_remote_storage.py b/test_runner/regress/test_remote_storage.py index 76a42ef4a2..c39c74fa2a 100644 --- a/test_runner/regress/test_remote_storage.py +++ b/test_runner/regress/test_remote_storage.py @@ -539,6 +539,8 @@ def test_timeline_deletion_with_files_stuck_in_upload_queue( # small checkpointing and compaction targets to ensure we generate many operations "checkpoint_distance": f"{64 * 1024}", "compaction_threshold": "1", + "l0_flush_delay_threshold": "0", + "l0_flush_stall_threshold": "0", "compaction_target_size": f"{64 * 1024}", # large horizon to avoid automatic GC (our assert on gc_result below relies on that) "gc_horizon": f"{1024 ** 4}", diff --git a/test_runner/regress/test_storage_controller.py b/test_runner/regress/test_storage_controller.py index b1e1fd81d6..350fe31099 100644 --- a/test_runner/regress/test_storage_controller.py +++ b/test_runner/regress/test_storage_controller.py @@ -2394,6 +2394,7 @@ def test_storage_controller_node_deletion( Test that deleting a node works & properly reschedules everything that was on the node. 
""" neon_env_builder.num_pageservers = 3 + neon_env_builder.num_azs = 3 env = neon_env_builder.init_configs() env.start() @@ -2407,6 +2408,9 @@ def test_storage_controller_node_deletion( tid, placement_policy='{"Attached":1}', shard_count=shard_count_per_tenant ) + # Sanity check: initial creations should not leave the system in an unstable scheduling state + assert env.storage_controller.reconcile_all() == 0 + victim = env.pageservers[-1] # The procedure a human would follow is: @@ -3211,7 +3215,7 @@ def test_safekeeper_deployment_time_update(neon_env_builder: NeonEnvBuilder): # some small tests for the scheduling policy querying and returning APIs newest_info = target.get_safekeeper(inserted["id"]) assert newest_info - assert newest_info["scheduling_policy"] == "Disabled" + assert newest_info["scheduling_policy"] == "Pause" target.safekeeper_scheduling_policy(inserted["id"], "Decomissioned") newest_info = target.get_safekeeper(inserted["id"]) assert newest_info diff --git a/test_runner/regress/test_storage_scrubber.py b/test_runner/regress/test_storage_scrubber.py index 220c428531..1304d302b7 100644 --- a/test_runner/regress/test_storage_scrubber.py +++ b/test_runner/regress/test_storage_scrubber.py @@ -227,7 +227,9 @@ def test_scrubber_physical_gc_ancestors(neon_env_builder: NeonEnvBuilder, shard_ new_shard_count = 4 assert shard_count is None or new_shard_count > shard_count shards = env.storage_controller.tenant_shard_split(tenant_id, shard_count=new_shard_count) - env.storage_controller.reconcile_until_idle() # Move shards to their final locations immediately + env.storage_controller.reconcile_until_idle( + timeout_secs=120 + ) # Move shards to their final locations immediately # Create a timeline after split, to ensure scrubber can handle timelines that exist in child shards but not ancestors env.storage_controller.pageserver_api().timeline_create( @@ -269,6 +271,14 @@ def test_scrubber_physical_gc_ancestors(neon_env_builder: NeonEnvBuilder, shard_ 
ps.http_client().timeline_compact( shard, timeline_id, force_image_layer_creation=True, wait_until_uploaded=True ) + + # Add some WAL so that we don't gc at the latest remote consistent lsn + workload.churn_rows(10) + + # Now gc the old stuff away + for shard in shards: + ps = env.get_tenant_pageserver(shard) + assert ps is not None ps.http_client().timeline_gc(shard, timeline_id, 0) # We will use a min_age_secs=1 threshold for deletion, let it pass diff --git a/test_runner/regress/test_subscriber_branching.py b/test_runner/regress/test_subscriber_branching.py new file mode 100644 index 0000000000..645572da8e --- /dev/null +++ b/test_runner/regress/test_subscriber_branching.py @@ -0,0 +1,242 @@ +from __future__ import annotations + +import time + +from fixtures.log_helper import log +from fixtures.neon_fixtures import NeonEnv, logical_replication_sync +from fixtures.utils import query_scalar, wait_until + + +# This test checks that branching of timeline with logical subscriptions +# does not affect logical replication for parent. +# Endpoint on a new branch will drop all existing subscriptions at the start, +# so it will not receive any changes. +# If needed, user can create new subscriptions on the child branch. +def test_subscriber_branching(neon_simple_env: NeonEnv): + env = neon_simple_env + env.create_branch("publisher") + pub = env.endpoints.create("publisher") + pub.respec( + skip_pg_catalog_updates=False, + create_test_user=True, + ) + pub.start(create_test_user=True) + + env.create_branch("subscriber") + sub = env.endpoints.create("subscriber") + # Pass create_test_user flag to get properly filled spec.users and spec.databases fields. + # + # This test checks the per-database operations that happen at compute start + # and these operations are applied to the databases that are present in the spec. 
+ sub.respec( + skip_pg_catalog_updates=False, + create_test_user=True, + ) + sub.start(create_test_user=True) + + pub.wait_for_migrations() + sub.wait_for_migrations() + + n_records = 1000 + + def check_that_changes_propagated(): + scur.execute("SELECT count(*) FROM t") + res = scur.fetchall() + assert res[0][0] == n_records + + def insert_data(pub, start): + with pub.cursor(dbname="neondb", user="test", password="pubtestpwd") as pcur: + for i in range(start, start + n_records): + pcur.execute("INSERT into t values (%s,random()*100000)", (i,)) + + # create_test_user creates a user without password + # but psycopg2 execute() requires a password + with sub.cursor() as scur: + scur.execute("ALTER USER test WITH PASSWORD 'testpwd'") + with pub.cursor() as pcur: + # Create a test user to avoid using superuser + pcur.execute("ALTER USER test WITH PASSWORD 'pubtestpwd'") + # If we don't do this, creating the subscription will fail + pub.edit_hba(["host all test 0.0.0.0/0 md5"]) + + with pub.cursor(dbname="neondb", user="test", password="pubtestpwd") as pcur: + pcur.execute("CREATE TABLE t (pk integer primary key, sk integer)") + pcur.execute("CREATE PUBLICATION pub FOR TABLE t") + + with sub.cursor(dbname="neondb", user="test", password="testpwd") as scur: + scur.execute("CREATE TABLE t (pk integer primary key, sk integer)") + pub_conn = ( + f"host=localhost port={pub.pg_port} dbname=neondb user=test password=pubtestpwd" + ) + query = f"CREATE SUBSCRIPTION sub CONNECTION '{pub_conn}' PUBLICATION pub" + scur.execute(query) + time.sleep(2) # let initial table sync complete + + insert_data(pub, 0) + + with sub.cursor(dbname="neondb", user="test", password="testpwd") as scur: + wait_until(check_that_changes_propagated) + latest_end_lsn = query_scalar( + scur, "select latest_end_lsn from pg_catalog.pg_stat_subscription; " + ) + last_insert_lsn = query_scalar(scur, "select pg_current_wal_insert_lsn();") + + log.info(f"latest_end_lsn = {latest_end_lsn}") + 
log.info(f"last_insert_lsn = {last_insert_lsn}") + + # stop the parent subscriber so that it doesn't interfere with the test + sub.stop() + + # 1. good scenario: + # create subscriber_child_1 + # it will not get changes from publisher, because drop_subscriptions_before_start is set to True + sub_child_1_timeline_id = env.create_branch( + "subscriber_child_1", + ancestor_branch_name="subscriber", + ancestor_start_lsn=last_insert_lsn, + ) + sub_child_1 = env.endpoints.create("subscriber_child_1") + # Pass drop_subscriptions_before_start flag + sub_child_1.respec( + skip_pg_catalog_updates=False, + create_test_user=True, + drop_subscriptions_before_start=True, + ) + sub_child_1.start(create_test_user=True) + + # ensure that subscriber_child_1 sees all the data + with sub_child_1.cursor(dbname="neondb", user="test", password="testpwd") as scur: + scur.execute("SELECT count(*) FROM t") + res = scur.fetchall() + assert res[0][0] == n_records + + # ensure that there are no subscriptions in this database + scur.execute("SELECT 1 FROM pg_catalog.pg_subscription WHERE subname = 'sub'") + assert len(scur.fetchall()) == 0 + + # ensure that drop_subscriptions_done happened on this timeline + with sub_child_1.cursor() as scur_postgres: + scur_postgres.execute("SELECT timeline_id from neon.drop_subscriptions_done") + res = scur_postgres.fetchall() + assert len(res) == 1 + assert str(sub_child_1_timeline_id) == res[0][0] + + old_n_records = n_records + # insert more data on publisher + insert_data(pub, n_records) + n_records += n_records + + pcur.execute("SELECT count(*) FROM t") + res = pcur.fetchall() + assert res[0][0] == n_records + + # ensure that subscriber_child_1 doesn't see the new data + with sub_child_1.cursor(dbname="neondb", user="test", password="testpwd") as scur: + scur.execute("SELECT count(*) FROM t") + res = scur.fetchall() + assert res[0][0] == old_n_records + + # reenable logical replication on subscriber_child_1 + # using new publication + # ensure that new 
publication works as expected + with sub_child_1.cursor(dbname="neondb", user="test", password="testpwd") as scur: + scur.execute("TRUNCATE t") + + # create new subscription + # with new pub name + pcur.execute("CREATE PUBLICATION pub_new FOR TABLE t") + query = f"CREATE SUBSCRIPTION sub_new CONNECTION '{pub_conn}' PUBLICATION pub_new" + scur.execute(query) + + wait_until(check_that_changes_propagated) + + scur.execute("SELECT count(*) FROM t") + res = scur.fetchall() + assert res[0][0] == n_records + + # ensure that new publication works as expected after compute restart + # first restart with drop_subscriptions_before_start=True + # to emulate the case when compute restarts within the VM with stale spec + sub_child_1.stop() + sub_child_1.respec( + skip_pg_catalog_updates=False, + create_test_user=True, + drop_subscriptions_before_start=True, + ) + sub_child_1.start(create_test_user=True) + + with sub_child_1.cursor(dbname="neondb", user="test", password="testpwd") as scur: + # ensure that even though the flag is set, we didn't drop new subscription + scur.execute("SELECT 1 FROM pg_catalog.pg_subscription WHERE subname = 'sub_new'") + assert len(scur.fetchall()) == 1 + + # ensure that drop_subscriptions_done happened on this timeline + with sub_child_1.cursor() as scur_postgres: + scur_postgres.execute("SELECT timeline_id from neon.drop_subscriptions_done") + res = scur_postgres.fetchall() + assert len(res) == 1 + assert str(sub_child_1_timeline_id) == res[0][0] + + sub_child_1.stop() + sub_child_1.respec( + skip_pg_catalog_updates=False, + create_test_user=True, + drop_subscriptions_before_start=False, + ) + sub_child_1.start(create_test_user=True) + + # insert more data on publisher + insert_data(pub, n_records) + n_records += n_records + with sub_child_1.cursor(dbname="neondb", user="test", password="testpwd") as scur: + # ensure that there is a subscriptions in this database + scur.execute("SELECT 1 FROM pg_catalog.pg_subscription WHERE subname = 'sub_new'") + 
assert len(scur.fetchall()) == 1 + + wait_until(check_that_changes_propagated) + scur.execute("SELECT count(*) FROM t") + res = scur.fetchall() + assert res[0][0] == n_records + + # ensure that drop_subscriptions_done happened on this timeline + with sub_child_1.cursor() as scur_postgres: + scur_postgres.execute("SELECT timeline_id from neon.drop_subscriptions_done") + res = scur_postgres.fetchall() + assert len(res) == 1 + assert str(sub_child_1_timeline_id) == res[0][0] + + # wake the sub and ensure that it catches up with the new data + sub.start(create_test_user=True) + with sub.cursor(dbname="neondb", user="test", password="testpwd") as scur: + logical_replication_sync(sub, pub) + wait_until(check_that_changes_propagated) + scur.execute("SELECT count(*) FROM t") + res = scur.fetchall() + assert res[0][0] == n_records + + # test that we can create a branch of a branch + sub_child_2_timeline_id = env.create_branch( + "subscriber_child_2", + ancestor_branch_name="subscriber_child_1", + ) + sub_child_2 = env.endpoints.create("subscriber_child_2") + # Pass drop_subscriptions_before_start flag + sub_child_2.respec( + skip_pg_catalog_updates=False, + drop_subscriptions_before_start=True, + ) + sub_child_2.start(create_test_user=True) + + # ensure that subscriber_child_2 does not inherit subscription from child_1 + with sub_child_2.cursor(dbname="neondb", user="test", password="testpwd") as scur: + # ensure that there are no subscriptions in this database + scur.execute("SELECT count(*) FROM pg_catalog.pg_subscription") + res = scur.fetchall() + assert res[0][0] == 0 + + # ensure that drop_subscriptions_done happened on this timeline + with sub_child_2.cursor() as scur_postgres: + scur_postgres.execute("SELECT timeline_id from neon.drop_subscriptions_done") + res = scur_postgres.fetchall() + assert len(res) == 1 + assert str(sub_child_2_timeline_id) == res[0][0] diff --git a/test_runner/regress/test_tenant_delete.py b/test_runner/regress/test_tenant_delete.py index 
48e55c1ab1..3720f653c5 100644 --- a/test_runner/regress/test_tenant_delete.py +++ b/test_runner/regress/test_tenant_delete.py @@ -1,6 +1,7 @@ from __future__ import annotations import json +from concurrent.futures import ThreadPoolExecutor from threading import Thread import pytest @@ -253,29 +254,8 @@ def test_tenant_delete_races_timeline_creation(neon_env_builder: NeonEnvBuilder) ps_http.configure_failpoints((BEFORE_INITDB_UPLOAD_FAILPOINT, "pause")) def timeline_create(): - try: - ps_http.timeline_create(env.pg_version, tenant_id, TimelineId.generate(), timeout=1) - raise RuntimeError("creation succeeded even though it shouldn't") - except ReadTimeout: - pass - - Thread(target=timeline_create).start() - - def hit_initdb_upload_failpoint(): - env.pageserver.assert_log_contains(f"at failpoint {BEFORE_INITDB_UPLOAD_FAILPOINT}") - - wait_until(hit_initdb_upload_failpoint) - - def creation_connection_timed_out(): - env.pageserver.assert_log_contains( - "POST.*/timeline.* request was dropped before completing" - ) - - # Wait so that we hit the timeout and the connection is dropped - # (But timeline creation still continues) - wait_until(creation_connection_timed_out) - - ps_http.configure_failpoints((DELETE_BEFORE_CLEANUP_FAILPOINT, "pause")) + ps_http.timeline_create(env.pg_version, tenant_id, TimelineId.generate(), timeout=1) + raise RuntimeError("creation succeeded even though it shouldn't") def tenant_delete(): def tenant_delete_inner(): @@ -283,21 +263,46 @@ def test_tenant_delete_races_timeline_creation(neon_env_builder: NeonEnvBuilder) wait_until(tenant_delete_inner) - Thread(target=tenant_delete).start() + # We will spawn background threads for timeline creation and tenant deletion. They will both + # get blocked on our failpoint. 
+ with ThreadPoolExecutor(max_workers=1) as executor: + create_fut = executor.submit(timeline_create) - def deletion_arrived(): - env.pageserver.assert_log_contains( - f"cfg failpoint: {DELETE_BEFORE_CLEANUP_FAILPOINT} pause" - ) + def hit_initdb_upload_failpoint(): + env.pageserver.assert_log_contains(f"at failpoint {BEFORE_INITDB_UPLOAD_FAILPOINT}") - wait_until(deletion_arrived) + wait_until(hit_initdb_upload_failpoint) - ps_http.configure_failpoints((DELETE_BEFORE_CLEANUP_FAILPOINT, "off")) + def creation_connection_timed_out(): + env.pageserver.assert_log_contains( + "POST.*/timeline.* request was dropped before completing" + ) - # Disable the failpoint and wait for deletion to finish - ps_http.configure_failpoints((BEFORE_INITDB_UPLOAD_FAILPOINT, "off")) + # Wait so that we hit the timeout and the connection is dropped + # (But timeline creation still continues) + wait_until(creation_connection_timed_out) - ps_http.tenant_delete(tenant_id) + with pytest.raises(ReadTimeout): + # Our creation failed from the client's point of view. 
+ create_fut.result() + + ps_http.configure_failpoints((DELETE_BEFORE_CLEANUP_FAILPOINT, "pause")) + + delete_fut = executor.submit(tenant_delete) + + def deletion_arrived(): + env.pageserver.assert_log_contains( + f"cfg failpoint: {DELETE_BEFORE_CLEANUP_FAILPOINT} pause" + ) + + wait_until(deletion_arrived) + + ps_http.configure_failpoints((DELETE_BEFORE_CLEANUP_FAILPOINT, "off")) + + # Disable the failpoint and wait for deletion to finish + ps_http.configure_failpoints((BEFORE_INITDB_UPLOAD_FAILPOINT, "off")) + + delete_fut.result() # Physical deletion should have happened assert_prefix_empty( diff --git a/test_runner/regress/test_tenants.py b/test_runner/regress/test_tenants.py index d31901b384..b4c968b217 100644 --- a/test_runner/regress/test_tenants.py +++ b/test_runner/regress/test_tenants.py @@ -194,7 +194,7 @@ def test_metrics_normal_work(neon_env_builder: NeonEnvBuilder): io_metrics = query_all_safekeepers( "safekeeper_pg_io_bytes_total", { - "app_name": "pageserver", + "app_name": f"pageserver-{env.pageserver.id}", "client_az": "test_ps_az", "dir": io_direction, "same_az": "false", diff --git a/test_runner/regress/test_timeline_detach_ancestor.py b/test_runner/regress/test_timeline_detach_ancestor.py index 5234d8278f..612a767480 100644 --- a/test_runner/regress/test_timeline_detach_ancestor.py +++ b/test_runner/regress/test_timeline_detach_ancestor.py @@ -607,7 +607,7 @@ def test_timeline_ancestor_detach_idempotent_success( if shards_after > 1: # FIXME: should this be in the neon_env_builder.init_start? 
- env.storage_controller.reconcile_until_idle() + env.storage_controller.reconcile_until_idle(timeout_secs=120) client = env.storage_controller.pageserver_api() else: client = env.pageserver.http_client() @@ -636,7 +636,7 @@ def test_timeline_ancestor_detach_idempotent_success( # Do a shard split # This is a reproducer for https://github.com/neondatabase/neon/issues/9667 env.storage_controller.tenant_shard_split(env.initial_tenant, shards_after) - env.storage_controller.reconcile_until_idle() + env.storage_controller.reconcile_until_idle(timeout_secs=120) first_reparenting_response = client.detach_ancestor(env.initial_tenant, first_branch) assert set(first_reparenting_response) == {reparented1, reparented2} diff --git a/test_runner/regress/test_timeline_size.py b/test_runner/regress/test_timeline_size.py index 95bf9106cd..e2fdacdbfc 100644 --- a/test_runner/regress/test_timeline_size.py +++ b/test_runner/regress/test_timeline_size.py @@ -440,7 +440,7 @@ def test_timeline_physical_size_post_compaction(neon_env_builder: NeonEnvBuilder env = neon_env_builder.init_start( initial_tenant_conf={ "checkpoint_distance": "100000", - "compaction_period": "10m", + "compaction_period": "0s", } ) pageserver_http = env.pageserver.http_client() diff --git a/test_runner/regress/test_vm_bits.py b/test_runner/regress/test_vm_bits.py index d9e59c71f4..4865178ca8 100644 --- a/test_runner/regress/test_vm_bits.py +++ b/test_runner/regress/test_vm_bits.py @@ -203,6 +203,9 @@ def test_vm_bit_clear_on_heap_lock_blackbox(neon_env_builder: NeonEnvBuilder): "checkpoint_distance": f"{128 * 1024}", "compaction_target_size": f"{128 * 1024}", "compaction_threshold": "1", + # disable L0 backpressure + "l0_flush_delay_threshold": "0", + "l0_flush_stall_threshold": "0", # create image layers eagerly, so that GC can remove some layers "image_creation_threshold": "1", # set PITR interval to be small, so we can do GC diff --git a/test_runner/stubs/h2/__init__.pyi b/test_runner/stubs/h2/__init__.pyi 
index e69de29bb2..bda5b5a7f4 100644 --- a/test_runner/stubs/h2/__init__.pyi +++ b/test_runner/stubs/h2/__init__.pyi @@ -0,0 +1 @@ +__version__: str diff --git a/test_runner/stubs/h2/config.pyi b/test_runner/stubs/h2/config.pyi index 710005db69..422344b981 100644 --- a/test_runner/stubs/h2/config.pyi +++ b/test_runner/stubs/h2/config.pyi @@ -1,11 +1,12 @@ from _typeshed import Incomplete +from typing import Any class _BooleanConfigOption: name: Incomplete attr_name: Incomplete - def __init__(self, name) -> None: ... - def __get__(self, instance, owner): ... - def __set__(self, instance, value) -> None: ... + def __init__(self, name: str) -> None: ... + def __get__(self, instance: Any, owner: Any) -> bool: ... + def __set__(self, instance: Any, value: bool) -> None: ... class DummyLogger: def __init__(self, *vargs) -> None: ... @@ -15,7 +16,7 @@ class DummyLogger: class OutputLogger: file: Incomplete trace_level: Incomplete - def __init__(self, file: Incomplete | None = ..., trace_level: bool = ...) -> None: ... + def __init__(self, file: Incomplete | None = None, trace_level: bool = False) -> None: ... def debug(self, fmtstr, *args) -> None: ... def trace(self, fmtstr, *args) -> None: ... @@ -23,20 +24,12 @@ class H2Configuration: client_side: Incomplete validate_outbound_headers: Incomplete normalize_outbound_headers: Incomplete + split_outbound_cookies: Incomplete validate_inbound_headers: Incomplete normalize_inbound_headers: Incomplete logger: Incomplete - def __init__( - self, - client_side: bool = ..., - header_encoding: Incomplete | None = ..., - validate_outbound_headers: bool = ..., - normalize_outbound_headers: bool = ..., - validate_inbound_headers: bool = ..., - normalize_inbound_headers: bool = ..., - logger: Incomplete | None = ..., - ) -> None: ... 
+ def __init__(self, client_side: bool = True, header_encoding: bool | str | None = None, validate_outbound_headers: bool = True, normalize_outbound_headers: bool = True, split_outbound_cookies: bool = False, validate_inbound_headers: bool = True, normalize_inbound_headers: bool = True, logger: DummyLogger | OutputLogger | None = None) -> None: ... @property - def header_encoding(self): ... + def header_encoding(self) -> bool | str | None: ... @header_encoding.setter - def header_encoding(self, value) -> None: ... + def header_encoding(self, value: bool | str | None) -> None: ... diff --git a/test_runner/stubs/h2/connection.pyi b/test_runner/stubs/h2/connection.pyi index 04be18ca74..f7ec78a997 100644 --- a/test_runner/stubs/h2/connection.pyi +++ b/test_runner/stubs/h2/connection.pyi @@ -1,72 +1,55 @@ -from enum import Enum, IntEnum - -from _typeshed import Incomplete - from .config import H2Configuration as H2Configuration from .errors import ErrorCodes as ErrorCodes -from .events import AlternativeServiceAvailable as AlternativeServiceAvailable -from .events import ConnectionTerminated as ConnectionTerminated -from .events import PingAckReceived as PingAckReceived -from .events import PingReceived as PingReceived -from .events import PriorityUpdated as PriorityUpdated -from .events import RemoteSettingsChanged as RemoteSettingsChanged -from .events import SettingsAcknowledged as SettingsAcknowledged -from .events import UnknownFrameReceived as UnknownFrameReceived -from .events import WindowUpdated as WindowUpdated -from .exceptions import DenialOfServiceError as DenialOfServiceError -from .exceptions import FlowControlError as FlowControlError -from .exceptions import FrameTooLargeError as FrameTooLargeError -from .exceptions import NoAvailableStreamIDError as NoAvailableStreamIDError -from .exceptions import NoSuchStreamError as NoSuchStreamError -from .exceptions import ProtocolError as ProtocolError -from .exceptions import RFC1122Error as RFC1122Error -from 
.exceptions import StreamClosedError as StreamClosedError -from .exceptions import StreamIDTooLowError as StreamIDTooLowError -from .exceptions import TooManyStreamsError as TooManyStreamsError +from .events import AlternativeServiceAvailable as AlternativeServiceAvailable, ConnectionTerminated as ConnectionTerminated, Event as Event, InformationalResponseReceived as InformationalResponseReceived, PingAckReceived as PingAckReceived, PingReceived as PingReceived, PriorityUpdated as PriorityUpdated, RemoteSettingsChanged as RemoteSettingsChanged, RequestReceived as RequestReceived, ResponseReceived as ResponseReceived, SettingsAcknowledged as SettingsAcknowledged, TrailersReceived as TrailersReceived, UnknownFrameReceived as UnknownFrameReceived, WindowUpdated as WindowUpdated +from .exceptions import DenialOfServiceError as DenialOfServiceError, FlowControlError as FlowControlError, FrameTooLargeError as FrameTooLargeError, NoAvailableStreamIDError as NoAvailableStreamIDError, NoSuchStreamError as NoSuchStreamError, ProtocolError as ProtocolError, RFC1122Error as RFC1122Error, StreamClosedError as StreamClosedError, StreamIDTooLowError as StreamIDTooLowError, TooManyStreamsError as TooManyStreamsError from .frame_buffer import FrameBuffer as FrameBuffer -from .settings import SettingCodes as SettingCodes -from .settings import Settings as Settings -from .stream import H2Stream as H2Stream -from .stream import StreamClosedBy as StreamClosedBy -from .utilities import guard_increment_window as guard_increment_window +from .settings import ChangedSetting as ChangedSetting, SettingCodes as SettingCodes, Settings as Settings +from .stream import H2Stream as H2Stream, StreamClosedBy as StreamClosedBy +from .utilities import SizeLimitDict as SizeLimitDict, guard_increment_window as guard_increment_window from .windows import WindowManager as WindowManager +from _typeshed import Incomplete +from collections.abc import Iterable +from enum import Enum, IntEnum +from 
hpack.struct import Header as Header, HeaderWeaklyTyped as HeaderWeaklyTyped +from hyperframe.frame import Frame as Frame +from typing import Any class ConnectionState(Enum): - IDLE: int - CLIENT_OPEN: int - SERVER_OPEN: int - CLOSED: int + IDLE = 0 + CLIENT_OPEN = 1 + SERVER_OPEN = 2 + CLOSED = 3 class ConnectionInputs(Enum): - SEND_HEADERS: int - SEND_PUSH_PROMISE: int - SEND_DATA: int - SEND_GOAWAY: int - SEND_WINDOW_UPDATE: int - SEND_PING: int - SEND_SETTINGS: int - SEND_RST_STREAM: int - SEND_PRIORITY: int - RECV_HEADERS: int - RECV_PUSH_PROMISE: int - RECV_DATA: int - RECV_GOAWAY: int - RECV_WINDOW_UPDATE: int - RECV_PING: int - RECV_SETTINGS: int - RECV_RST_STREAM: int - RECV_PRIORITY: int - SEND_ALTERNATIVE_SERVICE: int - RECV_ALTERNATIVE_SERVICE: int + SEND_HEADERS = 0 + SEND_PUSH_PROMISE = 1 + SEND_DATA = 2 + SEND_GOAWAY = 3 + SEND_WINDOW_UPDATE = 4 + SEND_PING = 5 + SEND_SETTINGS = 6 + SEND_RST_STREAM = 7 + SEND_PRIORITY = 8 + RECV_HEADERS = 9 + RECV_PUSH_PROMISE = 10 + RECV_DATA = 11 + RECV_GOAWAY = 12 + RECV_WINDOW_UPDATE = 13 + RECV_PING = 14 + RECV_SETTINGS = 15 + RECV_RST_STREAM = 16 + RECV_PRIORITY = 17 + SEND_ALTERNATIVE_SERVICE = 18 + RECV_ALTERNATIVE_SERVICE = 19 class AllowedStreamIDs(IntEnum): - EVEN: int - ODD: int + EVEN = 0 + ODD = 1 class H2ConnectionStateMachine: state: Incomplete def __init__(self) -> None: ... - def process_input(self, input_): ... + def process_input(self, input_: ConnectionInputs) -> list[Event]: ... class H2Connection: DEFAULT_MAX_OUTBOUND_FRAME_SIZE: int @@ -88,55 +71,30 @@ class H2Connection: max_outbound_frame_size: Incomplete max_inbound_frame_size: Incomplete incoming_buffer: Incomplete - def __init__(self, config: Incomplete | None = ...) -> None: ... + def __init__(self, config: H2Configuration | None = None) -> None: ... @property - def open_outbound_streams(self): ... + def open_outbound_streams(self) -> int: ... @property - def open_inbound_streams(self): ... + def open_inbound_streams(self) -> int: ... 
@property - def inbound_flow_control_window(self): ... + def inbound_flow_control_window(self) -> int: ... def initiate_connection(self) -> None: ... - def initiate_upgrade_connection(self, settings_header: Incomplete | None = ...): ... - def get_next_available_stream_id(self): ... - def send_headers( - self, - stream_id, - headers, - end_stream: bool = ..., - priority_weight: Incomplete | None = ..., - priority_depends_on: Incomplete | None = ..., - priority_exclusive: Incomplete | None = ..., - ) -> None: ... - def send_data( - self, stream_id, data, end_stream: bool = ..., pad_length: Incomplete | None = ... - ) -> None: ... - def end_stream(self, stream_id) -> None: ... - def increment_flow_control_window( - self, increment, stream_id: Incomplete | None = ... - ) -> None: ... - def push_stream(self, stream_id, promised_stream_id, request_headers) -> None: ... - def ping(self, opaque_data) -> None: ... - def reset_stream(self, stream_id, error_code: int = ...) -> None: ... - def close_connection( - self, - error_code: int = ..., - additional_data: Incomplete | None = ..., - last_stream_id: Incomplete | None = ..., - ) -> None: ... - def update_settings(self, new_settings) -> None: ... - def advertise_alternative_service( - self, field_value, origin: Incomplete | None = ..., stream_id: Incomplete | None = ... - ) -> None: ... - def prioritize( - self, - stream_id, - weight: Incomplete | None = ..., - depends_on: Incomplete | None = ..., - exclusive: Incomplete | None = ..., - ) -> None: ... - def local_flow_control_window(self, stream_id): ... - def remote_flow_control_window(self, stream_id): ... - def acknowledge_received_data(self, acknowledged_size, stream_id) -> None: ... - def data_to_send(self, amount: Incomplete | None = ...): ... + def initiate_upgrade_connection(self, settings_header: bytes | None = None) -> bytes | None: ... + def get_next_available_stream_id(self) -> int: ... 
+ def send_headers(self, stream_id: int, headers: Iterable[HeaderWeaklyTyped], end_stream: bool = False, priority_weight: int | None = None, priority_depends_on: int | None = None, priority_exclusive: bool | None = None) -> None: ... + def send_data(self, stream_id: int, data: bytes | memoryview, end_stream: bool = False, pad_length: Any = None) -> None: ... + def end_stream(self, stream_id: int) -> None: ... + def increment_flow_control_window(self, increment: int, stream_id: int | None = None) -> None: ... + def push_stream(self, stream_id: int, promised_stream_id: int, request_headers: Iterable[HeaderWeaklyTyped]) -> None: ... + def ping(self, opaque_data: bytes | str) -> None: ... + def reset_stream(self, stream_id: int, error_code: ErrorCodes | int = 0) -> None: ... + def close_connection(self, error_code: ErrorCodes | int = 0, additional_data: bytes | None = None, last_stream_id: int | None = None) -> None: ... + def update_settings(self, new_settings: dict[SettingCodes | int, int]) -> None: ... + def advertise_alternative_service(self, field_value: bytes | str, origin: bytes | None = None, stream_id: int | None = None) -> None: ... + def prioritize(self, stream_id: int, weight: int | None = None, depends_on: int | None = None, exclusive: bool | None = None) -> None: ... + def local_flow_control_window(self, stream_id: int) -> int: ... + def remote_flow_control_window(self, stream_id: int) -> int: ... + def acknowledge_received_data(self, acknowledged_size: int, stream_id: int) -> None: ... + def data_to_send(self, amount: int | None = None) -> bytes: ... def clear_outbound_data_buffer(self) -> None: ... - def receive_data(self, data): ... + def receive_data(self, data: bytes) -> list[Event]: ... 
diff --git a/test_runner/stubs/h2/errors.pyi b/test_runner/stubs/h2/errors.pyi index b70c632f8c..7cf77bd833 100644 --- a/test_runner/stubs/h2/errors.pyi +++ b/test_runner/stubs/h2/errors.pyi @@ -1,17 +1,19 @@ import enum +__all__ = ['ErrorCodes'] + class ErrorCodes(enum.IntEnum): - NO_ERROR: int - PROTOCOL_ERROR: int - INTERNAL_ERROR: int - FLOW_CONTROL_ERROR: int - SETTINGS_TIMEOUT: int - STREAM_CLOSED: int - FRAME_SIZE_ERROR: int - REFUSED_STREAM: int - CANCEL: int - COMPRESSION_ERROR: int - CONNECT_ERROR: int - ENHANCE_YOUR_CALM: int - INADEQUATE_SECURITY: int - HTTP_1_1_REQUIRED: int + NO_ERROR = 0 + PROTOCOL_ERROR = 1 + INTERNAL_ERROR = 2 + FLOW_CONTROL_ERROR = 3 + SETTINGS_TIMEOUT = 4 + STREAM_CLOSED = 5 + FRAME_SIZE_ERROR = 6 + REFUSED_STREAM = 7 + CANCEL = 8 + COMPRESSION_ERROR = 9 + CONNECT_ERROR = 10 + ENHANCE_YOUR_CALM = 11 + INADEQUATE_SECURITY = 12 + HTTP_1_1_REQUIRED = 13 diff --git a/test_runner/stubs/h2/events.pyi b/test_runner/stubs/h2/events.pyi index 75d0a9e53b..a086db38b3 100644 --- a/test_runner/stubs/h2/events.pyi +++ b/test_runner/stubs/h2/events.pyi @@ -1,6 +1,8 @@ +from .errors import ErrorCodes as ErrorCodes +from .settings import ChangedSetting as ChangedSetting, SettingCodes as SettingCodes, Settings as Settings from _typeshed import Incomplete - -from .settings import ChangedSetting as ChangedSetting +from hpack import HeaderTuple as HeaderTuple +from hyperframe.frame import Frame as Frame class Event: ... @@ -53,7 +55,7 @@ class RemoteSettingsChanged(Event): changed_settings: Incomplete def __init__(self) -> None: ... @classmethod - def from_settings(cls, old_settings, new_settings): ... + def from_settings(cls, old_settings: Settings | dict[int, int], new_settings: dict[int, int]) -> RemoteSettingsChanged: ... 
class PingReceived(Event): ping_data: Incomplete diff --git a/test_runner/stubs/h2/exceptions.pyi b/test_runner/stubs/h2/exceptions.pyi index 82019d5ec1..7149b46521 100644 --- a/test_runner/stubs/h2/exceptions.pyi +++ b/test_runner/stubs/h2/exceptions.pyi @@ -1,3 +1,4 @@ +from .errors import ErrorCodes as ErrorCodes from _typeshed import Incomplete class H2Error(Exception): ... @@ -19,27 +20,27 @@ class FlowControlError(ProtocolError): class StreamIDTooLowError(ProtocolError): stream_id: Incomplete max_stream_id: Incomplete - def __init__(self, stream_id, max_stream_id) -> None: ... + def __init__(self, stream_id: int, max_stream_id: int) -> None: ... class NoAvailableStreamIDError(ProtocolError): ... class NoSuchStreamError(ProtocolError): stream_id: Incomplete - def __init__(self, stream_id) -> None: ... + def __init__(self, stream_id: int) -> None: ... class StreamClosedError(NoSuchStreamError): stream_id: Incomplete error_code: Incomplete - def __init__(self, stream_id) -> None: ... + def __init__(self, stream_id: int) -> None: ... class InvalidSettingsValueError(ProtocolError, ValueError): error_code: Incomplete - def __init__(self, msg, error_code) -> None: ... + def __init__(self, msg: str, error_code: ErrorCodes) -> None: ... class InvalidBodyLengthError(ProtocolError): expected_length: Incomplete actual_length: Incomplete - def __init__(self, expected, actual) -> None: ... + def __init__(self, expected: int, actual: int) -> None: ... class UnsupportedFrameError(ProtocolError): ... class RFC1122Error(H2Error): ... 
diff --git a/test_runner/stubs/h2/frame_buffer.pyi b/test_runner/stubs/h2/frame_buffer.pyi index f47adab704..90746f63c1 100644 --- a/test_runner/stubs/h2/frame_buffer.pyi +++ b/test_runner/stubs/h2/frame_buffer.pyi @@ -1,19 +1,12 @@ -from .exceptions import ( - FrameDataMissingError as FrameDataMissingError, -) -from .exceptions import ( - FrameTooLargeError as FrameTooLargeError, -) -from .exceptions import ( - ProtocolError as ProtocolError, -) +from .exceptions import FrameDataMissingError as FrameDataMissingError, FrameTooLargeError as FrameTooLargeError, ProtocolError as ProtocolError +from hyperframe.frame import Frame CONTINUATION_BACKLOG: int class FrameBuffer: data: bytes max_frame_size: int - def __init__(self, server: bool = ...) -> None: ... - def add_data(self, data) -> None: ... - def __iter__(self): ... - def __next__(self): ... + def __init__(self, server: bool = False) -> None: ... + def add_data(self, data: bytes) -> None: ... + def __iter__(self) -> FrameBuffer: ... + def __next__(self) -> Frame: ... diff --git a/test_runner/stubs/h2/settings.pyi b/test_runner/stubs/h2/settings.pyi index a352abe53e..c3920f9969 100644 --- a/test_runner/stubs/h2/settings.pyi +++ b/test_runner/stubs/h2/settings.pyi @@ -1,61 +1,59 @@ import enum -from collections.abc import MutableMapping -from typing import Any - +from .errors import ErrorCodes as ErrorCodes +from .exceptions import InvalidSettingsValueError as InvalidSettingsValueError from _typeshed import Incomplete -from h2.errors import ErrorCodes as ErrorCodes -from h2.exceptions import InvalidSettingsValueError as InvalidSettingsValueError +from collections.abc import Iterator, MutableMapping class SettingCodes(enum.IntEnum): - HEADER_TABLE_SIZE: Incomplete - ENABLE_PUSH: Incomplete - MAX_CONCURRENT_STREAMS: Incomplete - INITIAL_WINDOW_SIZE: Incomplete - MAX_FRAME_SIZE: Incomplete - MAX_HEADER_LIST_SIZE: Incomplete - ENABLE_CONNECT_PROTOCOL: Incomplete + HEADER_TABLE_SIZE = ... + ENABLE_PUSH = ... 
+ MAX_CONCURRENT_STREAMS = ... + INITIAL_WINDOW_SIZE = ... + MAX_FRAME_SIZE = ... + MAX_HEADER_LIST_SIZE = ... + ENABLE_CONNECT_PROTOCOL = ... class ChangedSetting: setting: Incomplete original_value: Incomplete new_value: Incomplete - def __init__(self, setting, original_value, new_value) -> None: ... + def __init__(self, setting: SettingCodes | int, original_value: int | None, new_value: int) -> None: ... -class Settings(MutableMapping[str, Any]): - def __init__(self, client: bool = ..., initial_values: Incomplete | None = ...) -> None: ... - def acknowledge(self): ... +class Settings(MutableMapping[SettingCodes | int, int]): + def __init__(self, client: bool = True, initial_values: dict[SettingCodes, int] | None = None) -> None: ... + def acknowledge(self) -> dict[SettingCodes | int, ChangedSetting]: ... @property - def header_table_size(self): ... + def header_table_size(self) -> int: ... @header_table_size.setter - def header_table_size(self, value) -> None: ... + def header_table_size(self, value: int) -> None: ... @property - def enable_push(self): ... + def enable_push(self) -> int: ... @enable_push.setter - def enable_push(self, value) -> None: ... + def enable_push(self, value: int) -> None: ... @property - def initial_window_size(self): ... + def initial_window_size(self) -> int: ... @initial_window_size.setter - def initial_window_size(self, value) -> None: ... + def initial_window_size(self, value: int) -> None: ... @property - def max_frame_size(self): ... + def max_frame_size(self) -> int: ... @max_frame_size.setter - def max_frame_size(self, value) -> None: ... + def max_frame_size(self, value: int) -> None: ... @property - def max_concurrent_streams(self): ... + def max_concurrent_streams(self) -> int: ... @max_concurrent_streams.setter - def max_concurrent_streams(self, value) -> None: ... + def max_concurrent_streams(self, value: int) -> None: ... @property - def max_header_list_size(self): ... + def max_header_list_size(self) -> int | None: ... 
@max_header_list_size.setter - def max_header_list_size(self, value) -> None: ... + def max_header_list_size(self, value: int) -> None: ... @property - def enable_connect_protocol(self): ... + def enable_connect_protocol(self) -> int: ... @enable_connect_protocol.setter - def enable_connect_protocol(self, value) -> None: ... - def __getitem__(self, key): ... - def __setitem__(self, key, value) -> None: ... - def __delitem__(self, key) -> None: ... - def __iter__(self): ... + def enable_connect_protocol(self, value: int) -> None: ... + def __getitem__(self, key: SettingCodes | int) -> int: ... + def __setitem__(self, key: SettingCodes | int, value: int) -> None: ... + def __delitem__(self, key: SettingCodes | int) -> None: ... + def __iter__(self) -> Iterator[SettingCodes | int]: ... def __len__(self) -> int: ... - def __eq__(self, other): ... - def __ne__(self, other): ... + def __eq__(self, other: object) -> bool: ... + def __ne__(self, other: object) -> bool: ... diff --git a/test_runner/stubs/h2/stream.pyi b/test_runner/stubs/h2/stream.pyi index d52ab8e72b..89171da981 100644 --- a/test_runner/stubs/h2/stream.pyi +++ b/test_runner/stubs/h2/stream.pyi @@ -1,114 +1,52 @@ -from enum import Enum, IntEnum - -from _typeshed import Incomplete - +from .config import H2Configuration as H2Configuration from .errors import ErrorCodes as ErrorCodes -from .events import ( - AlternativeServiceAvailable as AlternativeServiceAvailable, -) -from .events import ( - DataReceived as DataReceived, -) -from .events import ( - InformationalResponseReceived as InformationalResponseReceived, -) -from .events import ( - PushedStreamReceived as PushedStreamReceived, -) -from .events import ( - RequestReceived as RequestReceived, -) -from .events import ( - ResponseReceived as ResponseReceived, -) -from .events import ( - StreamEnded as StreamEnded, -) -from .events import ( - StreamReset as StreamReset, -) -from .events import ( - TrailersReceived as TrailersReceived, -) -from .events 
import ( - WindowUpdated as WindowUpdated, -) -from .exceptions import ( - FlowControlError as FlowControlError, -) -from .exceptions import ( - InvalidBodyLengthError as InvalidBodyLengthError, -) -from .exceptions import ( - ProtocolError as ProtocolError, -) -from .exceptions import ( - StreamClosedError as StreamClosedError, -) -from .utilities import ( - HeaderValidationFlags as HeaderValidationFlags, -) -from .utilities import ( - authority_from_headers as authority_from_headers, -) -from .utilities import ( - extract_method_header as extract_method_header, -) -from .utilities import ( - guard_increment_window as guard_increment_window, -) -from .utilities import ( - is_informational_response as is_informational_response, -) -from .utilities import ( - normalize_inbound_headers as normalize_inbound_headers, -) -from .utilities import ( - normalize_outbound_headers as normalize_outbound_headers, -) -from .utilities import ( - validate_headers as validate_headers, -) -from .utilities import ( - validate_outbound_headers as validate_outbound_headers, -) +from .events import AlternativeServiceAvailable as AlternativeServiceAvailable, DataReceived as DataReceived, Event as Event, InformationalResponseReceived as InformationalResponseReceived, PushedStreamReceived as PushedStreamReceived, RequestReceived as RequestReceived, ResponseReceived as ResponseReceived, StreamEnded as StreamEnded, StreamReset as StreamReset, TrailersReceived as TrailersReceived, WindowUpdated as WindowUpdated +from .exceptions import FlowControlError as FlowControlError, InvalidBodyLengthError as InvalidBodyLengthError, ProtocolError as ProtocolError, StreamClosedError as StreamClosedError +from .utilities import HeaderValidationFlags as HeaderValidationFlags, authority_from_headers as authority_from_headers, extract_method_header as extract_method_header, guard_increment_window as guard_increment_window, is_informational_response as is_informational_response, normalize_inbound_headers as 
normalize_inbound_headers, normalize_outbound_headers as normalize_outbound_headers, utf8_encode_headers as utf8_encode_headers, validate_headers as validate_headers, validate_outbound_headers as validate_outbound_headers from .windows import WindowManager as WindowManager +from _typeshed import Incomplete +from collections.abc import Iterable +from enum import Enum, IntEnum +from hpack.hpack import Encoder as Encoder +from hpack.struct import Header as Header, HeaderWeaklyTyped as HeaderWeaklyTyped +from hyperframe.frame import AltSvcFrame, ContinuationFrame, Frame as Frame, HeadersFrame, PushPromiseFrame, RstStreamFrame +from typing import Any class StreamState(IntEnum): - IDLE: int - RESERVED_REMOTE: int - RESERVED_LOCAL: int - OPEN: int - HALF_CLOSED_REMOTE: int - HALF_CLOSED_LOCAL: int - CLOSED: int + IDLE = 0 + RESERVED_REMOTE = 1 + RESERVED_LOCAL = 2 + OPEN = 3 + HALF_CLOSED_REMOTE = 4 + HALF_CLOSED_LOCAL = 5 + CLOSED = 6 class StreamInputs(Enum): - SEND_HEADERS: int - SEND_PUSH_PROMISE: int - SEND_RST_STREAM: int - SEND_DATA: int - SEND_WINDOW_UPDATE: int - SEND_END_STREAM: int - RECV_HEADERS: int - RECV_PUSH_PROMISE: int - RECV_RST_STREAM: int - RECV_DATA: int - RECV_WINDOW_UPDATE: int - RECV_END_STREAM: int - RECV_CONTINUATION: int - SEND_INFORMATIONAL_HEADERS: int - RECV_INFORMATIONAL_HEADERS: int - SEND_ALTERNATIVE_SERVICE: int - RECV_ALTERNATIVE_SERVICE: int - UPGRADE_CLIENT: int - UPGRADE_SERVER: int + SEND_HEADERS = 0 + SEND_PUSH_PROMISE = 1 + SEND_RST_STREAM = 2 + SEND_DATA = 3 + SEND_WINDOW_UPDATE = 4 + SEND_END_STREAM = 5 + RECV_HEADERS = 6 + RECV_PUSH_PROMISE = 7 + RECV_RST_STREAM = 8 + RECV_DATA = 9 + RECV_WINDOW_UPDATE = 10 + RECV_END_STREAM = 11 + RECV_CONTINUATION = 12 + SEND_INFORMATIONAL_HEADERS = 13 + RECV_INFORMATIONAL_HEADERS = 14 + SEND_ALTERNATIVE_SERVICE = 15 + RECV_ALTERNATIVE_SERVICE = 16 + UPGRADE_CLIENT = 17 + UPGRADE_SERVER = 18 class StreamClosedBy(Enum): - SEND_END_STREAM: int - RECV_END_STREAM: int - SEND_RST_STREAM: int - 
RECV_RST_STREAM: int + SEND_END_STREAM = 0 + RECV_END_STREAM = 1 + SEND_RST_STREAM = 2 + RECV_RST_STREAM = 3 STREAM_OPEN: Incomplete @@ -121,32 +59,32 @@ class H2StreamStateMachine: headers_received: Incomplete trailers_received: Incomplete stream_closed_by: Incomplete - def __init__(self, stream_id) -> None: ... - def process_input(self, input_): ... - def request_sent(self, previous_state): ... - def response_sent(self, previous_state): ... - def request_received(self, previous_state): ... - def response_received(self, previous_state): ... - def data_received(self, previous_state): ... - def window_updated(self, previous_state): ... - def stream_half_closed(self, previous_state): ... - def stream_ended(self, previous_state): ... - def stream_reset(self, previous_state): ... - def send_new_pushed_stream(self, previous_state): ... - def recv_new_pushed_stream(self, previous_state): ... - def send_push_promise(self, previous_state): ... - def recv_push_promise(self, previous_state): ... - def send_end_stream(self, previous_state) -> None: ... - def send_reset_stream(self, previous_state) -> None: ... - def reset_stream_on_error(self, previous_state) -> None: ... - def recv_on_closed_stream(self, previous_state) -> None: ... - def send_on_closed_stream(self, previous_state) -> None: ... - def recv_push_on_closed_stream(self, previous_state) -> None: ... - def send_push_on_closed_stream(self, previous_state) -> None: ... - def send_informational_response(self, previous_state): ... - def recv_informational_response(self, previous_state): ... - def recv_alt_svc(self, previous_state): ... - def send_alt_svc(self, previous_state) -> None: ... + def __init__(self, stream_id: int) -> None: ... + def process_input(self, input_: StreamInputs) -> Any: ... + def request_sent(self, previous_state: StreamState) -> list[Event]: ... + def response_sent(self, previous_state: StreamState) -> list[Event]: ... 
+ def request_received(self, previous_state: StreamState) -> list[Event]: ... + def response_received(self, previous_state: StreamState) -> list[Event]: ... + def data_received(self, previous_state: StreamState) -> list[Event]: ... + def window_updated(self, previous_state: StreamState) -> list[Event]: ... + def stream_half_closed(self, previous_state: StreamState) -> list[Event]: ... + def stream_ended(self, previous_state: StreamState) -> list[Event]: ... + def stream_reset(self, previous_state: StreamState) -> list[Event]: ... + def send_new_pushed_stream(self, previous_state: StreamState) -> list[Event]: ... + def recv_new_pushed_stream(self, previous_state: StreamState) -> list[Event]: ... + def send_push_promise(self, previous_state: StreamState) -> list[Event]: ... + def recv_push_promise(self, previous_state: StreamState) -> list[Event]: ... + def send_end_stream(self, previous_state: StreamState) -> None: ... + def send_reset_stream(self, previous_state: StreamState) -> None: ... + def reset_stream_on_error(self, previous_state: StreamState) -> None: ... + def recv_on_closed_stream(self, previous_state: StreamState) -> None: ... + def send_on_closed_stream(self, previous_state: StreamState) -> None: ... + def recv_push_on_closed_stream(self, previous_state: StreamState) -> None: ... + def send_push_on_closed_stream(self, previous_state: StreamState) -> None: ... + def send_informational_response(self, previous_state: StreamState) -> list[Event]: ... + def recv_informational_response(self, previous_state: StreamState) -> list[Event]: ... + def recv_alt_svc(self, previous_state: StreamState) -> list[Event]: ... + def send_alt_svc(self, previous_state: StreamState) -> None: ... class H2Stream: state_machine: Incomplete @@ -155,30 +93,30 @@ class H2Stream: request_method: Incomplete outbound_flow_control_window: Incomplete config: Incomplete - def __init__(self, stream_id, config, inbound_window_size, outbound_window_size) -> None: ... 
+ def __init__(self, stream_id: int, config: H2Configuration, inbound_window_size: int, outbound_window_size: int) -> None: ... @property - def inbound_flow_control_window(self): ... + def inbound_flow_control_window(self) -> int: ... @property - def open(self): ... + def open(self) -> bool: ... @property - def closed(self): ... + def closed(self) -> bool: ... @property - def closed_by(self): ... - def upgrade(self, client_side) -> None: ... - def send_headers(self, headers, encoder, end_stream: bool = ...): ... - def push_stream_in_band(self, related_stream_id, headers, encoder): ... - def locally_pushed(self): ... - def send_data(self, data, end_stream: bool = ..., pad_length: Incomplete | None = ...): ... - def end_stream(self): ... - def advertise_alternative_service(self, field_value): ... - def increase_flow_control_window(self, increment): ... - def receive_push_promise_in_band(self, promised_stream_id, headers, header_encoding): ... - def remotely_pushed(self, pushed_headers): ... - def receive_headers(self, headers, end_stream, header_encoding): ... - def receive_data(self, data, end_stream, flow_control_len): ... - def receive_window_update(self, increment): ... + def closed_by(self) -> StreamClosedBy | None: ... + def upgrade(self, client_side: bool) -> None: ... + def send_headers(self, headers: Iterable[HeaderWeaklyTyped], encoder: Encoder, end_stream: bool = False) -> list[HeadersFrame | ContinuationFrame | PushPromiseFrame]: ... + def push_stream_in_band(self, related_stream_id: int, headers: Iterable[HeaderWeaklyTyped], encoder: Encoder) -> list[HeadersFrame | ContinuationFrame | PushPromiseFrame]: ... + def locally_pushed(self) -> list[Frame]: ... + def send_data(self, data: bytes | memoryview, end_stream: bool = False, pad_length: int | None = None) -> list[Frame]: ... + def end_stream(self) -> list[Frame]: ... + def advertise_alternative_service(self, field_value: bytes) -> list[Frame]: ... 
+ def increase_flow_control_window(self, increment: int) -> list[Frame]: ... + def receive_push_promise_in_band(self, promised_stream_id: int, headers: Iterable[Header], header_encoding: bool | str | None) -> tuple[list[Frame], list[Event]]: ... + def remotely_pushed(self, pushed_headers: Iterable[Header]) -> tuple[list[Frame], list[Event]]: ... + def receive_headers(self, headers: Iterable[Header], end_stream: bool, header_encoding: bool | str | None) -> tuple[list[Frame], list[Event]]: ... + def receive_data(self, data: bytes, end_stream: bool, flow_control_len: int) -> tuple[list[Frame], list[Event]]: ... + def receive_window_update(self, increment: int) -> tuple[list[Frame], list[Event]]: ... def receive_continuation(self) -> None: ... - def receive_alt_svc(self, frame): ... - def reset_stream(self, error_code: int = ...): ... - def stream_reset(self, frame): ... - def acknowledge_received_data(self, acknowledged_size): ... + def receive_alt_svc(self, frame: AltSvcFrame) -> tuple[list[Frame], list[Event]]: ... + def reset_stream(self, error_code: ErrorCodes | int = 0) -> list[Frame]: ... + def stream_reset(self, frame: RstStreamFrame) -> tuple[list[Frame], list[Event]]: ... + def acknowledge_received_data(self, acknowledged_size: int) -> list[Frame]: ... 
diff --git a/test_runner/stubs/h2/utilities.pyi b/test_runner/stubs/h2/utilities.pyi index e0a8d55d1d..8802087e4c 100644 --- a/test_runner/stubs/h2/utilities.pyi +++ b/test_runner/stubs/h2/utilities.pyi @@ -1,25 +1,32 @@ -from typing import NamedTuple - +import collections +from .exceptions import FlowControlError as FlowControlError, ProtocolError as ProtocolError from _typeshed import Incomplete - -from .exceptions import FlowControlError as FlowControlError -from .exceptions import ProtocolError as ProtocolError +from collections.abc import Generator, Iterable +from hpack.struct import Header as Header, HeaderWeaklyTyped as HeaderWeaklyTyped +from typing import Any, NamedTuple UPPER_RE: Incomplete +SIGIL: Incomplete +INFORMATIONAL_START: Incomplete CONNECTION_HEADERS: Incomplete -def extract_method_header(headers): ... -def is_informational_response(headers): ... -def guard_increment_window(current, increment): ... -def authority_from_headers(headers): ... +def extract_method_header(headers: Iterable[Header]) -> bytes | None: ... +def is_informational_response(headers: Iterable[Header]) -> bool: ... +def guard_increment_window(current: int, increment: int) -> int: ... +def authority_from_headers(headers: Iterable[Header]) -> bytes | None: ... class HeaderValidationFlags(NamedTuple): - is_client: Incomplete - is_trailer: Incomplete - is_response_header: Incomplete - is_push_promise: Incomplete + is_client: bool + is_trailer: bool + is_response_header: bool + is_push_promise: bool -def validate_headers(headers, hdr_validation_flags): ... -def normalize_outbound_headers(headers, hdr_validation_flags): ... -def normalize_inbound_headers(headers, hdr_validation_flags): ... -def validate_outbound_headers(headers, hdr_validation_flags): ... +def validate_headers(headers: Iterable[Header], hdr_validation_flags: HeaderValidationFlags) -> Iterable[Header]: ... +def utf8_encode_headers(headers: Iterable[HeaderWeaklyTyped]) -> list[Header]: ... 
+def normalize_outbound_headers(headers: Iterable[Header], hdr_validation_flags: HeaderValidationFlags | None, should_split_outbound_cookies: bool = False) -> Generator[Header, None, None]: ... +def normalize_inbound_headers(headers: Iterable[Header], hdr_validation_flags: HeaderValidationFlags) -> Generator[Header, None, None]: ... +def validate_outbound_headers(headers: Iterable[Header], hdr_validation_flags: HeaderValidationFlags) -> Generator[Header, None, None]: ... + +class SizeLimitDict(collections.OrderedDict[int, Any]): + def __init__(self, *args: dict[int, int], **kwargs: Any) -> None: ... + def __setitem__(self, key: int, value: Any | int) -> None: ... diff --git a/test_runner/stubs/h2/windows.pyi b/test_runner/stubs/h2/windows.pyi index 7dc78e431c..b132ee610c 100644 --- a/test_runner/stubs/h2/windows.pyi +++ b/test_runner/stubs/h2/windows.pyi @@ -1,13 +1,12 @@ -from _typeshed import Incomplete - from .exceptions import FlowControlError as FlowControlError +from _typeshed import Incomplete LARGEST_FLOW_CONTROL_WINDOW: Incomplete class WindowManager: max_window_size: Incomplete current_window_size: Incomplete - def __init__(self, max_window_size) -> None: ... - def window_consumed(self, size) -> None: ... - def window_opened(self, size) -> None: ... - def process_bytes(self, size): ... + def __init__(self, max_window_size: int) -> None: ... + def window_consumed(self, size: int) -> None: ... + def window_opened(self, size: int) -> None: ... + def process_bytes(self, size: int) -> int | None: ... 
diff --git a/vendor/postgres-v14 b/vendor/postgres-v14 index 46082f2088..c0aedfd3ca 160000 --- a/vendor/postgres-v14 +++ b/vendor/postgres-v14 @@ -1 +1 @@ -Subproject commit 46082f20884f087a2d974b33ac65d63af26142bd +Subproject commit c0aedfd3cac447510a2db843b561f0c52901b679 diff --git a/vendor/postgres-v15 b/vendor/postgres-v15 index dd0b28d6fb..355a7c69d3 160000 --- a/vendor/postgres-v15 +++ b/vendor/postgres-v15 @@ -1 +1 @@ -Subproject commit dd0b28d6fbad39e227f3b77296fcca879af8b3a9 +Subproject commit 355a7c69d3f907f3612eb406cc7b9c2f55d59b59 diff --git a/vendor/postgres-v16 b/vendor/postgres-v16 index d674efd776..3cf7ce1afa 160000 --- a/vendor/postgres-v16 +++ b/vendor/postgres-v16 @@ -1 +1 @@ -Subproject commit d674efd776f59d78e8fa1535bd2f95c3e6984fca +Subproject commit 3cf7ce1afab75027716d14223f95ddb300754162 diff --git a/vendor/postgres-v17 b/vendor/postgres-v17 index a8dd6e779d..b654fa88b6 160000 --- a/vendor/postgres-v17 +++ b/vendor/postgres-v17 @@ -1 +1 @@ -Subproject commit a8dd6e779dde907778006adb436b557ad652fb97 +Subproject commit b654fa88b6fd2ad24a03a14a7cd417ec66e518f9 diff --git a/vendor/revisions.json b/vendor/revisions.json index c899dbaa5a..982f537692 100644 --- a/vendor/revisions.json +++ b/vendor/revisions.json @@ -1,18 +1,18 @@ { "v17": [ "17.2", - "a8dd6e779dde907778006adb436b557ad652fb97" + "b654fa88b6fd2ad24a03a14a7cd417ec66e518f9" ], "v16": [ "16.6", - "d674efd776f59d78e8fa1535bd2f95c3e6984fca" + "3cf7ce1afab75027716d14223f95ddb300754162" ], "v15": [ "15.10", - "dd0b28d6fbad39e227f3b77296fcca879af8b3a9" + "355a7c69d3f907f3612eb406cc7b9c2f55d59b59" ], "v14": [ "14.15", - "46082f20884f087a2d974b33ac65d63af26142bd" + "c0aedfd3cac447510a2db843b561f0c52901b679" ] } diff --git a/workspace_hack/Cargo.toml b/workspace_hack/Cargo.toml index 0ffeeead18..a3dffa8f19 100644 --- a/workspace_hack/Cargo.toml +++ b/workspace_hack/Cargo.toml @@ -17,8 +17,6 @@ license.workspace = true [dependencies] ahash = { version = "0.8" } anyhow = { version = "1", 
features = ["backtrace"] } -axum = { version = "0.7", features = ["ws"] } -axum-core = { version = "0.4", default-features = false, features = ["tracing"] } base64-594e8ee84c453af0 = { package = "base64", version = "0.13", features = ["alloc"] } base64-647d43efb71741da = { package = "base64", version = "0.21", features = ["alloc"] } base64ct = { version = "1", default-features = false, features = ["std"] } @@ -46,7 +44,7 @@ hex = { version = "0.4", features = ["serde"] } hmac = { version = "0.12", default-features = false, features = ["reset"] } hyper-582f2526e08bb6a0 = { package = "hyper", version = "0.14", features = ["full"] } hyper-dff4ba8e3ae991db = { package = "hyper", version = "1", features = ["full"] } -hyper-util = { version = "0.1", features = ["client-legacy", "server-auto", "service"] } +hyper-util = { version = "0.1", features = ["client-legacy", "http1", "http2", "server", "service"] } indexmap-dff4ba8e3ae991db = { package = "indexmap", version = "1", default-features = false, features = ["std"] } indexmap-f595c2ba2a3f28df = { package = "indexmap", version = "2", features = ["serde"] } itertools = { version = "0.12" } @@ -87,12 +85,11 @@ tikv-jemalloc-sys = { version = "0.6", features = ["profiling", "stats", "unpref time = { version = "0.3", features = ["macros", "serde-well-known"] } tokio = { version = "1", features = ["full", "test-util"] } tokio-rustls = { version = "0.26", default-features = false, features = ["logging", "ring", "tls12"] } -tokio-stream = { version = "0.1", features = ["net"] } +tokio-stream = { version = "0.1" } tokio-util = { version = "0.7", features = ["codec", "compat", "io", "rt"] } toml_edit = { version = "0.22", features = ["serde"] } -tonic = { version = "0.12", features = ["tls-roots"] } -tower-9fbad63c4bcf4a8f = { package = "tower", version = "0.4", default-features = false, features = ["balance", "buffer", "limit", "util"] } -tower-d8f496e17d97b5cb = { package = "tower", version = "0.5", default-features = false, 
features = ["log", "make", "util"] } +tonic = { version = "0.12", default-features = false, features = ["codegen", "prost", "tls-roots"] } +tower = { version = "0.4", default-features = false, features = ["balance", "buffer", "limit", "util"] } tracing = { version = "0.1", features = ["log"] } tracing-core = { version = "0.1" } url = { version = "2", features = ["serde"] }